More than 3 years have passed since last update.

webスクレイピングで画像収集

Posted at 2020-07-04

今回はwebスクレイピングを使用し指定したURLから画像を収集するコードを書き、説明していきたいと思います。

###実装コード

import requests
from requests.compat import urljoin
from bs4 import BeautifulSoup
import time
from PIL import Image
import urllib.request
import sys.os

class web_scryping:
    def __init__(self , url):
        self.url = url
        self.soup = BeautifulSoup(requests.get(self.url).content, 'lxml')
        
class download_images(web_scryping):
    def download(self , max_down_num):
        self.down_num = 0
        self.max_down_num = max_down_num
        self.save_path = './img/' + str(self.down_num+1) + '.jpg'
        now_num = 0
        for link in self.soup.find_all("img"):
            src_attr = link.get("src")
            target = urljoin(self.url, src_attr)
            resp = requests.get(target)
            image = resp.content
            #breakpoint()
            print(str(resp) + '  ' + str(now_num))
            now_num = now_num + 1
            if str(resp) != '<Response [404]>':
                with open(self.save_path, 'wb') as f:
                    f.write(image)
                self.down_num = self.down_num + 1
            time.sleep(1)
            self.save_path = './img/' + str(self.down_num+1) + '.jpg'
            if self.down_num == self.max_down_num:
                break
                
    def img_resize(self , img_path):
        try:            
            im = Image.open(img_path)
            print("元の画像サイズ　width: {}, height: {}".format(im.size[0], im.size[1]))
            im_resize = im.resize(size=(800,1200))         
            im_resize.save(save_path)
            print('image resize sucess')
        except:
            print('image resize failed')


def main():
    url = sys.argv[0]
    di = download_images(url)
    di.download(50)

if __name__ == '__main__':
    main()

#プログラムの流れについて
###手順１

def main():
    url = sys.argv[0]
    di = download_images(url)
    di.download(50)

コマンドライン引数にて第一引数にURLを指定します。
URLをweb_scrypingクラスを継承したdownload_imagesクラスに渡します。

###手順２

class web_scryping:
    def __init__(self , url):
        self.url = url
        self.soup = BeautifulSoup(requests.get(self.url).content, 'lxml')

download_imagesクラスはweb_scrypingクラスを継承し、なおかつdownload_imagesクラスにはinitメソッドがないためweb_scrypingクラスのinitメソッドが起動します。
ここではURLをrequests.getメソッドで内容を取得し、BeautifulSoupでhtmlの内容を解析します。解析結果をself.soupというクラス変数に入れます。

###手順３

class download_images(web_scryping):
    def download(self , max_down_num):
        self.down_num = 0
        self.max_down_num = max_down_num
        self.save_path = './img/' + str(self.down_num+1) + '.jpg'
        now_num = 0
        for link in self.soup.find_all("img"):
            src_attr = link.get("src")
            target = urljoin(self.url, src_attr)
            resp = requests.get(target)
            image = resp.content
            #breakpoint()
            print(str(resp) + '  ' + str(now_num))
            now_num = now_num + 1
            if str(resp) != '<Response [404]>':
                with open(self.save_path, 'wb') as f:
                    f.write(image)
                self.down_num = self.down_num + 1
            time.sleep(1)
            self.save_path = './img/' + str(self.down_num+1) + '.jpg'
            if self.down_num == self.max_down_num:
                break

手順１のdownload_imagesクラスのdownloadメソッドを使用し、downloadを開始します。
self.save_pathはimgディレクトリの中に数字.jpgという感じでどんどん画像ファイルの名前を指定していきます。
self.soup.find_all("img"): htmlの中からimgタグを探します。
src_attr = link.get("src"): imgタグからsrcの項目を取得します。
image = resp.content: image変数に画像オブジェクトが入ります。
if str(resp) != '': resp変数にレスポンスの結果が入っているため40４でなければ画像を保存します。
time.sleep(1): スクレイピングを行う時はwebサイトに負担をかけることは好ましくないためsleepメソッドを使用し時間を空けます。

###おまけ

def img_resize(self , img_path):
        try:            
            im = Image.open(img_path)
            print("元の画像サイズ　width: {}, height: {}".format(im.size[0], im.size[1]))
            im_resize = im.resize(size=(800,1200))         
            im_resize.save(save_path)
            print('image resize sucess')
        except:
            print('image resize failed')

このメソッドは画像のサイズを調整するメソッドなんですが大きくすると解像度があまりよくなかったため使用していません。

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up