ディープラーニングで画像分類をしたいので、学習用画像を集めるためのスクレイピングを調べつつ実装しました。

0. モジュールのインポート

BeautifulSoupを使います。

import os
import urllib
import urllib.error
import urllib.parse
import urllib.request

import matplotlib.pyplot as plt
import six
from bs4 import BeautifulSoup
from numpy import *
from PIL import Image

%matplotlib inline

1. 画像検索からスクレイピング

yahooとbingの検索エンジンから抜いてます。

class imageGetter():
    """Collect JPEG image URLs from a Yahoo! Japan or Bing image search.

    Typical use: call ``search`` to fetch the first result page, optionally
    call ``next`` to follow the numbered pagination links, then ``show`` or
    ``dump`` the collected images.

    Attributes:
        search_engine: URL template of the selected engine (``{0}`` = query).
        search_word: the query most recently passed to ``search``.
        soup: parsed HTML of the most recently fetched result page.
        page_url: URL of that page, used to resolve relative links.
        urllst: image URLs collected so far.
    """

    # URL template per supported engine; "{0}" receives the quoted query.
    # NOTE(review): the Yahoo template carries "b=21" -- presumably a result
    # offset; confirm that skipping the first results is intended.
    _SEARCH_ENGINES = {
        "yahoo": u"https://search.yahoo.co.jp/image/search?p={0}&oq=&ei=UTF-8&b=21",
        "bing": u"https://www.bing.com/images/search?q={0}",
    }

    def __init__(self):
        self.search_engine = None
        self.search_word = None
        self.soup = None
        self.page_url = None
        self.urllst = []

    def set_search_engine(self, key="yahoo"):
        """Select the search engine by name ("yahoo" or "bing").

        Raises:
            ValueError: if ``key`` is not a supported engine name.
        """
        try:
            self.search_engine = self._SEARCH_ENGINES[key]
        except KeyError:
            raise ValueError("unknown search engine: {0!r}".format(key))

    def series_process(self, word_list, padir="padir", key="yahoo", rec=1000):
        """Search every word in ``word_list`` and dump its images to disk.

        Images of each word are written to ``padir/<word>``.

        Args:
            word_list: iterable of search words.
            padir: parent directory; created when missing.
            key: search engine name, see ``set_search_engine``.
            rec: exclusive upper bound of the page numbers that are requested.
        """
        os.makedirs(padir, exist_ok=True)
        for word in word_list:
            o = imageGetter()
            o.search(word, False, key)

            # search() has already fetched page 1.
            cnt = 1
            try:
                for i in range(2, rec):
                    print(i)
                    o.next(i)
                    cnt = i
            except IndexError:
                # No link to the next page: the regular end of the results.
                pass
            except (ValueError, OSError) as err:
                # Malformed link or network failure: keep what was collected.
                print("Stopped at page {0}: {1}".format(i, err))
            print("Curation of the images from {} pages is Succeed".format(cnt))
            path = os.path.join(padir, word)
            os.makedirs(path, exist_ok=True)
            print("Images is dumped at {}".format(path))
            o.dump(path)

    @staticmethod
    def _jpg_url(rel):
        """Return ``rel`` cut right after its first "jpg", or None if absent."""
        if not isinstance(rel, str):
            return None
        pos = rel.find("jpg")
        if pos < 0:
            return None
        return rel[:pos + 3]

    def _collect(self, soup):
        """Return the JPEG URLs found in the ``rel`` attribute of <img> tags."""
        candidates = (self._jpg_url(img.get("rel")) for img in soup.find_all("img"))
        return [url for url in candidates if url is not None]

    @staticmethod
    def _fetch(url):
        """Download ``url`` and return its content as an in-memory file."""
        with urllib.request.urlopen(url) as response:
            return six.BytesIO(response.read())

    def search(self, search_word, is_show=False, key="yahoo"):
        """Fetch the first result page for ``search_word`` and reset ``urllst``.

        Args:
            search_word: query string; it is percent-encoded before use.
            is_show: when true, print the collected URLs.
            key: search engine name, see ``set_search_engine``.
        """
        self.set_search_engine(key)
        self.search_word = search_word
        url = self.search_engine.format(urllib.parse.quote(search_word))
        with urllib.request.urlopen(url) as response:
            self.page_url = response.geturl()
            # Keep the parsed page: next() looks up pagination links in it.
            self.soup = BeautifulSoup(response, "lxml")

        self.urllst = self._collect(self.soup)
        if is_show:
            self.print()

    def next(self, idx=2, is_show=False):
        """Follow the pagination link labelled ``idx`` and append its images.

        Args:
            idx: label of the pagination link (the page number).
            is_show: when true, print all URLs collected so far.

        Raises:
            RuntimeError: if ``search`` has not been called yet.
            IndexError: if the current page has no usable link labelled ``idx``.
        """
        if self.soup is None:
            raise RuntimeError("search() must be called before next()")
        links = self.soup.find_all("a", string="%s" % idx)
        href = links[0].get("href") if links else None
        if not href:
            raise IndexError("no link to result page {0}".format(idx))

        # Pagination links may be relative to the page they appear on.
        url = urllib.parse.urljoin(self.page_url or "", href)
        with urllib.request.urlopen(url) as response:
            self.page_url = response.geturl()
            self.soup = BeautifulSoup(response, "lxml")

        self.urllst += self._collect(self.soup)
        if is_show:
            self.print()

    def print(self):
        """Print every collected URL, one per line."""
        for line in self.urllst:
            print(line)

    def show(self):
        """Render every collected image in its own matplotlib figure."""
        print("{0} images.".format(len(self.urllst)))
        for line in self.urllst:
            try:
                picture = array(Image.open(self._fetch(line)))
            except urllib.error.HTTPError:
                print("{0} is not found".format(line))
                continue
            except OSError as err:
                # Network failure or data that is not a readable image.
                print("{0} could not be loaded: {1}".format(line, err))
                continue
            plt.figure(figsize=(10, 10), dpi=80)
            plt.subplots_adjust(left=0.0, right=1.0, bottom=0.0, top=1.0, hspace=0.0, wspace=0.0)
            plt.axis('off')
            plt.imshow(picture)

        plt.show()

    def dump(self, path):
        """Save every collected image as ``<path>/<search_word>_<index>.jpg``.

        ``path`` is created when missing. Images that cannot be downloaded or
        decoded are reported and skipped.
        """
        os.makedirs(path, exist_ok=True)
        for i, line in enumerate(self.urllst):
            target = "{0}/{1}_{2}.jpg".format(path, self.search_word, i)
            try:
                img = Image.open(self._fetch(line))
                # JPEG cannot store palette or alpha modes; normalise to RGB.
                img.convert("RGB").save(target)
            except urllib.error.HTTPError:
                print("{0} is not found".format(line))
            except OSError as err:
                print("{0} could not be saved: {1}".format(line, err))

10枚オーダでしか取得できないです。。。
何かいい方法ないですかね。

1.1. 使い方

画像のurlのリストを取得。

# Collect the image URLs for a search word.
o = imageGetter()
o.search("犬", False) # pass True to print the collected URLs to stdout

画像のプレビュー(出力)。jupyter notebookだとoutputに画像が列挙されます。

# Preview the collected images with matplotlib.
o.show()

：
：
：

ディレクトリに画像をダンプできます。

# Save the collected images as JPEG files into the given directory.
path = "./tmp"
o.dump(path)

2. Flickrからスクレイピング

FlickrのAPIキーが要ります(取得方法はFlickr公式のApp Gardenなどを参照)。

class Flickr_handler:
    """Search Flickr through its REST API and preview or save the hits.

    Attributes:
        key: Flickr API key sent with every search request.
        search_word: the query most recently passed to ``search``.
        soup: parsed API response of the most recent search.
        lst: the ``photo`` elements found in that response.
        urllst: image URLs built from ``lst``.
    """

    _SEARCH_URL = "https://api.flickr.com/services/rest/?method=flickr.photos.search&api_key={0}&per_page=500&format=rest&text={1}"
    _PHOTO_URL = "http://farm{0}.staticflickr.com/{1}/{2}_{3}.jpg"

    def __init__(self, key):
        self.key = key
        self.search_word = None
        self.soup = None
        self.lst = []
        self.urllst = []

    @classmethod
    def _search_url(cls, key, search_word):
        """Return the request URL for ``search_word``, percent-encoding it."""
        return cls._SEARCH_URL.format(key, urllib.parse.quote(search_word))

    @classmethod
    def _photo_urls(cls, photos):
        """Build one image URL per photo from its own farm/server/id/secret."""
        return [
            cls._PHOTO_URL.format(
                photo.get("farm"),
                photo.get("server"),
                photo.get("id"),
                photo.get("secret"),
            )
            for photo in photos
        ]

    @staticmethod
    def _fetch(url):
        """Download ``url`` and return its content as an in-memory file."""
        with urllib.request.urlopen(url) as response:
            return six.BytesIO(response.read())

    def search(self, search_word):
        """Query Flickr for ``search_word`` and rebuild ``urllst``.

        Args:
            search_word: free-text query; it is percent-encoded before use.
        """
        self.search_word = search_word
        with urllib.request.urlopen(self._search_url(self.key, search_word)) as response:
            self.soup = BeautifulSoup(response, "lxml")
        self.lst = self.soup.find_all("photo")
        self.urllst = self._photo_urls(self.lst)

    def show(self):
        """Render every found image in its own matplotlib figure."""
        print("{0} images.".format(len(self.urllst)))
        for line in self.urllst:
            try:
                picture = array(Image.open(self._fetch(line)))
            except urllib.error.HTTPError:
                print("{0} is not found".format(line))
                continue
            except OSError as err:
                # Network failure or data that is not a readable image.
                print("{0} could not be loaded: {1}".format(line, err))
                continue
            plt.figure(figsize=(10, 10), dpi=80)
            plt.subplots_adjust(left=0.0, right=1.0, bottom=0.0, top=1.0, hspace=0.0, wspace=0.0)
            plt.axis('off')
            plt.imshow(picture)

        plt.show()

    def dump(self, path):
        """Save every found image as ``<path>/<search_word>_<index>.jpg``.

        ``path`` is created when missing. Images that cannot be downloaded or
        decoded are reported and skipped.
        """
        os.makedirs(path, exist_ok=True)
        for i, line in enumerate(self.urllst):
            target = "{0}/{1}_{2}.jpg".format(path, self.search_word, i)
            try:
                img = Image.open(self._fetch(line))
                # JPEG cannot store palette or alpha modes; normalise to RGB.
                img.convert("RGB").save(target)
            except urllib.error.HTTPError:
                print("{0} is not found".format(line))
            except OSError as err:
                print("{0} could not be saved: {1}".format(line, err))

# Replace the placeholder with your own Flickr API key before running.
key = "YOUR_FLICKR_API_KEY"
o = Flickr_handler(key)
o.search("car")

その他の使い方は1.と同じ。こちらは最大500枚抜けます。

画像のスクレイピング② ー bing, yahoo, Flickrから画像取得

0. モジュールのインポート

1. 画像検索からスクレイピング

1.1. 使い方

2. Flickrからスクレイピング