A rough memo, but writing it down anyway.
I mostly copy-pasted icrawler's ImageDownloader and added a step that appends a row to a CSV file when each download completes.
from icrawler import ImageDownloader
from icrawler.builtin import GoogleImageCrawler


class URLDownloader(ImageDownloader):

    def save_column(self, filepath, file_url, output_csv_path='list.csv'):
        # Append one "filepath,file_url" row per successfully saved image.
        with open(output_csv_path, 'a') as f:
            output_str = f'{filepath},{file_url}\n'
            f.write(output_str)

    def download(self,
                 task,
                 default_ext,
                 timeout=5,
                 max_retry=3,
                 overwrite=False,
                 **kwargs):
        """Download the image and save it to the corresponding path.

        Args:
            task (dict): The task dict got from ``task_queue``.
            timeout (int): Timeout of making requests for downloading images.
            max_retry (int): The max retry times if the request fails.
            **kwargs: Reserved arguments for overriding.
        """
        file_url = task['file_url']
        task['success'] = False
        task['filename'] = None
        retry = max_retry

        if not overwrite:
            with self.lock:
                self.fetched_num += 1
                filename = self.get_filename(task, default_ext)
                if self.storage.exists(filename):
                    self.logger.info('skip downloading file %s', filename)
                    return
                self.fetched_num -= 1

        while retry > 0 and not self.signal.get('reach_max_num'):
            try:
                response = self.session.get(file_url, timeout=timeout)
            except Exception as e:
                self.logger.error('Exception caught when downloading file %s, '
                                  'error: %s, remaining retry times: %d',
                                  file_url, e, retry - 1)
            else:
                if self.reach_max_num():
                    self.signal.set(reach_max_num=True)
                    break
                elif response.status_code != 200:
                    self.logger.error('Response status code %d, file %s',
                                      response.status_code, file_url)
                    break
                elif not self.keep_file(task, response, **kwargs):
                    break
                with self.lock:
                    self.fetched_num += 1
                    filename = self.get_filename(task, default_ext)
                self.logger.info('image #%s\t%s', self.fetched_num, file_url)
                self.storage.write(filename, response.content)
                task['success'] = True
                task['filename'] = filename
                # Record the saved file and its source URL once the write succeeds.
                self.save_column(filename, file_url)
                break
            finally:
                retry -= 1
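One thing I'm not sure about: with downloader_threads=4, several threads can call save_column at the same time, so rows could in principle interleave. Here's a minimal sketch of a safer variant; the module-level lock and the csv module usage are my own additions, not part of icrawler:

import csv
import threading

# Module-level lock of my own; not something icrawler provides.
_csv_lock = threading.Lock()


class LockedURLDownloader(URLDownloader):

    def save_column(self, filepath, file_url, output_csv_path='list.csv'):
        # Serialize appends so rows from concurrent downloader threads
        # cannot interleave; csv.writer also quotes URLs that happen
        # to contain commas.
        with _csv_lock:
            with open(output_csv_path, 'a', newline='') as f:
                csv.writer(f).writerow([filepath, file_url])

The downloader already has a self.lock, but that guards the fetched_num bookkeeping, so a separate lock keeps the file I/O out of that critical section.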
Using it:
def main():
    google_crawler = GoogleImageCrawler(
        downloader_cls=URLDownloader,
        feeder_threads=1,
        parser_threads=2,
        downloader_threads=4,
        storage={'root_dir': 'output'})
    filters = dict(
        type='photo',
        color='color',
        size='large',
        license='commercial,modify')
    google_crawler.crawl(keyword='food', filters=filters,
                         max_num=1000, file_idx_offset=0)


if __name__ == "__main__":
    main()
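After a crawl finishes, list.csv maps each saved filename to its source URL. A small sketch for reading it back (load_url_map is just a helper name I made up; it assumes the two-column format written above):

import csv

def load_url_map(csv_path='list.csv'):
    # Build {filename: source_url} from the rows written by save_column.
    with open(csv_path, newline='') as f:
        return {row[0]: row[1] for row in csv.reader(f) if len(row) >= 2}

url_map = load_url_map()
print(f'{len(url_map)} downloaded images recorded')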
There's probably a better way to do this, though.