Saving URLs along with the images in icrawler


A rough memo, but here it is.

I basically copy-pasted icrawler's downloader and added a step that writes each entry to a CSV file when the download finishes.

from icrawler import ImageDownloader
from icrawler.builtin import GoogleImageCrawler


class URLDownloader(ImageDownloader):

    def save_column(self, filepath, file_url, output_csv_path='list.csv'):
        """Append one "saved filepath,source URL" line to the CSV."""
        with open(output_csv_path, 'a') as f:
            output_str = f'{filepath},{file_url}\n'
            f.write(output_str)

    def download(self,
                 task,
                 default_ext,
                 timeout=5,
                 max_retry=3,
                 overwrite=False,
                 **kwargs):
        """Download the image and save it to the corresponding path.

        Args:
            task (dict): The task dict got from ``task_queue``.
            timeout (int): Timeout of making requests for downloading images.
            max_retry (int): the max retry times if the request fails.
            **kwargs: reserved arguments for overriding.
        """
        file_url = task['file_url']
        task['success'] = False
        task['filename'] = None
        retry = max_retry

        if not overwrite:
            with self.lock:
                self.fetched_num += 1
                filename = self.get_filename(task, default_ext)
                if self.storage.exists(filename):
                    self.logger.info('skip downloading file %s', filename)
                    return
                self.fetched_num -= 1

        while retry > 0 and not self.signal.get('reach_max_num'):
            try:
                response = self.session.get(file_url, timeout=timeout)
            except Exception as e:
                self.logger.error('Exception caught when downloading file %s, '
                                  'error: %s, remaining retry times: %d',
                                  file_url, e, retry - 1)
            else:
                if self.reach_max_num():
                    self.signal.set(reach_max_num=True)
                    break
                elif response.status_code != 200:
                    self.logger.error('Response status code %d, file %s',
                                      response.status_code, file_url)
                    break
                elif not self.keep_file(task, response, **kwargs):
                    break
                with self.lock:
                    self.fetched_num += 1
                    filename = self.get_filename(task, default_ext)
                self.logger.info('image #%s\t%s', self.fetched_num, file_url)
                self.storage.write(filename, response.content)
                task['success'] = True
                task['filename'] = filename
                # Only change vs. the original download(): record the saved
                # filename together with its source URL.
                self.save_column(filename, file_url)
                break
            finally:
                retry -= 1

Usage

def main():
    google_crawler = GoogleImageCrawler(
        downloader_cls=URLDownloader,
        feeder_threads=1,
        parser_threads=2,
        downloader_threads=4,
        storage={'root_dir': 'output'})
    filters = dict(
        type='photo',
        color='color',
        size='large',
        license='commercial,modify')
    google_crawler.crawl(keyword='food', filters=filters,
                         max_num=1000, file_idx_offset=0)


if __name__ == "__main__":
    main()
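
After a crawl, list.csv can be read back to map each saved file to its source URL. A minimal sketch (the file has no header row, and the URL is everything after the first comma, matching the write format in save_column above):

# Rebuild a {saved filepath: source URL} mapping from list.csv.
url_by_file = {}
with open('list.csv') as f:
    for line in f:
        # save_column() writes "filepath,file_url"; split on the first
        # comma only, in case the URL itself contains commas.
        filepath, file_url = line.rstrip('\n').split(',', 1)
        url_by_file[filepath] = file_url

print(f'{len(url_by_file)} downloaded images with known source URLs')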

There's probably a better way to do this, though.
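
For example, one variant that avoids copying the whole method would be to call the parent download() and append to the CSV only when it reports success. This is just a sketch: it assumes the installed icrawler version's download() fills in task['success'] and task['filename'] the same way the copied code above does, it holds self.lock while writing since several downloader threads may append at once, and the class name is only for illustration.

from icrawler import ImageDownloader


class URLRecordingDownloader(ImageDownloader):
    """Sketch: reuse the stock download() and only record the result."""

    def download(self, task, default_ext, timeout=5, max_retry=3, **kwargs):
        # Let the parent handle retries, max_num checks and storage writes.
        super().download(task, default_ext, timeout, max_retry, **kwargs)
        # Record "filepath,url" only if the image was actually saved.
        if task.get('success') and task.get('filename'):
            with self.lock, open('list.csv', 'a') as f:
                f.write(f"{task['filename']},{task['file_url']}\n")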
