Downloading Files from a URL List in Python
In the articles below I crawled specific websites and ended up with lists of URLs, so I wrote some code to download everything on those lists.
Web Scraping with BeautifulSoup4 (sequentially numbered pages)
Web Scraping with BeautifulSoup4 (hierarchical pages)
Source
simple_downloader.py
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os

import requests

headers = {'User-Agent': 'Mozilla/5.0'}
cwd = os.getcwd()
result_dir = cwd + '/download/'
list_file = cwd + '/list.txt'
done_file = 'done.txt'
fail_file = 'fail.txt'


def fetchImage(url):
    # Mirror the URL's path under result_dir by stripping the scheme
    path_relative = url.replace('http://', '').replace('https://', '')
    try:
        res = requests.get(url, headers=headers, timeout=30)
        res.raise_for_status()  # count HTTP errors (404 etc.) as failures
        image = res.content
        # Build the local directory hierarchy, including result_dir
        # itself, before writing the file
        save_path = result_dir + path_relative
        save_dir = os.path.dirname(save_path)
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        with open(save_path, 'wb') as f:
            f.write(image)
    except Exception:
        return False
    return True


def getUrl():
    # Pop the first URL off list.txt and write the remainder back,
    # so an interrupted run picks up where it left off
    result = ''
    with open(list_file, 'r') as f:
        url_list = f.read().split('\n')
        result = url_list.pop(0)
    with open(list_file, 'w') as f:
        f.write('\n'.join(url_list))
    return result


def saveUrl(file_name, url):
    # Append the URL to a log file (done.txt or fail.txt)
    with open(file_name, 'a') as f:
        f.write(url + '\n')


def download():
    # Keep going until getUrl() returns an empty string (list exhausted)
    url = getUrl()
    while url != '':
        if fetchImage(url):
            saveUrl(done_file, url)
            print('done ' + url)
        else:
            saveUrl(fail_file, url)
            print('fail ' + url)
        url = getUrl()


download()
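For reference, this is how I feed the script: list.txt holds one URL per line, and getUrl() consumes it from the top. The URLs below are placeholders, not ones from the crawl articles.

list.txt
http://example.com/images/001.jpg
http://example.com/images/002.jpg
http://example.com/pages/a/b.html

Run it with python simple_downloader.py. Since fetchImage() saves each file under download/ using the URL's own path, the URLs above would land in download/example.com/images/ and download/example.com/pages/a/. Failed URLs accumulate in fail.txt in the same one-per-line format, so copying fail.txt over list.txt should retry just the failures.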