Web Scraping with BeautifulSoup4
A note on some code I wrote for the common case of pages with sequentially numbered URLs: it builds a URL list so the files can be batch-downloaded later.
Installation
$ apt-get install python-lxml
$ pip install beautifulsoup4
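To confirm that bs4 can actually use the lxml parser, a quick check like the following should print the parsed text (a throwaway snippet, not part of the scraper):

# Sanity check: parse a trivial snippet with the lxml parser
from bs4 import BeautifulSoup

soup = BeautifulSoup('<html><body><p>hello</p></body></html>', 'lxml')
print(soup.p.string)  # -> hello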
Source
scraper.py
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

try:
    # Python 3
    from urllib import request
except ImportError:
    # Python 2
    import urllib2 as request

from bs4 import BeautifulSoup
import codecs
import time


def getSoup(url):
    response = request.urlopen(url)
    body = response.read()
    # Parse HTML
    return BeautifulSoup(body, 'lxml')


wait_sec = 3
domain = 'http://hoge.com'
result_file = 'list.txt'

i = 1
while True:
    # Pages are numbered sequentially, zero-padded to two digits: /01/, /02/, ...
    url = '{domain}/{index:0>2}/'.format(domain=domain, index=i)
    try:
        soup = getSoup(url)
    except IOError:
        # urlopen raises an HTTP error (a subclass of IOError) once the
        # numbered pages run out, so this is the loop's exit condition
        break
    # Collect the image URLs inside the target div
    div = soup.find('div', attrs={'id': 'div_id'})
    all_a = div.find_all('a', attrs={'class': 'a_class'})
    src_list = []
    for a in all_a:
        src_list.append(a.img['src'])
    # Append to the list file; the trailing '\n' keeps one page's batch
    # from running into the next across iterations
    with codecs.open(result_file, 'a', 'utf-8') as f:
        f.write('\n'.join(src_list) + '\n')
    print(i)
    i += 1
    time.sleep(wait_sec)
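The list built above is meant for batch-downloading afterwards. A minimal sketch of that step might look like the following; the script name, output directory, and file-naming scheme are my own assumptions, and it reuses the same Python 2/3 fallback import as scraper.py:

download.py
# -*- coding: utf-8 -*-
from __future__ import print_function

import os
import time

try:
    # Python 3
    from urllib import request
except ImportError:
    # Python 2
    import urllib2 as request

out_dir = 'images'  # assumed output directory
wait_sec = 3

if not os.path.isdir(out_dir):
    os.makedirs(out_dir)

with open('list.txt') as f:
    urls = [line.strip() for line in f if line.strip()]

for url in urls:
    # Name each file after the last path component of its URL
    # (assumes the image URLs end in a filename)
    filename = os.path.join(out_dir, url.rsplit('/', 1)[-1])
    body = request.urlopen(url).read()
    with open(filename, 'wb') as out:
        out.write(body)
    print(url)
    time.sleep(wait_sec)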