BeautifulSoup4でWEBスクレイピング
BeautifulSoup4でWEBスクレイピング(連番ページ)に引き続き、階層化されているページのためのコードを書いたのでメモ
ポイント
カテゴリ→ページ→欲しいファイルと順にリストを作成、処理していくと途中で中断しても再開しやすい
コード
scraper.py
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import codecs
import json
import os
import string
import time

try:
    # Python 3
    from urllib import request
    from urllib.error import URLError
except ImportError:
    # Python 2
    import urllib2 as request
    from urllib2 import URLError

from bs4 import BeautifulSoup
# Scheme + host of the site being scraped; site-relative paths are appended
# to it when absolute URLs are built.
domain = 'http://hoge.com'

# Seconds to sleep before every HTTP request.
wait_sec = 3

# Headers sent with every request. The previous value, 'Mozilla/5.0)', ended
# with an unbalanced ')' and was therefore a malformed User-Agent token.
headers = {'User-Agent': 'Mozilla/5.0'}

# All state files live in the directory the script is started from.
cwd = os.getcwd()
# Output: URLs extracted from the scraped pages.
result_file = os.path.join(cwd, 'result_url.txt')
# Work queue of category URLs still to be visited.
category_file = os.path.join(cwd, 'category.txt')
# Work queue of page URLs still to be visited.
page_file = os.path.join(cwd, 'page.txt')
def fetchSoup(url):
    """Download *url* and parse the response into a BeautifulSoup tree.

    Sleeps ``wait_sec`` seconds before each request, then sends the request
    with the module-level ``headers``.

    Args:
        url: Absolute URL to download.

    Returns:
        A ``BeautifulSoup`` object built with the ``lxml`` parser, or
        ``None`` when the request fails with ``URLError`` (the reason is
        printed). Callers must check for ``None``.
    """
    # Throttle before every request, including ones that end up failing.
    time.sleep(wait_sec)
    req = request.Request(url, headers=headers)
    print('open {url}'.format(url=url))
    try:
        response = request.urlopen(req)
    except URLError as e:
        # ``except URLError, e`` is Python-2-only syntax; ``as`` works on
        # Python 2.6+ and Python 3.
        print('error: {reason}'.format(reason=getattr(e, 'reason', e)))
        return None
    print('ok')
    try:
        body = response.read()
    finally:
        # Release the connection even if reading the body fails.
        response.close()
    return BeautifulSoup(body, 'lxml')
def getUrl(src):
    """Return *src* with the module-level ``domain`` prepended.

    The two values are joined as-is; no separator is inserted and *src* is
    not inspected.
    """
    template = '{domain}{src}'
    return template.format(src=src, domain=domain)
def extractUrlFromTags(tags):
    """Collect absolute URLs from a sequence of ``<a>`` / ``<img>`` tags.

    ``<a>`` tags contribute their ``href`` and ``<img>`` tags their ``src``;
    every value is passed through ``getUrl``. Tags of any other name are
    ignored.

    Args:
        tags: Iterable of BeautifulSoup tags.

    Returns:
        A list of URL strings in the order the tags were given.
    """
    # Which attribute carries the link for each supported tag name.
    link_attr = {'a': 'href', 'img': 'src'}
    result = []
    for tag in tags:
        attr = link_attr.get(tag.name)
        if attr is None:
            continue
        # Tag.get() returns None instead of raising KeyError when the
        # attribute is absent (e.g. <a name="top">); skip such tags, and
        # empty values too, rather than aborting the whole crawl.
        src = tag.get(attr)
        if src:
            result.append(getUrl(src))
    return result
def saveUrl(file_name, url_list):
    """Append the URLs in *url_list* to *file_name*, one per line.

    The file is opened in append mode as UTF-8 and the written chunk always
    ends with a newline.
    """
    with codecs.open(file_name, mode='a', encoding='utf-8') as out:
        lines = '\n'.join(url_list)
        out.write('{list}\n'.format(list=lines))
def deleteFirstLine(file_name):
    """Remove the first line of *file_name*, rewriting the file in place.

    The file is read and written as UTF-8. If the file contains no newline
    at all, its whole content counts as the first line and the file ends up
    empty.

    Args:
        file_name: Path of the file to modify.
    """
    with codecs.open(file_name, 'r', 'utf-8') as f:
        content = f.read()
    # partition() leaves an empty remainder when there is no '\n'. The
    # previous ``content[content.find('\n') + 1:]`` kept the whole content in
    # that case (find() returned -1), so the line was never removed.
    remainder = content.partition('\n')[2]
    with codecs.open(file_name, 'w', 'utf-8') as f:
        f.write(remainder)
def fetchAllCategories():
    """Walk the paginated category index and queue every category URL.

    Requests ``{domain}/category_{page}/`` for page = 1, 2, ... and appends
    the links found inside ``<div id="list">`` to ``category_file``. It moves
    on to the next page only while the last link of the
    ``<div class="pagenation">`` block is labelled '>' or '>>'.

    The walk stops early if a page cannot be downloaded.
    """
    page = 1
    while True:
        url = '{domain}/category_{page}/'.format(domain=domain, page=page)
        soup = fetchSoup(url)
        if soup is None:
            # fetchSoup() returns None on a failed request; stop here
            # instead of crashing on ``None.find``.
            break

        listing = soup.find('div', id='list')
        if listing is not None:
            url_list = extractUrlFromTags(listing.find_all('a'))
            if url_list:
                saveUrl(category_file, url_list)

        # ``class`` is a reserved word in Python, so BeautifulSoup takes the
        # CSS class filter as ``class_``.
        pager = soup.find('div', class_='pagenation')
        pager_links = pager.find_all('a') if pager is not None else []
        # No pager, or a last link that is not a "next" arrow: last page.
        if not pager_links or pager_links[-1].string not in ['>', '>>']:
            break
        page += 1
def fetchCategory():
    """Return the category URL at the head of the category queue.

    If ``category_file`` does not exist yet, ``fetchAllCategories`` is run
    first to build it.

    Returns:
        The first line of ``category_file`` without its trailing newline, or
        ``''`` when the queue is empty or could not be created.
    """
    if not os.path.exists(category_file):
        fetchAllCategories()
    # The file is only written when the crawl found at least one URL, so it
    # can still be missing here; report "nothing queued" instead of letting
    # open() raise.
    if not os.path.exists(category_file):
        return ''
    with codecs.open(category_file, 'r', 'utf-8') as f:
        return f.readline().rstrip('\n')
def fetchAllPages():
    """Drain the category queue, queueing the page URLs of each category.

    For every URL returned by ``fetchCategory`` the category page is
    downloaded, the ``<a class="page">`` links are appended to ``page_file``,
    and the category is then removed from the head of ``category_file``.

    If a category cannot be downloaded the loop stops and that category
    stays at the head of the queue, so a later run retries it.
    """
    category = fetchCategory()
    while category != '':
        soup = fetchSoup(category)
        if soup is None:
            # Leave the failed category queued and give up for now.
            break
        # ``class`` is a reserved word in Python; BeautifulSoup uses
        # ``class_`` for the CSS class filter.
        pages = soup.find_all('a', class_='page')
        url_list = extractUrlFromTags(pages)
        if url_list:
            saveUrl(page_file, url_list)
        # Pop the category just processed, not a line of the page queue.
        deleteFirstLine(category_file)
        # Re-read the queue head into the loop variable so the loop ends
        # once the queue is empty.
        category = fetchCategory()
def fetchPage():
    """Return the page URL at the head of the page queue.

    While categories are still queued, ``fetchAllPages`` is run first to
    turn them into page URLs. No separate check for ``page_file`` is made
    before that call: ``fetchAllPages`` does nothing when no category is
    queued.

    Returns:
        The first line of ``page_file`` without its trailing newline, or
        ``''`` when the queue is empty or the file does not exist.
    """
    if fetchCategory() != '':
        fetchAllPages()
    # No page URL has ever been saved: nothing to return.
    if not os.path.exists(page_file):
        return ''
    # ``as f`` was missing from the original ``with`` statement, so the
    # following read failed with NameError.
    with codecs.open(page_file, 'r', 'utf-8') as f:
        return f.readline().rstrip('\n')
def fetchTargets():
    """Drain the page queue, recording the target image URLs of each page.

    For every URL returned by ``fetchPage`` the page is downloaded, the
    ``<img class="target">`` sources are appended to ``result_file``, and the
    page is then removed from the head of ``page_file``.

    If a page cannot be downloaded the loop stops and that page stays at the
    head of the queue, so a later run retries it.
    """
    page = fetchPage()
    while page != '':
        soup = fetchSoup(page)
        if soup is None:
            # Leave the failed page queued and give up for now.
            break
        # ``class`` is a reserved word in Python; BeautifulSoup uses
        # ``class_`` for the CSS class filter.
        targets = soup.find_all('img', class_='target')
        url_list = extractUrlFromTags(targets)
        if url_list:
            saveUrl(result_file, url_list)
        deleteFirstLine(page_file)
        page = fetchPage()
# Start the crawl only when the file is executed as a script, so importing
# the module does not trigger network requests.
if __name__ == '__main__':
    fetchTargets()
便利技
名前などがアルファベットでカテゴライズされているときに
# Lists of the single-character strings 'a'..'z' and 'A'..'Z'.
alphabet_l = list(string.ascii_lowercase)
alphabet_u = list(string.ascii_uppercase)
scriptタグから抜き出した変数などを処理するのに
# json_string is not defined in this snippet; it stands for JSON text, such
# as a variable pulled out of a <script> tag.
data = json.loads(json_string)
VPS等での実行時にバックグラウンドで処理、ログオフしても継続するために
$ nohup python scraper.py < /dev/null &
継続処理しているプロセスを確認するために
$ ps x