LoginSignup
12
13

More than 5 years have passed since last update.

BeautifulSoup4でWEBスクレイピング(階層化ページ)

Last updated at Posted at 2016-07-09

BeautifulSoup4でWEBスクレイピング

BeautifulSoup4でWEBスクレイピング(連番ページ)に引き続き、階層化されているページのためのコードを書いたのでメモ

ポイント

カテゴリ→ページ→欲しいファイルと順にリストを作成、処理していくと途中で中断しても再開しやすい

コード

scraper.py
# -*- coding: utf-8 -*-

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import codecs
import json
import os
import string
import time

try:
    # Python 3
    from urllib import request
    from urllib.error import URLError
except ImportError:
    # Python 2
    import urllib2 as request
    from urllib2 import URLError

from bs4 import BeautifulSoup

# Scraper configuration: target site, throttle, and work-queue file paths.
domain = 'http://hoge.com'  # root of the site being scraped (placeholder host)
wait_sec = 3  # polite delay between HTTP requests, in seconds
# NOTE(review): the stray ')' at the end of the User-Agent looks like a typo —
# harmless to servers, but confirm it is intentional.
headers = { 'User-Agent' : 'Mozilla/5.0)' }
cwd = os.getcwd()
result_file = cwd + '/result_url.txt'  # output: collected target image URLs
category_file = cwd + '/category.txt'  # queue: category URLs still to process
page_file = cwd + '/page.txt'  # queue: page URLs still to process

def fetchSoup(url):
    """Fetch *url* and return it parsed as a BeautifulSoup tree.

    Sleeps wait_sec seconds before every request to throttle the scrape.
    Returns None (after logging the reason) when the request fails.
    """
    time.sleep(wait_sec)

    req = request.Request(url, headers = headers)
    try:
        print('open {url}'.format(url = url))
        response = request.urlopen(req)
        print('ok')
        body = response.read()
        return BeautifulSoup(body, 'lxml')
    # 'except URLError, e' is Python 2-only syntax (SyntaxError on Python 3);
    # 'except ... as e' works on both. URLError is imported in the deps block.
    except URLError as e:
        print('error: {reason}'.format(reason = e.reason))
        return None

def getUrl(src):
    """Build an absolute URL by prefixing the site domain onto *src*."""
    template = '{domain}{src}'
    return template.format(domain = domain, src = src)

def extractUrlFromTags(tags):
    """Collect absolute URLs from a sequence of tags.

    <a> tags contribute their href, <img> tags their src; every other
    tag is ignored.
    """
    attr_by_tag = {'a': 'href', 'img': 'src'}
    urls = []
    for tag in tags:
        attr = attr_by_tag.get(tag.name)
        if attr is not None:
            urls.append(getUrl(tag[attr]))
    return urls

def saveUrl(file_name, url_list):
    """Append the URLs in *url_list* to *file_name*, one per line (UTF-8)."""
    joined = '\n'.join(url_list)
    with codecs.open(file_name, 'a', 'utf-8') as out:
        out.write('{list}\n'.format(list = joined))

def deleteFirstLine(file_name):
    """Remove the first line of *file_name*, rewriting the file in place."""
    with codecs.open(file_name, 'r', 'utf-8') as src:
        remainder = src.read()
    # Everything after the first newline; if there is none, find() returns -1
    # and the slice starts at 0, keeping the content unchanged.
    remainder = remainder[remainder.find('\n') + 1:]
    with codecs.open(file_name, 'w', 'utf-8') as dst:
        dst.write(remainder)

def fetchAllCategories():
    """Walk the numbered category index pages, appending every category
    URL found to category_file.

    Stops when the pager's last link is no longer a next-page marker
    ('>' or '>>'), i.e. the final index page has been reached, or when a
    page cannot be fetched.
    """
    page = 1
    while True:
        url = '{domain}/category_{page}/'.format(domain = domain, page = page)
        soup = fetchSoup(url)
        if soup is None:
            # fetchSoup already logged the failure; stop instead of crashing.
            break
        categories = soup.find('div', id = 'list').find_all('a')
        url_list = extractUrlFromTags(categories)
        if len(url_list):
            saveUrl(category_file, url_list)
        # 'class' is a Python keyword, so BeautifulSoup spells the HTML class
        # filter 'class_' — the original 'class =' is a SyntaxError.
        page_list_last = soup.find('div', class_ = 'pagenation').find_all('a')[-1].string
        if page_list_last not in ['>', '>>']:
            break
        page += 1

def fetchCategory():
    """Return the first category URL queued in category_file.

    Builds the queue first (via fetchAllCategories) when the file does
    not exist yet; returns '' once the queue is exhausted.
    """
    if not os.path.exists(category_file):
        fetchAllCategories()
    with codecs.open(category_file, 'r', 'utf-8') as queue:
        first_line = queue.readline()
    return first_line.rstrip('\n')

def fetchAllPages():
    """Drain the category queue, collecting page URLs into page_file.

    For each queued category, scrapes its page links, appends them to
    page_file, then removes the processed category from category_file.
    """
    category = fetchCategory()
    while category != '':
        soup = fetchSoup(category)
        if soup is not None:
            # 'class_' because 'class' is a reserved word in Python.
            pages = soup.find_all('a', class_ = 'page')
            # Original called a non-existent 'extractUrlFromTag' (NameError)
            # and 'saveUrl(page_file)' without the URL list (TypeError).
            url_list = extractUrlFromTags(pages)
            if len(url_list):
                saveUrl(page_file, url_list)
        # Consume the processed entry from the CATEGORY queue (the original
        # deleted from page_file, leaving category_file full forever), then
        # advance the loop variable (the original assigned 'small_category',
        # so 'category' never changed — an infinite loop).
        deleteFirstLine(category_file)
        category = fetchCategory()

def fetchPage():
    """Return the first page URL queued in page_file.

    Refills the page queue from the category queue when needed; returns
    '' once every queued page has been consumed.
    """
    if os.path.exists(page_file) or fetchCategory() != '':
        fetchAllPages()
    # The original was missing 'as f', so f.readline() raised NameError.
    with codecs.open(page_file, 'r', 'utf-8') as f:
        result = f.readline().rstrip('\n')
    return result

def fetchTargets():
    """Drain the page queue, appending target image URLs to result_file.

    Each page is scraped for <img class="target"> tags; the page is
    removed from the queue whether or not it yielded results, so a rerun
    resumes where it stopped.
    """
    page = fetchPage()
    while page != '':
        soup = fetchSoup(page)
        if soup is not None:
            # 'class_' because 'class' is a reserved word in Python —
            # the original 'class =' is a SyntaxError.
            targets = soup.find_all('img', class_ = 'target')
            url_list = extractUrlFromTags(targets)
            if len(url_list):
                saveUrl(result_file, url_list)
        deleteFirstLine(page_file)
        page = fetchPage()

fetchTargets()

便利技

名前などがアルファベットでカテゴライズされているときに

alphabet_l = list(string.ascii_lowercase)
alphabet_u = list(string.ascii_uppercase)

scriptタグから抜き出した変数などを処理するのに

data = json.loads(json_string)

VPS等での実行時にバックグラウンドで処理、ログオフしても継続するために

$ nohup python scraper.py < /dev/null &

継続処理しているプロセスを確認するために

$ ps x
12
13
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
12
13