1
2

Delete article

Deleted articles cannot be recovered.

The draft of this article will also be deleted.

Are you sure you want to delete this article?

More than 3 years have passed since last update.

pythonでwebスクレイピング初心者

Last updated at Posted at 2020-04-08

なに

特定のヒトへの共有メモ。

初心者がpythonでwebスクレイピングする時、偉大な先輩方のトライアルを再現しようと奮闘した物語。

やりたいこと

『Pythonを使ってGoogleスプレッドシートを自動で読み書きする, id:temceeさん』を再現したいと思う。

準備

1: phantomjsを使えるようにする

『Windows 7にPhantomJSをインストールする方法, maechabinさん』 を参考に。

2: Google APIの有効化とkeyの発行

『Pythonを使ってGoogleスプレッドシートを自動で読み書きする, id:temceeさん』に従って、keyが保存されたjsonを獲得する。
このとき、

3: 実行

ちょっと書き換え。

# coding=utf-8
import os
import json
import gspread
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from oauth2client.service_account import ServiceAccountCredentials


# Load the service-account key file once at import time so that its fields
# can be exposed as module constants below.  The leading "{...}" segment of
# the path is a placeholder and must be replaced with the real directory.
# A context manager is used so the file handle is closed right after parsing
# instead of being left open for the lifetime of the process.
with open("{環境に応じて書き換え}/spread_sheet_credential.json", "r", encoding="utf-8") as credential_fp:
    cred_info = json.load(credential_fp)


# OAuth scope passed to ServiceAccountCredentials in access_to_sheet().
SCOPE_URL = 'https://spreadsheets.google.com/feeds'
# Key file name passed to ServiceAccountCredentials in access_to_sheet().
# NOTE(review): this is a relative path while the load above uses a
# placeholder directory -- confirm both resolve to the same file.
CREDENTIAL_FILE_NAME = 'spread_sheet_credential.json'
# Not referenced anywhere in the visible script.
TEMPLATE_FILE_NAME = 'spread_sheet_credential_template.txt'
# Individual fields copied out of the parsed key file.
SHEET_PROJECT_ID = cred_info['project_id']
SHEET_PRIVATE_KEY_ID = cred_info['private_key_id']
SHEET_PRIVATE_KEY = cred_info['private_key']
SHEET_CLIENT_EMAIL = cred_info['client_email']
SHEET_CLIENT_ID = cred_info['client_id']
SHEET_CLIENT_X509_CERT_URL = cred_info['client_x509_cert_url']


def write_news(sheet, link, max_loop_count):
    """Scrape consecutive listing pages and append their news to ``sheet``.

    Starting at ``link``, each visited page is handed to
    ``write_techcrunch_news_elements`` and the browser is then advanced with
    ``access_to_next``.

    Args:
        sheet: Worksheet object that receives the scraped rows.
        link: URL of the first listing page to open.
        max_loop_count: Maximum number of pages to process.
    """
    driver = webdriver.PhantomJS()
    try:
        driver.get(link)
        loop_count = 0
        while loop_count < max_loop_count:
            loop_count += 1
            print('-------------- {}回目のページアクセス... --------------'.format(loop_count))
            # Write the current page's articles to the spreadsheet.
            write_techcrunch_news_elements(driver, sheet)
            # Only move on when another page will actually be processed;
            # navigating after the final page is wasted work and could raise
            # after all data has already been written.
            if loop_count < max_loop_count:
                # access_to_next may hand back a brand-new driver instance.
                driver = access_to_next(driver)
    finally:
        # Always shut the headless browser down so no PhantomJS process is
        # left running, even when scraping fails part-way through.
        driver.quit()


def access_to_next(driver):
    """Advance ``driver`` to the next listing page and return the driver to use.

    The page's '次へ' link is clicked.  If the click raises (the original
    author's comment names timeouts as the expected cause), the old browser
    is discarded and a fresh PhantomJS instance is opened directly on the
    computed URL of the next page.

    Args:
        driver: WebDriver currently showing a listing page.

    Returns:
        The same driver after a successful click, otherwise a new driver
        that has loaded the next page.
    """
    next_link = driver.find_element_by_link_text('次へ')
    # Read the URL before clicking: after a failed click the session may no
    # longer answer reliably.
    current_url = driver.current_url
    try:
        next_link.click()
    except Exception:
        # The fallback URL is only computed when it is needed, so a URL
        # without a '/page/N' part no longer breaks a click that succeeds.
        next_url = _build_next_page_url(current_url)
        print('Timeoutが発生したので新しく「{}」にアクセスする。'.format(next_url))
        try:
            # Best effort: release the abandoned browser process.
            driver.quit()
        except Exception:
            # The old session is being thrown away anyway; a failure to shut
            # it down must not prevent opening the replacement.
            pass
        driver = webdriver.PhantomJS()
        driver.get(next_url)
    return driver


def _build_next_page_url(url, page_content='/page/'):
    """Return the URL of the listing page that follows ``url``."""
    base, separator, remainder = url.partition(page_content)
    if not separator:
        # No page number in the URL: treat it as page 1, so the next is 2.
        return url.rstrip('/') + page_content + '2'
    current_page = int(remainder.split('/')[0])
    return base + page_content + str(current_page + 1)


def write_techcrunch_news_elements(driver, sheet):
    """Collect title/datetime pairs from the current page and write them out.

    Every element with class ``river-block`` is inspected.  Blocks containing
    an ``ad-contain`` element are reported as ads; blocks from which a title
    link and a ``time`` element cannot be read are reported as sponsored.
    Everything else is gathered into a dict that is passed to
    ``write_to_sheet``.

    Args:
        driver: WebDriver showing the listing page to read.
        sheet: Worksheet forwarded to ``write_to_sheet``.
    """
    # Set the driver's page-load timeout to 10 seconds.
    driver.set_page_load_timeout(10)
    collected = {}
    river_blocks = driver.find_elements_by_class_name('river-block')
    for position, entry in enumerate(river_blocks, start=1):
        print('----- {}個目のriver-blockは... -----'.format(position))
        try:
            advert = entry.find_element_by_class_name('ad-contain')
        except Exception:
            # No ad marker found: try to read the block as a regular article.
            try:
                heading = entry.find_element_by_class_name('post-title')
                headline = heading.find_element_by_tag_name('a').text
                published = entry.find_element_by_tag_name('time').get_attribute('datetime')
                collected[headline] = published
                print('News No.{} title:{} date:{}'.format(position, headline, published))
            except Exception:
                print('スポンサー記事だった.')
        else:
            if advert is not None:
                print('広告だった.??')
    write_to_sheet(sheet, collected)


def write_to_sheet(sheet, dict):
    """Append the entries of ``dict`` below the last used row of column A.

    Keys are written to column A and the matching values to column B, one
    entry per row.

    Args:
        sheet: Worksheet providing ``col_values``, ``range`` and
            ``update_cells``.
        dict: Mapping to write.  The parameter name shadows the builtin; it
            is kept unchanged so existing callers continue to work.
    """
    if not dict:
        # Nothing to write: skip the API round trips entirely.
        return
    keys = list(dict.keys())
    values = list(dict.values())
    # First free row = number of values already present in column A, plus one.
    start_row_num = len(sheet.col_values(1)) + 1
    # Inclusive end row.  Previously this was one row too far, so every write
    # requested (and re-sent) an extra cell below the data.
    end_row_num = start_row_num + len(keys) - 1
    start_row = str(start_row_num)
    end_row = str(end_row_num)
    # Write to the spreadsheet.
    update_cells_with_list(sheet, 'A' + start_row, 'A' + end_row, keys, value_input_option='USER_ENTERED')
    update_cells_with_list(sheet, 'B' + start_row, 'B' + end_row, values, value_input_option='USER_ENTERED')


def access_to_sheet(gid):
    """Authorize with the service-account key and open a spreadsheet.

    Args:
        gid: Key of the spreadsheet, passed to ``client.open_by_key``.

    Returns:
        The object returned by ``client.open_by_key(gid)``.
    """
    # The key file is read by from_json_keyfile_name itself; the handle that
    # used to be opened here was never used or closed, so it was removed.
    credentials = ServiceAccountCredentials.from_json_keyfile_name(CREDENTIAL_FILE_NAME, SCOPE_URL)
    client = gspread.authorize(credentials)
    return client.open_by_key(gid)


def update_cells_with_list(sheet, from_cell, to_cell, id_list, value_input_option):
    """Fill the cell range ``from_cell:to_cell`` with values and push the update.

    Cells and values are paired in order.  Cells without a matching value,
    and cells whose value is ``None``, keep the content returned by
    ``sheet.range``.

    Args:
        sheet: Worksheet providing ``range`` and ``update_cells``.
        from_cell: First cell of the range, e.g. ``'A2'``.
        to_cell: Last cell of the range, e.g. ``'A10'``.
        id_list: Values to write, in cell order.
        value_input_option: Forwarded unchanged to ``sheet.update_cells``.
    """
    cell_list = sheet.range('{}:{}'.format(from_cell, to_cell))
    # zip stops at the shorter sequence, which replaces the former manual
    # counter and the broad ``except Exception`` used as a bounds check
    # (that handler also hid unrelated errors such as non-indexable input).
    for cell, val in zip(cell_list, id_list):
        if val is not None:
            cell.value = val
    print('{}から{}まで書き込むよ'.format(from_cell, to_cell))
    sheet.update_cells(cell_list, value_input_option=value_input_option)


# Key of the spreadsheet to write to (passed to access_to_sheet below).
# NOTE: the right-hand sides of the next two assignments are placeholders
# from the article, not working values; replace each with a quoted string
# before running the script.
sheet_gid = {スプレッドシートのID}
sheet_name = スプレッドシートの書き込み先シート名
# First listing page to scrape.
target_link = 'https://jp.techcrunch.com/page/149/'
# Maximum number of listing pages to process.
max_loop_count = 50
# Open the target worksheet; write_to_sheet() later works out the first free
# row from the contents of column A.
sheet = access_to_sheet(sheet_gid).worksheet(sheet_name)
write_news(sheet, target_link, max_loop_count)
1
2
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
1
2

Delete article

Deleted articles cannot be recovered.

The draft of this article will also be deleted.

Are you sure you want to delete this article?