More than 1 year has passed since last update.

オークションサイトから、seleniumを使ってスクレイピングし、googleカレンダーに反映

Posted at 2023-06-05

オークションサイトから日時・開催者・オークション名・概要をスクレイピングし、Googleカレンダーに反映させるプログラムです。
開始日時と終了日時を適切な形式に変換したり、JavaScriptで動的に表示される部分をSeleniumでスクレイピングしたりと、いくつか関門がありました。

各コードの説明

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from googleapiclient.discovery import build
from google.oauth2 import service_account

必要なライブラリのインポート

# Google Calendar API authentication settings.
# Scope requested when building the service-account credentials.
SCOPES = [
    'https://www.googleapis.com/auth/calendar',
]
# Path to the service-account private key file (replace with your own path).
SERVICE_ACCOUNT_FILE = 'service_accountのコピー.json'

googleカレンダーAPIを認証

# Load the service-account key file and build credentials for SCOPES.
credentials = service_account.Credentials.from_service_account_file(
    SERVICE_ACCOUNT_FILE,
    scopes=SCOPES,
)

# Create the Google Calendar API (v3) client used to insert events.
service = build(
    'calendar',
    'v3',
    credentials=credentials,
)

APIオブジェクトを作成

# Fetch and parse the auction index page.
base_url = 'https://www.numisbids.com/'
# A timeout keeps the script from hanging forever on a stalled connection.
response = requests.get(base_url, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page
# (which would just yield an empty auction list).
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

ページの読み込み

# Table cells that hold the relative URL of each auction's detail page.
auctions = soup.find_all('td', class_='firmcell')

オークションの詳細ページの各相対urlが含まれる要素を取得

# Collect the absolute URL of every auction detail page.
urls = []
for auction in auctions:
    a_tag = auction.find('a')
    href = a_tag.get('href') if a_tag else None
    if not href:
        # A cell without a link has no detail page; urljoin() with an empty
        # href would return base_url and the home page would be scraped as
        # if it were an auction.
        continue
    # Resolve the relative href against the site root.
    urls.append(urljoin(base_url, href))

各ページのurlを作成

# Scrape every auction detail page and register it as a Google Calendar event.
from datetime import timedelta  # needed to compute the exclusive end date

# Matches a single day ("5 Jun 2023") or a same-month range ("5-7 Jun 2023").
# Defined once here because it does not change between iterations.
date_pattern = r"(\d{1,2}-\d{1,2}|\d{1,2}) (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) \d{4}"

for url in urls:
    # A timeout keeps one stalled page from blocking the whole run.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')

    organizer_tag = soup.find('span', {'class': 'name'})
    div_tag = soup.find('div', {'class': 'text'})
    if organizer_tag is None or div_tag is None:
        # Page layout differs from what the scraper expects; skip it rather
        # than crash with AttributeError on None.
        print(f'Skipping {url}: expected page elements were not found')
        continue
    organizer = organizer_tag.text

    # The auction name is the <b> element inside the "text" div.
    b_tag = div_tag.find('b')
    auction_name = b_tag.text if b_tag else "Not found"

    # The date follows the first run of two non-breaking spaces in the div text.
    text_parts = div_tag.text.split('\xa0\xa0')
    date_match = re.search(date_pattern, text_parts[1]) if len(text_parts) > 1 else None
    if date_match is None:
        print(f'Skipping {url}: no auction date was found')
        continue
    date = date_match.group()

    # Split "5-7 Jun 2023" into first day, last day and "Jun 2023";
    # a single-day auction uses the same day for both.
    day_part, month_year = date.split(' ', 1)
    start_day, _, end_day = day_part.partition('-')
    if not end_day:
        end_day = start_day

    # NOTE: '%b' parses English month abbreviations only under an
    # English/C locale.
    first_day = datetime.strptime(f'{start_day} {month_year}', '%d %b %Y')
    last_day = datetime.strptime(f'{end_day} {month_year}', '%d %b %Y')

    start_date = first_day.strftime('%Y-%m-%d') + 'T00:00:00'
    # The Calendar API treats the end time as exclusive, so end at midnight
    # AFTER the last day. Ending at 00:00 of the last day would drop that day
    # and give single-day auctions a zero-length event.
    end_date = (last_day + timedelta(days=1)).strftime('%Y-%m-%d') + 'T00:00:00'

    # Start Chrome; replace the argument with the path to your chromedriver.
    # NOTE(review): newer Selenium 4 releases may not accept the driver path
    # as a positional argument - confirm against the installed version.
    driver = webdriver.Chrome('./chromedriver')
    try:
        driver.set_window_size(1024, 768)
        driver.get(url)

        # Up to 10 seconds for each wait below.
        wait = WebDriverWait(driver, 10)

        # The sale details are only rendered after clicking this link.
        details_button = wait.until(
            EC.element_to_be_clickable((By.XPATH, '//a[text()="Sale Details/Timetable"]'))
        )
        details_button.click()

        # Read the text of the modal once it becomes visible.
        modal_text = wait.until(
            EC.visibility_of_element_located((By.XPATH, '//div[@class="body-full"]'))
        ).text
    finally:
        # Always close the browser, even when a wait times out, so Chrome
        # processes do not pile up.
        driver.quit()

    event = {
        'summary': auction_name,
        'description': f'{organizer}\n\n{modal_text}',
        'start': {
            'dateTime': start_date,
            'timeZone': 'Asia/Tokyo'
        },
        'end': {
            'dateTime': end_date,
            'timeZone': 'Asia/Tokyo'
        }
    }

    # Add the event to the calendar; replace calendarId with your own calendar's ID.
    event = service.events().insert(calendarId='gaku2002bambo@gmail.com', body=event).execute()

    print(f'イベントが作成されました: {event.get("htmlLink")}')

各URLごとに情報を取得し、カレンダーに登録します。
日付は、テキストを空白で分割して取り出したうえで、2日以上にわたるオークションと1日だけのオークションとで分岐して開始日時と終了日時を求め、最終的にカレンダーAPIで使える形式に変換しています。
概要はボタンをクリックすると表示されるため、Seleniumで操作して取得しています。

全体コード

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from googleapiclient.discovery import build
from google.oauth2 import service_account

# Google Calendar API authentication settings.
SCOPES = ['https://www.googleapis.com/auth/calendar']
SERVICE_ACCOUNT_FILE = 'service_accountのコピー.json'  # set this to the path of your service-account private key file

# Load the service-account key and build credentials for SCOPES.
credentials = service_account.Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE, scopes=SCOPES)

# Create the Google Calendar API (v3) client.
service = build('calendar', 'v3', credentials=credentials)

# TODO: turn this section into a function later.
# Fetch and parse the auction index page.
base_url = 'https://www.numisbids.com/'
# A timeout keeps the script from hanging forever on a stalled connection.
response = requests.get(base_url, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# Table cells that hold the relative URL of each auction's detail page.
auctions = soup.find_all('td', {'class': 'firmcell'})

# Collect the absolute URL of every auction detail page.
urls = []
for auction in auctions:
    a_tag = auction.find('a')
    href = a_tag.get('href') if a_tag else None
    if not href:
        # A cell without a link has no detail page; urljoin() with an empty
        # href would return base_url and the home page would be scraped as
        # if it were an auction.
        continue
    # Resolve the relative href against the site root.
    urls.append(urljoin(base_url, href))

# Scrape every auction detail page and register it as a Google Calendar event.
from datetime import timedelta  # needed to compute the exclusive end date

# Matches a single day ("5 Jun 2023") or a same-month range ("5-7 Jun 2023").
# Defined once here because it does not change between iterations.
date_pattern = r"(\d{1,2}-\d{1,2}|\d{1,2}) (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) \d{4}"

for url in urls:
    # A timeout keeps one stalled page from blocking the whole run.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')

    organizer_tag = soup.find('span', {'class': 'name'})
    div_tag = soup.find('div', {'class': 'text'})
    if organizer_tag is None or div_tag is None:
        # Page layout differs from what the scraper expects; skip it rather
        # than crash with AttributeError on None.
        print(f'Skipping {url}: expected page elements were not found')
        continue
    organizer = organizer_tag.text

    # The auction name is the <b> element inside the "text" div.
    b_tag = div_tag.find('b')
    auction_name = b_tag.text if b_tag else "Not found"

    # The date follows the first run of two non-breaking spaces in the div text.
    text_parts = div_tag.text.split('\xa0\xa0')
    date_match = re.search(date_pattern, text_parts[1]) if len(text_parts) > 1 else None
    if date_match is None:
        print(f'Skipping {url}: no auction date was found')
        continue
    date = date_match.group()

    # Split "5-7 Jun 2023" into first day, last day and "Jun 2023";
    # a single-day auction uses the same day for both.
    day_part, month_year = date.split(' ', 1)
    start_day, _, end_day = day_part.partition('-')
    if not end_day:
        end_day = start_day

    # NOTE: '%b' parses English month abbreviations only under an
    # English/C locale.
    first_day = datetime.strptime(f'{start_day} {month_year}', '%d %b %Y')
    last_day = datetime.strptime(f'{end_day} {month_year}', '%d %b %Y')

    start_date = first_day.strftime('%Y-%m-%d') + 'T00:00:00'
    # The Calendar API treats the end time as exclusive, so end at midnight
    # AFTER the last day. Ending at 00:00 of the last day would drop that day
    # and give single-day auctions a zero-length event.
    end_date = (last_day + timedelta(days=1)).strftime('%Y-%m-%d') + 'T00:00:00'

    # Start Chrome; replace the argument with the path to your chromedriver.
    # NOTE(review): newer Selenium 4 releases may not accept the driver path
    # as a positional argument - confirm against the installed version.
    driver = webdriver.Chrome('./chromedriver')
    try:
        driver.set_window_size(1024, 768)
        driver.get(url)

        # Up to 10 seconds for each wait below.
        wait = WebDriverWait(driver, 10)

        # The sale details are only rendered after clicking this link.
        details_button = wait.until(
            EC.element_to_be_clickable((By.XPATH, '//a[text()="Sale Details/Timetable"]'))
        )
        details_button.click()

        # Read the text of the modal once it becomes visible.
        modal_text = wait.until(
            EC.visibility_of_element_located((By.XPATH, '//div[@class="body-full"]'))
        ).text
    finally:
        # Always close the browser, even when a wait times out, so Chrome
        # processes do not pile up.
        driver.quit()

    event = {
        'summary': auction_name,
        'description': f'{organizer}\n\n{modal_text}',
        'start': {
            'dateTime': start_date,
            'timeZone': 'Asia/Tokyo'
        },
        'end': {
            'dateTime': end_date,
            'timeZone': 'Asia/Tokyo'
        }
    }

    # Add the event to the calendar; replace calendarId with your own calendar's ID.
    event = service.events().insert(calendarId='gaku2002bambo@gmail.com', body=event).execute()

    print(f'イベントが作成されました: {event.get("htmlLink")}')

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up