インストール
pip install pdfplumber
pip install japanera
pip install requests
pip install beautifulsoup4
pip install pandas
スクレイピング
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
def fetch_soup(url, parser="html.parser", request_headers=None, timeout=30):
    """Fetch *url* and return the response body parsed by BeautifulSoup.

    Args:
        url: Page URL to fetch.
        parser: Parser name passed to BeautifulSoup.
        request_headers: Optional headers dict; falls back to the
            module-level ``headers`` (same behavior as before).
        timeout: Seconds before the request is aborted — without it a
            stalled server would hang the script indefinitely.

    Raises:
        requests.HTTPError: on a non-2xx response (via raise_for_status).
    """
    r = requests.get(
        url,
        headers=headers if request_headers is None else request_headers,
        timeout=timeout,
    )
    r.raise_for_status()
    return BeautifulSoup(r.content, parser)
# Request headers: present an IE11-style User-Agent so the municipal site
# serves the normal desktop page.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}

url = "https://www.city.kawasaki.jp/nakahara/"
link = urljoin(url, "./page/0000088519.html")

soup = fetch_soup(link)

# Anchor on the h2 heading 等々力陸上競技場利用予定表, then take the PDF
# links from the attachment-file block immediately following it.
# NOTE: bs4 deprecated find(text=...) — string= is the current keyword.
anchors = (
    soup.find("h2", string="等々力陸上競技場利用予定表")
    .find_next_sibling("div", class_="mol_attachfileblock")
    .select("ul > li > a")
)
# One dict per attachment: absolute PDF URL + its visible label text.
pdf_links = [
    {"link": urljoin(url, a.get("href")), "text": a.get_text(strip=True)}
    for a in anchors
]
pdf_links
日付変換
# Build a datetime from the Japanese-era (wareki) date in the newest PDF's
# label text, e.g. "令和5年4月".
import re
from datetime import date
from japanera import Japanera, EraDate, EraDateTime

s = pdf_links[-1]["text"]
# Raw string is required: "\d" inside a plain literal is an invalid escape
# sequence (SyntaxWarning on Python 3.12+, error in later versions).
m = re.match(r"令和(\d{1,2})年(\d{1,2})月", s)
janera = Japanera()
# japanera 1.x: strptime() returns a list of candidate datetimes; %-E is
# the era name and %-O the year within the era.
# NOTE(review): japanera 2.x reworked this API — confirm installed version.
dt_date = janera.strptime(m.group(0), "%-E%-O年%m月")[0]
dt_date
PDF変換
import pathlib
import requests
import pdfplumber
import pandas as pd
def fetch_file(url, dir="."):
    """Download *url* into directory *dir* and return the local Path.

    The local file name is the last path component of the URL.  The target
    directory is created if needed, and an already-downloaded file is
    reused, so repeated calls do not hit the network again.

    NOTE(review): PurePath(url).name would keep any query string — fine for
    the plain ``.pdf`` links used here; confirm before reusing elsewhere.
    """
    p = pathlib.Path(dir, pathlib.PurePath(url).name)
    p.parent.mkdir(parents=True, exist_ok=True)
    if not p.exists():
        # Bound the request so a stalled server cannot hang the script.
        r = requests.get(url, timeout=60)
        r.raise_for_status()
        p.write_bytes(r.content)
    return p
# Download the newest PDF and extract every table on every page.
p = fetch_file(pdf_links[-1]["link"])

with pdfplumber.open(p) as pdf:
    dfs = []
    for page in pdf.pages:
        for table in page.extract_tables():
            # Rows 0-1 are header rows; the data starts at row 2.
            df = pd.DataFrame(table[2:], columns=["day", "曜日", "大会名", "午前", "午後"])
            # Combine the era-derived year/month with each row's day number.
            df["year"] = dt_date.year
            df["month"] = dt_date.month
            # NOTE(review): assumes the day column is always a plain digit
            # string — astype(int) raises on blanks; confirm on real PDFs.
            df["day"] = df["day"].astype(int)
            df["日付"] = pd.to_datetime(df[["year", "month", "day"]])
            # Blank event names become NaN.  Assign the masked result back
            # instead of Series.mask(..., inplace=True): in-place masking
            # of a column selection silently fails under pandas
            # Copy-on-Write and the inplace option is deprecated.
            df["大会名"] = df["大会名"].mask(df["大会名"] == "")
            # A missing afternoon slot inherits the morning entry.
            df["午後"] = df["午後"].mask(df["午後"].isna(), df["午前"])
            dfs.append(df.reindex(["日付", "曜日", "大会名", "午前", "午後"], axis=1))

# Main athletics field (メイン) schedule.
dfs[0]
dfs[0].to_csv(f"{dt_date:%Y%m}_main.csv", encoding="utf_8_sig", index=False)

# Auxiliary field (補助) schedule.
dfs[1]
dfs[1].to_csv(f"{dt_date:%Y%m}_sub.csv", encoding="utf_8_sig", index=False)