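# A camelot-based version of this script was also written:
# https://qiita.com/barobaro/items/7ecfd6d3b24e3bbd6477
# The table region of the PDF appears to be fixed, so this version extracts
# the table by specifying its position explicitly.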
import datetime
import pathlib
import re
from urllib.parse import urljoin
import pandas as pd
import pdfplumber
import requests
from bs4 import BeautifulSoup
# Request headers sent with HTTP requests; carries a browser-style
# User-Agent string instead of the library default.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64; "
        "Trident/7.0; rv:11.0) like Gecko"
    ),
}
def fetch_soup(url, parser="html.parser", timeout=30):
    """Download a page and parse it with BeautifulSoup.

    Args:
        url: Address of the page to fetch.
        parser: Name of the BeautifulSoup parser to use.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        The parsed document as a ``BeautifulSoup`` object.

    Raises:
        requests.HTTPError: If the response status indicates an error.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    # Without a timeout, requests waits indefinitely on a stalled server.
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()
    return BeautifulSoup(r.content, parser)
def fetch_file(url, dir=".", timeout=30):
    """Download ``url`` and save it under ``dir``.

    The file name is the last path component of ``url``.

    Args:
        url: Address of the file to download.
        dir: Directory to save into; created (with parents) if missing.
            The name shadows the builtin but is kept for existing callers.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        ``pathlib.Path`` of the saved file.

    Raises:
        requests.HTTPError: If the response status indicates an error.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    p = pathlib.Path(dir, pathlib.PurePath(url).name)
    p.parent.mkdir(parents=True, exist_ok=True)
    # Send the same headers as fetch_soup so both requests look alike to the
    # server, and bound the wait so a stalled download cannot hang forever.
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()
    p.write_bytes(r.content)
    return p
# Scraping: locate the link to the indicator PDF on the portal page.
url = "https://www.fukushihoken.metro.tokyo.lg.jp/iryo/kansen/corona_portal/info/kunishihyou.html"
soup = fetch_soup(url)
# `string=` is the current name of the argument formerly spelled `text=`.
tag = soup.find("a", class_="pdf", string=re.compile("^国のステージ判断のための指標"))
if tag is None:
    raise RuntimeError(f"PDF link not found on {url}")
link = urljoin(url, tag.get("href"))
p = fetch_file(link)

# Coordinates of the table grid inside the PDF page (explicit cell borders).
table_settings = {
    "vertical_strategy": "explicit",
    "explicit_vertical_lines": [236, 367, 502, 605, 682],
    "horizontal_strategy": "explicit",
    "explicit_horizontal_lines": [45, 90, 145, 202, 260, 317, 376, 430, 493],
}

# Open the PDF in a context manager so the file handle is always released.
with pdfplumber.open(p) as pdf:
    page = pdf.pages[0]
    # Date: read the text inside the fixed title region of the page.
    crop = page.within_bbox((26, 7, 375, 39))
    title = crop.extract_text()
    print(title)
    table = page.extract_table(table_settings)

# Raw string keeps `\d` a regex escape rather than an invalid string escape.
m = re.search(r"(\d{1,2})月(\d{1,2})日", title or "")
if m is None:
    raise ValueError(f"no month/day found in PDF title: {title!r}")
month, day = map(int, m.groups())

# The title carries no year, so start from the current one. A date more than
# a week in the future (e.g. a December report fetched in January) is taken
# to belong to the previous year; the margin tolerates clock/timezone skew.
dt_now = datetime.datetime.now()
latest = datetime.date(dt_now.year, month, day)
if latest - dt_now.date() > datetime.timedelta(days=7):
    latest = latest.replace(year=latest.year - 1)

if not table:
    raise ValueError("no table found at the expected position in the PDF")

# First extracted row is the header; the remaining rows get fixed labels.
df0 = pd.DataFrame(
    table[1:],
    index=["新規報告者数", "感染経路不明割合", "PCR陽性率", "療養者数", "病床全体", "入院率", "うち重症者用病床"],
    columns=table[0],
)
# Strip line breaks inside cells; unlike a per-cell str.replace this leaves
# non-string cells (such as None for an empty cell) untouched.
df1 = df0.replace("\n", "", regex=True)
df1.to_csv(f"{latest.isoformat()}.csv", encoding="utf_8_sig")