- Camelot could not extract the tables, so the PDF is converted with pdfplumber instead (a rough sketch of the attempt is shown after this list)
- The year and month are omitted in some of the date cells, so they are forward-filled from earlier rows
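For reference, the failed Camelot attempt presumably looked something like the sketch below. camelot.read_pdf is Camelot's standard entry point; the file name and the pages argument here are assumptions, not what was actually run.

import camelot

# Assumed attempt: Camelot could not produce a usable table from this PDF
tables = camelot.read_pdf("suginami.pdf", pages="all")  # file name is hypothetical
print(tables.n)   # number of tables Camelot detected
# tables[0].df    # each detected table as a pandas DataFrame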
Program
import pathlib
import re
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
import pdfplumber
import pandas as pd
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}
def fetch_soup(url, parser="html.parser"):
    # Fetch the page and parse it with BeautifulSoup
    r = requests.get(url, headers=headers)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, parser)
    return soup
def fetch_file(url, dir="."):
    # Download the file at url into dir and return its local path
    p = pathlib.Path(dir, pathlib.PurePath(url).name)
    p.parent.mkdir(parents=True, exist_ok=True)
    r = requests.get(url)
    r.raise_for_status()
    with p.open(mode="wb") as fw:
        fw.write(r.content)
    return p
def str2date(s):
    # Extract the numbers from a date string; pad with None so the last
    # three elements are always [year, month, day]
    lst = [None, None] + list(map(int, re.findall(r"\d+", s)))
    return lst[-3:]
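# Illustrative behaviour (the exact date strings in the PDF are an assumption;
# something like Reiwa-era "2年1月24日", "1月25日" or day-only "26日" is expected):
#   str2date("2年1月24日") -> [2, 1, 24]
#   str2date("1月25日")    -> [None, 1, 25]
#   str2date("26日")       -> [None, None, 26]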
url = "https://www.city.suginami.tokyo.jp/news/kansensho/covid-19/1058987.html"
soup = fetch_soup(url)
tag = soup.select_one("ul.objectlink > li.pdf > a")
link = urljoin(url, tag.get("href"))
p = fetch_file(link)
with pdfplumber.open(p) as pdf:
    dfs = []
    for page in pdf.pages:
        for table in page.extract_tables():
            # The first column holds the headers ("日にち", "感染者数"), so transpose after set_index
            df_tmp = pd.DataFrame(table).set_index(0).T
            # Keep only tables with more than two data rows
            if df_tmp.shape[0] > 2:
                # df_tmp.set_axis(["日にち", "感染者数"], axis=1, inplace=True)
                dfs.append(df_tmp)
df0 = pd.concat(dfs).reset_index(drop=True)
# Keep data rows only: the count ends with "人" and the date is not a "計" (total) row
df1 = df0[df0["感染者数"].str.endswith("人") & (~df0["日にち"].str.endswith("計"))].copy()
# Convert the dates
df_date = (
    df1["日にち"]
    .apply(str2date)
    .apply(pd.Series)
    .rename(columns={0: "year", 1: "month", 2: "day"})
    .ffill()
    .astype(int)
)
# Convert the Reiwa era year to the Western calendar year (Reiwa 1 = 2019)
df_date["year"] = df_date["year"] + 2018
df1["日にち"] = pd.to_datetime(df_date, errors="coerce")
df1["感染者数"] = df1["感染者数"].str.rstrip("人").str.replace(",", "").astype(int)
df1.to_csv("suginami.csv", encoding="utf_8_sig")
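To sanity-check the output, the CSV can be read back like this (a minimal sketch; the column names 日にち and 感染者数 come from the PDF table):

import pandas as pd

# Read the exported CSV, using the first column as the index and parsing the dates
df = pd.read_csv("suginami.csv", index_col=0, parse_dates=["日にち"])
print(df.dtypes)
print(df.head())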