# 東京都福祉保健局の新型コロナウイルス感染症の都内感染者の状況のPDFをCSVに変換
import pathlib
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import pandas as pd
import pdfplumber
import requests
from tqdm.notebook import tqdm
def fetch_file(url, dir="."):
    """Download *url* and save it under *dir* (created if missing).

    Parameters
    ----------
    url : str
        URL of the file to download.
    dir : str or os.PathLike, optional
        Destination directory (default: current directory). The name
        shadows the ``dir`` builtin but is kept for caller compatibility.

    Returns
    -------
    pathlib.Path
        Path of the saved file (named after the last URL path segment).

    Raises
    ------
    requests.HTTPError
        If the server responds with an error status.
    """
    # Timeout guards against a hung connection (the original had none).
    r = requests.get(url, timeout=60)
    r.raise_for_status()
    # Keep the file name taken from the URL path.
    p = pathlib.Path(dir, pathlib.PurePath(url).name)
    p.parent.mkdir(parents=True, exist_ok=True)
    # write_bytes replaces the manual open/write/close dance.
    p.write_bytes(r.content)
    return p
# Scraping target: Tokyo Metropolitan Government COVID-19 patient status page.
url = "https://www.fukushihoken.metro.tokyo.lg.jp/iryo/kansen/todokedehcyouseisya.html"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}
r = requests.get(url, headers=headers, timeout=60)
r.raise_for_status()
soup = BeautifulSoup(r.content, "html.parser")
# First PDF link inside the main file-link paragraph.
tag = soup.select_one("div#main p.filelink > a.pdf")
# Fail loudly with a clear message instead of an AttributeError on None
# when the page layout changes.
if tag is None:
    raise RuntimeError("PDF link not found; the page layout may have changed")
link = urljoin(url, tag.get("href"))
path_pdf = fetch_file(link)
# Convert every page of the PDF into a DataFrame and collect them.
dfs = []
with pdfplumber.open(path_pdf) as pdf:
    for page in tqdm(pdf.pages):
        table = page.extract_table()
        # extract_table() returns None for pages without a detectable
        # table; skip those instead of crashing on table[1:].
        if not table:
            continue
        # First row is the header, the rest are data rows.
        dfs.append(pd.DataFrame(table[1:], columns=table[0]))
# Concatenate all pages into a single DataFrame.
df = pd.concat(dfs)
df.shape  # notebook-style inspection; no effect when run as a script
# Strip leading/trailing whitespace and apply NFKC normalization
# (full-width -> half-width, etc.) to all object (string) columns.
for col in df.select_dtypes(include=object).columns:
    df[col] = df[col].str.strip().str.normalize("NFKC")
# Save next to the PDF, swapping the extension to .csv.
path_csv = path_pdf.with_suffix(".csv")
# utf_8_sig writes a BOM so Excel opens the CSV with the right encoding.
df.to_csv(path_csv, encoding="utf_8_sig", index=False)
df1 = df.copy()  # work on a copy so the raw converted table stays intact
# --- Data wrangling ---
import datetime

# Reference time used to decide which year a month/day belongs to.
dt_now = datetime.datetime.now()


def str2date(s: pd.Series) -> pd.Series:
    """Convert Japanese "M月D日" strings to datetimes.

    The year is assumed to be the current year; if that would place the
    date in the future, it is rolled back one year. Strings that do not
    match the pattern become ``NaT``.

    Parameters
    ----------
    s : pd.Series
        Series of strings such as ``"3月15日"``.

    Returns
    -------
    pd.Series
        Series of ``datetime64`` values (``NaT`` for unparsable input).
    """
    # Raw string avoids the invalid-escape-sequence warning for \d.
    # 'parts' (not 'df') avoids shadowing the module-level DataFrame.
    parts = (
        s.str.extract(r"(\d{1,2})月(\d{1,2})日")
        .rename(columns={0: "month", 1: "day"})
        .fillna(0)
        .astype(int)
    )
    parts["year"] = dt_now.year
    # First pass: month/day with the current year (month/day 0 -> NaT).
    tmp = pd.to_datetime(parts, errors="coerce")
    # Dates that land in the future must belong to the previous year.
    parts["year"] = parts["year"].mask(tmp > dt_now, parts["year"] - 1)
    return pd.to_datetime(parts, errors="coerce")
# Derive proper datetime columns from the Japanese month/day strings.
df1["リリース日YMD"] = str2date(df1["リリース日"])
df1["発症日YMD"] = str2date(df1["発症日"])
df1["確定日YMD"] = str2date(df1["確定日"])
# Write the wrangled table as <name>_c.csv next to the original CSV.
# stem + suffix is used instead of str.replace, which would rewrite a
# ".csv" occurring anywhere in the name, not just the extension.
p = path_csv.with_name(path_csv.stem + "_c.csv")
df1.to_csv(p, index=False, encoding="utf_8_sig")
# Download the wrangled CSV to the local machine (Google Colab only).
from google.colab import files
files.download(str(p))