LoginSignup
1
0

More than 3 years have passed since the last update.

新型コロナウイルス感染症に関する相模原市発表資料(発生状況等)のPDFをCSVに変換

Posted at
import datetime
import pathlib
import re
from urllib.parse import urljoin

import pandas as pd
import pdfplumber
import requests
from bs4 import BeautifulSoup


def fetch_file(url, dir=".", timeout=30):
    """Download ``url`` and save it under ``dir``, returning the saved path.

    The file name is the last path component of ``url``.

    Args:
        url: Address of the file to download.
        dir: Directory to save into; it is created (with parents) if missing.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` waits indefinitely on a stalled connection.

    Returns:
        ``pathlib.Path`` of the file that was written.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()

    p = pathlib.Path(dir, pathlib.PurePath(url).name)
    p.parent.mkdir(parents=True, exist_ok=True)

    p.write_bytes(r.content)
    return p


# Page that is searched for the link to the PDF.
url = "https://www.city.sagamihara.kanagawa.jp/shisei/koho/1019191.html"

# Browser-like User-Agent sent with the page request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}

# A timeout keeps the script from hanging forever on a stalled connection.
r = requests.get(url, headers=headers, timeout=30)
r.raise_for_status()

soup = BeautifulSoup(r.content, "html.parser")

# First <a> whose href ends in ".pdf" (dot escaped so it matches literally)
# and whose onclick attribute contains the announcement title.
tag = soup.find(
    "a",
    href=re.compile(r"\.pdf$"),
    onclick=re.compile("新型コロナウイルス感染症による新たな患者の確認"),
)

# soup.find returns None when nothing matches; fail with a clear message
# instead of an AttributeError on None.
if tag is None:
    raise RuntimeError(f"No matching PDF link found on {url}")

# The href may be relative, so resolve it against the page URL.
link = urljoin(url, tag.get("href"))

path_pdf = fetch_file(link)

# Collect the 11-column tables from every page of the PDF into ``dfs`` and
# read the text of a fixed region on the first page into ``update``.
with pdfplumber.open(path_pdf) as pdf:

    dfs = []

    for page in pdf.pages:

        if page.page_number == 1:

            # Get the text inside a cropped box at the top right of page 1
            # (presumably the publication date; it is parsed further below).
            crop = page.within_bbox((400, 44, page.width, 60))
            update = crop.extract_text()

        for table in page.extract_tables():

            df_tmp = pd.DataFrame(table)

            # Keep only tables with exactly 11 columns.
            if df_tmp.shape[1] != 11:
                continue

            # pdfplumber can return None for a cell; treat that as empty text
            # so the membership test below cannot raise TypeError.
            first_cell = table[0][0] or ""

            # Skip tables whose very first cell contains "未満".
            if "未満" in first_cell:
                continue

            dfs.append(df_tmp)

# Labels applied to the 11 columns of the combined table.
column_names = [
    "症例No.",
    "年代",
    "性別",
    "職業等",
    "場所",
    "居住地",
    "症状",
    "発症日",
    "陽性判明日",
    "感染経路等",
    "備考",
]

# Stack the per-page tables and drop the first row of the combined frame
# (presumably the header row of the first table; confirm against the PDF).
combined = pd.concat(dfs).iloc[1:]

df = combined.set_axis(column_names, axis=1)

# Bare expression: displays the frame when run in a notebook cell.
df

# Remove all whitespace characters and apply NFKC normalization to every
# object-dtype column.
# regex=True is required: pandas >= 2.0 treats the pattern as a literal string
# by default, so r"\s" would otherwise never match any whitespace.
for col in df.select_dtypes(include=object).columns:
    df[col] = df[col].str.replace(r"\s", "", regex=True).str.normalize("NFKC")

# Reference time used by str2date to decide which year a date belongs to.
dt_now = datetime.datetime.now()


def str2date(s: pd.Series) -> pd.Series:
    """Convert strings containing "M月D日" into timestamps.

    The source text has no year, so the year of ``dt_now`` is assumed first.
    A date that would then lie after ``dt_now`` is moved to the previous year.

    Args:
        s: Series of strings; each may contain a "<month>月<day>日" pattern.

    Returns:
        Series of datetimes aligned with ``s``. Entries with no match, or
        with an impossible month/day, become ``NaT``.
    """
    parts = (
        # Raw string: "\d" in a normal string literal is an invalid escape.
        s.str.extract(r"(\d{1,2})月(\d{1,2})日")
        .rename(columns={0: "month", 1: "day"})
        # Fill with the string "0" so the column stays text until the cast;
        # month/day 0 is not a valid date and is coerced to NaT below.
        .fillna("0")
        .astype(int)
    )

    parts["year"] = dt_now.year

    # First guess: every date falls in the current year.
    this_year = pd.to_datetime(parts, errors="coerce")

    # Dates in the future relative to dt_now must belong to last year.
    parts["year"] = parts["year"].mask(this_year > dt_now, parts["year"] - 1)

    return pd.to_datetime(parts, errors="coerce")


# Add parsed datetime versions of the onset-date and positive-result-date columns.
df["発症日YMD"] = str2date(df["発症日"])

df["陽性判明日YMD"] = str2date(df["陽性判明日"])

# Pull the digit runs out of the text cropped from page 1. ``update or ""``
# guards against extract_text() returning None for an empty crop.
numbers = re.findall(r"\d+", update or "")

# Exactly three numbers (year, month, day) are expected; fail with a clear
# message instead of an opaque unpacking error.
if len(numbers) != 3:
    raise ValueError(f"Could not read an update date from PDF text: {update!r}")

y, m, d = map(int, numbers)

# NOTE(review): the 2018 offset presumably converts a Reiwa-era year to the
# Gregorian calendar (Reiwa 1 = 2019); confirm against the PDF.
dt_update = datetime.datetime(2018 + y, m, d)


# utf_8_sig writes the CSV as UTF-8 with a BOM.
df.to_csv(f'sagamihara{dt_update.strftime("%Y%m%d")}.csv', encoding="utf_8_sig")
1
0
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do by signing up
1
0