More than 3 years have passed since last update.

愛知県の新型コロナ発生事例のPDFをCSVに変換

Last updated at 2021-01-02 · Posted at 2020-11-22

2020年と2021年でPDFが分かれているので暫定対応

import datetime
import pathlib
import re
from urllib.parse import urljoin

import pandas as pd
import pdfplumber
import requests
from bs4 import BeautifulSoup


def fetch_file(url, dir=".", timeout=30):
    """Download *url* and save it under *dir*, returning the saved path.

    The local file name is the last segment of the URL's path component, so a
    query string or fragment (``file.pdf?v=2``) never leaks into the name.

    Args:
        url: Absolute URL of the file to download.
        dir: Destination directory; it is created (with parents) if missing.
            The name shadows the builtin but is kept for caller compatibility.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled server cannot hang the script indefinitely.

    Returns:
        ``pathlib.Path`` of the file that was written.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    # Local import: the file only imports ``urljoin`` at module level.
    from urllib.parse import urlsplit

    r = requests.get(url, timeout=timeout)
    r.raise_for_status()

    # Fall back to the original whole-URL naming when the path part is empty
    # (e.g. "https://host"), so such URLs keep their previous file name.
    name = pathlib.PurePosixPath(urlsplit(url).path).name or pathlib.PurePath(url).name

    p = pathlib.Path(dir, name)
    p.parent.mkdir(parents=True, exist_ok=True)

    p.write_bytes(r.content)
    return p


def days2date(s):
    """Build a ``pd.Timestamp`` from a row's year and month/day text.

    Args:
        s: Mapping-like row (e.g. a DataFrame row) with a "年" entry holding
            the year and a "発表日" entry holding text such as "11月22日".

    Returns:
        ``pd.Timestamp`` for that year/month/day when exactly two 1-2 digit
        numbers are found in "発表日"; ``pd.NaT`` otherwise.
    """
    year = s["年"]

    numbers = re.findall("[0-9]{1,2}", s["発表日"])

    # Anything other than exactly "month, day" cannot be interpreted.
    if len(numbers) != 2:
        return pd.NaT

    month, day = (int(n) for n in numbers)
    return pd.Timestamp(year=year, month=month, day=day)


def wareki2date(s):
    """Convert the first Japanese-era (wareki) date in *s* to a ``datetime.date``.

    Recognizes the Showa (昭和), Heisei (平成) and Reiwa (令和) eras, the
    first-year marker "元", and optional whitespace between the parts
    (text extracted from PDFs is often padded with spaces).

    Args:
        s: Text that may contain a date such as "令和2年11月22日".

    Returns:
        The corresponding Gregorian ``datetime.date``. When no era date is
        found, today's date in JST (UTC+9) is returned as a fallback.

    Raises:
        ValueError: If the matched numbers do not form a valid calendar date.
    """
    # Years to add to an era year to obtain the Gregorian year.
    era_offsets = {"昭和": 1925, "平成": 1988, "令和": 2018}

    m = re.search(
        r"(昭和|平成|令和)\s*(元|\d{1,2})\s*年\s*(\d{1,2})\s*月\s*(\d{1,2})\s*日", s
    )

    if m is None:
        # Return an actual date object (not the bound ``date`` method), and do
        # not rely on a module-level clock defined after this function.
        jst = datetime.timezone(datetime.timedelta(hours=9), "JST")
        return datetime.datetime.now(jst).date()

    era, era_year, month, day = m.groups()

    # "元" (gannen) denotes the first year of an era.
    year = (1 if era_year == "元" else int(era_year)) + era_offsets[era]

    return datetime.date(year, int(month), int(day))


# Landing page whose "cases in Aichi" section links to the case-list PDFs.
url = "https://www.pref.aichi.jp/site/covid19-aichi/"

# User-Agent header sent with the landing-page request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}

# Current time in Japan Standard Time (UTC+9).
JST = datetime.timezone(datetime.timedelta(hours=+9), "JST")
dt_now = datetime.datetime.now(JST)

# A timeout keeps a stalled server from hanging the script forever.
r = requests.get(url, headers=headers, timeout=30)
r.raise_for_status()

soup = BeautifulSoup(r.content, "html.parser")

# Locate the section heading; fail with a clear message if the page layout
# changed instead of an opaque AttributeError on None.
heading = soup.find("span", string="▶ 愛知県内の発生事例")
if heading is None:
    raise RuntimeError("Section heading '▶ 愛知県内の発生事例' not found on " + url)

# Escaped dot: match a literal ".pdf" suffix only.
pdf_links = heading.parent.find_all("a", href=re.compile(r"\.pdf$"))

dfs = []
dt_text = ""

# Links are walked in reverse page order; the i-th PDF is assumed to hold the
# cases of year 2020 + i (interim handling for the 2020/2021 PDF split).
for i, tag in enumerate(pdf_links[::-1]):

    link = urljoin(url, tag.get("href"))

    path_pdf = fetch_file(link)

    with pdfplumber.open(path_pdf) as pdf:

        for page in pdf.pages:

            if page.page_number == 1:

                # Text in the strip y=70..90 of the first page; presumably the
                # "as of" date in era notation (verify against the PDFs).
                # Stored in dt_text so it is actually parsed after the loop;
                # the last PDF processed wins.
                dt_text = (
                    page.within_bbox((0, 70, page.width, 90)).extract_text() or ""
                )

            table = page.extract_table()

            # Skip pages on which no table was detected.
            if not table:
                continue

            # First row is the header, the rest are data rows.
            df_tmp = pd.DataFrame(table[1:], columns=table[0])

            df_tmp["年"] = 2020 + i

            dfs.append(df_tmp)

df = pd.concat(dfs).set_index("No")

# Rows without an announcement date cannot be converted.
df.dropna(subset=["発表日"], inplace=True)

df["発表日"] = df.apply(days2date, axis=1)

# Split the combined "age group / sex" column into two columns.
df_ages = df["年代・性別"].str.extract("(.+)(男性|女性)").rename(columns={0: "年代", 1: "性別"})

df = df.join(df_ages)


# Name the CSV after the update date parsed from the PDF header text.
dt_update = wareki2date(dt_text)

path_csv = pathlib.Path(dt_update.strftime("%Y%m%d") + ".csv")

# utf_8_sig writes a BOM at the start of the file.
df.to_csv(path_csv, encoding="utf_8_sig")

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up