LoginSignup
0
0

More than 3 years have passed since last update.

愛知県の新型コロナ発生事例のPDFをCSVに変換

Last updated at Posted at 2020-11-22

2020年と2021年でPDFが分かれているので暫定対応

import datetime
import pathlib
import re
from urllib.parse import urljoin

import pandas as pd
import pdfplumber
import requests
from bs4 import BeautifulSoup


def fetch_file(url, dir="."):
    """Download *url* and save it under directory *dir*.

    Parameters
    ----------
    url : str
        File URL to download.
    dir : str
        Destination directory (created if missing). Defaults to the
        current directory.

    Returns
    -------
    pathlib.Path
        Path of the saved file (named after the last URL path segment).

    Raises
    ------
    requests.HTTPError
        On a non-2xx response.
    """
    # A timeout keeps the script from hanging forever on a stalled server
    # (the original call had none, which is the classic requests pitfall).
    r = requests.get(url, timeout=30)
    r.raise_for_status()

    p = pathlib.Path(dir, pathlib.PurePath(url).name)
    p.parent.mkdir(parents=True, exist_ok=True)

    # write_bytes() replaces the manual open/write pair.
    p.write_bytes(r.content)
    return p


def days2date(s):
    """Build a Timestamp from a row's year column and its month/day text.

    *s* is a row (Series-like mapping) holding the publication year in
    "年" and a Japanese month/day string (e.g. "11月22日") in "発表日".
    Returns ``pd.NaT`` when exactly two numbers cannot be found.
    """
    numbers = re.findall("[0-9]{1,2}", s["発表日"])

    if len(numbers) != 2:
        return pd.NaT

    month, day = (int(n) for n in numbers)
    return pd.Timestamp(year=s["年"], month=month, day=day)


def wareki2date(s):
    """Parse a Japanese-era (wareki) date string into a ``datetime.date``.

    Supports the Showa (昭和), Heisei (平成) and Reiwa (令和) eras;
    "元" (gannen) is treated as era year 1. When *s* contains no
    recognizable wareki date, falls back to today's date via the
    module-level ``dt_now``.
    """
    # Raw string: "\d" in a plain string is an invalid escape
    # (DeprecationWarning since Python 3.6).
    m = re.search(r"(昭和|平成|令和)([ 0-9元]{1,2})年(\d{1,2})月(\d{1,2})日", s)

    if not m:
        # BUG FIX: the original returned the bound method ``dt_now.date``
        # without calling it, which broke the caller's later strftime().
        return dt_now.date()

    era, year_s, month_s, day_s = m.group(1, 2, 3, 4)

    # "元年" means the first year of an era; only the year field can be "元".
    year = 1 if year_s.strip() == "元" else int(year_s)

    # Gregorian year = era offset + era year (era year 1 == offset + 1).
    era_offset = {"昭和": 1925, "平成": 1988, "令和": 2018}
    year += era_offset[era]

    return datetime.date(year, int(month_s), int(day_s))


# Scrape the Aichi prefecture COVID-19 page, download the case-list PDFs,
# and convert their tables into a single dated CSV.
url = "https://www.pref.aichi.jp/site/covid19-aichi/"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}

JST = datetime.timezone(datetime.timedelta(hours=+9), "JST")
dt_now = datetime.datetime.now(JST)

r = requests.get(url, headers=headers)
r.raise_for_status()

soup = BeautifulSoup(r.content, "html.parser")

dfs = []
dt_text = ""

# The PDFs are split per calendar year (2020, 2021, ...); the links are
# reversed ([::-1]) so that index i maps to year 2020 + i.
for i, tag in enumerate(
    soup.find("span", text="▶ 愛知県内の発生事例").parent.find_all(
        "a", href=re.compile(".pdf$")
    )[::-1]
):

    link = urljoin(url, tag.get("href"))

    path_pdf = fetch_file(link)

    with pdfplumber.open(path_pdf) as pdf:

        for page in pdf.pages:

            if page.page_number == 1:
                # The header band of page 1 carries the update-date text.
                # BUG FIX: the original stored this into an unused local
                # ``text``, so ``dt_text`` stayed "" and the update date
                # was never resolved from the PDF.
                dt_text = page.within_bbox((0, 70, page.width, 90)).extract_text()

            table = page.extract_table()

            df_tmp = pd.DataFrame(table[1:], columns=table[0])

            # Tag every row with the calendar year of its source PDF.
            df_tmp["年"] = 2020 + i

            dfs.append(df_tmp)

df = pd.concat(dfs).set_index("No")

# Rows with no announcement date are layout artifacts from the PDF tables.
df.dropna(subset=["発表日"], inplace=True)

df["発表日"] = df.apply(days2date, axis=1)

# Split the combined "年代・性別" column (e.g. "40代男性") into age band and sex.
df_ages = df["年代・性別"].str.extract("(.+)(男性|女性)").rename(columns={0: "年代", 1: "性別"})

df = df.join(df_ages)

# Name the CSV after the update date parsed from the PDF header text.
dt_update = wareki2date(dt_text)

path_csv = pathlib.Path(dt_update.strftime("%Y%m%d") + ".csv")

# utf_8_sig (BOM) keeps the Japanese headers readable in Excel.
df.to_csv(path_csv, encoding="utf_8_sig")
0
0
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
0
0