More than 3 years have passed since last update.

杉並区内の新型コロナウイルス感染者数（日ごと）のPDFをCSV変換

Last updated at 2021-08-27 / Posted at 2021-08-27

camelotではうまく変換できなかったため、pdfplumberで変換する。
PDF内の日付は年・月が省略されていることがあるため、直前の行の値で補完する。

プログラム

import pathlib
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

import pdfplumber
import pandas as pd

# Browser-like User-Agent string (Internet Explorer 11 style signature) that
# fetch_soup passes to requests.get.
_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"

headers = {"User-Agent": _USER_AGENT}


def fetch_soup(url, parser="html.parser", timeout=30):
    """Download a page and parse it with BeautifulSoup.

    Args:
        url: Address of the HTML page to fetch.
        parser: Parser name handed to BeautifulSoup.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely on a stalled connection.

    Returns:
        The parsed document as a BeautifulSoup object.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    # The module-level headers carry a browser-like User-Agent.
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()

    return BeautifulSoup(r.content, parser)


def fetch_file(url, dir=".", timeout=30):
    """Download ``url`` and save it in ``dir`` under the URL's last path segment.

    Args:
        url: Address of the file to download.
        dir: Destination directory; created (with parents) if it is missing.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely on a stalled connection.

    Returns:
        pathlib.Path of the file that was written.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    p = pathlib.Path(dir, pathlib.PurePath(url).name)
    p.parent.mkdir(parents=True, exist_ok=True)

    # Send the same headers as fetch_soup so both requests to the site
    # identify themselves in the same way.
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()

    p.write_bytes(r.content)
    return p


def str2date(s):
    """Pull the trailing integers out of ``s`` as a ``[year, month, day]`` list.

    Every run of digits in ``s`` is converted to ``int``; the last three are
    kept. When fewer than three numbers are present, the missing leading
    slots are ``None`` (e.g. a label holding only a day yields
    ``[None, None, day]``).

    Args:
        s: Text containing zero or more digit runs.

    Returns:
        A list of exactly three items, each an ``int`` or ``None``.
    """
    # Raw string: "\d" inside a normal literal is an invalid escape sequence.
    numbers = [int(token) for token in re.findall(r"\d+", s)]

    # Pad with three Nones so the result always has three slots, even when
    # ``s`` contains no digits at all.
    return ([None, None, None] + numbers)[-3:]


url = "https://www.city.suginami.tokyo.jp/news/kansensho/covid-19/1058987.html"

# Find the PDF link on the page.
soup = fetch_soup(url)
tag = soup.select_one("ul.objectlink > li.pdf > a")

# select_one returns None when nothing matches, and urljoin would silently
# fall back to the page URL for a missing href; stop with a clear message
# instead of downloading the wrong document.
if tag is None or not tag.get("href"):
    raise RuntimeError(f"PDF link not found on {url}")

# The href may be relative, so resolve it against the page URL.
link = urljoin(url, tag.get("href"))

p = fetch_file(link)

# Read every table on every page. Each table's first column becomes the index
# and the table is then transposed, so the first-column labels end up as the
# column names.
with pdfplumber.open(p) as pdf:

    transposed = (
        pd.DataFrame(rows).set_index(0).T
        for page in pdf.pages
        for rows in page.extract_tables()
    )

    # Only tables with more than two rows after transposing are kept.
    dfs = [frame for frame in transposed if frame.shape[0] > 2]

# Stack the kept tables and renumber the rows from zero.
df0 = pd.concat(dfs).reset_index(drop=True)

# Keep only per-day rows: the count must end with "人" and the date label must
# not end with "計". The explicit na= values drop rows with a missing cell
# instead of letting NaN break the boolean mask.
has_count = df0["感染者数"].str.endswith("人", na=False)
is_total_or_blank = df0["日にち"].str.endswith("計", na=True)
df1 = df0[has_count & ~is_total_or_blank].copy()

# Date conversion: split each label into year/month/day numbers. str2date
# yields None for parts the label omits, so carry the last seen value forward.
df_date = (
    df1["日にち"]
    .apply(str2date)
    .apply(pd.Series)
    .rename(columns={0: "year", 1: "month", 2: "day"})
    .ffill()
    .astype(int)
)
# NOTE(review): +2018 presumably maps a Reiwa-era year to the Gregorian year
# (Reiwa 1 = 2019) — confirm against the PDF.
df_date["year"] = df_date["year"] + 2018
df1["日にち"] = pd.to_datetime(df_date, errors="coerce")

# Drop the "人" suffix and any thousands separators before converting to int.
df1["感染者数"] = (
    df1["感染者数"].str.rstrip("人").str.replace(",", "", regex=False).astype(int)
)

# utf_8_sig prepends a BOM to the output file.
df1.to_csv("suginami.csv", encoding="utf_8_sig")

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up