# Adds an "update date" (更新日) column so the latest information can be extracted.
# Setup (run in a shell, not in Python):
#   apt install python3-tk ghostscript
#   pip install "camelot-py[cv]"
import requests
from bs4 import BeautifulSoup
import pathlib
import re
from urllib.parse import urljoin
import pandas as pd
import camelot
# Browser-like User-Agent: the prefecture site may reject requests' default UA.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}
def fetch_soup(url, parser="html.parser", timeout=30):
    """Fetch *url* and return it parsed as a BeautifulSoup document.

    Parameters
    ----------
    url : str
        Page to download (sent with the module-level browser-like ``headers``).
    parser : str
        BeautifulSoup parser backend, default ``"html.parser"``.
    timeout : float
        Seconds before the HTTP request aborts; without a timeout,
        ``requests.get`` can block indefinitely.

    Raises
    ------
    requests.HTTPError
        If the server responds with a 4xx/5xx status.
    """
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()
    return BeautifulSoup(r.content, parser)
def fetch_file(url, dir="."):
    """Download *url* into directory *dir* and return the local Path.

    The file keeps its URL basename. If it already exists it is NOT
    re-downloaded (simple cache, lets the script be re-run cheaply).

    Parameters
    ----------
    url : str
        Direct link to the file (here: a PDF).
    dir : str or Path
        Destination directory, created if missing.

    Raises
    ------
    requests.HTTPError
        If the server responds with a 4xx/5xx status.
    """
    p = pathlib.Path(dir, pathlib.PurePath(url).name)
    p.parent.mkdir(parents=True, exist_ok=True)
    if not p.exists():
        # Send the same browser-like headers as fetch_soup for consistency,
        # and bound the request so it cannot hang forever.
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
        p.write_bytes(r.content)
    return p
def fetch_pdf(url, update=None):
    """Download the case-report PDF linked from press-release page *url*
    and return its case tables concatenated into one DataFrame.

    Parameters
    ----------
    url : str
        Press-release page containing a link whose <span> text starts with
        "患者の発生について" ("about the occurrence of patients").
    update : str, optional
        Value for the 更新日 (update date) column. When None, falls back to
        the module-level ``update`` global set by the scraping loop —
        kept only for backward compatibility with existing callers.

    Returns
    -------
    pandas.DataFrame
        Rows from every table whose header cell is "例目" (case number)
        and whose column count is 11 or 12, plus a 更新日 column.

    Raises
    ------
    ValueError
        If no matching table is found in the PDF.
    """
    if update is None:
        # Historical behavior: the scraping loop stores the release date in
        # a module-level global before calling this function.
        update = globals()["update"]
    soup = fetch_soup(url)
    # `string=` replaces the bs4 `text=` keyword, deprecated since 4.4.
    tag = soup.find("span", string=re.compile("^患者の発生について")).find_parent("a")
    link = urljoin(url, tag.get("href"))
    p = fetch_file(link)
    tables = camelot.read_pdf(
        str(p), pages="all", split_text=True, strip_text="\n", line_scale=40
    )
    dfs = []
    for table in tables:
        # Keep only the per-case tables: header starts with "例目" and the
        # layout has 11 or 12 columns (other tables are summaries).
        if table.data[0][0] == "例目":
            df_tmp = pd.DataFrame(table.data[1:], columns=table.data[0])
            _, col = df_tmp.shape
            if 10 < col < 13:
                dfs.append(df_tmp)
    if not dfs:
        # Fail with a clear message instead of pd.concat's generic
        # "No objects to concatenate".
        raise ValueError(f"no matching case tables found in {p}")
    df = pd.concat(dfs)
    df["更新日"] = update
    return df
# Listing page on the Shiga prefecture site linking to the per-release pages.
url = "https://www.pref.shiga.lg.jp/ippan/kenkouiryouhukushi/yakuzi/310735.html"
soup = fetch_soup(url)
dfs = []
# One <a> per press release titled "患者の発生について" ("about the occurrence
# of patients"); `string=` replaces the deprecated bs4 `text=` keyword.
for i in soup.find_all("a", string="患者の発生について"):
    # The release date lives in the preceding sibling <span class="first">
    # of the span wrapping the link text.
    update = (
        i.find_parent("span")
        .find_previous_sibling("span", class_="first")
        .get_text(strip=True)
    )
    print(update)
    link = urljoin(url, i.get("href"))
    # fetch_pdf reads the module-level `update` set just above.
    d = fetch_pdf(link)
    dfs.append(d)
df = pd.concat(dfs)
df.info()
# BOM-prefixed UTF-8 so Excel opens the CSV with Japanese text intact.
df.to_csv("data.csv", encoding="utf_8_sig")