More than 3 years have passed since last update.

東京都の国のステージ判断のための指標のPDFからCSV作成（pdfplumber）

Last updated at 2021-08-26 / Posted at 2021-08-25

camelotでも作成
https://qiita.com/barobaro/items/7ecfd6d3b24e3bbd6477

表部分は固定みたいなので表の位置を指定して抽出

import datetime
import pathlib
import re
from urllib.parse import urljoin

import pandas as pd
import pdfplumber
import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}


def fetch_soup(url, parser="html.parser"):
    """Download *url* and return its body parsed as a BeautifulSoup tree.

    The request is sent with the module-level ``headers`` dict.

    Args:
        url: Address of the page to download.
        parser: Name of the BeautifulSoup parser used to build the tree.

    Returns:
        The ``BeautifulSoup`` object built from the raw response bytes.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(url, headers=headers)
    # Fail fast on an error status rather than parsing an error page.
    response.raise_for_status()
    return BeautifulSoup(response.content, parser)


def fetch_file(url, dir="."):
    """Download *url* and save it inside *dir*.

    The file name is the last segment of the URL's path component, so a
    query string or fragment never ends up in the name on disk. The request
    is sent with the module-level ``headers`` dict, the same way
    ``fetch_soup`` does.

    Args:
        url: Address of the file to download.
        dir: Destination directory; created (with parents) if missing.

    Returns:
        ``pathlib.Path`` of the file that was written.

    Raises:
        ValueError: If no file name can be derived from *url*.
        requests.HTTPError: If the server answers with an error status.
    """
    from urllib.parse import urlsplit

    # Only the path component names the file; "?query" and "#fragment"
    # would otherwise be kept as part of the local file name.
    name = pathlib.PurePosixPath(urlsplit(url).path).name
    if not name:
        raise ValueError(f"cannot derive a file name from URL: {url!r}")

    p = pathlib.Path(dir, name)
    p.parent.mkdir(parents=True, exist_ok=True)

    r = requests.get(url, headers=headers)
    r.raise_for_status()

    with p.open(mode="wb") as fw:
        fw.write(r.content)
    return p


# Scraping: locate the link to the indicator PDF on the portal page.

url = "https://www.fukushihoken.metro.tokyo.lg.jp/iryo/kansen/corona_portal/info/kunishihyou.html"

soup = fetch_soup(url)
# ``string=`` is the current spelling of the deprecated ``text=`` filter.
tag = soup.find("a", class_="pdf", string=re.compile("^国のステージ判断のための指標"))
if tag is None or not tag.get("href"):
    raise RuntimeError(f"PDF link not found on {url}")
link = urljoin(url, tag.get("href"))

p = fetch_file(link)

# Coordinates of the table in the PDF (the layout is treated as fixed).

table_settings = {
    "vertical_strategy": "explicit",
    "explicit_vertical_lines": [236, 367, 502, 605, 682],
    "horizontal_strategy": "explicit",
    "explicit_horizontal_lines": [45, 90, 145, 202, 260, 317, 376, 430, 493],
}

# Read everything needed inside the ``with`` block so the PDF file handle
# is closed as soon as extraction is done.
with pdfplumber.open(p) as pdf:

    page = pdf.pages[0]

    # Extract the date from the title area at the top of the page.
    crop = page.within_bbox((26, 7, 375, 39))

    title = crop.extract_text()

    table = page.extract_table(table_settings)

print(title)

dt_now = datetime.datetime.now()

# Raw string: "\d" in a normal string literal is an invalid escape sequence.
m = re.search(r"(\d{1,2})月(\d{1,2})日", title or "")
if m is None:
    raise RuntimeError(f"no date found in PDF title: {title!r}")
month, day = map(int, m.groups())

# The title carries no year. A month/day later than today cannot belong to
# the current year, so it is taken to be from the previous year (e.g. a
# late-December report processed in early January).
year = dt_now.year
if (month, day) > (dt_now.month, dt_now.day):
    year -= 1
latest = datetime.date(year, month, day)

if not table:
    raise RuntimeError(f"no table could be extracted from {p}")

df0 = pd.DataFrame(
    table[1:],
    index=["新規報告者数", "感染経路不明割合", "PCR陽性率", "療養者数", "病床全体", "入院率", "うち重症者用病床"],
    columns=table[0],
)

# Strip line breaks inside cells. ``replace`` with ``regex=True`` leaves
# non-string cells (e.g. None for an empty cell) untouched, and avoids the
# deprecated ``applymap``.
df1 = df0.replace("\n", "", regex=True)

df1.to_csv(f"{latest.isoformat()}.csv", encoding="utf_8_sig")

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up