LoginSignup
0
1

More than 3 years have passed since last update.

厚生労働省の各都道府県の検査陽性者の状況のPDFをスクレイピング

Last updated at Posted at 2020-10-04
apt install python3-tk ghostscript
pip install camelot-py[cv]
pip install requests
pip install beautifulsoup4
import re
from urllib.parse import urljoin

import camelot
import pandas as pd

import requests
from bs4 import BeautifulSoup

def get_link(url, text):
    """Fetch *url* and return the absolute URL of the first ``<a>`` tag
    whose link text matches the regular expression *text*.

    Raises ``requests.HTTPError`` on a bad HTTP status and ``ValueError``
    when no matching anchor is found on the page.
    """
    # Timeout prevents the script hanging forever on a stalled connection.
    r = requests.get(url, timeout=30)
    r.raise_for_status()

    soup = BeautifulSoup(r.content, "html.parser")

    # `string=` replaces the deprecated `text=` keyword (BeautifulSoup 4.4+).
    tag = soup.find("a", string=re.compile(text))

    # Fail with a descriptive error instead of an opaque AttributeError
    # when the page layout changes and the link disappears.
    if tag is None:
        raise ValueError(f"no link matching {text!r} found at {url}")

    # Resolve a possibly-relative href against the page URL.
    return urljoin(url, tag.get("href"))

def set_col(df, n=1):
    """Promote the first *n* rows of *df* to column labels.

    With ``n > 1`` the header rows are concatenated column-wise into a
    single label per column; with ``n == 1`` the first row is used as-is.
    Returns the remaining rows relabelled, with a fresh integer index.
    """
    if n > 1:
        # Join the stacked header cells of each column into one label.
        header_rows = df.head(n).values
        labels = ["".join(cells) for cells in zip(*header_rows)]
    else:
        labels = df.iloc[0]

    body = df.iloc[n:]
    return body.set_axis(labels, axis=1).reset_index(drop=True)

# --- Scrape MHLW's per-prefecture COVID-19 test-positive figures into a CSV ---

# Locate the latest press-release page, then the "別紙1" (Appendix 1) PDF link.
url = get_link(
    "https://www.mhlw.go.jp/stf/seisakunitsuite/bunya/0000121431_00086.html",
    "^新型コロナウイルス感染症の現在の状況と厚生労働省の対応について",
)

link = get_link(url, "別紙1")

# Extract every table in the PDF; strip_text removes spaces, commas and
# newlines that camelot leaves inside cell text.
tables = camelot.read_pdf(link, pages="all", split_text=True, strip_text="  ,\n")

# The first table has a two-row header, the second a single-row header.
df1 = set_col(tables[0].df, 2)
df2 = set_col(tables[1].df)

df = pd.concat([df1, df2], axis=1)

# Drop footnote markers such as "※1" from the headers and prefecture names.
# The pattern must be a raw string ("\d" is an invalid escape otherwise) and
# regex=True is required: pandas >= 2.0 treats str.replace patterns as
# literal text by default.
df.columns = df.columns.str.replace(r"※\d", "", regex=True)

df["都道府県名"] = df["都道府県名"].str.replace(r"※\d", "", regex=True)

# Replace "-" placeholders with NaN so the column can be treated as numeric.
df.mask(df == "-", inplace=True)

# utf_8_sig writes a BOM so Excel opens the CSV with the correct encoding.
df.to_csv("corona.csv", encoding="utf_8_sig")
0
1
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do by signing up
0
1