# Adds an "update date" (更新日) column so the latest information can be extracted.
# Setup (run in a shell, not in Python):
#   apt install python3-tk ghostscript
#   pip install "camelot-py[cv]"
import requests
from bs4 import BeautifulSoup
import pathlib
import re
from urllib.parse import urljoin
import pandas as pd
import camelot
# Browser-like User-Agent: the prefecture site may reject requests' default UA.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}
def fetch_soup(url, parser="html.parser", timeout=30):
    """Fetch *url* and return it parsed as a BeautifulSoup document.

    Parameters
    ----------
    url : str
        Page to download (sent with the module-level browser-like ``headers``).
    parser : str
        BeautifulSoup parser backend, default ``"html.parser"``.
    timeout : float
        Seconds before the HTTP request aborts; without a timeout,
        ``requests.get`` can block indefinitely.

    Raises
    ------
    requests.HTTPError
        If the server responds with a 4xx/5xx status.
    """
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()
    return BeautifulSoup(r.content, parser)
def fetch_file(url, dir="."):
    """Download *url* into directory *dir* and return the local Path.

    The file keeps its URL basename. If it already exists it is NOT
    re-downloaded (simple cache, lets the script be re-run cheaply).

    Parameters
    ----------
    url : str
        Direct link to the file (here: a PDF).
    dir : str or Path
        Destination directory, created if missing.

    Raises
    ------
    requests.HTTPError
        If the server responds with a 4xx/5xx status.
    """
    p = pathlib.Path(dir, pathlib.PurePath(url).name)
    p.parent.mkdir(parents=True, exist_ok=True)
    if not p.exists():
        # Send the same browser-like headers as fetch_soup for consistency,
        # and bound the request so it cannot hang forever.
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
        p.write_bytes(r.content)
    return p
def fetch_pdf(url, update=None):
    """Download the case-report PDF linked from press-release page *url*
    and return its case tables concatenated into one DataFrame.

    Parameters
    ----------
    url : str
        Press-release page containing a link whose <span> text starts with
        "患者の発生について" ("about the occurrence of patients").
    update : str, optional
        Value for the 更新日 (update date) column. When None, falls back to
        the module-level ``update`` global set by the scraping loop —
        kept only for backward compatibility with existing callers.

    Returns
    -------
    pandas.DataFrame
        Rows from every table whose header cell is "例目" (case number)
        and whose column count is 11 or 12, plus a 更新日 column.

    Raises
    ------
    ValueError
        If no matching table is found in the PDF.
    """
    if update is None:
        # Historical behavior: the scraping loop stores the release date in
        # a module-level global before calling this function.
        update = globals()["update"]
    soup = fetch_soup(url)
    # `string=` replaces the bs4 `text=` keyword, deprecated since 4.4.
    tag = soup.find("span", string=re.compile("^患者の発生について")).find_parent("a")
    link = urljoin(url, tag.get("href"))
    p = fetch_file(link)
    tables = camelot.read_pdf(
        str(p), pages="all", split_text=True, strip_text="\n", line_scale=40
    )
    dfs = []
    for table in tables:
        # Keep only the per-case tables: header starts with "例目" and the
        # layout has 11 or 12 columns (other tables are summaries).
        if table.data[0][0] == "例目":
            df_tmp = pd.DataFrame(table.data[1:], columns=table.data[0])
            _, col = df_tmp.shape
            if 10 < col < 13:
                dfs.append(df_tmp)
    if not dfs:
        # Fail with a clear message instead of pd.concat's generic
        # "No objects to concatenate".
        raise ValueError(f"no matching case tables found in {p}")
    df = pd.concat(dfs)
    df["更新日"] = update
    return df
# Listing page on the Shiga prefecture site linking to the per-release pages.
url = "https://www.pref.shiga.lg.jp/ippan/kenkouiryouhukushi/yakuzi/310735.html"
soup = fetch_soup(url)
dfs = []
# One <a> per press release titled "患者の発生について" ("about the occurrence
# of patients"); `string=` replaces the deprecated bs4 `text=` keyword.
for i in soup.find_all("a", string="患者の発生について"):
    # The release date lives in the preceding sibling <span class="first">
    # of the span wrapping the link text.
    update = (
        i.find_parent("span")
        .find_previous_sibling("span", class_="first")
        .get_text(strip=True)
    )
    print(update)
    link = urljoin(url, i.get("href"))
    # fetch_pdf reads the module-level `update` set just above.
    d = fetch_pdf(link)
    dfs.append(d)
df = pd.concat(dfs)
df.info()
# BOM-prefixed UTF-8 so Excel opens the CSV with Japanese text intact.
df.to_csv("data.csv", encoding="utf_8_sig")