With pdfplumber the conversion takes about 4 minutes, while pdftotext finishes in a few seconds.
pdfplumber
import pathlib
import re
from urllib.parse import urljoin

import pandas as pd
import pdfplumber
import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}


def fetch_soup(url, parser="html.parser"):
    r = requests.get(url, headers=headers)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, parser)
    return soup


def fetch_file(url, dir="."):
    # Save the file at `url` into `dir`, keeping its original filename
    p = pathlib.Path(dir, pathlib.PurePath(url).name)
    p.parent.mkdir(parents=True, exist_ok=True)
    r = requests.get(url)
    r.raise_for_status()
    with p.open(mode="wb") as fw:
        fw.write(r.content)
    return p


url = "https://www.city.nerima.tokyo.jp/hokenfukushi/hoken/kansensho/2019-nCoV/ruikei.html"
soup = fetch_soup(url)

# Find the link whose text starts with "区内感染者数累計データ詳細" and download the PDF
tag = soup.find("a", text=re.compile("^区内感染者数累計データ詳細"))
link = urljoin(url, tag.get("href"))
p = fetch_file(link)

# Extract one table per page and stack them
with pdfplumber.open(str(p)) as pdf:
    dfs = []
    for page in pdf.pages:
        table = page.extract_table()
        df_tmp = pd.DataFrame(table)
        dfs.append(df_tmp)

# The first row is the header; index on the "NO." column
data = pd.concat(dfs).values
df = pd.DataFrame(data[1:], columns=data[0]).set_index("NO.")
df.to_csv("result.csv", encoding="utf_8_sig")
pdftotext
I don't know how to do the scraping part in a shell script, so this only covers the PDF-to-CSV conversion.
pdftotext -layout data.pdf - | awk 'BEGIN { OFS=","; print "No,公表日,年代,性別,退院等"; } NR>=6 { print $1, $2, $3, $4, $5 }' > result.csv
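The download step could be reused from the pdfplumber version and only the conversion delegated to pdftotext. A rough sketch, assuming fetch_file() and link from the listing above and pdftotext on PATH (the subprocess call and the CSV loop are my own additions, not the original method):

import csv
import subprocess

# Download the PDF with the helper from the pdfplumber version
p = fetch_file(link)

# Run `pdftotext -layout <pdf> -` and capture the text written to stdout
layout_text = subprocess.run(
    ["pdftotext", "-layout", str(p), "-"],
    check=True, capture_output=True, text=True,
).stdout

# Mirror the awk one-liner: write the header, skip the first 5 lines,
# and keep the first 5 whitespace-separated columns of each row
with open("result.csv", "w", newline="", encoding="utf_8_sig") as fw:
    writer = csv.writer(fw)
    writer.writerow(["No", "公表日", "年代", "性別", "退院等"])
    for line in layout_text.splitlines()[5:]:
        cols = line.split()
        if len(cols) >= 5:
            writer.writerow(cols[:5])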