# 福島県 県民割プラスのクーポンが使える加盟店
# (Fukushima Prefecture: member stores where "Kenmin-wari Plus" coupons can be used)
import pathlib
from urllib.parse import urljoin
import camelot
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
def fetch_soup(url, parser="html.parser", timeout=30):
    """Download a page and parse it with BeautifulSoup.

    Args:
        url: Address of the page to fetch.
        parser: Name of the BeautifulSoup parser to use.
        timeout: Seconds handed to ``requests.get``. Without a timeout a
            stalled server would block the script indefinitely.

    Returns:
        The parsed ``BeautifulSoup`` document.

    Raises:
        requests.HTTPError: If the response carries an error status
            (via ``raise_for_status``).
        requests.Timeout: If the server does not answer within *timeout*.
    """
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()

    # The raw bytes (not r.text) are handed to the parser.
    return BeautifulSoup(r.content, parser)
def fetch_file(url, dir=".", timeout=30):
    """Download *url* into *dir* unless a file of that name is already there.

    The local file name is the last component of the URL path. An existing
    file is treated as a cache hit and returned without any network access.

    Args:
        url: Address of the file to download.
        dir: Destination directory; created (with parents) when missing.
        timeout: Seconds handed to ``requests.get`` so that a stalled
            server cannot block the script indefinitely.

    Returns:
        ``pathlib.Path`` of the local file.

    Raises:
        requests.HTTPError: If the response carries an error status.
        requests.Timeout: If the server does not answer within *timeout*.
    """
    from urllib.parse import urlsplit

    # Use only the path part of the URL so that "?query" / "#fragment" do not
    # leak into the file name; fall back to the raw URL when the path is empty.
    name = pathlib.PurePath(urlsplit(url).path).name or pathlib.PurePath(url).name

    p = pathlib.Path(dir, name)
    p.parent.mkdir(parents=True, exist_ok=True)

    if not p.exists():
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()

        # Write to a sibling ".part" file and rename afterwards: an interrupted
        # write must not leave a truncated file that later runs would accept
        # as a valid cached download.
        tmp = p.with_name(p.name + ".part")
        with tmp.open(mode="wb") as fw:
            fw.write(r.content)
        tmp.replace(p)

    return p
# Scrape the coupon page, download every linked PDF, extract the tables from
# each PDF and write them all to a single CSV file.
url = "https://www.tif.ne.jp/kenminwariplus/"

soup = fetch_soup(url)

# Anchors without an href are skipped: urljoin(url, None) returns the page URL
# itself, which would then be downloaded and handed to camelot as if it were
# a PDF.
links = [
    urljoin(url, i.get("href"))
    for i in soup.select("div#area > div.btn > ul > li > a")
    if i.get("href")
]

if not links:
    # Fail with a clear message instead of pandas' "No objects to concatenate".
    raise ValueError("no PDF links found on " + url)

dfs = []

for link in tqdm(links):
    p = fetch_file(link)

    # Tables are read from page 2 onwards; page 1 is skipped
    # (presumably a cover/notice page -- TODO confirm).
    tables = camelot.read_pdf(
        str(p), pages="2-end", split_text=True, strip_text=" ", line_scale=40
    )

    for table in tqdm(tables):
        # The first row of each extracted table is used as the column header.
        dfs.append(pd.DataFrame(table.data[1:], columns=table.data[0]))

if not dfs:
    raise ValueError("no tables could be extracted from the downloaded PDFs")

df = pd.concat(dfs).reset_index(drop=True)

# "utf_8_sig" writes a BOM (presumably so spreadsheet software detects UTF-8).
df.to_csv("result.csv", encoding="utf_8_sig")