LoginSignup
0
2

More than 1 year has passed since last update.

等々力陸上競技場利用予定表のPDFをCSV変換

Last updated at Posted at 2022-11-12

インストール

pip install pdfplumber
pip install japanera
pip install requests
pip install BeautifulSoup4
pip install pandas

スクレイピング

import requests
from bs4 import BeautifulSoup

from urllib.parse import urljoin

def fetch_soup(url, parser="html.parser", timeout=30):
    """Fetch *url* and return its body parsed as a BeautifulSoup tree.

    The request is sent with the module-level ``headers`` dict.

    Args:
        url: Address of the page to download.
        parser: Parser name handed to ``BeautifulSoup``.
        timeout: Seconds passed to ``requests.get`` so that an unresponsive
            server cannot block the script indefinitely.

    Returns:
        The ``BeautifulSoup`` object built from the response bytes.

    Raises:
        requests.HTTPError: If the response status is an error
            (via ``raise_for_status``).
    """
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()

    # Pass the raw bytes so BeautifulSoup performs its own encoding detection.
    return BeautifulSoup(r.content, parser)

# Request headers used by fetch_soup (a browser-like User-Agent string).
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}

url = "https://www.city.kawasaki.jp/nakahara/"

link = urljoin(url, "./page/0000088519.html")

soup = fetch_soup(link)

# Extract the PDF links, anchored on the <h2> heading for the Todoroki
# athletics stadium usage schedule.
# `string=` is the current name of the keyword formerly spelled `text=`.
heading = soup.find("h2", string="等々力陸上競技場利用予定表")
if heading is None:
    raise RuntimeError(f"schedule heading not found in {link}")

attach_block = heading.find_next_sibling("div", class_="mol_attachfileblock")
if attach_block is None:
    raise RuntimeError(f"attachment block not found after the schedule heading in {link}")

# One dict per attachment: absolute PDF URL plus the visible link text.
# NOTE(review): hrefs are resolved against the section root `url`, not the
# page address `link` -- presumably the site emits links relative to that
# root; confirm against the page markup.
pdf_links = [
    {"link": urljoin(url, a.get("href")), "text": a.get_text(strip=True)}
    for a in attach_block.select("ul > li > a")
]

pdf_links

日付変換

# PDFの和暦日付からdatetimeを生成

import re

from datetime import date
from japanera import Japanera, EraDate, EraDateTime

# Link text of the last PDF in the list; it is expected to begin with a
# Reiwa-era year and month.
s = pdf_links[-1]["text"]

# Raw string: "\d" inside a normal string literal is an invalid escape sequence.
m = re.match(r"令和(\d{1,2})年(\d{1,2})月", s)
if m is None:
    raise ValueError(f"link text does not start with a Reiwa year/month: {s!r}")

janera = Japanera()

# Parse only the matched "令和N年M月" prefix; strptime returns a list and the
# first entry is used.
dt_date = janera.strptime(m.group(0), "%-E%-O年%m月")[0]

dt_date

PDF変換

import pathlib

import requests
import pdfplumber
import pandas as pd

def fetch_file(url, dir=".", timeout=30):
    """Download *url* into *dir* unless the file is already there.

    The local file name is the last component of the URL's path, so a query
    string or fragment (``file.pdf?v=1``) does not leak into the name.

    Args:
        url: Address of the file to download.
        dir: Target directory; created (with parents) when missing.
        timeout: Seconds passed to ``requests.get`` so that an unresponsive
            server cannot block the script indefinitely.

    Returns:
        ``pathlib.Path`` of the local file.

    Raises:
        requests.HTTPError: If the download returns an error status
            (via ``raise_for_status``).
    """
    from urllib.parse import urlsplit

    # Fall back to the previous whole-URL derivation when the URL has no
    # path component to name the file after.
    name = pathlib.PurePosixPath(urlsplit(url).path).name or pathlib.PurePath(url).name

    p = pathlib.Path(dir, name)
    p.parent.mkdir(parents=True, exist_ok=True)

    # An existing file is reused as-is; nothing is downloaded again.
    if not p.exists():

        r = requests.get(url, timeout=timeout)
        r.raise_for_status()

        with p.open(mode="wb") as fw:
            fw.write(r.content)

    return p

# Download the latest PDF (the last link collected from the page).
p = fetch_file(pdf_links[-1]["link"])

with pdfplumber.open(p) as pdf:

    dfs = []

    for page in pdf.pages:

        for table in page.extract_tables():

            # The first two extracted rows are dropped (presumably title/header
            # rows -- confirm against the PDF); the rest get fixed column names.
            df = pd.DataFrame(table[2:], columns=["day", "曜日", "大会名", "午前", "午後"])

            # Year / month / day
            df["year"] = dt_date.year
            df["month"] = dt_date.month
            df["day"] = df["day"].astype(int)

            # Build the date column from the year/month/day parts.
            df["日付"] = pd.to_datetime(df[["year", "month", "day"]])

            # Assign the result back instead of calling
            # df[col].mask(..., inplace=True): that form is chained assignment
            # and does not update df under pandas Copy-on-Write.
            # Empty event names become missing values.
            df["大会名"] = df["大会名"].mask(df["大会名"] == "")
            # A missing afternoon cell takes the morning cell's value.
            df["午後"] = df["午後"].mask(df["午後"].isna(), df["午前"])

            dfs.append(df.reindex(["日付", "曜日", "大会名", "午前", "午後"], axis=1))

# Athletics stadium (main): first extracted table.
dfs[0]

# Written with the utf_8_sig codec, i.e. UTF-8 with a leading BOM
# (presumably so spreadsheet software detects the encoding -- confirm).
dfs[0].to_csv(f"{dt_date:%Y%m}_main.csv", encoding="utf_8_sig", index=False)

# Athletics stadium (auxiliary): second extracted table.
dfs[1]

dfs[1].to_csv(f"{dt_date:%Y%m}_sub.csv", encoding="utf_8_sig", index=False)
0
2
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
0
2