株式相場表の最新PDFをCSVに変換
列数が違うので立会市場 普通取引のみ
売買単位があるもののみ抽出(アルファベット表記はなし)
# スクレイピング
import pathlib
import re
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
# Browser-like User-Agent so the JPX site serves the scraper normal pages.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}
def fetch_soup(url, parser="html.parser", timeout=30):
    """Fetch *url* and return a BeautifulSoup of the response body.

    Parameters
    ----------
    url : str
        Page to download (sent with the module-level browser ``headers``).
    parser : str
        Parser backend name passed to BeautifulSoup.
    timeout : float
        Seconds before the request is aborted. Without a timeout,
        ``requests.get`` can block forever on a stalled connection.

    Raises
    ------
    requests.HTTPError
        If the server responds with a 4xx/5xx status.
    """
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()
    return BeautifulSoup(r.content, parser)
def fetch_file(url, dir=".", timeout=30):
    """Download *url* into *dir*, keeping the URL's file name; return the path.

    The download is skipped when the target file already exists, so the
    function is safe to re-run. The request now sends the module-level
    browser ``headers`` (previously it went out bare, inconsistent with
    ``fetch_soup``) and enforces a *timeout* so a stalled connection
    cannot hang the script.

    Parameters
    ----------
    url : str
        File URL; its last path component becomes the local file name.
    dir : str | pathlib.Path
        Destination directory, created if missing.
    timeout : float
        Seconds before the request is aborted.

    Returns
    -------
    pathlib.Path
        Path of the (possibly pre-existing) local file.

    Raises
    ------
    requests.HTTPError
        If the server responds with a 4xx/5xx status.
    """
    p = pathlib.Path(dir, pathlib.PurePath(url).name)
    p.parent.mkdir(parents=True, exist_ok=True)
    if not p.exists():
        # Same browser-like headers as fetch_soup, for consistency.
        r = requests.get(url, headers=headers, timeout=timeout)
        r.raise_for_status()
        p.write_bytes(r.content)
    return p
# Locate the newest daily stock-quote PDF linked from the statistics index page.
url = "https://www.jpx.co.jp/markets/statistics-equities/daily/index.html"
soup = fetch_soup(url)
# Raw string with an escaped dot: the original "\d" was an invalid escape
# sequence (SyntaxWarning on modern Python) and the bare "." before "pdf"
# matched any character instead of a literal period.
href = soup.find(
    href=re.compile(r"/markets/statistics-equities/daily/.*stq_\d{8}\.pdf$")
).get("href")
link = urljoin(url, href)
p = fetch_file(link)
# --- PDF conversion: inspect the first page to calibrate crop/table settings ---
import pandas as pd
import pdfplumber
pdf = pdfplumber.open(p)
page = pdf.pages[0]
# Visual check of the page layout (bare `im` renders the image in a notebook).
im = page.to_image()
im
# Draw horizontal guide lines at y=307 and y=750 to verify the crop band.
im.reset().draw_hlines([307, 750])
im.save("pdf.png", format="PNG")
# pdfplumber table-extraction settings for the quote pages.
table_settings = {
    # Column boundaries are supplied explicitly rather than auto-detected.
    "vertical_strategy": "explicit",
    # x-coordinates (PDF points) of the 17 column separators, calibrated
    # against the debug image rendered above.
    "explicit_vertical_lines": [
        72, 113, 153, 283, 343, 403, 463, 523, 583,
        643, 703, 763, 828, 893, 958, 1038, 1119,
    ],
    # Row boundaries are inferred from the text layout.
    "horizontal_strategy": "text",
}
# Overlay the detected table grid on the page image to verify the settings.
im.reset().debug_tablefinder(table_settings)
# Extract the chapter-1 ("1-x") quote tables from every page until a
# different chapter begins.
from tqdm.notebook import tqdm
with pdfplumber.open(p) as pdf:
    dfs = []
    # The first page has a taller header, so its crop band starts lower.
    top, bottom = 307, 750
    for page in tqdm(pdf.pages):
        # Second word on the page is the chapter label, e.g. "1-1".
        section = page.extract_words()[1]["text"].strip()
        if not section.startswith("1-"):
            break  # left chapter 1 — stop scanning
        # Crop to the table band, then extract cells with the explicit columns.
        band = page.within_bbox((0, top, page.width, bottom))
        dfs.append(pd.DataFrame(band.extract_table(table_settings)))
        # Subsequent pages have a shorter header.
        top, bottom = 137, 750
# Stack all per-page tables into one frame.
df0 = pd.concat(dfs).reset_index(drop=True)
# Empty strings -> NaN so sparse header/footer rows can be dropped.
df0.mask(df0 == "", inplace=True)
# Keep only rows with at least 15 non-NaN cells (i.e. real quote rows).
df1 = df0.dropna(thresh=15).copy().reset_index(drop=True)
# "-" marks "no trade" in the PDF; treat it as missing.
df1.mask(df1 == "-", inplace=True)
# Japanese column names: code, trading unit, issue name, AM open/high/low/close,
# PM open/high/low/close, final quote, change, VWAP, volume, value traded.
df1.columns = df1.columns.map(
{
0: "コード",
1: "売買単位",
2: "銘柄名",
3: "午前_始値",
4: "午前_高値",
5: "午前_安値",
6: "午前_終値",
7: "午後_始値",
8: "午後_高値",
9: "午後_安値",
10: "午後_終値",
11: "最終気配",
12: "前日比",
13: "売買高加重平均価格",
14: "売買高",
15: "売買代金",
}
)
# Trading unit must be numeric; rows where it is not (section headings,
# alphabet-coded issues) are filtered out below.
df1["unit"] = pd.to_numeric(df1["売買単位"].str.replace(",", ""), errors="coerce")
df2 = df1.dropna(subset=["unit"]).copy().reset_index(drop=True)
# Notebook sanity checks (no effect when run as a script).
df2.head(30)
df2.tail(30)
# Split the final-quote column into a special-quote flag ("カ"/"ウ") and the price.
df2[["特別気配", "最終気配"]] = df2["最終気配"].str.extract("([カウ])?([0-9,.]+)").dropna(how="all")
# Price/volume columns to convert from comma-formatted strings to floats.
columns = [
    "午前_始値",
    "午前_高値",
    "午前_安値",
    "午前_終値",
    "午後_始値",
    "午後_高値",
    "午後_安値",
    "午後_終値",
    "最終気配",
    "前日比",
    "売買高加重平均価格",
    "売買高",
    "売買代金",
]
for col in columns:
    series = df2[col]
    # Strip thousands separators on present values; NaN cells stay NaN.
    stripped = series.where(series.isna(), series.astype(str).str.replace(",", ""))
    df2[col] = stripped.astype(float)
# Notebook check: rows whose code is not purely numeric (should be none left).
df2[~df2["コード"].str.isnumeric()]
# BOM-prefixed UTF-8 so Excel opens the Japanese headers correctly.
df2.to_csv("result.csv", encoding="utf_8_sig")