
Converting the Stock Price Table PDF from the Tokyo Stock Exchange Daily Report to CSV


Convert the latest stock price table PDF from the TSE daily report to CSV.

Because the column layout differs between sections, only the auction market (regular trading) tables are processed.

Only rows that have a numeric trading unit are kept (rows whose unit is written with letters are dropped).

# Scraping

import pathlib
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}


def fetch_soup(url, parser="html.parser"):
    # Fetch the page and parse it with BeautifulSoup
    r = requests.get(url, headers=headers)
    r.raise_for_status()

    soup = BeautifulSoup(r.content, parser)

    return soup


def fetch_file(url, dir="."):
    # Download the file into dir, skipping the download if it already exists
    p = pathlib.Path(dir, pathlib.PurePath(url).name)
    p.parent.mkdir(parents=True, exist_ok=True)

    if not p.exists():
        r = requests.get(url, headers=headers)
        r.raise_for_status()

        with p.open(mode="wb") as fw:
            fw.write(r.content)

    return p


url = "https://www.jpx.co.jp/markets/statistics-equities/daily/index.html"

soup = fetch_soup(url)

href = soup.find(
    href=re.compile(r"/markets/statistics-equities/daily/.*stq_\d{8}\.pdf$")
).get("href")

link = urljoin(url, href)

p = fetch_file(link)

# PDF conversion

import pandas as pd
import pdfplumber

pdf = pdfplumber.open(p)

page = pdf.pages[0]

# Inspect the first page as an image (renders inline in a notebook)
im = page.to_image()
im

# Draw the top and bottom edges of the price table on the first page
im.reset().draw_hlines([307, 750])

im.save("pdf.png", format="PNG")
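
The y coordinates 307 and 750 bracket the table on the first page; continuation pages have no title block, so their table starts higher, at y = 137 (the value the extraction loop below switches to). A quick sketch to check that band, assuming the PDF has at least two pages:

# Draw the band used for continuation pages on page 2
im2 = pdf.pages[1].to_image()
im2.draw_hlines([137, 750])
im2.save("pdf_page2.png", format="PNG")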

table_settings = {
    # vertical strategy: use explicitly supplied separator lines
    "vertical_strategy": "explicit",
    # column separators given as a list of x coordinates
    "explicit_vertical_lines": [
        72,
        113,
        153,
        283,
        343,
        403,
        463,
        523,
        583,
        643,
        703,
        763,
        828,
        893,
        958,
        1038,
        1119,
    ],
    # horizontal strategy: infer row boundaries from the text
    "horizontal_strategy": "text",
}
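
To verify the hand-tuned x coordinates visually, pdfplumber can overlay them on the page image (a sketch using the `im` object from above; "columns.png" is a hypothetical file name):

# Overlay the candidate column separators on the page image
im.reset().draw_vlines(table_settings["explicit_vertical_lines"])
im.save("columns.png", format="PNG")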

# Check how the table is detected with these settings
im.reset().debug_tablefinder(table_settings)

from tqdm.notebook import tqdm

# Re-open the PDF and collect the table from each regular-trading page
with pdfplumber.open(p) as pdf:

    dfs = []
    # Table region on the first page; adjusted for continuation pages below
    top, bottom = 307, 750

    for page in tqdm(pdf.pages):

        # Section label, e.g. "1-1"; only "1-" sections hold the regular-trading tables
        chapter = page.extract_words()[1]["text"].strip()

        if chapter.startswith("1-"):

            # Crop to the table region and extract the table from the text
            crop = page.within_bbox((0, top, page.width, bottom))

            table = crop.extract_table(table_settings)

            df_tmp = pd.DataFrame(table)

            dfs.append(df_tmp)

            # Continuation pages have no title block, so the table starts higher
            top, bottom = 137, 750

        else:
            break

df0 = pd.concat(dfs).reset_index(drop=True)

# Replace empty strings with NaN
df0.mask(df0 == "", inplace=True)

# Keep rows with at least 15 non-empty cells (drops headers and stray fragments)
df1 = df0.dropna(thresh=15).copy().reset_index(drop=True)

df1.columns = df1.columns.map(
    {
        0: "コード",
        1: "売買単位",
        2: "銘柄名",
        3: "午前_始値",
        4: "午前_高値",
        5: "午前_安値",
        6: "午前_終値",
        7: "午後_始値",
        8: "午後_高値",
        9: "午後_安値",
        10: "午後_終値",
        11: "最終気配",
        12: "前日比",
        13: "売買高加重平均価格",
        14: "売買高",
        15: "売買代金",
    }
)

df1["unit"] = pd.to_numeric(df1["売買単位"].str.replace(",", ""), errors="coerce")

df2 = df1.dropna(subset=["unit"]).copy().reset_index(drop=True)

# Spot-check the result
df2.head(30)
df2.tail(30)

df2[["特別気配", "最終気配"]] = df2["最終気配"].str.extract("([カウ])?([0-9,.]+)").dropna(how="all")

columns = [
    "午前_始値",
    "午前_高値",
    "午前_安値",
    "午前_終値",
    "午後_始値",
    "午後_高値",
    "午後_安値",
    "午後_終値",
    "最終気配",
    "前日比",
    "売買高加重平均価格",
    "売買高",
    "売買代金",
]

for col in columns:

    # Strip thousands separators, then convert to float (NaN stays NaN)
    df2[col] = (
        df2[col]
        .where(df2[col].isna(), df2[col].astype(str).str.replace(",", ""))
        .astype(float)
    )

# Inspect any codes that are not purely numeric
df2[~df2["コード"].str.isnumeric()]

df2.to_csv("result.csv", encoding="utf_8_sig")
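
As a quick sanity check (not part of the original article), the CSV can be read back, keeping コード as a string so codes are not coerced to integers:

# Reload the exported CSV (the first column is the saved index)
df_check = pd.read_csv("result.csv", index_col=0, encoding="utf_8_sig", dtype={"コード": str})
print(df_check.shape)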