株式相場表の最新PDFをCSVに変換
列数が違うので立会市場 普通取引のみ
売買単位があるもののみ抽出(アルファベット表記はなし)
# スクレイピング
import pathlib
import re
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
# Browser-like User-Agent so the JPX site serves the scraper normal pages.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}
def fetch_soup(url, parser="html.parser", timeout=30):
    """Fetch *url* and return a BeautifulSoup of the response body.

    Parameters
    ----------
    url : str
        Page to download (sent with the module-level browser ``headers``).
    parser : str
        Parser backend name passed to BeautifulSoup.
    timeout : float
        Seconds before the request is aborted. Without a timeout,
        ``requests.get`` can block forever on a stalled connection.

    Raises
    ------
    requests.HTTPError
        If the server responds with a 4xx/5xx status.
    """
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()
    return BeautifulSoup(r.content, parser)
def fetch_file(url, dir=".", timeout=30):
    """Download *url* into *dir*, keeping the URL's file name; return the path.

    The download is skipped when the target file already exists, so the
    function is safe to re-run. The request now sends the module-level
    browser ``headers`` (previously it went out bare, inconsistent with
    ``fetch_soup``) and enforces a *timeout* so a stalled connection
    cannot hang the script.

    Parameters
    ----------
    url : str
        File URL; its last path component becomes the local file name.
    dir : str | pathlib.Path
        Destination directory, created if missing.
    timeout : float
        Seconds before the request is aborted.

    Returns
    -------
    pathlib.Path
        Path of the (possibly pre-existing) local file.

    Raises
    ------
    requests.HTTPError
        If the server responds with a 4xx/5xx status.
    """
    p = pathlib.Path(dir, pathlib.PurePath(url).name)
    p.parent.mkdir(parents=True, exist_ok=True)
    if not p.exists():
        # Same browser-like headers as fetch_soup, for consistency.
        r = requests.get(url, headers=headers, timeout=timeout)
        r.raise_for_status()
        p.write_bytes(r.content)
    return p
# Locate the newest daily stock-quote PDF linked from the statistics index page.
url = "https://www.jpx.co.jp/markets/statistics-equities/daily/index.html"
soup = fetch_soup(url)
# Raw string with an escaped dot: the original "\d" was an invalid escape
# sequence (SyntaxWarning on modern Python) and the bare "." before "pdf"
# matched any character instead of a literal period.
href = soup.find(
    href=re.compile(r"/markets/statistics-equities/daily/.*stq_\d{8}\.pdf$")
).get("href")
link = urljoin(url, href)
p = fetch_file(link)
# --- PDF conversion: inspect the first page to calibrate crop/table settings ---
import pandas as pd
import pdfplumber
pdf = pdfplumber.open(p)
page = pdf.pages[0]
# Visual check of the page layout (bare `im` renders the image in a notebook).
im = page.to_image()
im
# Draw horizontal guide lines at y=307 and y=750 to verify the crop band.
im.reset().draw_hlines([307, 750])
im.save("pdf.png", format="PNG")
# pdfplumber table-extraction settings for the quote pages.
table_settings = {
    # Column boundaries are supplied explicitly rather than auto-detected.
    "vertical_strategy": "explicit",
    # x-coordinates (PDF points) of the 17 column separators, calibrated
    # against the debug image rendered above.
    "explicit_vertical_lines": [
        72, 113, 153, 283, 343, 403, 463, 523, 583,
        643, 703, 763, 828, 893, 958, 1038, 1119,
    ],
    # Row boundaries are inferred from the text layout.
    "horizontal_strategy": "text",
}
# Overlay the detected table grid on the page image to verify the settings.
im.reset().debug_tablefinder(table_settings)
# Extract the chapter-1 ("1-x") quote tables from every page until a
# different chapter begins.
from tqdm.notebook import tqdm
with pdfplumber.open(p) as pdf:
    dfs = []
    # The first page has a taller header, so its crop band starts lower.
    top, bottom = 307, 750
    for page in tqdm(pdf.pages):
        # Second word on the page is the chapter label, e.g. "1-1".
        section = page.extract_words()[1]["text"].strip()
        if not section.startswith("1-"):
            break  # left chapter 1 — stop scanning
        # Crop to the table band, then extract cells with the explicit columns.
        band = page.within_bbox((0, top, page.width, bottom))
        dfs.append(pd.DataFrame(band.extract_table(table_settings)))
        # Subsequent pages have a shorter header.
        top, bottom = 137, 750
# Stack all per-page tables into one frame.
df0 = pd.concat(dfs).reset_index(drop=True)
# Empty strings -> NaN so sparse header/footer rows can be dropped.
df0.mask(df0 == "", inplace=True)
# Keep only rows with at least 15 non-NaN cells (i.e. real quote rows).
df1 = df0.dropna(thresh=15).copy().reset_index(drop=True)
# "-" marks "no trade" in the PDF; treat it as missing.
df1.mask(df1 == "-", inplace=True)
# Japanese column names: code, trading unit, issue name, AM open/high/low/close,
# PM open/high/low/close, final quote, change, VWAP, volume, value traded.
df1.columns = df1.columns.map(
{
0: "コード",
1: "売買単位",
2: "銘柄名",
3: "午前_始値",
4: "午前_高値",
5: "午前_安値",
6: "午前_終値",
7: "午後_始値",
8: "午後_高値",
9: "午後_安値",
10: "午後_終値",
11: "最終気配",
12: "前日比",
13: "売買高加重平均価格",
14: "売買高",
15: "売買代金",
}
)
# Trading unit must be numeric; rows where it is not (section headings,
# alphabet-coded issues) are filtered out below.
df1["unit"] = pd.to_numeric(df1["売買単位"].str.replace(",", ""), errors="coerce")
df2 = df1.dropna(subset=["unit"]).copy().reset_index(drop=True)
# Notebook sanity checks (no effect when run as a script).
df2.head(30)
df2.tail(30)
# Split the final-quote column into a special-quote flag ("カ"/"ウ") and the price.
df2[["特別気配", "最終気配"]] = df2["最終気配"].str.extract("([カウ])?([0-9,.]+)").dropna(how="all")
# Price/volume columns to convert from comma-formatted strings to floats.
columns = [
    "午前_始値",
    "午前_高値",
    "午前_安値",
    "午前_終値",
    "午後_始値",
    "午後_高値",
    "午後_安値",
    "午後_終値",
    "最終気配",
    "前日比",
    "売買高加重平均価格",
    "売買高",
    "売買代金",
]
for col in columns:
    series = df2[col]
    # Strip thousands separators on present values; NaN cells stay NaN.
    stripped = series.where(series.isna(), series.astype(str).str.replace(",", ""))
    df2[col] = stripped.astype(float)
# Notebook check: rows whose code is not purely numeric (should be none left).
df2[~df2["コード"].str.isnumeric()]
# BOM-prefixed UTF-8 so Excel opens the Japanese headers correctly.
df2.to_csv("result.csv", encoding="utf_8_sig")