Checking a netkeiba race card means clicking through to each horse's page and comparing the data by hand, which is tedious, so I wrote a program that exports the information needed for predictions to CSV.
The code is messy, so there is probably room for improvement.
import re
import urllib.parse

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from tqdm import tqdm
from webdriver_manager.chrome import ChromeDriverManager
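# Overall flow:
#   1. Open the race-card page with Selenium and collect each horse's page URL.
#   2. Scrape every horse's past results and dump them to 中間.csv.
#   3. Re-shape the rows into a uniform layout and write 最終.csv.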
def main():
    site_url = input("データを取得したい出馬表のURLを入力してください(中央競馬のレースのみ対応):")
    browser = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    browser.implicitly_wait(10)  # wait up to 10 s when locating elements
    browser.get(site_url)
    link_list = make_horseURL(browser)
    browser.quit()

    # Scrape each horse's page and stack the results vertically
    df_RaceResult = pd.DataFrame()
    for url in tqdm(link_list):
        df_RaceResult = pd.concat([df_RaceResult, get_result(url)], axis=0)
    df_RaceResult.to_csv('./中間.csv', encoding='cp932', header=False, index=False, errors="ignore")
    # 中間.csv contains two row layouts; whether column 6 is empty
    # tells them apart, and each needs different columns removed.
    with open('./中間.csv', "r", encoding='cp932') as in_file, \
            open('./最終.csv', "w", encoding='cp932') as out_file:
        for line1 in in_file.readlines():
            line = line1.split(',')
            if line[6] != "":
                line.pop(12)
                line.pop(16)
                line.pop(16)  # popping index 16 twice removes two adjacent fields
                # Split a weight value like "480(+2)" into body weight and change
                weight = line[21]
                if weight != "":
                    horse_weight = re.split('[()]', weight)
                    line.pop(21)
                    line = line + horse_weight
            else:
                line.pop(6)
                line.pop(6)
                line.pop(13)
            # Pull the grade out of a race name such as "有馬記念(G1)"
            race_name = line[5]
            if '(' in race_name:
                grade_name = re.split('[()]', race_name)
                grade = grade_name[-2]
                line.insert(6, grade)
            else:
                line.insert(6, "")
            # The raw venue field embeds the track name; keep just the name
            race_track2 = line[2]
            for track in ("札幌", "函館", "新潟", "福島", "中山",
                          "東京", "中京", "京都", "阪神", "小倉"):
                if track in race_track2:
                    line[2] = track
                    break
            # Each row keeps the trailing newline from readlines(), so none is added
            out_file.write(','.join(line))
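# Each horse cell in the Shutuba_Table links to the horse's page on
# db.netkeiba.com; the same table also carries jockey, trainer and popup
# links, which are filtered out below by keywords in the URL path.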
def make_horseURL(browser):
    print('取得したHTMLからレース結果のURLを抽出します')
    html = browser.page_source.encode('utf-8')
    soup = BeautifulSoup(html, 'html.parser')
    table_data = soup.find(class_='Shutuba_Table')
    word_list = ['jockey', 'trainer', 'popup', 'bookmark']
    link_list = []
    for element in table_data.find_all('a'):
        url = element.get('href')
        link_url = urllib.parse.urljoin('https://db.netkeiba.com', url)
        # Keep the link only if its path contains none of the excluded keywords
        if not set(word_list) & set(link_url.split('/')):
            link_list.append(link_url)
    return link_list
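# get_result fetches one horse's page, reads the db_h_race_results table
# (the horse's past races) and keeps only the fields used for prediction.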
def get_result(url):
    res = requests.get(url)
    soup = BeautifulSoup(res.content, 'lxml')
    # The page title starts with the horse's name
    horse_name = soup.title.string.split(' ')[0]
    table = soup.find('table', attrs={'class': 'db_h_race_results'})
    rows = table.find_all('tr')
    # One past race per <tr>; rows[0] is the header, so skip it
    records = [row.text.split('\n') for row in rows[1:]]
    df = pd.DataFrame(records)
    # Field positions to keep out of the raw split
    keep = [1, 2, 3, 4, 5, 9, 10, 11, 12, 13, 14, 15, 16,
            18, 19, 20, 21, 22, 26, 27, 33, 34, 35, 36]
    result = df.iloc[:, keep].copy()
    result.insert(0, '馬名', horse_name)
    return result
if __name__ == "__main__":
    main()
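To sanity-check the output, 最終.csv can be read straight back into pandas. A minimal sketch, reusing the path and cp932 encoding the script writes with; header=None because no header row is written, and if the two row layouts end up with different field counts you may need names= or on_bad_lines to read it cleanly:

import pandas as pd

# Load the final CSV produced by the script above
df = pd.read_csv('./最終.csv', encoding='cp932', header=None)
print(df.head())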