メインの処理
任意のスパイダーにたいして下記のプログラムを書く
minkabu_shintaku.py
import scrapy
from kabu_suku.items import minkabuItem
import pandas as pd
class MinkabuShintakuSpider(scrapy.Spider):
    """Scrape the Minkabu investment-trust popularity ranking into a CSV file.

    Each ranking page is parsed into a pandas DataFrame, cleaned, and appended
    to ``self.items_df``. When the spider closes, the accumulated frame is
    written to ``minkabu_ranking.csv``.
    """

    name = "minkabu_shintaku"
    allowed_domains = ["itf.minkabu.jp"]
    start_urls = ["https://itf.minkabu.jp/ranking/popular?period=30"]

    # Absolute XPath of one row of the ranking table; ``{row}`` is 1-based.
    _ROW_XPATH = (
        "/html/body/div[2]/div[6]/div[1]/div[3]/div[1]/div/div[3]/div[1]"
        "/table/tbody/tr[{row}]"
    )
    # Absolute XPath of the link in the ``{pos}``-th entry of the pager list.
    _PAGER_XPATH = (
        "/html/body/div[2]/div[6]/div[1]/div[3]/div[2]/ul/li[{pos}]/a/@href"
    )
    # Number of table rows probed on every page.
    _ROWS_PER_PAGE = 20
    # The position of the "next page" entry in the pager shifts as paging
    # advances, so every candidate position is tried on every page.
    _NEXT_LINK_POSITIONS = (5, 7, 9)
    # Output column -> XPath relative to a table row. Insertion order is the
    # CSV column order.
    # NOTE: 'mimnakbu_fund_name' is misspelled but kept as-is because it is
    # the column name written to the CSV.
    _FIELD_XPATHS = {
        'minkabu_rank': '/td[1]/div[1]/span[1]/text()',
        'minkabu_company': '/td[2]/div[1]/text()',
        'mimnakbu_fund_name': '/td[2]/div[2]/a/text()',
        'minkabu_fund_url': '/td[2]/div[2]/a/@href',
        'minkabu_bench_price': '/td[3]/div[1]/text()',
        'minkabu_return(%)': '/td[4]/span/text()',
        'minkabu_trust_fee(%)': '/td[5]/span/text()',
    }

    def __init__(self, *args, **kwargs):
        """Initialise the spider with an empty result frame.

        Positional and keyword arguments are forwarded to ``scrapy.Spider``
        so that spider arguments (``scrapy crawl ... -a key=value``) keep
        working.
        """
        super().__init__(*args, **kwargs)
        self.items_df = pd.DataFrame()  # accumulates the rows of every page

    def parse(self, response):
        """Extract the ranking rows of one page and follow the pager links.

        Args:
            response: The Scrapy response of a ranking page.

        Yields:
            scrapy.Request: One request per pager link found at the
            candidate positions, handled again by this method.
        """
        items = []
        for row in range(1, self._ROWS_PER_PAGE + 1):
            row_xpath = self._ROW_XPATH.format(row=row)
            record = {
                column: response.xpath(row_xpath + suffix).extract_first()
                for column, suffix in self._FIELD_XPATHS.items()
            }
            # A row that does not exist yields only None values; skipping it
            # avoids blank lines in the CSV.
            if any(value is not None for value in record.values()):
                items.append(record)

        if items:
            new_df = pd.DataFrame(items, columns=list(self._FIELD_XPATHS))
            # Normalise the full-width hyphen-minus (U+FF0D) to ASCII '-'
            # in the same way for every cleaned text column.
            new_df['minkabu_company'] = (
                new_df['minkabu_company']
                .str.strip()
                .str.replace('\uff0d', '-', regex=False)
            )
            # Strip leading slashes so that a root-relative href does not
            # produce a double slash after the host.
            new_df["minkabu_fund_url"] = (
                "https://itf.minkabu.jp/"
                + new_df["minkabu_fund_url"].str.lstrip("/")
            )
            new_df["minkabu_bench_price"] = (
                new_df["minkabu_bench_price"]
                .str.replace(",", "", regex=False)
                .str.replace('\uff0d', '-', regex=False)
            )
            new_df["minkabu_return(%)"] = (
                new_df["minkabu_return(%)"]
                .str.replace("%", "", regex=False)
                .str.replace('\uff0d', '-', regex=False)
            )
            new_df["minkabu_trust_fee(%)"] = (
                new_df["minkabu_trust_fee(%)"]
                .str.replace("%", "", regex=False)
                .str.replace('\uff0d', '-', regex=False)
            )
            print(new_df)
            self.items_df = pd.concat(
                [self.items_df, new_df], ignore_index=True
            )

        # Requests for URLs that were already scheduled are expected to be
        # dropped by Scrapy's duplicate filter (default settings assumed).
        for pos in self._NEXT_LINK_POSITIONS:
            next_page = response.xpath(
                self._PAGER_XPATH.format(pos=pos)
            ).extract_first()
            if next_page is not None:
                yield scrapy.Request(
                    response.urljoin(next_page), callback=self.parse
                )

    def closed(self, reason):
        """Write the accumulated ranking to ``minkabu_ranking.csv``.

        Args:
            reason: The close reason passed in by Scrapy (unused).
        """
        self.items_df.to_csv('minkabu_ranking.csv', index=False)
        print('スクレイピングが完了しました。')
取得したいデータセットの処理
items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class KabuSukuItem(scrapy.Item):
    """Scrapy item declaring the ``titles`` and ``urls`` fields."""

    titles = scrapy.Field()
    urls = scrapy.Field()
class weatherItem(scrapy.Item):
    """Scrapy item declaring the ``wealth_*`` and ``rank*`` fund fields.

    Field names are the item's keys and are therefore kept exactly as
    declared, including their original spelling.
    """

    # Per-fund fields (``wealth_`` prefix).
    wealth_fund_name = scrapy.Field()
    wealth_fund_company = scrapy.Field()
    wealth_url = scrapy.Field()
    wealth_wealth_category = scrapy.Field()
    wealth_wealth_rating = scrapy.Field()
    wealth_wealth_bemchmark = scrapy.Field()
    wealth_trust_fee = scrapy.Field()
    wealth_three_years_return = scrapy.Field()
    wealth_aum = scrapy.Field()

    # Fields for the fund-ranking scrape.
    rank = scrapy.Field()
    rank_fuand_name = scrapy.Field()
    rank_fund_url = scrapy.Field()
    rank_fund_company = scrapy.Field()
    rank_fund_category = scrapy.Field()
    rank_fund_return = scrapy.Field()
    rank_fund_aum = scrapy.Field()
class minkabuItem(scrapy.Item):
    """Scrapy item declaring the fields of one Minkabu fund-ranking entry.

    Field names are the item's keys and are kept exactly as declared,
    including the spelling of ``minakbu_fund_name``.
    """

    minkabu_rank = scrapy.Field()         # rank on Minkabu
    minkabu_company = scrapy.Field()      # company name on Minkabu
    minakbu_fund_name = scrapy.Field()    # fund name on Minkabu
    minkabu_fund_url = scrapy.Field()     # fund URL on Minkabu
    minkabu_bench_price = scrapy.Field()  # base price on Minkabu
    minkabu_return = scrapy.Field()       # return (yield) on Minkabu
    minkabu_trust_fee = scrapy.Field()    # trust fee on Minkabu