LoginSignup
0
1

scrapyを用いて投資信託の情報をスクレイピングするプログラムver_2

Last updated at Posted at 2024-05-13

メインの処理

任意のスパイダーに対して下記のプログラムを書く

minkabu_shintaku.py
import scrapy
from kabu_suku.items import minkabuItem
import pandas as pd


class MinkabuShintakuSpider(scrapy.Spider):
    """Scrape the Minkabu investment-trust popularity ranking (30-day period).

    For each ranking page, extracts rank, management company, fund name,
    fund URL, NAV (benchmark price), return and trust fee for the 20 listed
    funds, accumulates everything in a pandas DataFrame, and writes
    ``minkabu_ranking.csv`` when the spider closes.
    """

    name = "minkabu_shintaku"
    allowed_domains = ["itf.minkabu.jp"]
    start_urls = ["https://itf.minkabu.jp/ranking/popular?period=30"]

    # Absolute XPath to one row of the ranking table; {row} is interpolated.
    # Hoisted here so the per-column expressions below stay readable.
    ROW_XPATH = ('/html/body/div[2]/div[6]/div[1]/div[3]/div[1]/div'
                 '/div[3]/div[1]/table/tbody/tr[{row}]')

    def __init__(self, *args, **kwargs):
        # Bug fix: the original omitted super().__init__(), skipping Scrapy's
        # Spider initialization (kwargs handling, logger, etc.).
        super().__init__(*args, **kwargs)
        self.items_df = pd.DataFrame()  # accumulates rows across all pages

    def parse(self, response):
        """Parse one ranking page: extract its 20 rows, clean the text
        columns, append to the running DataFrame, then queue next pages."""
        items = []
        for row in range(1, 21):
            base = self.ROW_XPATH.format(row=row)
            fund_href = response.xpath(base + '/td[2]/div[2]/a/@href').get()
            items.append({
                'minkabu_rank': response.xpath(base + '/td[1]/div[1]/span[1]/text()').get(),
                'minkabu_company': response.xpath(base + '/td[2]/div[1]/text()').get(),
                # Bug fix: column key was misspelled 'mimnakbu_fund_name'.
                'minkabu_fund_name': response.xpath(base + '/td[2]/div[2]/a/text()').get(),
                # Bug fix: use urljoin instead of string concatenation, which
                # produced 'https://itf.minkabu.jp//...' for root-relative hrefs.
                'minkabu_fund_url': response.urljoin(fund_href) if fund_href else None,
                'minkabu_bench_price': response.xpath(base + '/td[3]/div[1]/text()').get(),
                'minkabu_return(%)': response.xpath(base + '/td[4]/span/text()').get(),
                'minkabu_trust_fee(%)': response.xpath(base + '/td[5]/span/text()').get(),
            })
        new_df = pd.DataFrame(items)
        # Bug fix: the original chained plain Series.replace() after
        # .str.strip(); Series.replace only substitutes values that are
        # EXACTLY '\uff0d' (full-width minus), never substrings. Use
        # .str.replace(..., regex=False) to substitute inside the strings.
        new_df['minkabu_company'] = (new_df['minkabu_company']
                                     .str.strip()
                                     .str.replace('\uff0d', '-', regex=False))
        new_df['minkabu_bench_price'] = (new_df['minkabu_bench_price']
                                         .str.replace(',', '', regex=False)
                                         .str.replace('\uff0d', '-', regex=False))
        new_df['minkabu_return(%)'] = (new_df['minkabu_return(%)']
                                       .str.replace('%', '', regex=False)
                                       .str.replace('\uff0d', '-', regex=False))
        new_df['minkabu_trust_fee(%)'] = (new_df['minkabu_trust_fee(%)']
                                          .str.replace('%', '', regex=False)
                                          .str.replace('\uff0d', '-', regex=False))
        print(new_df)
        self.items_df = pd.concat([self.items_df, new_df], ignore_index=True)

        # Pagination: the "next" link moves between <li> positions as the
        # pager grows (5, then 7, then 9). Duplicate indices in the original
        # list produced duplicate requests (filtered by Scrapy's dupe filter
        # anyway); dict.fromkeys dedupes while preserving order.
        for li_index in dict.fromkeys([5, 7, 9, 9, 9, 9]):
            next_page = response.xpath(
                f'/html/body/div[2]/div[6]/div[1]/div[3]/div[2]/ul/li[{li_index}]/a/@href'
            ).get()
            if next_page is not None:
                yield scrapy.Request(response.urljoin(next_page), callback=self.parse)

    def closed(self, reason):
        """Write the accumulated ranking to CSV when the spider shuts down."""
        self.items_df.to_csv('minkabu_ranking.csv', index=False)
        print('スクレイピングが完了しました。')


取得したいデータセットの処理

items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class KabuSukuItem(scrapy.Item):
    """Generic item holding a scraped page title and its URL."""

    # Page/article titles collected by a spider.
    titles = scrapy.Field()
    # Corresponding URLs.
    urls = scrapy.Field()

class weatherItem(scrapy.Item):
    """Item for fund detail pages and a fund-ranking listing.

    NOTE(review): the class name is lowercase and several field names look
    misspelled (`wealth_wealth_bemchmark`, `rank_fuand_name`); they are kept
    as-is because Scrapy Field names are the item's public interface —
    confirm no spider depends on them before renaming.
    """

    # --- fund detail page ---
    wealth_fund_name = scrapy.Field()        # fund name
    wealth_fund_company = scrapy.Field()     # management company
    wealth_url = scrapy.Field()              # fund page URL
    wealth_wealth_category = scrapy.Field()  # fund category
    wealth_wealth_rating = scrapy.Field()    # rating
    wealth_wealth_bemchmark = scrapy.Field() # benchmark (sic)
    wealth_trust_fee = scrapy.Field()        # trust fee
    wealth_three_years_return = scrapy.Field()  # 3-year return
    wealth_aum = scrapy.Field()              # assets under management

    # --- fund ranking listing ---
    rank = scrapy.Field()                # rank position
    rank_fuand_name = scrapy.Field()     # fund name (sic)
    rank_fund_url = scrapy.Field()       # fund page URL
    rank_fund_company = scrapy.Field()   # management company
    rank_fund_category = scrapy.Field()  # fund category
    rank_fund_return = scrapy.Field()    # return
    rank_fund_aum = scrapy.Field()       # assets under management

class minkabuItem(scrapy.Item):
    """Item for one row of the Minkabu investment-trust ranking.

    NOTE(review): the spider builds plain dicts with key
    'mimnakbu_fund_name' while this Item declares 'minakbu_fund_name' —
    neither spelling matches the other; align them before wiring the
    spider to yield this Item.
    """

    # Rank position on Minkabu.
    minkabu_rank = scrapy.Field()
    # Management company name.
    minkabu_company = scrapy.Field()
    # Fund name (field name misspelled; see class note).
    minakbu_fund_name = scrapy.Field()
    # URL of the fund's detail page.
    minkabu_fund_url = scrapy.Field()
    # NAV / benchmark price.
    minkabu_bench_price = scrapy.Field()
    # Return (yield).
    minkabu_return = scrapy.Field()
    # Trust fee.
    minkabu_trust_fee = scrapy.Field()
0
1
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
0
1