LoginSignup
0
1

scrapyを用いて投資信託の情報をスクレイピングするプログラムver_2

Last updated at Posted at 2024-05-13

メインの処理

任意のスパイダーに対して下記のプログラムを書く

minkabu_shintaku.py
import scrapy
from kabu_suku.items import minkabuItem
import pandas as pd


class MinkabuShintakuSpider(scrapy.Spider):
    """Scrape the Minkabu investment-trust popularity ranking (30-day period).

    For each ranking page, extracts rank, management company, fund name,
    fund URL, NAV (benchmark price), return and trust fee for the 20 listed
    funds, accumulates everything in a pandas DataFrame, and writes
    ``minkabu_ranking.csv`` when the spider closes.
    """

    name = "minkabu_shintaku"
    allowed_domains = ["itf.minkabu.jp"]
    start_urls = ["https://itf.minkabu.jp/ranking/popular?period=30"]

    # Absolute XPath to one row of the ranking table; {row} is interpolated.
    # Hoisted here so the per-column expressions below stay readable.
    ROW_XPATH = ('/html/body/div[2]/div[6]/div[1]/div[3]/div[1]/div'
                 '/div[3]/div[1]/table/tbody/tr[{row}]')

    def __init__(self, *args, **kwargs):
        # Bug fix: the original omitted super().__init__(), skipping Scrapy's
        # Spider initialization (kwargs handling, logger, etc.).
        super().__init__(*args, **kwargs)
        self.items_df = pd.DataFrame()  # accumulates rows across all pages

    def parse(self, response):
        """Parse one ranking page: extract its 20 rows, clean the text
        columns, append to the running DataFrame, then queue next pages."""
        items = []
        for row in range(1, 21):
            base = self.ROW_XPATH.format(row=row)
            fund_href = response.xpath(base + '/td[2]/div[2]/a/@href').get()
            items.append({
                'minkabu_rank': response.xpath(base + '/td[1]/div[1]/span[1]/text()').get(),
                'minkabu_company': response.xpath(base + '/td[2]/div[1]/text()').get(),
                # Bug fix: column key was misspelled 'mimnakbu_fund_name'.
                'minkabu_fund_name': response.xpath(base + '/td[2]/div[2]/a/text()').get(),
                # Bug fix: use urljoin instead of string concatenation, which
                # produced 'https://itf.minkabu.jp//...' for root-relative hrefs.
                'minkabu_fund_url': response.urljoin(fund_href) if fund_href else None,
                'minkabu_bench_price': response.xpath(base + '/td[3]/div[1]/text()').get(),
                'minkabu_return(%)': response.xpath(base + '/td[4]/span/text()').get(),
                'minkabu_trust_fee(%)': response.xpath(base + '/td[5]/span/text()').get(),
            })
        new_df = pd.DataFrame(items)
        # Bug fix: the original chained plain Series.replace() after
        # .str.strip(); Series.replace only substitutes values that are
        # EXACTLY '\uff0d' (full-width minus), never substrings. Use
        # .str.replace(..., regex=False) to substitute inside the strings.
        new_df['minkabu_company'] = (new_df['minkabu_company']
                                     .str.strip()
                                     .str.replace('\uff0d', '-', regex=False))
        new_df['minkabu_bench_price'] = (new_df['minkabu_bench_price']
                                         .str.replace(',', '', regex=False)
                                         .str.replace('\uff0d', '-', regex=False))
        new_df['minkabu_return(%)'] = (new_df['minkabu_return(%)']
                                       .str.replace('%', '', regex=False)
                                       .str.replace('\uff0d', '-', regex=False))
        new_df['minkabu_trust_fee(%)'] = (new_df['minkabu_trust_fee(%)']
                                          .str.replace('%', '', regex=False)
                                          .str.replace('\uff0d', '-', regex=False))
        print(new_df)
        self.items_df = pd.concat([self.items_df, new_df], ignore_index=True)

        # Pagination: the "next" link moves between <li> positions as the
        # pager grows (5, then 7, then 9). Duplicate indices in the original
        # list produced duplicate requests (filtered by Scrapy's dupe filter
        # anyway); dict.fromkeys dedupes while preserving order.
        for li_index in dict.fromkeys([5, 7, 9, 9, 9, 9]):
            next_page = response.xpath(
                f'/html/body/div[2]/div[6]/div[1]/div[3]/div[2]/ul/li[{li_index}]/a/@href'
            ).get()
            if next_page is not None:
                yield scrapy.Request(response.urljoin(next_page), callback=self.parse)

    def closed(self, reason):
        """Write the accumulated ranking to CSV when the spider shuts down."""
        self.items_df.to_csv('minkabu_ranking.csv', index=False)
        print('スクレイピングが完了しました。')


取得したいデータセットの処理

items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class KabuSukuItem(scrapy.Item):
    """Generic item holding a scraped page title and its URL."""

    # Page/article titles collected by a spider.
    titles = scrapy.Field()
    # Corresponding URLs.
    urls = scrapy.Field()

class weatherItem(scrapy.Item):
    """Item for fund detail pages and a fund-ranking listing.

    NOTE(review): the class name is lowercase and several field names look
    misspelled (`wealth_wealth_bemchmark`, `rank_fuand_name`); they are kept
    as-is because Scrapy Field names are the item's public interface —
    confirm no spider depends on them before renaming.
    """

    # --- fund detail page ---
    wealth_fund_name = scrapy.Field()        # fund name
    wealth_fund_company = scrapy.Field()     # management company
    wealth_url = scrapy.Field()              # fund page URL
    wealth_wealth_category = scrapy.Field()  # fund category
    wealth_wealth_rating = scrapy.Field()    # rating
    wealth_wealth_bemchmark = scrapy.Field() # benchmark (sic)
    wealth_trust_fee = scrapy.Field()        # trust fee
    wealth_three_years_return = scrapy.Field()  # 3-year return
    wealth_aum = scrapy.Field()              # assets under management

    # --- fund ranking listing ---
    rank = scrapy.Field()                # rank position
    rank_fuand_name = scrapy.Field()     # fund name (sic)
    rank_fund_url = scrapy.Field()       # fund page URL
    rank_fund_company = scrapy.Field()   # management company
    rank_fund_category = scrapy.Field()  # fund category
    rank_fund_return = scrapy.Field()    # return
    rank_fund_aum = scrapy.Field()       # assets under management

class minkabuItem(scrapy.Item):
    """Item for one row of the Minkabu investment-trust ranking.

    NOTE(review): the spider builds plain dicts with key
    'mimnakbu_fund_name' while this Item declares 'minakbu_fund_name' —
    neither spelling matches the other; align them before wiring the
    spider to yield this Item.
    """

    # Rank position on Minkabu.
    minkabu_rank = scrapy.Field()
    # Management company name.
    minkabu_company = scrapy.Field()
    # Fund name (field name misspelled; see class note).
    minakbu_fund_name = scrapy.Field()
    # URL of the fund's detail page.
    minkabu_fund_url = scrapy.Field()
    # NAV / benchmark price.
    minkabu_bench_price = scrapy.Field()
    # Return (yield).
    minkabu_return = scrapy.Field()
    # Trust fee.
    minkabu_trust_fee = scrapy.Field()
0
1
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
0
1