
How to download timely disclosures from TDnet

Posted at 2023-03-13

This is a script that downloads timely disclosures (適時開示) in PDF format. It uses the API provided by やのしん (yanoshin). If you use it, please be considerate and avoid putting unnecessary load on the server.

Note
This API is run by an individual, so it may be taken down without notice.
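
For reference, here is a minimal sketch of the call this script makes to the API (the endpoint format and the fields read here are taken from main.py below; the date range and limit are arbitrary example values):

import requests

res = requests.get("https://webapi.yanoshin.jp/webapi/tdnet/list/20230201-20230301.json?limit=3")
for item in res.json()["items"]:
    tdnet = item["Tdnet"]
    print(tdnet["pubdate"], tdnet["company_code"], tdnet["document_url"])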

An explanation of the code follows.
It also runs on Google Colaboratory with just main.py, but the download takes around 30 hours, so it will most likely time out unless you are on Colab Pro. Set the upper limit on the number of items fetched via the limit variable in the run method.
The required libraries are:

requests
pypdf
schedule
pycryptodome
tqdm
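
They can all be installed in one go with pip (pycryptodome is presumably listed because some disclosure PDFs are encrypted and pypdf can use it for decryption):

pip install requests pypdf schedule pycryptodome tqdm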

This code is due to be revised; updates will be published on GitHub.

main.py
import time
import requests
import json
import random
import os
from pypdf import PdfReader
import schedule
from tqdm import tqdm
import logging

logging.basicConfig(filename='download.log', level=logging.INFO)

class TdnetDownloader:
    def __init__(self, max_retries=5):
        self.max_retries = max_retries

    def get_url(self, URL):
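        # Fetch the disclosure list from the API and parse the JSON body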
        res = requests.get(URL)
        response_loads = json.loads(res.content)
        data_list = [data["Tdnet"] for data in response_loads["items"]]

        # Extract pubdate, company_code, and document_url into dicts and collect them in a list
        extracted_data = [
            {
                "pubdate": item["pubdate"],
                "company_code": item["company_code"],
                "document_url": item["document_url"]
            } for item in data_list]

        # Save the extracted data
        with open("path_to_extracted_data.json", "w") as f:
            json.dump(extracted_data, f, indent=4)

        return extracted_data
    
    def download_pdf(self, data):
        os.makedirs("rawdata", exist_ok=True)  # the output directory must exist before files are written
        total = len(data)
        progress_bar = tqdm(total=total, desc='Downloading PDFs', unit='file')
        failed_downloads = []

        for d in data:
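            # TDnet company codes are 5 digits; keeping only those ending in "0"
            # appears to restrict the set to ordinary listed shares (assumption)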
            if d["company_code"][-1] == "0":
                retries = 0
                success = False

                while retries < self.max_retries and not success:
                    try:
                        pdf = requests.get(d["document_url"])
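                        # pause 6-15 s between requests to keep the load on the API low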
                        time.sleep(random.randint(6, 15))

                        date = d["pubdate"].split(" ")
                        name = date[0] + "_" + d["company_code"]
                        file_path = f"rawdata/{name}.pdf"

                        with open(file_path, "wb") as f:
                            f.write(pdf.content)

                        if self.validate_pdf(file_path):
                            success = True
                            progress_bar.update(1)
                            logging.info(f"Downloaded {file_path}.")
                        else:
                            retries += 1
                            os.remove(file_path)  # delete the corrupted downloaded file
                            logging.warning(f"Failed to validate {file_path}. Retrying download.")

                    except Exception as e:
                        logging.error(f"Failed to download {d['document_url']}: {e}")
                        retries += 1

                if not success:
                    failed_downloads.append(d)

        progress_bar.close()

        # Save the failed downloads
        if failed_downloads:
            with open("path_to_failed_downloads.json", "w") as f:
                json.dump(failed_downloads, f, indent=4)

        return failed_downloads

    def validate_pdf(self, file_path):
        try:
            with open(file_path, "rb") as f:
                pdf = PdfReader(f)
                num_pages = len(pdf.pages)
            return num_pages > 0
        except Exception as e:
            logging.error(f"Error validating PDF at {file_path}: {e}")
            return False

    def retry_failed_downloads(self):
        # Load the failed downloads
        if not os.path.exists("path_to_failed_downloads.json"):
            print("No failed downloads to retry.")
            return

        with open("path_to_failed_downloads.json", "r") as f:
            failed_downloads = json.load(f)

        # Remove the failed downloads file
        os.remove("path_to_failed_downloads.json")

        # Retry the downloads
        self.download_pdf(failed_downloads)

    def run(self):
        start_date = time.strftime("%Y%m%d", time.localtime(time.time() - 60 * 60 * 24 * 30))
        end_date = time.strftime("%Y%m%d", time.localtime(time.time()))

        # Set the limit of data to fetch
        limit = 10000

        URL = f"https://webapi.yanoshin.jp/webapi/tdnet/list/{start_date}-{end_date}.json?limit={limit}"
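        # (The API takes a YYYYMMDD-YYYYMMDD date range in the path plus a limit query parameter.)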

        # Fetch data from the URL
        data = self.get_url(URL)

        # Download PDF files
        self.download_pdf(data)

        # Retry failed downloads
        self.retry_failed_downloads()
"""
if __name__ == "__main__":
    downloader = TdnetDownloader(max_retries=5)
    downloader.run()
    
"""
downloader = TdnetDownloader(max_retries=5)
schedule.every().week.do(downloader.run())
#確認用
downloader.run()

while True:
    schedule.run_pending()
    time.sleep(1)
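
After a run, the PDFs accumulate under rawdata/ with names of the form <date>_<company_code>.pdf. As a minimal sketch of reading one back with pypdf (the file name here is a hypothetical example of that naming pattern):

from pypdf import PdfReader

reader = PdfReader("rawdata/2023-03-13_10000.pdf")  # hypothetical file name
print(len(reader.pages))               # number of pages
print(reader.pages[0].extract_text())  # text of the first page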