edgartoolsを使って米国の財務諸表を取得する

Python

Last updated at 2024-09-28 / Posted at 2024-09-28

米国では、SEC（米国証券取引委員会）が運営するEDGARというシステムで、上場企業の財務諸表を含む開示書類が公開されています。

以下は、Apple Inc.（AAPL）の財務情報を取得し、保存するためのコードです。このコードは、edgartoolsライブラリを使って最新の財務データ（バランスシート、損益計算書、キャッシュフロー計算書）を取得し、それぞれをCSVファイルに保存します。

from edgar import *

# Register the e-mail address used when querying the SEC.
set_identity("your_email@example.com")

# Look up Apple and load its most recent 10-K filing as a data object.
company = Company("AAPL")
tenk = company.get_filings(form="10-K").latest(1).obj()

# One row per statement to export:
# (attribute on tenk.financials, output CSV path, title used in the log line)
statement_exports = (
    ("balance_sheet", "balance_sheet_aapl.csv", "Balance Sheet"),
    ("income_statement", "income_statement_aapl.csv", "Income Statement"),
    ("cash_flow_statement", "cash_flow_statement_aapl.csv", "Cash Flow Statement"),
)

# Fetch each statement, convert it to a DataFrame and save it as CSV.
# NOTE(review): index=False omits the DataFrame index from the file; if
# get_dataframe() keeps the line-item labels in the index they will be
# missing from the CSV -- confirm against the installed edgartools version.
for attr_name, csv_path, title in statement_exports:
    statement = getattr(tenk.financials, attr_name)
    statement.get_dataframe().to_csv(csv_path, index=False)
    print(f"{title} saved as '{csv_path}'")

下記のコードは、複数企業の最新の10-K（日本の有価証券報告書に相当する年次報告書）から「Item 1. Business」セクション以降のHTML要素を抽出し、企業ごとにHTMLファイルとして保存します。

import re
from edgar import *
from bs4 import BeautifulSoup

# Register the e-mail address used when querying the SEC.
set_identity("your_email@example.com")

# Ticker symbols of the companies to process.
companies = ["AAPL", "GOOGL", "MSFT", "TSLA"]

# Regular expressions that recognise the section heading text.
section_patterns = [
    re.compile(r"Item\s*1[\.\s]*Business", re.IGNORECASE),
    # Add further heading variants here.
]

# Element ids that are tried first when locating the section.
section_ids = [
    "item_1_business",
    # Add other ids as needed.
]

# An element whose text *starts* with one of these ends the extraction.
end_section_patterns = [
    re.compile(r"Item\s*1A[\.\s]*", re.IGNORECASE),
    re.compile(r"Item\s*2[\.\s]*", re.IGNORECASE),
    # Other end-of-section patterns.
]

# Maximum number of elements after the heading that are scanned per filing.
max_elements = 1000


def _find_item1_tag(soup):
    """Return the tag that marks "Item 1. Business" in *soup*, or None.

    Three strategies are tried in order: a lookup by the ids in
    ``section_ids``, a text search with ``section_patterns``, and finally a
    search for a heading whose text is split across two text nodes.
    """
    # 1. Look for an element carrying one of the known ids.
    for sec_id in section_ids:
        tag = soup.find(id=sec_id)
        if tag:
            return tag

    # 2. Look for a text node matching a heading pattern and return the tag
    #    that contains it (e.g. the <span> wrapping the text).
    for pattern in section_patterns:
        text_node = soup.find(string=lambda text: text and pattern.search(text))
        if text_node:
            return text_node.parent

    # 3. Heading split across nodes: a text node matching "Item 1" followed
    #    by "B", with some later text node containing "usiness".
    candidates = soup.find_all(
        string=lambda text: text and re.search(r"Item\s*1[\.\s]*B", text, re.IGNORECASE)
    )
    for text_node in candidates:
        if text_node.find_next(string=lambda text: text and "usiness" in text):
            return text_node.parent

    return None


def _write_section(item1_tag, file_name):
    """Write the HTML of the elements that follow *item1_tag* to *file_name*.

    At most ``max_elements`` following elements are scanned, and scanning
    stops early at the first element whose text starts with one of
    ``end_section_patterns``. Each piece of markup is written once.
    """
    following = item1_tag.find_all_next()

    # find_all_next() returns nested elements as well as their containers.
    # Writing both would emit the same content once per nesting level, so
    # remember which elements are already covered (by identity) and skip
    # everything located inside one of them.
    covered_ids = set()
    # HTML strings already written, to drop exact repeats of sibling markup.
    seen_html = set()

    with open(file_name, 'w', encoding='utf-8') as f:
        for element in following[:max_elements]:
            # Detect the end of the section.
            text = element.get_text(strip=True)
            if any(pattern.match(text) for pattern in end_section_patterns):
                break

            # Content of this element was already emitted with an ancestor.
            if any(id(ancestor) in covered_ids for ancestor in element.parents):
                continue

            element_html = str(element)
            if element_html not in seen_html:
                f.write(f"{element_html}\n\n")
                seen_html.add(element_html)
            # Mark as covered even when it was an exact repeat, so that its
            # children are not written separately either.
            covered_ids.add(id(element))


# Fetch the latest 10-K of every company and save the extracted section.
for ticker in companies:
    # Best effort per company: any failure is reported and the loop moves
    # on to the next ticker.
    try:
        company = Company(ticker)

        # Latest 10-K filing and its HTML content.
        filing = company.get_filings(form="10-K").latest(1)
        html_content = filing.html()

        # Parse the HTML and locate the section heading.
        soup = BeautifulSoup(html_content, 'lxml')
        item1_tag = _find_item1_tag(soup)

        if item1_tag:
            file_name = f'extracted_elements_{ticker}.html'
            _write_section(item1_tag, file_name)
            print(f"{ticker} のデータが {file_name} に保存されました。")
        else:
            print(f"{ticker} の 'Item 1. Business' セクションが見つかりませんでした。")

    except Exception as e:
        print(f"{ticker} のデータを取得中にエラーが発生しました: {e}")

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up