Purpose and Requirements
- Fetch the list of US-listed companies (ticker symbols) and hold it as a pandas DataFrame (and dump it to CSV while we're at it)
- Identify each ticker's listing exchange (NYSE/NASDAQ/AMEX)
- Identify S&P 500 index constituents
- Identify Dow Jones Industrial Average constituents
- Identify ETF tickers (this one turned out to be unreliable, so it was dropped; see Notes below)
- Identify Japanese ADR tickers
- Identify Chinese ADR tickers
Data Sources
- https://www.nasdaq.com/screening/company-list.aspx
- http://en.wikipedia.org/wiki/List_of_S%26P_500_companies
- https://en.wikipedia.org/wiki/Dow_Jones_Industrial_Average
- https://en.wikipedia.org/wiki/List_of_American_exchange-traded_funds
- http://topforeignstocks.com/foreign-adrs-list/the-full-list-of-japanese-adrs/
- http://topforeignstocks.com/foreign-adrs-list/the-full-list-of-chinese-adrs/
Environment
- Python 3.6.4
- Anaconda (conda 4.4.10): all the libraries used here should already be included
- Windows 10
Notes
- The CSVs downloaded from the official NASDAQ site appear to be missing some ETF tickers. I am still considering a workaround; the sketch right after this list measures the gap.
- I am not very experienced at coding, so this is just something I pieced together from web searches until it worked. Comments and advice are welcome.
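A minimal diagnostic sketch for the first point, assuming the listing below is saved as getCompanyList.py and importable: it counts how many of the ETF tickers scraped from Wikipedia never appear in the downloaded exchange CSVs.

from getCompanyList import get_company_list, get_etf_list

# Tickers present in the three exchange CSVs
df_companies = get_company_list()
# ETF tickers scraped from the Wikipedia list
sr_etf = get_etf_list()
# ETF tickers that never appear in the downloaded CSVs
missing = sr_etf[~sr_etf.isin(df_companies.Symbol)]
print(len(missing), "of", len(sr_etf), "ETF tickers are absent from the CSVs")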
Source Code
Each of the individual get* functions works on its own, so you can import the module and use only the pieces you need (see the usage example after the listing).
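Most of the scraping below leans on pandas.read_html, which downloads a page and returns every HTML table it finds as a list of DataFrames; a quick sketch of the pattern:

import pandas as pd

# read_html returns one DataFrame per <table> element on the page
tables = pd.read_html('https://en.wikipedia.org/wiki/Dow_Jones_Industrial_Average')
print(len(tables))       # how many tables were found
print(tables[1].head())  # the constituents table used in get_dow_company_list()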
getCompanyList.py
"""
Get all US listing companies ticker symbols
"""
from io import StringIO
import requests
import pandas as pd
import bs4
import re
def get_company_list():
"""[Get the list of US listing companies (stock ticker symbols) from www.nasdaq.com]
Returns:
[pandas data frame]
Columns:
-Symbol: ex. AAPL
-Name
-LastSale
-MarketCap
-IPOyear
-Sector
-industry
-Summary Quote
-exchange
"""
target_url_nasdaq = 'https://www.nasdaq.com/screening/companies-by-name.aspx?letter=0&exchange=nasdaq&render=download'
target_url_nyse = 'https://www.nasdaq.com/screening/companies-by-name.aspx?letter=0&exchange=nyse&render=download'
target_url_amex = 'https://www.nasdaq.com/screening/companies-by-name.aspx?letter=0&exchange=amex&render=download'
    # Download the screener CSV for each of the three exchanges
    r_nasdaq = requests.get(target_url_nasdaq)
    r_nyse = requests.get(target_url_nyse)
    r_amex = requests.get(target_url_amex)
    df_companies_nasdaq = pd.read_csv(StringIO(r_nasdaq.text))
    # Cleanup: the CSV has a trailing empty column that pandas names
    # 'Unnamed: 8'; rename it to 'exchange' and fill in 'NASDAQ'
    df_companies_nasdaq = df_companies_nasdaq.rename(
        columns={'Unnamed: 8': 'exchange'})
    df_companies_nasdaq["exchange"] = 'NASDAQ'
    df_companies_nyse = pd.read_csv(StringIO(r_nyse.text))
    # Cleanup: rename the last column to 'exchange' and fill in 'NYSE'
    df_companies_nyse = df_companies_nyse.rename(
        columns={'Unnamed: 8': 'exchange'})
    df_companies_nyse["exchange"] = 'NYSE'
    df_companies_amex = pd.read_csv(StringIO(r_amex.text))
    # Cleanup: rename the last column to 'exchange' and fill in 'AMEX'
    df_companies_amex = df_companies_amex.rename(
        columns={'Unnamed: 8': 'exchange'})
    df_companies_amex["exchange"] = 'AMEX'
    # Combine the three exchanges into a single DataFrame
    df_companies = pd.concat(
        [df_companies_nasdaq, df_companies_nyse, df_companies_amex])
return df_companies
def get_sp500_company_list():
"""[Get the list of s&p 500 companies (stock ticker symbols) from wikipedia.com]
Returns:
[pandas series]
"""
target_url_wikipedia = 'http://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
tables = pd.read_html(target_url_wikipedia)
    # The page contains two tables; use only the first
    df_companies_in_sp500 = tables[0]
    # Promote the first row to the header (columns)
    header = df_companies_in_sp500.iloc[0]
    df_companies_in_sp500 = df_companies_in_sp500[1:].rename(columns=header)
    # Extract the 'Ticker symbol' column as a Series and return it
    sr_companies_in_sp500 = df_companies_in_sp500['Ticker symbol']
return sr_companies_in_sp500
def get_dow_company_list():
"""[Get the list of Dow Jones Industrial Average 30 companies (stock ticker symbols) from wikipedia.com]
Returns:
[pandas series]
"""
target_url_wikipedia = 'https://en.wikipedia.org/wiki/Dow_Jones_Industrial_Average'
tables = pd.read_html(target_url_wikipedia)
    # The page contains several tables; use only the second one (i.e. [1])
    df_companies_in_dow = tables[1]
    # Promote the first row to the header (columns)
    header = df_companies_in_dow.iloc[0]
    df_companies_in_dow = df_companies_in_dow[1:].rename(columns=header)
    # Extract the 'Symbol' column as a Series and return it
    sr_companies_in_dow = df_companies_in_dow.Symbol
return sr_companies_in_dow
def get_etf_list():
"""[Get the list of ETF tickers from wikipedia.com]
Returns:
[pandas series]
"""
target_url_wikipedia = 'https://en.wikipedia.org/wiki/List_of_American_exchange-traded_funds'
res = requests.get(target_url_wikipedia)
res.raise_for_status()
    # Parse the raw HTML into a BeautifulSoup object
    soup = bs4.BeautifulSoup(res.text, "html.parser")
    # Extract the anchor elements whose href matches the ticker-quote pattern
    elems_nyse = soup.find_all("a", rel="nofollow", class_="external text",
                               href=re.compile("https://www.nyse.com/quote/"))
    elems_nasdaq = soup.find_all("a", rel="nofollow", class_="external text",
                                 href=re.compile("http://www.nasdaq.com/symbol/"))
elems = elems_nyse + elems_nasdaq
    # Collect the text (the ticker symbol) of each matched element
    elems_text = []
    for elem in elems:
        elems_text.append(elem.getText())
    # Turn the list into a Series and drop duplicate tickers
    sr_etf = pd.Series(elems_text)
    sr_etf = sr_etf.drop_duplicates()
return sr_etf
def get_jp_company_list():
"""[Get the list of Japanese companies (stock ticker symbols) from topforeignstocks.com]
Returns:
[pandas series]
"""
target_url = 'http://topforeignstocks.com/foreign-adrs-list/the-full-list-of-japanese-adrs/'
tables = pd.read_html(target_url)
    # The page contains several tables; use only the first one (i.e. [0])
    df_jp_company_list = tables[0]
    # Extract the 'Ticker' column as a Series and return it
    sr_jp_company_list = df_jp_company_list.Ticker
return sr_jp_company_list
def get_cn_company_list():
"""[Get the list of Chinese companies (stock ticker symbols) from topforeignstocks.com]
Returns:
[pandas series]
"""
target_url = 'http://topforeignstocks.com/foreign-adrs-list/the-full-list-of-chinese-adrs/'
tables = pd.read_html(target_url)
    # The page contains several tables; use only the first one (i.e. [0])
    df_cn_company_list = tables[0]
    # Extract the 'Ticker' column as a Series and return it
    sr_cn_company_list = df_cn_company_list.Ticker
return sr_cn_company_list
def add_sp500_col(arg):
"""[Add a column to the argument that shows wether the ticker is a element of s&p500 index]
Returns:
[pandas dataframe]
"""
    # Add an empty flag column (note: this also mutates the caller's DataFrame)
    arg['Is_SP500'] = ""
    df_with_sp500 = arg
    sr_companies_in_sp500 = get_sp500_company_list()
    # Mark every symbol that appears in the constituent list with 'Y'
    for elem in sr_companies_in_sp500:
        if elem in df_with_sp500.Symbol.values:
            df_with_sp500.loc[df_with_sp500.Symbol == elem, 'Is_SP500'] = 'Y'
return df_with_sp500
def add_dow_col(arg):
"""[Add a column to the argument that shows wether the ticker is a element of Dow]
Returns:
[pandas dataframe]
"""
arg['Is_Dow'] = ""
df_with_dow = arg
sr_companies_in_dow = get_dow_company_list()
for elem in sr_companies_in_dow:
if elem in df_with_dow.Symbol.values:
df_with_dow.loc[df_with_dow.Symbol == elem, 'Is_Dow'] = 'Y'
return df_with_dow
def add_etf_col(arg):
"""[Add a column to the argument that shows wether the ticker is an etf]
Returns:
[pandas dataframe]
"""
arg['Is_etf'] = ""
df_with_etf = arg
sr_etf = get_etf_list()
for elem in sr_etf:
if elem in df_with_etf.Symbol.values:
df_with_etf.loc[df_with_etf.Symbol == elem, 'Is_etf'] = 'Y'
return df_with_etf
def add_jp_col(arg):
"""[Add a column to the argument that shows wether the ticker is a Japan ADR]
Returns:
[pandas dataframe]
"""
arg['Is_JP_ADR'] = ""
df_with_jp_adr = arg
sr_jp_adr = get_jp_company_list()
for elem in sr_jp_adr:
if elem in df_with_jp_adr.Symbol.values:
df_with_jp_adr.loc[df_with_jp_adr.Symbol ==
elem, 'Is_JP_ADR'] = 'Y'
return df_with_jp_adr
def add_cn_col(arg):
"""[Add a column to the argument that shows wether the ticker is a China ADR]
Returns:
[pandas dataframe]
"""
arg['Is_CN_ADR'] = ""
df_with_cn_adr = arg
sr_cn_adr = get_cn_company_list()
for elem in sr_cn_adr:
if elem in df_with_cn_adr.Symbol.values:
df_with_cn_adr.loc[df_with_cn_adr.Symbol ==
elem, 'Is_CN_ADR'] = 'Y'
return df_with_cn_adr
def get_full_company_list():
full_company_list = get_company_list()
full_company_list = add_sp500_col(full_company_list)
full_company_list = add_dow_col(full_company_list)
full_company_list = add_etf_col(full_company_list)
full_company_list = add_jp_col(full_company_list)
full_company_list = add_cn_col(full_company_list)
    # Re-number the index
full_company_list = full_company_list.reset_index(drop=True)
return full_company_list
if __name__ == '__main__':
full_company_list = get_full_company_list()
    # Dump the result to CSV
full_company_list.to_csv("full_company_list.csv")
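For reference, a minimal usage sketch, assuming the file above is saved as getCompanyList.py in the working directory:

import getCompanyList

# Build the combined, flagged list (requires network access)
df = getCompanyList.get_full_company_list()

# Slice by the flag columns added above
sp500 = df[df.Is_SP500 == 'Y']     # S&P 500 constituents
jp_adrs = df[df.Is_JP_ADR == 'Y']  # Japanese ADRs
print(len(sp500), "S&P 500 tickers,", len(jp_adrs), "Japanese ADRs")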