More than 5 years have passed since last update.

YouTuberマイニング #2

Last updated at 2018-07-29Posted at 2018-07-04

データ分析のためには、データ集め！ということで本記事では、YouTuberのデータを集めます。
日本のYouTuberといばHIKAKINやらはじめしゃちょーがいるUUUMなので、UUUM所属のYouTuberの情報を集めめます。
本記事はYouTuberマイニング #1の続きです。

この記事でやったこと

YouTuber事務所UUUMの公式サイトから所属タレントrのChannel idのリストをスクレイピング
各YouTuberの情報を取得して、csvで保存

コードと解説

コードここにある。
haradai1262/Notebooks-for-youtube-api-usage
※ 下のコードとても煩雑になっているので、自分で動かしてみたいという方は、上のリンクから"channel_mining_crawlUUUM.ipynb"をDLしてJupyterで実行することをお勧めします...

1. UUUM公式サイトからChannel idをスクレイピング

公式サイトは、下のように所属タレントの一覧ページがあり、各タレントはYouTubeチャンネルへのハイパーリンクを持っているので、そこから持ってきます。

import requests
from bs4 import BeautifulSoup

# UUUM公式サイトから所属YouTuberのリストを取得
target_url = 'https://www.uuum.jp/creator/'
r = requests.get(target_url)
soup = BeautifulSoup(r.text, 'lxml')
channel_dic = {}
for i in soup.select('a[href^="https://www.youtube.com/"]'):
    url = i.get('href')
    if url[-1] == '/': url = url[:-1]
    url = url.split('/')
    channel_dic[url[4]] = url[3]

工夫したところは特に無いですが、もっと良い方法があるかもしれないです。

2. 各Channel idからチャンネル自身の情報を取得

def get_rows_channel( channel_info_tmp, channel_info_tree_tmp ):
    rows = []
    for i, channel in channel_info_tmp.items():
        # 特定のkeyに値がない場合Noneを入れる
        for parent, child in channel_info_tree_tmp.items():
            if not parent in channel:
                channel[parent] = {}
                for c in child:
                    channel[parent][c] = None
            else:
                if child == None: continue
                for c in child:
                    if not c in channel[parent]:
                        channel[parent][c] = None

        row = []
        for parent, child in channel_info_tree_tmp.items():
            if child == None: row.append( channel[parent] )
            elif parent == 'snippet':
                for c in child:
                    if c == 'thumbnails':
                        row.append( channel[parent][c]['default']['url'] )
                    else:
                        row.append( channel[parent][c] )
            else:
                for c in child:
                    row.append( channel[parent][c] )
        rows.append( row )
    return rows

def get_cols( tmp ):
    df_cols_tmp = []
    for parent, child in tmp.items():
        if child == None: df_cols_tmp.append( parent )
        else:
            for c in child: df_cols_tmp.append( c )
    return df_cols_tmp

def get_rows_channel( channel_info_tmp, channel_info_tree_tmp ):
    rows = []
    for i, channel in channel_info_tmp.items():
        # 特定のkeyに値がない場合Noneを入れる
        for parent, child in channel_info_tree_tmp.items():
            if not parent in channel:
                channel[parent] = {}
                for c in child:
                    channel[parent][c] = None
            else:
                if child == None: continue
                for c in child:
                    if not c in channel[parent]:
                        channel[parent][c] = None

        row = []
        for parent, child in channel_info_tree_tmp.items():
            if child == None: row.append( channel[parent] )
            elif parent == 'snippet':
                for c in child:
                    if c == 'thumbnails':
                        row.append( channel[parent][c]['default']['url'] )
                    else:
                        row.append( channel[parent][c] )
            else:
                for c in child:
                    row.append( channel[parent][c] )
        rows.append( row )
    return rows

YT = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION, developerKey=DEVELOPER_KEY)
channel_info_dic = {}
for uid, flag in channel_dic.items():
    if flag == 'user': channel_feed = YT.channels().list(part="id,snippet,statistics",forUsername=uid).execute()
    else: channel_feed = YT.channels().list(part="id,snippet,statistics",id=uid).execute()
    id = channel_feed.get("items",[])[0]['id']
    channel_info_dic[id] = channel_feed.get("items",[])[0]

channel_info_tree = {
    'id': None,
    'snippet': ['title','description','publishedAt','thumbnails'],
    'statistics': ['viewCount','subscriberCount','hiddenSubscriberCount','videoCount','commentCount'],
}
df_cols = get_cols( channel_info_tree )
rows = get_rows_channel( channel_info_dic, channel_info_tree )
df = pd.DataFrame( rows, columns=df_cols )
df.to_csv( 'output/UUUM.csv' )

※ 関数が長くてとても見にくい(醜い)ですよね...すみません。

YouTube Data APIでは、Videoと同様に、Channelでも、タイトル、説明文や統計値など様々な情報が取得できます。こちらのコードでは、"part="id,snippet,statistics"で取得する情報を選択しています。
取得可能な情報の詳細は以下に記載されています。
Channels: list | YouTube Data API (v3)

最終的に、各レコードに１チャンネルを格納したDataframeを作成し、csvとして保存しています。DataFrameは下のような感じになります。

3. 各Channel idから投稿動画の情報を取得

※ Channel idから投稿動画の情報取得する方法はYouTuberマイニング #1にも書いているのでコードの解説は省きます。

def get_channel_vlist_quick( yt, cid, max_num ):
    channel_videos_tmp = []
    nextPageToken = ''
    page_num = 50
    while 1:
        feed = yt.search().list( channelId=cid, maxResults=page_num,
                                order='date', type='video', part='id', pageToken = nextPageToken
                               ).execute()
        if 'nextPageToken' in feed: nextPageToken = feed[ 'nextPageToken' ]
        else: break
        vids = [ i['id']['videoId'] for i in feed.get('items') ]
        channel_videos_tmp.extend( vids )
        if len( channel_videos_tmp ) > ( max_num - 60 ): page_num = 1
        if len( channel_videos_tmp ) > max_num: break
    return channel_videos_tmp

def get_video_detail( vid ):
    video_detail = YT.videos().list(part="id,snippet,statistics,contentDetails,topicDetails",id=vid).execute()
    return video_detail.get("items",[])[0]

def get_rows( channel_video_info_tmp, info_tree_tmp ):
    rows = []
    for i, v in channel_video_info_tmp.items():
        # 特定のkeyに値がない場合Noneを入れる
        for parent, child in info_tree_tmp.items():
            if not parent in v:
                v[parent] = {}
                for c in child: v[parent][c] = None
            else:
                if child == None: continue
                for c in child:
                    if not c in v[parent]: v[parent][c] = None
        # 値を格納
        row = []
        for parent, child in info_tree_tmp.items():
            if child == None: row.append( v[parent] )
            elif parent == 'snippet':
                for c in child:
                    if c == 'thumbnails': row.append( v[parent][c]['default']['url'] )
                    else: row.append( v[parent][c] )
            else:
                for c in child: row.append( v[parent][c] )
        rows.append( row )
    return rows

YT = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION, developerKey=DEVELOPER_KEY)

channel_info_dic = {}
for uid, flag in channel_dic.items():
    if flag == 'user': channel_feed = YT.channels().list(part="id,snippet,statistics",forUsername=uid).execute()
    else: channel_feed = YT.channels().list(part="id,snippet,statistics",id=uid).execute()
    id = channel_feed.get("items",[])[0]['id']
    channel_info_dic[id] = channel_feed.get("items",[])[0]

save_dir = 'output/channel_csv'
roop = 0
for uid, flag in channel_dic.items():
    roop += 1
    print( roop, uid, flag )
    if flag == 'user':
        channel_feed = YT.channels().list(part="id",forUsername=uid).execute()
        CHANNEL_ID = channel_feed.get("items",[])[0]['id']
    else:
        CHANNEL_ID = uid
    if os.path.exists( '%s/%s.csv' % ( save_dir, CHANNEL_ID ) ):
        continue
    channel_videos = get_channel_vlist_quick( YT, CHANNEL_ID, 500 )
    vlist_details = [ get_video_detail( i ) for i in channel_videos ]    
    channel_video_info = {}
    for i, v_detail in enumerate(vlist_details): channel_video_info[i] = v_detail
    df_cols = get_cols( info_tree )
    rows = get_rows( channel_video_info, info_tree )
    df = pd.DataFrame( rows, columns=df_cols )
    df.to_csv( '%s/%s.csv' % ( save_dir, CHANNEL_ID ) )

これを実行すると、約150チャンネルの投稿動画(最大500件)の情報が取得され、csvで保存されます。

つぶやき

ダウンロード作業は完了したので、近日中にデータセット公開します
UUUM以外のYouTuberの事務所とかコミュニティ?みたいなの知りたい

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up