More than 3 years have passed since last update.

Youtubeのコメント欄をWordCloudで可視化する

Posted at 2020-11-27

はじめに

チャンネルや動画によってコメント欄にも個性があって、見ているのが楽しいのでWordCloudで可視化してみました。

動作環境

Python 3.6.8
YouTube Data API　※使用前に登録が必要。
WordCloud

実装内容

コメント取得
- Youtube Data APIを使用してコメントとそれに対するリプライコメントを取得する。
- コメント以外の情報（コメントした人のID等）は取得しない。
WordCloudでの可視化
- コメントの名詞部分だけを取り出し、可視化する
- 最終的には「VideoIDを指定するだけでWordCloudで可視化できる」ものを作る
  - [補足]：https://www.youtube.com/watch?v=[ここがVideoID]

ソースコード

※一部省略

import json
import re

import pandas as pd
import requests


import collections
import copy
import json
import re

import MeCab
import numpy as np
import pandas as pd
# markdown出力用
import pytablewriter
import requests
from wordcloud import WordCloud

API_KEY = '自分のAPIキーを入力する'
BASE_URL = 'https://www.googleapis.com/youtube/v3/'
FONT_PATH = 'WordCloudの描画に使用するフォントファイルのパスを入力する'


def get_comment_info(api_key, video_id, page_token):
    url = BASE_URL + 'commentThreads'
    param = {
        'key': api_key,
        'videoId': video_id,
        'part': 'replies, snippet',
        'maxResults': '100',
    }

    if page_token:
        param['pageToken'] = page_token

    response = requests.get(url, params=param)
    return response.json()


def get_video_comments(api_key, video_id):
    comments = []
    page_token = ''

    while page_token != None:
        resource = get_comment_info(api_key, video_id, page_token)

        for comment_thread in resource['items']:
            # コメント取得
            comment = comment_thread['snippet']['topLevelComment']['snippet']['textDisplay']
            comments.append(comment)

            if ('replies' in comment_thread) and ('comments' in comment_thread['replies']):
                for replies in comment_thread['replies']['comments']:
                    # コメント取得
                    reply_comment = replies['snippet']['textDisplay']
                    comments.append(reply_comment)

        if 'nextPageToken' in resource:
            page_token = resource['nextPageToken']
        else:
            page_token = None
    return comments



# 特定の品詞の単語リストを取り出す
def get_word_class(text, kind='', min_len=1):
    word_list = []
    tagger = MeCab.Tagger()
    node = tagger.parseToNode(text)

    while node:
        word = node.surface
        pos = node.feature.split(',')[1]
        if (pos == kind) & (len(word) > min_len):
            word_list.append(word)
        node = node.next

    return word_list


def get_noun_list(comments):
    all_noun = []

    # NOTE: 名詞を抽出する前にテキスト整形する
    # 改行タグの削除
    comments = list(map(lambda x: x.replace('<br />', ''), comments))

    for comment in comments:
        # aタグを消す
        reg = re.compile('<a.*/a>')
        atags = reg.findall(comment)
        for atag in atags:
            comment = comment.replace(atag, '')

        all_noun.extend(get_word_class(comment, kind='一般', min_len=1))

    return all_noun


# %%  ある動画に対するコメントを取得、解析する
def get_wc_from_comment(video_id, video_title):
    comments = get_video_comments(API_KEY, video_id)
    all_noun = get_noun_list(comments)
    wc = WordCloud(background_color='white',
                   font_path=FONT_PATH, width=900, height=500, collocations=False).generate(' '.join(all_noun))

    wc.to_file('./wordcloud_{}.png'.format(video_title))


# NOTE: 今回video_dfを取得する部分は省略しています。特定チャンネルの動画を一括で取得しています。
# NOTE: 別記事をご参照ください。
for index, row in video_df.iterrows():
    get_wc_from_comment(row['videoId'], row['title'])