More than 3 years have passed since last update.

ツイートデータのテキストマイニング

Posted at 2021-03-28

前回の続きです。
ツイッターでの検索対象は、自動車メーカー3社（トヨタ・日産・ホンダ）のいずれかの社名と、昨今話題の「自動運転」が同時に含まれているツイートとしました。

やりたいこと・処理の流れ

概要
- 収集したツイートデータを適宜整形・加工し、形態素解析を行う
- 品詞ごとの頻度集計を行い、ワードクラウドとしてプロットしてみる
- 単語間の係り方を図示すべく、n-gram集計(今回はn=2)し、共起ネットワークを描いてみる
前提
- MeCabやimportする各種ライブラリのインストールが済んでいること
- PythonとMeCabの連携、Pythonバインディングのインストールが済んでいること
- お好みでシステム辞書の拡張(mecab-ipadic-neologd)、ユーザー辞書の作成・コンパイルが済んでいること
完成イメージ
- 日別推移
- ワードクラウド
- 共起ネットワーク
参考サイト
- ワードクラウドの関数部分：TF-IDFで見る評価の高いラーメン屋の口コミ傾向（自然言語処理, TF-IDF, Mecab, wordcloud, 形態素解析、分かち書き）
- 共起ネットワークの関数部分：MeCabとnetworkXを使って共起ネットワークを書いたのでコードをメモっておく

コードと出力結果

前処理

収集したツイートのうち、重複ツイートが存在する可能性があるためユニークにする
※単一ツイート内でトヨタ・日産・ホンダのいずれか2つ以上をツイートしたものは重複して取得しているため

# Load the pickled raw tweet log
raw_tweetlog = pd.read_pickle('./raw_tweetlog.pkl')

# The same tweet can appear once per maker it was fetched for, so collapse the
# maker flags of every duplicated tweet id into a single row per id.
dupulicate_target = 'id'
target_word = ['トヨタ', '日産', 'ホンダ']

is_duplicated = raw_tweetlog.duplicated(subset=dupulicate_target, keep=False)
# One aggregator per maker column: True when any of the duplicated rows has the flag set
flag_aggregators = {
    maker: (lambda flags: True if sum(flags) > 0 else False)
    for maker in target_word
}
uniqueflag_df = (
    raw_tweetlog[is_duplicated]
    .groupby(dupulicate_target)
    .agg(flag_aggregators)
    .reset_index()
)

# Left-join the collapsed flags back. The original flag columns receive the '_'
# suffix; the joined ones keep the plain maker name and are NaN for tweet ids
# that were never duplicated, so those fall back to the original flag.
df = raw_tweetlog.merge(
    uniqueflag_df,
    on=[dupulicate_target],
    how='left',
    suffixes=('_', ''),
)
for target in target_word:
    original_flag = target + '_'
    df[target] = df[target].fillna(df[original_flag])
    df = df.drop(original_flag, axis=1)

# Keep a single row per tweet id
df.drop_duplicates(subset=dupulicate_target, inplace=True)

ツイート日時のフォーマット変換と、ツイート内のターゲットとなる社名数をカウント

maker_flag_columns = ['トヨタ', '日産', 'ホンダ']

# Convert the tweet timestamp column with pd.to_datetime ('%Y-%m-%d' format)
df['created_at'] = pd.to_datetime(df['created_at'], format='%Y-%m-%d')


def _count_target_makers(row):
    """Return the sum of the three maker flags of one tweet row."""
    return sum(row[maker_flag_columns])


# Number of target maker names contained in each tweet
df['target_count'] = df.apply(_count_target_makers, axis=1)

形態素解析

MeCabを使って解析

def mecab_list(sentence):
    """Run MeCab on one text and keep the surface forms of selected parts of speech.

    Args:
        sentence: Text to analyse. It is converted with ``str()`` before parsing.

    Returns:
        pandas.Series with two parallel lists:
        ``'morphene'`` holds the kept surface forms and ``'morphene_type'``
        holds the part of speech (first field of the MeCab feature string)
        of each kept surface form.
    """
    # Constructing a Tagger loads the dictionaries, so do it on the first call
    # only and reuse the instance (this function is applied once per tweet).
    tagger = getattr(mecab_list, '_tagger', None)
    if tagger is None:
        # Point -u / -d at the user and system dictionaries when they exist
        tagger = MeCab.Tagger('-Ochasen -u /usr/local/lib/mecab/dic/ipadic/user_dic.dic -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd')
        # NOTE(review): the empty parse() call is kept from the original code;
        # presumably a workaround for empty node.surface values in some MeCab
        # Python bindings - confirm whether it is still needed.
        tagger.parse('')
        mecab_list._tagger = tagger

    # Surface forms to drop even when their part of speech is targeted
    stopwords = frozenset([
        'し', 'する', 'こと', 'てる', 'ん', 'の', 'て', 'なっ', 'れ', 'さ', 'なる', 'そう', 'い', 'さん',
        'co', 'https', 't', '思っ', 'いる', 'くる', 'ー', 'みたい', '見', '出', '方', '事', '何',
        '中', 'ある', 'とき', '人', '私', 'ため',
    ])
    # Parts of speech to extract (noun, verb, adjective)
    target_wclass = ('名詞', '動詞', '形容詞')

    word_class = []
    type_of_word_class = []

    # Hold the parsed string in a local so it stays referenced while the
    # node list is walked.
    parsed_text = str(sentence)
    node = tagger.parseToNode(parsed_text)
    while node:
        # Surface form and top-level part of speech of the current node
        word = node.surface
        wclass = node.feature.split(',')[0]
        # BOS/EOS nodes never match target_wclass, so no separate check is needed
        if wclass in target_wclass and word not in stopwords:
            word_class.append(word)
            type_of_word_class.append(wclass)
        node = node.next

    return pd.Series({'morphene': word_class, 'morphene_type': type_of_word_class})

# Tokenise every tweet text and append the resulting 'morphene' /
# 'morphene_type' columns to the tweet table.
morpheme_columns = df['text'].apply(mecab_list)
df = pd.concat([df, morpheme_columns], axis=1)
df

出力結果

日別推移

メーカーごとの日別推移を集計

def daily_tweet(title, data):
    """Plot a bar chart of the number of tweets per day.

    Args:
        title: Search word inserted into the chart title.
        data: DataFrame with one row per tweet and a ``created_at`` column
            whose values provide ``strftime`` (e.g. pandas Timestamps).

    The chart is shown with ``plt.show()``; nothing is returned. When ``data``
    has no rows, an empty chart with only the title is shown.
    """
    plt.figure(figsize=(8, 6))

    # Count tweets per 'MM/DD' label
    day_labels = data['created_at'].map(lambda s: s.strftime('%m/%d'))
    temp_df = data.set_index(day_labels).groupby(level=0).size()

    plt.bar(temp_df.index, temp_df.values)
    plt.title(label='"{}" を含むツイート'.format(title))

    # The annotations index into the daily counts, so skip them when there is
    # nothing to annotate instead of failing on index[-1] / max().
    if len(temp_df) > 0:
        # Total number of tweets, placed near the top above the last day
        plt.text(x=temp_df.index[-1], y=temp_df.values.max()*0.95,
                 s='ツイート総数:'+str(temp_df.values.sum())+'件', ha='center',
                 bbox=dict(boxstyle='round', fc='white', alpha=0.3, ec='gray'))
        # Count label on top of each bar
        for day, count in zip(temp_df.index, temp_df.values):
            plt.text(x=day, y=count, s=str(count), ha='center')

    plt.show()

# Build one DataFrame per maker from its flag column
toyota_df, nissan_df, honda_df = (
    df.loc[df[maker] == True] for maker in ['トヨタ', '日産', 'ホンダ']
)

# Plot the daily tweet counts of each maker
maker_frames = [toyota_df, nissan_df, honda_df]
for target, data in zip(target_word, maker_frames):
    daily_tweet(target, data)

 * ホンダ・レジェンド「自動運転で世界初レベル3」のニュースによって、3/24にツイート数が激増
 * それに引っ張られる形でトヨタもツイートが増えている（ホンダと比較されている？）
 * 日産にはほぼ影響を及ぼしておらず、なんなら3/26はツイート数ゼロ

ワードクラウド

同じくメーカーごとにワードクラウドとして描画

# Count how frequently each token occurs
def word_frequency(data):
    """Return the 100 highest-ranked tokens of ``data['morphene']`` with their counts.

    Args:
        data: DataFrame whose ``'morphene'`` column holds one token list per row.

    Returns:
        DataFrame indexed by token with a single ``'freq'`` column, sorted in
        descending order and truncated to the first 100 rows. The counts are
        read from the gensim dictionary's ``dfs`` table.
    """
    dictionary = corpora.Dictionary(data['morphene'])
    # Tokens that are too rare or too common do not distinguish documents,
    # so they are filtered out as unsuitable feature words.
    dictionary.filter_extremes(no_below=3, no_above=0.8)

    count_by_token = {}
    for token, token_id in dictionary.token2id.items():
        count_by_token[token] = dictionary.dfs[token_id]

    ranked_items = sorted(count_by_token.items(), key=lambda item: item[1], reverse=True)
    word_freq = dict(ranked_items)

    word_freq_df = pd.DataFrame(data=word_freq.values(), index=word_freq.keys(), columns=['freq'])
    return word_freq_df.head(100)

# Draw a word cloud
def plot_wordcloud(text, mask=None, max_words=200, max_font_size=100, figure_size=(24.0,16.0),
                   title=None, title_size=60, title_color='gray', bg_color='white',
                   font_path='/System/Library/Fonts/ヒラギノ角ゴシック W1.ttc'):
    """Render ``text`` as a word cloud on a new matplotlib figure.

    Args:
        text: Source of the words. It is converted with ``str()`` and single
            quotes are stripped, so a list of words is passed to WordCloud as
            ``[w1, w2, ...]``.
        mask: Forwarded to ``WordCloud(mask=...)``.
        max_words: Forwarded to ``WordCloud(max_words=...)``.
        max_font_size: Forwarded to ``WordCloud(max_font_size=...)``.
        figure_size: Size of the matplotlib figure.
        title: Title drawn above the cloud.
        title_size: Font size of the title.
        title_color: Colour of the title.
        bg_color: Background colour of the cloud.
        font_path: Font file used to render the words. The default is the
            macOS Hiragino font the function previously hard-coded; pass a
            different path on systems where that file does not exist.

    The figure is left as the current figure; nothing is returned.
    """
    # A font with Japanese glyphs has to be given explicitly
    wordcloud = WordCloud(background_color=bg_color,
                    font_path=font_path,
                    max_words=max_words,
                    max_font_size=max_font_size,
                    width=800,
                    height=400,
                    mask=mask)
    wordcloud.generate(str(text).replace("'", ""))

    plt.figure(figsize=figure_size)
    plt.imshow(wordcloud)
    plt.title(title, fontdict={'size': title_size,
                               'color': title_color,
                               'verticalalignment': 'bottom'})
    plt.axis('off')
    plt.tight_layout()
    
# Draw a horizontal bar chart
def plot_bar_horizontal(data, figure_size):
    """Draw ``data`` as horizontal bars on a new figure.

    Args:
        data: Object exposing ``index`` (bar labels) and ``values`` (bar
            lengths), e.g. a pandas Series.
        figure_size: Size of the matplotlib figure.
    """
    bar_labels, bar_lengths = data.index, data.values
    plt.figure(figsize=figure_size)
    plt.barh(bar_labels, bar_lengths)

company_frames = [toyota_df, nissan_df, honda_df]
for company_df, title in zip(company_frames, target_word):
    company_word_freq_df = word_frequency(company_df)

    # NOTE(review): only the words (the index) are handed to the word cloud,
    # not the 'freq' column - confirm that this is the intended weighting.
    vocabulary = list(company_word_freq_df.index)
    plot_wordcloud(vocabulary, figure_size=(12, 6), title='')

    # First 20 rows, reversed so the highest-ranked word is drawn at the top
    top_words_reversed = company_word_freq_df['freq'].head(20).iloc[::-1]
    plot_bar_horizontal(top_words_reversed, figure_size=(8, 6))

トヨタ

日産

ホンダ

共起ネットワーク

メーカーごとの共起ネットワークを描画

def plot_co_occurrence_network(data_morphene, data_morphene_type, text='',
                               min_node_count=5, min_edge_count=2,
                               font_family="Hiragino sans"):
    """Draw a co-occurrence network of words that appear next to each other.

    Every distinct word becomes a node and every pair of different words that
    are adjacent inside one document becomes an undirected edge. Rare nodes
    and edges are pruned before drawing.

    Args:
        data_morphene: Iterable of word lists, one list per document.
        data_morphene_type: Iterable of part-of-speech lists, parallel to
            ``data_morphene``.
        text: Title of the figure.
        min_node_count: Words occurring fewer times than this are removed.
        min_edge_count: Word pairs adjacent fewer times than this are removed.
        font_family: Font family used for the node labels; it must be a font
            that can render the words.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    node_idx = {}                 # word -> node id
    node_name = {}                # node id -> word
    node_count = Counter()        # node id -> number of occurrences
    node_type = defaultdict(set)  # part of speech -> node ids (taken from the first occurrence)
    edge_count = Counter()        # (smaller id, larger id) -> number of adjacencies

    # Read the word / part-of-speech lists document by document
    for morphene, morphene_type in zip(data_morphene, data_morphene_type):
        node_prev = None

        for m, m_t in zip(morphene, morphene_type):
            # Node bookkeeping: dict lookup instead of scanning all known words
            idx = node_idx.get(m)
            if idx is None:
                idx = len(node_idx)
                node_idx[m] = idx
                node_name[idx] = m
                node_type[m_t].add(idx)
            node_count[idx] += 1

            # Edge bookkeeping: skip self-loops and store the pair in a fixed
            # (min, max) order so that direction does not matter
            if node_prev is not None and node_prev != idx:
                edge_count[(min(node_prev, idx), max(node_prev, idx))] += 1
            node_prev = idx

    # Store the counts in a networkx graph
    G_result = nx.Graph()
    G_result.add_nodes_from([(idx, {'cnt': node_count[idx]}) for idx in node_name])
    G_result.add_edges_from([(a, b, {'cnt': c}) for (a, b), c in edge_count.items()])

    # Prune in three steps. Each list is built completely before the graph is
    # modified, so the graph is never changed while it is being iterated.
    # 1) rare nodes (removing a node also removes its edges)
    G_result.remove_nodes_from(
        [n for n, attr in G_result.nodes(data=True) if attr['cnt'] < min_node_count])
    # 2) rare edges
    G_result.remove_edges_from(
        [(a, b) for a, b, attr in G_result.edges(data=True) if attr['cnt'] < min_edge_count])
    # 3) nodes left without any edge
    G_result.remove_nodes_from([n for n in G_result.nodes() if len(G_result[n]) == 0])

    # Coordinates on the 2-D plane (fixed seed for a reproducible layout)
    pos = nx.layout.spring_layout(G_result, k=0.7, seed=10)
    # Labels so that the words themselves are drawn on the nodes
    labels = {n: node_name[n] for n in pos}
    # Node size proportional to the word count
    node_size = {n: node_count[n]*25 for n in G_result.nodes()}

    # Read the counts from the edge attribute so the result does not depend on
    # the orientation in which networkx reports each edge
    surviving_edge_counts = [attr['cnt'] for _, _, attr in G_result.edges(data=True)]
    edge_colors = [c*2.5 for c in surviving_edge_counts]
    edge_width = [c*0.4 for c in surviving_edge_counts]

    # Drawing
    fig, ax = plt.subplots(figsize=(12,12))

    # To colour nodes by group, the node drawing function is called once per
    # group with that group's nodelist and matching sizes.
    # Named colours: https://matplotlib.org/examples/color/named_colors.html
    node_type_list = ['名詞', '動詞', '形容詞']
    node_color_list = ['orange', 'yellowgreen', 'tomato']

    for n_t, n_c in zip(node_type_list, node_color_list):
        typed_nodes = [n for n in G_result.nodes() if n in node_type.get(n_t, ())]
        nx.draw_networkx_nodes(G_result, pos,
                               nodelist=typed_nodes,
                               node_size=[node_size[n] for n in typed_nodes],
                               node_color=n_c, alpha=0.6, ax=ax)

        # Empty scatter that only provides the legend entry of this group
        plt.scatter([], [], c=n_c, alpha=0.5, s=350, label=n_t)

    # Edge shade: numeric edge_color values are mapped through edge_cmap, and
    # edge_vmin / edge_vmax set the range of that mapping.
    # Colormaps: https://matplotlib.org/examples/color/colormaps_reference.html
    nx.draw_networkx_edges(G_result, pos, alpha=0.6,
                           width=edge_width, edge_color=edge_colors,
                           edge_vmin=0, edge_vmax=10,
                           edge_cmap=plt.cm.Blues,ax=ax)
    # Node labels, drawn with the requested font family
    nx.draw_networkx_labels(G_result, pos, labels, font_size=10, font_family=font_family, ax=ax)

    plt.title(text)

    # Legend
    plt.legend(scatterpoints=1, frameon=True,
           labelspacing=1, title='品詞の種類')

    plt.axis('off')
    fig.patch.set_facecolor('white')
    plt.show()

company_frames = [toyota_df, nissan_df, honda_df]
for company_df, text in zip(company_frames, target_word):
    # Pass the word lists and their part-of-speech lists as plain Python lists
    morpheme_lists = company_df['morphene'].tolist()
    morpheme_type_lists = company_df['morphene_type'].tolist()
    plot_co_occurrence_network(morpheme_lists, morpheme_type_lists, text)

トヨタ

日産

ホンダ

おまけ
- 同一ツイート内に複数の自動車メーカーを含んでいるもの
```
# Tweets that mention two or more of the target makers
mentions_multiple_makers = df['target_count'] >= 2
df.loc[mentions_multiple_makers]
```

まとめ

日別推移
- ホンダのレベル3のニュースでトヨタもツイート数爆上げ、日産に対しては限定的
- 自動運転のニュースをリリースするまでホンダは影が薄かったが、今回「やっぱり技術のホンダ」として世に知らしめた印象を受ける
ワードクラウド
- トヨタ：「自動運転」のワードと同時に、別日にニュースとなった「小型商用分野で資本提携」のネタとも合わせてたくさんツイートされている
- 日産：他2社と比べると印象薄い。CMに対するツイート多い。なんかやっちゃえNISSAN!
- ホンダ：「世界初」「レベル3」「実用化」など、驚きとともにインパクトのあるニュースとして広まっている
共起ネットワーク
- トヨタ：自動運転まわりだと、自動運転−燃料電池−資本提携や、自動運転−ユニコーン−Momenta(中国のユニコーン企業)なんかも
- 日産：特徴掴みきれず
- ホンダ：自動運転・レベル3は言わずもがな、枝葉にフォーカスすると、追い越せ−ウサギ−カメ、挑む−理由など、応援するツイートも多い
反省
- ユーザー辞書の作り込みが甘く、「レベル3」「センシングエリート」などの固有名詞が分かち書きされてしまっていた
- 共起ネットワークを作成する際、NetworkXライブラリの使用方法をきちんと押さえきれておらず、edge・nodeのしきい値の最適解は道半ば
- 同様に、MeCabのstopwordのチューニングも試行錯誤が必要そう

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up