はじめに
word2vecについて,論文[1]を読んでいたらword2phraseがでてきたので実装してみました.
参考
- [1] Distributed Representations of Words and Phrases and their Compositionality
- [2] Python3からMeCabを使う(*Mac)
- [3] Lorem Ipsum
Phraseのスコア計算式
[1]から引用.$\delta$ は出現頻度の低い単語から過剰にフレーズが生成されるのを防ぐための割引係数(discounting coefficient)です.
$$ score(w_i, w_j) = \frac{count(w_i, w_j) - \delta}{count(w_i) \times count(w_j)} $$
コード
import collections
import numpy as np
import pandas as pd
import MeCab
# Separator token inserted between the two words of a candidate phrase
# (e.g. 'Lorem_+_ipsum'); also used to split the pair back apart.
JOINT = '_+_'
# 各単語の出現頻度を計算
def _unigram_count(corpus):
return dict([(word, count) for word, count in collections.Counter(corpus).most_common()])
# Count occurrences of adjacent noun-noun pairs.
def _bigram_count(corpus):
    """Count adjacent noun pairs in *corpus* using MeCab POS tagging.

    Two consecutive words form a candidate only when MeCab tags both as
    nouns ('名詞').  A pair consumes both words, so three nouns in a row
    yield one candidate (the first two), not two overlapping ones.

    Args:
        corpus: iterable of word strings.

    Returns:
        dict[str, int]: candidate joined with JOINT ('w1_+_w2') -> frequency.
    """
    candidates = []
    tagger = MeCab.Tagger()
    previous_noun = None
    for word in corpus:
        # The first comma-separated feature after the tab in MeCab's
        # output line is the part of speech of the token.
        part_of_speech = tagger.parse(word).split('\t')[1].split(',')[0]
        if part_of_speech != '名詞':
            # Non-noun breaks any pending pair.
            previous_noun = None
        elif previous_noun is None:
            previous_noun = word
        else:
            candidates.append('{}{}{}'.format(previous_noun, JOINT, word))
            previous_noun = None
    # Counter already maps candidate -> count; no most_common() needed.
    return dict(collections.Counter(candidates))
# Score each candidate with the formula above and keep those above threshold.
def word2phrase(corpus, delta=8, threshold=.00001):
    """Extract two-word phrases from *corpus* via the score from [1].

    score(w1, w2) = (count(w1, w2) - delta) / (count(w1) * count(w2))

    Args:
        corpus: iterable of word strings.
        delta: discount that suppresses phrases built from rare words.
        threshold: minimum score for a pair to be kept as a phrase.

    Returns:
        list[str]: accepted phrases as 'w1_+_w2' strings.
    """
    unigram_count = _unigram_count(corpus)
    phrases = []
    # Iterate items() so each pair count comes with its key instead of a
    # second dict lookup per candidate.
    for candidate, pair_count in _bigram_count(corpus).items():
        w1, w2 = candidate.split(JOINT)
        score = (pair_count - delta) / (unigram_count[w1] * unigram_count[w2])
        if score > threshold:
            phrases.append(candidate)
    return phrases
# Prepare a dummy corpus.  Periods are split off as separate tokens so
# sentence boundaries become words of their own.
text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.".replace('.', ' .').split(' ')
# BUG FIX: the original did `corpus = text` (an alias, not a copy), so
# each `corpus += text` in the loop also extended `text` itself, doubling
# the corpus on every one of the 5 iterations — 32 copies, not 6.  The
# sample output below was produced by that 32x corpus (at 6x the score
# (6 - delta)/... is negative and nothing is extracted), so we keep the
# 32x behavior but make it explicit and stop mutating `text`.
corpus = text * 32
# Run phrase extraction.
phrases = word2phrase(corpus)
print(phrases)
# ['Lorem_+_ipsum', 'dolor_+_sit', 'amet,_+_consectetur', 'adipiscing_+_elit,', 'sed_+_do', 'eiusmod_+_tempor', 'incididunt_+_ut', 'labore_+_et', 'dolore_+_magna', 'aliqua_+_.', 'Ut_+_enim', 'ad_+_minim', 'veniam,_+_quis', 'nostrud_+_exercitation', 'ullamco_+_laboris', 'nisi_+_ut', 'aliquip_+_ex', 'ea_+_commodo', 'consequat_+_.', 'Duis_+_aute', 'irure_+_dolor', 'in_+_reprehenderit', 'in_+_voluptate', 'velit_+_esse', 'cillum_+_dolore', 'eu_+_fugiat', 'nulla_+_pariatur', '._+_Excepteur', 'sint_+_occaecat', 'cupidatat_+_non', 'proident,_+_sunt', 'in_+_culpa', 'qui_+_officia', 'deserunt_+_mollit', 'anim_+_id', 'est_+_laborum', '._+_Lorem', 'ipsum_+_dolor', 'sit_+_amet,', 'consectetur_+_adipiscing', 'elit,_+_sed', 'do_+_eiusmod', 'tempor_+_incididunt', 'ut_+_labore', 'et_+_dolore', 'magna_+_aliqua', '._+_Ut', 'enim_+_ad', 'minim_+_veniam,', 'quis_+_nostrud', 'exercitation_+_ullamco', 'laboris_+_nisi', 'ut_+_aliquip', 'ex_+_ea', 'commodo_+_consequat', '._+_Duis', 'aute_+_irure', 'dolor_+_in', 'reprehenderit_+_in', 'voluptate_+_velit', 'esse_+_cillum', 'dolore_+_eu', 'fugiat_+_nulla', 'pariatur_+_.', 'Excepteur_+_sint', 'occaecat_+_cupidatat', 'non_+_proident,', 'sunt_+_in', 'culpa_+_qui', 'officia_+_deserunt', 'mollit_+_anim', 'id_+_est', 'laborum_+_.']
さいごに
こんなものでフレーズ抽出ができてしまうのか,というあっさり感があります.いい感じのフレーズを抽出するためのパラメータ($\delta$, threshold)の最適値を探索するのに簡単で早い方法はないのだろうか?