More than 5 years have passed since last update.

word2vecをもう少し調べてみる（「言語処理100本ノック2015」の10章復習）

Posted at 2020-04-19

背景

「言語処理100本ノック 2015」を終わらす事が出来た（但し７章はパス）。ただ、ちょっと不思議に思っている事がある。
「93. アナロジータスクの正解率の計算」の正解率が低い。

問題93での正解率

問題93と、その元となる問題92の設問。

92. アナロジーデータへの適用
91で作成した評価データの各事例に対して，vec(2列目の単語) - vec(1列目の単語) + vec(3列目の単語)を計算し，
そのベクトルと類似度が最も高い単語と，その類似度を求めよ．
求めた単語と類似度は，各事例の末尾に追記せよ．
このプログラムを85で作成した単語ベクトル，90で作成した単語ベクトルに対して適用せよ．

93. アナロジータスクの正解率の計算
92で作ったデータを用い，各モデルのアナロジータスクの正解率を求めよ．

正解率１（33 / 506 = 0.065217）

問題92の「85で作成した単語ベクトルに対して適用せよ」に準拠。
問題85でベクトル作成した流れは以下の感じ。

前回の投稿からちょっと変更し、作成したデータをこんな感じで、疎行列化して保存。

ソース

※前回の投稿と同じソース内で実行しています。

問題84

import pickle
from scipy import sparse, io

def lesson84():
    """Build the PPMI term-context matrix (problem 84) and save it.

    Loads the term and context count tables with ``getCountMap``, pickles
    the two lists it returns, then streams the tab-separated
    ``term<TAB>context<TAB>co-occurrence count`` file. For each row the ratio
    ``N_84 * f(t, c) / (f(t) * f(c))`` is computed; when it is >= 1 its
    natural log is written to the text output as ``t_idx<TAB>c_idx<TAB>ppmi``
    and stored in a sparse matrix, which is finally saved with
    ``scipy.io.savemat`` under the key ``'matrix_x'``.

    Each value of ``map_t`` / ``map_c`` is read through its ``'cnt'`` and
    ``'idx'`` keys. ``N_84`` and the ``FNAME_84_*`` constants are defined
    elsewhere in the module (N_84 is presumably the total pair count --
    confirm against its definition).

    Raises:
        KeyError: if a term or context in the input file is missing from the
            corresponding count table.
    """
    map_t, list_t = getCountMap(FNAME_84_FILTERED_T)
    map_c, list_c = getCountMap(FNAME_84_FILTERED_C)
    with open(FNAME_84_TLIST, 'wb') as f_t_list:
        pickle.dump(list_t, f_t_list)
    with open(FNAME_84_CLIST, 'wb') as f_c_list:
        pickle.dump(list_c, f_c_list)

    # Context managers guarantee both files are closed (and the output
    # flushed) even when a row fails to parse.
    with open(FNAME_84_FILTERED_TC, 'rt') as f_wiki_tc, \
            open(FNAME_84_OUTPUT, 'wt') as f_wiki_matrix:
        # Build the matrix
        size_t = len(map_t)
        size_c = len(map_c)
        print('size = {}, {}'.format(size_t, size_c))
        # LIL format is used because the matrix is filled element by element.
        matrix_x = sparse.lil_matrix((size_t, size_c))

        for datawk in f_wiki_tc:
            elems = datawk.rstrip().split('\t')
            t_inf = map_t[elems[0]]
            c_inf = map_c[elems[1]]
            tc_cnt = int(elems[2])
            calcwk = (N_84 * tc_cnt) / (t_inf['cnt'] * c_inf['cnt'])
            if calcwk >= 1:  # log(calcwk) >= 0, i.e. keep only the positive PMI
                ppmi = math.log(calcwk)
                outstr = '{0:d}\t{1:d}\t{2:f}\n'.format(t_inf['idx'], c_inf['idx'], ppmi)
                f_wiki_matrix.write(outstr)
                matrix_x[t_inf['idx'], c_inf['idx']] = ppmi

    io.savemat(FNAME_84_MATRIX, {'matrix_x': matrix_x})

それをsklearn.decomposition.TruncatedSVD関数で300次元に圧縮して保存。

ソース

問題85

import sklearn.decomposition

# File that lesson85() writes the 300-dimension matrix to (key 'matrix_x300').
FNAME_85_MATRIX = 'enwiki-20150112-400-r10-105752-85.mat'

def lesson85():
    """Compress the problem-84 matrix to 300 dimensions and save the result.

    Loads ``'matrix_x'`` from ``FNAME_84_MATRIX``, prints its shape, number of
    non-zero entries and storage format, runs
    ``sklearn.decomposition.TruncatedSVD(300)`` on it and stores the
    transformed matrix in ``FNAME_85_MATRIX`` under the key ``'matrix_x300'``.
    """
    # Load the matrix
    loaded = io.loadmat(FNAME_84_MATRIX)
    matrix_x = loaded['matrix_x']

    # Confirm what was loaded
    print('matrix_x Shape:', matrix_x.shape)
    print('matrix_x Number of non-zero entries:', matrix_x.nnz)
    print('matrix_x Format:', matrix_x.getformat())

    # Dimensionality reduction
    svd = sklearn.decomposition.TruncatedSVD(300)
    reduced = svd.fit_transform(matrix_x)
    io.savemat(FNAME_85_MATRIX, {'matrix_x300': reduced})

それを読み込んで、アナロジー計算評価

ソース

問題92（問題85で作成したベクトルを使用）

from scipy import io
import knock100_chapter9 #9章を学習していたソース

# Analogy questions read by lesson92() (the evaluation data from problem 91).
FNAME_RESULT91 = 'questions-familyword.txt'
# File lesson92() writes each question to, with the predicted word and its
# similarity appended.
FNAME_RESULT92 = 'questions-familyword-92.txt'

def lesson92():
    """Answer every analogy question with the problem-85 vectors.

    Loads the ``'matrix_x300'`` matrix saved by problem 85 and the term index
    object from ``knock100_chapter9``. For each space-separated line of
    ``FNAME_RESULT91`` it calls ``knock100_chapter9.lesson89sub`` with
    (2nd word, 1st word, 3rd word), takes the first returned entry and
    appends its two fields to the line written to ``FNAME_RESULT92``.

    The returned entry is presumably ``[word, similarity]`` -- confirm against
    ``lesson89sub``.
    """
    print('load vector 85')
    matrix_85 = io.loadmat(knock100_chapter9.FNAME_85_MATRIX)['matrix_x300']
    word_idx_85 = knock100_chapter9.getTIndexObj()

    # Both files are managed by ``with`` so the result file is flushed and
    # closed as well (the original only closed the question file).
    with open(FNAME_RESULT91, 'rt') as f_questions, \
            open(FNAME_RESULT92, 'wt') as f_results:
        for question in f_questions:
            wk = question.rstrip()
            elems = wk.split(' ')
            # vec(2nd word) - vec(1st word) + vec(3rd word); keep the top hit.
            result85 = knock100_chapter9.lesson89sub(
                matrix_85, word_idx_85, elems[1], elems[0], elems[2])[0]
            appenddat = [result85[0], str(result85[1])]

            f_results.write(wk + ' ' + ' '.join(appenddat) + '\n')

処理が間違っていないか不安に思ってる点：

そもそもデータ作成がイケてないのでは
その他致命的なバグがあるのでは

※一応問題89ではそれらしき答えが出てきているので箸にも棒にもかからないほどではないと思ってる。（正しい答えはgreeceのはず）

問題89の出力

[['spain', 0.9136016090019428], ['portugal', 0.8802865604755158],
 ['sweden', 0.8539706504225352], ['denmark', 0.8488735147943424],
 ['greece', 0.8449216569083768], ['belgium', 0.8404291273013003],
 ['norway', 0.8363626851350214], ['netherlands', 0.826501154104712],
 ['italy', 0.8083035146227875], ['finland', 0.8037861405387765],
 ['britain', 0.7951431450253504]]

正解率２（45 / 506 = 0.088933）

問題92の「90で作成した単語ベクトルに対して適用せよ」に準拠。
問題81で作成したコーパスに対して、word2vecで単語ベクトル学習。

word2vecのベクトル作成コマンド

time ./word2vec -train ../enwiki-20150112-400-r10-105752-81-s.txt -output vectors-90s1.bin -cbow 1 -size 300 -window 5 -hs 0 -sample 1e-5 -threads 10 -binary 1 -iter 15

word2vecで作成したベクトルをgensimで読み込んでアナロジー処理

ソース

from gensim.models import KeyedVectors
from gensim.models import word2vec
from gensim.similarities.nmslib import NmslibIndexer

# Binary vector file produced by the word2vec command line tool (-binary 1).
FNAME_WORD2VEC = './word2vec/vectors-90s1.bin'
# Analogy questions read by lesson92() (the evaluation data from problem 91).
FNAME_RESULT91 = 'questions-familyword.txt'
# File lesson92() writes each question to, with the predicted word and its
# similarity appended.
FNAME_RESULT92 = 'questions-familyword-92.txt'

def lesson90sub(idxer, model, word_a, word_b, word_c, topcnt):
    """Return the indexer's nearest neighbours of ``a - b + c``.

    The raw (not length-normalized) rows of ``model.vectors`` for the three
    words are combined as ``vec(word_a) - vec(word_b) + vec(word_c)`` and the
    result is handed to ``idxer.most_similar`` together with ``topcnt``.
    The input words are NOT removed from the returned neighbours.

    Raises:
        KeyError: if any of the three words is missing from ``model.vocab``.
    """
    vectors = model.vectors
    vec_a, vec_b, vec_c = (
        vectors[model.vocab[word].index]
        for word in (word_a, word_b, word_c)
    )
    return idxer.most_similar(vec_a - vec_b + vec_c, topcnt)

def lesson92():
    """Answer every analogy question with the word2vec (problem 90) vectors.

    Loads the word2vec binary vector file, builds an ``NmslibIndexer`` over
    it, and for each space-separated line of ``FNAME_RESULT91`` calls
    ``lesson90sub`` with (2nd word, 1st word, 3rd word) asking for one
    neighbour. The two fields of that neighbour are appended to the line
    written to ``FNAME_RESULT92``.
    """
    # A file written by the word2vec C tool with ``-binary 1`` has to be read
    # with load_word2vec_format; ``Word2Vec.load`` is for models saved by
    # gensim itself and takes no ``binary`` argument.
    model_90 = KeyedVectors.load_word2vec_format(FNAME_WORD2VEC, binary=True)
    idxer = NmslibIndexer(model_90)

    # Both files are managed by ``with`` so the result file is flushed and
    # closed as well (the original only closed the question file).
    with open(FNAME_RESULT91, 'rt') as f_questions, \
            open(FNAME_RESULT92, 'wt') as f_results:
        for question in f_questions:
            wk = question.rstrip()
            elems = wk.split(' ')
            # vec(2nd word) - vec(1st word) + vec(3rd word); keep the top hit.
            result90 = lesson90sub(idxer, model_90, elems[1], elems[0], elems[2], 1)[0]
            appenddat = [result90[0], str(result90[1])]

            f_results.write(wk + ' ' + ' '.join(appenddat) + '\n')

処理が間違っていないか不安に思ってる点：

関数lesson90sub内、単語毎のベクトル取得方法に間違いがないか（NmslibIndexer使うのが正解かどうか）
同、ベクトル計算方法に間違いがないか
同、NmslibIndexer#most_similar関数の使い方を間違えていないか

word2vec の compute-accuracy での正解率

標準デモの結果

word2vec には、解析デモ（demo-word-accuracy.sh）もある。実行してみる。

実行コマンド（ベクトル作成は別途済ませているため、その後のコマンドだけ実行）

ubuntu@ubuntu:~/word2vec$ ./compute-accuracy vectors.bin  < questions-words.txt

実行結果

capital-common-countries:
ACCURACY TOP1: 78.66 %  (398 / 506)
Total accuracy: 78.66 %   Semantic accuracy: 78.66 %   Syntactic accuracy: -nan %
capital-world:
ACCURACY TOP1: 48.04 %  (1712 / 3564)
Total accuracy: 51.84 %   Semantic accuracy: 51.84 %   Syntactic accuracy: -nan %
currency:
ACCURACY TOP1: 21.14 %  (126 / 596)
Total accuracy: 47.92 %   Semantic accuracy: 47.92 %   Syntactic accuracy: -nan %
city-in-state:
ACCURACY TOP1: 45.41 %  (1058 / 2330)
Total accuracy: 47.08 %   Semantic accuracy: 47.08 %   Syntactic accuracy: -nan %
family:
ACCURACY TOP1: 59.05 %  (248 / 420)
Total accuracy: 47.76 %   Semantic accuracy: 47.76 %   Syntactic accuracy: -nan %
・・中略・・
Questions seen / total: 17827 19544   91.21 %

結果を見てみると、ジャンルにより違いはあれど、結構正解してる。家族ジャンルに関してだと59.05%の正解率。

問題90で作成したベクトルを使っての結果

元コーパスデータに原因があるかを調べるため、問題90で作成したベクトルで検証してみる。

実行コマンド

ubuntu@ubuntu:~/word2vec$ ./compute-accuracy vectors-90s1.bin  < questions-words.txt

実行結果

capital-common-countries:
ACCURACY TOP1: 92.69 %  (469 / 506)
Total accuracy: 92.69 %   Semantic accuracy: 92.69 %   Syntactic accuracy: -nan %
capital-world:
ACCURACY TOP1: 63.48 %  (2872 / 4524)
Total accuracy: 66.42 %   Semantic accuracy: 66.42 %   Syntactic accuracy: -nan %
currency:
ACCURACY TOP1: 8.78 %  (66 / 752)
Total accuracy: 58.92 %   Semantic accuracy: 58.92 %   Syntactic accuracy: -nan %
city-in-state:
ACCURACY TOP1: 70.49 %  (1739 / 2467)
Total accuracy: 62.38 %   Semantic accuracy: 62.38 %   Syntactic accuracy: -nan %
family:
ACCURACY TOP1: 73.52 %  (372 / 506)
Total accuracy: 63.03 %   Semantic accuracy: 63.03 %   Syntactic accuracy: -nan %
・・中略・・
Questions seen / total: 19364 19544   99.08 %

結構高い正解率。課題92で使用している家族ジャンルでも73.52% の正解率。
コーパスに問題はなさそう。自分が使ったロジックに問題があるか、このcompute-accuracyの正解率の算出方法が異なっているか。

原因検証

どこに原因があるのかを探っていく。

word2vec の word-analogyコマンドでテスト

word2vecはアナロジーをコマンドラインでも実行できる。問題90で作ったベクトルを使って実行してみる。

./word-analogy vectors-90s1.bin

３単語を入力すると答えが出てくる。

Enter three words (EXIT to break): boy girl prince

Word: boy  Position in vocabulary: 1538

Word: girl  Position in vocabulary: 1418

Word: prince  Position in vocabulary: 1059

                                              Word              Distance
------------------------------------------------------------------------
                                          princess		0.700857
                                             queen		0.551449
                              mecklenburg-strelitz		0.485567
                                           duchess		0.484104
                                   lady-in-waiting		0.479164
・・中略・・
Enter three words (EXIT to break):

正解してる。自分が問題92で実行した時には間違えていた単語。すなわち、問題92での自分の処理に問題がある。
word-analogyで行っているアナロジー処理をpythonでやる必要がありそう。gensim.models.KeyedVectors の most_similar がそれにあたるらしい。その関数を使った形に変えて実行してみる。

from gensim.models import KeyedVectors

# Binary vector file loaded by lesson92() with load_word2vec_format.
FNAME_WORD2VEC = './word2vec/vectors-90s1.bin'
# File lesson92() writes each question to, with the predicted word and its
# similarity appended.
FNAME_RESULT92 = 'questions-familyword-92.txt'
# NOTE(review): not referenced by the code shown here -- lesson92() still
# writes to FNAME_RESULT92; confirm which file was intended.
FNAME_RESULT92a = 'questions-familyword-92a.txt'

def word2vec_analogy(model, worda, wordb, wordc):
    """Delegate the analogy ``worda - wordb + wordc`` to ``model.most_similar``.

    ``worda`` and ``wordc`` are passed as the positive words and ``wordb`` as
    the negative word; whatever ``most_similar`` returns is returned as is.
    """
    positives = [worda, wordc]
    negatives = [wordb]
    return model.most_similar(positive=positives, negative=negatives)

def lesson92():
    """Answer every analogy question via ``KeyedVectors.most_similar``.

    Loads the word2vec binary vector file and, for each space-separated line
    of ``FNAME_RESULT91``, calls ``word2vec_analogy`` with
    (2nd word, 1st word, 3rd word). The first returned entry's two fields
    are appended to the line written to ``FNAME_RESULT92``.
    """
    model_90 = KeyedVectors.load_word2vec_format(FNAME_WORD2VEC, binary=True)

    # Both files are managed by ``with`` so the result file is flushed and
    # closed as well (the original only closed the question file).
    with open(FNAME_RESULT91, 'rt') as f_questions, \
            open(FNAME_RESULT92, 'wt') as f_results:
        for question in f_questions:
            wk = question.rstrip()
            elems = wk.split(' ')
            # vec(2nd word) - vec(1st word) + vec(3rd word); keep the top hit.
            result90 = word2vec_analogy(model_90, elems[1], elems[0], elems[2])[0]
            appenddat = [result90[0], str(result90[1])]

            f_results.write(wk + ' ' + ' '.join(appenddat) + '\n')

それを再度問題93の処理にかけてみると、372 / 506 = 0.735178 の正解率。
やはり、最初の自分のベクトル計算には問題があった。

most_similar関数を深く見ていく

gensim/models/keyedvectors.py の most_similar 関数を探る。

ライブラリソース

引用

    def most_similar(self, positive=None, negative=None, topn=10, restrict_vocab=None, indexer=None):
        ・・中略・・
        # add weights for each word, if not already present; default to 1.0 for positive and -1.0 for negative words
        positive = [
            (word, 1.0) if isinstance(word, string_types + (ndarray,)) else word
            for word in positive
        ]
        negative = [
            (word, -1.0) if isinstance(word, string_types + (ndarray,)) else word
            for word in negative
        ]

        # compute the weighted average of all words
        all_words, mean = set(), []
        for word, weight in positive + negative:
            if isinstance(word, ndarray):
                mean.append(weight * word)
            else:
                mean.append(weight * self.word_vec(word, use_norm=True))
                if word in self.vocab:
                    all_words.add(self.vocab[word].index)
        if not mean:
            raise ValueError("cannot compute similarity with no input")
        mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)

        if indexer is not None and isinstance(topn, int):
            return indexer.most_similar(mean, topn)

        limited = self.vectors_norm if restrict_vocab is None else self.vectors_norm[:restrict_vocab]
        dists = dot(limited, mean)
        if not topn:
            return dists
        best = matutils.argsort(dists, topn=topn + len(all_words), reverse=True)
        # ignore (don't return) words from the input
        result = [(self.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words]
        return result[:topn]

        ・・中略・・

詳細は読み切れないが、単語に1.0か-1.0の重み付けをした上でmean配列に追加し、mean関数で平均値取得。その上で、matutils.unitvecにて処理をしている様子。単純なベクトルのスカラ加減算ではダメだったという事か。
そして後半部分のもう一つ重要ポイント。all_words という変数に演算で使用したIndexを保持し、類似度上位の結果から抜いてる。

gensim/matutils.py の unitvec 関数を探る。

ライブラリソース

引用


def unitvec(vec, norm='l2', return_norm=False):
    """Scale a vector to unit length.
    ・・中略・・
    """

関数定義部分にnormという単語が見える。デフォルトはl2。
ベクトル空間モデルについてというページが見つかる。正規化に関する話。ベクトルを単位長に変換する関数という事らしい。

自分が当初作ったロジックで足りなかったと思われる処理

ベクトル演算前に、各ベクトルを単位長ベクトルに変換。
演算後のベクトルも、単位長ベクトルに変換。
当然、コサイン類似度を算出する対象のベクトルも単位長に処理されたものを使用。
演算に使った単語は、類似度上位に上がっても結果から除外する。

修正してみる

もちろん一番は model.most_similar を使う事。ただ、勉強の為にライブラリソースをパクリつつロジックを組む。

# File lesson92b() writes each question to, with the predicted word and its
# similarity appended.
FNAME_RESULT92b = 'questions-familyword-92b.txt'

def word2vec_analogyb(model, worda, wordb, wordc):
    """Solve the analogy ``worda - wordb + wordc`` on unit-length vectors.

    Hand-written equivalent of ``model.most_similar(positive=[worda, wordc],
    negative=[wordb])``:

    1. take the L2-normalized vector of each word, weighted +1 / -1 / +1;
    2. average them and scale the mean back to unit length;
    3. score every vocabulary word by its dot product with that mean;
    4. drop the three input words from the ranking.

    ``model.vectors_norm`` must already be populated (the caller runs
    ``model.init_sims()`` first).

    Returns:
        Up to 10 ``(word, similarity)`` tuples, most similar first.

    Raises:
        KeyError: if any of the three words is missing from ``model.vocab``.
    """
    topn = 10
    weighted_words = ((worda, 1.0), (wordb, -1.0), (wordc, 1.0))

    # An unknown word raises KeyError right here, so no later "is None"
    # guard on the indexes is needed.
    all_idxs = {model.vocab[word].index for word, _ in weighted_words}

    mean = [
        weight * model.word_vec(word, use_norm=True)  # L2-normalized
        for word, weight in weighted_words
    ]

    # L2-normalize the averaged vector as well.
    mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)

    # Every vector involved has unit length, so the denominator of the cosine
    # similarity is 1 and a single matrix-vector product yields all scores
    # (same approach as keyedvectors.py).
    dists = np.dot(model.vectors_norm, mean)
    # Ask for len(all_idxs) extra candidates so that topn results remain after
    # the input words have been filtered out.
    best = matutils.argsort(dists, topn=topn + len(all_idxs), reverse=True)
    result = [
        (model.index2word[sim], float(dists[sim]))
        for sim in best
        if sim not in all_idxs
    ]
    return result[:topn]

def lesson92b():
    """Answer every analogy question with the hand-written ``word2vec_analogyb``.

    Loads the word2vec binary vector file, initializes the normalized
    vectors, and for each space-separated line of ``FNAME_RESULT91`` calls
    ``word2vec_analogyb`` with (2nd word, 1st word, 3rd word). The first
    returned entry's two fields are appended to the line written to
    ``FNAME_RESULT92b``.
    """
    model_90 = KeyedVectors.load_word2vec_format(FNAME_WORD2VEC, binary=True)
    # Initialize the norms; word2vec_analogyb reads model.vectors_norm.
    model_90.init_sims()

    # Both files are managed by ``with`` so the result file is flushed and
    # closed as well (the original only closed the question file).
    with open(FNAME_RESULT91, 'rt') as f_questions, \
            open(FNAME_RESULT92b, 'wt') as f_results:
        for question in f_questions:
            wk = question.rstrip()
            elems = wk.split(' ')
            # vec(2nd word) - vec(1st word) + vec(3rd word); keep the top hit.
            result90 = word2vec_analogyb(model_90, elems[1], elems[0], elems[2])[0]
            appenddat = [result90[0], str(result90[1])]

            f_results.write(wk + ' ' + ' '.join(appenddat) + '\n')

これで、372 / 506 = 0.735178 の正解率が出た。

一応、問題85で作成したベクトルでも前述ポイントに気を付けて修正した結果、131 / 506 = 0.258893 の正解率が出た。ソースはあまりにも散らかってQiitaに載せるレベルでないので省略。

参考にさせてもらったページ

言語処理100本ノック 2015

ベクトル空間モデルについて

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up