言語処理100本ノック 2015の挑戦記録です。環境はUbuntu 16.04 LTS + Python 3.5.2 :: Anaconda 4.1.1 (64-bit)です。過去のノックの一覧はこちらからどうぞ。
第10章: ベクトル空間法 (II)
90. word2vecによる学習
# coding: utf-8
import pickle
from collections import OrderedDict
import numpy as np
from scipy import io
import word2vec
# File names: training corpus in, word2vec text vectors and the two
# derived artifacts (term -> row index dictionary, vector matrix) out.
fname_input = 'corpus81.txt'
fname_word2vec_out = 'vectors.txt'
fname_dict_index_t = 'dict_index_t'
fname_matrix_x300 = 'matrix_x300'

# Vectorize the corpus with word2vec (300 dimensions, 4 threads).
# binary=0 makes the tool write the vectors as text so they can be parsed
# line by line below.
word2vec.word2vec(train=fname_input, output=fname_word2vec_out,
                  size=300, threads=4, binary=0)

# Read the result back and build the matrix and the dictionary.
# The encoding is given explicitly so that decoding the vocabulary does not
# depend on the locale of the machine running the script.
# NOTE(review): assumes the corpus (and therefore the output) is UTF-8.
with open(fname_word2vec_out, 'rt', encoding='utf-8') as data_file:

    # The first line holds the number of terms and the vector dimension.
    header = data_file.readline().split(' ')
    size_dict = int(header[0])
    size_x = int(header[1])

    # Dictionary (term -> row index) and matrix (one row per term).
    dict_index_t = OrderedDict()
    matrix_x = np.zeros([size_dict, size_x], dtype=np.float64)

    for i, line in enumerate(data_file):
        # Each line is "<term> <v1> <v2> ... <vN> "; strip() drops the
        # trailing space and newline before splitting on single spaces.
        fields = line.strip().split(' ')
        dict_index_t[fields[0]] = i
        # Convert explicitly instead of relying on NumPy casting strings.
        matrix_x[i] = [float(value) for value in fields[1:]]

# Write out the results: the matrix via scipy.io.savemat under the key
# 'matrix_x300', and the dictionary as a pickle.
io.savemat(fname_matrix_x300, {'matrix_x300': matrix_x})
with open(fname_dict_index_t, 'wb') as data_file:
    pickle.dump(dict_index_t, data_file)
[ 2.32081000e-01 1.34141400e+00 7.57177000e-01 9.18121000e-01
1.41462400e+00 4.61902000e-01 -3.19372000e-01 -9.11796000e-01
6.74263000e-01 8.88596000e-01 8.66489000e-01 4.41949000e-01
-6.52780000e-02 -5.73398000e-01 -1.72020000e-01 2.79280000e-01
-1.61161000e-01 4.50549000e-01 7.46780000e-02 -3.13907000e-01
-4.32671000e-01 6.18620000e-02 -1.27725100e+00 6.85341000e-01
3.03760000e-02 -3.19811000e-01 -7.68924000e-01 -2.62472000e-01
4.91034000e-01 9.34251000e-01 -6.05433000e-01 -5.19170000e-02
-6.72454000e-01 1.55326600e+00 -7.37928000e-01 1.66526200e+00
-6.69270000e-02 8.88963000e-01 -6.68554000e-01 2.86349000e-01
-1.27271300e+00 -1.21432000e-01 1.26359000e+00 1.25684600e+00
1.97781000e-01 8.14802000e-01 2.05766000e-01 -4.26121000e-01
7.07411000e-01 7.51749000e-01 6.40161000e-01 -3.28497000e-01
4.20656000e-01 4.26616000e-01 -2.29688000e-01 -4.02054000e-01
-2.33294000e-01 -6.42150000e-02 -7.11624000e-01 1.82619000e-01
-7.58055000e-01 -2.03132000e-01 5.12000000e-04 1.31971700e+00
1.03481400e+00 2.22623000e-01 6.24024000e-01 9.64505000e-01
-7.62032000e-01 -3.60960000e-02 4.45112000e-01 -5.08120000e-01
-1.00680500e+00 -2.55381000e-01 8.55365000e-01 6.17396000e-01
-7.78720000e-01 -6.18505000e-01 1.21397000e-01 -1.69275000e-01
6.60319000e-01 -3.36548000e-01 -5.62175000e-01 -2.04378300e+00
-7.94834000e-01 -4.65775000e-01 -7.54679000e-01 3.90806000e-01
-8.01828000e-01 -4.92555000e-01 3.47642000e-01 -4.28183000e-01
-1.99666800e+00 1.82001000e-01 -1.70085000e-01 9.28966000e-01
-1.96638600e+00 9.23961000e-01 4.84498000e-01 -5.24912000e-01
1.02234000e+00 4.62904000e-01 4.10672000e-01 6.97174000e-01
6.19435000e-01 8.32230000e-02 1.41234000e-01 6.12439000e-01
-1.45182000e+00 1.85729000e-01 5.67926000e-01 -3.29128000e-01
-3.83217000e-01 3.79447000e-01 -5.50135000e-01 -4.12838000e-01
-4.16418000e-01 1.05820000e-02 6.92200000e-02 -6.27480000e-02
1.24219800e+00 -3.96815000e-01 -4.01746000e-01 -6.71752000e-01
7.81617000e-01 -8.54749000e-01 -1.07806700e+00 7.44280000e-02
-1.91329200e+00 -1.21407300e+00 -5.23873000e-01 -1.01673500e+00
4.35801000e-01 1.73546700e+00 -7.54100000e-01 -5.14167000e-01
-2.15539000e-01 -6.96321000e-01 1.45136000e-01 6.40906000e-01
-4.21082000e-01 -3.60932000e-01 -2.98236100e+00 1.05500300e+00
-5.42376000e-01 2.06387000e-01 2.28400000e-02 -1.87433000e-01
-4.26448000e-01 -7.00808000e-01 -1.91694000e-01 -6.80270000e-02
8.37304000e-01 6.18913000e-01 3.09183000e-01 -2.22531000e-01
-3.08164000e-01 1.91496000e+00 -2.05698000e-01 -1.38298000e+00
1.08415000e-01 5.35886000e-01 -2.32130000e-02 6.94406000e-01
-4.17144000e-01 -1.90199000e+00 6.69315000e-01 -6.32312000e-01
-3.45570000e-02 -6.03989000e-01 3.56266000e-01 -1.02690000e+00
4.67688000e-01 5.27140000e-02 3.66741000e-01 1.92638600e+00
6.22386000e-01 4.83680000e-01 1.00020800e+00 4.46445000e-01
4.13120000e-01 2.12195000e-01 1.56286000e-01 1.33522500e+00
6.97672000e-01 5.66884000e-01 1.53622000e-01 6.39750000e-01
-2.03707000e-01 2.10565800e+00 -1.17320000e-01 8.55233000e-01
2.61317700e+00 -2.14519000e-01 8.55025000e-01 9.06171000e-01
-4.56919000e-01 -1.40941000e-01 -6.24079000e-01 -1.26463800e+00
-9.31688000e-01 9.94177000e-01 -6.76021000e-01 -9.58533000e-01
4.40553000e-01 -1.23600000e-03 -5.81909000e-01 3.57520000e-01
-7.99588000e-01 1.11611700e+00 -4.93985000e-01 1.23746500e+00
-7.51088000e-01 -9.28216000e-01 3.05621000e-01 -5.11757000e-01
1.05883000e-01 4.88388000e-01 8.31103000e-01 -5.05967000e-01
-1.01836400e+00 -2.54270000e-01 -4.25978000e-01 2.21318000e-01
-7.14479000e-01 3.37610000e-01 -6.56314000e-01 -3.55550000e-01
2.31042000e-01 -9.86197000e-01 -7.63255000e-01 1.04544800e+00
1.57370400e+00 1.95025900e+00 5.00542000e-01 -5.48677000e-01
5.21174000e-01 -2.04218000e-01 -2.11823000e-01 -2.30830000e-01
1.45851700e+00 -2.69244000e-01 -8.57567000e-01 1.28116000e+00
1.18514300e+00 7.82615000e-01 -7.24170000e-02 -1.07394300e+00
-5.76223000e-01 5.17903000e-01 6.55052000e-01 1.56492300e+00
1.58710000e-01 1.64205300e+00 4.27021000e-01 1.65960000e-01
1.27899000e-01 2.45154000e-01 -3.33136000e-01 3.69693000e-01
6.90610000e-01 -5.47800000e-01 1.87585000e-01 6.63304000e-01
-1.18193300e+00 -3.42415000e-01 -1.97505000e-01 1.55585000e+00
6.80237000e-01 7.02119000e-01 -1.39572100e+00 -2.07230000e-02
-4.62809000e-01 -4.94772000e-01 2.25839000e-01 3.32944000e-01
-7.71918000e-01 -8.55043000e-01 -5.98472000e-01 -1.60165800e+00
-3.56646000e-01 -3.89552000e-01 -7.58449000e-01 2.03913000e-01
2.84149000e-01 -5.72755000e-01 -4.92234000e-01 -1.15743600e+00
-5.41931000e-01 -7.22312000e-01 8.08674000e-01 -3.62800000e-02
2.92228000e-01 4.90371000e-01 5.50050000e-01 1.82185000e-01
-2.12689000e-01 -1.03393500e+00 1.97234000e-01 -2.13381000e-01]
「United States」と「U.S.」のコサイン類似度の算出ですが、きちんと高い値になっています。
Wales 0.7539543550055905
Scotland 0.7386559299178808
Britain 0.6479338653237635
Ireland 0.6348035977770026
Sweden 0.6046247805709913
Spain 0.6012807753931683
Germany 0.5945993118023707
England. 0.5886246671101062
Norway 0.5712078065200615
London 0.5622154447245881
Spain 0.8975386269080241
Austria 0.8165995526197494
Greece 0.8115120679668039
Egypt 0.8108041287727046
Italy 0.7967845991447613
Russia 0.7903349902284371
Denmark 0.784935131008747
Sweden 0.7731913094622944
Germany 0.7689020148989952
Portugal 0.7634638759682534
Pythonで使えるword2vecの実装はいくつかあるようですが、今回はpipで見つけたword2vecのラッパーライブラリを使いました。インストールは「pip install word2vec」で行えます。
segavvy@ubuntu:~$ pip search word2vec
brocas-lm (1.0) - Broca's LM is a free python library
providing a probabilistic language model
based on a Recurrent Neural Network (RNN)
with Long Short-Term Memory (LSTM). It
utilizes Gensim's Word2Vec implementation
to transform input word sequences into a
dense vector space. The output of the model
is a seqeuence of probability distributions
across the given vocabulary.
word2vec-wikification-py (0.16) - A package to run wikification
sense2vec (0.6.0) - word2vec with NLP-specific tokens
ShallowLearn (0.0.5) - A collection of supervised learning models
based on shallow neural network approaches
(e.g., word2vec and fastText) with some
additional exclusive features
theano-word2vec (0.2.1) - word2vec using Theano and Lasagne
word2vec (0.9.1) - Wrapper for Google word2vec
word2veckeras ( - word2vec based on Kearas and gensim
segavvy@ubuntu:~$ pip install word2vec
Collecting word2vec
Downloading word2vec-0.9.1.tar.gz (49kB)
100% |████████████████████████████████| 51kB 1.9MB/s
Requirement already satisfied: numpy in ./anaconda3/lib/python3.5/site-packages (from word2vec)
Requirement already satisfied: cython in ./anaconda3/lib/python3.5/site-packages (from word2vec)
Building wheels for collected packages: word2vec
Running setup.py bdist_wheel for word2vec ... done
Stored in directory: /home/segavvy/.cache/pip/wheels/f9/fa/6a/4cdbfefd2835490548505e4136b8f41f063d8f3c4639bf0f53
Successfully built word2vec
Installing collected packages: word2vec
Successfully installed word2vec-0.9.1
これで「import word2vec」できればインストールは完了です。
segavvy@ubuntu:~$ python
Python 3.5.2 |Anaconda 4.1.1 (64-bit)| (default, Jul 2 2016, 17:53:06)
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> import word2vec
Help on function word2vec in module word2vec.scripts_interface:
word2vec(train, output, size=100, window=5, sample='1e-3', hs=0, negative=5, threads=12, iter_=5, min_count=5, alpha=0.025, debug=2, binary=1, cbow=1, save_vocab=None, read_vocab=None, verbose=False)
word2vec execution
Parameters for training:
train <file>
Use text data from <file> to train the model
output <file>
Use <file> to save the resulting word vectors / word clusters
size <int>
Set size of word vectors; default is 100
window <int>
Set max skip length between words; default is 5
sample <float>
Set threshold for occurrence of words. Those that appear with
higher frequency in the training data will be randomly
down-sampled; default is 0 (off), useful value is 1e-5
hs <int>
Use Hierarchical Softmax; default is 1 (0 = not used)
negative <int>
Number of negative examples; default is 0, common values are 5 - 10
(0 = not used)
threads <int>
Use <int> threads (default 1)
min_count <int>
This will discard words that appear less than <int> times; default
is 5
alpha <float>
Set the starting learning rate; default is 0.025
debug <int>
Set the debug mode (default = 2 = more info during training)
binary <int>
Save the resulting vectors in binary moded; default is 0 (off)
cbow <int>
Use the continuous back of words model; default is 1 (skip-gram
save_vocab <file>
The vocabulary will be saved to <file>
read_vocab <file>
The vocabulary will be read from <file>, not constructed from the
training data
// Save the word vectors
fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
for (a = 0; a < vocab_size; a++) {
fprintf(fo, "%s ", vocab[a].word);
if (binary) for (b = 0; b < layer1_size; b++) fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
else for (b = 0; b < layer1_size; b++) fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
fprintf(fo, "\n");
実行結果には、100本ノックで用いるコーパス・データで配布されているデータの一部が含まれます。この第10章で用いているデータのライセンスはクリエイティブ・コモンズ 表示-継承 3.0 非移植(日本語訳)です。