#事前学習済みモデルのダウンロード
https://github.com/jhlau/doc2vec
#モデルをロードし、1単語だけからなる文書のベクトルを推論して確認してみる（infer_vectorは単語ベクトルそのものではなく、文書ベクトルを返す）
confirm_WordVector.py
from gensim.models.doc2vec import Doc2Vec

# Location of the saved Doc2Vec model on disk.
model_path = 'model/enwiki_dbow/doc2vec.bin'
model = Doc2Vec.load(model_path)

# infer_vector takes a tokenized document; here the document is the single
# token "word", so the result is an inferred document vector.
tokens = ["word"]
vector = model.infer_vector(tokens)

# Show the dimensionality first, then the vector itself, one per line.
print(len(vector), vector, sep="\n")
300
[ 1.40280828e-01 1.83409289e-01 -2.64408961e-02 -1.11115627e-01
-1.84268013e-01 2.21883774e-01 -1.39962300e-03 -7.80699700e-02
-8.71175826e-02 2.56892532e-01 1.28477469e-01 -1.32150203e-01
9.16299447e-02 -1.08361199e-01 2.24919140e-01 2.95133799e-01
-1.05266787e-01 -8.87315348e-02 -3.57903123e-01 -1.16929680e-01
2.43903622e-01 -2.26327643e-01 5.54789364e-01 -4.85761255e-01
1.12485945e-01 -2.10972637e-01 2.92448878e-01 1.36567667e-01
-3.39356661e-01 1.54809028e-01 -4.06004965e-01 2.51045078e-01
-5.08944094e-01 -2.72488981e-01 2.54280418e-01 -5.28896302e-02
5.78162111e-02 1.88219622e-01 5.74344695e-01 -4.67062503e-01
-2.05024302e-01 -5.98332509e-02 -3.44163746e-01 -3.80116701e-01
1.05316617e-01 -1.75678745e-01 -4.92762923e-01 3.11034918e-01
-3.04395765e-01 -6.21335721e-03 -2.51851439e-01 6.83335662e-02
2.69055486e-01 -4.56707805e-01 3.17851663e-01 -1.69105187e-01
3.56151521e-01 -5.05028307e-01 -2.53974706e-01 -5.85785925e-01
1.44802809e-01 1.71069667e-01 2.14749686e-02 2.62290016e-02
-5.90268746e-02 -4.17226970e-01 -2.58289903e-01 -1.34147465e-01
-1.69140883e-02 2.69945771e-01 -8.30643922e-02 -2.70083934e-01
-5.48509397e-02 -3.51466686e-01 -2.83847153e-01 5.26780486e-01
-9.27017778e-02 3.41789305e-01 1.61628351e-01 -9.79063809e-02
2.50723511e-01 -3.06959093e-01 -4.54114348e-01 -1.49249837e-01
-6.02198720e-01 -3.59645128e-01 1.29344389e-01 -4.97040823e-02
1.67680234e-01 3.98838282e-01 3.97429094e-02 -8.42189014e-01
4.17290986e-01 9.80646759e-02 5.52689396e-02 2.00707242e-01
-4.96996380e-02 -3.10181230e-01 7.32129142e-02 1.78322211e-01
1.99462384e-01 1.85920909e-01 4.16447707e-02 3.06156427e-02
5.19993417e-02 -9.45110098e-02 3.29695880e-01 -6.64467752e-01
-4.22538340e-01 1.76553596e-02 3.59327137e-01 1.87507823e-01
-4.77306396e-01 -1.01719558e-01 -4.10893440e-01 -1.98205486e-01
-2.00183213e-01 -2.72218496e-01 -2.06492599e-02 3.01751882e-01
4.59669717e-02 -2.81522602e-01 1.15110882e-01 1.12400606e-01
-3.65632564e-01 -2.55062699e-01 2.92361856e-01 -4.80110735e-01
1.91051483e-01 -1.09290645e-01 3.52236956e-01 2.30695501e-01
4.36141849e-01 -4.78955433e-02 1.11169226e-01 1.39120921e-01
-2.11431772e-01 4.52448912e-02 -2.72998810e-01 -4.09108907e-01
-1.19410396e-01 1.38503099e-02 7.53449369e-03 -2.37264037e-01
-1.67033702e-01 -2.26302013e-01 -1.10190071e-01 -3.45773011e-01
1.81666419e-01 -1.88263834e-01 2.19820291e-01 -2.88389564e-01
1.02379367e-01 4.77272905e-02 4.77848239e-02 -2.84629092e-02
-2.28211567e-01 -2.59289056e-01 7.43009150e-04 -1.49935097e-01
1.42509758e-01 3.70406181e-01 4.54252928e-01 2.22431928e-01
-2.51703948e-01 1.28542066e-01 7.27307573e-02 -3.60925421e-02
6.45418346e-01 -2.29296759e-01 -2.46794242e-02 -3.51088405e-01
2.99131393e-01 -1.01994380e-01 2.05502391e-01 5.13257325e-01
2.81603962e-01 3.98386598e-01 7.68973529e-02 -2.05001414e-01
-1.08222596e-01 3.70851427e-01 6.77625686e-02 -4.04938042e-01
2.17772741e-02 2.16333512e-02 -1.00487657e-01 2.47037604e-01
6.34489302e-03 2.80573443e-02 2.21345127e-01 -5.39463460e-01
-1.15930647e-01 8.56445264e-03 7.61211962e-02 -1.54177174e-01
1.10860772e-01 -6.26938343e-01 -3.82335544e-01 -6.73514232e-02
-4.02066022e-01 -1.69688925e-01 -3.15610260e-01 6.77945558e-03
3.97334605e-01 7.22034797e-02 1.42006814e-01 -2.81334162e-01
1.22516409e-01 3.27533394e-01 -1.56189814e-01 4.97612879e-02
2.15303227e-01 -8.69842410e-01 -1.04782172e-01 -2.61912316e-01
8.11299086e-02 -1.06915340e-01 -4.68756080e-01 2.54943911e-02
1.79967985e-01 -1.35952368e-01 1.55958846e-01 -1.72587708e-01
-7.17891514e-01 1.18898049e-01 4.15051430e-02 -3.22812885e-01
1.64221272e-01 5.38067400e-01 2.20012248e-01 -3.49850534e-03
-9.77616534e-02 4.59246367e-01 2.99039483e-01 6.88107729e-01
3.54239970e-01 8.27741176e-02 -1.64990410e-01 1.75694339e-02
-4.47227359e-01 3.88276935e-01 1.59138501e-01 -4.20660712e-02
-1.80355012e-01 -2.54727751e-01 1.02000490e-01 -1.31719366e-01
2.61006087e-01 3.00956100e-01 1.64773628e-01 -1.77720655e-02
3.05260122e-01 2.02634603e-01 -5.14772385e-02 1.07577242e-01
1.58462778e-01 -2.40044415e-01 2.85942465e-01 -4.73183356e-02
-4.39267427e-01 -1.93622246e-01 -5.14240086e-01 -1.80472374e-01
7.56859004e-01 -7.72481337e-02 -1.83406934e-01 -1.04551455e-02
1.79811299e-01 -2.92110503e-01 2.21000046e-01 -9.72364619e-02
8.97284423e-04 3.09679478e-01 -5.13073541e-02 5.61714590e-01
-8.19747671e-02 -4.49352294e-01 5.06668910e-02 4.01322357e-02
-1.38008431e-01 -8.54075775e-02 -3.30738991e-01 -2.61910200e-01
3.30826312e-01 -1.27152279e-01 -2.62047518e-02 -1.46580711e-01
2.61440694e-01 -5.41437566e-01 -2.53788650e-01 1.29316911e-01
4.26622987e-01 1.34229422e-01 5.41095473e-02 7.28423670e-02
3.60608399e-02 -3.21376175e-01 -4.56876233e-02 -1.15793005e-01]
#文書ベクトルの類似度を計算してみる
confirm_sim.py
# Three short, whitespace-tokenized example sentences.
doc_words1 = 'the picture was taken by me .'.split()
doc_words2 = 'i took a picture .'.split()
doc_words3 = 'the picture is very popular .'.split()

# Inference settings shared by both comparisons.
# NOTE(review): alpha=1 looks like an unusually large starting learning rate
# for inference -- confirm this is intended.
infer_settings = dict(alpha=1, min_alpha=0.0001, steps=5)

# Compare sentence 1 with 2, then sentence 2 with 3; for each pair print both
# token lists followed by the similarity score.
for left_words, right_words in ((doc_words1, doc_words2), (doc_words2, doc_words3)):
    sim_value = model.docvecs.similarity_unseen_docs(
        model, left_words, right_words, **infer_settings
    )
    print(left_words)
    print(right_words)
    print(sim_value)
['the', 'picture', 'was', 'taken', 'by', 'me', '.']
['i', 'took', 'a', 'picture', '.']
0.47444016
['i', 'took', 'a', 'picture', '.']
['the', 'picture', 'is', 'very', 'popular', '.']
0.534385