LoginSignup
4
4

More than 3 years have passed since last update.

文書のタグ付けをするためのメモ(Word2Vec、Poincaré Embeddings)

Last updated at Posted at 2019-07-11

文書中のキーワード(企業名)を検索する

import pandas as pd
import re

# Load the stock master (STOCK_CODE / STOCK_NAME columns) and the news
# articles (ID / NEWS_DETAIL columns).
df_meigara = pd.read_csv('xxx.csv')
df_news = pd.read_csv('xxx.csv')

stock_code_list = df_meigara['STOCK_CODE'].values
news_id_list = df_news['ID'].values

# Resolve every stock code to its name once instead of filtering the
# DataFrame inside the inner loop. setdefault keeps the first row of a
# duplicated code, which is the row `.values[0]` used to select.
stock_name_by_code = {}
for stock_code, stock_name in zip(stock_code_list,
                                  df_meigara['STOCK_NAME'].values):
    stock_name_by_code.setdefault(stock_code, stock_name)
stock_names = list(stock_name_by_code.values())

# Same idea for the article bodies: first row wins for a duplicated ID.
news_text_by_id = {}
for news_id, news_text in zip(news_id_list, df_news['NEWS_DETAIL'].values):
    news_text_by_id.setdefault(news_id, news_text)

# news_stock maps each news ID to the set of stock names found in its text.
news_stock = {}
for news_id, news_text in news_text_by_id.items():
    # Plain substring test: a company name is literal text, not a regular
    # expression, so characters such as "(", "." or "+" in a name must not
    # be interpreted as regex syntax.
    news_stock[news_id] = {
        stock_name for stock_name in stock_names if stock_name in news_text
    }

# Show the hits among the first 30 articles, skipping articles with no match.
print('news_id', '    stock names')
for news_id in news_id_list[:30]:
    if news_stock[news_id]:
        print(news_id, news_stock[news_id])

image.png

# Print the NEWS_DETAIL text of the first row whose ID equals news_id.
news_id = 438663
is_target = df_news['ID'] == news_id
text = df_news.loc[is_target, 'NEWS_DETAIL'].values[0]

print(text)

image.png

Word2Vec で類似語リストを作成し、文書中に含まれる類似語の数をカウントする

import pandas as pd
import numpy as np

import MeCab
from gensim.models import word2vec

# Build the corpus: one list of nouns per news article.
df_news = pd.read_csv('xxx.csv')
tagger = MeCab.Tagger("-Ochasen")

news_list = df_news['NEWS_DETAIL'].values


def _nouns_in(sentence):
    """Return, in order, the surface form of every node tagged as a noun.

    A node counts as a noun when the first comma-separated field of its
    MeCab feature string is '名詞'.
    """
    nouns = []
    cursor = tagger.parseToNode(sentence)
    while cursor:
        if cursor.feature.split(",")[0] == '名詞':
            nouns.append(cursor.surface)
        cursor = cursor.next
    return nouns


corpus = [_nouns_in(text) for text in news_list]

# Train Word2Vec on the noun lists (min_count=1 keeps every noun).
model = word2vec.Word2Vec(corpus, size=100, min_count=1, window=10,
                          seed=1, workers=1, sg=1, hs=0, negative=5, iter=1)
vocab = model.wv.vocab
index2word = model.wv.index2word

# For every word in the vocabulary, keep only the words (not the scores)
# of its 10 most similar entries.
similar_words_dict = {
    word: [neighbor for neighbor, _ in
           model.wv.most_similar(positive=[word], topn=10)]
    for word in index2word
}


# Usage example: count how many distinct nouns of one article are either
# the keyword itself or one of its similar words.
keyword = '日本'
similar_words = np.concatenate(([keyword], similar_words_dict[keyword]))
text = news_list[0]

# Collect the nouns into a set so each distinct noun is counted once.
# (The original also appended to an undefined `word_list2`, which raised
# NameError on the first noun; that stray statement is removed.)
word_list = set()
node = tagger.parseToNode(text)
while node:
    word = node.surface
    hinshi = node.feature.split(",")[0]
    if hinshi == '名詞':
        word_list.add(word)
    node = node.next
word_list = list(word_list)

count = 0
for word in word_list:
    if word in similar_words:
        count += 1

print('# of similar words in the text')
print(count)
print()
print('similar words')
print(similar_words)
print()
print('text')
print(text)

image.png

Gensim の Poincaré Embeddings を試す

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import MeCab
from gensim.models.poincare import PoincareModel, PoincareRelations
from gensim.viz.poincare import poincare_2d_visualization

# Build the relation data: pairs of consecutive nouns taken from the
# first 10 articles.
df_news = pd.read_csv('xxx.csv')
news_list = df_news['NEWS_DETAIL'].values

tagger = MeCab.Tagger("-Ochasen")

# Kept as a list rather than a set on purpose: duplicate pairs are allowed.
relations = []
for text in news_list[:10]:
    nouns = []
    node = tagger.parseToNode(text)
    while node:
        if node.feature.split(",")[0] == '名詞':
            nouns.append(node.surface)
        node = node.next

    # Pair every noun with the noun that follows it in the same article.
    relations.extend(zip(nouns, nouns[1:]))

# Train a 2-dimensional Poincaré embedding on the pairs.
model = PoincareModel(relations, size=2, negative=2, burn_in=10, seed=1)
model.train(epochs=100)

# Plot: let gensim build the 2-D visualization, then redraw the points
# with matplotlib.
relations_set = set(relations)
figure_title = ''
show_node_labels = model.kv.vocab.keys()
vis = poincare_2d_visualization(model, relations_set,
                                figure_title, num_nodes=None,
                                show_node_labels=show_node_labels)

# Coordinates and labels are read from the second trace of the figure
# (presumably the node trace; confirm against the gensim version in use).
x = vis['data'][1]['x']
y = vis['data'][1]['y']
text = vis['data'][1]['text']

plt.figure(figsize=(7, 7))
plt.scatter(x, y)
# Labels are left off; uncomment to annotate every point.
#for i, _ in enumerate(x):
#    plt.text(x[i], y[i], text[i])
plt.xticks(np.arange(-1.0, 1.2, 0.2))
plt.yticks(np.arange(-1.0, 1.2, 0.2))
# Pass a real boolean: the string 'on' only enables the grid because a
# non-empty string is truthy, and string-as-boolean was deprecated.
plt.grid(True)
plt.show()

image.png

# Look up the entries most similar to the keyword in the trained Poincaré
# model, then print the keyword, the number of results and the results.
keyword = '日本'
similar_words = model.kv.most_similar(keyword)

print('keyword: ', keyword)
print(len(similar_words), similar_words, sep='\n')

image.png

# Print model.kv.norm(...) for each of the two terms, one value per line.
for term in ('日本', '政府'):
    print(model.kv.norm(term))

image.png

[参照論文・記事]

Poincaré Embeddings for Learning Hierarchical Representations

異空間への埋め込み!Poincare Embeddingsが拓く表現学習の新展開

Poincaré Embeddings による職種の類似度計算とその利用

双曲空間ではじめるレコメンデーション

4
4
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use the dark theme
What you can do with signing up
4
4