LoginSignup
0
4

More than 5 years have passed since last update.

Gensim (word2vec, doc2vec, web news, clustering)

Posted at 2018-12-13

Reference

Distributed Representations of Sentences and Documents

Library

# Install MeCab (Japanese morphological analyzer), its dev headers, and the
# standard IPA dictionary (UTF-8 build).
!apt-get install libmecab-dev mecab mecab-ipadic-utf8
# swig is needed to build the mecab-python3 bindings.
!apt-get -q -y install swig  
!pip install mecab-python3
# Print the path where the NEologd dictionary would live under MeCab's
# dictionary directory (used by the commented-out Tagger call below).
!echo `mecab-config --dicdir`"/mecab-ipadic-neologd"

image.png

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import MeCab

from tqdm import tqdm

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

Data

# Load the news dataset (one row per article) from Google Drive.
# Expected columns include at least NEWS_DATE, NEWS_TIME and NEWS_TITLE,
# which are the ones used below -- TODO confirm against the actual CSV.
df_news = pd.read_csv('/content/gdrive/My Drive/xxx/xxx.csv')

print ('shape of df_news: ', df_news.shape)
df_news.head()

image.png

# Collapse the data to one row per date by concatenating all that day's
# titles into a single string.  ''.join is behaviorally identical to
# .sum() on a string Series (no separator) but avoids the quadratic cost
# of repeated str + str concatenation.
df_g = df_news.groupby('NEWS_DATE')['NEWS_TITLE'].agg(''.join).reset_index()

print (df_g.shape)
df_g.head()

image.png

# Pick a random date and tokenize its concatenated titles with MeCab,
# keeping nouns (surface form) plus the dictionary/base form of verbs
# and adjectives.
dates = df_g['NEWS_DATE']
titles = df_g['NEWS_TITLE']
idx = np.random.randint(len(dates))

txt = titles[idx]

tagger = MeCab.Tagger()
#tagger = MeCab.Tagger('-d /usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd')

# Workaround for a known mecab-python3 issue where node.surface can come
# back empty unless parse() is called once first -- TODO confirm this is
# still needed with the installed version.
tagger.parse('')

node = tagger.parseToNode(txt)

result = []
s1 = txt
flag = 0
while node:
  s2 = node.surface
  # s1 holds the previous node's surface; removing the current surface from
  # it is how the noun seen on the previous iteration is recovered below.
  s3 = s1.replace(s2, '')
  hinshi = node.feature.split(',')[0]  # part-of-speech (first feature field)

  if flag == 1:
    # The previous node was a noun: append it now.  NOTE(review): a noun
    # that is the final node is never appended, and s1.replace(s2, '') can
    # mangle the noun when the current surface is a substring of it --
    # looks unintended; confirm before relying on the token list.
    result.append(s3)
    flag = 0
  if hinshi in ['名詞']:
    flag = 1  # noun: defer appending until the next iteration
  elif hinshi in ['動詞', '形容詞']:
    # Verb/adjective: append the base (dictionary) form, feature field 6.
    result.append(node.feature.split(',')[6])
  else:
    pass

  s1 = s2
  node = node.next

print ('Date: ', dates[idx])  
print ('words: ', result)
print ('# of words: ', len(result))

image.png

Doc2Vec

%%time

tagged_docs = []

# Hoisted out of the loop: constructing a MeCab.Tagger per document is pure
# overhead, and reusing one tagger for successive parseToNode calls is fine.
tagger = MeCab.Tagger()
tagger.parse('')  # mecab-python3 workaround so node.surface is populated

# Build one TaggedDocument per date, tagged with the date string itself.
# Tokenization rule matches the exploratory cell above: nouns (surface)
# plus the base form of verbs/adjectives.
for date, title in tqdm(zip(dates, titles)):
  node = tagger.parseToNode(title)

  words = []
  s1 = title
  flag = 0
  while node:
    s2 = node.surface
    # Previous surface minus the current one recovers the noun deferred
    # from the last iteration (see flag handling below).
    s3 = s1.replace(s2, '')
    hinshi = node.feature.split(',')[0]

    if flag == 1:
      words.append(s3)
      flag = 0
    if hinshi in ['名詞']:
      flag = 1
    elif hinshi in ['動詞', '形容詞']:
      words.append(node.feature.split(',')[6])
    else:
      pass

    s1 = s2
    node = node.next

  tagged_docs.append(TaggedDocument(words=words, tags=[date]))
%%time

vec_size = 50
alpha = 0.025
alpha_delta = 0.0002
max_epochs = 20

# Train in a single train() call and let gensim manage the learning-rate
# schedule: alpha decays linearly from `alpha` down to `min_alpha` over
# `max_epochs` epochs.  The original pattern -- min_alpha=alpha plus a
# Python loop of train() calls that manually decrements model.alpha -- is
# explicitly discouraged in the gensim docs (it mismanages the decay) and
# also relied on the deprecated `model.iter` attribute.
model = Doc2Vec(vector_size=vec_size, alpha=alpha,
                min_alpha=alpha - alpha_delta * max_epochs,
                min_count=1, dm=1)
model.build_vocab(tagged_docs)
model.train(tagged_docs, total_examples=model.corpus_count,
            epochs=max_epochs)
# Sanity check: pick a random training date and list its nearest-neighbor
# dates by cosine similarity of the trained document vectors.
idx = np.random.randint(len(tagged_docs))

print ('Date: ', dates[idx])
# NOTE(review): model.docvecs is the gensim 3.x accessor; in gensim 4+ it
# was renamed to model.dv -- confirm against the installed version.
print ('similar dates: ', model.docvecs.most_similar(dates[idx], topn=3))

image.png

# Inspect the raw articles behind two dates reported as similar above,
# each sorted by publication time, separated by a blank line.
date1 = '2018-06-07'
date2 = '2018-08-29'

for i, d in enumerate((date1, date2)):
  if i:
    print ()
  print (df_news[df_news['NEWS_DATE']==d].sort_values('NEWS_TIME'))

image.png

image.png

#save and load model
from gensim.test.utils import get_tmpfile

path = '/content/gdrive/My Drive/xxx/'

# Persist the trained model to Drive and reload it to verify round-tripping.
fname = get_tmpfile(path+'doc2vec_model')
model.save(fname)

loaded_model = Doc2Vec.load(fname)
#no more updates
# Discard training-only state to shrink the model while keeping doctag
# vectors and inference.  NOTE(review): delete_temporary_training_data was
# removed in gensim 4+ -- confirm the installed version.
model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

#infer vector for a new document
vector = model.infer_vector(['米ダウ平均', '上昇'])

print (vector[:10])

image.png

Clustering

# All trained document vectors, one row per tagged date.
# NOTE(review): docvecs.vectors_docs is the gensim 3.x accessor; in 4+ it
# is model.dv.vectors -- confirm the installed version.
doc_vectors = model.docvecs.vectors_docs

print ('shape of doc_vectors: ', doc_vectors.shape)
#print (doc_vectors[0])

# Cluster the document vectors, then project everything to 2-D with PCA
# for plotting.
n_clusters = 3
kmeans_model = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=100)
kmeans_model.fit(doc_vectors)

labels=kmeans_model.labels_.tolist()
pca = PCA(n_components=2).fit(doc_vectors)
datapoint = pca.transform(doc_vectors)

plt.figure(figsize=(5, 3))
# One color per cluster, sliced from a palette sized by n_clusters so the
# two stay in sync (the original hard-coded a 3-entry list beside
# n_clusters=3, which silently breaks when either changes).
palette = ["#FFFF00", "#008000", "#0000FF", "#800080"]
label1 = palette[:n_clusters]

color = [label1[i] for i in labels]

plt.scatter(datapoint[:, 0], datapoint[:, 1], c=color)

# Project the cluster centroids through the same PCA and overlay them.
centroids = kmeans_model.cluster_centers_
centroidpoint = pca.transform(centroids)
plt.scatter(centroidpoint[:, 0], centroidpoint[:, 1], marker='^', s=150, c='#000000')
plt.show()

image.png

Word2Vec

# Bug fix: the bare name `gensim` is never imported in this file (only
# `from gensim.models.doc2vec import ...` is), so gensim.models.Word2Vec
# would raise NameError.  Import the class explicitly instead.
from gensim.models import Word2Vec

#model = Word2Vec.load(path + 'ja.bin')
# Load a pretrained Japanese word2vec model and probe word similarities.
model = Word2Vec.load(path + 'jv.bin')

#model.wv['king']
model.wv.most_similar('king')
#model.wv.most_similar(positive=['woman', 'king'], negative=['man'])

image.png

0
4
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do by signing up
0
4