More than 5 years have passed since last update.

Python2 + word2vec

Last updated at 2016-09-01 / Posted at 2016-08-30

やったこと

tweepyでtweetを保存しword2vecに入れて遊ぶ
いろんなところを参考にもといコピペしてやったので新規性は余り有りません.
'ショタ - 男 + 女 = ロリ'が出ることを一つの目標にしました.

環境

CentOS7
Anaconda2-4.1.0

前提

tweepy
MeCab
mecab-python
gensim

コード

ところどころ冗長なのは気にしない.

TwStream.py

# -*- encoding:utf-8 -*-

import sys
import os
import re
import time
import tweepy

# Directory containing this script; stream.txt is written next to it.
HERE = os.path.abspath(os.path.dirname(__file__))

# Twitter API credentials, left blank here (fill in before running).
# CK/CS are passed to tweepy.OAuthHandler and AT/AS to set_access_token.
CK = ''
CS = ''
AT = ''
AS = ''

class MyListener(tweepy.StreamListener):
	def __init__(self):
		super(MyListener, self).__init__()

	def on_status(self, status):
		try:
			tw = status.text.strip()
			# 日本語tweetのみ
			if re.search(u'[ぁ-んァ-ン]', tw) is not None:
				with open(HERE + '/stream.txt', 'a') as f:
					# tweetにはタブが無いので区切り文字に使います
					f.write(tw.encode('utf-8') + '\n\t\n')
				print tw.encode('utf-8')
		except tweepy.TweepError as e:
			print e.reason
			if  'u\'code\': 88' in e.reason:
				print 'wait 15 min'
				time.sleep(15*60)

	def on_error(self, status_code):
		print 'error ', status_code
		if status_code == 420:
			print 'wait 15 min'
			time.sleep(15*60)
		time.sleep(10)

	def on_limit(self, status):
		print 'limit'
		time.sleep(10)

	def on_timeout(self, status):
		print 'timeout'
		time.sleep(10)

if __name__ == '__main__':
	while True:
		try:
			auth = tweepy.OAuthHandler(CK, CS)
			auth.set_access_token(AT, AS)
			print 'auth set'

			st = tweepy.Stream(auth, MyListener())
			print 'sampling'
			st.sample()

		except tweepy.TweepError as e:
			st.disconnect()
			print e.reason
			if 'u\'code\': 88' in e.reason:
				print 'wait 15 min'
				time.sleep(15*60)
		except KeyboardInterrupt:
			st.disconnect()
			break
		except:
			st.disconnect()
			continue

ここで'stream.txt'を'raw.txt'にリネーム.

W2V.py

# -*- encoding:utf-8 -*-

import sys
import os
import MeCab
import gensim
from gensim.models import word2vec

# Directory containing this file; appended to sys.path so that the
# local modules imported below can be found.
HERE = os.path.abspath(os.path.dirname(__file__))
sys.path.append(HERE)

# 自作モジュール
from MeCabRW import *
from ProcStr import *

if __name__ == '__main__':
    MODEL = HERE + '/twitter.model'

    try:
        # モデルが有れば読み込む
        print 'loading model'
        model = word2vec.Word2Vec.load(MODEL)
        print 'model loaded'
    except:
        # 無ければ作成
        print 'model not loaded'
        print 'creating model'
        # mt = MeCab.Tagger('-Owakati')で可
        mt = mtWakatiNeo()
        avoid = ['RT']
        mecabParseRW(HERE + '/raw.txt', HERE + '/sep.txt', mt, avoid)

        # 分かち書きデータを読み込みます
        corp = word2vec.Text8Corpus(HERE + '/sep.txt')
        # フレーズ単位で解析できるようにします
        phrcorp = gensim.models.Phrases(corp)
        model = word2vec.Word2Vec(phrcorp[corp], size=2000, min_count=2)
        model.save(MODEL)

        print 'creating done'

    pos = [u'ショタ', u'女']
    neg = [u'男']

    sim = model.most_similar(positive=pos, negative=neg)

    print '+: ', ' '.join([i.encode('utf-8') for i in pos])
    print '-: ', ' '.join([i.encode('utf-8') for i in neg])
    print
    for i, j in sim:
        print i.encode('utf-8'), '\t', j

MeCabRW.py

# -*- coding: utf-8 -*-

import re
import MeCab

def mtWakatiNeo(dicdir='/usr/lib64/mecab/dic/mecab-ipadic-neologd'):
    """Return a MeCab tagger configured for wakati (space-separated) output.

    dicdir: dictionary directory handed to MeCab's ``-d`` option. The
        default is the mecab-ipadic-neologd location this script was
        written for; pass another path when the dictionary is installed
        elsewhere.
    """
    opt = '-O wakati -d ' + dicdir
    return MeCab.Tagger(opt)

def mecabParseRW(pathIn, pathOut, mt, avoid=None):
    """Clean raw tweets, tokenize them with ``mt`` and write the result.

    The input file is read whole, URLs and @mentions are replaced by a
    space, and the text is split into records on the newline-tab-newline
    delimiter. Records containing any string from ``avoid`` are dropped;
    the rest are passed to ``mt.parse``. Parsed records are joined with
    the same delimiter, written to ``pathOut`` and returned.

    pathIn:  path of the raw tweet file.
    pathOut: path of the tokenized output file (overwritten).
    mt:      object with a ``parse(text)`` method (a MeCab tagger).
    avoid:   iterable of substrings; a record containing any of them is
             skipped. Defaults to no filtering.

    Returns the joined output string that was written to ``pathOut``.
    """
    # None instead of a shared mutable list as the default.
    avoid = () if avoid is None else avoid

    with open(pathIn, 'r') as f:
        sIn = f.read()
    # Strip URLs and @screen_name mentions.
    sIn = re.sub(r'https?://[A-Za-z0-9/:%#\$&\?\(\)~\.=\+\-]+', ' ', sIn)
    sIn = re.sub(r'@[A-Za-z0-9_]+', ' ', sIn)

    sOut = []
    for tweet in sIn.split('\n\t\n'):
        if any(word in tweet for word in avoid):
            continue
        # parse() occasionally returns None, hence the type checks below.
        parsed = mt.parse(tweet)
        if isinstance(parsed, bytes):
            # Byte string (the Python 2 ``str``): keep only valid UTF-8.
            try:
                parsed.decode('utf-8')
            except UnicodeDecodeError:
                continue
        elif not isinstance(parsed, str):
            # None or an unexpected type: skip this record.
            continue
        sOut.append(parsed)

    sOut = '\n\t\n'.join(sOut)
    with open(pathOut, 'w') as f:
        f.write(sOut)
    return sOut

結果

60MBくらいtweetを集めて実行

loading model
model loaded
+:  ショタ 女
-:  男

猿 	0.833452105522
マカロン 	0.832771897316
ロリ 	0.830695152283
褒め言葉 	0.828270435333
喋り_方 	0.825944542885
梅原 	0.825801610947
有沙 	0.822319507599
貧乳 	0.818123817444
百_合 	0.817329347134
本田翼 	0.816138386726

良い感じではないでしょうか.
逆の式も同じ感じになりました.

loading model
model loaded
+:  ロリ 男
-:  女

紫 	0.847893893719
百_合 	0.824845731258
ショタ 	0.82099032402
ド 	0.81635427475
つむぎ 	0.813044965267
姫 	0.812274694443
パロディ 	0.809535622597
モブ 	0.804774940014
白 	0.802413225174
黒髪 	0.800325155258

もろもろ

本当はWindows + Python3でやりたかったんですが, 文字コードや既存資料の関係でこうなりました.
Python3しか使ったことが無かったので書き方がおかしいところが有るかも知れません.

参考

http://docs.tweepy.org/en/v3.5.0/streaming_how_to.html
https://radimrehurek.com/gensim/models/phrases.html#module-gensim.models.phrases
http://tjo.hatenablog.com/entry/2014/06/19/233949

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up