PythonからTwitter検索をかけ、マルコフ連鎖で文章生成をしてみる。 #Python

Twitter APIで任意のキーワードでツイートを取得。
取得したデータをtextで保存し、MeCabに渡す。
形態素解析を行い、マルコフ連鎖で文章を作ってみる。

今回はカウント140に抑えて、そのままツイートに使えるようにしているけど、
長文にしてみてもいいかもしれない。精度は低い。
しゅうまい、圧縮新聞の凄さを実感しただけだった。

markov.py


#!/usr/bin/env python
# -*- coding: utf-8 -*-
from requests_oauthlib import OAuth1Session
import json
import sys
import MeCab
import random


# Main loop: ask for a keyword on every pass; the code further down leaves the
# loop when the answer is empty.
while True:
    search_words = raw_input(u"Keyword?: ")

    # Twitter OAuth 1 credentials (masked here), handed to OAuth1Session by the
    # request helpers. Presumably consumer key/secret and access token
    # key/secret -- confirm against the app settings.
    C_KEY = "******************************"
    C_SECRET = "******************************"
    A_KEY = "******************************"
    A_SECRET = "******************************"


    def Limit_Status():
        """Print how many API calls remain according to Twitter.

        Calls the ``application/rate_limit_status`` endpoint with the global
        OAuth credentials (``C_KEY``, ``C_SECRET``, ``A_KEY``, ``A_SECRET``)
        and prints the ``x-rate-limit-remaining`` response header. A failed
        request or a missing header is reported on stdout instead of being
        silently skipped or raising ``KeyError``.

        Returns:
            The ``Limit_Status`` function object itself.
        """
        url = "https://api.twitter.com/1.1/application/rate_limit_status.json"
        tw = OAuth1Session(C_KEY, C_SECRET, A_KEY, A_SECRET)
        req = tw.get(url, params={})

        if req.status_code != 200:
            print("API remain: unknown (HTTP %d)" % req.status_code)
        else:
            limit = req.headers.get("x-rate-limit-remaining")
            if limit is None:
                print("API remain: unknown (header missing)")
            else:
                print("API remain: " + limit)

        # NOTE(review): returning the function itself looks unintentional, but
        # the return value is left unchanged so existing callers see no
        # difference.
        return Limit_Status

    def Search_words():
        """Search recent Japanese tweets for the keyword and append them to test.txt.

        Reads the global ``search_words`` and the global OAuth credentials
        (``C_KEY``, ``C_SECRET``, ``A_KEY``, ``A_SECRET``), asks the search
        endpoint for up to 100 recent tweets, cleans each tweet text and
        appends the cleaned texts to ``test.txt`` encoded as UTF-8. The texts
        are written back to back with no separator.

        Cleaning cuts each text at the first "http", the first "@" and the
        first "RT", so URLs, mentions and retweet bodies are dropped. It is
        applied to every tweet, so tweets without a URL are stored as well.
        Texts that are empty after cleaning are skipped.

        Returns:
            None. On a non-200 response a message is printed and nothing is
            written.
        """
        url = "https://api.twitter.com/1.1/search/tweets.json?"

        # raw_input() gives a byte string on Python 2; decode it once so the
        # query is sent as text. Already-decoded text is used unchanged.
        query = search_words
        if isinstance(query, bytes):
            query = query.decode("utf-8")

        params = {
            "q": query,
            "lang": "ja",
            "result_type": "recent",
            "count": "100",
        }
        tw = OAuth1Session(C_KEY, C_SECRET, A_KEY, A_SECRET)
        req = tw.get(url, params=params)
        if req.status_code != 200:
            # An error payload has no "statuses" list; report it rather than
            # failing later with a KeyError.
            print("Search failed: HTTP %d" % req.status_code)
            return

        tweets = json.loads(req.text)
        cleaned = []
        for tweet in tweets.get("statuses", []):
            text = tweet["text"]
            for marker in ("http", "@", "RT"):
                text = text.split(marker, 1)[0]
            if text:
                cleaned.append(text)

        # Open the file once, in append mode, and let the context manager
        # close it even if the write fails.
        with open("test.txt", "ab") as f:
            f.write("".join(cleaned).encode("utf-8"))


    def Mecab_file(order=8, max_chars=140):
        """Build a word-level Markov chain from test.txt and print one sentence.

        The contents of ``test.txt`` are segmented with MeCab (``-Owakati``),
        split into words, and every run of ``order`` consecutive words is
        mapped to the list of words seen right after it. Generation starts
        from a randomly chosen run and keeps drawing a random successor until
        the sentence would exceed ``max_chars`` characters or the current run
        has no recorded successor. The starting run itself is not part of the
        printed sentence; only the words drawn after it are.

        Args:
            order: Number of preceding words that make up one chain state.
            max_chars: Upper bound on the length of the printed sentence,
                counted in characters.

        Returns:
            None. The sentence, or a notice when the file holds too few words
            to build a chain, is printed to stdout.

        Raises:
            ValueError: If ``order`` is smaller than 1.
        """
        if order < 1:
            raise ValueError("order must be at least 1")

        with open("test.txt", "rb") as f:
            data = f.read()

        mt = MeCab.Tagger("-Owakati")
        wakati = mt.parse(data)
        # Work on text rather than bytes so that lengths are counted in
        # characters and a multi-byte character is never cut in half.
        if isinstance(wakati, bytes):
            wakati = wakati.decode("utf-8", "ignore")
        # The wakati output is a single string; split it so the chain is
        # built over words, not over individual characters.
        wordlist = wakati.split()

        markov = {}
        for i in range(len(wordlist) - order):
            state = tuple(wordlist[i:i + order])
            markov.setdefault(state, []).append(wordlist[i + order])

        if not markov:
            print("Not enough text in test.txt to build a chain.")
            return

        state = random.choice(list(markov))
        sentence = ""
        # Stop at a dead end (a state with no successor) instead of looping
        # forever; every pass adds a non-empty word, so the length check
        # guarantees termination.
        while state in markov:
            word = random.choice(markov[state])
            if len(sentence) + len(word) > max_chars:
                break
            sentence += word
            state = state[1:] + (word,)

        print(sentence)

    # A non-empty keyword runs one cycle: fetch tweets, generate a sentence,
    # then print the remaining API quota. An empty keyword ends the loop.
    if search_words:
        Search_words()
        Mecab_file()
        Limit_Status()
    else:
        break

8連鎖で運用してみた。
4連鎖くらいで止めておかないと面白くないことが分かった。

本来はJSONデータから不要なデータをすべて取り除きたいところだが、現時点の私の知識では限界。ひとまず、本文中にhttpが含まれる場合はsplitでhttp以降を取り除いてみた。

例によって同一ディレクトリにtest.txtがなければ生成。
ある場合は末尾にドンドン追記していく(追記モードで開いているため、上書きはされない)。

whileループは、サーチワードを入力せずにEnterを押すとbreakして終了する。
様々なサーチワードを別々に格納していくと良いのかもしれない。

編集かけてみました。
正規表現でいらない箇所をガッツリ削り、
文末がおかしくならないように「です」「ます」などをランダムチョイス。

なんかこっちのほうが実用性がある気がしました。

    def Mecab_file(max_words=90):
        """Build a first-order word Markov chain from tweet.txt and print one sentence.

        The contents of ``tweet.txt`` are segmented with MeCab (``-Owakati``)
        and split on spaces. Each word is mapped to the list of words that
        followed it. Starting from the first word of the text, up to
        ``max_words`` words are chained by drawing a random successor each
        time; generation stops early when a word has no recorded successor.

        All printable ASCII characters except the space are then removed from
        the sentence, and one of the endings "。", "です。" or "だ。" is picked
        at random and appended directly, with no space in between.

        Args:
            max_words: Maximum number of words chained into the sentence.

        Returns:
            None. The sentence, or a notice when the file holds no words, is
            printed to stdout.
        """
        # Imported locally because `re` is not among the file's visible
        # imports.
        import re

        with open("tweet.txt", "rb") as f:
            data = f.read()

        mt = MeCab.Tagger("-Owakati")
        wakati = mt.parse(data)
        # Work on text so the sentence and the ending can be joined into one
        # string regardless of what type the binding returned.
        if isinstance(wakati, bytes):
            wakati = wakati.decode("utf-8", "ignore")
        wordlist = [w for w in wakati.rstrip(" \n").split(" ") if w]
        if not wordlist:
            print("tweet.txt has no words to build a sentence from.")
            return

        markov = {}
        for prev, cur in zip(wordlist, wordlist[1:]):
            markov.setdefault(prev, []).append(cur)

        word = wordlist[0]
        picked = []
        for _ in range(max_words):
            picked.append(word)
            successors = markov.get(word)
            if not successors:
                # Dead end: this word was never followed by another one.
                break
            word = random.choice(successors)

        # One pass over [!-~] is enough: it already covers the ASCII
        # punctuation ranges, so a separate punctuation pass is redundant.
        body = re.sub(r"[!-~]", "", "".join(picked))
        ending = random.choice([u"。", u"です。", u"だ。"])
        print(body + ending)