Help us understand the problem. What is going on with this article?

たぶん1分くらいでできる形態素解析とtfidf(テストコードつき)

More than 5 years have passed since last update.

下準備

pip install nltk
pip install mecab-python

下のコードを貼りつけて実行してみてください

TF-IDFを出力するための関数がtfidf
形態素解析するための関数がextract_words
下の方にあるimport unittest以下の長ったらしいヤツはテスト

#!/usr/bin/env python
#-*- encoding: utf-8 -*-
import nltk
import MeCab
import urllib2
from urllib2 import HTTPError
from itertools import chain


def tfidf(doc,docs):
  """対象の文書と全文の形態素解析した単語リストを指定すると対象の文書のTF-IDFを返す"""
  tokens = list(chain.from_iterable(docs)) #flatten
  A = nltk.TextCollection(docs)
  token_types = set(tokens)
  return [{"word":token_type,"tfidf":A.tf_idf(token_type, doc)} for token_type in token_types]


def extract_words(text):
  """テキストを与えると名詞のリストにして返す"""
  text =  text.encode("utf-8") if isinstance(text,unicode) else text
  mecab = MeCab.Tagger("")
  node = mecab.parseToNode(text)
  words = []
  while node:
    fs = node.feature.split(",")
    if (node.surface is not None) and node.surface != "" and fs[0] in [u'名詞']:
      words.append(node.surface)
    node = node.next
  return words

import unittest

class MachineLearningTest(unittest.TestCase):
  def test_extract_words(self):
    """形態素解析のテスト"""
    text = "textを形態素解析して、名詞のリストを返す"
    keywords = extract_words(text)
    self.assertEqual(keywords, ["text","形態素","解析","名詞","リスト"])
  def test_tfidf(self):
    """tfidfのテスト"""
    urls = ["http://qiita.com/puriketu99/items/"+str(i) for i in range(1,10)]
    def url2words(url):
      try:
        html = urllib2.urlopen(url).read()
      except HTTPError:
        html = ""
      plain_text = nltk.clean_html(html).replace('\n','')
      words = extract_words(plain_text)
      return words
    docs = [url2words(url) for url in urls]
    tfidfs_fizzbuzz = tfidf(docs[0],docs)
    tfidfs_fizzbuzz.sort(cmp=lambda x,y:cmp(x["tfidf"],y["tfidf"]),reverse=True)
    result = [e for i,e in enumerate(tfidfs_fizzbuzz) if len(e["word"]) > 2 and i < 30]
    self.assertEqual(result[7]["word"],"yaotti")#Qiita側がデザイン変えるとテスト失敗するかも
    print result
    #[{'tfidf': 0.08270135278254376, 'word': 'quot'},
    # {'tfidf': 0.02819364299404901, 'word': 'FizzBuzz'},
    # {'tfidf': 0.02067533819563594, 'word': 'fizzbuzz'},
    # {'tfidf': 0.02067533819563594, 'word': 'Buzz'},
    # {'tfidf': 0.016916185796429405, 'word': 'Fizz'},
    # {'tfidf': 0.016726267030018446, 'word': 'end'},
    # {'tfidf': 0.015036609596826138, 'word': 'map'},
    # {'tfidf': 0.015036609596826138, 'word': 'yaotti'},
    # {'tfidf': 0.011277457197619604, 'word': 'def'}]

if __name__ == '__main__':
  unittest.main()

参考
TF-IDFの計算
http://everydayprog.blogspot.jp/2011/12/tf-idf.html

Why not register and get more from Qiita?
  1. We will deliver articles that match you
    By following users and tags, you can catch up information on technical fields that you are interested in as a whole
  2. you can read useful information later efficiently
    By "stocking" the articles you like, you can search right away