Posted at

# NLTKを使って情報利得を計算

More than 5 years have passed since last update.

この本がとてもわかりやすいです．

informationgain.py

from math import log

from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.probability import ConditionalFreqDist
from nltk.probability import FreqDist

def _entropy(probabilities):
    """Return the Shannon entropy, in bits, of an iterable of probabilities.

    Zero probabilities contribute nothing (the usual ``0 * log 0 = 0``
    convention), which also keeps ``math.log`` from raising on them.
    """
    total = 0.0
    for p in probabilities:
        if p > 0.0:
            total -= p * log(p, 2)
    return total


def information_gain(labeled_documents, n=1):
    """Score every n-gram by its information gain with respect to the labels.

    Each n-gram ``t`` is treated as a binary feature ``X_t`` that is 1 when
    the n-gram occurs in a document and 0 otherwise; repeated occurrences
    inside one document are counted once.  The score is::

        IG(t) = H(C) - Pr(X_t=1) * H(C | X_t=1) - Pr(X_t=0) * H(C | X_t=0)

    with all entropies measured in bits.

    The class entropy, the per-feature conditional label probabilities and
    the scores (highest first) are printed to standard output.

    Args:
        labeled_documents: iterable of ``(label, text)`` pairs.  It is
            consumed once; ``text`` is split with ``word_tokenize``.
        n: order of the n-grams used as features (default 1, unigrams).

    Returns:
        dict mapping each n-gram (a tuple of tokens) to its information gain.
    """
    label_counts = FreqDist()               # label -> number of documents
    doc_freq = FreqDist()                   # n-gram -> number of documents containing it
    term_label_counts = ConditionalFreqDist()  # n-gram -> label -> number of documents

    for label, doc in labeled_documents:
        # Item assignment is used instead of FreqDist.inc(), which is not
        # available in NLTK 3.
        label_counts[label] += 1
        # set(): a feature is binary per document, so count each n-gram once.
        for term in set(ngrams(word_tokenize(doc), n)):
            term_label_counts[term][label] += 1
            doc_freq[term] += 1

    num_docs = label_counts.N()
    labels = list(label_counts.keys())

    H_C = _entropy(label_counts[label] / float(num_docs) for label in labels)
    print("H(C) = %.2f" % H_C)

    ig = {}
    for term in term_label_counts.conditions():
        docs_with = doc_freq[term]          # always >= 1 for a seen n-gram
        docs_without = num_docs - docs_with

        pr_labels_given_x1 = []
        pr_labels_given_x0 = []
        for label in labels:
            with_term = term_label_counts[term][label]
            # Condition on the feature value: divide by the number of
            # documents that do (or do not) contain the n-gram, so that each
            # list is a probability distribution over the labels.
            p1 = with_term / float(docs_with)
            if docs_without:
                p0 = (label_counts[label] - with_term) / float(docs_without)
            else:
                # The n-gram is in every document, so X=0 is never observed;
                # its term is weighted by Pr(X=0) = 0 below.
                p0 = 0.0
            print("Pr(%s|X_{%s} = 1) = %.2f" % (label, term, p1))
            print("Pr(%s|X_{%s} = 0) = %.2f" % (label, term, p0))
            pr_labels_given_x1.append(p1)
            pr_labels_given_x0.append(p0)
        print("")

        pr_x1 = docs_with / float(num_docs)
        pr_x0 = docs_without / float(num_docs)
        ig[term] = (H_C
                    - pr_x1 * _entropy(pr_labels_given_x1)
                    - pr_x0 * _entropy(pr_labels_given_x0))

    for term, score in sorted(ig.items(), key=lambda x: x[1], reverse=True):
        print("%s %s" % (term, score))

    return ig

# Toy sentiment corpus of (label, text) pairs fed to information_gain().
# NOTE(review): only the "pos" examples survive in this listing and the list
# literal was left unclosed.  The "neg" examples that the discussion below
# relies on (e.g. documents containing "worse") are missing and must be
# restored before the scores are meaningful -- TODO recover them.
documents = [
    ("pos", "good good good excellent"),
    ("pos", "good very excellent"),
    ("pos", "good fine good excellent"),
    ("pos", "bad very fine"),
]

information_gain(documents)

どちらか一方の極性にしか現れない「fine」や「worse」は情報利得が高く，特徴的な単語であることがわかります．

また，どちらの極性にも現れる''very''は，特徴的でない単語なので，情報利得が低いことが確認できます．

('worse',) 0.52573480121

('fine',) 0.5
('good',) 0.188721875541