Posted at

NLTKを使って情報利得を計算

More than 5 years have passed since last update.

最大エントロピー分類器などを訓練する際,素性選択をおこなって特徴的な素性だけを用いたい場合に情報利得を使います.

この本がとてもわかりやすいです.


informationgain.py

from math import log

from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.probability import ConditionalFreqDist
from nltk.probability import FreqDist

def _entropy(probabilities):
    """Return the base-2 Shannon entropy of a discrete distribution.

    Zero probabilities are skipped, i.e. 0 * log(0) is taken to be 0, so a
    degenerate distribution never triggers a math domain error.
    """
    return sum((-p * log(p, 2) for p in probabilities if p > 0), 0.0)


def information_gain(labeled_documents, n=1):
    """Print and return the information gain of every n-gram feature.

    Each n-gram is treated as a binary feature X (1 if it occurs in a
    document, 0 otherwise) and scored against the document label C as

        IG(X) = H(C) - P(X=1) * H(C|X=1) - P(X=0) * H(C|X=0)

    Args:
        labeled_documents: iterable of (label, text) pairs; it is traversed
            once.
        n: size of the n-grams used as features (default 1, i.e. unigrams).

    Returns:
        dict mapping each n-gram (a tuple of tokens) to its information gain
        in bits.

    Side effects:
        Prints H(C), then for every n-gram the conditional label
        probabilities followed by a blank line, and finally the n-grams
        ranked by decreasing information gain.
    """
    labeldist = FreqDist()           # label  -> number of documents
    docnumdist = FreqDist()          # n-gram -> number of documents containing it
    cldist = ConditionalFreqDist()   # n-gram -> label -> number of documents

    for (label, doc) in labeled_documents:
        # "+= 1" rather than FreqDist.inc(), which only exists in old NLTK.
        labeldist[label] += 1
        # set(): count each n-gram at most once per document.
        for unit in set(ngrams(word_tokenize(doc), n)):
            cldist[unit][label] += 1
            docnumdist[unit] += 1

    total_docs = labeldist.N()
    labels = list(labeldist)

    H_C = _entropy(labeldist[label] / float(total_docs) for label in labels)
    print("H(C) = %.2f" % H_C)

    ig = {}
    for c in cldist.conditions():
        docs_with = docnumdist[c]
        docs_without = total_docs - docs_with

        pr_labels_given_x1 = []
        pr_labels_given_x0 = []
        # Every label is visited, including those never seen with this
        # n-gram: they still contribute to the X = 0 distribution.
        for label in labels:
            hits = cldist[c][label]
            # Condition on the feature value: the denominators are the
            # numbers of documents with / without the n-gram, not the
            # number of documents carrying the label.
            pr_label_given_x1 = hits / float(docs_with)
            if docs_without:
                pr_label_given_x0 = (labeldist[label] - hits) / float(docs_without)
            else:
                # The n-gram occurs in every document, so X = 0 never
                # happens; its term is weighted by P(X=0) = 0 below.
                pr_label_given_x0 = 0.0
            print("Pr(%s|X_{%s} = 1) = %.2f" % (label, c, pr_label_given_x1))
            print("Pr(%s|X_{%s} = 0) = %.2f" % (label, c, pr_label_given_x0))
            pr_labels_given_x1.append(pr_label_given_x1)
            pr_labels_given_x0.append(pr_label_given_x0)
        print("")

        # P(X=1) is the fraction of documents containing the n-gram.
        pr_x1 = docs_with / float(total_docs)
        pr_x0 = 1.0 - pr_x1
        ig[c] = (H_C
                 - pr_x1 * _entropy(pr_labels_given_x1)
                 - pr_x0 * _entropy(pr_labels_given_x0))

    for unit, gain in sorted(ig.items(), key=lambda x: x[1], reverse=True):
        print("%s %s" % (unit, gain))

    return ig

# Toy sentiment corpus: eight (label, text) pairs, four labelled "pos" and
# four labelled "neg".
documents = [
    ("pos", "good good good excellent"),
    ("pos", "good very excellent"),
    ("pos", "good fine good excellent"),
    ("pos", "bad very fine"),
    ("neg", "bad bad worse"),
    ("neg", "worse good worse excellent"),
    ("neg", "excellent very bad"),
    ("neg", "bad very worse"),
]

# Score every unigram of the toy corpus and print the ranking.
information_gain(documents)


どちらかの極性にしか現れない''fine''や''worse''は情報利得が高く,特徴的な単語であることがわかります.

また,どちらの極性にも現れる''very''は,特徴的でない単語なので,情報利得が低いことが確認できます.

('worse',) 0.52573480121

('fine',) 0.5
('good',) 0.188721875541
('bad',) 0.188721875541
('excellent',) 0.0428913353502
('very',) 0.0