最大エントロピー分類器などを訓練する際に,素性選択をおこなって特徴的な素性だけを用いたい場合は,情報利得を使います.
この本がとてもわかりやすいです.
informationgain.py
from math import log
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.probability import ConditionalFreqDist
from nltk.probability import FreqDist
def _entropy_term(p):
    """Return -p * log2(p), using the convention that 0 * log2(0) is 0."""
    if p <= 0.0:
        # log(0) raises ValueError; the limit of p * log(p) as p -> 0 is 0.
        return 0.0
    return -p * log(p, 2)


def information_gain(labeled_documents, n=1):
    """Score every n-gram feature by information gain and print a ranking.

    Each document is tokenized with ``word_tokenize``, turned into a set of
    n-grams (so a feature is counted at most once per document), and the
    per-label document counts are accumulated. The function prints H(C),
    the per-feature/per-label ratios, and finally every feature with its
    score in descending order.

    Args:
        labeled_documents: iterable of ``(label, text)`` pairs.
        n: n-gram order used for features (defaults to 1, i.e. unigrams).

    Returns:
        dict mapping each feature (an n-gram tuple) to its score.

    NOTE(review): the estimates follow the original script. The value
    printed as Pr(label|X = 1) is count(label, feature) / count(label), and
    the feature prior is normalised by ``docnumdist.N()`` (the sum of the
    document frequencies of all features), not by the number of documents.
    Both differ from the textbook definition of information gain; they are
    kept so that the printed scores stay the same -- confirm before relying
    on the absolute values.
    """
    labeldist = FreqDist()              # label   -> number of documents
    docnumdist = FreqDist()             # feature -> number of documents
    cldist = ConditionalFreqDist()      # feature -> label -> number of documents
    for (label, doc) in labeled_documents:
        labeldist.inc(label)
        # set(): presence/absence per document, not raw term frequency.
        for unit in set(ngrams(word_tokenize(doc), n)):
            cldist[unit].inc(label)
            docnumdist.inc(unit)

    # Entropy of the label distribution, H(C).
    total_docs = float(labeldist.N())
    H_C = sum([_entropy_term(labeldist[label] / total_docs)
               for label in labeldist.samples()])
    print("H(C) = %.2f" % H_C)

    # Conditional entropy terms, keyed by the feature itself so that
    # n-grams of any length work (no string formatting of tuples).
    H_given_present = {}
    H_given_absent = {}
    for c in cldist.conditions():
        H_given_present[c] = 0.0
        H_given_absent[c] = 0.0
        for label in cldist[c].samples():
            pr_label_given_x1 = cldist[c][label] / float(labeldist[label])
            pr_label_given_x0 = 1.0 - pr_label_given_x1
            print("Pr(%s|X_{%s} = 1) = %.2f" % (label, c, pr_label_given_x1))
            print("Pr(%s|X_{%s} = 0) = %.2f" % (label, c, pr_label_given_x0))
            # _entropy_term guards the case where either ratio is exactly 0,
            # e.g. a feature that occurs in every document of a label.
            H_given_present[c] += _entropy_term(pr_label_given_x1)
            H_given_absent[c] += _entropy_term(pr_label_given_x0)
        # Blank line between the listings of consecutive features.
        print("")

    ig = {}
    total_feature_docs = float(docnumdist.N())
    for c in cldist.conditions():
        pr_x1 = docnumdist[c] / total_feature_docs
        pr_x0 = 1.0 - pr_x1
        ig[c] = H_C - pr_x1 * H_given_present[c] - pr_x0 * H_given_absent[c]

    # Highest-scoring (most discriminative) features first.
    for feature, gain in sorted(ig.items(), key=lambda x: x[1], reverse=True):
        print("%s %s" % (feature, gain))
    return ig
# Toy sentiment corpus used as the demo input: (label, text) pairs.
documents = [
    ("pos", "good good good excellent"),
    ("pos", "good very excellent"),
    ("pos", "good fine good excellent"),
    ("pos", "bad very fine"),
    ("neg", "bad bad worse"),
    ("neg", "worse good worse excellent"),
    ("neg", "excellent very bad"),
    ("neg", "bad very worse"),
]
# Print the entropy, the per-feature ratios and the ranked scores.
information_gain(documents)
どちらかの極性にしか現れない''fine''や''worse''は情報利得が高く,特徴的な単語であることがわかります.
また,どちらの極性にも現れる''very''は,特徴的でない単語なので,情報利得が低いことが確認できます.
('worse',) 0.52573480121
('fine',) 0.5
('good',) 0.188721875541
('bad',) 0.188721875541
('excellent',) 0.0428913353502
('very',) 0.0