TF-IDFのメモ.
scikit-learnのTfidfVectorizerを使ったほうがはるかに簡単なのですが,勉強も兼ねて自前で実装してみます.
おかしなところがあれば,優しく教えてください.
計算の対象となる文書は以下の hoge.txt です(1行を1つの文書として扱います).
hoge.txt
white black red
white white black
white black black black
white
#準備(hoge.txt を読み込み,語彙リスト word_set と,文書ごとの単語リスト doc_words が作成済みであるとします)
print(word_set)
['black', 'red', 'white']
print(doc_words)
[['white', 'black', 'red'], ['white', 'white', 'black'], ['white', 'black', 'black', 'black'], ['white']]
def tokenizer(word_set, doc_words):
    """Convert each document's words into their vocabulary indices.

    Args:
        word_set: Sequence of vocabulary words; a word's position in this
            sequence is used as its ID. Words must be hashable.
        doc_words: Iterable of documents, each an iterable of words.

    Returns:
        A list with one list per document, holding the index in
        ``word_set`` of every word, in the original word order.

    Raises:
        ValueError: If a document contains a word missing from ``word_set``.
    """
    # Build the word -> index table once instead of running
    # ``word_set.index`` (a linear scan) for every single token.
    # ``setdefault`` keeps the first position of a duplicated word,
    # which is what ``list.index`` would have returned.
    index_of = {}
    for position, word in enumerate(word_set):
        index_of.setdefault(word, position)

    token_doc = []
    for words in doc_words:
        try:
            token_doc.append([index_of[w] for w in words])
        except KeyError as err:
            # Keep the exception type that ``list.index`` used to raise.
            raise ValueError(f"{err.args[0]!r} is not in list") from None
    return token_doc
import math

# Replace every word with its index in word_set, document by document.
token_doc = tokenizer(word_set, doc_words)
doc_num = len(token_doc)

# IDF per vocabulary entry: ln(N / df) + 1, where N is the number of
# documents and df is the number of documents containing the term.
# NOTE: a vocabulary word that appears in no document gives df == 0 and
# raises ZeroDivisionError; this presumably cannot happen when word_set is
# derived from doc_words -- confirm against the preparation code.
IDF = []
for j in range(len(word_set)):
    doc_freq = sum(1 for d_list in token_doc if j in d_list)
    IDF.append(math.log(doc_num / doc_freq) + 1)

# TF per document: raw occurrence count of each vocabulary entry.
TF_set = []
for doc in token_doc:
    TF = [0] * len(word_set)
    for t in doc:
        TF[t] += 1
    TF_set.append(TF)

# TF-IDF per document: element-wise product of the TF row and the IDF vector.
TF_IDF_set = [
    [tf * idf for tf, idf in zip(temp_TF, IDF)]
    for temp_TF in TF_set
]
#結果
print(token_doc)
[[2, 0, 1], [2, 2, 0], [2, 0, 0, 0], [2]]
print(word_set)
['black', 'red', 'white']
print(TF_IDF_set)
[[1.2876820724517808, 2.386294361119891, 1.0], [1.2876820724517808, 0.0, 2.0], [3.8630462173553424, 0.0, 1.0], [0.0, 0.0, 1.0]]