ＴＦＩＤＦ

Python

Posted at 2025-01-07

import collections
import math
import re

def tokenize(text):
    """Split *text* into word tokens.

    A token is a maximal run of regex word characters (letters, digits and
    underscore, including non-ASCII letters); punctuation and whitespace act
    as separators. Case is preserved.

    NOTE: text written without spaces between words (e.g. Japanese) is not
    segmented -- each unbroken run of word characters becomes a single token.

    Args:
        text: The string to tokenize.

    Returns:
        A list of tokens in order of appearance (empty for empty input).
    """
    return re.findall(r'\w+', text)

def compute_tf(tokens):
    """Count raw term frequencies.

    Args:
        tokens: An iterable of hashable tokens.

    Returns:
        A ``collections.Counter`` mapping each token to the number of times
        it occurs in *tokens*.
    """
    return collections.Counter(tokens)

def compute_idf(documents):
    """Compute the inverse document frequency of every term in a corpus.

    The IDF of a term is ``log(N / df)`` where ``N`` is the number of
    documents and ``df`` is the number of documents containing the term.
    A term that appears in every document therefore gets an IDF of 0.

    Args:
        documents: A sequence of tokenized documents (each an iterable of
            tokens).

    Returns:
        A ``collections.defaultdict(int)`` mapping each term to its IDF.
        Prefer ``.get(term, 0)`` for lookups: indexing a missing term
        inserts it with value 0.
    """
    n_docs = len(documents)

    # Document frequency: each document counts a term at most once.
    doc_freq = collections.Counter()
    for doc in documents:
        doc_freq.update(set(doc))

    idf = collections.defaultdict(int)
    for term, count in doc_freq.items():
        idf[term] = math.log(n_docs / count)
    return idf

def compute_tfidf(tf, idf):
    """Weight term frequencies by inverse document frequency.

    Args:
        tf: Mapping of term -> term frequency.
        idf: Mapping of term -> inverse document frequency.

    Returns:
        A dict mapping every term in *tf* to ``tf[term] * idf[term]``.
        Terms absent from *idf* get a weight of 0.
    """
    # .get() avoids inserting missing terms when idf is a defaultdict.
    return {term: freq * idf.get(term, 0) for term, freq in tf.items()}

def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two sparse vectors.

    Args:
        vec1: Mapping of term -> weight.
        vec2: Mapping of term -> weight.

    Returns:
        ``dot(vec1, vec2) / (|vec1| * |vec2|)``, or 0 when either vector has
        zero magnitude (including when it is empty).
    """
    # A term missing from either vector contributes 0 to the dot product,
    # so iterating over vec1's terms alone is sufficient.
    dot_product = sum(
        weight * vec2.get(term, 0) for term, weight in vec1.items()
    )
    magnitude1 = math.sqrt(sum(val ** 2 for val in vec1.values()))
    magnitude2 = math.sqrt(sum(val ** 2 for val in vec2.values()))
    if magnitude1 == 0 or magnitude2 == 0:
        return 0
    return dot_product / (magnitude1 * magnitude2)

def similarity_search(query, documents):
    """Rank *documents* by TF-IDF cosine similarity to *query*.

    IDF weights are computed from *documents* only, so query terms that
    occur in no document receive a weight of 0.

    Args:
        query: The query string.
        documents: A sequence of document strings.

    Returns:
        A list of ``(index, similarity)`` tuples, one per document, sorted
        by similarity in descending order. ``index`` is the document's
        position in *documents*.
    """
    tokenized_docs = [tokenize(doc) for doc in documents]
    idf = compute_idf(tokenized_docs)

    query_tfidf = compute_tfidf(compute_tf(tokenize(query)), idf)

    results = []
    for i, doc in enumerate(tokenized_docs):
        doc_tfidf = compute_tfidf(compute_tf(doc), idf)
        results.append((i, cosine_similarity(query_tfidf, doc_tfidf)))

    # list.sort is stable, so documents with equal scores keep input order.
    results.sort(key=lambda x: x[1], reverse=True)
    return results

# Demo: rank a small mixed English/Japanese corpus against two queries.
if __name__ == '__main__':
    documents = [
        "This is the first document.",
        "This document is the second document.",
        "And this is the third one.",
        "これは最初のドキュメントです。",
        "このドキュメントは2番目のドキュメントです。",
        "そして、これは3番目のものです。"
    ]
    for query in ("ドキュメント", "first document"):
        results = similarity_search(query, documents)
        print(f"Query: {query}")
        for index, score in results:
            print(f"Document {index}: {score}")

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do by signing up