import collections
import math
import re
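# A minimal TF-IDF similarity search: documents and the query are tokenized,
# each term is weighted by term frequency times inverse document frequency,
# and documents are ranked by cosine similarity to the query vector.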
def tokenize(text):
    """Lowercase the text and return its runs of word characters as tokens."""
    return re.findall(r'\w+', text.lower())
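# The \w+ regex keeps unsegmented Japanese text (e.g. "これは最初のドキュメントです") as a
# single token, so a short query such as "ドキュメント" cannot overlap it. One possible
# workaround, sketched here but not used by similarity_search below, is to fall back to
# overlapping character bigrams for non-ASCII runs, a common trick for indexing CJK text.
def tokenize_with_cjk_bigrams(text):
    tokens = []
    for word in re.findall(r'\w+', text.lower()):
        if word.isascii():
            # Whitespace-delimited scripts keep whole words.
            tokens.append(word)
        else:
            # Unsegmented scripts are split into overlapping 2-character grams.
            tokens.extend(word[i:i + 2] for i in range(max(len(word) - 1, 1)))
    return tokens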
def compute_tf(tokens):
    """Raw term frequencies: how often each token occurs in one document."""
    return collections.Counter(tokens)
def compute_idf(documents):
    """Inverse document frequency for every term in the tokenized corpus."""
    N = len(documents)
    idf = collections.defaultdict(int)
    # Count the number of documents each term appears in (document frequency).
    for doc in documents:
        for term in set(doc):
            idf[term] += 1
    # Convert document frequency to idf = log(N / df).
    for term, count in idf.items():
        idf[term] = math.log(N / count)
    return idf
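# Example: with N = 6 documents, a term that occurs in 3 of them gets
# idf = log(6 / 3) ≈ 0.69, while a term appearing in all 6 gets log(1) = 0.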
def compute_tfidf(tf, idf):
    """Weight each term's frequency by its idf."""
    tfidf = {}
    for term, freq in tf.items():
        # Terms absent from the corpus (e.g. query-only terms) have no idf entry.
        tfidf[term] = freq * idf.get(term, 0)
    return tfidf
def cosine_similarity(vec1, vec2):
    """Cosine of the angle between two sparse term-weight vectors."""
    dot_product = sum(vec1.get(term, 0) * vec2.get(term, 0) for term in set(vec1) | set(vec2))
    magnitude1 = math.sqrt(sum(val ** 2 for val in vec1.values()))
    magnitude2 = math.sqrt(sum(val ** 2 for val in vec2.values()))
    if magnitude1 == 0 or magnitude2 == 0:
        return 0.0
    return dot_product / (magnitude1 * magnitude2)
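# Example: cosine_similarity({'a': 1.0, 'b': 2.0}, {'b': 2.0, 'c': 3.0}) shares only
# the 'b' component, so the result is 4 / (sqrt(5) * sqrt(13)) ≈ 0.496.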
def similarity_search(query, documents):
    """Rank documents by TF-IDF cosine similarity to the query, best first."""
    tokenized_query = tokenize(query)
    tokenized_docs = [tokenize(doc) for doc in documents]
    # IDF is learned from the document collection only; query terms that never
    # appear in any document contribute nothing to the score.
    idf = compute_idf(tokenized_docs)
    query_tf = compute_tf(tokenized_query)
    query_tfidf = compute_tfidf(query_tf, idf)
    results = []
    for i, doc in enumerate(tokenized_docs):
        doc_tf = compute_tf(doc)
        doc_tfidf = compute_tfidf(doc_tf, idf)
        similarity = cosine_similarity(query_tfidf, doc_tfidf)
        results.append((i, similarity))
    # Highest similarity first.
    results.sort(key=lambda x: x[1], reverse=True)
    return results
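# Note: each call recomputes IDF and every document's TF-IDF vector from scratch;
# for repeated queries over a fixed corpus these could be computed once and reused.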
if __name__ == '__main__':
    documents = [
        "This is the first document.",
        "This document is the second document.",
        "And this is the third one.",
        "これは最初のドキュメントです。",
        "このドキュメントは2番目のドキュメントです。",
        "そして、これは3番目のものです。"
    ]

    query = "ドキュメント"
    results = similarity_search(query, documents)
    print(f"Query: {query}")
    for index, score in results:
        print(f"Document {index}: {score}")

    query = "first document"
    results = similarity_search(query, documents)
    print(f"Query: {query}")
    for index, score in results:
        print(f"Document {index}: {score}")