Delete article

Deleted articles cannot be recovered.

The draft of this article will also be deleted.

Are you sure you want to delete this article?


Posted at

import collections
import math
import re

def tokenize(text, _pattern=re.compile(r'\w+')):
    """Split *text* into a list of word tokens.

    A token is a maximal run of ``\\w`` characters, so punctuation and
    whitespace act as separators. Case is preserved.

    Args:
        text: The string to split.
        _pattern: Pre-compiled token regex; an implementation detail that
            callers are not expected to pass.

    Returns:
        The tokens in order of appearance; an empty list when *text*
        contains no word characters.
    """
    return _pattern.findall(text)

def compute_tf(tokens):
    """Count how many times each token occurs.

    Args:
        tokens: An iterable of hashable tokens.

    Returns:
        A ``collections.Counter`` mapping each token to its raw
        (un-normalized) occurrence count.
    """
    return collections.Counter(tokens)

def compute_idf(documents):
    """Compute the inverse document frequency of every term in a corpus.

    The weight of a term is ``log(N / df)``, where ``N`` is the number of
    documents and ``df`` is the number of documents containing the term.
    A term that occurs in every document therefore gets weight 0.

    Args:
        documents: A sequence of tokenized documents (each an iterable of
            hashable tokens).

    Returns:
        A ``collections.defaultdict`` mapping each term to its IDF weight.
        An empty corpus yields an empty mapping.
    """
    total_docs = len(documents)

    # Count each term once per document, no matter how often it repeats.
    doc_freq = collections.Counter()
    for doc in documents:
        doc_freq.update(set(doc))

    # Build the weights in a separate mapping instead of overwriting the
    # counts while iterating over them.
    idf = collections.defaultdict(int)
    for term, count in doc_freq.items():
        idf[term] = math.log(total_docs / count)
    return idf

def compute_tfidf(tf, idf):
    """Weight term frequencies by inverse document frequency.

    Args:
        tf: Mapping of term -> occurrence count.
        idf: Mapping of term -> IDF weight.

    Returns:
        A dict mapping every term in *tf* to ``tf[term] * idf[term]``.
        Terms absent from *idf* are given weight 0.
    """
    # .get() is used rather than indexing so that looking up an unknown
    # term never inserts a key into a defaultdict passed as idf.
    return {term: freq * idf.get(term, 0) for term, freq in tf.items()}

def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two sparse vectors.

    Args:
        vec1: Mapping of term -> numeric weight.
        vec2: Mapping of term -> numeric weight.

    Returns:
        ``dot(vec1, vec2) / (|vec1| * |vec2|)`` as a float, or ``0.0`` when
        either vector has zero magnitude (including the empty vector), for
        which the cosine is undefined.
    """
    magnitude1 = math.sqrt(sum(val ** 2 for val in vec1.values()))
    magnitude2 = math.sqrt(sum(val ** 2 for val in vec2.values()))
    if magnitude1 == 0 or magnitude2 == 0:
        return 0.0

    # Only terms present in both vectors contribute to the dot product, so
    # walk the smaller mapping and probe the larger one.
    smaller, larger = (vec1, vec2) if len(vec1) <= len(vec2) else (vec2, vec1)
    dot_product = sum(
        weight * larger.get(term, 0) for term, weight in smaller.items()
    )
    return dot_product / (magnitude1 * magnitude2)

def similarity_search(query, documents):
    """Rank *documents* by TF-IDF cosine similarity to *query*.

    The query and every document are tokenized, converted to TF-IDF
    vectors using IDF weights computed from *documents* only, and compared
    with cosine similarity.

    Args:
        query: The search string.
        documents: A sequence of document strings.

    Returns:
        A list of ``(index, similarity)`` tuples, one per document, sorted
        by similarity in descending order. ``index`` is the document's
        position in *documents*.
    """
    tokenized_docs = [tokenize(doc) for doc in documents]
    idf = compute_idf(tokenized_docs)

    query_tfidf = compute_tfidf(compute_tf(tokenize(query)), idf)

    results = [
        (i, cosine_similarity(query_tfidf, compute_tfidf(compute_tf(doc), idf)))
        for i, doc in enumerate(tokenized_docs)
    ]
    results.sort(key=lambda item: item[1], reverse=True)
    return results

# Demo: rank a small corpus against two sample queries when run as a script.
if __name__ == '__main__':
    documents = [
        "This is the first document.",
        "This document is the second document.",
        "And this is the third one.",
    ]

    # The first query shares no token with the corpus; the second does.
    for query in ("ドキュメント", "first document"):
        results = similarity_search(query, documents)
        print(f"Query: {query}")
        for index, score in results:
            print(f"Document {index}: {score}")

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do by signing up

Delete article

Deleted articles cannot be recovered.

The draft of this article will also be deleted.

Are you sure you want to delete this article?