This is a use case where I am dealing with hundreds or thousands of text messages that I want to categorize into clusters.
I can achieve this with the help of scikit-learn classes such as TfidfVectorizer and NMF.
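Before the full implementation, here is a minimal sketch of the idea on a toy corpus (the messages and the n_components value are made up for illustration): vectorize the messages with TF-IDF, factorize the sparse matrix with NMF, normalize the resulting features, and read cosine similarities off dot products.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize

toy_messages = [
    "disk usage at 91 percent on host alpha",
    "disk usage at 97 percent on host beta",
    "login failed for user admin",
]
vec = TfidfVectorizer()
csr = vec.fit_transform(toy_messages)  # sparse TF-IDF matrix, one row per message
nmf_features = NMF(n_components=2).fit_transform(csr)  # dense latent features
norm_features = normalize(nmf_features)  # unit-length rows
print(norm_features.dot(norm_features[0, :]))  # cosine similarities to the first message

The full implementation below follows the same steps and additionally derives a human-readable cluster label for each group of similar messages.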
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize
import logging
# ConfigDict is a dict-like container from an in-house package; a plain dict would work as well
from gpwrap.utils.configdict.configDict import ConfigDict
def get_sorted_non_zero_words_and_freqs_from_csr_mat_row(csr_matrix_row, tfidf_features):
    """
    The csr_matrix that TfidfVectorizer produces when transforming messages into TF-IDF weights
    is zero across most columns; each row has non-zero values only in the columns whose token
    actually occurs in that message. We extract those non-zero columns as (token, weight) pairs
    and sort them by weight in ascending order.
    :param csr_matrix_row: a single row of the TF-IDF csr_matrix
    :param tfidf_features: the feature (token) names from the fitted vectorizer
    :return: list of (token, weight) tuples sorted by weight, ascending
    """
    # iterate over the non-zero columns only, instead of scanning every feature
    _, nonzero_cols = csr_matrix_row.nonzero()
    items = [(tfidf_features[idx], csr_matrix_row[0, idx]) for idx in nonzero_cols]
    return sorted(items, key=lambda item: item[-1])
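# For example (a hypothetical two-message corpus; weights are approximate):
#   tfidf = TfidfVectorizer()
#   mat = tfidf.fit_transform(["error on node one", "error on node two"])
#   get_sorted_non_zero_words_and_freqs_from_csr_mat_row(mat[0], tfidf.get_feature_names_out())
#   -> [('error', 0.45), ('node', 0.45), ('on', 0.45), ('one', 0.63)]
# The token unique to this message ("one") gets the highest weight, since it is the rarest.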
def get_common_part_of_message_across_documents(message, tokenizer, csr_matrix_row, tfidf_features, throw_off_thresh=1):
    """
    We are assuming a use case with lots of similar messages that differ only by one or two words.
    This function returns the part those similar messages have in common, by discarding the
    throw_off_thresh tokens with the highest TF-IDF weights: the tokens that distinguish
    otherwise-similar messages are rare across the corpus, so they score highest.
    :param message: the raw message text
    :param tokenizer: the callable returned by TfidfVectorizer.build_tokenizer()
    :param csr_matrix_row: the TF-IDF row corresponding to this message
    :param tfidf_features: the feature (token) names from the fitted vectorizer
    :param throw_off_thresh: how many of the highest-weighted tokens to discard
    :return: the message with its most distinguishing tokens removed
    """
    tokens = tokenizer(message)
    # already sorted by weight in ascending order
    words_and_freqs = get_sorted_non_zero_words_and_freqs_from_csr_mat_row(csr_matrix_row, tfidf_features)
    # drop the highest-weighted tokens; guard against throw_off_thresh=0, where [:-0] would empty the list
    if throw_off_thresh > 0:
        words_and_freqs = words_and_freqs[:-throw_off_thresh]
    words = {word for (word, _freq) in words_and_freqs}
    # the vectorizer lowercases by default, so compare lowercased tokens
    return " ".join(token for token in tokens if token.lower() in words)
def get_message_clusters(messages, n_components=7):
"""
We are trying to categorize a list of messages into clusters
To achieve this we first convert all messages to word frequency sparce matrix via TfidfVectorizer
Then we reduce csr_matrix dimension to n_components principal components with NMF (Non-negative Factorizing Model)
Finally we compute similarities between messages taking dot products and assign a cluster label to each message based on that calculation
:param messages:
:param n_components:
:return:
"""
    # fit TfidfVectorizer on our messages and transform them into a sparse TF-IDF matrix
tfidf = TfidfVectorizer()
csr_mat = tfidf.fit_transform(messages)
    # build the vectorizer's tokenizer for later use
tokenizer = tfidf.build_tokenizer()
logging.info(f"sparce matrix shape : {csr_mat.shape}")
# tfidf features (tokens) across all messages
tfidf_features = tfidf.get_feature_names()
logging.info(f"there are total of {len(tfidf_features)} tfidf features for specified messages")
    # NMF to reduce the csr_matrix dimensionality to n_components latent components
nmf = NMF(n_components=n_components)
nmf_features = nmf.fit_transform(csr_mat)
# normalize nmf features
norm_nmf_features = normalize(nmf_features)
    # initialize a dictionary to hold messages and their mapped cluster
clustered_messages = {}
for idx, message in enumerate(messages):
if message not in clustered_messages:
            essential_part = get_common_part_of_message_across_documents(message, tokenizer, csr_mat[idx], tfidf_features, 1)
            # rows are L2-normalized, so these dot products are cosine similarities
            similarities = norm_nmf_features.dot(norm_nmf_features[idx, :])
            for msg_idx, similarity in enumerate(similarities):
                if similarity > 0.9:
                    clustered_messages[messages[msg_idx]] = essential_part
logging.info(f"total of {len(clustered_messages)} distinct messages were mapped to {len(set(clustered_messages.values()))} distinct clusters")
    return ConfigDict({'clustered_messages': clustered_messages, 'tfidf': tfidf, 'tfidf_features': tfidf_features, 'tokenizer': tokenizer, 'nmf_features': nmf_features})
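Example usage, as a minimal sketch (the sample messages are made up for illustration, and dict-style access to the returned ConfigDict is assumed):

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    sample_messages = [
        "disk usage at 91 percent on host alpha",
        "disk usage at 97 percent on host beta",
        "disk usage at 99 percent on host gamma",
        "login failed for user admin",
        "login failed for user guest",
        "backup completed successfully",
        "backup completed with warnings",
    ]
    result = get_message_clusters(sample_messages, n_components=3)
    # assuming ConfigDict supports dict-style access like the dict it wraps
    for message, cluster in result['clustered_messages'].items():
        print(f"{cluster} <- {message}")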