This is a use case where I am dealing with hundreds or thousands of text messages that I want to categorize into clusters.
I can achieve this with the help of scikit-learn classes such as TfidfVectorizer and NMF.
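Before the full implementation, here is a minimal sketch of the idea on a toy corpus (the messages and the n_components value are made up for illustration): vectorize the messages with TF-IDF, factorize the sparse matrix with NMF, normalize the resulting features, and read cosine similarities off dot products.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize

toy_messages = [
    "disk usage at 91 percent on host alpha",
    "disk usage at 97 percent on host beta",
    "login failed for user admin",
]
vec = TfidfVectorizer()
csr = vec.fit_transform(toy_messages)  # sparse TF-IDF matrix, one row per message
nmf_features = NMF(n_components=2).fit_transform(csr)  # dense latent features
norm_features = normalize(nmf_features)  # unit-length rows
print(norm_features.dot(norm_features[0, :]))  # cosine similarities to the first message

The full implementation below follows the same steps and additionally derives a human-readable cluster label for each group of similar messages.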
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize
import logging
# ConfigDict is a dict-like container from an in-house package; a plain dict would work as well
from gpwrap.utils.configdict.configDict import ConfigDict
def get_sorted_non_zero_words_and_freqs_from_csr_mat_row(csr_matrix_row, tfidf_features):
    """
    The csr_matrix that TfidfVectorizer produces when transforming messages into TF-IDF weights
    is zero across most columns; each row has non-zero values only in the columns whose token
    actually occurs in that message. We extract those non-zero columns as (token, weight) pairs
    and sort them by weight in ascending order.
    :param csr_matrix_row: a single row of the TF-IDF csr_matrix
    :param tfidf_features: the feature (token) names from the fitted vectorizer
    :return: list of (token, weight) tuples sorted by weight, ascending
    """
    # iterate over the non-zero columns only, instead of scanning every feature
    _, nonzero_cols = csr_matrix_row.nonzero()
    items = [(tfidf_features[idx], csr_matrix_row[0, idx]) for idx in nonzero_cols]
    return sorted(items, key=lambda item: item[-1])
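# For example (a hypothetical two-message corpus; weights are approximate):
#   tfidf = TfidfVectorizer()
#   mat = tfidf.fit_transform(["error on node one", "error on node two"])
#   get_sorted_non_zero_words_and_freqs_from_csr_mat_row(mat[0], tfidf.get_feature_names_out())
#   -> [('error', 0.45), ('node', 0.45), ('on', 0.45), ('one', 0.63)]
# The token unique to this message ("one") gets the highest weight, since it is the rarest.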
def get_common_part_of_message_across_documents(message, tokenizer, csr_matrix_row, tfidf_features, throw_off_thresh=1):
    """
    We are assuming a use case with lots of similar messages that differ only by one or two words.
    This function returns the part those similar messages have in common, by discarding the
    throw_off_thresh tokens with the highest TF-IDF weights: the tokens that distinguish
    otherwise-similar messages are rare across the corpus, so they score highest.
    :param message: the raw message text
    :param tokenizer: the callable returned by TfidfVectorizer.build_tokenizer()
    :param csr_matrix_row: the TF-IDF row corresponding to this message
    :param tfidf_features: the feature (token) names from the fitted vectorizer
    :param throw_off_thresh: how many of the highest-weighted tokens to discard
    :return: the message with its most distinguishing tokens removed
    """
    tokens = tokenizer(message)
    # already sorted by weight in ascending order
    words_and_freqs = get_sorted_non_zero_words_and_freqs_from_csr_mat_row(csr_matrix_row, tfidf_features)
    # drop the highest-weighted tokens; guard against throw_off_thresh=0, where [:-0] would empty the list
    if throw_off_thresh > 0:
        words_and_freqs = words_and_freqs[:-throw_off_thresh]
    words = {word for (word, _freq) in words_and_freqs}
    # the vectorizer lowercases by default, so compare lowercased tokens
    return " ".join(token for token in tokens if token.lower() in words)
def get_message_clusters(messages, n_components=7):
"""
We are trying to categorize a list of messages into clusters
To achieve this we first convert all messages to word frequency sparce matrix via TfidfVectorizer
Then we reduce csr_matrix dimension to n_components principal components with NMF (Non-negative Factorizing Model)
Finally we compute similarities between messages taking dot products and assign a cluster label to each message based on that calculation
:param messages:
:param n_components:
:return:
"""
    # fit TfidfVectorizer on our messages and transform them into a sparse TF-IDF matrix
tfidf = TfidfVectorizer()
csr_mat = tfidf.fit_transform(messages)
    # build the vectorizer's tokenizer for later use
tokenizer = tfidf.build_tokenizer()
logging.info(f"sparce matrix shape : {csr_mat.shape}")
# tfidf features (tokens) across all messages
tfidf_features = tfidf.get_feature_names()
logging.info(f"there are total of {len(tfidf_features)} tfidf features for specified messages")
    # NMF to reduce the csr_matrix dimensionality to n_components latent components
nmf = NMF(n_components=n_components)
nmf_features = nmf.fit_transform(csr_mat)
# normalize nmf features
norm_nmf_features = normalize(nmf_features)
    # initialize a dictionary to hold messages and their mapped cluster
clustered_messages = {}
for idx, message in enumerate(messages):
if message not in clustered_messages:
            essential_part = get_common_part_of_message_across_documents(message, tokenizer, csr_mat[idx], tfidf_features, 1)
            # rows are L2-normalized, so these dot products are cosine similarities
            similarities = norm_nmf_features.dot(norm_nmf_features[idx, :])
            for msg_idx, similarity in enumerate(similarities):
                if similarity > 0.9:
                    clustered_messages[messages[msg_idx]] = essential_part
logging.info(f"total of {len(clustered_messages)} distinct messages were mapped to {len(set(clustered_messages.values()))} distinct clusters")
    return ConfigDict({'clustered_messages': clustered_messages, 'tfidf': tfidf, 'tfidf_features': tfidf_features, 'tokenizer': tokenizer, 'nmf_features': nmf_features})
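Example usage, as a minimal sketch (the sample messages are made up for illustration, and dict-style access to the returned ConfigDict is assumed):

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    sample_messages = [
        "disk usage at 91 percent on host alpha",
        "disk usage at 97 percent on host beta",
        "disk usage at 99 percent on host gamma",
        "login failed for user admin",
        "login failed for user guest",
        "backup completed successfully",
        "backup completed with warnings",
    ]
    result = get_message_clusters(sample_messages, n_components=3)
    # assuming ConfigDict supports dict-style access like the dict it wraps
    for message, cluster in result['clustered_messages'].items():
        print(f"{cluster} <- {message}")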