Overview
This post covers association analysis for large-scale transaction data: a Python implementation for tabular data with pandas and PySpark, and its visualization with NetworkX.
Def. Metrics
The following are the standard metrics in association analysis.
- Confidence - Conditional probability. "How likely is Y, given that X occurred?"
$$
\text{Confidence}(X \to Y) := \frac{|X \cap Y|}{|X|} \approx P(Y|X)
$$
- Support - Item popularity (frequency, probability). "How popular is item X?", "How popular is the combination of items (X, Y)?"
$$
\begin{align}
\text{Support}(X) &:= \frac{|X|}{|\Omega|} \approx P(X) \\
\text{Support}(X, Y) &:= \frac{|X \cap Y|}{|\Omega|} \approx P(X \cap Y) \\
\text{Support}(X, Y, Z) &:= \frac{|X \cap Y \cap Z|}{|\Omega|} \approx P(X \cap Y \cap Z)
\end{align}
$$
- Lift - How much X raises the chance of Y relative to Y's baseline popularity; rules with Lift > 1 are stronger than chance
$$
\text{Lift}(X \to Y) := \frac{\text{Confidence}(X \to Y)}{\text{Support}(Y)} = \frac{ \frac{|X \cap Y|}{|X|} }{ \frac{|Y|}{|\Omega|} }
\approx \frac{P(Y|X)}{P(Y)}
$$
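As a quick sanity check, these metrics can be computed by hand on a toy basket list; the five transactions below are made up purely for illustration.
# Toy data: 5 transactions over items A and B (hypothetical)
transactions = [{'A', 'B'}, {'A', 'B'}, {'A'}, {'B'}, {'A', 'B'}]
n = len(transactions)                              # |Omega| = 5
n_x = sum('A' in t for t in transactions)          # |X| = 4
n_y = sum('B' in t for t in transactions)          # |Y| = 4
n_xy = sum({'A', 'B'} <= t for t in transactions)  # |X ∩ Y| = 3
support_x = n_x / n            # Support(X) = 0.8
support_xy = n_xy / n          # Support(X, Y) = 0.6
confidence = n_xy / n_x        # Confidence(X -> Y) = 0.75
lift = confidence / (n_y / n)  # Lift(X -> Y) = 0.75 / 0.8 = 0.9375 (< 1: A slightly lowers the chance of B)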
Case 1: middle-scale transaction data
Derive association rules using mlxtend and pandas.
1) Basket data
import os
import numpy as np
import pandas as pd
import networkx as nx
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
import matplotlib.pyplot as plt
%matplotlib inline
# Load the one-hot basket CSV and drop the non-item columns
df = pd.read_csv('/kaggle/input/my-association-analysis/toy_association_analysis_network.csv', encoding='utf-8')
basket_df = df.fillna(0).drop(["#", "Pages"], axis=1)
basket_df
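Note: recent versions of mlxtend expect the basket DataFrame to be one-hot encoded booleans (or strictly 0/1), and fillna(0) can leave float columns behind. A minimal extra step, assuming every remaining column is a 0/1 item flag:
# Cast the 0/1 item flags to booleans, the input type apriori expects
basket_df = basket_df.astype(bool)
basket_df.head()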
2) Association rules
Run the Apriori algorithm, then extract association rules with Support > 0.1 and Lift > 1.0.
# Apriori algorithm
freq_items1 = apriori(basket_df, min_support = 0.1, use_colnames = True)
# Sort results by support-value
freq_items1 = freq_items1.sort_values('support', ascending = False)
# Check results
display(freq_items1.head(5))
display(freq_items1.tail(5))
# Num of itemsets
print(freq_items1.shape[0])
# Extract association rules
a_rules1 = association_rules(freq_items1, metric = "lift", min_threshold = 1)
# Sort rules by lift-value
a_rules1 = a_rules1.sort_values('lift', ascending = False).reset_index(drop=True)
# Type change: frozensets -> string
a_rules1['antecedents'] = a_rules1['antecedents'].apply(lambda x: ', '.join(list(x))).astype(str)
a_rules1['consequents'] = a_rules1['consequents'].apply(lambda x: ', '.join(list(x))).astype(str)
# Check results
display(a_rules1.head(10))
# Num of rules
print(a_rules1.shape[0])
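If the rule list is too dense to plot, it can help to keep only the strongest rules first. A small sketch; the 0.5 confidence cut-off is an arbitrary choice for illustration, not part of the original analysis:
# Keep only high-confidence, high-lift rules for a readable graph (thresholds are illustrative)
strong_rules = a_rules1[(a_rules1['confidence'] >= 0.5) & (a_rules1['lift'] > 1.0)]
print(strong_rules.shape[0])
display(strong_rules.head(10))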
3) Visualization
Visualize the rule graph as a network with NetworkX.
# New graph
G = nx.from_pandas_edgelist(a_rules1, 'antecedents', 'consequents', ['lift'])
# Graph layout
pos = nx.spring_layout(G)
pr = nx.pagerank(G) # pagerank
# Visualization
plt.figure(figsize=(12, 12))
nx.draw_networkx_edges(G, pos,
edge_color='silver',
width=0.3)
nx.draw_networkx_nodes(G, pos,
node_color=list(pr.values()),
node_size=[30000 * v for v in pr.values()],
cmap=plt.cm.Blues,
edgecolors="gray",
alpha=0.5)
plt.axis('off')
plt.show()
The second variant below draws the same rules as a directed graph (antecedent -> consequent), with arrows and node labels.
# New graph, directed this time so the arrows are meaningful
G = nx.from_pandas_edgelist(a_rules1, 'antecedents', 'consequents', ['lift'], create_using=nx.DiGraph())
# Graph layout: position nodes using the Fruchterman-Reingold force-directed algorithm
pos = nx.spring_layout(G)
pr = nx.pagerank(G) # pagerank
# Visualization
plt.figure(figsize=(12, 12))
nx.draw_networkx_edges(G, pos,
arrows=True,
arrowsize=30,
edge_color='silver',
width=1.0)
nx.draw_networkx_nodes(G, pos,
node_color=list(pr.values()),
node_size=[40000 * v for v in pr.values()],
cmap=plt.cm.Blues,
edgecolors="gray",
alpha=0.75)
# Node labels
nx.draw_networkx_labels(G, pos,
font_size=10)
plt.axis('off')
plt.show()
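Since each edge carries its 'lift' value as an attribute, rule strength can also be encoded visually. A sketch that scales edge width by lift (the 0.5 scaling factor is arbitrary):
# Scale edge width by the 'lift' attribute stored on each edge
plt.figure(figsize=(12, 12))
lifts = [d['lift'] for _, _, d in G.edges(data=True)]
nx.draw_networkx_edges(G, pos, edge_color='silver', width=[0.5 * l for l in lifts])
nx.draw_networkx_nodes(G, pos, node_color=list(pr.values()), node_size=[40000 * v for v in pr.values()], cmap=plt.cm.Blues, edgecolors='gray', alpha=0.75)
nx.draw_networkx_labels(G, pos, font_size=10)
plt.axis('off')
plt.show()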
Case 2: large-scale transaction data
Derive association rules using PySpark (FP-Growth).
1) Basket data
import pyspark.sql.functions as fn
from pyspark.ml.fpm import FPGrowth
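The snippets below assume an active SparkSession bound to the name spark; environments such as Databricks or the pyspark shell provide one automatically, otherwise it can be created explicitly:
from pyspark.sql import SparkSession
# Create (or reuse) a SparkSession; the app name is arbitrary
spark = SparkSession.builder.appName('association_analysis').getOrCreate()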
def make_basket_data(spark, input_sdf, customer_id_column, items_col_name, flg_columns_list):
    # Melt the 0/1 flag columns into one (customer, item) row per used item
    for idx, flg_column in enumerate(flg_columns_list):
        temp_sdf = input_sdf.withColumn(
            'customer_behavior',
            fn.when(fn.col(flg_column) == 1, fn.lit(flg_column.replace('_flg', '')))
        ).select(customer_id_column, 'customer_behavior')
        if idx == 0:
            res_sdf = temp_sdf
        else:
            res_sdf = res_sdf.union(temp_sdf)
    # collect_list drops the nulls left by when() without otherwise();
    # alias with items_col_name so the column matches FPGrowth's itemsCol
    basket_data = res_sdf.groupBy(customer_id_column).agg(
        fn.collect_list('customer_behavior').alias(items_col_name))
    return basket_data
def fit_association_rules(spark, basket_data, items_col_name='item_collection', min_supp=0.006, min_conf=0.006):
    # Build an FP-Growth model and fit it to the basket data
    fpGrowth = FPGrowth(itemsCol=items_col_name, minSupport=min_supp, minConfidence=min_conf)
    model = fpGrowth.fit(basket_data)
    return model
flg_columns_list = [
'item_1_use_flg',
'item_2_use_flg',
'item_3_use_flg',
...
'item_N_use_flg',
]
# make basket data
basket_data = make_basket_data(spark, input_sdf, customer_id_column='customer_id', items_col_name='item_collection', flg_columns_list=flg_columns_list)
basket_data.show()
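For reference, a minimal toy input_sdf with the expected layout (hypothetical data, three flag columns instead of N):
# Hypothetical toy input: one row per customer, one 0/1 flag per item
input_sdf = spark.createDataFrame(
    [('c001', 1, 0, 1),
     ('c002', 1, 1, 0),
     ('c003', 0, 1, 1)],
    ['customer_id', 'item_1_use_flg', 'item_2_use_flg', 'item_3_use_flg'])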
2) Association rules
Run the FP-Growth algorithm, Spark's scalable alternative to Apriori.
# FPGrowth algorithm; find association rules
model = fit_association_rules(spark, basket_data, items_col_name='item_collection', min_supp=0.006, min_conf=0.006)
# Frequent item sets (freqItemsets is a DataFrame property, not a method)
freq_itemset_sdf = model.freqItemsets
freq_itemset_sdf.show()
# Association Rules
association_rules_sdf = model.associationRules
association_rules_sdf.show(truncate=False)
3) Visualization
Same as in Case 1: pull the Spark rules into pandas and reuse the NetworkX code, as sketched below.
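A sketch of the hand-off, assuming the rule set is small enough to collect to the driver (Spark's associationRules table exposes antecedent and consequent as arrays, plus confidence and lift columns):
# Collect the rule table to the driver and flatten the item arrays into strings
a_rules2 = association_rules_sdf.toPandas()
a_rules2['antecedents'] = a_rules2['antecedent'].apply(lambda x: ', '.join(x))
a_rules2['consequents'] = a_rules2['consequent'].apply(lambda x: ', '.join(x))
# From here the NetworkX code from Case 1 applies unchanged
G = nx.from_pandas_edgelist(a_rules2, 'antecedents', 'consequents', ['lift'], create_using=nx.DiGraph())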