CICMal_base.py
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
# df = pd.read_parquet('../input/cicmalmem2022/Obfuscated-MalMem2022.parquet')
df = pd.read_parquet('./input/Obfuscated-MalMem2022.parquet')
print(df.head(5))
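# Added sanity check (not in the original notebook): confirm the row/column
# counts and that the feature columns are numeric before modelling.
print(df.shape)
print(df.dtypes.value_counts())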
# deps = ['Category', 'Class', 'MalwareType', 'MalwareFamily']
deps = ['Category', 'Class']
cats = []
conts = df.columns.difference(cats+deps)
dep = 'Class'
print(df.Class.value_counts())
binary_target_df = df.drop(columns=[d for d in deps if d != 'Class'])
#binary_target_df[dep] = binary_target_df[dep].cat.codes
binary_target_df[dep] = binary_target_df[dep].astype('category').cat.codes
print(binary_target_df.Class.value_counts())
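# Added for clarity: .cat.codes assigns integer codes in the categorical's
# sorted order, so this mapping shows which label became 1 (presumably
# 'Benign' -> 0 and 'Malware' -> 1 for this dataset).
code_map = dict(enumerate(df[dep].astype('category').cat.categories))
print(code_map)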
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, precision_score, recall_score, f1_score
np.random.seed(42)
trn_bin_df,tst_bin_df = train_test_split(binary_target_df, test_size=0.90)
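# Added check: test_size=0.90 trains on only ~10% of the rows, presumably on
# purpose: if the task is still easy with so little data, the dataset offers
# little headroom. Verify the split sizes and class balance.
print(trn_bin_df.shape, tst_bin_df.shape)
print(trn_bin_df[dep].value_counts(normalize=True))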
def xs_y(df_, targ):
    # split a dataframe into features (everything except targ) and target
    if not isinstance(targ, list):
        xs = df_[df_.columns.difference([targ])].copy()
    else:
        xs = df_[df_.columns.difference(targ)].copy()
    y = df_[targ].copy()
    return xs, y
X_train, y_train = xs_y(trn_bin_df, dep)
X_test, y_test = xs_y(tst_bin_df, dep)
rf = RandomForestClassifier(n_estimators=100, min_samples_leaf=50, criterion='gini', max_samples=0.75, max_features='sqrt', n_jobs=-1)
rf.fit(X_train, y_train)
# score with predicted probabilities; hard labels from .predict() would collapse the ROC curve to a single operating point
err_rate = 1 - roc_auc_score(y_true=y_test, y_score=rf.predict_proba(X_test)[:, 1])
print(err_rate)
ex = ExtraTreesClassifier(n_estimators=100, min_samples_leaf=50, criterion='gini', max_samples=0.75, max_features='sqrt', bootstrap=True, n_jobs=-1)
ex.fit(X_train, y_train)
err_rate = 1 - roc_auc_score(y_true=y_test, y_score=ex.predict_proba(X_test)[:, 1])
print(err_rate)
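# Optional addition: impurity-based feature importances from the fitted forest
# hint at which columns the model leans on, foreshadowing the per-feature
# analysis in P2 below.
importances = pd.Series(rf.feature_importances_, index=X_train.columns)
print(importances.sort_values(ascending=False).head(10))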
# P2: Even simpler model: one root-node decision tree per feature
from sklearn.tree import DecisionTreeClassifier
from fastcore.basics import *    # fastcore imports are only needed for the commented-out parallel run below
from fastcore.parallel import *
from os import cpu_count
def evaluate_one_feature(feature, metric=roc_auc_score):
    # fit a depth-1 decision tree (a OneR "stump") on a single feature
    rootnode = DecisionTreeClassifier(max_depth=1, criterion='gini')
    rootnode.fit(X_train[feature].to_numpy().reshape(-1, 1), y_train)
    preds = rootnode.predict(X_test[feature].to_numpy().reshape(-1, 1))
    preds_tr = rootnode.predict(X_train[feature].to_numpy().reshape(-1, 1))
    met = round(metric(y_test, preds), 4)
    # keep the fitted stump only if it beats chance-level AUC on the test set
    if met > 0.5:
        return [feature, met, rootnode, preds, preds_tr]
    else:
        return [feature, met, None, [], []]
# results = parallel(f=evaluate_one_feature,
# items=conts, n_workers=cpu_count(), threadpool=False, progress=True)
# list to collect per-feature results
results = []
# evaluate each feature in a plain loop (the parallel variant above is optional)
for item in conts:
    result = evaluate_one_feature(item)
    results.append(result)
result_df = pd.DataFrame(data=results, columns=['feature', 'roc_auc_score', 'fitted_models', 'predictions', 'preds_train']).sort_values(by='roc_auc_score', ascending=False)
print(result_df[['feature', 'roc_auc_score']].head(15))
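# Illustrative addition: a depth-1 tree is literally a one-rule classifier.
# sklearn's export_text prints the single learned threshold for the
# top-ranked feature's stump.
from sklearn.tree import export_text
best = result_df.iloc[0]
print(export_text(best['fitted_models'], feature_names=[best['feature']]))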
import seaborn as sns
import matplotlib.pyplot as plt
# use the full training set for the histograms (the 9 refers to the 3x3 grid of top features, not rows)
data_for_plot = pd.concat(objs=[X_train, y_train], axis=1, copy=False, sort=False)
fig, axes = plt.subplots(3,3, figsize=(12,12))
axes = axes.flatten()
for i, tf in enumerate(result_df['feature'].head(9)):
    sns.histplot(data=data_for_plot, x=tf, stat='percent', hue='Class', bins=100, ax=axes[i])
plt.show()
# Conclusion: CIC-MalMem2022 is unlikely to be an effective benchmark for ML-based obfuscated malware detection: single-feature threshold rules already separate the classes.
# Outro: P3 Ensemble One-Rule Model
useful_features = result_df.loc[result_df['roc_auc_score'] > 0.5]
print(f"{len(useful_features)} / {len(conts)} features have direct separating power (linear)")
ensemble_preds = np.mean(np.vstack(useful_features['predictions'].to_numpy()), axis=0)
print(ensemble_preds.shape)
ensemble_preds_train = np.mean(np.vstack(useful_features['preds_train'].to_numpy()), axis=0)
print(ensemble_preds_train.shape)
fpr, tpr, thresholds = roc_curve(y_train, ensemble_preds_train)
# pick the threshold maximising Youden's J statistic (J = tpr - fpr) on the training set
J = tpr - fpr
ix = np.argmax(J)
best_thresh = thresholds[ix]
print("Best threshold", best_thresh)
print("The Ensemble OneR model (simple average)")
print("ROC-AUC", round(roc_auc_score(y_true=y_test, y_score=ensemble_preds),4))
print("Precision", round(precision_score(y_true=y_test, y_pred=np.where(ensemble_preds >= best_thresh, 1, 0)), 4))
print("Recall", round(recall_score(y_true=y_test, y_pred=np.where(ensemble_preds >= best_thresh, 1, 0)), 4))
print("F1", round(f1_score(y_true=y_test, y_pred=np.where(ensemble_preds >= best_thresh, 1, 0)), 4))