# More than 1 year has passed since last update.
# ClCmal_base
#
# Python
# Posted at 2024-01-17
# ClCmal_base.py

# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# List every file found under the Kaggle input directory.
# The root is assembled from os.sep and os.path.join rather than written as a
# backslash literal: the sequences "\k" and "\i" are invalid string escapes
# (a SyntaxWarning on recent Python versions), and a hard-coded backslash path
# only resolves on Windows. The joined form yields the same string on Windows
# and "/kaggle/input" on POSIX systems.
input_root = os.path.join(os.sep, 'kaggle', 'input')
for dirname, _, filenames in os.walk(input_root):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load the Obfuscated-MalMem2022 parquet file and show the first rows.
# Alternative location used by the commented-out original:
# df = pd.read_parquet('../input/cicmalmem2022/Obfuscated-MalMem2022.parquet')
#
# The local path is built with os.path.join so that it contains no backslash
# escapes ("\i" and "\O" are invalid escape sequences) and uses the correct
# separator on every platform; on Windows it is identical to the old literal.
df = pd.read_parquet(os.path.join('.', 'input', 'Obfuscated-MalMem2022.parquet'))
print(df.head(5))

# Target used for the binary task.
dep = 'Class'

# Columns that are labels rather than features.
# A wider label set was considered earlier:
# deps = ['Category', 'Class', 'MalwareType', 'MalwareFamily']
deps = ['Category', 'Class']
cats = []

# Every remaining column is treated as a continuous feature.
conts = df.columns.difference(cats + deps)

print(df[dep].value_counts())

# Drop the label columns other than the binary target.
other_labels = [label for label in deps if label != 'Class']
binary_target_df = df.drop(columns=other_labels)

# Replace the class labels with their integer category codes.
# (Calling .cat.codes directly would require the column to already be categorical.)
class_codes = binary_target_df[dep].astype('category').cat.codes
binary_target_df[dep] = class_codes
print(binary_target_df[dep].value_counts())

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, precision_score, recall_score, f1_score

# Seed NumPy's global random state before splitting; no explicit
# random_state is passed to train_test_split below.
np.random.seed(42)

# 90% of the rows are held out for testing, leaving 10% for training.
split_frames = train_test_split(binary_target_df, test_size=0.90)
trn_bin_df, tst_bin_df = split_frames

def xs_y(df_, targ):
    """Split a DataFrame into feature columns and target column(s).

    Args:
        df_: DataFrame containing both the features and the target column(s).
        targ: A single column label, or a list of column labels, to treat as
            the target.

    Returns:
        A tuple ``(xs, y)``. ``xs`` is a copy of every column of ``df_`` not
        named in ``targ``, in the order produced by ``Index.difference``.
        ``y`` is a copy of ``df_[targ]``.
    """
    # Normalise a single label to a one-element list for Index.difference.
    excluded = targ if isinstance(targ, list) else [targ]
    feature_cols = df_.columns.difference(excluded)
    return df_[feature_cols].copy(), df_[targ].copy()

# Separate features and target for the train and test partitions.
X_train, y_train = xs_y(trn_bin_df, dep)
X_test, y_test = xs_y(tst_bin_df, dep)

# Baseline 1: random forest.
rf = RandomForestClassifier(n_estimators=100, min_samples_leaf=50, criterion='gini', max_samples=0.75, max_features='sqrt', n_jobs=-1)
rf.fit(X_train, y_train)

# roc_auc_score ranks samples by a continuous score, so it is given the
# predicted probability of the positive class (column 1 of predict_proba)
# instead of the already-thresholded labels returned by predict().
# The printed value is 1 - ROC-AUC (lower is better).
err_rate = 1 - roc_auc_score(y_true=y_test, y_score=rf.predict_proba(X_test)[:, 1])
print(err_rate)

# Baseline 2: extremely randomized trees with the same hyper-parameters;
# bootstrap=True is set explicitly alongside max_samples.
ex = ExtraTreesClassifier(n_estimators=100, min_samples_leaf=50, criterion='gini', max_samples=0.75, max_features='sqrt', bootstrap=True, n_jobs=-1)
ex.fit(X_train, y_train)

# Same metric as above; note that err_rate is overwritten.
err_rate = 1 - roc_auc_score(y_true=y_test, y_score=ex.predict_proba(X_test)[:, 1])
print(err_rate)

# P2: Even simpler model: one root-node decision tree per feature

from sklearn.tree import DecisionTreeClassifier
from fastcore.basics import *
from fastcore.parallel import *
from os import cpu_count

def evaluate_one_feature(feature, index='', metric=roc_auc_score):
    """Fit a depth-1 decision tree on one feature and score it on the test split.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.

    Args:
        feature: Name of the column in ``X_train`` / ``X_test`` to evaluate.
        index: Unused; kept so existing callers keep working.
        metric: Callable ``metric(y_true, y_pred)`` returning a number.
            Defaults to ``roc_auc_score``.

    Returns:
        ``[feature, score, model, test_predictions, train_predictions]`` when
        the rounded score is above 0.5, otherwise
        ``[feature, score, None, [], []]``.
    """
    # scikit-learn expects 2-D input. .to_numpy() always yields a NumPy
    # ndarray, whereas .array yields a pandas ExtensionArray wrapper that is
    # not guaranteed to support 2-D reshaping. Each column is converted once.
    train_col = X_train[feature].to_numpy().reshape(-1, 1)
    test_col = X_test[feature].to_numpy().reshape(-1, 1)

    rootnode = DecisionTreeClassifier(max_depth=1, criterion='gini')
    rootnode.fit(train_col, y_train)
    preds = rootnode.predict(test_col)
    preds_tr = rootnode.predict(train_col)

    met = round(metric(y_test, preds), 4)
    if met > 0.5:
        return [feature, met, rootnode, preds, preds_tr]
    # At or below 0.5: keep the score but drop the model and its predictions.
    return [feature, met, None, [], []]
    

# Parallel variant (fastcore), currently disabled:
# results = parallel(f=evaluate_one_feature,
#                   items=conts, n_workers=cpu_count(), threadpool=False, progress=True)

# Evaluate each feature in turn and collect one result row per feature.
results = [evaluate_one_feature(item) for item in conts]

# Tabulate the rows and rank the features by their score, best first.
result_columns = ['feature', 'roc_auc_score', 'fitted_models', 'predictions', 'preds_train']
result_df = pd.DataFrame(data=results, columns=result_columns)
result_df = result_df.sort_values(by='roc_auc_score', ascending=False)

print(result_df[['feature', 'roc_auc_score']].head(15))

import seaborn as sns
import matplotlib.pyplot as plt

# Put the training features and the target side by side for plotting.
# The whole training split is used: restricting it to the first nine rows
# (as before) left the 100-bin histograms with almost nothing to show.
data_for_plot = pd.concat(objs=[X_train, y_train], axis=1, copy=False, sort=False)

# One histogram per feature for the nine best-scoring features, split by class.
fig, axes = plt.subplots(3, 3, figsize=(12, 12))
axes = axes.flatten()
for ax, tf in zip(axes, result_df['feature'].head(9)):
    sns.histplot(data=data_for_plot, x=tf, stat='percent', hue='Class', bins=100, ax=ax)

plt.show()

# Conclusion: It is not likely that CIC-MalMem2022 is an effective dataset to test ML-based obfuscated malware detection methods.

# Outro: P3 Ensemble One-Rule Model

# Keep only the features whose single-split tree scored above 0.5.
useful_features = result_df[result_df['roc_auc_score'] > 0.5]
print(f"{len(useful_features)} / {len(conts)} features have direct separating power (linear)")

# Average the per-feature predictions on the test split: one row per
# feature, one column per sample, mean taken over the features.
test_votes = np.vstack(useful_features['predictions'].to_numpy())
ensemble_preds = test_votes.mean(axis=0)
print(ensemble_preds.shape)

# Same averaging on the training split; used only to choose the threshold.
train_votes = np.vstack(useful_features['preds_train'].to_numpy())
ensemble_preds_train = train_votes.mean(axis=0)
print(ensemble_preds_train.shape)

# Choose the threshold that maximises TPR - FPR on the training split.
fpr, tpr, thresholds = roc_curve(y_train, ensemble_preds_train)
J = tpr - fpr
ix = np.argmax(J)
best_thresh = thresholds[ix]
print("Best threshold", best_thresh)

# Threshold the averaged test predictions once and reuse the labels below.
ensemble_labels = np.where(ensemble_preds >= best_thresh, 1, 0)

print("The Ensemble OneR model (simple average)")
print("ROC-AUC", round(roc_auc_score(y_true=y_test, y_score=ensemble_preds), 4))
print("Precision", round(precision_score(y_true=y_test, y_pred=ensemble_labels), 4))
print("Recall", round(recall_score(y_true=y_test, y_pred=ensemble_labels), 4))
print("F1", round(f1_score(y_true=y_test, y_pred=ensemble_labels), 4))
# You get articles that match your needs
# You can efficiently read back useful information
# You can use dark theme
# What you can do with signing up