More than 1 year has passed since last update.

20230827_SEOTO

Malware

Last updated at 2023-08-28 · Posted at 2023-08-27

のための機械学習
https://github.com/oreilly-japan/ml-security-jp

listing-7-7.py

# Toy example: fit a decision tree on two hand-made binary features and
# render the fitted tree to classifier.png with Graphviz.
import os

from sklearn import tree
from sklearn.feature_extraction import DictVectorizer

# declare toy training data: one feature dictionary per sample, with the
# label for each sample at the same position in ground_truth
training_examples = [
    {'packed': 1, 'contains_encrypted': 0},
    {'packed': 0, 'contains_encrypted': 0},
    {'packed': 1, 'contains_encrypted': 1},
    {'packed': 1, 'contains_encrypted': 0},
    {'packed': 0, 'contains_encrypted': 1},
    {'packed': 1, 'contains_encrypted': 0},
    {'packed': 0, 'contains_encrypted': 0},
    {'packed': 0, 'contains_encrypted': 0},
]
ground_truth = [1, 1, 1, 1, 0, 0, 0, 0]

# initialize the decision tree classifier and the vectorizer
classifier = tree.DecisionTreeClassifier()
vectorizer = DictVectorizer(sparse=False)

# learn the feature names from the training data, then turn the
# training dictionaries into vector form
X = vectorizer.fit(training_examples).transform(training_examples)
y = ground_truth  # call ground truth 'y', by convention

# train the classifier (a.k.a. 'fit' the classifier)
classifier.fit(X, y)

# classify one unseen example
test_example = {'packed': 1, 'contains_encrypted': 0}
test_vector = vectorizer.transform(test_example)
print(classifier.predict(test_vector))  # prints [1]

# visualize the decision tree: write it in Graphviz format first ...
with open("classifier.dot", "w") as dot_file:
    tree.export_graphviz(
        classifier,
        feature_names=vectorizer.get_feature_names_out(),
        out_file=dot_file,
    )

# ... then ask the Graphviz `dot` command to convert it to a PNG image
os.system("dot classifier.dot -Tpng -o classifier.png")

complete_detector.py

#!/usr/bin/python



import os

import sys

import pickle

import argparse

import re

import numpy

from sklearn.ensemble import RandomForestClassifier

from sklearn.feature_extraction import FeatureHasher

import codecs



def get_string_features(path,hasher):
    """Extract printable strings from a file and hash them into a feature vector.

    Args:
        path: Path of the file to read.
        hasher: Object whose ``transform`` method accepts a list of feature
            dictionaries and returns a matrix exposing ``todense()``
            (this script passes a scikit-learn ``FeatureHasher``).

    Returns:
        A one-dimensional numpy array: the single row of the hashed matrix.
    """
    # a "string" is a run of at least min_length characters in the
    # printable ASCII range (space through tilde)
    chars = r" -~"
    min_length = 5
    pattern = re.compile('[%s]{%d,}' % (chars, min_length))

    # read the file as UTF-8 text, dropping undecodable bytes; the context
    # manager closes the handle even if reading fails
    with codecs.open(path, 'r', 'utf-8', 'ignore') as file_object:
        data = file_object.read()

    # store string features in dictionary form: every distinct string
    # becomes a feature with value 1
    string_features = dict.fromkeys(pattern.findall(data), 1)

    # hash the features using the hashing trick
    hashed_features = hasher.transform([string_features])

    # do some data munging to get the feature array: densify, convert to a
    # plain ndarray and keep the only row
    hashed_features = numpy.asarray(hashed_features.todense())[0]

    # return hashed string features
    print("Extracted {0} strings from {1}".format(len(string_features),path))
    return hashed_features



def scan_file(path):
    """Scan a file with the saved detector and print whether it looks malicious.

    Exits the process with status 1 when ``saved_detector.pkl`` does not
    exist in the current working directory.

    Args:
        path: Path of the file to scan.
    """
    if not os.path.exists("saved_detector.pkl"):
        print("It appears you haven't trained a detector yet!  Do this before scanning files.")
        sys.exit(1)

    # pickle data is bytes, so the file has to be opened in binary mode
    # NOTE(security): pickle.load can run arbitrary code while loading;
    # only load a saved_detector.pkl that you produced yourself.
    with open("saved_detector.pkl", "rb") as saved_detector:
        classifier, hasher = pickle.load(saved_detector)

    features = get_string_features(path,hasher)

    # keep column 1 of the probability matrix; train_detector labels
    # malware samples 1, so this is taken to be the malware probability
    result_proba = classifier.predict_proba([features])[:,1]

    if result_proba > 0.5:
        print("It appears this file is malicious!", result_proba)
    else:
        print("It appears this file is benign.", result_proba)



def train_detector(benign_path,malicious_path,hasher):
    """Train a random forest on the given samples and save it to disk.

    The fitted classifier and the hasher are pickled together into
    ``saved_detector.pkl`` in the current working directory.

    Args:
        benign_path: Directory whose entries are benign training files.
        malicious_path: Directory whose entries are malicious training files.
        hasher: Feature hasher forwarded to ``get_string_features``.
    """
    def get_training_paths(directory):
        # full path of every entry directly inside `directory`
        return [os.path.join(directory, name) for name in os.listdir(directory)]

    malicious_paths = get_training_paths(malicious_path)
    benign_paths = get_training_paths(benign_path)

    # malicious samples come first and get label 1, benign samples get label 0
    X = [get_string_features(path,hasher) for path in malicious_paths + benign_paths]
    y = [1] * len(malicious_paths) + [0] * len(benign_paths)

    classifier = RandomForestClassifier(64)
    classifier.fit(X,y)

    # pickle writes bytes, so open in binary mode; the context manager makes
    # sure the file is flushed and closed
    with open("saved_detector.pkl", "wb") as saved_detector:
        pickle.dump((classifier,hasher), saved_detector)



def cv_evaluate(X,y,hasher):
    """Evaluate the model with 3-fold cross-validation and plot the ROC curves.

    For each fold a fresh random forest is fitted on the training split and
    its ROC curve on the held-out split is drawn on a shared plot with a
    logarithmic x axis. The plot is displayed with ``pyplot.show()``.

    Args:
        X: Sequence of feature vectors, one per sample.
        y: Sequence of labels, one per sample.
        hasher: Unused; kept so existing callers keep working.
    """
    from sklearn import metrics
    from matplotlib import pyplot
    from sklearn.model_selection import KFold

    # arrays are needed so the index arrays from KFold can select rows
    X, y = numpy.array(X), numpy.array(y)

    kf = KFold(3,shuffle=True)
    for train, test in kf.split(X):
        training_X, training_y = X[train], y[train]
        test_X, test_y = X[test], y[test]

        classifier = RandomForestClassifier(64)
        classifier.fit(training_X,training_y)

        # score each held-out sample with the last probability column
        scores = classifier.predict_proba(test_X)[:,-1]
        fpr, tpr, thresholds = metrics.roc_curve(test_y, scores)

        # every fold is drawn with the same legend label
        pyplot.semilogx(fpr,tpr,label="ROC curve")

    pyplot.xlabel("detector false positive rate")
    pyplot.ylabel("detector true positive rate")
    pyplot.title("Detector ROC curve")
    pyplot.legend()
    pyplot.grid()
    pyplot.show()



def get_training_data(benign_path,malicious_path,hasher):
    """Build the feature vectors and labels for all training files.

    Args:
        benign_path: Directory whose entries are benign training files.
        malicious_path: Directory whose entries are malicious training files.
        hasher: Feature hasher forwarded to ``get_string_features``.

    Returns:
        A tuple ``(X, y)``: ``X`` is a list with one feature vector per file,
        malicious files first; ``y`` is the matching list of labels, 1 for
        malicious and 0 for benign.
    """
    def list_directory(directory):
        # full path of every entry directly inside `directory`
        return [os.path.join(directory, entry) for entry in os.listdir(directory)]

    malware_files = list_directory(malicious_path)
    benign_files = list_directory(benign_path)

    X = []
    for sample in malware_files + benign_files:
        X.append(get_string_features(sample, hasher))

    y = [1] * len(malware_files) + [0] * len(benign_files)
    return X, y



# Command-line entry point: depending on the options given, train a detector,
# scan a single file, or cross-validate on the training data.
parser = argparse.ArgumentParser("get windows object vectors for files")
parser.add_argument("--malware_paths",default=None,help="Path to malware training files")
parser.add_argument("--benignware_paths",default=None,help="Path to benignware training files")
parser.add_argument("--scan_file_path",default=None,help="File to scan")
parser.add_argument("--evaluate",default=False,action="store_true",help="Perform cross-validation")

args = parser.parse_args()

# 20000 is FeatureHasher's first positional argument
# (n_features in scikit-learn, i.e. the length of each hashed vector)
hasher = FeatureHasher(20000)

if args.malware_paths and args.benignware_paths and not args.evaluate:
    # both training directories given, no --evaluate: train and save a detector
    train_detector(args.benignware_paths,args.malware_paths,hasher)
elif args.scan_file_path:
    # a file to scan was given: classify it with the saved detector
    scan_file(args.scan_file_path)
elif args.malware_paths and args.benignware_paths and args.evaluate:
    # both training directories plus --evaluate: cross-validate instead of saving
    X, y = get_training_data(args.benignware_paths,args.malware_paths,hasher)
    cv_evaluate(X,y,hasher)
else:
    # print the explanation first and the usage text after it; print_help()
    # writes the help itself and returns None, so it must not be passed to print()
    print("[*] You did not specify a path to scan,"
          " nor did you specify paths to malicious and benign training files"
          " please specify one of these to use the detector.\n")
    parser.print_help()

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up