
20230827_SEOTO

Posted at 2023-08-27

Sample code for the book 「…のための機械学習」 (Machine Learning for …), taken from the O'Reilly Japan repository:
https://github.com/oreilly-japan/ml-security-jp

listing-7-7.py
# import sklearn modules
from sklearn import tree
from sklearn.feature_extraction import DictVectorizer

# initialize the decision tree classifier and vectorizer
classifier = tree.DecisionTreeClassifier()
vectorizer = DictVectorizer(sparse=False)

# declare toy training data
training_examples = [
    {'packed': 1, 'contains_encrypted': 0},
    {'packed': 0, 'contains_encrypted': 0},
    {'packed': 1, 'contains_encrypted': 1},
    {'packed': 1, 'contains_encrypted': 0},
    {'packed': 0, 'contains_encrypted': 1},
    {'packed': 1, 'contains_encrypted': 0},
    {'packed': 0, 'contains_encrypted': 0},
    {'packed': 0, 'contains_encrypted': 0},
]
ground_truth = [1, 1, 1, 1, 0, 0, 0, 0]

# initialize the vectorizer with the training data
vectorizer.fit(training_examples)

# transform the training examples to vector form
X = vectorizer.transform(training_examples)
y = ground_truth  # call ground truth 'y', by convention

# train the classifier (a.k.a. 'fit' the classifier)
classifier.fit(X, y)

test_example = {'packed': 1, 'contains_encrypted': 0}
test_vector = vectorizer.transform(test_example)
print(classifier.predict(test_vector))  # prints [1]

# visualize the decision tree
with open("classifier.dot", "w") as output_file:
    tree.export_graphviz(
        classifier,
        feature_names=vectorizer.get_feature_names_out(),
        out_file=output_file
    )

import os
os.system("dot classifier.dot -Tpng -o classifier.png")
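If the Graphviz `dot` command is not installed, the trained tree can also be inspected with scikit-learn's own helpers. This is a minimal sketch, not part of the book's listing, continuing from the variables above and assuming scikit-learn >= 1.0 (for get_feature_names_out) and matplotlib:

# alternative visualization without Graphviz (sketch; assumes scikit-learn >= 1.0 and matplotlib)
from sklearn.tree import export_text, plot_tree
from matplotlib import pyplot

# print the learned decision rules as indented text
print(export_text(classifier, feature_names=list(vectorizer.get_feature_names_out())))

# or draw the tree with matplotlib and save it as a PNG
plot_tree(classifier, feature_names=vectorizer.get_feature_names_out(), filled=True)
pyplot.savefig("classifier_plot_tree.png")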

complete_detector.py
#!/usr/bin/env python3

import os
import sys
import pickle
import argparse
import re
import numpy
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import FeatureHasher
import codecs

def get_string_features(path, hasher):
    # extract strings from binary file using regular expressions
    chars = r" -~"
    min_length = 5
    string_regexp = '[%s]{%d,}' % (chars, min_length)
    #file_object = open(path)
    file_object = codecs.open(path, 'r', 'utf-8', 'ignore')
    data = file_object.read()
    pattern = re.compile(string_regexp)
    strings = pattern.findall(data)

    # store string features in dictionary form
    string_features = {}
    for string in strings:
        string_features[string] = 1

    # hash the features using the hashing trick
    hashed_features = hasher.transform([string_features])

    # do some data munging to get the feature array
    hashed_features = hashed_features.todense()
    hashed_features = numpy.asarray(hashed_features)
    hashed_features = hashed_features[0]

    # return hashed string features
    print("Extracted {0} strings from {1}".format(len(string_features), path))
    return hashed_features

def scan_file(path):
    # scan a file to determine if it is malicious or benign
    if not os.path.exists("saved_detector.pkl"):
        print("It appears you haven't trained a detector yet!  Do this before scanning files.")
        sys.exit(1)
    # the pickle was written in binary mode, so read it back in binary mode
    with open("saved_detector.pkl", "rb") as saved_detector:
        classifier, hasher = pickle.load(saved_detector)
    features = get_string_features(path, hasher)
    result_proba = classifier.predict_proba([features])[0, 1]
    if result_proba > 0.5:
        print("It appears this file is malicious!", result_proba)
    else:
        print("It appears this file is benign.", result_proba)

def train_detector(benign_path, malicious_path, hasher):
    # train the detector on the specified training data
    def get_training_paths(directory):
        targets = []
        for path in os.listdir(directory):
            targets.append(os.path.join(directory, path))
        return targets
    malicious_paths = get_training_paths(malicious_path)
    benign_paths = get_training_paths(benign_path)
    X = [get_string_features(path, hasher) for path in malicious_paths + benign_paths]
    y = [1 for i in range(len(malicious_paths))] + [0 for i in range(len(benign_paths))]
    classifier = RandomForestClassifier(n_estimators=64)
    classifier.fit(X, y)
    # pickle is a binary format, so write the detector in binary mode
    with open("saved_detector.pkl", "wb") as saved_detector:
        pickle.dump((classifier, hasher), saved_detector)

def cv_evaluate(X, y, hasher):
    # use cross-validation to evaluate our model
    import random
    from sklearn import metrics
    from matplotlib import pyplot
    #from sklearn.cross_validation import KFold
    from sklearn.model_selection import KFold
    X, y = numpy.array(X), numpy.array(y)
    fold_counter = 0
    #for train, test in KFold(len(X),3,shuffle=True):
    kf = KFold(n_splits=3, shuffle=True)
    for train, test in kf.split(X):
        training_X, training_y = X[train], y[train]
        test_X, test_y = X[test], y[test]
        classifier = RandomForestClassifier(n_estimators=64)
        classifier.fit(training_X, training_y)
        scores = classifier.predict_proba(test_X)[:, -1]
        fpr, tpr, thresholds = metrics.roc_curve(test_y, scores)
        #pyplot.semilogx(fpr,tpr,label="Fold number {0}".format(fold_counter))
        pyplot.semilogx(fpr, tpr, label="ROC curve")
        fold_counter += 1
        #break
    pyplot.xlabel("detector false positive rate")
    pyplot.ylabel("detector true positive rate")
    pyplot.title("Detector ROC curve")
    #pyplot.title("detector cross-validation ROC curves")
    pyplot.legend()
    pyplot.grid()
    pyplot.show()

def get_training_data(benign_path, malicious_path, hasher):
    def get_training_paths(directory):
        targets = []
        for path in os.listdir(directory):
            targets.append(os.path.join(directory, path))
        return targets
    malicious_paths = get_training_paths(malicious_path)
    benign_paths = get_training_paths(benign_path)
    X = [get_string_features(path, hasher) for path in malicious_paths + benign_paths]
    y = [1 for i in range(len(malicious_paths))] + [0 for i in range(len(benign_paths))]
    return X, y

parser = argparse.ArgumentParser(description="get windows object vectors for files")
parser.add_argument("--malware_paths", default=None, help="Path to malware training files")
parser.add_argument("--benignware_paths", default=None, help="Path to benignware training files")
parser.add_argument("--scan_file_path", default=None, help="File to scan")
parser.add_argument("--evaluate", default=False, action="store_true", help="Perform cross-validation")

args = parser.parse_args()

hasher = FeatureHasher(n_features=20000)
# if the user specifies malware_paths and benignware_paths, train a detector
if args.malware_paths and args.benignware_paths and not args.evaluate:
    train_detector(args.benignware_paths, args.malware_paths, hasher)
elif args.scan_file_path:
    scan_file(args.scan_file_path)
elif args.malware_paths and args.benignware_paths and args.evaluate:
    X, y = get_training_data(args.benignware_paths, args.malware_paths, hasher)
    cv_evaluate(X, y, hasher)
else:
    print("[*] You did not specify a path to scan,"
          " nor did you specify paths to malicious and benign training files;"
          " please specify one of these to use the detector.\n")
    parser.print_help()
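To see what `FeatureHasher(n_features=20000)` does to the string features inside get_string_features, here is a small standalone sketch; it is not part of the book's code, and the example strings and the 16-bucket hash space are made up purely for illustration:

# minimal sketch of the hashing trick used in get_string_features (illustrative only)
from sklearn.feature_extraction import FeatureHasher

# a toy "string feature" dictionary, standing in for the printable strings pulled from a binary
string_features = {'KERNEL32.dll': 1, 'GetProcAddress': 1, 'http://example.com': 1}

# hash the variable-length dictionary into a fixed-length vector (16 buckets here, 20000 in the detector)
hasher = FeatureHasher(n_features=16)
vector = hasher.transform([string_features]).todense()
print(vector.shape)  # (1, 16): every file maps to the same fixed-size feature vector
print(vector)

With the detector saved, the script is driven by the flags defined in the argparse block above: pass --malware_paths and --benignware_paths to train, --scan_file_path to score a single file, and add --evaluate to the two training-path flags to produce the cross-validation ROC plot.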