のための機械学習
https://github.com/oreilly-japan/ml-security-jp
listing-7-7.py
# import sklearn modules
from sklearn import tree
from sklearn.feature_extraction import DictVectorizer
# Toy walk-through: fit a decision tree on two binary features, classify one
# sample, then dump the fitted tree as Graphviz source and render it to PNG.
import os

# toy training data: one feature dict per sample, with its label at the same
# position in ground_truth
training_examples = [
    {'packed': 1, 'contains_encrypted': 0},
    {'packed': 0, 'contains_encrypted': 0},
    {'packed': 1, 'contains_encrypted': 1},
    {'packed': 1, 'contains_encrypted': 0},
    {'packed': 0, 'contains_encrypted': 1},
    {'packed': 1, 'contains_encrypted': 0},
    {'packed': 0, 'contains_encrypted': 0},
    {'packed': 0, 'contains_encrypted': 0},
]
ground_truth = [1, 1, 1, 1, 0, 0, 0, 0]

# the vectorizer learns the feature names from the training dicts and then
# converts those same dicts into a dense matrix (sparse=False)
vectorizer = DictVectorizer(sparse=False)
X = vectorizer.fit(training_examples).transform(training_examples)
y = ground_truth  # 'y' is the conventional name for the label vector

# train (a.k.a. 'fit') the decision tree on the vectorized samples
classifier = tree.DecisionTreeClassifier()
classifier.fit(X, y)

# classify a single unseen sample using the already-fitted vectorizer
test_example = {'packed': 1, 'contains_encrypted': 0}
test_vector = vectorizer.transform(test_example)
prediction = classifier.predict(test_vector)
print(prediction)  # prints [1]

# visualize the decision tree: write Graphviz source, then render it
with open("classifier.dot", "w") as output_file:
    tree.export_graphviz(
        classifier,
        out_file=output_file,
        feature_names=vectorizer.get_feature_names_out(),
    )

# requires the Graphviz 'dot' executable to be on PATH
os.system("dot classifier.dot -Tpng -o classifier.png")
complete_detector.py
#!/usr/bin/python
import os
import sys
import pickle
import argparse
import re
import numpy
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import FeatureHasher
import codecs
def get_string_features(path,hasher):
    """Extract printable strings from a file and hash them into a feature row.

    The file is decoded as UTF-8 with undecodable bytes dropped, and every
    run of at least five characters in the range space..tilde is collected.
    Each distinct string becomes a feature with value 1.

    Args:
        path: Path of the file to read.
        hasher: Object whose ``transform`` accepts a list of feature dicts
            and returns a matrix exposing ``todense()`` (this file passes a
            ``FeatureHasher``).

    Returns:
        The first row of the dense hashed matrix, as a numpy array.

    Side effects:
        Prints how many distinct strings were extracted from ``path``.
    """
    # extract strings from binary file using regular expressions
    chars = r" -~"
    min_length = 5
    string_regexp = '[%s]{%d,}' % (chars, min_length)

    # The context manager guarantees the handle is closed even if read()
    # raises. newline='' disables newline translation so the decoded text is
    # exactly the file's bytes minus anything that is not valid UTF-8.
    with open(path, 'r', encoding='utf-8', errors='ignore', newline='') as file_object:
        data = file_object.read()
    strings = re.findall(string_regexp, data)

    # store string features in dictionary form; duplicates collapse to one key
    string_features = dict.fromkeys(strings, 1)

    # hash the features using the hashing trick, then take the single row
    # out of the (one-sample) matrix the hasher returns
    hashed_features = hasher.transform([string_features])
    hashed_features = numpy.asarray(hashed_features.todense())[0]

    print("Extracted {0} strings from {1}".format(len(string_features),path))
    return hashed_features
def scan_file(path):
    """Scan a file with the saved detector and print whether it looks malicious.

    Loads the ``(classifier, hasher)`` pair from ``saved_detector.pkl`` in the
    current working directory, extracts string features from ``path`` and
    prints a verdict together with the predicted probability.

    Args:
        path: Path of the file to scan.

    Side effects:
        Prints the verdict. If ``saved_detector.pkl`` does not exist, prints
        a hint and terminates the process via ``sys.exit(1)``.
    """
    if not os.path.exists("saved_detector.pkl"):
        print("It appears you haven't trained a detector yet! Do this before scanning files.")
        sys.exit(1)
    # Pickle data is bytes, so the file must be opened in binary mode;
    # text mode makes pickle.load fail on Python 3.
    # NOTE(security): unpickling executes code embedded in the file, so only
    # load a saved_detector.pkl that you produced yourself.
    with open("saved_detector.pkl", "rb") as saved_detector:
        classifier, hasher = pickle.load(saved_detector)
    features = get_string_features(path,hasher)
    # column 1 holds the probability of the classifier's second class, which
    # is label 1 (malicious) when trained by train_detector in this file
    result_proba = classifier.predict_proba([features])[:,1]
    # a single sample was scored, so result_proba is a one-element array
    if result_proba > 0.5:
        print("It appears this file is malicious!", result_proba)
    else:
        print("It appears this file is benign.", result_proba)
def train_detector(benign_path,malicious_path,hasher):
    """Train a random-forest detector and save it to ``saved_detector.pkl``.

    Every entry of ``malicious_path`` is labelled 1 and every entry of
    ``benign_path`` is labelled 0. The fitted classifier is pickled together
    with ``hasher`` into the current working directory.

    Args:
        benign_path: Directory containing benign training files.
        malicious_path: Directory containing malicious training files.
        hasher: Feature hasher passed through to ``get_string_features`` and
            stored next to the classifier.
    """
    def get_training_paths(directory):
        # full path of every entry directly inside `directory`
        return [os.path.join(directory, name) for name in os.listdir(directory)]

    malicious_paths = get_training_paths(malicious_path)
    benign_paths = get_training_paths(benign_path)
    X = [get_string_features(path,hasher) for path in malicious_paths + benign_paths]
    # labels follow the same malicious-then-benign order as X
    y = [1] * len(malicious_paths) + [0] * len(benign_paths)
    classifier = RandomForestClassifier(n_estimators=64)
    classifier.fit(X,y)
    # Pickle writes bytes, so the file must be opened in binary mode; the
    # context manager also makes sure the data is flushed and closed.
    with open("saved_detector.pkl","wb") as saved_detector:
        pickle.dump((classifier,hasher),saved_detector)
def cv_evaluate(X,y,hasher):
    """Plot one ROC curve per fold of a shuffled 3-fold cross-validation.

    For each fold a fresh 64-tree random forest is fitted on the training
    split and scored on the held-out split. The curves are drawn on a
    logarithmic false-positive-rate axis and shown in a matplotlib window.

    Args:
        X: Sequence of feature vectors, one per sample.
        y: Sequence of labels aligned with ``X``.
        hasher: Unused; kept so existing callers keep working.
    """
    from sklearn import metrics
    from matplotlib import pyplot
    from sklearn.model_selection import KFold

    # numpy arrays allow selecting each split with the index arrays from KFold
    X, y = numpy.array(X), numpy.array(y)
    kf = KFold(3,shuffle=True)
    for fold_counter, (train, test) in enumerate(kf.split(X)):
        training_X, training_y = X[train], y[train]
        test_X, test_y = X[test], y[test]
        classifier = RandomForestClassifier(n_estimators=64)
        classifier.fit(training_X,training_y)
        # last column = predicted probability of the classifier's last class
        scores = classifier.predict_proba(test_X)[:,-1]
        fpr, tpr, _ = metrics.roc_curve(test_y, scores)
        # include the fold number so the legend entries can be told apart
        pyplot.semilogx(fpr,tpr,label="ROC curve (fold {0})".format(fold_counter))
    pyplot.xlabel("detector false positive rate")
    pyplot.ylabel("detector true positive rate")
    pyplot.title("Detector ROC curve")
    pyplot.legend()
    pyplot.grid()
    pyplot.show()
def get_training_data(benign_path,malicious_path,hasher):
    """Build the feature matrix and label list for two directories of samples.

    Args:
        benign_path: Directory whose entries are labelled 0.
        malicious_path: Directory whose entries are labelled 1.
        hasher: Feature hasher handed to ``get_string_features``.

    Returns:
        ``(X, y)``: ``X`` is a list with one feature vector per file,
        malicious files first and benign files after them; ``y`` is the
        list of labels in the same order.
    """
    def get_training_paths(directory):
        # full path of every entry directly inside `directory`
        return [os.path.join(directory, entry) for entry in os.listdir(directory)]

    # list both directories up front, malicious first
    malicious_files = get_training_paths(malicious_path)
    benign_files = get_training_paths(benign_path)

    X = []
    for sample in malicious_files + benign_files:
        X.append(get_string_features(sample, hasher))
    y = [1] * len(malicious_files) + [0] * len(benign_files)
    return X, y
# Command-line entry point: depending on the flags given, train a detector,
# scan a single file, or run a cross-validation evaluation.
# The text is passed as `description`; given positionally it would become the
# program name shown in the usage line.
parser = argparse.ArgumentParser(description="get windows object vectors for files")
parser.add_argument("--malware_paths",default=None,help="Path to malware training files")
parser.add_argument("--benignware_paths",default=None,help="Path to benignware training files")
parser.add_argument("--scan_file_path",default=None,help="File to scan")
parser.add_argument("--evaluate",default=False,action="store_true",help="Perform cross-validation")
args = parser.parse_args()

hasher = FeatureHasher(20000)
if args.malware_paths and args.benignware_paths and not args.evaluate:
    # both training directories given, no --evaluate: train and save
    train_detector(args.benignware_paths,args.malware_paths,hasher)
elif args.scan_file_path:
    # scanning uses the hasher stored with the saved detector, not `hasher`
    scan_file(args.scan_file_path)
elif args.malware_paths and args.benignware_paths and args.evaluate:
    X, y = get_training_data(args.benignware_paths,args.malware_paths,hasher)
    cv_evaluate(X,y,hasher)
else:
    # print_help() writes the help itself and returns None, so it must be
    # called as a statement rather than passed to print()
    print("[*] You did not specify a path to scan,"
          " nor did you specify paths to malicious and benign training files;"
          " please specify one of these to use the detector.\n")
    parser.print_help()