# NLTK #MachineLearning

# Library

import nltk
nltk.download('punkt')

from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Example 1

nltk.download('state_union')
from nltk.corpus import state_union

# Load two State of the Union addresses as raw strings. Only train_text is
# tokenized in this section; sample_text is loaded but not used here.
sample_text = state_union.raw("2006-GWBush.txt")
train_text = state_union.raw("2005-GWBush.txt")

# Split the 2005 address into sentences and into word-level tokens.
sentence_tokens = sent_tokenize(train_text)
word_tokens = word_tokenize(train_text)

# Report the token count and the first five tokens, a blank line, then the
# sentence count and the first two sentences.
print(len(word_tokens))
print(word_tokens[0:5])
print()
print(len(sentence_tokens))
print(sentence_tokens[0:2])

# Tokens to discard: NLTK's English stop words plus punctuation-like symbols.
stop_words = stopwords.words('english')
symbol = ["'", '"', ':', ';', '.', ',', '-', '!', '?', "'s", '(', ')', '_']

# Build the exclusion set once. The previous form evaluated
# `stop_words + symbol` for every token, allocating a new list each time and
# scanning it linearly; a set gives constant-time membership tests.
excluded_tokens = set(stop_words + symbol)

# Lowercase each token once, then keep it only if it is not excluded.
word_selected = [
    w for w in (token.lower() for token in word_tokens)
    if w not in excluded_tokens
]
print(len(word_selected))
print(word_selected[:10])

# `plt` is used below but was never imported in this file, which raises
# NameError. matplotlib is already required by FreqDist.plot itself.
import matplotlib.pyplot as plt

# Count how often each retained (lowercased) token occurs.
frequency = nltk.FreqDist(word_selected)

# Look up the counts of two individual words.
print(frequency['president'])
print(frequency['union'])

# Cumulative counts of the 30 most frequent tokens.
plt.figure(figsize=(10, 5))
frequency.plot(30, cumulative=True)

# Per-token counts of the 30 most frequent tokens.
plt.figure(figsize=(10, 5))
frequency.plot(30, cumulative=False)

# Example 2 (Movie reviews)

import numpy as np
import nltk
nltk.download('punkt')

from nltk.tokenize import sent_tokenize, word_tokenize

from nltk.corpus import stopwords
nltk.download('stopwords')

from nltk.corpus import movie_reviews
nltk.download('movie_reviews')

import random

# Report how many review files are filed under each sentiment category.
pos_count = len(movie_reviews.fileids('pos'))
print('# of pos: ', pos_count)
neg_count = len(movie_reviews.fileids('neg'))
print('# of neg: ', neg_count)

%%time

# Tokens to discard: NLTK's English stop words plus punctuation-like symbols.
stop_words = stopwords.words('english')
symbol = ["'", '"', ':', ';', '.', ',', '-', '!', '?', "'s", '(', ')', '_']

# Build the exclusion set once. The previous loop evaluated
# `stop_words + symbol` for every corpus word, allocating a new list each time
# and scanning it linearly; a set gives constant-time membership tests.
excluded_tokens = set(stop_words + symbol)

# Lowercase each corpus word once and count the ones that are not excluded.
# Words are fed to FreqDist in corpus order, exactly as the list-based version
# did, so counts and tie ordering are unchanged.
lowered_words = (w.lower() for w in movie_reviews.words())
all_words = nltk.FreqDist(w for w in lowered_words if w not in excluded_tokens)

print('# of words: ', len(all_words))
print(all_words.most_common(5))
print(all_words['like'])

# Alternative kept for reference:
#   word_features = list(all_words.keys())[:3000]

# Vocabulary used as classifier features: the 3000 most common words, in the
# order most_common() returns them.
word_features = [word for word, _count in all_words.most_common(3000)]

print(word_features[0:10])

def find_features(document):
  """Build a presence/absence feature dict for one document.

  Args:
    document: iterable of hashable tokens (it is converted to a set).

  Returns:
    A dict with one key per entry of the module-level ``word_features``,
    mapped to True if that entry occurs in ``document`` and False otherwise.
  """
  # A set makes each membership test below constant-time.
  present = set(document)
  return {feature: (feature in present) for feature in word_features}

# Pair every review's word list with its category, iterating category by
# category and, within each category, file by file.
documents = []
for category in movie_reviews.categories():
  for fileid in movie_reviews.fileids(category):
    review_words = list(movie_reviews.words(fileid))
    documents.append((review_words, category))

# Shuffle in place so the category-ordered list is mixed before it is sliced
# into train and test portions.
random.shuffle(documents)

# Replace each word list with its feature dict, keeping the category label.
data = [(find_features(words), label) for words, label in documents]

# Show the overall shape, then the features and label of the first example.
first_features, first_label = data[0]
print(np.array(data).shape)
print(first_features)
print(first_label)

# The first 1800 examples train the model; everything after is held out.
split_at = 1800
data_train, data_test = data[:split_at], data[split_at:]

# Fit NLTK's Naive Bayes classifier on the training portion.
nb = nltk.NaiveBayesClassifier.train(data_train)

# Score it on the held-out examples.
nb_accuracy = nltk.classify.accuracy(nb, data_test)
print('NaiveBayes accuracy: ', nb_accuracy)

nb.show_most_informative_features(15)

#saving
import pickle

path = '/content/gdrive/My Drive/MILIZE/'

# Context managers close the file even if dump/load raises; the previous
# open()/close() pairs leaked the handle on failure.
with open(path+'naivebayes.pickle', 'wb') as save_nb:
  pickle.dump(nb, save_nb)

# Read the classifier back from the file written above.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(path+'naivebayes.pickle', "rb") as saved_nb:
  nb_2 = pickle.load(saved_nb)

#sklearn
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB,BernoulliNB

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

# Each scikit-learn estimator is wrapped in NLTK's SklearnClassifier, trained
# on data_train, and scored on data_test with nltk.classify.accuracy.

# Multinomial Naive Bayes
mnb = SklearnClassifier(MultinomialNB())
mnb.train(data_train)
mnb_accuracy = nltk.classify.accuracy(mnb, data_test)
print("MultinomialNB accuracy:", mnb_accuracy)

# Bernoulli Naive Bayes
bnb = SklearnClassifier(BernoulliNB())
bnb.train(data_train)
bnb_accuracy = nltk.classify.accuracy(bnb, data_test)
print("BernoulliNB accuracy percent:", bnb_accuracy)

# Logistic regression
lr = SklearnClassifier(LogisticRegression())
lr.train(data_train)
lr_accuracy = nltk.classify.accuracy(lr, data_test)
print("Logistic Regression accuracy:", lr_accuracy)

# Support vector classifier with scikit-learn's default settings
svc = SklearnClassifier(SVC())
svc.train(data_train)
svc_accuracy = nltk.classify.accuracy(svc, data_test)
print("SVC accuracy percent:", svc_accuracy)