# Library
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
# Example 1
# Download the State of the Union corpus and expose its reader.
nltk.download('state_union')
from nltk.corpus import state_union

# Raw text of two addresses; only train_text is tokenized below.
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

# Tokenize the same text at two granularities: sentences and words.
sentence_tokens = sent_tokenize(train_text)
word_tokens = word_tokenize(train_text)

# Preview: word-token count and the first five tokens, a blank line,
# then the sentence count and the first two sentences.
first_words = word_tokens[:5]
first_sentences = sentence_tokens[:2]
print(len(word_tokens))
print(first_words)
print()
print(len(sentence_tokens))
print(first_sentences)
# Tokens to discard: NLTK's English stop words plus punctuation-like tokens.
stop_words = stopwords.words('english')
symbol = ["'", '"', ':', ';', '.', ',', '-', '!', '?', "'s", '(', ')', '_']

# Build the exclusion set once so each membership test is O(1) instead of
# concatenating and scanning two lists for every token.
excluded = set(stop_words).union(symbol)

# Lowercase each token once, then keep only those not excluded.
lowered_tokens = (w.lower() for w in word_tokens)
word_selected = [w for w in lowered_tokens if w not in excluded]

print(len(word_selected))
print(word_selected[:10])
# ``plt`` is not imported anywhere else in this file; bring it into scope
# here so the figure calls below do not raise NameError.
import matplotlib.pyplot as plt

# Count how often each remaining (lowercased, filtered) token occurs.
frequency = nltk.FreqDist(word_selected)
print(frequency['president'])
print(frequency['union'])

# Plot the 30 most frequent tokens twice, each on its own 10x5 figure:
# first as a cumulative curve, then as plain per-token counts.
for cumulative in (True, False):
    plt.figure(figsize=(10, 5))
    frequency.plot(30, cumulative=cumulative)
# Example 2 (Movie reviews)
import numpy as np
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.corpus import movie_reviews
nltk.download('movie_reviews')
import random
# Report how many review files the corpus holds for each sentiment label.
pos_ids = movie_reviews.fileids('pos')
print('# of pos: ', len(pos_ids))
neg_ids = movie_reviews.fileids('neg')
print('# of neg: ', len(neg_ids))
# %%time  -- Jupyter cell magic; it is only valid as the first line of a
# notebook cell, so it is kept as a comment to leave this file valid Python.

# Tokens to discard: NLTK's English stop words plus punctuation-like tokens.
stop_words = stopwords.words('english')
symbol = ["'", '"', ':', ';', '.', ',', '-', '!', '?', "'s", '(', ')', '_']

# Build the exclusion set once; the corpus is iterated word by word, so a
# per-word list concatenation and linear scan would dominate the run time.
excluded = set(stop_words).union(symbol)

# Frequency distribution over every lowercased corpus word that survives
# the stop-word / symbol filter.
lowered_words = (w.lower() for w in movie_reviews.words())
all_words = nltk.FreqDist(w for w in lowered_words if w not in excluded)

print('# of words: ', len(all_words))
print(all_words.most_common(5))
print(all_words['like'])
# Alternative (not used): take the first 3000 keys rather than the most
# frequent ones.
#word_features = list(all_words.keys())[:3000]

# Feature vocabulary: the 3000 most frequent words, most frequent first.
word_features = [word for word, _count in all_words.most_common(3000)]
print(word_features[:10])
def find_features(document, vocabulary=None):
    """Return a word-presence feature dict for one document.

    Args:
        document: Iterable of word tokens making up the document.
        vocabulary: Iterable of words to test for. Defaults to the
            module-level ``word_features`` list when omitted.

    Returns:
        A dict mapping every vocabulary word to ``True`` if it occurs in
        ``document`` and ``False`` otherwise.
    """
    if vocabulary is None:
        vocabulary = word_features
    # A set makes each membership test O(1) regardless of document length.
    words = set(document)
    return {w: (w in words) for w in vocabulary}
# Pair each review's token list with its category label, walking the corpus
# category by category and file by file.
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        review_words = list(movie_reviews.words(fileid))
        documents.append((review_words, category))

# Shuffle in place so positional slices of the list mix the categories.
random.shuffle(documents)
# Convert every (tokens, label) pair into a (feature dict, label) pair.
data = []
for doc, category in documents:
    data.append((find_features(doc), category))

# Sanity check: overall shape of the dataset, then the first example's
# feature dict and its label.
print(np.array(data).shape)
first_features, first_label = data[0]
print(first_features)
print(first_label)
# Hold out everything after the first 1800 shuffled examples for testing.
train_size = 1800
data_train, data_test = data[:train_size], data[train_size:]

# Train NLTK's Naive Bayes classifier and score it on the held-out examples.
nb = nltk.NaiveBayesClassifier.train(data_train)
nb_accuracy = nltk.classify.accuracy(nb, data_test)
print('NaiveBayes accuracy: ', nb_accuracy)

# List the 15 features the classifier found most informative.
nb.show_most_informative_features(15)
#saving
import pickle

path = '/content/gdrive/My Drive/MILIZE/'
model_file = path + 'naivebayes.pickle'

# Persist the trained classifier. The context manager closes the file even
# if pickling raises.
with open(model_file, 'wb') as save_nb:
    pickle.dump(nb, save_nb)

# Load the classifier back from disk.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only
# load pickles from a trusted source.
with open(model_file, "rb") as saved_nb:
    nb_2 = pickle.load(saved_nb)
#sklearn
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
# Train four scikit-learn estimators through NLTK's SklearnClassifier wrapper
# on the same training split, and report each one's accuracy on the test split.

# Multinomial Naive Bayes
mnb = SklearnClassifier(MultinomialNB())
mnb.train(data_train)
mnb_accuracy = nltk.classify.accuracy(mnb, data_test)
print("MultinomialNB accuracy:", mnb_accuracy)

# Bernoulli Naive Bayes
bnb = SklearnClassifier(BernoulliNB())
bnb.train(data_train)
bnb_accuracy = nltk.classify.accuracy(bnb, data_test)
print("BernoulliNB accuracy percent:", bnb_accuracy)

# Logistic regression
lr = SklearnClassifier(LogisticRegression())
lr.train(data_train)
lr_accuracy = nltk.classify.accuracy(lr, data_test)
print("Logistic Regression accuracy:", lr_accuracy)

# Support vector classifier
svc = SklearnClassifier(SVC())
svc.train(data_train)
svc_accuracy = nltk.classify.accuracy(svc, data_test)
print("SVC accuracy percent:", svc_accuracy)