
NLTK

Posted at 2018-12-28

Libraries

import nltk
nltk.download('punkt')

from nltk.tokenize import sent_tokenize, word_tokenize

nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

import matplotlib.pyplot as plt  # needed for the frequency plots below
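
PorterStemmer is imported here but not exercised in the examples below. As a quick sanity check that the downloads worked, a minimal sketch (the sample sentence is made up):

ps = PorterStemmer()

sample = "The cats are running quickly. They ran yesterday."
print(sent_tokenize(sample))      # ['The cats are running quickly.', 'They ran yesterday.']
print(word_tokenize(sample)[:4])  # ['The', 'cats', 'are', 'running']
print([ps.stem(w) for w in ['running', 'ran', 'cats']])  # ['run', 'ran', 'cat']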

Example 1 (State of the Union addresses)

nltk.download('state_union')
from nltk.corpus import state_union

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

word_tokens = word_tokenize(train_text)
sentence_tokens = sent_tokenize(train_text)

print (len(word_tokens))
print (word_tokens[:5])
print ()
print (len(sentence_tokens))
print (sentence_tokens[:2])


stop_words = stopwords.words('english')
symbol = ["'", '"', ':', ';', '.', ',', '-', '!', '?', "'s", '(', ')', '_']
stop_list = set(stop_words + symbol)  # set lookups avoid rebuilding the list per token

word_selected = [w.lower() for w in word_tokens if w.lower() not in stop_list]
print(len(word_selected))
print(word_selected[:10])


frequency = nltk.FreqDist(word_selected)

print (frequency['president'])
print (frequency['union'])
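
FreqDist behaves like a counter, so the top entries can also be pulled out directly with most_common (the actual output depends on the corpus):

print(frequency.most_common(10))  # the ten most frequent tokens after filtering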


plt.figure(figsize=(10, 5))
frequency.plot(30, cumulative=True)

(Figure: cumulative frequency plot of the 30 most common words)

plt.figure(figsize=(10, 5))
frequency.plot(30, cumulative=False)

(Figure: frequency plot of the 30 most common words)

Example 2 (Movie reviews)

import numpy as np
import nltk
nltk.download('punkt')

from nltk.tokenize import sent_tokenize, word_tokenize

from nltk.corpus import stopwords
nltk.download('stopwords')

from nltk.corpus import movie_reviews
nltk.download('movie_reviews')

import random
print ('# of pos: ', len(movie_reviews.fileids('pos')))
print ('# of neg: ', len(movie_reviews.fileids('neg')))


%%time

stop_words = stopwords.words('english')
symbol = ["'", '"', ':', ';', '.', ',', '-', '!', '?', "'s", '(', ')', '_']
stop_list = set(stop_words + symbol)  # build the lookup set once, outside the loop

all_words = []
for w in movie_reviews.words():
  if w.lower() not in stop_list:
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)

print('# of words: ', len(all_words))
print(all_words.most_common(5))
print(all_words['like'])


# Use the 3,000 most frequent words as the feature vocabulary.
word_features = [w for (w, count) in all_words.most_common(3000)]

print(word_features[:10])


def find_features(document):
  # Map each of the 3,000 feature words to True/False depending on
  # whether it appears in the document.
  words = set(document)
  features = {}
  for w in word_features:
    features[w] = (w in words)

  return features
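
For instance, feeding a short made-up sentence through find_features (the result has one True/False entry per feature word, almost all False):

example = find_features(word_tokenize("this movie was not good at all"))
print(example.get('movie'))     # True if 'movie' is among the 3,000 feature words
print(example.get('horrible'))  # False, or None if 'horrible' is not a feature word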

# Pair each review's word list with its label, then shuffle so that
# positive and negative examples are mixed before the train/test split.
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

data = [(find_features(doc), category) for (doc, category) in documents]

print (np.array(data).shape)
print (data[0][0])
print (data[0][1])


data_train = data[:1800]
data_test = data[1800:]

nb = nltk.NaiveBayesClassifier.train(data_train)

print ('NaiveBayes accuracy: ', nltk.classify.accuracy(nb, data_test))
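
The trained model can also label new text through the same feature extractor; a minimal sketch with a made-up review:

review = "a wonderful film , though the plot is thin and the acting wooden"
print(nb.classify(find_features(word_tokenize(review))))  # prints 'pos' or 'neg'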


nb.show_most_informative_features(15)


# Save the trained classifier with pickle
import pickle

path = '/content/gdrive/My Drive/MILIZE/'

save_nb = open(path + 'naivebayes.pickle', 'wb')
pickle.dump(nb, save_nb)
save_nb.close()

# Reload it to confirm the round trip works
saved_nb = open(path + 'naivebayes.pickle', 'rb')
nb_2 = pickle.load(saved_nb)
saved_nb.close()
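
As a quick check, the reloaded classifier should score exactly the same on the test set:

print('Reloaded NaiveBayes accuracy: ', nltk.classify.accuracy(nb_2, data_test))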

# scikit-learn classifiers via NLTK's SklearnClassifier wrapper
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

mnb = SklearnClassifier(MultinomialNB())
mnb.train(data_train)
print("MultinomialNB accuracy:", nltk.classify.accuracy(mnb, data_test))

bnb = SklearnClassifier(BernoulliNB())
bnb.train(data_train)
print("BernoulliNB accuracy percent:",nltk.classify.accuracy(bnb, data_test))

lr = SklearnClassifier(LogisticRegression())
lr.train(data_train)
print("Logistic Regression accuracy:", nltk.classify.accuracy(lr, data_test))

svc = SklearnClassifier(SVC())
svc.train(data_train)
print("SVC accuracy percent:",nltk.classify.accuracy(svc, data_test))

