
Notes on scikit-learn's CountVectorizer

Notes on CountVectorizer.


test.py

from sklearn.feature_extraction.text import CountVectorizer

corpus = ["ああ いい うう", "ああ いい ええ"]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
# Note: get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out() there
features = vectorizer.get_feature_names()

print(features)

print(type(X))
print(X)
print(X.shape)
print(X.toarray())


Output

['ああ', 'いい', 'うう', 'ええ']

<class 'scipy.sparse.csr.csr_matrix'>
(0, 2) 1
(0, 1) 1
(0, 0) 1
(1, 3) 1
(1, 1) 1
(1, 0) 1
(2, 4)  # shape: (number of samples, number of features)
[[1 1 1 0]  # "ああ いい うう"
 [1 1 0 1]]  # "ああ いい ええ"
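
The (row, column) pairs in the sparse printout index into this vocabulary. The fitted vectorizer exposes the token-to-column mapping as the vocabulary_ attribute; a quick check:

print(vectorizer.vocabulary_)  # token -> column index
# {'ああ': 0, 'いい': 1, 'うう': 2, 'ええ': 3} (dict ordering may vary)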


transform

corpus = ["ああ いい うう", "ああ いい ええ"]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
features = vectorizer.get_feature_names()

print(features)
print(X.toarray())

new_doc = ["ああ いい うう ええ", "ええ おお"]
new_X = vectorizer.fit_transform(new_doc)

features = vectorizer.get_feature_names()
print(features)
print(new_X.toarray())

['ああ', 'いい', 'うう', 'ええ']

[[1 1 1 0]
 [1 1 0 1]]
['ああ', 'いい', 'うう', 'ええ', 'おお']
[[1 1 1 1 0]
 [0 0 0 1 1]]
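
Note that the second call above is fit_transform() again, which throws away the old vocabulary and learns a new one from new_doc (which is why 'おお' appears). To vectorize new documents against the vocabulary already learned from corpus, call transform() instead; tokens not in the vocabulary are silently ignored. A minimal sketch:

from sklearn.feature_extraction.text import CountVectorizer

corpus = ["ああ いい うう", "ああ いい ええ"]
new_doc = ["ああ いい うう ええ", "ええ おお"]

vectorizer = CountVectorizer()
vectorizer.fit(corpus)                 # vocabulary: ['ああ', 'いい', 'うう', 'ええ']
new_X = vectorizer.transform(new_doc)  # reuses that vocabulary; 'おお' is ignored
print(new_X.toarray())
# [[1 1 1 1]
#  [0 0 0 1]]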


Example 2

What if the delimiter is ","?

corpus = ["ああ,いい,うう", "ああ,いい,ええ"]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
features = vectorizer.get_feature_names()

print(features)

['ああ', 'いい', 'うう', 'ええ']

What if the delimiter is ":"?

corpus = ["ああ:いい:うう", "ああ:いい:ええ"]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
features = vectorizer.get_feature_names()

print(features)

['ああ', 'いい', 'うう', 'ええ']

What if the delimiter is the full-width "、"?

corpus = ["ああ、いい、うう", "ああ、いい、ええ"]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
features = vectorizer.get_feature_names()

print(features)

['ああ', 'いい', 'うう', 'ええ']
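
All three delimiters give the same result because the default token_pattern, r"(?u)\b\w\w+\b", extracts runs of two or more word characters; any non-word character (space, ",", ":", "、") therefore acts as a boundary. Passing the documented default explicitly behaves identically:

vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w\w+\b")  # same as the default
X = vectorizer.fit_transform(["ああ:いい:うう", "ああ:いい:ええ"])
print(vectorizer.get_feature_names())
# ['ああ', 'いい', 'うう', 'ええ']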


tokenizer

The tokenizer argument

def tokenize(t):
    print(t)
    return t

corpus = ["ああ、いい、うう", "ああ、いい、ええ"]

vectorizer = CountVectorizer(tokenizer=tokenize)
X = vectorizer.fit_transform(corpus)
features = vectorizer.get_feature_names()

print(features)

ああ、いい、うう

ああ、いい、ええ
['、', 'あ', 'い', 'う', 'え']
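
The features come out as single characters because tokenize() returns the string itself, and CountVectorizer iterates over whatever the tokenizer returns; iterating over a Python string yields one character at a time. Note also that "、" survives: once a custom tokenizer is supplied, the default token_pattern filtering no longer applies. A quick check:

print(list("ああ、いい、うう"))
# ['あ', 'あ', '、', 'い', 'い', '、', 'う', 'う']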

def tokenize(t):
    print(t)
    return t.split('猫')

corpus = ["ああ猫いい猫うう", "ああ猫いい猫ええ"]
vectorizer = CountVectorizer(tokenizer=tokenize)
X = vectorizer.fit_transform(corpus)
features = vectorizer.get_feature_names()

print(features)

ああ猫いい猫うう

ああ猫いい猫ええ
['ああ', 'いい', 'うう', 'ええ']
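
Any callable that maps a document to a list of tokens works, so the same split can also be written inline (a sketch):

vectorizer = CountVectorizer(tokenizer=lambda t: t.split('猫'))
X = vectorizer.fit_transform(["ああ猫いい猫うう", "ああ猫いい猫ええ"])
print(vectorizer.get_feature_names())
# ['ああ', 'いい', 'うう', 'ええ']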

def tokenize(t):
    print(t)
    return t

corpus = ["あ、い、う", "あ、い、え"]
vectorizer = CountVectorizer(tokenizer=tokenize)
X = vectorizer.fit_transform(corpus)
features = vectorizer.get_feature_names()

print(features)

あ、い、う

あ、い、え
['、', 'あ', 'い', 'う', 'え']

def tokenize(t):
    print(t)
    return t

corpus = ["あい猫、う", "あ、い、え"]
vectorizer = CountVectorizer(tokenizer=tokenize)
X = vectorizer.fit_transform(corpus)
features = vectorizer.get_feature_names()

print(features)

あい猫、う

あ、い、え
['、', 'あ', 'い', 'う', 'え', '猫']

What if no tokenizer is passed?

corpus = ["あい猫、う", "あ、い、え"]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
features = vectorizer.get_feature_names()
print(features)

['あい猫']
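
Only "あい猫" survives: with the default token_pattern (r"(?u)\b\w\w+\b"), "、" is a non-word character and acts as a delimiter, and the single-character tokens "あ", "い", "う", "え" are dropped because the pattern requires at least two word characters. Relaxing the pattern keeps them (a sketch):

vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w+\b")  # allow 1-character tokens
X = vectorizer.fit_transform(["あい猫、う", "あ、い、え"])
print(vectorizer.get_feature_names())
# ['あ', 'あい猫', 'い', 'う', 'え']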