
Notes on scikit-learn's CountVectorizer

Quick notes on CountVectorizer.

test.py
from sklearn.feature_extraction.text import CountVectorizer

corpus = ["ああ いい うう", "ああ いい ええ"]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)       # learn the vocabulary and build the document-term matrix
features = vectorizer.get_feature_names()  # removed in scikit-learn 1.2 (see the note below)

print(features)

print(type(X))
print(X)
print(X.shape)
print(X.toarray())

Output

['ああ', 'いい', 'うう', 'ええ']
<class 'scipy.sparse.csr.csr_matrix'>
  (0, 2)        1
  (0, 1)        1
  (0, 0)        1
  (1, 3)        1
  (1, 1)        1
  (1, 0)        1
(2, 4)  # (n_samples, n_features)
[[1 1 1 0]   # "ああ いい うう"
 [1 1 0 1]]  # "ああ いい ええ"
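
Note: get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2. On recent versions the equivalent is get_feature_names_out(), which returns a NumPy array rather than a list. A minimal sketch:

from sklearn.feature_extraction.text import CountVectorizer

corpus = ["ああ いい うう", "ああ いい ええ"]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

# scikit-learn >= 1.0
print(vectorizer.get_feature_names_out())  # ['ああ' 'いい' 'うう' 'ええ']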

transform

corpus = ["ああ いい うう", "ああ いい ええ"]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
features = vectorizer.get_feature_names()

print(features)
print(X.toarray())

new_doc = ["ああ いい うう ええ", "ええ おお"]
new_X = vectorizer.transform(new_doc)  # reuse the fitted vocabulary; unseen tokens are ignored

features = vectorizer.get_feature_names()
print(features)
print(new_X.toarray())

['ああ', 'いい', 'うう', 'ええ']
[[1 1 1 0]
 [1 1 0 1]]
['ああ', 'いい', 'うう', 'ええ']
[[1 1 1 1]
 [0 0 0 1]]  # 「おお」 is not in the fitted vocabulary, so it is ignored
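
The fitted vocabulary is exposed as the vocabulary_ attribute, a dict mapping each token to its column index. Checking it makes the behaviour above obvious: 「おお」 is simply not in the mapping, so transform() drops it.

print(vectorizer.vocabulary_)
# {'ああ': 0, 'いい': 1, 'うう': 2, 'ええ': 3}  (key order may vary)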

Example 2

What if the delimiter is ","?

corpus = ["ああ,いい,うう", "ああ,いい,ええ"]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
features = vectorizer.get_feature_names()

print(features)

['ああ', 'いい', 'うう', 'ええ']

What if the delimiter is ":"?

corpus = ["ああ:いい:うう", "ああ:いい:ええ"]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
features = vectorizer.get_feature_names()

print(features)

['ああ', 'いい', 'うう', 'ええ']

What about "、" as the delimiter?

corpus = ["ああ、いい、うう", "ああ、いい、ええ"]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
features = vectorizer.get_feature_names()

print(features)

['ああ', 'いい', 'うう', 'ええ']
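
All three delimiters give the same result because the default analyzer never actually splits on a delimiter: it extracts tokens with the regex token_pattern=r"(?u)\b\w\w+\b" (runs of two or more word characters), and punctuation such as ",", ":" and "、" merely acts as a word boundary. A side effect is that single-character tokens are discarded. If you want to keep them, one option (a sketch, not the only way) is to loosen the pattern:

corpus = ["あ、い、うう", "あ、い、ええ"]

# \w+ instead of \w\w+ keeps single-character tokens as well
vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w+\b")
X = vectorizer.fit_transform(corpus)
features = vectorizer.get_feature_names()

print(features)  # ['あ', 'い', 'うう', 'ええ']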

tokenizer

The tokenizer argument

def tokenize(t):
    print(t)  # show what the tokenizer receives (each whole document)
    return t  # returning a string, not a list of tokens

corpus = ["ああ、いい、うう", "ああ、いい、ええ"]

vectorizer = CountVectorizer(tokenizer=tokenize)
X = vectorizer.fit_transform(corpus)
features = vectorizer.get_feature_names()

print(features)

ああ、いい、うう
ああ、いい、ええ
['、', 'あ', 'い', 'う', 'え']

When the tokenizer returns a plain string, CountVectorizer iterates over it character by character, so every single character (including 「、」) becomes a token. Returning a list of tokens works as expected:
def tokenize(t):
    print(t)
    return t.split('猫')

corpus = ["ああ猫いい猫うう", "ああ猫いい猫ええ"]
vectorizer = CountVectorizer(tokenizer=tokenize)
X = vectorizer.fit_transform(corpus)
features = vectorizer.get_feature_names()

print(features)

ああ猫いい猫うう
ああ猫いい猫ええ
['ああ', 'いい', 'うう', 'ええ']

With single-character "words" and a tokenizer that returns the raw string, each character again becomes its own token:
def tokenize(t):
    print(t)
    return t

corpus = ["あ、い、う", "あ、い、え"]
vectorizer = CountVectorizer(tokenizer=tokenize)
X = vectorizer.fit_transform(corpus)
features = vectorizer.get_feature_names()

print(features)

あ、い、う
あ、い、え
['、', 'あ', 'い', 'う', 'え']

The same happens with a mixed corpus; 「猫」 also ends up as a single-character token:
def tokenize(t):
    print(t)
    return t

corpus = ["あい猫、う", "あ、い、え"]
vectorizer = CountVectorizer(tokenizer=tokenize)
X = vectorizer.fit_transform(corpus)
features = vectorizer.get_feature_names()

print(features)

あい猫、う
あ、い、え
['、', 'あ', 'い', 'う', 'え', '猫']
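
In short, whenever the tokenizer returns a string instead of a list, you get per-character tokens. To really split on 「、」, return the list from split() (a minimal sketch):

corpus = ["あ、い、う", "あ、い、え"]

# a custom tokenizer bypasses token_pattern, so single-character tokens survive
vectorizer = CountVectorizer(tokenizer=lambda t: t.split('、'))
X = vectorizer.fit_transform(corpus)
features = vectorizer.get_feature_names()

print(features)  # ['あ', 'い', 'う', 'え']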

What if no tokenizer is passed?

corpus = ["あい猫、う", "あ、い、え"]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
features = vectorizer.get_feature_names()

print(features)

['あい猫']
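
Only 「あい猫」 survives: the default token_pattern requires at least two consecutive word characters, so the single characters 「あ」「い」「う」「え」 are dropped, while the unbroken run 「あい猫」 counts as one token. Real Japanese text has no spaces between words, so in practice you would pass a morphological analyzer as the tokenizer. A sketch assuming the janome package is installed (MeCab would work the same way):

from janome.tokenizer import Tokenizer
from sklearn.feature_extraction.text import CountVectorizer

janome_tokenizer = Tokenizer()

def tokenize(text):
    # split the document into morphemes and return their surface forms
    return [token.surface for token in janome_tokenizer.tokenize(text)]

corpus = ["猫がいい", "猫はかわいい"]
vectorizer = CountVectorizer(tokenizer=tokenize)
X = vectorizer.fit_transform(corpus)

print(vectorizer.get_feature_names())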