test.py
from sklearn.feature_extraction.text import CountVectorizer
corpus = ["ああ いい うう", "ああ いい ええ"]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
features = vectorizer.get_feature_names()  # note: removed in scikit-learn 1.2; use get_feature_names_out() on newer versions
print(features)
print(type(X))
print(X)
print(X.shape)
print(X.toarray())
Output
['ああ', 'いい', 'うう', 'ええ']
<class 'scipy.sparse.csr.csr_matrix'>
(0, 2) 1
(0, 1) 1
(0, 0) 1
(1, 3) 1
(1, 1) 1
(1, 0) 1
(2, 4) # (number of samples, number of features)
[[1 1 1 0]   # "ああ いい うう"
 [1 1 0 1]]  # "ああ いい ええ"
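The fitted vectorizer also exposes a vocabulary_ attribute that maps each token to its column index, which makes the array above easier to read (expected output shown as a comment):
print(vectorizer.vocabulary_)  # token -> column index
# {'ああ': 0, 'いい': 1, 'うう': 2, 'ええ': 3}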
#transform
What happens if we call fit_transform again on a new corpus?
corpus = ["ああ いい うう", "ああ いい ええ"]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
features = vectorizer.get_feature_names()
print(features)
print(X.toarray())
new_doc = ["ああ いい うう ええ", "ええ おお"]
new_X = vectorizer.fit_transform(new_doc)
features = vectorizer.get_feature_names()
print(features)
print(new_X.toarray())
['ああ', 'いい', 'うう', 'ええ']
[[1 1 1 0]
[1 1 0 1]]
['ああ', 'いい', 'うう', 'ええ', 'おお']
[[1 1 1 1 0]
[0 0 0 1 1]]
#Example 2
What if the delimiter is ","?
corpus = ["ああ,いい,うう", "ああ,いい,ええ"]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
features = vectorizer.get_feature_names()
print(features)
['ああ', 'いい', 'うう', 'ええ']
What if the delimiter is ":"?
corpus = ["ああ:いい:うう", "ああ:いい:ええ"]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
features = vectorizer.get_feature_names()
print(features)
['ああ', 'いい', 'うう', 'ええ']
What if the delimiter is "、" (the Japanese comma)?
corpus = ["ああ、いい、うう", "ああ、いい、ええ"]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
features = vectorizer.get_feature_names()
print(features)
['ああ', 'いい', 'うう', 'ええ']
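All three delimiters give the same features because the default word analyzer does not split on any particular delimiter: it extracts tokens with the regex given by token_pattern, whose default is (?u)\b\w\w+\b (runs of two or more word characters), so punctuation simply never becomes part of a token. A quick check with the re module:
import re
print(re.findall(r"(?u)\b\w\w+\b", "ああ,いい,うう"))   # ['ああ', 'いい', 'うう']
print(re.findall(r"(?u)\b\w\w+\b", "ああ、いい、ええ"))  # ['ああ', 'いい', 'ええ']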
#tokenizer
The tokenizer argument
def tokenize(t):
    print(t)
    return t
corpus = ["ああ、いい、うう", "ああ、いい、ええ"]
vectorizer = CountVectorizer(tokenizer=tokenize)
X = vectorizer.fit_transform(corpus)
features = vectorizer.get_feature_names()
print(features)
ああ、いい、うう
ああ、いい、ええ
['、', 'あ', 'い', 'う', 'え']
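The tokenizer is expected to return a list of tokens, but this one returns the raw string. A str is itself iterable, so CountVectorizer ends up counting individual characters, including '、'. Compare:
print(list("ああ、いい、うう"))
# ['あ', 'あ', '、', 'い', 'い', '、', 'う', 'う']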
def tokenize(t):
    print(t)
    return t.split('猫')
corpus = ["ああ猫いい猫うう", "ああ猫いい猫ええ"]
vectorizer = CountVectorizer(tokenizer=tokenize)
X = vectorizer.fit_transform(corpus)
features = vectorizer.get_feature_names()
print(features)
ああ猫いい猫うう
ああ猫いい猫ええ
['ああ', 'いい', 'うう', 'ええ']
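Since splitting on '猫' works, the same idea handles the '、'-separated corpus from Example 2. A minimal sketch:
corpus = ["ああ、いい、うう", "ああ、いい、ええ"]
vectorizer = CountVectorizer(tokenizer=lambda t: t.split("、"))
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names())  # ['ああ', 'いい', 'うう', 'ええ']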
def tokenize(t):
    print(t)
    return t
corpus = ["あ、い、う", "あ、い、え"]
vectorizer = CountVectorizer(tokenizer=tokenize)
X = vectorizer.fit_transform(corpus)
features = vectorizer.get_feature_names()
print(features)
あ、い、う
あ、い、え
['、', 'あ', 'い', 'う', 'え']
def tokenize(t):
    print(t)
    return t
corpus = ["あい猫、う", "あ、い、え"]
vectorizer = CountVectorizer(tokenizer=tokenize)
X = vectorizer.fit_transform(corpus)
features = vectorizer.get_feature_names()
print(features)
あい猫、う
あ、い、え
['、', 'あ', 'い', 'う', 'え', '猫']
Note that when a custom tokenizer is supplied, token_pattern is not applied at all, which is why single characters and '、' survive as features in these examples.
What if no tokenizer is passed?
corpus = ["あい猫、う", "あ、い、え"]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
features = vectorizer.get_feature_names()
print(features)
['あい猫']
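This is the default token_pattern (?u)\b\w\w+\b at work again: a token needs at least two word characters, so the single characters 'あ', 'い', 'う', 'え' are all dropped and only 'あい猫' survives. To keep single-character tokens without writing a tokenizer, you can relax the pattern; a sketch, with the expected output as a comment:
vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w+\b")
X = vectorizer.fit_transform(["あい猫、う", "あ、い、え"])
print(vectorizer.get_feature_names())
# ['あ', 'あい猫', 'い', 'う', 'え']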