言語処理100本ノック第6章: 機械学習の俺の解答。その他の章はこちら。
50. データの入手・整形
UNIXコマンドでやってしまう。
$ cat newsCorpora.csv | grep -E $'\t(Reuters|Huffington Post|Businessweek|Contactmusic.com|Daily Mail)\t' | shuf > newsCorpora-RHBCD.csv
$ wc -l newsCorpora-RHBCD.csv
13356 newsCorpora-RHBCD.csv
$ expr 13356 / 10
1335
$ expr 13356 - 1335 "*" 2
10686
train, valid, testそれぞれ10686, 1335, 1335にする。
$ head -n 10686 newsCorpora-RHBCD.csv > newsCorpora-train.csv
$ tail -n 2670 newsCorpora-RHBCD.csv | head -n 1335 > newsCorpora-valid.csv
$ tail -n 1335 newsCorpora-RHBCD.csv > newsCorpora-test.csv
$ for x in train valid test ; do
cut -f 2 newsCorpora-$x.csv > $x-title.txt
cut -f 5 newsCorpora-$x.csv > $x-category.txt
paste $x-category.txt $x-title.txt > $x.txt
done
51-59
素性抽出はTfidfVectorizerにお任せでサボっている。scikit-learnの使い方を覚えること重視で。
59も、まずは流行りのlightgbmの使い方を覚えるところまで。ハイパーパラメータ自動調整ツールを使ってゴリゴリチューニングするところまでは手を出していない。
import argparse


def _parse_args():
    """Parse the command line; `problem` is the exercise number (50-59) to run."""
    ap = argparse.ArgumentParser()
    ap.add_argument('problem', type=int)
    return ap.parse_args()


args = _parse_args()
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Each split file holds "<category>\t<title>" rows; load as a 2-column
# string array per split (column 0 = label, column 1 = title).
dataset = {
    split: np.loadtxt(split + '.txt', encoding="utf-8", delimiter='\t',
                      dtype='unicode')
    for split in ('train', 'valid', 'test')
}

# Fit TF-IDF vocabulary/IDF on the training titles only, then vectorize
# every split with that fitted vocabulary.
vectorizer = TfidfVectorizer()
vectorizer.fit(dataset['train'][:, 1])

Xs = {split: vectorizer.transform(data[:, 1]) for split, data in dataset.items()}
Ys = {split: data[:, 0] for split, data in dataset.items()}
# 51: dump each split's TF-IDF feature matrix as a dense plain-text array.
if args.problem == 51:
    for split, feats in Xs.items():
        np.savetxt(split + '.feature.txt', feats.toarray())
    exit()
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from matplotlib import pyplot
from sklearn.linear_model import LogisticRegression
if args.problem < 58:
    # 52: train a logistic-regression classifier on the TF-IDF features.
    # Problems 53-57 all reuse this single fitted model.
    lr = LogisticRegression()
    lr.fit(Xs['train'], Ys['train'])
    if args.problem == 53:
        # 53: read one title from stdin and print the probability of each class.
        s = [input('Title: ')]
        X_input = vectorizer.transform(s)
        probs = lr.predict_proba(X_input)[0]
        for l, p in zip(lr.classes_, probs):
            print('%s %f' % (l, p))
        exit()
    elif args.problem == 57:
        # 57: for each class, show the 10 highest- and 10 lowest-weighted features.
        w_dict = dict(zip(lr.classes_, lr.coef_))
        fnames = vectorizer.get_feature_names_out()
        for c in lr.classes_:
            print(c)
            # Feature names are unique, so sorting the pairs directly is
            # equivalent to the former dict(zip(...)).items() round-trip.
            d = sorted(zip(fnames, w_dict[c]), key=lambda x: x[1], reverse=True)
            for f, w in d[:10] + d[-10:]:
                print('%f %s' % (w, f))
        exit()
    # 54-56: predict once on train and test, then report the requested metric.
    preds = {}
    for f_base in 'train', 'test':
        preds[f_base] = lr.predict(Xs[f_base])
    if args.problem == 54:
        # 54: accuracy (accuracy_score is already imported at module level).
        for f_base in 'train', 'test':
            print(f_base + ':')
            print(accuracy_score(Ys[f_base], preds[f_base]))
    elif args.problem == 55:
        # 55: confusion matrix.
        from sklearn.metrics import confusion_matrix
        for f_base in 'train', 'test':
            print(f_base + ':')
            print(confusion_matrix(Ys[f_base], preds[f_base]))
    elif args.problem == 56:
        # 56: per-class precision / recall / F1.
        for f_base in 'train', 'test':
            print(f_base + ':')
            print(classification_report(Ys[f_base], preds[f_base]))
# Branches for 52-57, 58 and 59 are mutually exclusive on args.problem,
# so a standalone `if` is equivalent to the original elif chain.
if args.problem == 58:
    # 58: sweep the inverse regularization strength C = 0.1 .. 1.0 and plot
    # accuracy on every split.
    splits = ['train', 'valid', 'test']
    cs = []
    acc = {split: [] for split in splits}
    for step in range(1, 11):
        c = 0.1 * step
        cs.append(c)
        model = LogisticRegression(C=c)
        model.fit(Xs['train'], Ys['train'])
        for split in splits:
            acc[split].append(accuracy_score(Ys[split], model.predict(Xs[split])))
    for split in splits:
        pyplot.plot(cs, acc[split])
    pyplot.legend(splits)
    pyplot.savefig('c-acc.png')
elif args.problem == 59:
    # 59: gradient-boosted trees with LightGBM, sweeping num_leaves and
    # plotting test-set accuracy for each setting.
    # LightGBM does not accept string labels, so build a label -> index map.
    # NOTE(review): `set` iteration order is arbitrary, so the index
    # assignment varies between runs; harmless here because predictions are
    # mapped back through `labels` before scoring.
    labels = list(set(Ys['train']))
    num_class = len(labels)
    labels_d = dict(zip(labels, range(num_class)))
    import lightgbm as lgb
    Y_int = {}
    lgb_datasets = {}
    for f_base in ('train', 'valid'):
        # Integer-encoded labels plus dense feature matrices for LightGBM.
        Y_int[f_base] = [labels_d[x] for x in Ys[f_base]]
        lgb_datasets[f_base] = lgb.Dataset(Xs[f_base].toarray(), Y_int[f_base])
    import sys
    n_leaves_range = range(3, 100)
    acc = []  # test accuracy per num_leaves value, parallel to n_leaves_range
    for n_leaves in n_leaves_range:
        print('num_leaves: %d' % n_leaves, file=sys.stderr)
        params = {
            'objective': 'multiclass',
            'num_class': num_class,
            'num_boost_round': 500,
            'num_leaves': n_leaves}
        # NOTE(review): lgb.train documents valid_sets as a *list* of
        # Datasets; a bare Dataset is passed here — confirm this version
        # accepts it.
        model_lgb = lgb.train(params, train_set=lgb_datasets['train'], valid_sets=lgb_datasets['valid'])
        # NOTE(review): without early stopping, best_iteration is not set,
        # in which case predict falls back to the full model — presumably
        # the intent; verify.
        pred = model_lgb.predict(Xs['test'], num_iteration=model_lgb.best_iteration)
        # Map each row's argmax class index back to its string label.
        Y_pred = [labels[np.argmax(y)] for y in pred]
        acc.append(accuracy_score(Ys['test'], Y_pred))
    pyplot.plot(n_leaves_range, acc)
    pyplot.savefig('59-num_leaves-acc.png')