More than 5 years have passed since last update.

素人の言語処理100本ノック:78

Last updated at 2017-05-03 / Posted at 2017-02-26

言語処理100本ノック 2015の挑戦記録です。環境はUbuntu 16.04 LTS ＋ Python 3.5.2 :: Anaconda 4.1.1 (64-bit)です。過去のノックの一覧はこちらからどうぞ。

第8章: 機械学習

本章では，Bo Pang氏とLillian Lee氏が公開しているMovie Review Dataのsentence polarity dataset v1.0を用い，文を肯定的（ポジティブ）もしくは否定的（ネガティブ）に分類するタスク（極性分析）に取り組む．

### 78. 5分割交差検定

76-77の実験では，学習に用いた事例を評価にも用いたため，正当な評価とは言えない．すなわち，分類器が訓練事例を丸暗記する際の性能を評価しており，モデルの汎化性能を測定していない．そこで，5分割交差検定により，極性分類の正解率，適合率，再現率，F1スコアを求めよ．

#### 出来上がったコード：

main.py

# coding: utf-8
import codecs
import snowballstemmer
import numpy as np

# Input/output file names and the encoding used to read the input files.
fname_sentiment = 'sentiment.txt'
fname_features = 'features.txt'
fname_result = 'result.txt'
fencoding = 'cp1252'		# apparently Windows-1252

division = 5			# number of folds the data is split into
learn_alpha = 6.0		# learning rate
learn_count = 1000		# number of learning iterations

# English Snowball stemmer used when extracting features from a sentence.
stemmer = snowballstemmer.stemmer('english')

# Stop word list, taken from the "CSV Format" section of
# http://xpo6.com/list-of-english-stop-words/
stop_words = (
	'a,able,about,across,after,all,almost,also,am,among,an,and,any,are,'
	'as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,'
	'either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,'
	'him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,'
	'likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,'
	'on,only,or,other,our,own,rather,said,say,says,she,should,since,so,'
	'some,than,that,the,their,them,then,there,these,they,this,tis,to,too,'
	'twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,'
	'will,with,would,yet,you,your').lower().split(',')

# Hashed copy of stop_words so that each membership test is a constant-time
# lookup instead of a linear scan of the list. stop_words itself stays a
# list so that existing users of the name are unaffected.
_stop_word_set = frozenset(stop_words)


def is_stopword(str):
	'''Return whether a word is a stop word.

	The comparison is case-insensitive.

	Args:
		str: The word to check. (The name shadows the builtin; it is kept
			unchanged so that keyword callers keep working.)

	Returns:
		True if the word is a stop word, False otherwise.
	'''
	return str.lower() in _stop_word_set


def hypothesis(data_x, theta):
	'''Hypothesis function of the logistic regression.

	Predicts data_y for data_x using theta by applying the sigmoid
	function to the product of data_x and theta.

	Returns:
		The predicted values, 1 / (1 + exp(-data_x.dot(theta))).
	'''
	z = data_x.dot(theta)
	return 1.0 / (1.0 + np.exp(-z))


def cost(data_x, theta, data_y):
	'''Objective (cost) function.

	Measures how far the predictions made for data_x with theta are from
	the correct labels data_y, using the log-loss averaged over the samples.

	Returns:
		The difference between the predictions and the correct labels.
	'''
	sample_count = data_y.size
	predicted = hypothesis(data_x, theta)
	ones = np.ones(sample_count)

	# Per-sample loss terms for the label and for its complement.
	label_term = -data_y * np.log(predicted)
	complement_term = (ones - data_y) * np.log(ones - predicted)

	return 1 / sample_count * np.sum(label_term - complement_term)


def gradient(data_x, theta, data_y):
	'''Compute the gradient used by the steepest-descent update.

	Returns:
		The gradient with respect to theta, one value per element of theta.
	'''
	sample_count = data_y.size
	# Difference between the predictions and the correct labels.
	residual = hypothesis(data_x, theta) - data_y
	return 1 / sample_count * residual.dot(data_x)


def extract_features(data, dict_features):
	'''Extract the features of one sentence as a 0/1 vector.

	The sentence is split on single spaces. Each word is stripped, dropped
	if it is a stop word, and then stemmed. If the stemmed word is a key of
	dict_features, the element at index dict_features[word] is set to 1.
	Element 0 is always 1; it is the input for the weight that does not
	correspond to any feature.

	Args:
		data: The sentence text.
		dict_features: Dictionary mapping a feature to its index.

	Returns:
		A float64 vector of length len(dict_features) + 1 in which element 0
		and the elements at the indices of the features found are 1.
	'''
	data_one_x = np.zeros(len(dict_features) + 1, dtype=np.float64)
	data_one_x[0] = 1		# constant input for the weight tied to no feature

	for word in data.split(' '):

		# Remove surrounding whitespace (e.g. the trailing newline)
		word = word.strip()

		# Skip stop words
		if is_stopword(word):
			continue

		# Stemming
		word = stemmer.stemWord(word)

		# Look the feature up explicitly instead of catching every exception:
		# words that are not in dict_features are ignored, while unrelated
		# errors are no longer silently swallowed.
		index = dict_features.get(word)
		if index is not None:
			data_one_x[index] = 1

	return data_one_x


def load_dict_features():
	'''Read features.txt and build the feature-to-index dictionary.

	The index is 1-based and equals the line number of the feature in
	features.txt.

	Returns:
		Dictionary that converts a feature into its index.
	'''
	feature_to_index = {}
	with codecs.open(fname_features, 'r', fencoding) as file_in:
		for line_number, line in enumerate(file_in, start=1):
			feature_to_index[line.strip()] = line_number
	return feature_to_index


def create_training_set(sentiments, dict_features):
	'''Build the training matrix and the polarity label vector.

	The training matrix has one row per review in sentiments and
	len(dict_features) + 1 columns. A column is 1 when the review contains
	the corresponding feature and 0 otherwise; the column index of a feature
	is dict_features[feature]. The first column is always 1 and is used to
	learn the weight that corresponds to no feature. Features that are not
	in dict_features are ignored.

	The label vector has one element per review: 1 for a positive review
	(line starting with '+1') and 0 otherwise.

	Returns:
		Training matrix, polarity label vector
	'''
	review_count = len(sentiments)
	column_count = len(dict_features) + 1

	# Both arrays start out filled with zeros
	data_x = np.zeros([review_count, column_count], dtype=np.float64)
	data_y = np.zeros(review_count, dtype=np.float64)

	for row, review in enumerate(sentiments):
		# The first two characters are the label; the text starts at offset 3
		label, text = review[:2], review[3:]
		data_x[row] = extract_features(text, dict_features)
		data_y[row] = 1 if label == '+1' else 0

	return data_x, data_y


def learn(data_x, data_y, alpha, count):
	'''Train the logistic regression by gradient descent.

	Starts from an all-zero theta and applies `count` updates of
	theta -= alpha * gradient. The cost is printed at the start, every
	100 iterations and at the end, together with E, the largest absolute
	adjustment applied to any element of theta in the latest update.

	Args:
		data_x: Training matrix (one row per sample).
		data_y: Label vector.
		alpha: Learning rate.
		count: Number of iterations. With 0 or less no update is made and
			the initial all-zero theta is returned.

	Returns:
		The learned theta.
	'''
	theta = np.zeros(data_x.shape[1])
	c = cost(data_x, theta, data_y)
	print('\t学習開始\tcost：{}'.format(c))

	# Defined up front so the final report below also works when the loop
	# body never runs (count <= 0); previously that raised NameError.
	i = 0
	grad = np.zeros_like(theta)

	for i in range(1, count + 1):

		grad = gradient(data_x, theta, data_y)
		theta -= alpha * grad

		# Report the cost and the largest adjustment of theta
		# (once every 100 iterations)
		if i % 100 == 0:
			c = cost(data_x, theta, data_y)
			e = np.max(np.absolute(alpha * grad))
			print('\t学習中(#{})\tcost：{}\tE:{}'.format(i, c, e))

	c = cost(data_x, theta, data_y)
	e = np.max(np.absolute(alpha * grad))
	print('\t学習完了(#{}) \tcost：{}\tE:{}'.format(i, c, e))
	return theta


def score(fname):
	'''Compute the scores from a result file.

	Reads the result file given by fname, whose lines are tab-separated as
	"correct label<TAB>predicted label<TAB>probability", and returns the
	accuracy, precision, recall and F1 score. Lines with fewer than three
	columns are skipped.

	A score whose denominator is zero (for example the precision when
	nothing was predicted as '+1', or every score for an empty file) is
	reported as 0.0 instead of raising ZeroDivisionError.

	Args:
		fname: Path of the result file.

	Returns:
		accuracy, precision, recall, F1 score
	'''
	# Read the results and count each outcome
	TP = 0		# True-Positive		predicted +1, correct is +1
	FP = 0		# False-Positive	predicted +1, correct is -1
	FN = 0		# False-Negative	predicted -1, correct is +1
	TN = 0		# True-Negative		predicted -1, correct is -1

	with open(fname) as data_file:
		for line in data_file:
			cols = line.split('\t')

			if len(cols) < 3:
				continue

			answer_positive = cols[0] == '+1'		# correct label
			predicted_positive = cols[1] == '+1'	# predicted label

			if answer_positive and predicted_positive:
				TP += 1
			elif answer_positive:
				FN += 1
			elif predicted_positive:
				FP += 1
			else:
				TN += 1

	# Compute the scores, guarding every division against a zero denominator
	total = TP + FP + FN + TN
	accuracy = (TP + TN) / total if total else 0.0
	precision = TP / (TP + FP) if TP + FP else 0.0
	recall = TP / (TP + FN) if TP + FN else 0.0
	if recall + precision:
		f1 = (2 * recall * precision) / (recall + precision)
	else:
		f1 = 0.0

	return accuracy, precision, recall, f1


# Load the feature dictionary
dict_features = load_dict_features()

# Load the labelled data
with codecs.open(fname_sentiment, 'r', fencoding) as file_in:
	sentiments_all = list(file_in)

# Split the labelled data into `division` folds of equal size.
# The fold count comes from `division` rather than a hard-coded 5, so the
# split stays consistent with the cross-validation loop below. The lines
# left over after the integer division are not used.
unit = len(sentiments_all) // division
sentiments = [
	sentiments_all[i * unit:(i + 1) * unit] for i in range(division)
]

# K-fold cross-validation: every fold is used once for validation while
# the remaining folds are used for learning. All predictions are written
# to a single result file.
with open(fname_result, 'w') as file_out:
	for i in range(division):

		print('{}/{}'.format(i + 1, division))

		# Split the folds into validation data and learning data
		data_validation = sentiments[i]
		data_learn = []
		for j in range(division):
			if j != i:
				data_learn += sentiments[j]

		# Build the training matrix and the polarity label vector
		data_x, data_y = create_training_set(data_learn, dict_features)

		# Learn
		theta = learn(data_x, data_y, alpha=learn_alpha, count=learn_count)

		# Validate
		for line in data_validation:

			# Extract the features
			data_one_x = extract_features(line[3:], dict_features)

			# Predict and write "correct label, predicted label, probability
			# of the predicted label"
			h = hypothesis(data_one_x, theta)
			if h > 0.5:
				file_out.write('{}\t{}\t{}\n'.format(line[0:2], '+1', h))
			else:
				file_out.write('{}\t{}\t{}\n'.format(line[0:2], '-1', 1 - h))

# Show the result
print('\n学習レート：{}\t学習繰り返し数：{}'.format(learn_alpha, learn_count))
accuracy, precision, recall, f1 = score(fname_result)
print('正解率　\t{}\n適合率　\t{}\n再現率　\t{}\nF1スコア　\t{}'.format(
	accuracy, precision, recall, f1
))

#### 実行結果：

実行結果

1/5
	学習開始	cost：0.6931471805599453
	学習中(#100)	cost：0.46843942718055814	E:0.006388382573910524
	学習中(#200)	cost：0.4155300488897057	E:0.003950176267083882
	学習中(#300)	cost：0.3855283848183693	E:0.002867235531957132
	学習中(#400)	cost：0.3648933651792237	E:0.0022495471367582247
	学習中(#500)	cost：0.3493282931816998	E:0.0018583498524543404
	学習中(#600)	cost：0.3369232080431452	E:0.0016771358183603987
	学習中(#700)	cost：0.32666634898652896	E:0.001528412108716516
	学習中(#800)	cost：0.31795919554061053	E:0.0014042508127869423
	学習中(#900)	cost：0.31041943220497686	E:0.0012990594970099315
	学習中(#1000)	cost：0.30378857681325766	E:0.0012088047599478039
	学習完了(#1000) 	cost：0.30378857681325766	E:0.0012088047599478039
2/5
	学習開始	cost：0.6931471805599453
	学習中(#100)	cost：0.4741687433335998	E:0.006589814822192543
	学習中(#200)	cost：0.42144780985764596	E:0.003908261118677938
	学習中(#300)	cost：0.3912183151335336	E:0.002804459291483359
	学習中(#400)	cost：0.370303379815077	E:0.0023610369221010326
	学習中(#500)	cost：0.354477846021314	E:0.0020514997491309413
	学習中(#600)	cost：0.3418460542105294	E:0.0018224684562050484
	学習中(#700)	cost：0.33139550986560584	E:0.001645643112098399
	学習中(#800)	cost：0.3225230456812948	E:0.0015047097369745835
	学習中(#900)	cost：0.31484124228803834	E:0.0013896119787524179
	学習中(#1000)	cost：0.3080871067835467	E:0.0012937962132790058
	学習完了(#1000) 	cost：0.3080871067835467	E:0.0012937962132790058
3/5
	学習開始	cost：0.6931471805599453
	学習中(#100)	cost：0.46891949543978517	E:0.006357216339527686
	学習中(#200)	cost：0.41580499264287696	E:0.003532830533162978
	学習中(#300)	cost：0.3854553165948075	E:0.0027301913427912735
	学習中(#400)	cost：0.3644760512004263	E:0.0022545615099526647
	学習中(#500)	cost：0.3485986820681382	E:0.001919021249806922
	学習中(#600)	cost：0.3359163761795678	E:0.0016705021198879075
	学習中(#700)	cost：0.32541428766128333	E:0.0014797071516709523
	学習中(#800)	cost：0.31648958311645375	E:0.0013367387334497819
	学習中(#900)	cost：0.3087557956043563	E:0.0012494627215075146
	学習中(#1000)	cost：0.3019508027016161	E:0.0011779206121903469
	学習完了(#1000) 	cost：0.3019508027016161	E:0.0011779206121903469
4/5
	学習開始	cost：0.6931471805599453
	学習中(#100)	cost：0.4725342546493931	E:0.006182597071964639
	学習中(#200)	cost：0.4194276723005623	E:0.0034649497530972943
	学習中(#300)	cost：0.38918298242842136	E:0.0025501444797361994
	学習中(#400)	cost：0.36832204557828535	E:0.0021388621069763788
	学習中(#500)	cost：0.3525543611131982	E:0.001855410065711756
	学習中(#600)	cost：0.33996964450743344	E:0.0016480855756071824
	学習中(#700)	cost：0.32955351095109425	E:0.0014898405345723522
	学習中(#800)	cost：0.32070420313966275	E:0.001365069555771408
	学習中(#900)	cost：0.3130363527272276	E:0.0012842751555114352
	学習中(#1000)	cost：0.3062888953703655	E:0.0012201511930926112
	学習完了(#1000) 	cost：0.3062888953703655	E:0.0012201511930926112
5/5
	学習開始	cost：0.6931471805599453
	学習中(#100)	cost：0.47367883038307196	E:0.006165844710913304
	学習中(#200)	cost：0.42196370471708444	E:0.0038294500786362744
	学習中(#300)	cost：0.39242868456409186	E:0.002903639748114128
	学習中(#400)	cost：0.3720216436950633	E:0.002348459481805761
	学習中(#500)	cost：0.3565815749862366	E:0.0019763223680587666
	学習中(#600)	cost：0.34425094991837796	E:0.001708469854933442
	学習中(#700)	cost：0.33404185010109005	E:0.0015059837001246833
	学習中(#800)	cost：0.32536765342218166	E:0.001357007404701798
	学習中(#900)	cost：0.3178523514158344	E:0.0012612117027114012
	学習中(#1000)	cost：0.3112409530842421	E:0.0011798784899874886
	学習完了(#1000) 	cost：0.3112409530842421	E:0.0011798784899874886

学習レート：6.0	学習繰り返し数：1000
正解率　	0.7483114446529081
適合率　	0.749058734939759
再現率　	0.7466691686995685
F1スコア　	0.7478620430410676

### 交差検定とは
学習結果の精度を検証する際、学習に使ったデータで検証しても、未知のデータに対する精度がわかりません。また、学習に使ったデータでの検証は、いわゆる教科書丸暗記で応用が効かないような学習（過学習とか呼びます）でも良い結果になってしまいます。そのため、学習に使っていないデータで検証する必要があります。これを交差検定と呼びます。

交差検定には一般的な方法がいくつかあるようです。今回の問題で指定されている「K-分割交差検定」は、データをK分割して、そのうちの1つを検証用、残りを学習用とする方法です。これを検証用のデータを切り替えながらK回繰り返し、結果を平均します。詳細については、技術評論社の「gihyo.jp」で公開されている連載「機械学習 はじめよう」の第21回（最終回）が分かりやすいです。

また、例のCourseraのMachine Learningでは別の方法をオススメしていました。学習用データと調整用のデータと検証用データの3つに分けて、学習用データで学習し、調整用のデータで調整し、最終的に検証用データで結果の精度を確認する方法です。こちらについてはkatsu1110さんのCoursera Machine Learning (6): 機械学習のモデル評価（交差検定、Bias & Variance、適合率 & 再現率）のまとめがわかりやすいです。

### 5分割交差検定の実装
単純にデータを5分割して実装しました。データは全部で10,662件あるので、割り切れずに余る2件は扱いが面倒なのでなかったことにしました^^;

なお、関数はすべてこれまでの問題で作ったものです。
スコアの算出についても問題77で作ったscore()をそのまま使うために、5回に分けて予測させた結果を問題76のフォーマットで1つのファイルに追記していき、出来あがった大きな1つの結果ファイルをscore()に食わせる形にしました。

### 精度の改善について
残念ながら75%弱まで下がってしまいました。ちょっと低い感じがしますが、とりあえず次の問題に進もうと思います。

なお、精度を改善するためには、素性を増やしたり、データを増やしたり、正則化のパラメータを追加したり、仮説関数の多項式を増やしたり、素性の抽出方法を見直したり...といろいろな手段があります。ただ、思いつきでいろいろ試しても時間を浪費するだけで、精度はなかなか改善できないそうです。

CourseraのMachine Learningでは、改善のためにまず精度が低い原因がオーバーフィット（学習データに依存しすぎている状態、高バリアンスとか呼びます）なのかアンダーフィット（学習データを活用しきれていない、高バイアス）なのかを見極めなさいとのことでした。これは正則化パラメータを導入すると判断できるようになります。その結果によって改善のアプローチが決まってきます。検証や改善についてはMachine LearningのWeek6で学ぶことができます。

　
79本目のノックは以上です。誤りなどありましたら、ご指摘いただけますと幸いです。

実行結果には、100本ノックで用いるコーパス・データで配布されているデータの一部が含まれます。

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up