- Support page: https://www.sbcr.jp/product/4797393965/
- 1st edition errata: https://s3-ap-northeast-1.amazonaws.com/sbcr-dl-tec/2018-12-12-93965-%E6%A9%9F%E6%A2%B0%E5%AD%A6%E7%BF%92%E3%81%AE%E3%82%A8%E3%83%83%E3%82%BB%E3%83%B3%E3%82%B9/errata.pdf
from IPython.display import HTML
HTML('''
<script>
code_show = true;
function code_toggle() {
    if (code_show) {
        $('div.input').hide();
    } else {
        $('div.input').show();
    }
    code_show = !code_show;
}
$(document).ready(code_toggle);
</script>
<form action="javascript:code_toggle()">
<input type="submit" value="Toggle code">
</form>
''')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.rcParams['font.size']=15
def plt_legend_out():
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0)
Linear Regression

Two-dimensional case

In $y=ax+b$, the $x$ and $y$ values are known; we seek the $a$ and $b$ that minimize the squared error $E=\sum_{i=1}^n(ax_i+b-y_i)^2$. Solving $\partial E/\partial a=0$ and $\partial E/\partial b=0$ gives:
\begin{eqnarray}
a &=&
\left (\sum_{i=1}^nx_iy_i-\frac{1}{n}\sum_{i=1}^nx_i\sum_{i=1}^ny_i \right ) \Big/
\left (\sum_{i=1}^nx_i^2-\frac{1}{n}\left (\sum_{i=1}^nx_i\right )^2 \right )\\
b &=& \frac{1}{n}\left (\sum_{i=1}^ny_i-a\sum_{i=1}^nx_i \right )
\end{eqnarray}
| Formula ($\sum$ notation) | Formula (vector notation) | Code |
|---|---|---|
| $\sum_{i=1}^nx_iy_i$ | $x^Ty$ | np.dot(x, y) |
| $\frac{1}{n}\sum_{i=1}^nx_i\sum_{i=1}^ny_i$ | $\frac{1}{n}(\mathbf{1}^Tx)(\mathbf{1}^Ty)$ | x.sum() * y.sum() / n |
| $\sum_{i=1}^nx_i^2$ | $x^Tx$ | (x**2).sum() |
| $\frac{1}{n}\left(\sum_{i=1}^nx_i\right)^2$ | $\frac{1}{n}(\mathbf{1}^Tx)^2$ | x.sum()**2 / n |
def reg1dim2(x, y):
    # closed-form least-squares fit of y = a*x + b
    n = len(x)
    a = ((np.dot(x, y) - y.sum() * x.sum() / n) /
         ((x**2).sum() - x.sum()**2 / n))
    b = (y.sum() - a * x.sum()) / n
    return a, b
x = np.array([1, 2, 4, 6, 7])
y = np.array([1, 3, 3, 5, 4])
a, b = reg1dim2(x, y)
plt.scatter(x, y, color="k")
xmax = x.max()
plt.plot([0, xmax], [b, a * xmax + b], color="k")
plt.xlabel('x')
plt.ylabel('y')
plt.show()
Multidimensional case

In the multidimensional case,
$y=w^T\tilde{x}$
where $\tilde{x}$ is the feature vector with a constant 1 prepended. Stacking all samples into the matrix $\tilde{X}$, this is rewritten as
$\hat{y}(w)=\tilde{X}w$
Compute the squared error:
\begin{eqnarray}
E(\color{red}{w}) &=& ||y-\tilde{X}\color{red}{w}||^2\\
&=& (y-\tilde{X}\color{red}{w})^T(y-\tilde{X}\color{red}{w})\\
&=& y^Ty-\color{red}{w^T}\tilde{X}^Ty-y^T\tilde{X}\color{red}{w}+\color{red}{w^T}\tilde{X}^T\tilde{X}\color{red}{w}
\end{eqnarray}
Solve for $\color{red}{w}$:
\begin{eqnarray}
\nabla E = -2\tilde{X}^Ty+2\tilde{X}^T\tilde{X}\color{red}{w} &=& 0\\
\tilde{X}^Ty &=& \tilde{X}^T\tilde{X}\color{red}{w}\\
\color{red}{w} &=& (\tilde{X}^T\tilde{X})^{-1}\tilde{X}^Ty
\end{eqnarray}
from scipy import linalg

class LinearRegression:
    def __init__(self):
        self.w_ = None

    def fit(self, X, t):
        # augment X with a column of ones and solve the normal equations
        Xtil = np.c_[np.ones(X.shape[0]), X]
        A = np.dot(Xtil.T, Xtil)
        b = np.dot(Xtil.T, t)
        self.w_ = linalg.solve(A, b)

    def predict(self, X):
        if X.ndim == 1:
            X = X.reshape(1, -1)
        Xtil = np.c_[np.ones(X.shape[0]), X]
        return np.dot(Xtil, self.w_)
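As a sanity check, the normal-equation solution can be compared with NumPy's built-in least-squares solver on some made-up data; the two should agree up to numerical precision.

# compare the closed-form solution with np.linalg.lstsq (check on synthetic data)
X_chk = np.random.RandomState(0).random((30, 3))
y_chk = X_chk @ np.array([1.0, 2.0, 3.0]) + 0.5
model_chk = LinearRegression()
model_chk.fit(X_chk, y_chk)
Xtil_chk = np.c_[np.ones(X_chk.shape[0]), X_chk]
w_lstsq, *_ = np.linalg.lstsq(Xtil_chk, y_chk, rcond=None)
print(np.allclose(model_chk.w_, w_lstsq))  # expect True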
from mpl_toolkits.mplot3d import axes3d
n = 100
scale = 10
np.random.seed(0)
X = np.random.random((n, 2)) * scale
w0 = 1
w1 = 2
w2 = 3
y = w0 + w1 * X[:, 0] + w2 * X[:, 1] + np.random.randn(n)
model = LinearRegression()
model.fit(X, y)
# print("係数:", model.w_)
# print("(1, 1)に対する予測値:", model.predict(np.array([1, 1])))
xmesh, ymesh = np.meshgrid(np.linspace(0, scale, 20),
np.linspace(0, scale, 20))
zmesh = (model.w_[0] + model.w_[1] * xmesh.ravel() +
model.w_[2] * ymesh.ravel()).reshape(xmesh.shape)
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X[:, 0], X[:, 1], y, color="k")
ax.plot_wireframe(xmesh, ymesh, zmesh, color="r")
plt.xlabel('x')
plt.ylabel('y')
ax.set_zlabel('z')
plt.show()
A practical example
import numpy as np
import csv
# load the data
Xy = []
with open("winequality-red.csv") as fp:
    for row in csv.reader(fp, delimiter=";"):
        Xy.append(row)
Xy = np.array(Xy[1:], dtype=np.float64)
# split into training and test sets
np.random.seed(0)
np.random.shuffle(Xy)
train_X = Xy[:-1000, :-1]
train_y = Xy[:-1000, -1]
test_X = Xy[-1000:, :-1]
test_y = Xy[-1000:, -1]
# fit the model
model = LinearRegression()
model.fit(train_X, train_y)
# apply the model to the test data
y = model.predict(test_X)
print("最初の5つの正解と予測値:")
for i in range(5):
print("{:1.0f} {:5.3f}".format(test_y[i], y[i]))
print()
print("RMSE:", np.sqrt(((test_y - y)**2).mean()))
First 5 ground truths and predictions:
7 6.012
6 5.734
5 5.285
8 6.352
5 5.414
RMSE: 0.6724248548465226
plt.figure(figsize=(4,4))
plt.scatter(test_y, y)
plt.xticks(np.arange(2,10,1))
plt.yticks(np.arange(2,10,1))
plt.xlim(2,9)
plt.ylim(2,9)
plt.xlabel('observed')
plt.ylabel('predicted')
plt.plot(np.arange(2,10,1),np.arange(2,10,1),color='gray',lw=0.5)
plt.grid()
plt.show()
Ridge Regression

Introduce the penalty term $\lambda ||\color{red}{w}||^2$, so that solutions with a smaller $||\color{red}{w}||^2$ are preferred; here $||\cdot||$ is the L2 norm.
$E(\color{red}{w})=||y-\tilde{X}\color{red}{w}||^2+\lambda ||\color{red}{w}||^2$
Solve for $\color{red}{w}$:
\begin{eqnarray}
\nabla E=2\tilde{X}^T\tilde{X}\color{red}{w}-2\tilde{X}^Ty+2\lambda \color{red}{w} &=&0\\
2[(\tilde{X}^T\tilde{X}+\lambda I)\color{red}{w}-\tilde{X}^Ty] &=& 0\\
\color{red}{w} &=& (\tilde{X}^T\tilde{X}+\lambda I)^{-1}\tilde{X}^T y
\end{eqnarray}
from scipy import linalg

class RidgeRegression:
    def __init__(self, lambda_=1.):
        self.lambda_ = lambda_
        self.w_ = None

    def fit(self, X, t):
        Xtil = np.c_[np.ones(X.shape[0]), X]
        c = np.eye(Xtil.shape[1])
        A = np.dot(Xtil.T, Xtil) + self.lambda_ * c
        b = np.dot(Xtil.T, t)
        self.w_ = linalg.solve(A, b)

    def predict(self, X):
        Xtil = np.c_[np.ones(X.shape[0]), X]
        return np.dot(Xtil, self.w_)
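With $\lambda = 0$ the penalty vanishes and ridge regression should reduce to ordinary least squares. A minimal sketch checking this against the LinearRegression class defined above, on made-up data:

# with lambda = 0, RidgeRegression should reproduce the OLS coefficients
X_chk = np.random.RandomState(1).random((20, 3))
y_chk = X_chk @ np.array([1.0, 2.0, 3.0]) + 0.5
ols = LinearRegression()
ols.fit(X_chk, y_chk)
ridge0 = RidgeRegression(0.)
ridge0.fit(X_chk, y_chk)
print(np.allclose(ols.w_, ridge0.w_))  # expect True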
x = np.array([1, 2, 4, 6, 7])
y = np.array([1, 3, 3, 5, 4])
plt.scatter(x, y, color="k")
# fit and plot for several values of lambda
for par in [0, 1, 10, 50]:
    model = RidgeRegression(par)
    model.fit(x, y)
    b, a = model.w_
    xmax = x.max()
    plt.plot([0, xmax], [b, b + a * xmax], label='$\lambda$ = ' + str(par))
plt.xlabel('x')
plt.ylabel('y')
plt_legend_out()
plt.show()
When outliers are present
x = np.arange(12)
y = 1 + 2 * x
y[2] = 20
y[4] = 0
plt.scatter(x,y,color='k')
# fit and plot for several values of lambda (including a negative one)
for par in [0, 1, 10, -10]:
    model = RidgeRegression(par)
    model.fit(x, y)
    b, a = model.w_
    xmax = x.max()
    plt.plot([0, xmax], [b, b + a * xmax], label='$\lambda$ = ' + str(par))
plt.xlabel('x')
plt.ylabel('y')
plt_legend_out()
plt.show()
x = np.arange(12)
y = 1 + 2 * x
y[2] = 20
y[4] = 0
xmin = 0
xmax = 12
ymin = -1
ymax = 25
fig, axes = plt.subplots(nrows=2, ncols=5)
for i in range(5):
    axes[0, i].set_xlim([xmin, xmax])
    axes[0, i].set_ylim([ymin, ymax])
    axes[1, i].set_xlim([xmin, xmax])
    axes[1, i].set_ylim([ymin, ymax])
    axes[0, i].tick_params(bottom=False, labelbottom=False)
    if i > 0:
        axes[0, i].tick_params(left=False, labelleft=False)
        axes[1, i].tick_params(left=False, labelleft=False)
    # use only the first 2 + 2*i points
    xx = x[: 2 + i * 2]
    yy = y[: 2 + i * 2]
    axes[0, i].scatter(xx, yy, color="k")
    axes[1, i].scatter(xx, yy, color="k")
    # top row: ordinary least squares
    model = LinearRegression()
    model.fit(xx, yy)
    xs = [xmin, xmax]
    ys = [model.w_[0] + model.w_[1] * xmin, model.w_[0] + model.w_[1] * xmax]
    axes[0, i].plot(xs, ys, color="red")
    # bottom row: ridge regression
    model = RidgeRegression(10.)
    model.fit(xx, yy)
    ys = [model.w_[0] + model.w_[1] * xmin, model.w_[0] + model.w_[1] * xmax]
    axes[1, i].plot(xs, ys, color="red")
    axes[1, i].set_xlabel('x')
axes[0, 0].set_ylabel('ols')
axes[1, 0].set_ylabel('ridge\n($\lambda$ = 10)')
plt.show()
Polynomial regression and generalization performance
class PolynomialRegression:
    def __init__(self, degree):
        self.degree = degree

    def fit(self, x, y):
        # build the design matrix [x, x^2, ..., x^degree]
        x_pow = []
        xx = x.reshape(len(x), 1)
        for i in range(1, self.degree + 1):
            x_pow.append(xx**i)
        mat = np.concatenate(x_pow, axis=1)
        # delegate to linear regression on the powers of x
        linreg = LinearRegression()
        linreg.fit(mat, y)
        self.w_ = linreg.w_

    def predict(self, x):
        r = 0
        for i in range(self.degree + 1):
            r += x**i * self.w_[i]
        return r
# generate data
np.random.seed(0)
def f(x):
    return 1 + 2 * x
x = np.random.random(10) * 10
y = f(x) + np.random.randn(10)
# polynomial regression (degree 10 is high enough to overfit 10 points)
model = PolynomialRegression(10)
model.fit(x, y)
plt.scatter(x, y, color="k")
plt.ylim([y.min() - 1, y.max() + 1])
xx = np.linspace(x.min(), x.max(), 300)
yy = np.array([model.predict(u) for u in xx])
plt.plot(xx, yy, color="k")
# linear regression for comparison
model = LinearRegression()
model.fit(x, y)
b, a = model.w_
x1 = x.min() - 1
x2 = x.max() + 1
plt.plot([x1, x2], [a * x1 + b, a * x2 + b], color="k", linestyle="dashed")
plt.xlabel('x')
plt.ylabel('y')
plt.show()
/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:11: LinAlgWarning: Ill-conditioned matrix (rcond=2.60099e-30): result may not be accurate.
The following is the expected squared error, where $\color{red}{\hat{f}_D(x)}$ is the prediction of a model trained on dataset $D$. The first term is the squared bias, the squared difference between the true value and the expected prediction; the second is the variance, the spread of the predictions as $D$ varies.
\begin{eqnarray}
E_D[(f(x)-\color{red}{\hat{f}_D(x)})^2]=(f(x)-E_D[\color{red}{\hat{f}_D(x)}])^2+E_D[(\color{red}{\hat{f}_D(x)}-E_D[\color{red}{\hat{f}_D(x)}])^2]
\end{eqnarray}
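This identity is easy to check numerically. A minimal sketch: at a fixed point $x$, take $\hat{f}_D(x)$ to be the mean of 5 noisy observations of $f(x)$ (an illustrative choice of estimator, not from the book), and compare the two sides over many datasets $D$.

rng = np.random.RandomState(0)
f_x = 2.0                                     # true value f(x)
fhat = np.array([(f_x + rng.randn(5)).mean()  # one prediction per dataset D
                 for _ in range(100000)])
mse = ((f_x - fhat)**2).mean()                # E_D[(f(x) - fhat_D(x))^2]
bias2 = (f_x - fhat.mean())**2                # squared bias
var = fhat.var()                              # variance
print(mse, bias2 + var)                       # the two sides agree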
Algorithm

- Repeat the following $n$ times:
  - Draw 5 points at random in the range $0\leq x \leq 5$ and evaluate $f(x)=1/(1+x)$.
  - Fit a linear or polynomial regression to them.
  - Compute the prediction at each $x$.
- Average the predictions over the $n$ trials.

In the figure below, $n=1$ looks somewhat overfit, but generalization appears to improve as trials are averaged, and the linear model generalizes better than the polynomial one.
import warnings
def f(x):
    return 1 / (1 + x)

def sample(n):
    x = np.random.random(n) * 5
    y = f(x)
    return x, y
xx = np.arange(0, 5, 0.1)
np.random.seed(0)
y_true = f(xx)
warnings.filterwarnings("ignore")
bvmax = 0.5
bvmin = -0.05
ymax = 1.5
ymin = 0
##########
plt.figure(figsize=(10,7))
plt.subplots_adjust(hspace=0.1,wspace=0.1)
##########
plt.subplot(321)
plt.scatter(xx, f(xx), label="truth",color="k", s=2)
ns = [1,10,20,50]
c = sns.color_palette('husl',len(ns))
lb = [] ; lv = []
for i in range(len(ns)):
    n = ns[i]
    # reset the accumulators for each n so runs do not leak between curves
    y_lin_sum = np.zeros(len(xx))
    y_lin_sum_sq = np.zeros(len(xx))
    for _ in range(n):
        x, y = sample(5)
        lin = LinearRegression()
        lin.fit(x, y)
        y_lin = lin.predict(xx.reshape(-1, 1))
        y_lin_sum += y_lin
        y_lin_sum_sq += y_lin**2
    plt.plot(xx, y_lin_sum / n, label="n = " + str(n), color=c[i], alpha=0.8)
    lb.append((y_lin_sum / n - y_true)**2)            # squared bias estimate
    lv.append(y_lin_sum_sq / n - (y_lin_sum / n)**2)  # variance estimate
plt.tick_params(bottom=False,labelbottom=False)
plt.title('linear reg.')
plt.ylabel('y')
plt.ylim(ymin,ymax)
##########
plt.subplot(323)
for i in range(len(ns)):
    plt.plot(xx, lb[i], color=c[i])
plt.ylabel('bias')
plt.ylim(bvmin,bvmax)
plt.tick_params(bottom=False,labelbottom=False)
##########
plt.subplot(325)
for i in range(len(ns)):
    plt.plot(xx, lv[i], color=c[i])
plt.ylabel('variance')
plt.xlabel('x')
plt.ylim(bvmin,bvmax)
##########
plt.subplot(322)
plt.scatter(xx, f(xx), label="truth",color="k", s=2)
ns = [1,10,20,50]
c = sns.color_palette('husl',len(ns))
lb = [] ; lv = []
for i in range(len(ns)):
    n = ns[i]
    # reset the accumulators for each n so runs do not leak between curves
    y_poly_sum = np.zeros(len(xx))
    y_poly_sum_sq = np.zeros(len(xx))
    for _ in range(n):
        x, y = sample(5)
        poly = PolynomialRegression(4)
        poly.fit(x, y)
        y_poly = poly.predict(xx)
        y_poly_sum += y_poly
        y_poly_sum_sq += y_poly**2
    plt.plot(xx, y_poly_sum / n, label="n = " + str(n), color=c[i], alpha=0.8)
    lb.append((y_poly_sum / n - y_true)**2)             # squared bias estimate
    lv.append(y_poly_sum_sq / n - (y_poly_sum / n)**2)  # variance estimate
plt_legend_out()
plt.title('polynomial reg.')
plt.tick_params(bottom=False,labelbottom=False,left=False,labelleft=False)
plt.ylim(ymin,ymax)
##########
plt.subplot(324)
for i in range(len(ns)):
    plt.plot(xx, lb[i], color=c[i])
plt.ylim(bvmin,bvmax)
plt.tick_params(bottom=False,labelbottom=False,left=False,labelleft=False)
##########
plt.subplot(326)
for i in range(len(ns)):
    plt.plot(xx, lv[i], color=c[i])
plt.xlabel('x')
plt.ylim(bvmin,bvmax)
plt.tick_params(left=False,labelleft=False)
plt.show()
Lasso Regression

$\phi=\frac{1}{2}||y-\tilde{X}w||^2+\lambda |w|_1$
where $|w|_1$ is the L1 norm:
$|w|_1=\sum_{i=1}^d|w_i|$
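The Lasso class below minimizes this objective by coordinate descent. For reference, the update it applies to coordinate $k$ (holding the other coordinates fixed) has the form
\begin{eqnarray}
q_k &=& x_k^T\left(t-w_0\mathbf{1}-\sum_{j\neq k}w_jx_j\right),\qquad r_k=x_k^Tx_k\\
w_k &\leftarrow& S(q_k/r_k,\ \lambda)
\end{eqnarray}
where $x_k$ is the $k$-th column of $X$, $t$ is the target vector, and $S$ is the soft-thresholding operator implemented below; this summarizes what the _update method computes rather than giving a full derivation.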
import numpy as np

def soft_thresholding(x, y):
    # shrink |x| by y, clamping to 0 when |x| <= y
    return np.sign(x) * max(abs(x) - y, 0)
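# Quick illustration (added for clarity): with threshold 1.0, inputs inside
# [-1, 1] are zeroed and inputs outside shrink toward 0 by the threshold.
for v in [-2.0, -0.5, 0.0, 0.5, 2.0]:
    print(v, soft_thresholding(v, 1.0))   # -> -1.0, -0.0, 0.0, 0.0, 1.0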
class Lasso:
    def __init__(self, lambda_, tol=0.0001, max_iter=1000):
        self.lambda_ = lambda_
        self.tol = tol
        self.max_iter = max_iter
        self.w_ = None

    def fit(self, X, t):
        n, d = X.shape
        self.w_ = np.zeros(d + 1)
        avgl1 = 0.
        for _ in range(self.max_iter):
            avgl1_prev = avgl1
            self._update(n, d, X, t)
            # stop when the mean absolute weight stops changing
            avgl1 = np.abs(self.w_).sum() / self.w_.shape[0]
            if abs(avgl1 - avgl1_prev) <= self.tol:
                break

    def _update(self, n, d, X, t):
        self.w_[0] = (t - np.dot(X, self.w_[1:])).sum() / n
        w0vec = np.ones(n) * self.w_[0]
        for k in range(d):
            ww = self.w_[1:]   # view into self.w_
            ww[k] = 0          # exclude coordinate k from the residual
            q = np.dot(t - w0vec - np.dot(X, ww), X[:, k])
            r = np.dot(X[:, k], X[:, k])
            self.w_[k + 1] = soft_thresholding(q / r, self.lambda_)

    def predict(self, X):
        if X.ndim == 1:
            X = X.reshape(X.shape[0], 1)
        Xtil = np.c_[np.ones(X.shape[0]), X]
        return np.dot(Xtil, self.w_)
# load the data, keeping the header row for the coefficient plot below
rows = []
with open("winequality-red.csv") as fp:
    for row in csv.reader(fp, delimiter=";"):
        rows.append(row)
varname = ['intercept'] + rows[0][:-1]   # names for the entries of w_
Xy = np.array(rows[1:], dtype=np.float64)
# split into training and test sets
np.random.seed(0)
np.random.shuffle(Xy)
train_X = Xy[:-1000, :-1]
train_y = Xy[:-1000, -1]
test_X = Xy[-1000:, :-1]
test_y = Xy[-1000:, -1]
ll = [] ; lc = [] ; lm = []
# fit with several hyperparameter values and collect the results
for lambda_ in [1., 0.1, 0.01]:
    model = Lasso(lambda_)
    model.fit(train_X, train_y)
    y = model.predict(test_X)
    mse = ((y - test_y)**2).mean()
    ll.append(lambda_)
    lm.append(mse)
    lc.append(model.w_)
df_tmp1 = pd.DataFrame({'lambda':ll,'mse':lm})
df_tmp2 = pd.DataFrame(lc)
df_tmp2.columns = varname
df_tmp = pd.concat([df_tmp1,df_tmp2],axis=1)
df_plot = df_tmp.melt(id_vars=['lambda','mse'])
sns.barplot(data=df_tmp1,x='lambda',y='mse')
plt.show()
sns.barplot(data=df_plot,x='value',y='variable',hue='lambda')
plt.axvline(x=0,lw=0.5,color='k')
plt_legend_out()
Logistic Regression

The implementation below fits the weights by iteratively reweighted least squares (Newton's method), thresholding the predicted probability at 0.5.
import numpy as np
from scipy import linalg

THRESHMIN = 1e-10

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

class LogisticRegression:
    def __init__(self, tol=0.001, max_iter=3, random_seed=0):
        self.tol = tol
        self.max_iter = max_iter
        self.random_state = np.random.RandomState(random_seed)
        self.w_ = None

    def fit(self, X, y):
        self.w_ = self.random_state.randn(X.shape[1] + 1)
        Xtil = np.c_[np.ones(X.shape[0]), X]
        diff = np.inf
        w_prev = self.w_
        n_iter = 0
        while diff > self.tol and n_iter < self.max_iter:
            yhat = sigmoid(np.dot(Xtil, self.w_))
            # clip the weights r away from 0 to keep 1/r finite
            r = np.clip(yhat * (1 - yhat),
                        THRESHMIN, np.inf)
            XR = Xtil.T * r
            XRX = np.dot(Xtil.T * r, Xtil)
            w_prev = self.w_
            b = np.dot(XR, np.dot(Xtil, self.w_) -
                       1 / r * (yhat - y))
            self.w_ = linalg.solve(XRX, b)
            diff = abs(w_prev - self.w_).mean()
            n_iter += 1

    def predict(self, X):
        Xtil = np.c_[np.ones(X.shape[0]), X]
        yhat = sigmoid(np.dot(Xtil, self.w_))
        return np.where(yhat > .5, 1, 0)
import csv
import numpy as np
n_test = 100
X = []
y = []
with open("wdbc.data") as fp:
    for row in csv.reader(fp):
        # column 1 is the diagnosis: B = benign (0), otherwise malignant (1)
        if row[1] == "B":
            y.append(0)
        else:
            y.append(1)
        X.append(row[2:])
y = np.array(y, dtype=np.float64)
X = np.array(X, dtype=np.float64)
y_train = y[:-n_test]
X_train = X[:-n_test]
y_test = y[-n_test:]
X_test = X[-n_test:]
model = LogisticRegression(tol=0.01)
model.fit(X_train, y_train)
y_predict = model.predict(X_test)
n_hits = (y_test == y_predict).sum()
print("Accuracy: {}/{} = {}".format(n_hits, n_test, n_hits / n_test))
Accuracy: 97/100 = 0.97