Regression with NumPy (notes)

Posted at 2020-05-17
# Notebook helper: a button that shows/hides the code cells
from IPython.display import HTML

HTML('''
<script>  
code_show=true; 
function code_toggle() {
  if (code_show){
    $(\'div.input\').hide();
  } else {
    $(\'div.input\').show();
  }
  code_show = !code_show
}  
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()">
    <input type="submit" value="Toggle code">
</form>
''')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.rcParams['font.size']=15

def plt_legend_out():
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0)

Linear regression

Two dimensions

In $y=ax+b$, the $x$ and $y$ values are known. We find the $a$ and $b$ that minimize the squared error $E=\sum_{i=1}^n(ax_i+b-y_i)^2$. Solving $\partial E/\partial a=0$ and $\partial E/\partial b=0$ yields:

\begin{eqnarray}
a &=& 
\left (\sum_{i=1}^nx_iy_i-\frac{1}{n}\sum_{i=1}^nx_i\sum_{i=1}^ny_i \right ) \Big/
\left (\sum_{i=1}^nx_i^2-\frac{1}{n}\left (\sum_{i=1}^nx_i\right )^2 \right )\\
b &=& \frac{1}{n}\left (\sum_{i=1}^ny_i-a\sum_{i=1}^nx_i\right )
\end{eqnarray}
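For reference, these formulas come from the stationarity conditions:

\begin{eqnarray}
\frac{\partial E}{\partial a} &=& 2\sum_{i=1}^n x_i(ax_i+b-y_i) = 0\\
\frac{\partial E}{\partial b} &=& 2\sum_{i=1}^n (ax_i+b-y_i) = 0
\end{eqnarray}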
Each term maps to NumPy code as follows:

|Formula ($\sum$ notation)|Formula (vector notation)|Code|
|---|---|---|
|$\sum_{i=1}^nx_iy_i$|$x^Ty$|np.dot(x,y)|
|$\frac{1}{n}\sum_{i=1}^nx_i\sum_{i=1}^ny_i$||y.sum() * x.sum() / n|
|$\sum_{i=1}^nx_i^2$||(x**2).sum()|
|$\frac{1}{n}(\sum_{i=1}^nx_i)^2$||x.sum()**2 / n|
def reg1dim2(x, y):
    # Closed-form least squares for y = a*x + b
    n = len(x)
    a = ((np.dot(x, y) - y.sum() * x.sum() / n) /
         ((x**2).sum() - x.sum()**2 / n))
    b = (y.sum() - a * x.sum()) / n
    return a, b

x = np.array([1, 2, 4, 6, 7])
y = np.array([1, 3, 3, 5, 4])
a, b = reg1dim2(x, y)

plt.scatter(x, y, color="k")
xmax = x.max()
plt.plot([0, xmax], [b, a * xmax + b], color="k")
plt.xlabel('x')
plt.ylabel('y')
plt.show()
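As a quick sanity check (assuming np.polyfit as a reference; it fits the same least-squares line), the closed-form coefficients should agree with NumPy's:

# Cross-check the closed form against np.polyfit (degree-1 least squares).
# np.polyfit returns coefficients highest power first: [slope, intercept].
print(a, b)
print(np.polyfit(x, y, 1))  # should match (a, b) up to floating-point error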

[Figure: scatter of the data with the fitted line]

Multiple dimensions

In the multidimensional case, the model for one sample is

$y=w^T\tilde{x}$

where $\tilde{x}$ is the feature vector with a 1 prepended, so the first element of $w$ is the intercept. Stacking all samples as rows, this can be rewritten as

$\hat{y}(w)=\tilde{X}w$
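Explicitly, the design matrix carries the column of ones:

\begin{eqnarray}
\tilde{X} &=& \begin{pmatrix}
1 & x_{11} & \cdots & x_{1d}\\
\vdots & \vdots & & \vdots\\
1 & x_{n1} & \cdots & x_{nd}
\end{pmatrix}
\end{eqnarray}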

Compute the squared error:

\begin{eqnarray}
E(\color{red}{w}) &=& ||y-\tilde{X}\color{red}{w}||^2\\
&=& (y-\tilde{X}\color{red}{w})^T(y-\tilde{X}\color{red}{w})\\
&=& y^Ty-\color{red}{w^T}\tilde{X}^Ty-y^T\tilde{X}\color{red}{w}+\color{red}{w^T}\tilde{X}^T\tilde{X}\color{red}{w}
\end{eqnarray}

Solve for $\color{red}{w}$:

\begin{eqnarray}
\nabla E = -2\tilde{X}^Ty+2\tilde{X}^T\tilde{X}\color{red}{w} &=& 0\\
\tilde{X}^Ty &=& \tilde{X}^T\tilde{X}\color{red}{w}\\
\color{red}{w} &=& (\tilde{X}^T\tilde{X})^{-1}\tilde{X}^Ty
\end{eqnarray}
from scipy import linalg

class LinearRegression:
    def __init__(self):
        self.w_ = None

    def fit(self, X, t):
        # Augment X with a leading column of ones (the intercept term)
        Xtil = np.c_[np.ones(X.shape[0]), X]
        # Solve the normal equations A w = b rather than forming the inverse
        A = np.dot(Xtil.T, Xtil)
        b = np.dot(Xtil.T, t)
        self.w_ = linalg.solve(A, b)

    def predict(self, X):
        if X.ndim == 1:
            X = X.reshape(1, -1)
        Xtil = np.c_[np.ones(X.shape[0]), X]
        return np.dot(Xtil, self.w_)
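A quick cross-check (a sketch; np.linalg.lstsq solves the same least-squares problem directly, and the demo data below is illustrative):

# The class and numpy's lstsq should produce the same weights.
rng = np.random.RandomState(0)
X_demo = rng.random_sample((20, 3))
t_demo = np.dot(X_demo, np.array([1., 2., 3.])) + 0.5

model = LinearRegression()
model.fit(X_demo, t_demo)

Xtil_demo = np.c_[np.ones(X_demo.shape[0]), X_demo]
w_ref = np.linalg.lstsq(Xtil_demo, t_demo, rcond=None)[0]
print(np.allclose(model.w_, w_ref))  # True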
from mpl_toolkits.mplot3d import axes3d

n = 100
scale = 10

np.random.seed(0)
X = np.random.random((n, 2)) * scale
w0 = 1
w1 = 2
w2 = 3
y = w0 + w1 * X[:, 0] + w2 * X[:, 1] + np.random.randn(n)

# model = linearreg.LinearRegression()
model = LinearRegression()
model.fit(X, y)
# print("係数:", model.w_)
# print("(1, 1)に対する予測値:", model.predict(np.array([1, 1])))

xmesh, ymesh = np.meshgrid(np.linspace(0, scale, 20),
                           np.linspace(0, scale, 20))
zmesh = (model.w_[0] + model.w_[1] * xmesh.ravel() +
         model.w_[2] * ymesh.ravel()).reshape(xmesh.shape)
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X[:, 0], X[:, 1], y, color="k")
ax.plot_wireframe(xmesh, ymesh, zmesh, color="r")
plt.xlabel('x')
plt.ylabel('y')
ax.set_zlabel('z')
plt.show()

[Figure: 3D scatter of the data with the fitted plane as a wireframe]

A practical example

The model is fit to the UCI Wine Quality (red wine) dataset, winequality-red.csv: each semicolon-delimited row holds 11 physicochemical features, with the quality score in the last column.

# import linearreg  # the LinearRegression class defined above is used instead
import numpy as np
import csv

# Load the data
Xy = []
with open("winequality-red.csv") as fp:
    for row in csv.reader(fp, delimiter=";"):
        Xy.append(row)
Xy = np.array(Xy[1:], dtype=np.float64)

# Split into training and test sets
np.random.seed(0)
np.random.shuffle(Xy)
train_X = Xy[:-1000, :-1]
train_y = Xy[:-1000, -1]
test_X = Xy[-1000:, :-1]
test_y = Xy[-1000:, -1]

# Train the model
# model = linearreg.LinearRegression()
model = LinearRegression()
model.fit(train_X, train_y)

# Apply the model to the test data
y = model.predict(test_X)

print("最初の5つの正解と予測値:")
for i in range(5):
    print("{:1.0f} {:5.3f}".format(test_y[i], y[i]))
print()
print("RMSE:", np.sqrt(((test_y - y)**2).mean()))
First 5 true values vs. predictions:
7 6.012
6 5.734
5 5.285
8 6.352
5 5.414

RMSE: 0.6724248548465226
plt.figure(figsize=(4,4))
plt.scatter(test_y, y)
plt.xticks(np.arange(2,10,1))
plt.yticks(np.arange(2,10,1))
plt.xlim(2,9)
plt.ylim(2,9)
plt.xlabel('observed')
plt.ylabel('predicted')
plt.plot(np.arange(2,10,1),np.arange(2,10,1),color='gray',lw=0.5)
plt.grid()
plt.show()

[Figure: predicted vs. observed quality, with the y = x line for reference]

Ridge regression

We introduce the penalty term $\lambda ||\color{red}{w}||^2$: the smaller $||\color{red}{w}||^2$, the better, which keeps the weights from growing large. Here $||\cdot||$ is the L2 norm. Adding $\lambda I$ below also improves the conditioning of the matrix being inverted.

$E(\color{red}{w})=||y-\tilde{X}\color{red}{w}||^2+\lambda ||\color{red}{w}||^2$

Solve for $\color{red}{w}$:

\begin{eqnarray}
\nabla E=2\tilde{X}^T\tilde{X}\color{red}{w}-2\tilde{X}^Ty+2\lambda \color{red}{w} &=&0\\
2[(\tilde{X}^T\tilde{X}+\lambda I)\color{red}{w}-\tilde{X}^Ty] &=& 0\\
\color{red}{w} &=& (\tilde{X}^T\tilde{X}+\lambda I)^{-1}\tilde{X}^T y
\end{eqnarray}
from scipy import linalg

class RidgeRegression:
    def __init__(self, lambda_=1.):
        self.lambda_ = lambda_
        self.w_ = None

    def fit(self, X, t):
        # Augment with the intercept column, then solve
        # (Xtil^T Xtil + lambda I) w = Xtil^T t
        Xtil = np.c_[np.ones(X.shape[0]), X]
        c = np.eye(Xtil.shape[1])
        # Note: this penalizes the intercept w_0 as well
        A = np.dot(Xtil.T, Xtil) + self.lambda_ * c
        b = np.dot(Xtil.T, t)
        self.w_ = linalg.solve(A, b)

    def predict(self, X):
        Xtil = np.c_[np.ones(X.shape[0]), X]
        return np.dot(Xtil, self.w_)
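For reference (an assumption: scikit-learn is installed), sklearn's Ridge leaves the intercept unpenalized, whereas the class above also shrinks $w_0$, so the two agree only at $\lambda = 0$:

# Illustrative comparison with scikit-learn (demo data is synthetic).
from sklearn.linear_model import Ridge

rng = np.random.RandomState(0)
X_demo = rng.random_sample((30, 2))
y_demo = np.dot(X_demo, np.array([2., -1.])) + 0.5

model = RidgeRegression(lambda_=0.)
model.fit(X_demo, y_demo)
sk = Ridge(alpha=0.).fit(X_demo, y_demo)

print(model.w_)                  # [w0, w1, w2]
print(sk.intercept_, sk.coef_)   # should agree at alpha = lambda = 0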
x = np.array([1, 2, 4, 6, 7])
y = np.array([1, 3, 3, 5, 4])
plt.scatter(x, y, color="k")

# Fit and plot for several penalty strengths
for par in [0, 1, 10, 50]:
    model = RidgeRegression(par)
    model.fit(x, y)
    b, a = model.w_
    xmax = x.max()
    plt.plot([0, xmax], [b, b + a * xmax], label=r'$\lambda$ = ' + str(par))

plt.xlabel('x')
plt.ylabel('y')
plt_legend_out()

plt.show()

[Figure: ridge fits for λ = 0, 1, 10, 50]

When outliers are present

x = np.arange(12)
y = 1 + 2 * x
y[2] = 20   # inject outliers
y[4] = 0

plt.scatter(x, y, color='k')

# Includes a negative lambda for comparison
for par in [0, 1, 10, -10]:
    model = RidgeRegression(par)
    model.fit(x, y)
    b, a = model.w_
    xmax = x.max()
    plt.plot([0, xmax], [b, b + a * xmax], label=r'$\lambda$ = ' + str(par))

plt.xlabel('x')
plt.ylabel('y')
plt_legend_out()

plt.show()

[Figure: ridge fits on data with outliers, λ = 0, 1, 10, -10]

# import ridge
# import linearreg

x = np.arange(12)
y = 1 + 2 * x
y[2] = 20
y[4] = 0

xmin = 0
xmax = 12
ymin = -1
ymax = 25
fig, axes = plt.subplots(nrows=2, ncols=5)
for i in range(5):
    axes[0, i].set_xlim([xmin, xmax])
    axes[0, i].set_ylim([ymin, ymax])
    axes[1, i].set_xlim([xmin, xmax])
    axes[1, i].set_ylim([ymin, ymax])
    axes[0, i].tick_params(bottom=False,labelbottom=False)
    if i>0:
        axes[0, i].tick_params(left=False,labelleft=False)
        axes[1, i].tick_params(left=False,labelleft=False)
    xx = x[: 2 + i * 2]
    yy = y[: 2 + i * 2]
    axes[0, i].scatter(xx, yy, color="k")
    axes[1, i].scatter(xx, yy, color="k")
#    model = linearreg.LinearRegression()
    model = LinearRegression()
    model.fit(xx, yy)
    xs = [xmin, xmax]
    ys = [model.w_[0] + model.w_[1] * xmin, model.w_[0] + model.w_[1] * xmax]
    axes[0, i].plot(xs, ys, color="red")
#    plt.legend()
#    model = ridge.RidgeRegression(10.)
    model = RidgeRegression(10.)
    model.fit(xx, yy)
    xs = [xmin, xmax]
    ys = [model.w_[0] + model.w_[1] * xmin, model.w_[0] + model.w_[1] * xmax]
    axes[1, i].plot(xs, ys, color="red")
    axes[1,i].set_xlabel('x')
#    plt.xlabel('x')
axes[0,0].set_ylabel('ols')
axes[1,0].set_ylabel('ridge\n' + r'($\lambda$ = 10)')
# plt_legend_out()
plt.show()

[Figure: OLS (top row) vs. ridge, λ = 10 (bottom row) as data points are added]

Polynomial regression and generalization performance

class PolynomialRegression:
    def __init__(self, degree):
        self.degree = degree

    def fit(self, x, y):
        # Build the design matrix [x, x^2, ..., x^degree];
        # LinearRegression adds the intercept column itself
        x_pow = []
        xx = x.reshape(len(x), 1)
        for i in range(1, self.degree + 1):
            x_pow.append(xx**i)
        mat = np.concatenate(x_pow, axis=1)
#        linreg = linearreg.LinearRegression()
        linreg = LinearRegression()
        linreg.fit(mat, y)
        self.w_ = linreg.w_

    def predict(self, x):
        # Evaluate w_0 + w_1 x + ... + w_d x^d
        r = 0
        for i in range(self.degree + 1):
            r += x**i * self.w_[i]
        return r
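As an aside (a sketch assuming np.vander; the column order must match fit() above), the same design matrix can be built in one call:

# np.vander(..., increasing=True) gives columns [1, x, x^2, ...];
# dropping the first column reproduces `mat` from fit().
x_demo = np.array([1., 2., 3.])
degree = 4
mat_demo = np.vander(x_demo, degree + 1, increasing=True)[:, 1:]
print(mat_demo)  # columns: x, x^2, x^3, x^4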
# Generate data
np.random.seed(0)

def f(x):
    return 1 + 2 * x

x = np.random.random(10) * 10
y = f(x) + np.random.randn(10)

# Polynomial regression
# model = polyreg.PolynomialRegression(10)
model = PolynomialRegression(10)
model.fit(x, y)

plt.scatter(x, y, color="k")
plt.ylim([y.min() - 1, y.max() + 1])
xx = np.linspace(x.min(), x.max(), 300)
yy = np.array([model.predict(u) for u in xx])
plt.plot(xx, yy, color="k")

# Linear regression, for comparison
# model = linearreg.LinearRegression()
model = LinearRegression()
model.fit(x, y)
b, a = model.w_
x1 = x.min() - 1
x2 = x.max() + 1
plt.plot([x1, x2], [b + a * x1, b + a * x2], color="k", linestyle="dashed")
plt.xlabel('x')
plt.ylabel('y')
plt.show()
/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:11: LinAlgWarning: Ill-conditioned matrix (rcond=2.60099e-30): result may not be accurate.

The warning is expected: the columns of the degree-10 design matrix are nearly collinear, so the normal equations are ill-conditioned.

[Figure: degree-10 polynomial fit (solid) vs. linear fit (dashed)]

Below, the expected squared error is decomposed. $\color{red}{\hat{f}_D(x)}$ is the value predicted from dataset $D$. The first term is the squared bias, the gap between the true value and the average prediction; the second is the variance, the spread of the predictions as $D$ varies.

\begin{eqnarray}
E_D[(f(x)-\color{red}{\hat{f}_D(x)})^2]=(f(x)-E_D[\color{red}{\hat{f}_D(x)}])^2+E_D[(\color{red}{\hat{f}_D(x)}-E_D[\color{red}{\hat{f}_D(x)}])^2]
\end{eqnarray}
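This follows by adding and subtracting $E_D[\color{red}{\hat{f}_D(x)}]$ inside the square; the cross term vanishes because $E_D[\color{red}{\hat{f}_D(x)}-E_D[\color{red}{\hat{f}_D(x)}]]=0$ (the argument $x$ is omitted below for brevity):

\begin{eqnarray}
E_D[(f-\hat{f}_D)^2] &=& E_D[((f-E_D[\hat{f}_D])+(E_D[\hat{f}_D]-\hat{f}_D))^2]\\
&=& (f-E_D[\hat{f}_D])^2+E_D[(\hat{f}_D-E_D[\hat{f}_D])^2]
\end{eqnarray}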

Algorithm

  1. Repeat the following $n$ times:
    1. Draw 5 points at random in the range $0\leq x \leq 5$ and compute $f(x)=1/(1+x)$.
    2. Fit linear regression or polynomial regression.
    3. Compute the prediction at each $x$.
  2. Average the predictions.

↓ At n = 1 the fits look overfit, but as trials accumulate the average prediction appears to generalize better, and the linear model generalizes better than the polynomial.

import warnings

def f(x):
    return 1 / (1 + x)

def sample(n):
    x = np.random.random(n) * 5
    y = f(x)
    return x, y

xx = np.arange(0, 5, 0.1)
np.random.seed(0)
y_poly_sum = np.zeros(len(xx))
y_lin_sum = np.zeros(len(xx))
y_poly_sum_sq = np.zeros(len(xx))
y_lin_sum_sq = np.zeros(len(xx))
y_true = f(xx)

warnings.filterwarnings("ignore")

bvmax = 0.5
bvmin = -0.05
ymax = 1.5
ymin = 0

##########

plt.figure(figsize=(10,7))
plt.subplots_adjust(hspace=0.1,wspace=0.1)

##########

plt.subplot(321)
plt.scatter(xx, f(xx), label="truth",color="k", s=2)

ns = [1,10,20,50]
c = sns.color_palette('husl',len(ns))
lb = [] ; lv = []
for i in range(len(ns)):
    n = ns[i]
    # Reset the accumulators for each n so the averages cover exactly n trials
    y_lin_sum = np.zeros(len(xx))
    y_lin_sum_sq = np.zeros(len(xx))
    for _ in range(n):
        x, y = sample(5)
        lin = LinearRegression()
        lin.fit(x, y)
        y_lin = lin.predict(xx.reshape(-1, 1))
        y_lin_sum += y_lin
        y_lin_sum_sq += (y_lin - y_true)**2
    plt.plot(xx, y_lin_sum / n, label="n = "+str(n), color=c[i], alpha=0.8)
    lb.append((y_lin_sum / n - y_true)**2)
    lv.append(y_lin_sum_sq / n)

plt.tick_params(bottom=False,labelbottom=False)
plt.title('linear reg.')
plt.ylabel('y')
plt.ylim(ymin,ymax)

##########

plt.subplot(323)
for i in range(len(ns)):
    plt.plot(xx, lb[i], color=c[i])
plt.ylabel('bias')
plt.ylim(bvmin,bvmax)
plt.tick_params(bottom=False,labelbottom=False)

##########

plt.subplot(325)
for i in range(len(ns)):
    plt.plot(xx, lv[i], color=c[i])
plt.ylabel('variance')
plt.xlabel('x')
plt.ylim(bvmin,bvmax)

##########

plt.subplot(322)
plt.scatter(xx, f(xx), label="truth",color="k", s=2)

ns = [1,10,20,50]
c = sns.color_palette('husl',len(ns))
lb = [] ; lv = []
for i in range(len(ns)):
    n = ns[i]
    # Reset the accumulators for each n (same fix as the linear case)
    y_poly_sum = np.zeros(len(xx))
    y_poly_sum_sq = np.zeros(len(xx))
    for _ in range(n):
        x, y = sample(5)
        poly = PolynomialRegression(4)
        poly.fit(x, y)
        y_poly = poly.predict(xx)
        y_poly_sum += y_poly
        y_poly_sum_sq += (y_poly - y_true)**2
    plt.plot(xx, y_poly_sum / n, label="n = "+str(n), color=c[i], alpha=0.8)
    lb.append((y_poly_sum / n - y_true)**2)
    lv.append(y_poly_sum_sq / n)

plt_legend_out()
plt.title('polynomial reg.')
plt.tick_params(bottom=False,labelbottom=False,left=False,labelleft=False)
plt.ylim(ymin,ymax)

##########

plt.subplot(324)
for i in range(len(ns)):
    plt.plot(xx, lb[i], color=c[i])
plt.ylim(bvmin,bvmax)
plt.tick_params(bottom=False,labelbottom=False,left=False,labelleft=False)

##########

plt.subplot(326)
for i in range(len(ns)):
    plt.plot(xx, lv[i], color=c[i])
plt.xlabel('x')
plt.ylim(bvmin,bvmax)
plt.tick_params(left=False,labelleft=False)
plt.show()

[Figure: mean prediction, bias, and variance for linear (left) and degree-4 polynomial (right) regression, n = 1, 10, 20, 50]

Lasso regression

$\phi=\frac{1}{2}||y-\tilde{X}w||^2+\lambda |w|_1$

where $|w|_1$ is the L1 norm:

\begin{eqnarray}
|w|_1=\sum_{i=1}^d|w_i|
\end{eqnarray}
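There is no closed-form solution with the L1 penalty, so the implementation below uses coordinate descent: each weight in turn is updated through the soft-thresholding operator $S$, which is what drives coefficients exactly to zero:

\begin{eqnarray}
w_k &\leftarrow& S(q_k/r_k,\ \lambda),\qquad S(x,\lambda)=\mathrm{sign}(x)\max(|x|-\lambda,\ 0)\\
q_k &=& \sum_{i=1}^n x_{ik}\Big(t_i-w_0-\sum_{j\neq k}x_{ij}w_j\Big),\qquad r_k=\sum_{i=1}^n x_{ik}^2
\end{eqnarray}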
import numpy as np

def soft_thresholding(x, y):
    # S(x, y) = sign(x) * max(|x| - y, 0): shrink toward zero, clip at zero
    return np.sign(x) * max(abs(x) - y, 0)

class Lasso:
    def __init__(self, lambda_, tol=0.0001, max_iter=1000):
        self.lambda_ = lambda_
        self.tol = tol
        self.max_iter = max_iter
        self.w_ = None

    def fit(self, X, t):
        n, d = X.shape
        self.w_ = np.zeros(d + 1)
        avgl1 = 0.
        for _ in range(self.max_iter):
            avgl1_prev = avgl1
            self._update(n, d, X, t)
            avgl1 = np.abs(self.w_).sum() / self.w_.shape[0]
            if abs(avgl1 - avgl1_prev) <= self.tol:
                break

    def _update(self, n, d, X, t):
        self.w_[0] = (t - np.dot(X, self.w_[1:])).sum() / n
        w0vec = np.ones(n) * self.w_[0]
        for k in range(d):
            ww = self.w_[1:]
            ww[k] = 0
            q = np.dot(t - w0vec - np.dot(X, ww), X[:, k])
            r = np.dot(X[:, k], X[:, k])
            self.w_[k + 1] = soft_thresholding(q / r, self.lambda_)

    def predict(self, X):
        if X.ndim == 1:
            X = X.reshape(X.shape[0], 1)
        Xtil = np.c_[np.ones(X.shape[0]), X]
        return np.dot(Xtil, self.w_)
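A minimal sketch of the sparsity effect (synthetic data; all names below are illustrative):

# Only the first feature matters; lasso should zero out most of the rest.
np.random.seed(0)
X_demo = np.random.randn(50, 5)
y_demo = 3. * X_demo[:, 0] + 0.1 * np.random.randn(50)

model = Lasso(1.)
model.fit(X_demo, y_demo)
print(model.w_)  # expect the weights for features 2-5 at (or very near) zero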
# Load the data (keep the header row: it holds the feature names)
Xy = []
with open("winequality-red.csv") as fp:
    for row in csv.reader(fp, delimiter=";"):
        Xy.append(row)
varname = ['intercept'] + Xy[0][:-1]  # names for w_0..w_d, used in the plot below
Xy = np.array(Xy[1:], dtype=np.float64)

# Split into training and test sets
np.random.seed(0)
np.random.shuffle(Xy)
train_X = Xy[:-1000, :-1]
train_y = Xy[:-1000, -1]
test_X = Xy[-1000:, :-1]
test_y = Xy[-1000:, -1]

ll = [] ; lc = [] ; lm = []

# Train with several penalty strengths and collect the results
for lambda_ in [1., 0.1, 0.01]:
#    model = lasso.Lasso(lambda_)
    model = Lasso(lambda_)
    model.fit(train_X, train_y)
    y = model.predict(test_X)
#    print("--- lambda = {} ---".format(lambda_))
#    print("coefficients:")
#    print(model.w_)
    mse = ((y - test_y)**2).mean()
#    print("MSE: {:.3f}".format(mse))
    ll.append(lambda_)
    lm.append(mse)
    lc.append(model.w_)
df_tmp1 = pd.DataFrame({'lambda':ll,'mse':lm})

df_tmp2 = pd.DataFrame(lc)
df_tmp2.columns = varname

df_tmp = pd.concat([df_tmp1,df_tmp2],axis=1)

df_plot = df_tmp.melt(id_vars=['lambda','mse'])
sns.barplot(data=df_tmp1,x='lambda',y='mse')
plt.show()

[Figure: test MSE for λ = 1, 0.1, 0.01]

sns.barplot(data=df_plot,x='value',y='variable',hue='lambda')
plt.axvline(x=0,lw=0.5,color='k')
plt_legend_out()

[Figure: lasso coefficients by feature for each λ]

Logistic regression
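The model is $\hat{y}=\sigma(\tilde{X}\color{red}{w})$ with the sigmoid $\sigma(x)=1/(1+e^{-x})$. There is no closed form; the code below fits $\color{red}{w}$ by iteratively reweighted least squares (Newton's method), solving at each step

\begin{eqnarray}
(\tilde{X}^TR\tilde{X})\color{red}{w} &=& \tilde{X}^TRz,\qquad z=\tilde{X}\color{red}{w}_{\rm prev}-R^{-1}(\hat{y}-y)\\
R &=& \mathrm{diag}(\hat{y}_i(1-\hat{y}_i))
\end{eqnarray}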

import numpy as np
from scipy import linalg

THRESHMIN = 1e-10

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

class LogisticRegression:
    def __init__(self, tol=0.001, max_iter=3, random_seed=0):
        self.tol = tol
        self.max_iter = max_iter
        self.random_state = np.random.RandomState(random_seed)
        self.w_ = None

    def fit(self, X, y):
        self.w_ = self.random_state.randn(X.shape[1] + 1)
        Xtil = np.c_[np.ones(X.shape[0]), X]
        diff = np.inf
        w_prev = self.w_
        iter = 0
        while diff > self.tol and iter < self.max_iter:
            yhat = sigmoid(np.dot(Xtil, self.w_))
            # Diagonal of R, clipped away from zero for numerical stability
            r = np.clip(yhat * (1 - yhat),
                        THRESHMIN, np.inf)
            XR = Xtil.T * r
            XRX = np.dot(Xtil.T * r, Xtil)
            w_prev = self.w_
            # Right-hand side Xtil^T R z, with z = Xtil w - (yhat - y) / r
            b = np.dot(XR, np.dot(Xtil, self.w_) -
                       1 / r * (yhat - y))
            self.w_ = linalg.solve(XRX, b)
            diff = abs(w_prev - self.w_).mean()
            iter += 1

    def predict(self, X):
        Xtil = np.c_[np.ones(X.shape[0]), X]
        yhat = sigmoid(np.dot(Xtil, self.w_))
        return np.where(yhat > .5, 1, 0)
import csv
import numpy as np

n_test = 100
X = []
y = []
# UCI Breast Cancer Wisconsin (Diagnostic): column 1 is the label,
# "B" (benign) -> 0, otherwise ("M", malignant) -> 1
with open("wdbc.data") as fp:
    for row in csv.reader(fp):
        if row[1] == "B":
            y.append(0)
        else:
            y.append(1)
        X.append(row[2:])

y = np.array(y, dtype=np.float64)
X = np.array(X, dtype=np.float64)
y_train = y[:-n_test]
X_train = X[:-n_test]
y_test = y[-n_test:]
X_test = X[-n_test:]
# model = logisticreg.LogisticRegression(tol=0.01)
model = LogisticRegression(tol=0.01)
model.fit(X_train, y_train)

y_predict = model.predict(X_test)
n_hits = (y_test == y_predict).sum()
print("Accuracy: {}/{} = {}".format(n_hits, n_test, n_hits / n_test))

Accuracy: 97/100 = 0.97