Introduction
カルマンフィルタを使用した状態空間モデルをRで実装する方法についてのメモ.
今回は外生変数を組み込んだな時変係数モデルを実装する.
参考図書:時系列分析と状態空間モデルの基礎: RとStanで学ぶ理論と実装
ライブラリインポート
library(KFAS) # モデル作成
library(ggplot2) # グラフ描画
library(patchwork) # グラフ配置
シミュレーションデータの作成
2つの外生変数と目的変数からなるデータを作成.
set.seed(1)
N=100
# 各種標準偏差
sigma_v = 10
sigma_w = 5
sigma_t1 = 0.6
sigma_t2 = 0.2
# 説明変数
x1 = rnorm(n=N, mean=5, sd=10)
x2 = rnorm(n=N, mean=10, sd=5)
# 切片,回帰係数,観測値
mu = cumsum(rnorm(n=N, mean=0, sd=sigma_w))
b1 = cumsum(rnorm(n=N, mean=0, sd=sigma_t1)) + 10
b2 = cumsum(rnorm(n=N, mean=0, sd=sigma_t1)) + 5
y = mu + b1*x1 + b2*x2 + rnorm(n=N, mean=0, sd=sigma_v)
dataset = data.frame(y=y, x1=x1, x2=x2)
coefset = data.frame(b1=b1, b2=b2, mu=mu)
データの確認
# 外生変数と目的変数
par(mfrow=c(3,1))
plot(x1)
plot(x2)
plot(y)
# 回帰係数と切片
par(mfrow=c(3,1))
plot(b1)
plot(b2)
plot(mu)
テストデータと訓練データに分割
p = 80
train = dataset[1:p,]
test = dataset[81:N,]
train_coef = coefset[1:p,]
test_coef = coefset[81:N,]
モデリング
2つの外生変数をもつ時変係数モデルは以下のように定式化される.
$$
\begin{eqnarray}
\beta_{1,t} &=& \beta_{1,t-1} + \tau_{1,t}, \ \ \ \ &\tau_{1,t}&~Normal(0,\sigma^2_{\tau_1}) \\
\beta_{2,t} &=& \beta_{2,t-1} + \tau_{2,t}, \ \ &\tau_{2,t}&~Normal(0,\sigma^2_{\tau_2}) \\
\mu_t &=& \mu_{t-1} + w_t, \ \ &w_t&~Normal(0,\sigma^2_w) \\
y_t &=& \mu_t + \beta_{1,t}x_{1,t} + \beta_{2,t}x_{2,t} + v_t, \ \ \ \ &v_t&~Normal(0,\sigma^2_v)
\end{eqnarray}
$$
- $\beta_t$:回帰係数
- $\mu_t$:切片
- $x_t$:外生変数
- $y_t$:目的変数
- $\tau_t$:回帰係数誤差
- $w_t$:過程誤差
- $v_t$:観測誤差
# Step1:モデルの構造を決める
build_kfas = SSModel(H = NA, train$y ~ SSMtrend(degree = 1, Q = NA) +
SSMregression( ~ train$x1 + train$x2 , Q = diag(rep(NA, 2))))
# Step2:パラメータ推定
fit_kfas = fitSSM(build_kfas, inits = c(1, 1, 1, 1))
# Step3:フィルタリング
result_kfas = KFS(fit_kfas$model, filtering=c('state', 'mean'), smoothing=c('state', 'mean'))
モデル評価
訓練データの結果を図示.
# グラフ用データフレーム作成
df_kfas = data.frame(time=1:p, y=train$y, y_pred=result_kfas$m[1:length(train$y)])
df_kfas$y_pred[1] = NA
ggplot(data=df_kfas, aes(x=time, y=y))+
geom_point(alpha=0.5)+
geom_line(aes(y=y_pred),size=1.2, col = 'red')
切片および回帰係数の推定結果
df_kfas_coef = data.frame(time=1:p, b1=train_coef$b1, b2=train_coef$b2, mu=train_coef$mu,
b1_pred = coef(result_kfas)[,1],
b2_pred = coef(result_kfas)[,2],
mu_pred = coef(result_kfas)[,3])
g1 = ggplot(data=df_kfas_coef, aes(x=time, y=b1)) +
geom_point(alpha=0.5) +
geom_line(aes(y=b1_pred), size=1, col='red')
g2 = ggplot(data=df_kfas_coef, aes(x=time, y=b2)) +
geom_point(alpha=0.5) +
geom_line(aes(y=b2_pred), size=1, col='red')
g3 = ggplot(data=df_kfas_coef, aes(x=time, y=mu)) +
geom_point(alpha=0.5) +
geom_line(aes(y=mu_pred), size=1, col='red')
g1+g2+g3+plot_layout(ncol=1)
逐次更新
訓練データで推定したパラメータを使用し,テストデータにおいても状態の更新をしたい.
少し強引な方法ではあるがKFASパッケージを使って逐次更新する方法について記す.
データ準備
テストデータのサイズ分訓練データ内の目的変数にNAを代入する.
train2 = dataset
train2[81:N,1] = NA
KFASパッケージでパラメタの推定まではこれまでと同様に行う.
# Step1:モデルの構造を決める
build_kfas2 = SSModel(H = NA, train2$y ~ SSMtrend(degree = 1, Q = NA) +
SSMregression( ~ train2$x1 + train2$x2 , Q = diag(rep(NA, 2))))
# Step2:パラメータ推定
fit_kfas2 = fitSSM(build_kfas2, inits = c(1, 1, 1, 1))
パラメータ推定結果を格納したオブジェクトに全データを格納しフィルタリングを行う.
fit_kfas2[["model"]][["y"]][1:N] = dataset$y
# Step3:フィルタリング
result_kfas2 = KFS(fit_kfas2$model, filtering=c('state', 'mean'), smoothing=c('state', 'mean'))
目的変数と外生変数を図示
df_kfas2 = data.frame(time=1:N, y=dataset$y, y_pred=result_kfas2$m[1:N],
x1=dataset$x1, x2=dataset$x2)
df_kfas2$y_pred[1] = NA
g1 = ggplot(data=df_kfas2, aes(x=time, y=y))+
geom_point(alpha=0.5)+
geom_line(aes(y=y_pred),size=1.2, col = 'red') +
geom_vline(xintercept=p, col = 'purple', linetype='dashed')
g2 = ggplot(data=df_kfas2, aes(x=time, y=x1))+
geom_point(alpha=0.5) +
geom_vline(xintercept=p, col = 'purple', linetype='dashed')
g3 = ggplot(data=df_kfas2, aes(x=time, y=x2))+
geom_point(alpha=0.5) +
geom_vline(xintercept=p, col = 'purple', linetype='dashed')
g1+g2+g3+plot_layout(ncol=1)
切片と回帰係数を図示
df_kfas_coef = data.frame(time=1:N, b1=coefset$b1, b2=coefset$b2, mu=coefset$mu,
b1_pred = coef(result_kfas2)[,1],
b2_pred = coef(result_kfas2)[,2],
mu_pred = coef(result_kfas2)[,3])
g1 = ggplot(data=df_kfas_coef, aes(x=time, y=b1)) +
geom_point(alpha=0.5) +
geom_line(aes(y=b1_pred), size=1, col='red') +
geom_vline(xintercept=p, col = 'purple', linetype='dashed')
g2 = ggplot(data=df_kfas_coef, aes(x=time, y=b2)) +
geom_point(alpha=0.5) +
geom_line(aes(y=b2_pred), size=1, col='red') +
geom_vline(xintercept=p, col = 'purple', linetype='dashed')
g3 = ggplot(data=df_kfas_coef, aes(x=time, y=mu)) +
geom_point(alpha=0.5) +
geom_line(aes(y=mu_pred), size=1, col='red') +
geom_vline(xintercept=p, col = 'purple', linetype='dashed')
g1+g2+g3+plot_layout(ncol=1)
Conclusion
今回は時変係数モデルの構築,予測,逐次更新を実装した.
状態空間モデルは時系列データを柔軟に表現できるモデルとされているため,今後も学習を続けていこうと思う.
Code
# ライブラリインポート
library(KFAS) # モデル作成
library(ggplot2) # グラフ描画
library(patchwork) # グラフ配置
# シミュレーションデータの作成
set.seed(1)
N=100
sigma_v = 10
sigma_w = 5
sigma_t1 = 0.6
sigma_t2 = 0.2
x1 = rnorm(n=N, mean=5, sd=10)
x2 = rnorm(n=N, mean=10, sd=5)
mu = cumsum(rnorm(n=N, mean=0, sd=sigma_w))
b1 = cumsum(rnorm(n=N, mean=0, sd=sigma_t1)) + 10
b2 = cumsum(rnorm(n=N, mean=0, sd=sigma_t1)) + 5
y = mu + b1*x1 + b2*x2 + rnorm(n=N, mean=0, sd=sigma_v)
dataset = data.frame(y=y, x1=x1, x2=x2)
coefset = data.frame(b1=b1, b2=b2, mu=mu)
par(mfrow=c(3,1))
plot(x1)
plot(x2)
plot(y)
par(mfrow=c(3,1))
plot(b1)
plot(b2)
plot(mu)
p = 80
train = dataset[1:p,]
test = dataset[81:N,]
train_coef = coefset[1:p,]
test_coef = coefset[81:N,]
# モデリング
build_kfas = SSModel(H = NA, train$y ~ SSMtrend(degree = 1, Q = NA) +
SSMregression( ~ train$x1 + train$x2 , Q = diag(rep(NA, 2))))
fit_kfas = fitSSM(build_kfas, inits = c(1, 1, 1, 1))
result_kfas = KFS(fit_kfas$model, filtering=c('state', 'mean'), smoothing=c('state', 'mean'))
# モデル評価
df_kfas = data.frame(time=1:p, y=train$y, y_pred=result_kfas$m[1:length(train$y)])
df_kfas$y_pred[1] = NA
ggplot(data=df_kfas, aes(x=time, y=y))+
geom_point(alpha=0.5)+
geom_line(aes(y=y_pred),size=1.2, col = 'red')
df_kfas_coef = data.frame(time=1:p, b1=train_coef$b1, b2=train_coef$b2, mu=train_coef$mu,
b1_pred = coef(result_kfas)[,1],
b2_pred = coef(result_kfas)[,2],
mu_pred = coef(result_kfas)[,3])
g1 = ggplot(data=df_kfas_coef, aes(x=time, y=b1)) +
geom_point(alpha=0.5) +
geom_line(aes(y=b1_pred), size=1, col='red')
g2 = ggplot(data=df_kfas_coef, aes(x=time, y=b2)) +
geom_point(alpha=0.5) +
geom_line(aes(y=b2_pred), size=1, col='red')
g3 = ggplot(data=df_kfas_coef, aes(x=time, y=mu)) +
geom_point(alpha=0.5) +
geom_line(aes(y=mu_pred), size=1, col='red')
g1+g2+g3+plot_layout(ncol=1)
# 逐次更新
train2 = dataset
train2[81:N,1] = NA
build_kfas2 = SSModel(H = NA, train2$y ~ SSMtrend(degree = 1, Q = NA) +
SSMregression( ~ train2$x1 + train2$x2 , Q = diag(rep(NA, 2))))
fit_kfas2 = fitSSM(build_kfas2, inits = c(1, 1, 1, 1))
fit_kfas2[["model"]][["y"]][1:N] = dataset$y
result_kfas2 = KFS(fit_kfas2$model, filtering=c('state', 'mean'), smoothing=c('state', 'mean'))
df_kfas2 = data.frame(time=1:N, y=dataset$y, y_pred=result_kfas2$m[1:N],
x1=dataset$x1, x2=dataset$x2)
df_kfas2$y_pred[1] = NA
g1 = ggplot(data=df_kfas2, aes(x=time, y=y))+
geom_point(alpha=0.5)+
geom_line(aes(y=y_pred),size=1.2, col = 'red') +
geom_vline(xintercept=p, col = 'purple', linetype='dashed')
g2 = ggplot(data=df_kfas2, aes(x=time, y=x1))+
geom_point(alpha=0.5) +
geom_vline(xintercept=p, col = 'purple', linetype='dashed')
g3 = ggplot(data=df_kfas2, aes(x=time, y=x2))+
geom_point(alpha=0.5) +
geom_vline(xintercept=p, col = 'purple', linetype='dashed')
g1+g2+g3+plot_layout(ncol=1)
df_kfas_coef = data.frame(time=1:N, b1=coefset$b1, b2=coefset$b2, mu=coefset$mu,
b1_pred = coef(result_kfas2)[,1],
b2_pred = coef(result_kfas2)[,2],
mu_pred = coef(result_kfas2)[,3])
g1 = ggplot(data=df_kfas_coef, aes(x=time, y=b1)) +
geom_point(alpha=0.5) +
geom_line(aes(y=b1_pred), size=1, col='red') +
geom_vline(xintercept=p, col = 'purple', linetype='dashed')
g2 = ggplot(data=df_kfas_coef, aes(x=time, y=b2)) +
geom_point(alpha=0.5) +
geom_line(aes(y=b2_pred), size=1, col='red') +
geom_vline(xintercept=p, col = 'purple', linetype='dashed')
g3 = ggplot(data=df_kfas_coef, aes(x=time, y=mu)) +
geom_point(alpha=0.5) +
geom_line(aes(y=mu_pred), size=1, col='red') +
geom_vline(xintercept=p, col = 'purple', linetype='dashed')
g1+g2+g3+plot_layout(ncol=1)