はじめに
備忘として、データフレーム操作におけるRのtidyverse (主にdplyr)からpythonのpolarsへの書き換えを順次更新していきます。
Rからpythonに移行したものの、pandasが嫌いで使いたくない人向けです。
準備
パッケージインストール
R
install.packages("tidyverse")
library(tidyverse)
Python
!pip install polars
import polars as pl
データフレーム
以下のような横型と縦型の2種類のデータフレームを使ってコードを書いていきます。
R
> df
date USDJPY EURJPY SPX
1 2023-01-01 130.0 140.0 3800
2 2023-01-02 131.0 141.0 3900
3 2023-01-03 130.5 143.5 4000
> df_long
date index value
1 2023-01-01 USDJPY 130.0
2 2023-01-01 EURJPY 140.0
3 2023-01-01 SPX 3800.0
4 2023-01-02 USDJPY 131.0
5 2023-01-02 EURJPY 141.0
6 2023-01-02 SPX 3900.0
7 2023-01-03 USDJPY 130.5
8 2023-01-03 EURJPY 143.5
9 2023-01-03 SPX 4000.0
列の選択(select)
R
df %>%
select(date,USDJPY)
date USDJPY
1 2023-01-01 130.0
2 2023-01-02 131.0
3 2023-01-03 130.5
Python
(df
.select([pl.col("date"),pl.col("USDJPY")])
)
date USDJPY
date f64
2023-01-01 130.0
2023-01-02 131.0
2023-01-03 130.5
列の変更・追加(mutate)
変更
USDJPYの値を2倍にする
R
df %>%
mutate(USDJPY = USDJPY * 2)
date USDJPY EURJPY SPX
1 2023-01-01 260 140.0 3800
2 2023-01-02 262 141.0 3900
3 2023-01-03 261 143.5 4000
Python
(df
.with_column([pl.col("USDJPY") * 2])
)
date USDJPY EURJPY SPX
date f64 f64 i64
2023-01-01 260.0 140.0 3800
2023-01-02 262.0 141.0 3900
2023-01-03 261.0 143.5 4000
追加
EURUSDを追加する
R
df %>%
mutate(EURUSD = EURJPY/USDJPY)
date USDJPY EURJPY SPX EURUSD
1 2023-01-01 130.0 140.0 3800 1.076923
2 2023-01-02 131.0 141.0 3900 1.076336
3 2023-01-03 130.5 143.5 4000 1.099617
Python
(df
.with_column((pl.col("EURJPY")/pl.col("USDJPY")).alias("EURUSD"))
)
date USDJPY EURJPY SPX EURUSD
date f64 f64 i64 f64
2023-01-01 130.0 140.0 3800 1.076923
2023-01-02 131.0 141.0 3900 1.076336
2023-01-03 130.5 143.5 4000 1.099617
※複数追加する場合はwith_columns
を使う
行の選択(filter)
R
df %>%
filter(date == "2023/1/2")
date USDJPY EURJPY SPX
1 2023-01-02 131 141 3900
Python
(df
.filter(pl.col("date")=="2023-01-02")
)
date USDJPY EURJPY SPX
date f64 f64 i64
2023-01-02 131.0 141.0 3900
ソート(arrange)
R
# 昇順
df %>%
arrange(date)
date USDJPY EURJPY SPX
1 2023-01-01 130.0 140.0 3800
2 2023-01-02 131.0 141.0 3900
3 2023-01-03 130.5 143.5 4000
# 降順
df %>%
arrange(desc(date))
date USDJPY EURJPY SPX
1 2023-01-03 130.5 143.5 4000
2 2023-01-02 131.0 141.0 3900
3 2023-01-01 130.0 140.0 3800
Python
# 昇順
(df
.sort(by = "date")
)
date USDJPY EURJPY SPX
date f64 f64 i64
2023-01-01 130.0 140.0 3800
2023-01-02 131.0 141.0 3900
2023-01-03 130.5 143.5 4000
# 降順
(df
.sort(by = "date",descending=True)
)
date USDJPY EURJPY SPX
date f64 f64 i64
2023-01-03 130.5 143.5 4000
2023-01-02 131.0 141.0 3900
2023-01-01 130.0 140.0 3800
重複行の削除(distinct)
R
# 全列が重複する行を削除(この例では完全一致する行はないので何も消えない)
df_long %>%
distinct()
date index value
1 2023-01-01 USDJPY 130.0
2 2023-01-01 EURJPY 140.0
3 2023-01-01 SPX 3800.0
4 2023-01-02 USDJPY 131.0
5 2023-01-02 EURJPY 141.0
6 2023-01-02 SPX 3900.0
7 2023-01-03 USDJPY 130.5
8 2023-01-03 EURJPY 143.5
9 2023-01-03 SPX 4000.0
# date列が重複する行を削除
df_long %>%
distinct(date,.keep_all = TRUE)
date index value
1 2023-01-01 USDJPY 130.0
2 2023-01-02 USDJPY 131.0
3 2023-01-03 USDJPY 130.5
Python
# 全列が重複する行を削除(この例では完全一致する行はないので何も消えない)
(df_long
.unique()
)
date index value
date str f64
2023-01-01 "USDJPY" 130.0
2023-01-01 "EURJPY" 140.0
2023-01-01 "SPX" 3800.0
2023-01-02 "USDJPY" 131.0
2023-01-02 "EURJPY" 141.0
2023-01-02 "SPX" 3900.0
2023-01-03 "USDJPY" 130.5
2023-01-03 "EURJPY" 143.5
2023-01-03 "SPX" 4000.0
# date列が重複する行を削除
(df_long
.unique(subset = "date")
)
date index value
date str f64
2023-01-01 "USDJPY" 130.0
2023-01-02 "USDJPY" 131.0
2023-01-03 "USDJPY" 130.5
縦型データに変換(pivot_longer)
R
df %>%
pivot_longer(cols = -date,names_to = “currency”,values_to = “value”)
date index value
1 2023-01-01 USDJPY 130.0
2 2023-01-01 EURJPY 140.0
3 2023-01-01 SPX 3800.0
4 2023-01-02 USDJPY 131.0
5 2023-01-02 EURJPY 141.0
6 2023-01-02 SPX 3900.0
7 2023-01-03 USDJPY 130.5
8 2023-01-03 EURJPY 143.5
9 2023-01-03 SPX 4000.0
Python
(df
.melt(id_vars ="date",variable_name="index",value_name="value")
)
date index value
date str f64
2023-01-01 "USDJPY" 130.0
2023-01-02 "USDJPY" 131.0
2023-01-03 "USDJPY" 130.5
2023-01-01 "EURJPY" 140.0
2023-01-02 "EURJPY" 141.0
2023-01-03 "EURJPY" 143.5
2023-01-01 "SPX" 3800.0
2023-01-02 "SPX" 3900.0
2023-01-03 "SPX" 4000.0
※value_vars
で指定するパターンもある
横型データに変換(pivot_wider)
R
df_long %>%
pivot_wider(id_cols = "date", names_from = “index”,values_from = “value”)
date USDJPY EURJPY SPX
1 2023-01-01 130.0 140.0 3800
2 2023-01-02 131.0 141.0 3900
3 2023-01-03 130.5 143.5 4000
Python
(df_long
.pivot(index = "date", columns = "index", values = "value")
)
date USDJPY EURJPY SPX
date f64 f64 f64
2023-01-01 130.0 140.0 3800.0
2023-01-02 131.0 141.0 3900.0
2023-01-03 130.5 143.5 4000.0
グループ化(group_by)
R
df_long %>%
group_by(index) %>%
summarize(ave = mean(value),
max = max(value),
min = min(value))
index ave max min
1 EURJPY 141.5 143.5 140
2 SPX 3900.0 4000.0 3800
3 USDJPY 130.5 131.0 130
Python
(df_long
.groupby("index")
.agg([pl.avg("value").alias("ave"),
pl.max("value").alias("max"),
pl.min("value").alias("min")])
)
index ave max min
str f64 f64 f64
"EURJPY" 141.5 143.5 140.0
"SPX" 3900.0 4000.0 3800.0
"USDJPY" 130.5 131.0 130.0
データフレームのキー結合(XXX_join)
joinによる結合の詳しい仕組みはここでは解説せず、コードのみを記載します。
R
# 結合するデータフレームを準備
df2 <-
data.frame(index = c("USDJPY","TPX","SPX"),
asset = c("FX","EQUITY","EQUITY"))
df2
index asset
1 USDJPY FX
2 TPX EQUITY
3 SPX EQUITY
# 左側結合
df_long %>%
left_join(df2,by = "index")
date index value asset
1 2023-01-01 USDJPY 130.0 FX
2 2023-01-01 EURJPY 140.0 <NA>
3 2023-01-01 SPX 3800.0 EQUITY
4 2023-01-02 USDJPY 131.0 FX
5 2023-01-02 EURJPY 141.0 <NA>
6 2023-01-02 SPX 3900.0 EQUITY
7 2023-01-03 USDJPY 130.5 FX
8 2023-01-03 EURJPY 143.5 <NA>
9 2023-01-03 SPX 4000.0 EQUITY
# 内部結合
df_long %>%
inner_join(df2,by = "index")
date index value asset
1 2023-01-01 USDJPY 130.0 FX
2 2023-01-01 SPX 3800.0 EQUITY
3 2023-01-02 USDJPY 131.0 FX
4 2023-01-02 SPX 3900.0 EQUITY
5 2023-01-03 USDJPY 130.5 FX
6 2023-01-03 SPX 4000.0 EQUITY
# 外部結合
df_long %>%
full_join(df2,by = "index")
date index value asset
1 2023-01-01 USDJPY 130.0 FX
2 2023-01-01 EURJPY 140.0 <NA>
3 2023-01-01 SPX 3800.0 EQUITY
4 2023-01-02 USDJPY 131.0 FX
5 2023-01-02 EURJPY 141.0 <NA>
6 2023-01-02 SPX 3900.0 EQUITY
7 2023-01-03 USDJPY 130.5 FX
8 2023-01-03 EURJPY 143.5 <NA>
9 2023-01-03 SPX 4000.0 EQUITY
10 <NA> TPX NA EQUITY
Python
# 結合するデータフレームを準備
df2 = pl.DataFrame({"index":["USDJPY","TPX","SPX"],
"asset":["FX","EQUITY","EQUITY"]})
df2
index asset
str str
"USDJPY" "FX"
"TPX" "EQUITY"
"SPX" "EQUITY"
# 左側結合
(df_long
.join(df2, on = "index", how = "left")
)
date index value asset
date str f64 str
2023-01-01 "USDJPY" 130.0 "FX"
2023-01-01 "EURJPY" 140.0 null
2023-01-01 "SPX" 3800.0 "EQUITY"
2023-01-02 "USDJPY" 131.0 "FX"
2023-01-02 "EURJPY" 141.0 null
2023-01-02 "SPX" 3900.0 "EQUITY"
2023-01-03 "USDJPY" 130.5 "FX"
2023-01-03 "EURJPY" 143.5 null
2023-01-03 "SPX" 4000.0 "EQUITY"
# 内部結合
(df_long
.join(df2, on = "index", how = "inner")
)
date index value asset
date str f64 str
2023-01-01 "USDJPY" 130.0 "FX"
2023-01-01 "SPX" 3800.0 "EQUITY"
2023-01-02 "USDJPY" 131.0 "FX"
2023-01-02 "SPX" 3900.0 "EQUITY"
2023-01-03 "USDJPY" 130.5 "FX"
2023-01-03 "SPX" 4000.0 "EQUITY"
# 外部結合
(df_long
.join(df2, on = "index", how = "outer")
)
date index value asset
date str f64 str
2023-01-01 "USDJPY" 130.0 "FX"
2023-01-01 "EURJPY" 140.0 null
2023-01-01 "SPX" 3800.0 "EQUITY"
2023-01-02 "USDJPY" 131.0 "FX"
2023-01-02 "EURJPY" 141.0 null
2023-01-02 "SPX" 3900.0 "EQUITY"
2023-01-03 "USDJPY" 130.5 "FX"
2023-01-03 "EURJPY" 143.5 null
2023-01-03 "SPX" 4000.0 "EQUITY"
null "TPX" null "EQUITY"
条件分岐(if_else)
R
df_long %>%
mutate(value2 = if_else(index == "USDJPY",100,value))
date index value value2
1 2023-01-01 USDJPY 130.0 100.0
2 2023-01-01 EURJPY 140.0 140.0
3 2023-01-01 SPX 3800.0 3800.0
4 2023-01-02 USDJPY 131.0 100.0
5 2023-01-02 EURJPY 141.0 141.0
6 2023-01-02 SPX 3900.0 3900.0
7 2023-01-03 USDJPY 130.5 100.0
8 2023-01-03 EURJPY 143.5 143.5
9 2023-01-03 SPX 4000.0 4000.0
Python
# pl.when(条件式).then(Trueの値).otherwise(Falseの値)
(df_long
.with_columns((pl.when(pl.col("index") == "USDJPY").then(100).otherwise(pl.col("value"))).alias("value2"))
)
date index value value2
date str f64 f64
2023-01-01 "USDJPY" 130.0 100.0
2023-01-01 "EURJPY" 140.0 140.0
2023-01-01 "SPX" 3800.0 3800.0
2023-01-02 "USDJPY" 131.0 100.0
2023-01-02 "EURJPY" 141.0 141.0
2023-01-02 "SPX" 3900.0 3900.0
2023-01-03 "USDJPY" 130.5 100.0
2023-01-03 "EURJPY" 143.5 143.5
2023-01-03 "SPX" 4000.0 4000.0
条件分岐(case_when)
R
# case when
df_long %>%
mutate(value2 = case_when(index == "USDJPY" ~ 1,
index == "EURJPY" ~ 2,
TRUE ~ 3))
date index value value2
1 2023-01-01 USDJPY 130.0 1
2 2023-01-01 EURJPY 140.0 2
3 2023-01-01 SPX 3800.0 3
4 2023-01-02 USDJPY 131.0 1
5 2023-01-02 EURJPY 141.0 2
6 2023-01-02 SPX 3900.0 3
7 2023-01-03 USDJPY 130.5 1
8 2023-01-03 EURJPY 143.5 2
9 2023-01-03 SPX 4000.0 3
Python
(df_long
.with_columns((pl.when(pl.col("index") == "USDJPY").then(1).
when(pl.col("index")=="EURJPY").then(2).
otherwise(3)).alias("value2"))
)
date index value value2
date str f64 i32
2023-01-01 "USDJPY" 130.0 1
2023-01-01 "EURJPY" 140.0 2
2023-01-01 "SPX" 3800.0 3
2023-01-02 "USDJPY" 131.0 1
2023-01-02 "EURJPY" 141.0 2
2023-01-02 "SPX" 3900.0 3
2023-01-03 "USDJPY" 130.5 1
2023-01-03 "EURJPY" 143.5 2
2023-01-03 "SPX" 4000.0 3
列の値をずらす(lag ,lead)
R
df %>%
mutate(USDJPY_lag1 = lag(USDJPY,1),
USDJPY_lead1 = lead(USDJPY,1))
date USDJPY EURJPY SPX USDJPY_lag1 USDJPY_lead1
1 2023-01-01 130.0 140.0 3800 NA 131.0
2 2023-01-02 131.0 141.0 3900 130.0 130.5
3 2023-01-03 130.5 143.5 4000 131.0 NA
Python
(df
.with_columns([pl.col("USDJPY").shift(1).alias("USDJPY_lag1"),
pl.col("USDJPY").shift(-1).alias("USDJPY_lead1")])
)
date USDJPY EURJPY SPX USDJPY_lag1 USDJPY_lead1
date f64 f64 i64 f64 f64
2023-01-01 130.0 140.0 3800 null 131.0
2023-01-02 131.0 141.0 3900 130.0 130.5
2023-01-03 130.5 143.5 4000 131.0 null