LoginSignup
6
6

R/tidyverseとPython/polarsの比較

Last updated at Posted at 2023-01-26

はじめに

備忘として、データフレーム操作におけるRのtidyverse (主にdplyr)からpythonのpolarsへの書き換えを順次更新していきます。
Rからpythonに移行したものの、pandasが嫌いで使いたくない人向けです。

準備

パッケージインストール

R
install.packages("tidyverse")
library(tidyverse)
Python
!pip install polars
import polars as pl

データフレーム

以下のような横型と縦型の2種類のデータフレームを使ってコードを書いていきます。

R
> df
        date USDJPY EURJPY  SPX
1 2023-01-01  130.0  140.0 3800
2 2023-01-02  131.0  141.0 3900
3 2023-01-03  130.5  143.5 4000

> df_long
        date  index  value
1 2023-01-01 USDJPY  130.0
2 2023-01-01 EURJPY  140.0
3 2023-01-01    SPX 3800.0
4 2023-01-02 USDJPY  131.0
5 2023-01-02 EURJPY  141.0
6 2023-01-02    SPX 3900.0
7 2023-01-03 USDJPY  130.5
8 2023-01-03 EURJPY  143.5
9 2023-01-03    SPX 4000.0

列の選択(select)

R
df %>%
  select(date,USDJPY)

        date USDJPY
1 2023-01-01  130.0
2 2023-01-02  131.0
3 2023-01-03  130.5
Python
(df
.select([pl.col("date"),pl.col("USDJPY")])
)
date	    USDJPY
date	    f64
2023-01-01	130.0
2023-01-02	131.0
2023-01-03	130.5

列の変更・追加(mutate)

変更

USDJPYの値を2倍にする

R
df %>%
  mutate(USDJPY = USDJPY * 2)

        date USDJPY EURJPY  SPX
1 2023-01-01    260  140.0 3800
2 2023-01-02    262  141.0 3900
3 2023-01-03    261  143.5 4000
Python
(df
.with_column([pl.col("USDJPY") * 2])
)
date	    USDJPY	EURJPY	SPX
date	    f64	    f64	    i64
2023-01-01	260.0	140.0	3800
2023-01-02	262.0	141.0	3900
2023-01-03	261.0	143.5	4000

追加

EURUSDを追加する

R
df %>%
 mutate(EURUSD = EURJPY/USDJPY)

        date USDJPY EURJPY  SPX   EURUSD
1 2023-01-01  130.0  140.0 3800 1.076923
2 2023-01-02  131.0  141.0 3900 1.076336
3 2023-01-03  130.5  143.5 4000 1.099617
Python
(df
 .with_column((pl.col("EURJPY")/pl.col("USDJPY")).alias("EURUSD"))
)
date	    USDJPY	EURJPY	SPX	    EURUSD
date	    f64	    f64	    i64	    f64
2023-01-01	130.0	140.0	3800	1.076923
2023-01-02	131.0	141.0	3900	1.076336
2023-01-03	130.5	143.5	4000	1.099617

※複数追加する場合はwith_columnsを使う

行の選択(filter)

R
df %>%
  filter(date == "2023/1/2")

        date USDJPY EURJPY  SPX
1 2023-01-02    131    141 3900
Python
(df
 .filter(pl.col("date")=="2023-01-02")    
)
date	    USDJPY	EURJPY	SPX
date	    f64	    f64	    i64
2023-01-02	131.0	141.0	3900

ソート(arrange)

R
# 昇順
df %>%
 arrange(date)

        date USDJPY EURJPY  SPX
1 2023-01-01  130.0  140.0 3800
2 2023-01-02  131.0  141.0 3900
3 2023-01-03  130.5  143.5 4000

# 降順
df %>%
 arrange(desc(date))
        date USDJPY EURJPY  SPX
1 2023-01-03  130.5  143.5 4000
2 2023-01-02  131.0  141.0 3900
3 2023-01-01  130.0  140.0 3800
Python
# 昇順
(df
 .sort(by = "date")
 )
date	    USDJPY	EURJPY	SPX
date	    f64	    f64	    i64
2023-01-01	130.0	140.0	3800
2023-01-02	131.0	141.0	3900
2023-01-03	130.5	143.5	4000

# 降順
(df
 .sort(by = "date",descending=True)
 )
date	    USDJPY	EURJPY	SPX
date	    f64	    f64	    i64
2023-01-03	130.5	143.5	4000
2023-01-02	131.0	141.0	3900
2023-01-01	130.0	140.0	3800

重複行の削除(distinct)

R
# 全列が重複する行を削除(この例では完全一致する行はないので何も消えない)
df_long %>%
   distinct()

        date  index  value
1 2023-01-01 USDJPY  130.0
2 2023-01-01 EURJPY  140.0
3 2023-01-01    SPX 3800.0
4 2023-01-02 USDJPY  131.0
5 2023-01-02 EURJPY  141.0
6 2023-01-02    SPX 3900.0
7 2023-01-03 USDJPY  130.5
8 2023-01-03 EURJPY  143.5
9 2023-01-03    SPX 4000.0

# date列が重複する行を削除
df_long %>%
  distinct(date,.keep_all = TRUE)

        date  index value
1 2023-01-01 USDJPY 130.0
2 2023-01-02 USDJPY 131.0
3 2023-01-03 USDJPY 130.5
Python
# 全列が重複する行を削除(この例では完全一致する行はないので何も消えない)
(df_long
 .unique()
)
date	    index	    value
date	    str	        f64
2023-01-01	"USDJPY"	130.0
2023-01-01	"EURJPY"	140.0
2023-01-01	"SPX"	    3800.0
2023-01-02	"USDJPY"	131.0
2023-01-02	"EURJPY"	141.0
2023-01-02	"SPX"	    3900.0
2023-01-03	"USDJPY"	130.5
2023-01-03	"EURJPY"	143.5
2023-01-03	"SPX"	    4000.0

# date列が重複する行を削除
(df_long
 .unique(subset = "date")
)
date	    index	    value
date	    str	        f64
2023-01-01	"USDJPY"	130.0
2023-01-02	"USDJPY"	131.0
2023-01-03	"USDJPY"	130.5

縦型データに変換(pivot_longer)

R
df %>%
 pivot_longer(cols = -date,names_to = currency,values_to = value)

        date  index  value
1 2023-01-01 USDJPY  130.0
2 2023-01-01 EURJPY  140.0
3 2023-01-01    SPX 3800.0
4 2023-01-02 USDJPY  131.0
5 2023-01-02 EURJPY  141.0
6 2023-01-02    SPX 3900.0
7 2023-01-03 USDJPY  130.5
8 2023-01-03 EURJPY  143.5
9 2023-01-03    SPX 4000.0
Python
(df
 .melt(id_vars ="date",variable_name="index",value_name="value")
)
date	    index	    value
date	    str	        f64
2023-01-01	"USDJPY"	130.0
2023-01-02	"USDJPY"	131.0
2023-01-03	"USDJPY"	130.5
2023-01-01	"EURJPY"	140.0
2023-01-02	"EURJPY"	141.0
2023-01-03	"EURJPY"	143.5
2023-01-01	"SPX"	    3800.0
2023-01-02	"SPX"	    3900.0
2023-01-03	"SPX"	    4000.0

value_varsで指定するパターンもある

横型データに変換(pivot_wider)

R
df_long %>%
 pivot_wider(id_cols = "date", names_from = index,values_from = value)

        date USDJPY EURJPY  SPX
1 2023-01-01  130.0  140.0 3800
2 2023-01-02  131.0  141.0 3900
3 2023-01-03  130.5  143.5 4000
Python
(df_long
 .pivot(index = "date", columns = "index", values = "value")
)
date	    USDJPY	EURJPY	SPX
date	    f64	    f64	    f64
2023-01-01	130.0	140.0	3800.0
2023-01-02	131.0	141.0	3900.0
2023-01-03	130.5	143.5	4000.0

グループ化(group_by)

R
df_long %>%
  group_by(index) %>%
  summarize(ave = mean(value),
            max = max(value),
            min = min(value))

   index    ave    max  min
1 EURJPY  141.5  143.5  140
2    SPX 3900.0 4000.0 3800
3 USDJPY  130.5  131.0  130

Python
(df_long
 .groupby("index")
 .agg([pl.avg("value").alias("ave"),
       pl.max("value").alias("max"),
       pl.min("value").alias("min")])
) 
index	    ave	    max	    min
str	        f64	    f64	    f64
"EURJPY"	141.5	143.5	140.0
"SPX"	    3900.0	4000.0	3800.0
"USDJPY"	130.5	131.0	130.0

データフレームのキー結合(XXX_join)

joinによる結合の詳しい仕組みはここでは解説せず、コードのみを記載します。

R
# 結合するデータフレームを準備
df2 <- 
  data.frame(index = c("USDJPY","TPX","SPX"),
             asset = c("FX","EQUITY","EQUITY"))
df2
   index  asset
1 USDJPY     FX
2    TPX EQUITY
3    SPX EQUITY

# 左側結合
df_long %>%
  left_join(df2,by = "index")

        date  index  value  asset
1 2023-01-01 USDJPY  130.0     FX
2 2023-01-01 EURJPY  140.0   <NA>
3 2023-01-01    SPX 3800.0 EQUITY
4 2023-01-02 USDJPY  131.0     FX
5 2023-01-02 EURJPY  141.0   <NA>
6 2023-01-02    SPX 3900.0 EQUITY
7 2023-01-03 USDJPY  130.5     FX
8 2023-01-03 EURJPY  143.5   <NA>
9 2023-01-03    SPX 4000.0 EQUITY

# 内部結合
df_long %>%
  inner_join(df2,by = "index")

        date  index  value  asset
1 2023-01-01 USDJPY  130.0     FX
2 2023-01-01    SPX 3800.0 EQUITY
3 2023-01-02 USDJPY  131.0     FX
4 2023-01-02    SPX 3900.0 EQUITY
5 2023-01-03 USDJPY  130.5     FX
6 2023-01-03    SPX 4000.0 EQUITY

# 外部結合
df_long %>%
  full_join(df2,by = "index")

         date  index  value  asset
1  2023-01-01 USDJPY  130.0     FX
2  2023-01-01 EURJPY  140.0   <NA>
3  2023-01-01    SPX 3800.0 EQUITY
4  2023-01-02 USDJPY  131.0     FX
5  2023-01-02 EURJPY  141.0   <NA>
6  2023-01-02    SPX 3900.0 EQUITY
7  2023-01-03 USDJPY  130.5     FX
8  2023-01-03 EURJPY  143.5   <NA>
9  2023-01-03    SPX 4000.0 EQUITY
10       <NA>    TPX     NA EQUITY
Python
# 結合するデータフレームを準備
df2 = pl.DataFrame({"index":["USDJPY","TPX","SPX"],
                    "asset":["FX","EQUITY","EQUITY"]})
df2
index	    asset
str	        str
"USDJPY"	"FX"
"TPX"	    "EQUITY"
"SPX"	    "EQUITY"

# 左側結合
(df_long
 .join(df2, on = "index", how = "left")
)
date	    index	    value	asset
date	    str	        f64	    str
2023-01-01	"USDJPY"	130.0	"FX"
2023-01-01	"EURJPY"	140.0	null
2023-01-01	"SPX"	    3800.0	"EQUITY"
2023-01-02	"USDJPY"	131.0	"FX"
2023-01-02	"EURJPY"	141.0	null
2023-01-02	"SPX"	    3900.0	"EQUITY"
2023-01-03	"USDJPY"	130.5	"FX"
2023-01-03	"EURJPY"	143.5	null
2023-01-03	"SPX"	    4000.0	"EQUITY"

# 内部結合
(df_long
 .join(df2, on = "index", how = "inner")
)
date	    index	    value	asset
date	    str	        f64	    str
2023-01-01	"USDJPY"	130.0	"FX"
2023-01-01	"SPX"	    3800.0	"EQUITY"
2023-01-02	"USDJPY"	131.0	"FX"
2023-01-02	"SPX"	    3900.0	"EQUITY"
2023-01-03	"USDJPY"	130.5	"FX"
2023-01-03	"SPX"	    4000.0	"EQUITY"

# 外部結合
(df_long
 .join(df2, on = "index", how = "outer")
)
date	    index	    value	asset
date	    str	        f64	    str
2023-01-01	"USDJPY"	130.0	"FX"
2023-01-01	"EURJPY"	140.0	null
2023-01-01	"SPX"	    3800.0	"EQUITY"
2023-01-02	"USDJPY"	131.0	"FX"
2023-01-02	"EURJPY"	141.0	null
2023-01-02	"SPX"	    3900.0	"EQUITY"
2023-01-03	"USDJPY"	130.5	"FX"
2023-01-03	"EURJPY"	143.5	null
2023-01-03	"SPX"	    4000.0	"EQUITY"
null	    "TPX"	    null	"EQUITY"

条件分岐(if_else)

R
df_long %>%
  mutate(value2 = if_else(index == "USDJPY",100,value))

        date  index  value value2
1 2023-01-01 USDJPY  130.0  100.0
2 2023-01-01 EURJPY  140.0  140.0
3 2023-01-01    SPX 3800.0 3800.0
4 2023-01-02 USDJPY  131.0  100.0
5 2023-01-02 EURJPY  141.0  141.0
6 2023-01-02    SPX 3900.0 3900.0
7 2023-01-03 USDJPY  130.5  100.0
8 2023-01-03 EURJPY  143.5  143.5
9 2023-01-03    SPX 4000.0 4000.0
Python
# pl.when(条件式).then(Trueの値).otherwise(Falseの値)
(df_long
 .with_columns((pl.when(pl.col("index") == "USDJPY").then(100).otherwise(pl.col("value"))).alias("value2"))
 )
date	    index	    value	value2
date	    str	        f64  	f64
2023-01-01	"USDJPY"	130.0	100.0
2023-01-01	"EURJPY"	140.0	140.0
2023-01-01	"SPX"	    3800.0	3800.0
2023-01-02	"USDJPY"	131.0	100.0
2023-01-02	"EURJPY"	141.0	141.0
2023-01-02	"SPX"	    3900.0	3900.0
2023-01-03	"USDJPY"	130.5	100.0
2023-01-03	"EURJPY"	143.5	143.5
2023-01-03	"SPX"	    4000.0	4000.0

条件分岐(case_when)

R
# case when
df_long %>%
  mutate(value2 = case_when(index == "USDJPY" ~ 1,
                            index == "EURJPY" ~ 2,
                            TRUE ~ 3))

        date  index  value value2
1 2023-01-01 USDJPY  130.0      1
2 2023-01-01 EURJPY  140.0      2
3 2023-01-01    SPX 3800.0      3
4 2023-01-02 USDJPY  131.0      1
5 2023-01-02 EURJPY  141.0      2
6 2023-01-02    SPX 3900.0      3
7 2023-01-03 USDJPY  130.5      1
8 2023-01-03 EURJPY  143.5      2
9 2023-01-03    SPX 4000.0      3
Python
(df_long
 .with_columns((pl.when(pl.col("index") == "USDJPY").then(1).
                when(pl.col("index")=="EURJPY").then(2).
                otherwise(3)).alias("value2"))
 )
date	    index	    value	value2
date	    str	        f64	    i32
2023-01-01	"USDJPY"	130.0	1
2023-01-01	"EURJPY"	140.0	2
2023-01-01	"SPX"	    3800.0	3
2023-01-02	"USDJPY"	131.0	1
2023-01-02	"EURJPY"	141.0	2
2023-01-02	"SPX"	    3900.0	3
2023-01-03	"USDJPY"	130.5	1
2023-01-03	"EURJPY"	143.5	2
2023-01-03	"SPX"	    4000.0	3

列の値をずらす(lag ,lead)

R
df %>%
  mutate(USDJPY_lag1 = lag(USDJPY,1),
         USDJPY_lead1 = lead(USDJPY,1))

        date  USDJPY EURJPY  SPX    USDJPY_lag1 USDJPY_lead1
1 2023-01-01  130.0  140.0   3800   NA          131.0
2 2023-01-02  131.0  141.0   3900   130.0       130.5
3 2023-01-03  130.5  143.5   4000   131.0       NA
Python
(df
 .with_columns([pl.col("USDJPY").shift(1).alias("USDJPY_lag1"),
                pl.col("USDJPY").shift(-1).alias("USDJPY_lead1")])
)
date	    USDJPY	EURJPY	SPX	   USDJPY_lag1	USDJPY_lead1
date	    f64	    f64	    i64	   f64	        f64
2023-01-01	130.0	140.0	3800   null	        131.0
2023-01-02	131.0	141.0	3900   130.0	    130.5
2023-01-03	130.5	143.5	4000   131.0	    null

参考ページ

6
6
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
6
6