今回はPandasとPolarsでohlcvデータの時系列のダウンサンプリング(分足から5分足)とohlcvデータ(分足から5分足)の作成の比較を行います.結論から言うと,pandasとpolarsはほぼ同じようにダウンサンプリングとOHLCの作成ができました.
import pandas as pd
import polars as pl
import re
データの読み込み
pandasで読み込みます.6502
,4755
,9984
の11月4日から11月30日の分足データ(OHLCV)です.
pandas_stock_df = pd.read_parquet("naive_multi_stock_df.parquet")
print(pandas_stock_df.dropna()[:5])
Open_6502 High_6502 Low_6502 Close_6502 Volume_6502 \
timestamp
2020-11-04 00:00:00 2669.0 2670.0 2658.0 2664.0 93000.0
2020-11-04 00:01:00 2663.0 2664.0 2650.0 2652.0 17600.0
2020-11-04 00:02:00 2649.0 2655.0 2646.0 2649.0 19200.0
2020-11-04 00:03:00 2652.0 2670.0 2651.0 2670.0 31200.0
2020-11-04 00:04:00 2671.0 2674.0 2670.0 2674.0 12800.0
Open_4755 High_4755 Low_4755 Close_4755 Volume_4755 \
timestamp
2020-11-04 00:00:00 1053.0 1056.0 1045.0 1048.0 670700.0
2020-11-04 00:01:00 1049.0 1050.0 1046.0 1048.0 107400.0
2020-11-04 00:02:00 1049.0 1053.0 1046.0 1051.0 80600.0
2020-11-04 00:03:00 1050.0 1051.0 1048.0 1049.0 112700.0
2020-11-04 00:04:00 1049.0 1050.0 1046.0 1047.0 97900.0
Open_9984 High_9984 Low_9984 Close_9984 Volume_9984
timestamp
2020-11-04 00:00:00 6600.0 6622.0 6500.0 6539.0 1185000.0
2020-11-04 00:01:00 6540.0 6541.0 6471.0 6481.0 477700.0
2020-11-04 00:02:00 6484.0 6510.0 6467.0 6510.0 355400.0
2020-11-04 00:03:00 6511.0 6520.0 6489.0 6517.0 266800.0
2020-11-04 00:04:00 6517.0 6535.0 6498.0 6533.0 280300.0
polarsで読み込みます.
polars_stock_df = pl.read_parquet("naive_multi_stock_df.parquet")
print(polars_stock_df.drop_nulls()[:5])
shape: (5, 16)
╭───────────┬───────────┬───────────┬───────────┬─────┬───────────┬──────────┬──────────┬──────────╮
│ Open_6502 ┆ High_6502 ┆ Low_6502 ┆ Close_650 ┆ ... ┆ Low_9984 ┆ Close_99 ┆ Volume_9 ┆ timestam │
│ --- ┆ --- ┆ --- ┆ 2 ┆ ┆ --- ┆ 84 ┆ 984 ┆ p │
│ f64 ┆ f64 ┆ f64 ┆ --- ┆ ┆ f64 ┆ --- ┆ --- ┆ --- │
│ ┆ ┆ ┆ f64 ┆ ┆ ┆ f64 ┆ f64 ┆ date64(m │
│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ s) │
╞═══════════╪═══════════╪═══════════╪═══════════╪═════╪═══════════╪══════════╪══════════╪══════════╡
│ 2669 ┆ 2670 ┆ 2658 ┆ 2664 ┆ ... ┆ 6500 ┆ 6539 ┆ 1.185e6 ┆ 2020-11- │
│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 04 │
│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 00:00:00 │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 2663 ┆ 2664 ┆ 2650 ┆ 2652 ┆ ... ┆ 6471 ┆ 6481 ┆ 4.777e5 ┆ 2020-11- │
│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 04 │
│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 00:01:00 │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 2649 ┆ 2655 ┆ 2646 ┆ 2649 ┆ ... ┆ 6467 ┆ 6510 ┆ 3.554e5 ┆ 2020-11- │
│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 04 │
│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 00:02:00 │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 2652 ┆ 2670 ┆ 2651 ┆ 2670 ┆ ... ┆ 6489 ┆ 6517 ┆ 2.668e5 ┆ 2020-11- │
│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 04 │
│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 00:03:00 │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 2671 ┆ 2674 ┆ 2670 ┆ 2674 ┆ ... ┆ 6498 ┆ 6533 ┆ 2.803e5 ┆ 2020-11- │
│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 04 │
│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 00:04:00 │
╰───────────┴───────────┴───────────┴───────────┴─────┴───────────┴──────────┴──────────┴──────────╯
ダウンサンプリング
カラム名と処理の対応
カラム名とそのカラムに適用する処理の文字列を対応づけた辞書(agg_dict)を用意します.
ohlcv_patterns = {
"Open":re.compile("Open_.+"),
"High":re.compile("High_.+"),
"Low":re.compile("Low_.+"),
"Close":re.compile("Close_.+"),
"Volume":re.compile("Volume_.+")
}
pandas.Seriesを利用することで簡潔に作成できます.
column_series = pd.Series(len(pandas_stock_df.columns)*[None], index=pandas_stock_df.columns)
column_series.loc[column_series.index.str.match(ohlcv_patterns["Open"])] = "first"
column_series.loc[column_series.index.str.match(ohlcv_patterns["High"])] = "max"
column_series.loc[column_series.index.str.match(ohlcv_patterns["Low"])] = "min"
column_series.loc[column_series.index.str.match(ohlcv_patterns["Close"])] = "last"
column_series.loc[column_series.index.str.match(ohlcv_patterns["Volume"])] = "sum"
agg_dict = dict(column_series)
print(agg_dict)
{'Open_6502': 'first', 'High_6502': 'max', 'Low_6502': 'min', 'Close_6502': 'last', 'Volume_6502': 'sum', 'Open_4755': 'first', 'High_4755': 'max', 'Low_4755': 'min', 'Close_4755': 'last', 'Volume_4755': 'sum', 'Open_9984': 'first', 'High_9984': 'max', 'Low_9984': 'min', 'Close_9984': 'last', 'Volume_9984': 'sum'}
pandasを使いたくない場合は以下のようにできます.後で必要になるので,rename_dictも作っておきます.
agg_dict = {}
rename_dict = {}
for column_name in polars_stock_df.columns:
if ohlcv_patterns["Open"].match(column_name):
agg_dict[column_name] = "first"
rename_dict[column_name + "_first"] = column_name
elif ohlcv_patterns["High"].match(column_name):
agg_dict[column_name] = "max"
rename_dict[column_name + "_max"] = column_name
elif ohlcv_patterns["Low"].match(column_name):
agg_dict[column_name] = "min"
rename_dict[column_name + "_min"] = column_name
elif ohlcv_patterns["Close"].match(column_name):
agg_dict[column_name] = "last"
rename_dict[column_name + "_last"] = column_name
elif ohlcv_patterns["Volume"].match(column_name):
agg_dict[column_name] = "sum"
rename_dict[column_name + "_sum"] = column_name
print(agg_dict)
print(rename_dict)
{'Open_6502': 'first', 'High_6502': 'max', 'Low_6502': 'min', 'Close_6502': 'last', 'Volume_6502': 'sum', 'Open_4755': 'first', 'High_4755': 'max', 'Low_4755': 'min', 'Close_4755': 'last', 'Volume_4755': 'sum', 'Open_9984': 'first', 'High_9984': 'max', 'Low_9984': 'min', 'Close_9984': 'last', 'Volume_9984': 'sum'}
{'Open_6502_first': 'Open_6502', 'High_6502_max': 'High_6502', 'Low_6502_min': 'Low_6502', 'Close_6502_last': 'Close_6502', 'Volume_6502_sum': 'Volume_6502', 'Open_4755_first': 'Open_4755', 'High_4755_max': 'High_4755', 'Low_4755_min': 'Low_4755', 'Close_4755_last': 'Close_4755', 'Volume_4755_sum': 'Volume_4755', 'Open_9984_first': 'Open_9984', 'High_9984_max': 'High_9984', 'Low_9984_min': 'Low_9984', 'Close_9984_last': 'Close_9984', 'Volume_9984_sum': 'Volume_9984'}
pandas
downsampled_pandas_stock_df = pandas_stock_df.resample("5T", label="left", closed="left").agg(agg_dict)
print(downsampled_pandas_stock_df.dropna()[:5])
Open_6502 High_6502 Low_6502 Close_6502 Volume_6502 \
timestamp
2020-11-04 00:00:00 2669.0 2674.0 2646.0 2674.0 173800.0
2020-11-04 00:05:00 2672.0 2675.0 2664.0 2669.0 36600.0
2020-11-04 00:10:00 2669.0 2671.0 2659.0 2660.0 33100.0
2020-11-04 00:15:00 2660.0 2663.0 2653.0 2657.0 46500.0
2020-11-04 00:20:00 2656.0 2659.0 2650.0 2654.0 37800.0
Open_4755 High_4755 Low_4755 Close_4755 Volume_4755 \
timestamp
2020-11-04 00:00:00 1053.0 1056.0 1045.0 1047.0 1069300.0
2020-11-04 00:05:00 1047.0 1066.0 1047.0 1065.0 527500.0
2020-11-04 00:10:00 1066.0 1066.0 1047.0 1051.0 423700.0
2020-11-04 00:15:00 1051.0 1056.0 1046.0 1047.0 223100.0
2020-11-04 00:20:00 1048.0 1056.0 1047.0 1054.0 309500.0
Open_9984 High_9984 Low_9984 Close_9984 Volume_9984
timestamp
2020-11-04 00:00:00 6600.0 6622.0 6467.0 6533.0 2565200.0
2020-11-04 00:05:00 6534.0 6536.0 6480.0 6513.0 871200.0
2020-11-04 00:10:00 6515.0 6558.0 6513.0 6548.0 569500.0
2020-11-04 00:15:00 6548.0 6549.0 6502.0 6521.0 472700.0
2020-11-04 00:20:00 6520.0 6564.0 6510.0 6559.0 399400.0
polars
downsampled_polars_stock_df = (
polars_stock_df.downsample("timestamp", "minute", 5)
.agg(agg_dict)
)
print(downsampled_polars_stock_df.drop_nulls()[:5])
shape: (5, 16)
╭───────────┬───────────┬───────────┬───────────┬─────┬───────────┬──────────┬──────────┬──────────╮
│ timestamp ┆ Open_6502 ┆ High_6502 ┆ Low_6502_ ┆ ... ┆ High_9984 ┆ Low_9984 ┆ Close_99 ┆ Volume_9 │
│ --- ┆ _first ┆ _max ┆ min ┆ ┆ _max ┆ _min ┆ 84_last ┆ 984_sum │
│ date64(ms ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │
│ ) ┆ f64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │
╞═══════════╪═══════════╪═══════════╪═══════════╪═════╪═══════════╪══════════╪══════════╪══════════╡
│ 2020-11-0 ┆ 2669 ┆ 2674 ┆ 2646 ┆ ... ┆ 6622 ┆ 6467 ┆ 6533 ┆ 2.5652e6 │
│ 4 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │
│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 2020-11-0 ┆ 2672 ┆ 2675 ┆ 2664 ┆ ... ┆ 6536 ┆ 6480 ┆ 6513 ┆ 8.712e5 │
│ 4 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │
│ 00:05:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 2020-11-0 ┆ 2669 ┆ 2671 ┆ 2659 ┆ ... ┆ 6558 ┆ 6513 ┆ 6548 ┆ 5.695e5 │
│ 4 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │
│ 00:10:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 2020-11-0 ┆ 2660 ┆ 2663 ┆ 2653 ┆ ... ┆ 6549 ┆ 6502 ┆ 6521 ┆ 4.727e5 │
│ 4 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │
│ 00:15:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 2020-11-0 ┆ 2656 ┆ 2659 ┆ 2650 ┆ ... ┆ 6564 ┆ 6510 ┆ 6559 ┆ 3.994e5 │
│ 4 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │
│ 00:20:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │
╰───────────┴───────────┴───────────┴───────────┴─────┴───────────┴──────────┴──────────┴──────────╯
カラム名を変更します.
downsampled_polars_stock_df = downsampled_polars_stock_df.rename(rename_dict)
print(downsampled_polars_stock_df.drop_nulls()[:5])
shape: (5, 16)
╭───────────┬───────────┬───────────┬───────────┬─────┬───────────┬──────────┬──────────┬──────────╮
│ timestamp ┆ Open_6502 ┆ High_6502 ┆ Low_6502 ┆ ... ┆ High_9984 ┆ Low_9984 ┆ Close_99 ┆ Volume_9 │
│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ 84 ┆ 984 │
│ date64(ms ┆ f64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ f64 ┆ --- ┆ --- │
│ ) ┆ ┆ ┆ ┆ ┆ ┆ ┆ f64 ┆ f64 │
╞═══════════╪═══════════╪═══════════╪═══════════╪═════╪═══════════╪══════════╪══════════╪══════════╡
│ 2020-11-0 ┆ 2669 ┆ 2674 ┆ 2646 ┆ ... ┆ 6622 ┆ 6467 ┆ 6533 ┆ 2.5652e6 │
│ 4 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │
│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 2020-11-0 ┆ 2672 ┆ 2675 ┆ 2664 ┆ ... ┆ 6536 ┆ 6480 ┆ 6513 ┆ 8.712e5 │
│ 4 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │
│ 00:05:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 2020-11-0 ┆ 2669 ┆ 2671 ┆ 2659 ┆ ... ┆ 6558 ┆ 6513 ┆ 6548 ┆ 5.695e5 │
│ 4 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │
│ 00:10:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 2020-11-0 ┆ 2660 ┆ 2663 ┆ 2653 ┆ ... ┆ 6549 ┆ 6502 ┆ 6521 ┆ 4.727e5 │
│ 4 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │
│ 00:15:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 2020-11-0 ┆ 2656 ┆ 2659 ┆ 2650 ┆ ... ┆ 6564 ┆ 6510 ┆ 6559 ┆ 3.994e5 │
│ 4 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │
│ 00:20:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │
╰───────────┴───────────┴───────────┴───────────┴─────┴───────────┴──────────┴──────────┴──────────╯
aggに文字列の辞書ではなくpolars.col
などのpolars.lazy.expr
を用いる方法が分かりませんでした.
一応手元のノートバソコン環境で速度を比較してみます.
%%timeit
downsampled_pandas_stock_df = pandas_stock_df.resample("5T", label="left", closed="left").agg(agg_dict)
25 ms ± 1.15 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
%%timeit
downsampled_polars_stock_df = (
polars_stock_df.downsample("timestamp", "minute", 5)
.agg(agg_dict)
)
downsampled_polars_stock_df = downsampled_polars_stock_df.rename(rename_dict)
11.2 ms ± 2.62 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)
OHLCVの作成
時系列の価格データとしてOpenを利用し,それとVolumeを利用してOHLCVをダウンサンプリングします.
sub_pandas_stock_df = pandas_stock_df.loc[:,["Open_6502", "Volume_6502", "Open_4755", "Volume_4755", "Open_9984", "Volume_9984"]]
print(sub_pandas_stock_df.dropna()[:5])
Open_6502 Volume_6502 Open_4755 Volume_4755 \
timestamp
2020-11-04 00:00:00 2669.0 93000.0 1053.0 670700.0
2020-11-04 00:01:00 2663.0 17600.0 1049.0 107400.0
2020-11-04 00:02:00 2649.0 19200.0 1049.0 80600.0
2020-11-04 00:03:00 2652.0 31200.0 1050.0 112700.0
2020-11-04 00:04:00 2671.0 12800.0 1049.0 97900.0
Open_9984 Volume_9984
timestamp
2020-11-04 00:00:00 6600.0 1185000.0
2020-11-04 00:01:00 6540.0 477700.0
2020-11-04 00:02:00 6484.0 355400.0
2020-11-04 00:03:00 6511.0 266800.0
2020-11-04 00:04:00 6517.0 280300.0
sub_polars_stock_df = polars_stock_df[["timestamp","Open_6502", "Volume_6502", "Open_4755", "Volume_4755", "Open_9984", "Volume_9984"]]
print(sub_polars_stock_df.drop_nulls()[:5])
shape: (5, 7)
╭────────────────────┬───────────┬─────────────┬───────────┬─────────────┬───────────┬─────────────╮
│ timestamp ┆ Open_6502 ┆ Volume_6502 ┆ Open_4755 ┆ Volume_4755 ┆ Open_9984 ┆ Volume_9984 │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ date64(ms) ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │
╞════════════════════╪═══════════╪═════════════╪═══════════╪═════════════╪═══════════╪═════════════╡
│ 2020-11-04 ┆ 2669 ┆ 9.3e4 ┆ 1053 ┆ 6.707e5 ┆ 6600 ┆ 1.185e6 │
│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2020-11-04 ┆ 2663 ┆ 1.76e4 ┆ 1049 ┆ 1.074e5 ┆ 6540 ┆ 4.777e5 │
│ 00:01:00 ┆ ┆ ┆ ┆ ┆ ┆ │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2020-11-04 ┆ 2649 ┆ 1.92e4 ┆ 1049 ┆ 8.06e4 ┆ 6484 ┆ 3.554e5 │
│ 00:02:00 ┆ ┆ ┆ ┆ ┆ ┆ │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2020-11-04 ┆ 2652 ┆ 3.12e4 ┆ 1050 ┆ 1.127e5 ┆ 6511 ┆ 2.668e5 │
│ 00:03:00 ┆ ┆ ┆ ┆ ┆ ┆ │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2020-11-04 ┆ 2671 ┆ 1.28e4 ┆ 1049 ┆ 9.79e4 ┆ 6517 ┆ 2.803e5 │
│ 00:04:00 ┆ ┆ ┆ ┆ ┆ ┆ │
╰────────────────────┴───────────┴─────────────┴───────────┴─────────────┴───────────┴─────────────╯
カラム名と処理の対応
agg_dict2 = {}
for column_name in sub_pandas_stock_df.columns:
if ohlcv_patterns["Open"].match(column_name):
agg_dict2[column_name] = ["first", "max", "min", "last"]
elif ohlcv_patterns["Volume"].match(column_name):
agg_dict2[column_name] = "sum"
print(agg_dict2)
{'Open_6502': ['first', 'max', 'min', 'last'], 'Volume_6502': 'sum', 'Open_4755': ['first', 'max', 'min', 'last'], 'Volume_4755': 'sum', 'Open_9984': ['first', 'max', 'min', 'last'], 'Volume_9984': 'sum'}
pandas
複数の銘柄があるので,ohlcメソッドは利用せずにresampleで行います.
ohlcv_pandas_stock_df = sub_pandas_stock_df.resample("5T", label="left", closed="left").agg(agg_dict2)
print(ohlcv_pandas_stock_df.dropna()[:5])
Open_6502 Volume_6502 Open_4755 \
first max min last sum first
timestamp
2020-11-04 00:00:00 2669.0 2671.0 2649.0 2671.0 173800.0 1053.0
2020-11-04 00:05:00 2672.0 2673.0 2666.0 2668.0 36600.0 1047.0
2020-11-04 00:10:00 2669.0 2669.0 2662.0 2662.0 33100.0 1066.0
2020-11-04 00:15:00 2660.0 2663.0 2655.0 2657.0 46500.0 1051.0
2020-11-04 00:20:00 2656.0 2656.0 2652.0 2652.0 37800.0 1048.0
Volume_4755 Open_9984 \
max min last sum first max
timestamp
2020-11-04 00:00:00 1053.0 1049.0 1049.0 1069300.0 6600.0 6600.0
2020-11-04 00:05:00 1060.0 1047.0 1060.0 527500.0 6534.0 6534.0
2020-11-04 00:10:00 1066.0 1048.0 1048.0 423700.0 6515.0 6543.0
2020-11-04 00:15:00 1052.0 1047.0 1049.0 223100.0 6548.0 6548.0
2020-11-04 00:20:00 1055.0 1047.0 1052.0 309500.0 6520.0 6541.0
Volume_9984
min last sum
timestamp
2020-11-04 00:00:00 6484.0 6517.0 2565200.0
2020-11-04 00:05:00 6493.0 6495.0 871200.0
2020-11-04 00:10:00 6515.0 6531.0 569500.0
2020-11-04 00:15:00 6508.0 6512.0 472700.0
2020-11-04 00:20:00 6518.0 6541.0 399400.0
マルチインデックスになっているので,分かりやすさのためflattenします.
ohlcv_pandas_stock_df.columns = ohlcv_pandas_stock_df.columns.to_flat_index()
print(ohlcv_pandas_stock_df.dropna()[:5])
(Open_6502, first) (Open_6502, max) (Open_6502, min) \
timestamp
2020-11-04 00:00:00 2669.0 2671.0 2649.0
2020-11-04 00:05:00 2672.0 2673.0 2666.0
2020-11-04 00:10:00 2669.0 2669.0 2662.0
2020-11-04 00:15:00 2660.0 2663.0 2655.0
2020-11-04 00:20:00 2656.0 2656.0 2652.0
(Open_6502, last) (Volume_6502, sum) \
timestamp
2020-11-04 00:00:00 2671.0 173800.0
2020-11-04 00:05:00 2668.0 36600.0
2020-11-04 00:10:00 2662.0 33100.0
2020-11-04 00:15:00 2657.0 46500.0
2020-11-04 00:20:00 2652.0 37800.0
(Open_4755, first) (Open_4755, max) (Open_4755, min) \
timestamp
2020-11-04 00:00:00 1053.0 1053.0 1049.0
2020-11-04 00:05:00 1047.0 1060.0 1047.0
2020-11-04 00:10:00 1066.0 1066.0 1048.0
2020-11-04 00:15:00 1051.0 1052.0 1047.0
2020-11-04 00:20:00 1048.0 1055.0 1047.0
(Open_4755, last) (Volume_4755, sum) \
timestamp
2020-11-04 00:00:00 1049.0 1069300.0
2020-11-04 00:05:00 1060.0 527500.0
2020-11-04 00:10:00 1048.0 423700.0
2020-11-04 00:15:00 1049.0 223100.0
2020-11-04 00:20:00 1052.0 309500.0
(Open_9984, first) (Open_9984, max) (Open_9984, min) \
timestamp
2020-11-04 00:00:00 6600.0 6600.0 6484.0
2020-11-04 00:05:00 6534.0 6534.0 6493.0
2020-11-04 00:10:00 6515.0 6543.0 6515.0
2020-11-04 00:15:00 6548.0 6548.0 6508.0
2020-11-04 00:20:00 6520.0 6541.0 6518.0
(Open_9984, last) (Volume_9984, sum)
timestamp
2020-11-04 00:00:00 6517.0 2565200.0
2020-11-04 00:05:00 6495.0 871200.0
2020-11-04 00:10:00 6531.0 569500.0
2020-11-04 00:15:00 6512.0 472700.0
2020-11-04 00:20:00 6541.0 399400.0
カラム名を変更します.
rename_dict_2 = {}
for column_name, agg_str in ohlcv_pandas_stock_df.columns:
ticker_name = column_name.split("_")[-1]
if agg_str == "first":
rename_dict_2[(column_name, agg_str)] = "Open_" + ticker_name
elif agg_str == "max":
rename_dict_2[(column_name, agg_str)] = "Hign_" + ticker_name
elif agg_str == "min":
rename_dict_2[(column_name, agg_str)] = "Low_" + ticker_name
elif agg_str == "last":
rename_dict_2[(column_name, agg_str)] = "Close_" + ticker_name
elif agg_str == "sum":
rename_dict_2[(column_name, agg_str)] = "Volume_" + ticker_name
print(rename_dict_2)
{('Open_6502', 'first'): 'Open_6502', ('Open_6502', 'max'): 'Hign_6502', ('Open_6502', 'min'): 'Low_6502', ('Open_6502', 'last'): 'Close_6502', ('Volume_6502', 'sum'): 'Volume_6502', ('Open_4755', 'first'): 'Open_4755', ('Open_4755', 'max'): 'Hign_4755', ('Open_4755', 'min'): 'Low_4755', ('Open_4755', 'last'): 'Close_4755', ('Volume_4755', 'sum'): 'Volume_4755', ('Open_9984', 'first'): 'Open_9984', ('Open_9984', 'max'): 'Hign_9984', ('Open_9984', 'min'): 'Low_9984', ('Open_9984', 'last'): 'Close_9984', ('Volume_9984', 'sum'): 'Volume_9984'}
ohlcv_pandas_stock_df.rename(columns=rename_dict_2, inplace=True)
print(ohlcv_pandas_stock_df.dropna()[:5])
Open_6502 Hign_6502 Low_6502 Close_6502 Volume_6502 \
timestamp
2020-11-04 00:00:00 2669.0 2671.0 2649.0 2671.0 173800.0
2020-11-04 00:05:00 2672.0 2673.0 2666.0 2668.0 36600.0
2020-11-04 00:10:00 2669.0 2669.0 2662.0 2662.0 33100.0
2020-11-04 00:15:00 2660.0 2663.0 2655.0 2657.0 46500.0
2020-11-04 00:20:00 2656.0 2656.0 2652.0 2652.0 37800.0
Open_4755 Hign_4755 Low_4755 Close_4755 Volume_4755 \
timestamp
2020-11-04 00:00:00 1053.0 1053.0 1049.0 1049.0 1069300.0
2020-11-04 00:05:00 1047.0 1060.0 1047.0 1060.0 527500.0
2020-11-04 00:10:00 1066.0 1066.0 1048.0 1048.0 423700.0
2020-11-04 00:15:00 1051.0 1052.0 1047.0 1049.0 223100.0
2020-11-04 00:20:00 1048.0 1055.0 1047.0 1052.0 309500.0
Open_9984 Hign_9984 Low_9984 Close_9984 Volume_9984
timestamp
2020-11-04 00:00:00 6600.0 6600.0 6484.0 6517.0 2565200.0
2020-11-04 00:05:00 6534.0 6534.0 6493.0 6495.0 871200.0
2020-11-04 00:10:00 6515.0 6543.0 6515.0 6531.0 569500.0
2020-11-04 00:15:00 6548.0 6548.0 6508.0 6512.0 472700.0
2020-11-04 00:20:00 6520.0 6541.0 6518.0 6541.0 399400.0
polars
ohlcv_polars_stock_df = (
sub_polars_stock_df.downsample("timestamp", "minute", 5)
.agg(agg_dict2)
)
print(ohlcv_polars_stock_df.drop_nulls()[:5])
shape: (5, 16)
╭───────────┬───────────┬───────────┬───────────┬─────┬───────────┬──────────┬──────────┬──────────╮
│ timestamp ┆ Open_6502 ┆ Open_6502 ┆ Open_6502 ┆ ... ┆ Open_9984 ┆ Open_998 ┆ Open_998 ┆ Volume_9 │
│ --- ┆ _first ┆ _max ┆ _min ┆ ┆ _max ┆ 4_min ┆ 4_last ┆ 984_sum │
│ date64(ms ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │
│ ) ┆ f64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │
╞═══════════╪═══════════╪═══════════╪═══════════╪═════╪═══════════╪══════════╪══════════╪══════════╡
│ 2020-11-0 ┆ 2669 ┆ 2671 ┆ 2649 ┆ ... ┆ 6600 ┆ 6484 ┆ 6517 ┆ 2.5652e6 │
│ 4 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │
│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 2020-11-0 ┆ 2672 ┆ 2673 ┆ 2666 ┆ ... ┆ 6534 ┆ 6493 ┆ 6495 ┆ 8.712e5 │
│ 4 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │
│ 00:05:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 2020-11-0 ┆ 2669 ┆ 2669 ┆ 2662 ┆ ... ┆ 6543 ┆ 6515 ┆ 6531 ┆ 5.695e5 │
│ 4 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │
│ 00:10:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 2020-11-0 ┆ 2660 ┆ 2663 ┆ 2655 ┆ ... ┆ 6548 ┆ 6508 ┆ 6512 ┆ 4.727e5 │
│ 4 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │
│ 00:15:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 2020-11-0 ┆ 2656 ┆ 2656 ┆ 2652 ┆ ... ┆ 6541 ┆ 6518 ┆ 6541 ┆ 3.994e5 │
│ 4 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │
│ 00:20:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │
╰───────────┴───────────┴───────────┴───────────┴─────┴───────────┴──────────┴──────────┴──────────╯
カラム名を変更します.
rename_dict_3 = {}
for column_name in ohlcv_polars_stock_df.columns:
if len(column_name.split("_")) > 1:
ticker_name = column_name.split("_")[1]
agg_str = column_name.split("_")[2]
if agg_str == "first":
rename_dict_3[column_name] = "Open_" + ticker_name
elif agg_str == "max":
rename_dict_3[column_name] = "Hign_" + ticker_name
elif agg_str == "min":
rename_dict_3[column_name] = "Low_" + ticker_name
elif agg_str == "last":
rename_dict_3[column_name] = "Close_" + ticker_name
elif agg_str == "sum":
rename_dict_3[column_name] = "Volume_" + ticker_name
print(rename_dict_3)
{'Open_6502_first': 'Open_6502', 'Open_6502_max': 'Hign_6502', 'Open_6502_min': 'Low_6502', 'Open_6502_last': 'Close_6502', 'Volume_6502_sum': 'Volume_6502', 'Open_4755_first': 'Open_4755', 'Open_4755_max': 'Hign_4755', 'Open_4755_min': 'Low_4755', 'Open_4755_last': 'Close_4755', 'Volume_4755_sum': 'Volume_4755', 'Open_9984_first': 'Open_9984', 'Open_9984_max': 'Hign_9984', 'Open_9984_min': 'Low_9984', 'Open_9984_last': 'Close_9984', 'Volume_9984_sum': 'Volume_9984'}
ohlcv_polars_stock_df = ohlcv_polars_stock_df.rename(rename_dict_3)
print(ohlcv_polars_stock_df.drop_nulls()[:5])
shape: (5, 16)
╭───────────┬───────────┬───────────┬───────────┬─────┬───────────┬──────────┬──────────┬──────────╮
│ timestamp ┆ Open_6502 ┆ Hign_6502 ┆ Low_6502 ┆ ... ┆ Hign_9984 ┆ Low_9984 ┆ Close_99 ┆ Volume_9 │
│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ 84 ┆ 984 │
│ date64(ms ┆ f64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ f64 ┆ --- ┆ --- │
│ ) ┆ ┆ ┆ ┆ ┆ ┆ ┆ f64 ┆ f64 │
╞═══════════╪═══════════╪═══════════╪═══════════╪═════╪═══════════╪══════════╪══════════╪══════════╡
│ 2020-11-0 ┆ 2669 ┆ 2671 ┆ 2649 ┆ ... ┆ 6600 ┆ 6484 ┆ 6517 ┆ 2.5652e6 │
│ 4 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │
│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 2020-11-0 ┆ 2672 ┆ 2673 ┆ 2666 ┆ ... ┆ 6534 ┆ 6493 ┆ 6495 ┆ 8.712e5 │
│ 4 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │
│ 00:05:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 2020-11-0 ┆ 2669 ┆ 2669 ┆ 2662 ┆ ... ┆ 6543 ┆ 6515 ┆ 6531 ┆ 5.695e5 │
│ 4 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │
│ 00:10:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 2020-11-0 ┆ 2660 ┆ 2663 ┆ 2655 ┆ ... ┆ 6548 ┆ 6508 ┆ 6512 ┆ 4.727e5 │
│ 4 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │
│ 00:15:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 2020-11-0 ┆ 2656 ┆ 2656 ┆ 2652 ┆ ... ┆ 6541 ┆ 6518 ┆ 6541 ┆ 3.994e5 │
│ 4 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │
│ 00:20:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │
╰───────────┴───────────┴───────────┴───────────┴─────┴───────────┴──────────┴──────────┴──────────╯
これも,ノートパソコン環境で速度を比較してみます.
%%timeit
ohlcv_pandas_stock_df = sub_pandas_stock_df.resample("5T", label="left", closed="left").agg(agg_dict2)
73 ms ± 16.1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
%%timeit
ohlcv_polars_stock_df = (
sub_polars_stock_df.downsample("timestamp", "minute", 5)
.agg(agg_dict2)
)
8.59 ms ± 88.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
まとめ
- pandasとpolarsはほぼ同じようにダウンサンプリングとOHLCの作成ができます.
- ダウンサンプリングのaggで文字列の辞書を使わず
polars.lazy.expr
を使う方法が分かりませんでした.