1. 週&時間のHeatmap
def get_hour_day_of_week_heatmap_data(
df,
dt_col="datetime",
hour_names=[f"{hour}時" for hour in range(24)],
day_names=["月曜", "火曜", "水曜", "木曜", "金曜", "土曜", "日曜"],
):
"""dfと指定されたdt_colから曜日(7)x時間(24)のrecord数を計算しDataFrameを返す"""
return (
df.groupby(
[ # count activity for each hour and day of week & fill 0 for no activity range
df[dt_col].dt.day_of_week,
df[dt_col].dt.hour,
]
)
.size()
.unstack(fill_value=0)
.rename( # update index and column values from number to name
index={d: day_names[d] for d in range(7)}, columns={h: hour_names[h] for h in range(24)}
)
.reindex( # reindex by the full range 7 days x 24 hours
index=day_names,
columns=hour_names,
fill_value=0,
)
)
使用例
import pandas as pd
import plotly.express as px
df = pd.DataFrame({"datetime": ["2023-07-17 10:30:00", "2023-07-17 15:45:00", "2023-07-18 08:15:00"]})
df["datetime"] = pd.to_datetime(df["datetime"])
data_df = get_hour_day_of_week_heatmap_data(df, "datetime")
fig = px.imshow(
data_df,
labels=dict(x="時間", y="曜日", color="回数"),
color_continuous_scale=[[0, "rgb(229, 255, 229)"], [1, "rgb(0, 68, 27)"]],
text_auto=True,
aspect="auto",
)
fig.show()
テスト
def test_get_hour_day_of_week_heatmap_data():
df = pd.DataFrame({"datetime": ["2023-07-17 10:30:00", "2023-07-17 15:45:00", "2023-07-18 08:15:00"]})
df["datetime"] = pd.to_datetime(df["datetime"])
data = get_hour_day_of_week_heatmap_data(df, "datetime")
expected_df = pd.DataFrame(
[[0] * 24] * 7,
index=pd.Index(["月曜", "火曜", "水曜", "木曜", "金曜", "土曜", "日曜"], name="datetime"),
columns=pd.Index([f"{h}時" for h in range(24)], name="datetime"),
)
expected_df.loc["火曜", "8時"] = 1
expected_df.loc["月曜", "10時"] = 1
expected_df.loc["月曜", "15時"] = 1
print(f"{data=}")
print(f"{expected_df}")
assert_frame_equal(data, expected_df)
2. Timeseries
2.1. timeseries for streamlit line chart
def timeseries(x):
"""timeseriesのLineChartColumn用のデータを準備"""
min_date = x.min().date()
max_date = x.max().date()
date2cnt = pd.Series([dt.date() for dt in x]).value_counts().to_dict()
delta = max_date - min_date
fulldate2cnt = {
(min_date + timedelta(days=i)): date2cnt.get(min_date + timedelta(days=i), 0) for i in range(delta.days + 1)
}
return dict(sorted(fulldate2cnt.items())).values()
使用例
df = pd.DataFrame({'activity_datetime': [datetime(2024, 1, i%3+1) for i in range(10)], 'user_id': [f"uid_{i%5}" for i in range(10)]})
df
activity_datetime user_id
0 2024-01-01 uid_0
1 2024-01-02 uid_1
2 2024-01-03 uid_2
3 2024-01-01 uid_3
4 2024-01-02 uid_4
5 2024-01-03 uid_0
6 2024-01-01 uid_1
7 2024-01-02 uid_2
8 2024-01-03 uid_3
9 2024-01-01 uid_4
stat_df = df.groupby("user_id").agg(activity_history=("activity_datetime", timeseries))
activity_history
user_id
uid_0 (1, 0, 1)
uid_1 (1, 1)
uid_2 (1, 1)
uid_3 (1, 0, 1)
uid_4 (1, 1)
実際に使ったのは streamlit内で userごとの activityをlinechartにするもの
st.dataframe(
stat_df,
column_config={
"activity_history": st.column_config.LineChartColumn(y_min=0, y_max=30),
},
)
2.2. timeseries line chart
上と同様のdfを使った場合
df.pivot_table(index='activity_datetime', columns='user_id', aggfunc=len, fill_value=0)
user_id uid_0 uid_1 uid_2 uid_3 uid_4
activity_datetime
2024-01-01 1 1 0 1 1
2024-01-02 0 1 1 0 1
2024-01-03 1 0 1 1 0
使い方
fig = px.line(ts_df)
fig.show()
この例だとちょっと分かりづらいですが、timeseriesのline chartを書くことが出来る
3. Sessionの計算
時系列データからSessionを計算したいことがある
import pandas as pd
from pandas.api.types import is_datetime64_any_dtype
def calc_session(df, dt_col, interval=pd.Timedelta("30 minutes")):
"""session
Returns
session_id(numpy.ndarray)
Usage:
df['session_id'] = calc_session(df)
"""
if not is_datetime64_any_dtype(df[dt_col].dtype):
df[dt_col] = pd.to_datetime(df[dt_col])
return df.groupby(["user_id"]).apply(lambda row: (row[[dt_col]].diff() > interval).cumsum())[[dt_col]].values
使用例
df = pd.DataFrame(
{
"datetime": ["2023-07-17 10:30:00", "2023-07-17 10:31:00", "2023-07-18 08:15:00", "2023-07-18 08:15:00"],
"user_id": [
"uid1",
"uid1",
"uid1",
"uid2",
],
}
)
df["datetime"] = pd.to_datetime(df["datetime"])
df.sort_values(by=["user_id", "datetime"], inplace=True)
calc_session(df, "datetime")
array([[0], # session 0 of uid1
[0], # session 0 of uid1
[1], # session 1 of uid1
[0]]) # session 0 of uid2
テスト
def test_calc_session():
df = pd.DataFrame(
{
"datetime": ["2023-07-17 10:30:00", "2023-07-17 10:31:00", "2023-07-18 08:15:00", "2023-07-18 08:15:00"],
"user_id": [
"uid1",
"uid1",
"uid1",
"uid2",
],
}
)
df["datetime"] = pd.to_datetime(df["datetime"])
df.sort_values(by=["user_id", "datetime"], inplace=True)
expected_df = df.copy()
expected_df["session_id"] = [0, 0, 1, 0]
df["session_id"] = calc_session(df, "datetime")
assert_frame_equal(df, expected_df)
4. groupby
4.1. 簡単なaggregation
agg
を使って対象となる columnにほしい統計関数を指定
df.groupby('key').agg({'cnt': ['sum', 'mean']})
cnt
sum mean
key
data_source_type 12568 12568.000000
file_type 11656 3885.333333
repo 12029 4009.666667
source 769 769.000000
4.2. 自分でlambdaで定義する場合
groupbyしたものに複数のColumnを関連させたSeriesを返す
df.groupby('key').apply(lambda x: dict(zip(x['value'], x['cnt'])))
使用例
key value pairとそれに対するcntがあった場合に、keyでgroupしてからvalueを {value:cnt}
のdictionaryにして、 cntには、keyの合計カウントにして、keyの合計カウント順にソートしたい場合
data = {'key': ['data_source_type', 'file_type', 'file_type', 'repo', 'repo', 'repo', 'file_type', 'source'], 'value': ['github', '.md', '.go', 'nakamasato/golang-training', 'nakamasato/kubernetes-training', 'ariga/atlas', '.json', 'doc/website/package-lock.json'], 'cnt': [12568, 5444, 5441, 4937, 3830, 3262, 771, 769]}
df = pd.DataFrame(data)
>>> df
key value cnt
0 data_source_type github 12568
1 file_type .md 5444
2 file_type .go 5441
3 repo nakamasato/golang-training 4937
4 repo nakamasato/kubernetes-training 3830
5 repo ariga/atlas 3262
6 file_type .json 771
7 source doc/website/package-lock.json 769
df.groupby('key').apply(lambda x: pd.Series({'value': dict(zip(x['value'], x['cnt'])), 'cnt': x['cnt'].sum()})).sort_values('cnt', ascending=False)
value cnt
key
data_source_type {'github': 12568} 12568
repo {'nakamasato/golang-training': 4937, 'nakamasa... 12029
file_type {'.md': 5444, '.go': 5441, '.json': 771} 11656
source {'doc/website/package-lock.json': 769} 769
4.3. 自分で関数を定義する場合
lambdaには書ききれなくなってくる場合には、自分で関数を作ってapply(func)
とすることも可。上のLambdaと全く同じ例をcustom_funcを定義して書いてみると
def custom_func(x):
v = dict(zip(x['value'], x['cnt']))
c = sum(x['cnt'])
return pd.Series({'value': v, 'cnt': c})
df.groupby('key').apply(custom_func).sort_values('cnt', ascending=False)
value cnt
key
data_source_type {'github': 12568} 12568
repo {'nakamasato/golang-training': 4937, 'nakamasa... 12029
file_type {'.md': 5444, '.go': 5441, '.json': 771} 11656
source {'doc/website/package-lock.json': 769} 769