pandas.DataFrame便利処理メモ

Posted at 2024-01-20

1. 週&時間のHeatmap

def get_hour_day_of_week_heatmap_data(
    df,
    dt_col="datetime",
    hour_names=[f"{hour}時" for hour in range(24)],
    day_names=["月曜", "火曜", "水曜", "木曜", "金曜", "土曜", "日曜"],
):
    """dfと指定されたdt_colから曜日(7)x時間(24)のrecord数を計算しDataFrameを返す"""
    return (
        df.groupby(
            [  # count activity for each hour and day of week & fill 0 for no activity range
                df[dt_col].dt.day_of_week,
                df[dt_col].dt.hour,
            ]
        )
        .size()
        .unstack(fill_value=0)
        .rename(  # update index and column values from number to name
            index={d: day_names[d] for d in range(7)}, columns={h: hour_names[h] for h in range(24)}
        )
        .reindex(  # reindex by the full range 7 days x 24 hours
            index=day_names,
            columns=hour_names,
            fill_value=0,
        )
    )

使用例

import pandas as pd
import plotly.express as px

df = pd.DataFrame({"datetime": ["2023-07-17 10:30:00", "2023-07-17 15:45:00", "2023-07-18 08:15:00"]})
df["datetime"] = pd.to_datetime(df["datetime"])
data_df = get_hour_day_of_week_heatmap_data(df, "datetime")
fig = px.imshow(
    data_df,
    labels=dict(x="時間", y="曜日", color="回数"),
    color_continuous_scale=[[0, "rgb(229, 255, 229)"], [1, "rgb(0, 68, 27)"]],
    text_auto=True,
    aspect="auto",
)
fig.show()

テスト

def test_get_hour_day_of_week_heatmap_data():

    df = pd.DataFrame({"datetime": ["2023-07-17 10:30:00", "2023-07-17 15:45:00", "2023-07-18 08:15:00"]})
    df["datetime"] = pd.to_datetime(df["datetime"])

    data = get_hour_day_of_week_heatmap_data(df, "datetime")
    expected_df = pd.DataFrame(
        [[0] * 24] * 7,
        index=pd.Index(["月曜", "火曜", "水曜", "木曜", "金曜", "土曜", "日曜"], name="datetime"),
        columns=pd.Index([f"{h}時" for h in range(24)], name="datetime"),
    )
    expected_df.loc["火曜", "8時"] = 1
    expected_df.loc["月曜", "10時"] = 1
    expected_df.loc["月曜", "15時"] = 1
    print(f"{data=}")
    print(f"{expected_df}")
    assert_frame_equal(data, expected_df)

2. Timeseries

2.1. timeseries for streamlit line chart

def timeseries(x):
    """timeseriesのLineChartColumn用のデータを準備"""
    min_date = x.min().date()
    max_date = x.max().date()
    date2cnt = pd.Series([dt.date() for dt in x]).value_counts().to_dict()
    delta = max_date - min_date
    fulldate2cnt = {
        (min_date + timedelta(days=i)): date2cnt.get(min_date + timedelta(days=i), 0) for i in range(delta.days + 1)
    }
    return dict(sorted(fulldate2cnt.items())).values()

使用例

df = pd.DataFrame({'activity_datetime': [datetime(2024, 1, i%3+1) for i in range(10)], 'user_id': [f"uid_{i%5}" for i in range(10)]})
df
  activity_datetime user_id
0        2024-01-01   uid_0
1        2024-01-02   uid_1
2        2024-01-03   uid_2
3        2024-01-01   uid_3
4        2024-01-02   uid_4
5        2024-01-03   uid_0
6        2024-01-01   uid_1
7        2024-01-02   uid_2
8        2024-01-03   uid_3
9        2024-01-01   uid_4

stat_df = df.groupby("user_id").agg(activity_history=("activity_datetime", timeseries))

        activity_history
user_id                 
uid_0          (1, 0, 1)
uid_1             (1, 1)
uid_2             (1, 1)
uid_3          (1, 0, 1)
uid_4             (1, 1)

実際に使ったのは streamlit内で userごとの activityをlinechartにするもの

st.dataframe(
    stat_df,
    column_config={
        "activity_history": st.column_config.LineChartColumn(y_min=0, y_max=30),
    },
)

2.2. timeseries line chart

上と同様のdfを使った場合

df.pivot_table(index='activity_datetime', columns='user_id', aggfunc=len, fill_value=0)

user_id            uid_0  uid_1  uid_2  uid_3  uid_4
activity_datetime                                   
2024-01-01             1      1      0      1      1
2024-01-02             0      1      1      0      1
2024-01-03             1      0      1      1      0

使い方

plotly line chartを使うと

fig = px.line(ts_df)
fig.show()

この例だとちょっと分かりづらいですが、timeseriesのline chartを書くことが出来る

3. Sessionの計算

時系列データからSessionを計算したいことがある

import pandas as pd
from pandas.api.types import is_datetime64_any_dtype

def calc_session(df, dt_col, interval=pd.Timedelta("30 minutes")):
    """session

    Returns
        session_id(numpy.ndarray)

    Usage:
        df['session_id'] = calc_session(df)
    """
    if not is_datetime64_any_dtype(df[dt_col].dtype):
        df[dt_col] = pd.to_datetime(df[dt_col])
    return df.groupby(["user_id"]).apply(lambda row: (row[[dt_col]].diff() > interval).cumsum())[[dt_col]].values

使用例

df = pd.DataFrame(
    {
        "datetime": ["2023-07-17 10:30:00", "2023-07-17 10:31:00", "2023-07-18 08:15:00", "2023-07-18 08:15:00"],
        "user_id": [
            "uid1",
            "uid1",
            "uid1",
            "uid2",
        ],
    }
)
df["datetime"] = pd.to_datetime(df["datetime"])
df.sort_values(by=["user_id", "datetime"], inplace=True)
calc_session(df, "datetime")
array([[0], # session 0 of uid1
       [0], # session 0 of uid1
       [1], # session 1 of uid1
       [0]]) # session 0 of uid2

テスト

def test_calc_session():

    df = pd.DataFrame(
        {
            "datetime": ["2023-07-17 10:30:00", "2023-07-17 10:31:00", "2023-07-18 08:15:00", "2023-07-18 08:15:00"],
            "user_id": [
                "uid1",
                "uid1",
                "uid1",
                "uid2",
            ],
        }
    )
    df["datetime"] = pd.to_datetime(df["datetime"])
    df.sort_values(by=["user_id", "datetime"], inplace=True)
    expected_df = df.copy()
    expected_df["session_id"] = [0, 0, 1, 0]

    df["session_id"] = calc_session(df, "datetime")

    assert_frame_equal(df, expected_df)

4. groupby

4.1. 簡単なaggregation

aggを使って対象となる columnにほしい統計関数を指定

df.groupby('key').agg({'cnt': ['sum', 'mean']})
                    cnt              
                    sum          mean
key                                  
data_source_type  12568  12568.000000
file_type         11656   3885.333333
repo              12029   4009.666667
source              769    769.000000

4.2. 自分でlambdaで定義する場合

groupbyしたものに複数のColumnを関連させたSeriesを返す

df.groupby('key').apply(lambda x: dict(zip(x['value'], x['cnt'])))

使用例

key value pairとそれに対するcntがあった場合に、keyでgroupしてからvalueを {value:cnt}のdictionaryにして、 cntには、keyの合計カウントにして、keyの合計カウント順にソートしたい場合

data = {'key': ['data_source_type', 'file_type', 'file_type', 'repo', 'repo', 'repo', 'file_type', 'source'], 'value': ['github', '.md', '.go', 'nakamasato/golang-training', 'nakamasato/kubernetes-training', 'ariga/atlas', '.json', 'doc/website/package-lock.json'], 'cnt': [12568, 5444, 5441, 4937, 3830, 3262, 771, 769]}
df = pd.DataFrame(data)
>>> df
                key                           value    cnt
0  data_source_type                          github  12568
1         file_type                             .md   5444
2         file_type                             .go   5441
3              repo      nakamasato/golang-training   4937
4              repo  nakamasato/kubernetes-training   3830
5              repo                     ariga/atlas   3262
6         file_type                           .json    771
7            source   doc/website/package-lock.json    769

df.groupby('key').apply(lambda x: pd.Series({'value': dict(zip(x['value'], x['cnt'])), 'cnt': x['cnt'].sum()})).sort_values('cnt', ascending=False)
                                                              value    cnt
key                                                                       
data_source_type                                  {'github': 12568}  12568
repo              {'nakamasato/golang-training': 4937, 'nakamasa...  12029
file_type                  {'.md': 5444, '.go': 5441, '.json': 771}  11656
source                       {'doc/website/package-lock.json': 769}    769

4.3. 自分で関数を定義する場合

lambdaには書ききれなくなってくる場合には、自分で関数を作ってapply(func)とすることも可。上のLambdaと全く同じ例をcustom_funcを定義して書いてみると

def custom_func(x):
    v = dict(zip(x['value'], x['cnt']))
    c = sum(x['cnt'])
    return pd.Series({'value': v, 'cnt': c})

df.groupby('key').apply(custom_func).sort_values('cnt', ascending=False)
                                                              value    cnt
key                                                                       
data_source_type                                  {'github': 12568}  12568
repo              {'nakamasato/golang-training': 4937, 'nakamasa...  12029
file_type                  {'.md': 5444, '.go': 5441, '.json': 771}  11656
source                       {'doc/website/package-lock.json': 769}    769

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up