1
3

pandas.DataFrame便利処理メモ

Posted at

1. 週&時間のHeatmap

def get_hour_day_of_week_heatmap_data(
    df,
    dt_col="datetime",
    hour_names=[f"{hour}" for hour in range(24)],
    day_names=["月曜", "火曜", "水曜", "木曜", "金曜", "土曜", "日曜"],
):
    """dfと指定されたdt_colから曜日(7)x時間(24)のrecord数を計算しDataFrameを返す"""
    return (
        df.groupby(
            [  # count activity for each hour and day of week & fill 0 for no activity range
                df[dt_col].dt.day_of_week,
                df[dt_col].dt.hour,
            ]
        )
        .size()
        .unstack(fill_value=0)
        .rename(  # update index and column values from number to name
            index={d: day_names[d] for d in range(7)}, columns={h: hour_names[h] for h in range(24)}
        )
        .reindex(  # reindex by the full range 7 days x 24 hours
            index=day_names,
            columns=hour_names,
            fill_value=0,
        )
    )

使用例

import pandas as pd
import plotly.express as px

df = pd.DataFrame({"datetime": ["2023-07-17 10:30:00", "2023-07-17 15:45:00", "2023-07-18 08:15:00"]})
df["datetime"] = pd.to_datetime(df["datetime"])
data_df = get_hour_day_of_week_heatmap_data(df, "datetime")
fig = px.imshow(
    data_df,
    labels=dict(x="時間", y="曜日", color="回数"),
    color_continuous_scale=[[0, "rgb(229, 255, 229)"], [1, "rgb(0, 68, 27)"]],
    text_auto=True,
    aspect="auto",
)
fig.show()

Screen Shot 2023-08-06 at 22.03.38.png

テスト

def test_get_hour_day_of_week_heatmap_data():

    df = pd.DataFrame({"datetime": ["2023-07-17 10:30:00", "2023-07-17 15:45:00", "2023-07-18 08:15:00"]})
    df["datetime"] = pd.to_datetime(df["datetime"])

    data = get_hour_day_of_week_heatmap_data(df, "datetime")
    expected_df = pd.DataFrame(
        [[0] * 24] * 7,
        index=pd.Index(["月曜", "火曜", "水曜", "木曜", "金曜", "土曜", "日曜"], name="datetime"),
        columns=pd.Index([f"{h}" for h in range(24)], name="datetime"),
    )
    expected_df.loc["火曜", "8時"] = 1
    expected_df.loc["月曜", "10時"] = 1
    expected_df.loc["月曜", "15時"] = 1
    print(f"{data=}")
    print(f"{expected_df}")
    assert_frame_equal(data, expected_df)

2. Timeseries

2.1. timeseries for streamlit line chart

def timeseries(x):
    """timeseriesのLineChartColumn用のデータを準備"""
    min_date = x.min().date()
    max_date = x.max().date()
    date2cnt = pd.Series([dt.date() for dt in x]).value_counts().to_dict()
    delta = max_date - min_date
    fulldate2cnt = {
        (min_date + timedelta(days=i)): date2cnt.get(min_date + timedelta(days=i), 0) for i in range(delta.days + 1)
    }
    return dict(sorted(fulldate2cnt.items())).values()

使用例

df = pd.DataFrame({'activity_datetime': [datetime(2024, 1, i%3+1) for i in range(10)], 'user_id': [f"uid_{i%5}" for i in range(10)]})
df
  activity_datetime user_id
0        2024-01-01   uid_0
1        2024-01-02   uid_1
2        2024-01-03   uid_2
3        2024-01-01   uid_3
4        2024-01-02   uid_4
5        2024-01-03   uid_0
6        2024-01-01   uid_1
7        2024-01-02   uid_2
8        2024-01-03   uid_3
9        2024-01-01   uid_4
stat_df = df.groupby("user_id").agg(activity_history=("activity_datetime", timeseries))
        activity_history
user_id                 
uid_0          (1, 0, 1)
uid_1             (1, 1)
uid_2             (1, 1)
uid_3          (1, 0, 1)
uid_4             (1, 1)

実際に使ったのは streamlit内で userごとの activityをlinechartにするもの

st.dataframe(
    stat_df,
    column_config={
        "activity_history": st.column_config.LineChartColumn(y_min=0, y_max=30),
    },
)

Screen Shot 2024-01-18 at 9.57.19.png

2.2. timeseries line chart

上と同様のdfを使った場合

df.pivot_table(index='activity_datetime', columns='user_id', aggfunc=len, fill_value=0)
user_id            uid_0  uid_1  uid_2  uid_3  uid_4
activity_datetime                                   
2024-01-01             1      1      0      1      1
2024-01-02             0      1      1      0      1
2024-01-03             1      0      1      1      0

使い方

plotly line chartを使うと

fig = px.line(ts_df)
fig.show()

newplot.png

この例だとちょっと分かりづらいですが、timeseriesのline chartを書くことが出来る

3. Sessionの計算

時系列データからSessionを計算したいことがある

import pandas as pd
from pandas.api.types import is_datetime64_any_dtype

def calc_session(df, dt_col, interval=pd.Timedelta("30 minutes")):
    """session

    Returns
        session_id(numpy.ndarray)

    Usage:
        df['session_id'] = calc_session(df)
    """
    if not is_datetime64_any_dtype(df[dt_col].dtype):
        df[dt_col] = pd.to_datetime(df[dt_col])
    return df.groupby(["user_id"]).apply(lambda row: (row[[dt_col]].diff() > interval).cumsum())[[dt_col]].values

使用例

df = pd.DataFrame(
    {
        "datetime": ["2023-07-17 10:30:00", "2023-07-17 10:31:00", "2023-07-18 08:15:00", "2023-07-18 08:15:00"],
        "user_id": [
            "uid1",
            "uid1",
            "uid1",
            "uid2",
        ],
    }
)
df["datetime"] = pd.to_datetime(df["datetime"])
df.sort_values(by=["user_id", "datetime"], inplace=True)
calc_session(df, "datetime")
array([[0], # session 0 of uid1
       [0], # session 0 of uid1
       [1], # session 1 of uid1
       [0]]) # session 0 of uid2

テスト

def test_calc_session():

    df = pd.DataFrame(
        {
            "datetime": ["2023-07-17 10:30:00", "2023-07-17 10:31:00", "2023-07-18 08:15:00", "2023-07-18 08:15:00"],
            "user_id": [
                "uid1",
                "uid1",
                "uid1",
                "uid2",
            ],
        }
    )
    df["datetime"] = pd.to_datetime(df["datetime"])
    df.sort_values(by=["user_id", "datetime"], inplace=True)
    expected_df = df.copy()
    expected_df["session_id"] = [0, 0, 1, 0]

    df["session_id"] = calc_session(df, "datetime")

    assert_frame_equal(df, expected_df)

4. groupby

4.1. 簡単なaggregation

aggを使って対象となる columnにほしい統計関数を指定

df.groupby('key').agg({'cnt': ['sum', 'mean']})
                    cnt              
                    sum          mean
key                                  
data_source_type  12568  12568.000000
file_type         11656   3885.333333
repo              12029   4009.666667
source              769    769.000000

4.2. 自分でlambdaで定義する場合

groupbyしたものに複数のColumnを関連させたSeriesを返す

df.groupby('key').apply(lambda x: dict(zip(x['value'], x['cnt'])))

使用例

key value pairとそれに対するcntがあった場合に、keyでgroupしてからvalueを {value:cnt}のdictionaryにして、 cntには、keyの合計カウントにして、keyの合計カウント順にソートしたい場合

data = {'key': ['data_source_type', 'file_type', 'file_type', 'repo', 'repo', 'repo', 'file_type', 'source'], 'value': ['github', '.md', '.go', 'nakamasato/golang-training', 'nakamasato/kubernetes-training', 'ariga/atlas', '.json', 'doc/website/package-lock.json'], 'cnt': [12568, 5444, 5441, 4937, 3830, 3262, 771, 769]}
df = pd.DataFrame(data)
>>> df
                key                           value    cnt
0  data_source_type                          github  12568
1         file_type                             .md   5444
2         file_type                             .go   5441
3              repo      nakamasato/golang-training   4937
4              repo  nakamasato/kubernetes-training   3830
5              repo                     ariga/atlas   3262
6         file_type                           .json    771
7            source   doc/website/package-lock.json    769
df.groupby('key').apply(lambda x: pd.Series({'value': dict(zip(x['value'], x['cnt'])), 'cnt': x['cnt'].sum()})).sort_values('cnt', ascending=False)
                                                              value    cnt
key                                                                       
data_source_type                                  {'github': 12568}  12568
repo              {'nakamasato/golang-training': 4937, 'nakamasa...  12029
file_type                  {'.md': 5444, '.go': 5441, '.json': 771}  11656
source                       {'doc/website/package-lock.json': 769}    769

4.3. 自分で関数を定義する場合

lambdaには書ききれなくなってくる場合には、自分で関数を作ってapply(func)とすることも可。上のLambdaと全く同じ例をcustom_funcを定義して書いてみると

def custom_func(x):
    v = dict(zip(x['value'], x['cnt']))
    c = sum(x['cnt'])
    return pd.Series({'value': v, 'cnt': c})
df.groupby('key').apply(custom_func).sort_values('cnt', ascending=False)
                                                              value    cnt
key                                                                       
data_source_type                                  {'github': 12568}  12568
repo              {'nakamasato/golang-training': 4937, 'nakamasa...  12029
file_type                  {'.md': 5444, '.go': 5441, '.json': 771}  11656
source                       {'doc/website/package-lock.json': 769}    769
1
3
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
1
3