More than 1 year has passed since last update.

Pandas カスタム条件の関数でDataframeを比較する

Last updated at 2022-08-26 / Posted at 2022-08-26

Dataframe同士を比較する際は、

df1.eq(df2) pandas.DataFrame.eq
df1.compare(df2) pandas.DataFrame.compare

などが用いられるが、比較の条件を緩くしたり、厳しくしたい場面がある。

今回、NaNとNaNは等しくないと判断され差分がでる仕様が好ましくなかったため、
以下の、自作の比較用関数で対応したので、共有する。

テストデータを用意する

from IPython.display import display
import pandas as pd
import numpy as np

# Sample frames for the comparison examples.
# Relative to df2, df1 differs as follows:
#   col1: row 3 is NaN in df1 but 4.0 in df2
#   col2: identical, including a NaN in row 3 of both frames
#   col3: rows 2, 3 and 4 hold different values (3.01/3.0, 4.2/4.0, 6.0/5.0)
df1 = pd.DataFrame(
    dict(
        col1=[1.0, 2.0, 3.0, np.nan, 5.0],
        col2=[1.0, 2.0, 3.0, np.nan, 5.0],
        col3=[1.0, 2.0, 3.01, 4.2, 6.0],
    )
)

df2 = pd.DataFrame(
    dict(
        col1=[1.0, 2.0, 3.0, 4.0, 5.0],
        col2=[1.0, 2.0, 3.0, np.nan, 5.0],
        col3=[1.0, 2.0, 3.0, 4.0, 5.0],
    )
)

# Render both frames so they can be inspected side by side.
display(df1)
display(df2)

任意の条件を関数で指定して比較

def custom_compare(df_before, df_after, eq_cond_func):
    if df_before.shape != df_after.shape:
        raise Exception("サイズの異なるDataframeは比較できません！")
        
    # 高速に同サイズの空の配列を作成
    # https://stackoverflow.com/questions/23195250/create-empty-dataframe-with-same-dimensions-as-another
    df_result = pd.DataFrame(np.full(df_before.shape, False, dtype=bool))
    df_result.index = df1.index
    df_result.columns = df1.columns
    
    for index in df_result.index:
        for column in df_result.columns:
            # pandas.DataFrame.at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.at.html
            # df.at[index, column] でデータフレームの指定した列・行の値を取得でき、さらに代入すれば書き換えができる。
            # df_before、df_afterの値をcond_funcで評価した結果をdf_resultに
            df_result.at[index, column] = eq_cond_func(df_before.at[index, column], df_after.at[index, column])
    
    return df_result

def eq_cond(a, b):
    """Return True when ``a`` and ``b`` should be treated as equal.

    Two values are equal when ``a == b`` holds, or when both are missing
    values as recognised by ``pd.isna`` (NaN, None, NaT, ...). This makes a
    NaN/NaN pair count as equal, unlike plain ``==``.

    Args:
        a: Cell value from the first DataFrame.
        b: Cell value from the second DataFrame.

    Returns:
        True if the values are considered equal, otherwise False.
    """
    if a == b:
        return True
    # pd.isna accepts any scalar, whereas np.isnan raises TypeError for
    # non-numeric values such as strings or None.
    return bool(pd.isna(a) and pd.isna(b))

# Run the cell-by-cell comparison on the two sample frames with eq_cond.
# The returned DataFrame is not assigned (presumably shown as notebook
# cell output -- confirm against how this snippet is executed).
custom_compare(df1, df2, eq_cond)

Typingを用いて型付けする場合

from collections.abc import Callable
from typing import Any

def custom_compare(df_before: pd.DataFrame, df_after: pd.DataFrame, cond_func: Callable[[Any, Any], bool]) -> pd.DataFrame:
    if df_before.shape != df_after.shape:
        raise Exception("サイズの異なるDataframeは比較できません！")
        
    # 高速に同サイズの空の配列を作成
    # https://stackoverflow.com/questions/23195250/create-empty-dataframe-with-same-dimensions-as-another
    df_result = pd.DataFrame(np.full(df_before.shape, False, dtype=bool))
    df_result.index = df1.index
    df_result.columns = df1.columns
    
    print(df_result.index)
    print(df_result.columns)

    for index in df_result.index:
        for column in df_result.columns:
            # pandas.DataFrame.at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.at.html
            # df.at[index, column] でデータフレームの指定した列・行の値を取得でき、さらに代入すれば書き換えができる。
            # df_before、df_afterの値をcond_funcで評価した結果をdf_result
            df_result.at[index, column] = cond_func(df_before.at[index, column], df_after.at[index, column])
    
    return df_result

def eq_cond(a: Any, b: Any) -> bool:
    """Return True when ``a`` and ``b`` should be treated as equal.

    Two values are equal when ``a == b`` holds, or when both are missing
    values as recognised by ``pd.isna`` (NaN, None, NaT, ...). This makes a
    NaN/NaN pair count as equal, unlike plain ``==``.

    Args:
        a: Cell value from the first DataFrame.
        b: Cell value from the second DataFrame.

    Returns:
        True if the values are considered equal, otherwise False.
    """
    if a == b:
        return True
    # pd.isna accepts any scalar, whereas np.isnan raises TypeError for
    # non-numeric values such as strings or None.
    return bool(pd.isna(a) and pd.isna(b))

# Run the typed cell-by-cell comparison on the two sample frames with eq_cond.
# The returned DataFrame is not assigned (presumably shown as notebook
# cell output -- confirm against how this snippet is executed).
custom_compare(df1, df2, eq_cond)

比較条件をカスタムする

NaN同士は等しいことにする

def eq_cond(a, b):
    """Return True when ``a`` and ``b`` should be treated as equal.

    Two values are equal when ``a == b`` holds, or when both are missing
    values as recognised by ``pd.isna`` (NaN, None, NaT, ...). This makes a
    NaN/NaN pair count as equal, unlike plain ``==``.

    Args:
        a: Cell value from the first DataFrame.
        b: Cell value from the second DataFrame.

    Returns:
        True if the values are considered equal, otherwise False.
    """
    if a == b:
        return True
    # pd.isna accepts any scalar, whereas np.isnan raises TypeError for
    # non-numeric values such as strings or None.
    return bool(pd.isna(a) and pd.isna(b))

数値の場合は、差が１以下なら等しいことにする

def eq_cond(a, b, *, tolerance=1):
    """Return True when ``a`` and ``b`` are equal or numerically close.

    Values are equal when ``a == b`` holds. In addition, two numeric values
    are treated as equal when their absolute difference is at most
    ``tolerance``. A NaN pair is not considered equal by this variant,
    because every comparison involving NaN is False.

    Args:
        a: Cell value from the first DataFrame.
        b: Cell value from the second DataFrame.
        tolerance: Largest absolute difference (inclusive) at which two
            numbers still count as equal. Defaults to 1.

    Returns:
        True if the values are considered equal, otherwise False.
    """
    if a == b:
        return True
    # Cells read from a DataFrame are usually NumPy scalars; np.int64 is not a
    # subclass of int, so the NumPy integer/floating types are listed as well.
    numeric_types = (int, float, np.integer, np.floating)
    if isinstance(a, numeric_types) and isinstance(b, numeric_types):
        # Inclusive bound: a difference of exactly `tolerance` counts as equal.
        return abs(float(a) - float(b)) <= tolerance
    return False

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up