More than 5 years have passed since last update.

Pythonで基礎集計（２）

Last updated at 2019-06-02 / Posted at 2019-06-02

今回はplotです。

例でも取り上げているirisのような小さいデータセットであれば、plotはとても有用です。ただし、データが数百万件以上になると点が重なって潰れてしまい、描画にも時間がかかり過ぎるため、あまりオススメしません。

（１）のclassをちょっと変えて、さらにメソッドを追加。

FeatureAgg.py

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Select matplotlib's "ggplot" style sheet (module-level side effect that
# runs once, when this module is imported).
plt.style.use("ggplot")

class FeatureAgg(object):
    """Per-column summary statistics and diagnostic plots for a DataFrame.

    ``agg_type`` selects what ``agg_plot`` draws:

    * ``"classification"``        -- violin plot per feature.
    * ``"classification_2class"`` -- violin plot plus a scatter plot of the
      target's log odds over decile bins of the feature.
    * ``"regression"``            -- scatter plot of feature vs. target.

    Any other value makes ``agg_plot`` draw nothing.
    """

    # CSV header labels, in the same order as the statistics computed by
    # ``_aggregation``.  These are runtime output strings, not comments.
    _STAT_LABELS = ["0件数", "0割合", "欠損件数", "欠損割合", "件数", "標準偏差",
                    "最小値", "5%点", "平均", "中央値", "95%点", "最大値"]

    def __init__(self, dataframe, agg_type="classification"):
        """Store the DataFrame to analyse and the kind of task it represents."""
        self.df = dataframe
        # Initialised to None; not used by any method of this class.
        self.table_list = None
        self.p_value_list = None
        self.agg_type = agg_type

    def count0_num(self, data):
        """Return how many elements of ``data`` are equal to 0."""
        return data[data == 0].shape[0]

    def count0_rate(self, data):
        """Return the fraction of elements of ``data`` equal to 0.

        Returns NaN for empty input instead of raising ZeroDivisionError.
        """
        total = data.shape[0]
        if total == 0:
            return np.nan
        return self.count0_num(data) / total

    def na_num(self, data):
        """Return the number of missing (null) elements in ``data``."""
        return data.isnull().sum()

    def na_rate(self, data):
        """Return the fraction of missing (null) elements in ``data``.

        Returns NaN for empty input.
        """
        total = data.shape[0]
        if total == 0:
            return np.nan
        return self.na_num(data) / total

    def lower_5per(self, data):
        """Return the 5th percentile of ``data``."""
        return data.quantile(0.05)

    def upper_5per(self, data):
        """Return the 95th percentile of ``data``."""
        return data.quantile(0.95)

    def _aggregation(self, data, arg_list, col):
        """Group ``data`` by ``arg_list`` and compute the statistics of ``col``.

        The statistics are, in order: zero count, zero rate, missing count,
        missing rate, count, std, min, 5th percentile, mean, median,
        95th percentile, max.

        Errors raised by pandas (e.g. for a column that cannot be aggregated)
        propagate to the caller rather than being swallowed, so the caller
        sees the real cause instead of an unrelated ``UnboundLocalError``.
        """
        stats = [self.count0_num, self.count0_rate, self.na_num, self.na_rate,
                 "count", "std", "min", self.lower_5per, "mean", "median",
                 self.upper_5per, "max"]
        return data.groupby(arg_list, as_index=False).agg({col: stats})

    def agg_df_describe(self, filename, *args):
        """Write an extended ``describe`` of every column to ``filename``.csv.

        Args:
            filename: Output path without the ``.csv`` extension.  The file is
                written in Shift-JIS.
            *args: Names of the columns to group by.  These columns are not
                themselves summarised.

        Columns that cannot be aggregated, and writes that fail, are reported
        with ``print`` and skipped; the remaining columns are still processed.
        """
        df = self.df
        arg_list = list(args)
        out_path = filename + ".csv"
        # The header goes out with the first *successful* write, which is not
        # necessarily column 0 (column 0 may be a group key or may fail).
        header_written = False

        for i, col in enumerate(df.columns):
            if col in arg_list:
                continue

            try:
                df_agg = self._aggregation(df[arg_list + [col]], arg_list, col)
                df_agg.columns = arg_list + self._STAT_LABELS
                df_agg.insert(0, "項目", col)
            except Exception:
                # Deliberate best-effort: pandas raises several different
                # exception types for columns it cannot aggregate.
                print("{}は集計できません。".format(col))
                # Skip the write so stale rows from a previous column are
                # never appended again.
                continue

            try:
                df_agg.to_csv(out_path,
                              header=not header_written,
                              index=False,
                              encoding="shift-jis",
                              mode="a" if header_written else "w")
            except (OSError, UnicodeError):
                print("書き込みエラー発生 {} 番目のカラム ： {}".format(i, col))
                continue

            header_written = True

    def _log_odds(self, data):
        """Return the log odds ``log(p / (1 - p))`` where ``p = data.mean()``.

        A small epsilon is added to both the numerator and the denominator so
        that ``p == 0`` and ``p == 1`` give finite values.
        """
        eps = 1e-8
        p = data.mean()
        return np.log((p + eps) / (1 - p + eps))

    def agg_plot(self, target, height=8, width=8):
        """Save one diagnostic plot per feature column against ``target``.

        Plots are drawn per column rather than with a pair plot, which gets
        too slow and hard to read with many variables.  Not recommended for
        very large data sets.

        * ``classification`` / ``classification_2class``: violin plot saved
          under ``plot/violin/``.
        * ``classification_2class``: additionally, a scatter plot of the
          target's log odds per decile bin of the feature, with marker area
          proportional to the number of observations in the bin, saved under
          ``plot/scatter/``.
        * ``regression``: scatter plot of feature vs. target saved under
          ``plot/scatter/``.

        Args:
            target: Name of the target column.
            height: Figure height in inches.
            width: Figure width in inches.

        TODO: add stratified plots.
        """
        import os  # function-scope import: only needed to create output dirs

        agg_type = self.agg_type
        df = self.df
        is_classification = agg_type in ("classification", "classification_2class")
        # matplotlib expects figsize as (width, height).
        figsize = (width, height)

        # Make sure the output directories exist before savefig is called.
        if is_classification:
            os.makedirs("plot/violin", exist_ok=True)
        if agg_type in ("classification_2class", "regression"):
            os.makedirs("plot/scatter", exist_ok=True)

        for col in df.columns:
            if col == target:
                continue

            if is_classification:
                plt.figure(figsize=figsize)
                sns.violinplot(x=target, y=col, data=df)
                plt.title(target + " vs " + col)
                plt.savefig("plot/violin/" + target + "_vs_" + col + "_violin.png")
                plt.close()

            if agg_type == "classification_2class":
                # Bin the feature into (up to) deciles; only the bin index is
                # needed, so the DataFrame itself is neither copied nor modified.
                bin_idx, bins = pd.qcut(df[col], 10, labels=False,
                                        retbins=True, duplicates="drop")
                grouped = df[target].groupby(bin_idx)
                log_odds = grouped.agg(self._log_odds)
                # One size per plotted point: observations of target per bin.
                counts = grouped.count()

                plt.figure(figsize=figsize)
                plt.scatter(log_odds.index, log_odds.values, s=counts.values * 10)
                plt.title(target + " vs " + col)
                plt.xlabel(col + "\n bins = " + str(bins))
                plt.ylabel("log odds " + target)
                plt.savefig("plot/scatter/" + target + "_vs_" + col + "_logodds_scatter.png")
                plt.close()

            if agg_type == "regression":
                plt.figure(figsize=figsize)
                plt.scatter(df.loc[:, col], df.loc[:, target])
                plt.title(target + " vs " + col)
                plt.xlabel(col)
                plt.ylabel(target)
                plt.savefig("plot/scatter/" + target + "_vs_" + col + "_scatter.png")
                plt.close()

execute.py

# Smoke test on the iris dataset.
import os

import numpy as np
import pandas as pd
from sklearn.datasets import load_iris

# NOTE(review): FeatureAgg is not imported in this listing; it must already be
# in scope (e.g. `from FeatureAgg import FeatureAgg` when this is run as a
# separate file) -- confirm against the actual project layout.
os.chdir("python")  # presumably the author's working directory; adjust as needed
iris = load_iris()

df = pd.DataFrame(iris.data, columns=iris.feature_names)
# Merge class 2 into class 1 so the task becomes a two-class problem.
df["target"] = np.where(iris.target == 2, 1, iris.target)
featureagg = FeatureAgg(df, agg_type="classification_2class")
featureagg.agg_plot("target", height=10, width=10)  # name of the target column

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up