1
0

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?

More than 5 years have passed since last update.

Pythonで基礎集計(2)

Last updated at Posted at 2019-06-02

今回はplotです。

例でも取り上げているirisみたいな小さいデータセットであればplotはとても有用。ただ、数百万とかそれ以上のデータ数になると潰れちゃうし、時間がかかり過ぎるのであまりオススメしない。

(1)のclassをちょっと変えて、さらにメソッドを追加。

FeatureAgg.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Apply matplotlib's "ggplot" style sheet to every figure created after this
# line (module-level side effect that runs on import).
plt.style.use("ggplot")

class FeatureAgg(object):
    """Per-feature summary statistics and exploratory plots for a DataFrame.

    Attributes:
        df: The pandas DataFrame being analysed.
        table_list: Initialised to None; not used by the methods of this class.
        p_value_list: Initialised to None; not used by the methods of this class.
        agg_type: One of "classification", "classification_2class" or
            "regression"; selects which plots ``agg_plot`` draws.
    """

    def __init__(self, dataframe, agg_type="classification"):
        self.df = dataframe
        self.table_list = None
        self.p_value_list = None
        self.agg_type = agg_type

    def count0_num(self, data):
        """Return the number of entries of *data* that equal 0."""
        return data[data == 0].shape[0]

    def count0_rate(self, data):
        """Return the fraction of entries of *data* that equal 0.

        Returns NaN for empty input instead of raising ZeroDivisionError.
        """
        total = data.shape[0]
        if total == 0:
            return float("nan")
        return data[data == 0].shape[0] / total

    def na_num(self, data):
        """Return the number of missing (null) entries in *data*."""
        return data.isnull().sum()

    def na_rate(self, data):
        """Return the fraction of missing (null) entries in *data*.

        Returns NaN for empty input.
        """
        total = data.shape[0]
        if total == 0:
            return float("nan")
        return data.isnull().sum() / total

    def lower_5per(self, data):
        """Return the 5% quantile of *data*."""
        return data.quantile(0.05)

    def upper_5per(self, data):
        """Return the 95% quantile of *data*."""
        return data.quantile(0.95)

    def _aggregation(self, data, arg_list, col):
        """Group *data* by *arg_list* and compute the summary statistics of *col*.

        Args:
            data: DataFrame containing the grouping columns and *col*.
            arg_list: List of column names to group by.
            col: Name of the column to summarise.

        Returns:
            A DataFrame with one row per group: the grouping columns followed
            by the twelve statistics, in the order listed below.

        Raises:
            Whatever pandas raises when the column cannot be aggregated (for
            example a non-numeric column). Errors are deliberately propagated
            so that the caller can report which column failed.
        """
        stats = [
            self.count0_num, self.count0_rate, self.na_num, self.na_rate,
            "count", "std", "min", self.lower_5per, "mean", "median",
            self.upper_5per, "max",
        ]
        return data.groupby(arg_list, as_index=False).agg({col: stats})

    def agg_df_describe(self, filename, *args):
        """Write an extended, grouped ``describe`` of every column to a CSV file.

        Every column that is not one of the grouping columns is aggregated per
        group and the rows are written to ``filename + ".csv"`` (Shift-JIS
        encoded). The header is emitted with the first block that is written
        successfully; later blocks are appended without a header.

        Columns that cannot be aggregated, and blocks that cannot be written,
        are reported with ``print`` and skipped.

        Args:
            filename: Output path without the ".csv" extension.
            *args: Names of the columns to group by.
        """
        df = self.df
        arg_list = list(args)
        stat_labels = ["0件数", "0割合", "欠損件数", "欠損割合", "件数", "標準偏差",
                       "最小値", "5%点", "平均", "中央値", "95%点", "最大値"]
        out_path = filename + ".csv"
        header_written = False

        for i, col in enumerate(df.columns.values):
            if col in arg_list:
                # Grouping keys are not summarised themselves.
                continue

            try:
                df_agg = self._aggregation(df[arg_list + [col]], arg_list, col)
                df_agg.columns = arg_list + stat_labels
                # Leading column holding the name of the summarised feature.
                df_agg.insert(0, "項目", col, allow_duplicates=True)
            except Exception:
                print("{}は集計できません。".format(col))
                # Skip the write: there is no fresh result for this column.
                continue

            try:
                df_agg.to_csv(
                    out_path,
                    header=not header_written,
                    index=False,
                    encoding="shift-jis",
                    mode="a" if header_written else "w",
                )
            except (OSError, UnicodeError):
                print("書き込みエラー発生 {} 番目のカラム : {}".format(i, col))
                continue
            header_written = True

    def _log_odds(self, data):
        """Return the log-odds ``log(p / (1 - p))`` of the mean of *data*.

        *data* is expected to hold 0/1 labels, so its mean is the positive
        rate p. A small epsilon is added to numerator and denominator so that
        p == 0 or p == 1 does not produce log(0) or a division by zero.
        """
        eps = 1e-8
        p = data.mean()
        return np.log((p + eps) / (1 - p + eps))

    def agg_plot(self, target, height=8, width=8):
        """Save one plot per explanatory column against *target*.

        A pairplot becomes too heavy and unreadable with many variables, so
        each column is plotted individually. Plotting large data sets is slow
        and not recommended.

        1. "classification": violin plot of the column per target class.
        2. "classification_2class": the violin plot, plus a scatter plot of
           the target's log-odds per decile bin of the column (marker size is
           proportional to the number of rows in the bin).
        3. "regression": scatter plot of the column against the target.

        Figures are saved under "plot/violin/" and "plot/scatter/" relative to
        the current working directory; the directories are created if missing.

        Args:
            target: Name of the target column.
            height: Figure height in inches.
            width: Figure width in inches.
        """
        import os

        agg_type = self.agg_type
        df = self.df
        # matplotlib expects figsize as (width, height).
        figsize = (width, height)

        for out_dir in ("plot/violin", "plot/scatter"):
            os.makedirs(out_dir, exist_ok=True)

        for col in df.columns.values:
            if col == target:
                continue
            title = "{} vs {}".format(target, col)

            if agg_type in ("classification", "classification_2class"):
                plt.figure(figsize=figsize)
                sns.violinplot(x=target, y=col, data=df)
                plt.title(title)
                plt.savefig("plot/violin/{}_vs_{}_violin.png".format(target, col))
                plt.close()

            if agg_type == "classification_2class":
                # Bin the feature into (up to) 10 quantile bins; bins with
                # duplicate edges are dropped.
                binned, bins = pd.qcut(df.loc[:, col], 10, labels=False,
                                       retbins=True, duplicates="drop")
                grouped = df.loc[:, target].groupby(np.asarray(binned))
                log_odds = grouped.agg(self._log_odds)
                # One marker size per bin: number of rows falling in the bin.
                bin_sizes = grouped.size()
                plt.figure(figsize=figsize)
                plt.scatter(log_odds.index.values, log_odds.values,
                            s=bin_sizes.values * 10)
                plt.title(title)
                plt.xlabel("{}\n bins = {}".format(col, bins))
                plt.ylabel("log odds {}".format(target))
                plt.savefig(
                    "plot/scatter/{}_vs_{}_logodds_scatter.png".format(target, col))
                plt.close()

            if agg_type == "regression":
                plt.figure(figsize=figsize)
                plt.scatter(df.loc[:, col], df.loc[:, target])
                plt.title(title)
                plt.xlabel(col)
                plt.ylabel(target)
                plt.savefig("plot/scatter/{}_vs_{}_scatter.png".format(target, col))
                plt.close()

execute.py
# Smoke test of FeatureAgg.agg_plot on the iris dataset.
# NOTE(review): this script uses `pd`, `np` and `FeatureAgg` without importing
# them here — presumably they are already in scope (same session/module);
# confirm before running it as a standalone file.
import os
from sklearn.datasets import load_iris
# Switch the working directory to "python"; the plots are saved relative to it.
os.chdir("python")
iris = load_iris()

df = pd.DataFrame(iris.data, columns = iris.feature_names)
df["target"] = iris.target
df.target = np.where(df.target == 2, 1, df.target) # merge class 2 into class 1 to get a 2-class problem
featureagg = FeatureAgg(df, agg_type="classification_2class")
featureagg.agg_plot("target", height=10, width=10) # the first argument names the target column

target_vs_petal length (cm)_logodds_scatter.png
target_vs_petal width (cm)_logodds_scatter.png
target_vs_sepal length (cm)_logodds_scatter.png
target_vs_sepal width (cm)_logodds_scatter.png

target_vs_petal length (cm)_violin.png
target_vs_petal width (cm)_violin.png
target_vs_sepal length (cm)_violin.png
target_vs_sepal width (cm)_violin.png

1
0
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
1
0

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?