EDA

Python

Last updated at 2025-04-07 / Posted at 2025-04-04

01_基本情報の表示と欠損値処理.py

import streamlit as st
import pandas as pd

def _format_memory(num_bytes):
    """Return *num_bytes* as a human-readable string in bytes, KB or MB."""
    if num_bytes < 1024:
        return f"{num_bytes} bytes"
    if num_bytes < 1024**2:
        return f"{num_bytes/1024:.2f} KB"
    return f"{num_bytes/1024**2:.2f} MB"


def _first_mode(series):
    """Return the first mode of *series*, or NaN when it has no non-null value."""
    modes = series.mode()
    return modes.iloc[0] if not modes.empty else float("nan")


def main():
    """Render the "basic information and missing values" page.

    Reads ``./data/train.csv`` and shows, in order: row/column counts and
    memory usage, the categorical and numeric column lists, the first rows,
    ``describe()`` statistics, and a per-column missing-value table.
    """
    # Load the data
    df = pd.read_csv("./data/train.csv")

    # Page settings
    st.set_page_config(layout="wide")
    st.title("基本情報の表示と欠損値処理")

    # Basic information about the data
    with st.expander("データの基本情報", expanded=True):
        # Size
        st.metric("行数", f"{df.shape[0]:,}")
        st.metric("列数", df.shape[1])

        # Memory usage. The metric is emitted for every unit; previously it
        # was nested in the MB branch and silently skipped for small frames.
        memory_usage = df.memory_usage(deep=True).sum()
        st.metric("メモリ使用量", _format_memory(memory_usage))

    # Categorical and numeric columns side by side
    with st.expander("カテゴリーデータと数値データのカラムを表示", expanded=True):
        # Split the columns by dtype
        category_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
        numeric_cols = df.select_dtypes(include=['number']).columns.tolist()

        def create_column_info_df(cols):
            """Build a table of index, name, dtype and first non-null value."""
            examples = []
            for col in cols:
                non_null = df[col].dropna()  # computed once per column
                examples.append(non_null.iloc[0] if not non_null.empty else 'NaN')
            return pd.DataFrame({
                'idx': range(1, len(cols)+1),
                '列名': cols,
                'データ型': [df[col].dtype for col in cols],
                'データ例': examples,
            })

        col1, col2 = st.columns(2)

        with col1:
            st.subheader("カテゴリーデータのカラム")
            st.dataframe(create_column_info_df(category_cols))

        with col2:
            st.subheader("数値データのカラム")
            st.dataframe(create_column_info_df(numeric_cols))

    # Peek at the first rows
    with st.expander("データの一部を見てみる", expanded=True):
        st.dataframe(df.head())

    # Summary statistics of the numeric columns
    with st.expander("数値データの統計量", expanded=True):
        st.write(df.describe().transpose())

    # Per-column missing-value summary
    null_counts = df.isnull().sum()
    missing_info = pd.DataFrame({
        "カラム名": df.columns,
        "データ型": [df[col].dtype for col in df.columns],
        "欠損値の数": null_counts.values,
        "欠損値の割合": (null_counts / len(df) * 100).values,
        "ユニーク値の数": df.nunique().values,
        # Computed per column so an empty frame does not raise IndexError.
        "最頻値": [_first_mode(df[col]) for col in df.columns],
        "最頻値の数": [df[col].value_counts().max() if df[col].count() > 0 else 0 for col in df.columns]
    })

    # Missing-value table
    with st.expander("欠損値情報テーブル", expanded=True):
        # Height of one row in pixels; one extra row accounts for the header.
        row_height_px = 35
        num_rows_display = len(missing_info) + 1
        # Small extra margin for readability.
        margin = 10
        total_height = num_rows_display * row_height_px + margin
        # Upper bound so the table cannot grow without limit.
        max_height = 5000
        display_height = min(total_height, max_height)

        # The format key must equal the column name, otherwise Styler ignores
        # it and the percentage is shown unformatted.
        st.dataframe(
            missing_info.style.format({"欠損値の割合": "{:.2f}%"}),  # two decimals
            height=display_height,
            on_select="rerun",
            selection_mode=["multi-row", "multi-column"],
        )

# Call main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

02_変数ごとの詳細な分析.py

import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

def _show_figure(fig):
    """Render a Matplotlib figure in Streamlit, then close it.

    pyplot keeps every figure it creates alive until it is closed, so without
    the explicit close the figures accumulate each time the script is rerun.
    """
    st.pyplot(fig)
    plt.close(fig)


def main():
    """Render the per-variable detail page.

    Reads ``./data/train.csv``, lets the user pick numeric and categorical
    columns, and shows statistics and plots for each selected column inside
    its own expander.
    """
    st.set_page_config(layout="wide")

    # Load the data
    df = pd.read_csv("./data/train.csv")

    st.subheader("変数ごとの詳細な分析")

    # Split the columns into numeric and categorical
    numeric_cols = df.select_dtypes(include=["number"]).columns.tolist()
    categorical_cols = df.select_dtypes(include=["object"]).columns.tolist()

    # Column pickers
    selected_numeric_cols = st.pills("数値変数を選んでください", numeric_cols, selection_mode="multi")
    selected_categorical_cols = st.pills("カテゴリ変数を選んでください", categorical_cols, selection_mode="multi")

    # Detail view for numeric columns
    if selected_numeric_cols:
        for col in selected_numeric_cols:
            with st.expander(f"{col} の詳細分析", expanded=True):
                # Summary statistics
                st.write(f"**{col}** の基本統計量")
                st.write(df[col].describe())

                # Three plots side by side
                cols = st.columns(3)

                with cols[0]:
                    # Histogram with KDE
                    st.write(f"**{col}** のヒストグラムとKDEプロット")
                    fig, ax = plt.subplots(figsize=(5, 3))
                    sns.histplot(df[col].dropna(), kde=True, ax=ax)
                    ax.set_title(f"{col} のヒストグラムとKDE")
                    _show_figure(fig)

                with cols[1]:
                    # Box plot
                    st.write(f"**{col}** の箱ひげ図")
                    fig, ax = plt.subplots(figsize=(5, 3))
                    sns.boxplot(x=df[col], ax=ax)
                    ax.set_title(f"{col} の箱ひげ図")
                    _show_figure(fig)

                with cols[2]:
                    # Scatter plot against another numeric column
                    st.write(f"**{col}** と他の数値変数の関係性")
                    selected_num_col = st.selectbox(f"{col} と比較する数値変数を選んでください", numeric_cols)
                    if selected_num_col:
                        fig, ax = plt.subplots(figsize=(5, 3))
                        sns.scatterplot(x=df[col], y=df[selected_num_col], ax=ax)
                        ax.set_title(f"{col} と {selected_num_col} の散布図")
                        _show_figure(fig)

    # Detail view for categorical columns
    if selected_categorical_cols:
        for col in selected_categorical_cols:
            with st.expander(f"{col} の詳細分析", expanded=True):
                # Basic facts about the column
                st.write(f"**{col}** のユニーク値: {df[col].unique()}")
                # mode() is empty for an all-null column; indexing it directly
                # would raise, so fall back to a placeholder.
                modes = df[col].mode()
                mode_value = modes.iloc[0] if not modes.empty else 'NaN'
                st.write(f"**{col}** の最頻値: {mode_value}")
                st.write(f"**{col}** の各カテゴリの数")
                st.write(df[col].value_counts())

                # Three plots side by side
                cols = st.columns(3)

                with cols[0]:
                    # Bar chart of the category counts
                    st.write(f"**{col}** の分布 (棒グラフ)")
                    value_counts = df[col].value_counts().reset_index()
                    value_counts.columns = [col, 'count']  # normalise the column names
                    fig = px.bar(value_counts, x=col, y='count',
                                 labels={col: col, 'count': 'Count'},
                                 title=f'{col} の分布')
                    st.plotly_chart(fig)

                with cols[1]:
                    # Histogram with KDE
                    # NOTE(review): a KDE over a categorical column has no
                    # statistical meaning; consider kde=False here - confirm intent.
                    st.write(f"**{col}** のヒストグラムとKDE")
                    fig, ax = plt.subplots(figsize=(5, 3))
                    sns.histplot(df[col].dropna(), kde=True, ax=ax)
                    ax.set_title(f"{col} のヒストグラムとKDE")
                    _show_figure(fig)

                with cols[2]:
                    # Box plot of a numeric column grouped by this category
                    st.write(f"**{col}** と数値変数の関係性")
                    selected_num_col = st.selectbox(f"{col} と比較する数値変数を選んでください", numeric_cols)
                    if selected_num_col:
                        fig, ax = plt.subplots(figsize=(5, 3))
                        sns.boxplot(x=df[col], y=df[selected_num_col], ax=ax)
                        ax.set_title(f"{col} と {selected_num_col} の関係")
                        _show_figure(fig)

# Call main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
   main()

03_可視化_数値データ.py


import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px

def main():
    """Render the numeric-data visualisation page.

    Reads ``./data/train.csv`` and offers four views over its numeric
    columns: summary statistics with histograms, violin plots, a scatter plot
    with marginal distributions, and a scatter matrix.
    """
    st.set_page_config(layout="wide")

    df = pd.read_csv("./data/train.csv")

    st.subheader("可視化_数値データ")

    # Computed once and shared by every widget below.
    numeric_cols = df.select_dtypes(include=["number"]).columns.tolist()

    # Summary statistics and histograms
    with st.expander("数値カラムの詳細分析", expanded=True):
        # Multi-select picker for the numeric columns
        selected_cols = st.pills("数値カラムを選んでください", numeric_cols, selection_mode="multi")

        if selected_cols:
            st.write(f"選択したカラム: {', '.join(selected_cols)}")

            # One row of statistics per selected column. The values are kept
            # as floats: casting to int truncated them (a mean of 0.38 became
            # 0) and raised ValueError whenever a statistic was NaN.
            data_example = []
            for col in selected_cols:
                stats = df[col].describe()
                data_example.append([
                    col,
                    stats['min'],
                    stats['max'],
                    stats['mean'],
                    df[col].median(),
                    stats['std'],
                ])

            example_df = pd.DataFrame(data_example, columns=["列名", "最小", "最大", "平均", "中央値", "標準偏差"])
            st.dataframe(example_df)

            # Histograms laid out in three columns
            cols = st.columns(3)
            for i, col in enumerate(selected_cols):
                with cols[i % 3]:
                    fig, ax = plt.subplots(figsize=(5, 3))
                    sns.histplot(df[col].dropna(), bins=30, kde=True, ax=ax)
                    ax.set_title(f"{col} ヒストグラム")
                    st.pyplot(fig)
                    # Close the figure so figures do not pile up across reruns.
                    plt.close(fig)

    # Violin plots
    with st.expander("バイオリンプロット", expanded=True):
        selected_cols = st.multiselect("バイオリンプロットを表示する列を選択", numeric_cols)
        if selected_cols:
            fig = go.Figure()
            for col in selected_cols:
                fig.add_trace(go.Violin(
                    y=df[col].dropna(),
                    name=col,
                    box_visible=True,
                    meanline_visible=True
                ))
            # Layout of the plot
            fig.update_layout(
                title="バイオリンプロット",
                xaxis_title="Columns",
                yaxis_title="Values",
                violinmode="group"
            )
            st.plotly_chart(fig)

    # Scatter plot
    with st.expander("散布図", expanded=True):
        x_col = st.selectbox("X軸に表示する列を選択", numeric_cols)
        y_col = st.selectbox("Y軸に表示する列を選択", numeric_cols)
        color_col = st.selectbox("色分けする列を選択（オプション）", ['None'] + list(df.select_dtypes(include=["object"]).columns))

        if x_col and y_col:
            if color_col != 'None':
                fig = px.scatter(df, x=x_col, y=y_col, color=color_col,
                                 marginal_x="box", marginal_y="violin",
                                 title=f"{x_col} vs {y_col} (colored by {color_col})")
            else:
                fig = px.scatter(df, x=x_col, y=y_col,
                                 marginal_x="box", marginal_y="violin",
                                 title=f"{x_col} vs {y_col}")

            # Force a square figure
            fig.update_layout(
                width=700,
                height=700,
                autosize=False
            )

            # Keep the explicit size instead of stretching to the container
            st.plotly_chart(fig, use_container_width=False)

    # Scatter matrix
    with st.expander("散布図行列", expanded=True):
        selected_cols = st.multiselect("表示する変数を選択(2つ以上の変数を選択してください)", numeric_cols)

        if len(selected_cols) >= 2:
            fig = px.scatter_matrix(df, dimensions=selected_cols, title="散布図行列")

            # Square figure
            fig.update_layout(
                width=800,
                height=800,
            )

            st.plotly_chart(fig)


   

# Call main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

04_可視化_カテゴリデータ.py

import streamlit as st
import pandas as pd
import plotly.express as px

def main():
    """Render the categorical-data visualisation page.

    Reads ``./data/train.csv``, lets the user pick object-dtype columns, then
    shows a summary table (unique count, mode, mode frequency) followed by one
    bar chart per selected column, laid out three per row.
    """
    st.set_page_config(layout="wide")

    # Load the data
    df = pd.read_csv("./data/train.csv")

    st.subheader("可視化_カテゴリデータ")

    def summarise(name):
        """Return [column, unique count (NaN included), mode, mode frequency]."""
        series = df[name]
        modes = series.mode()
        top_value = 'NaN' if modes.empty else modes.iloc[0]
        top_count = series.value_counts().get(top_value, 0)
        return [name, series.nunique(dropna=False), top_value, top_count]

    with st.expander("カテゴリーデータカラムの情報表示", expanded=True):
        # Only object-dtype columns are offered
        object_columns = list(df.select_dtypes(include=['object']).columns)

        # Multi-select picker
        chosen = st.pills("カテゴリカラムを選んでください", object_columns, selection_mode="multi")

        if chosen:
            # Summary table for the chosen columns
            summary_rows = [summarise(name) for name in chosen]
            st.dataframe(
                pd.DataFrame(
                    summary_rows,
                    columns=["列名", "ユニークな値数", "最頻値", "最頻値の出現回数"],
                )
            )

            # Distribution of each chosen column, three charts per row
            slots = st.columns(3)
            for position, name in enumerate(chosen):
                counts = df[name].value_counts().reset_index()
                counts.columns = [name, 'count']  # normalise the column names

                chart = px.bar(
                    counts,
                    x=name,
                    y='count',
                    labels={name: name, 'count': 'Count'},
                )

                with slots[position % 3]:
                    st.plotly_chart(chart)


# Call main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

05_filter法による特徴量選択.py

import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from scipy.stats import f_oneway



def calculate_vif(df):
    """Compute the variance inflation factor (VIF) for the columns of *df*.

    Rows containing NaN or +/-inf are dropped before the computation.

    Args:
        df: DataFrame whose columns are all numeric.

    Returns:
        DataFrame with a "feature" and a "VIF" column, one row per column of
        the design matrix. That matrix is *df* plus the intercept column added
        by ``add_constant``, so the intercept appears as a row as well.

    Raises:
        ValueError: if no row is left after dropping NaN/inf values.
    """
    # Replace infinities with a float NaN. Using pd.NA here can turn float
    # columns into object dtype, which the regression inside
    # variance_inflation_factor cannot work with.
    nan = float("nan")
    df_clean = df.replace([float("inf"), -float("inf")], nan).dropna()
    if df_clean.empty:
        raise ValueError(
            "calculate_vif: no rows left after dropping NaN/inf values"
        )

    # VIF has to be computed on a design matrix that includes a constant term.
    df_with_const = add_constant(df_clean)

    # Build the float matrix once instead of once per column.
    design = df_with_const.to_numpy(dtype=float)

    vif_data = pd.DataFrame()
    vif_data["feature"] = df_with_const.columns
    vif_data["VIF"] = [
        variance_inflation_factor(design, i) for i in range(design.shape[1])
    ]

    return vif_data


def main():
    """Render the filter-method feature selection page.

    Reads ``./data/train.csv`` and shows: a ranking of pairwise correlations,
    a correlation heatmap for user-selected columns, a VIF table and chart,
    and the variance of every numeric column.
    """
    st.set_page_config(layout="wide")

    df = pd.read_csv("./data/train.csv")

    st.subheader("相関とVIFによる相互共線性分析")

    # Column groups shared by every section below.
    numeric_cols = df.select_dtypes(include=["number"]).columns.tolist()
    categorical_cols = df.select_dtypes(include=["object"]).columns.tolist()

    with st.expander("相関ランキング", expanded=True):
        corr_matrix = df[numeric_cols].corr()

        # Keep each unordered pair exactly once. Unstacking the full matrix
        # listed every pair twice, and filtering self-correlations with
        # "< 1" also discarded genuinely perfectly correlated pairs.
        names = list(corr_matrix.columns)
        pair_corr = {
            (left, right): corr_matrix.loc[left, right]
            for i, left in enumerate(names)
            for right in names[i + 1:]
        }
        sorted_corr = (
            pd.Series(pair_corr, dtype="float64")
            .dropna()
            .sort_values(ascending=False, key=abs)
        )

        # Number of rankable pairs
        max_combinations = len(sorted_corr)

        if max_combinations == 0:
            st.info("相関を計算できる数値変数の組み合わせがありません。")
        else:
            if max_combinations == 1:
                # A slider needs min < max, so skip it for a single pair.
                top_n = 1
            else:
                # The default must not exceed the slider maximum.
                top_n = st.slider(
                    "トップNの相関係数を表示",
                    min_value=1,
                    max_value=max_combinations,
                    value=min(10, max_combinations),
                )

            st.subheader(f"相関係数のトップ {top_n} 件")
            st.write(sorted_corr.head(top_n))

    # Heatmap
    with st.expander("相関関係 (ヒートマップ)", expanded=True):
        # Pickers for numeric and categorical columns
        selected_numeric_cols = st.pills("数値変数を選んでください", numeric_cols, selection_mode="multi")
        selected_categorical_cols = st.pills("カテゴリ変数を選んでください", categorical_cols, selection_mode="multi")

        if selected_numeric_cols:
            numeric_df = df[selected_numeric_cols]
            show_numbers = st.checkbox("ヒートマップに数字を表示する", value=True)  # toggle the annotations
            fig, ax = plt.subplots(figsize=(10, 8))
            sns.heatmap(numeric_df.corr(), annot=show_numbers, cmap="coolwarm", fmt=".2f" if show_numbers else "", ax=ax)
            st.pyplot(fig)
            # Close the figure so figures do not pile up across reruns.
            plt.close(fig)

        if selected_categorical_cols:
            # Categorical columns get a count bar chart instead
            for col in selected_categorical_cols:
                value_counts = df[col].value_counts().reset_index()
                value_counts.columns = [col, 'count']  # normalise the column names

                fig = px.bar(value_counts, x=col, y='count',
                             labels={col: col, 'count': 'Count'},
                             title=f'{col}の分布')

                st.plotly_chart(fig)

    # VIF analysis
    st.subheader("VIF分析")
    with st.expander("VIF分析", expanded=True):
        vif_input = df[numeric_cols]

        # calculate_vif drops incomplete rows; with nothing left (or no
        # numeric column at all) there is nothing to fit.
        if vif_input.dropna().empty:
            st.warning("VIFを計算できる数値データがありません（欠損値を含まない行が必要です）。")
        else:
            vif_data = calculate_vif(vif_input)

            # Sort by VIF, highest first
            st.subheader("VIFが高い順に表示します")
            vif_data_sorted = vif_data.sort_values("VIF", ascending=False)

            st.write(vif_data_sorted)

            # Flag features with VIF > 10 (a common rule of thumb). The
            # "const" row is the intercept added for the computation, not a
            # feature, so it is left out of the warning list.
            is_feature = vif_data_sorted["feature"] != "const"
            high_vif = vif_data_sorted[is_feature & (vif_data_sorted["VIF"] > 10)]
            if not high_vif.empty:
                st.subheader("以下の特徴量は多重共線性が高い可能性があります（VIF > 10）:")
                st.write(high_vif)
            else:
                st.subheader("多重共線性が高い特徴量は見つかりませんでした。")

            # Bar chart of the VIF values, highest first
            fig = go.Figure()

            fig.add_trace(go.Bar(
                x=vif_data_sorted["feature"],
                y=vif_data_sorted["VIF"],
                marker=dict(color='rgba(255, 99, 132, 0.6)', line=dict(color='rgba(255, 99, 132, 1)', width=1)),
                text=vif_data_sorted["VIF"],
                hoverinfo="x+text",
            ))

            fig.update_layout(
                title="VIFの大きい順に並べた棒グラフ",
                xaxis_title="特徴量",
                yaxis_title="VIF",
                showlegend=False,
                xaxis=dict(tickangle=45),
            )

            st.plotly_chart(fig)

    # Variance check
    st.subheader("分散の確認")

    with st.expander("分散分析", expanded=True):
        # Variance of every numeric column, largest first
        variance = df[numeric_cols].var()
        variance_sorted = variance.sort_values(ascending=False)

        st.write(variance_sorted)

        # Bar chart of the variances
        fig = go.Figure()

        fig.add_trace(go.Bar(
            x=variance_sorted.index,
            y=variance_sorted.values,
            marker=dict(color='rgba(0, 123, 255, 0.6)', line=dict(color='rgba(0, 123, 255, 1)', width=1)),
            text=variance_sorted.values,
            hoverinfo="x+text",
        ))

        fig.update_layout(
            title="各変数の分散",
            xaxis_title="変数",
            yaxis_title="分散",
            showlegend=False,
            xaxis=dict(tickangle=45),
        )

        st.plotly_chart(fig)

# Call main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

06_チェックリスト.py

import streamlit as st
import pandas as pd

def main():
    """Render the post-preprocessing checklist page.

    Shows a selector for the kind of prediction task, one checkbox per
    checklist item (with one extra item for regression or classification),
    and a success or warning message depending on whether every box is ticked.
    """
    st.set_page_config(layout="wide")

    # Kind of prediction task
    task_kind = st.selectbox("予測アルゴリズムを選択", ["回帰", "分類", "どちらでもない"])

    # Checklist section
    st.subheader("データ前処理後の最終チェックリスト")

    # Items common to every task, in display order
    items = dict([
        ("外れ値の確認", "外れ値が適切に処理されたか（例：IQR法、Zスコア法）"),
        ("欠損値の確認", "欠損値が適切に処理されたか（例：削除、補完）"),
        ("重複データの確認", "重複行が削除されたか"),
        ("データ型の確認", "各列のデータ型が正しいか（数値、文字列など）"),
        ("スケーリングの確認", "数値データのスケーリング（標準化、正規化）が必要か確認"),
        ("カテゴリ変数の確認", "カテゴリ変数が適切にエンコードされたか（例：One-Hot Encoding）"),
        ("分布の確認", "特徴量の分布が偏っていないか（例：歪度、尖度の確認）"),
        ("相関の確認", "強い相関がある特徴量を確認し、多重共線性がないか確認（VIFなど）"),
        ("データの一貫性", "異常な値や一貫性のない値（例えば、年齢が負の値など）がないか確認"),
        ("ターゲット変数の確認", "ターゲット変数に偏りがないか（クラス不均衡など）"),
    ])

    # One task-specific item is appended for regression or classification
    if model_is_regression := (task_kind == "回帰"):
        items["ターゲット変数のスケーリング"] = "回帰モデルの場合、ターゲット変数がスケーリングされているか確認"
    if not model_is_regression and task_kind == "分類":
        items["ターゲット変数のクラス不均衡"] = "分類問題ではターゲット変数のクラス不均衡を確認"

    # One checkbox per item, remembering its ticked state
    ticked = {title: st.checkbox(f"{title}: {detail}") for title, detail in items.items()}

    # Report whether the whole checklist is complete
    if not all(ticked.values()):
        st.warning("まだ確認が完了していない項目があります。再度確認してください。")
    else:
        st.success("データ前処理が完了しました。")

# Call main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do by signing up