探索的データ分析
Exploratory Data Analysis(EDA)の重要性は
EDA ツール
CSVデータを例に、EDAで使えるデータ可視化ツールをまとめます。
前提条件(Requirements)
- Download https://www.kaggle.com/c/titanic/data?select=train.csv
- Install
pip install pandas
Dataprep
Install
pip install dataprep
Usage
import pandas as pd
from dataprep.eda import create_report
train = pd.read_csv('train.csv')
create_report(train).show_browser()
Pandas Profiling
Install
pip install pandas-profiling
Usage
import pandas as pd
from pandas_profiling import ProfileReport
train = pd.read_csv('train.csv')
profile = ProfileReport(train, title="Report")
profile
# Save as a HTML file
# profile.to_file("pandas_profiling_train.html")
Sweetviz
Install
pip install sweetviz
Usage
import pandas as pd
import sweetviz as sv
train = pd.read_csv('train.csv')
analyze_report = sv.analyze(train)
analyze_report.show_html('report.html', open_browser=True)
AutoViz
Install
pip install autoviz
Usage
from autoviz.AutoViz_Class import AutoViz_Class
AV = AutoViz_Class()
df_av = AV.AutoViz('train.csv', chart_format='bokeh')
# Local web server
#df_av = AV.AutoViz('train.csv', chart_format='server')
# Save charts as HTML files at AutoViz_Plots/
#df_av = AV.AutoViz('train.csv', chart_format='html')
PipeRider
CLIでEDAとアサーション(データの検証)を行うツールです
Install
pip install 'piperider[csv]'
CLI Usage
piperider init
Initialize piperider to path /Users/gabriel/Workspace/playground/titanic/.piperider
[?] What is your data source name? (alphanumeric and underscore are allowed): titanic
[?] Which data source would you like to connect to?: csv
> csv
Please enter the following fields for csv
[?] Path of csv file: train.csv
piperider run
[?] Do you want to auto generate recommended assertions for this datasource [Yes/no]? Yes
Generating reports from: ~/titanic/.piperider/outputs/latest/run.json
Report generated in ~/titanic/.piperider/outputs/latest/index.html
Whylogs (追記)
Install
# With Profile Visualizer
pip install 'whylogs[viz]'
Usage
import whylogs as why
import pandas as pd
#dataframe
train = pd.read_csv("train.csv")
result = why.log(pandas=train)
train_view = result.view()
from whylogs.viz import NotebookProfileVisualizer
visualization = NotebookProfileVisualizer()
visualization.set_profiles(train_view)
visualization.profile_summary()
Data Profiler(追記)
AIでEDAができるこちらのProfilerですが、HTMLのレポートはありません。
Install
# Report only
pip install 'DataProfiler[report]'
# With Tensorflow
pip install 'DataProfiler[ml]'
# Full package
pip install 'DataProfiler[full]'
Usage
import json
from dataprofiler import Data, Profiler
data = Data("train.csv") # Auto-Detect & Load: CSV, AVRO, Parquet, JSON, Text, URL
print(data.data.head(5)) # Access data directly via a compatible Pandas DataFrame
profile = Profiler(data) # Calculate Statistics, Entity Recognition, etc
readable_report = profile.report(report_options={"output_format": "compact"})
print(json.dumps(readable_report, indent=4))
レポートのJSON
{
"global_stats": {
"samples_used": 891,
"column_count": 12,
"row_count": 891,
"row_has_null_ratio": 0.7946,
"row_is_null_ratio": 0.0,
"unique_row_ratio": 1.0,
"duplicate_row_count": 0,
"file_type": "csv",
"encoding": "utf-8",
"correlation_matrix": null,
"chi2_matrix": "[[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan], ... , [nan, 0., 0., nan, 0., 0., 0., 0., nan, nan, nan, 1.]]",
"profile_schema": {
"PassengerId": [
0
],
"Survived": [
1
],
"Pclass": [
2
],
"Name": [
3
],
"Sex": [
4
],
"Age": [
5
],
"SibSp": [
6
],
"Parch": [
7
],
"Ticket": [
8
],
"Fare": [
9
],
"Cabin": [
10
],
"Embarked": [
11
]
},
"times": {
"row_stats": 0.003
}
},
"data_stats": [
{
"column_name": "PassengerId",
"data_type": "int",
"categorical": false,
"order": "ascending",
"samples": "['600', '74', '831', '76', '811']",
"statistics": {
"min": 1.0,
"max": 891.0,
"mode": "[1.445, 2.335, 3.225, 4.115, 5.005]",
"median": 446.445,
"sum": 397386.0,
"mean": 446.0,
"variance": 66231.0,
"stddev": 257.3538,
"skewness": 0.0,
"kurtosis": -1.2,
"quantiles": {
"0": 223.2775,
"1": 446.445,
"2": 668.7225
},
"median_abs_deviation": 222.7225,
"num_zeros": 0,
"num_negatives": 0,
"unique_count": 891,
"unique_ratio": 1.0,
"sample_size": 891,
"null_count": 0,
"null_types": "[]",
"data_type_representation": {
"datetime": 0.0,
"int": 1.0,
"float": 1.0,
"string": 1.0
}
}
},
{
"column_name": "Survived",
"data_type": "int",
"categorical": true,
"order": "random",
"samples": "['1', '0', '0', '1', '1']",
"statistics": {
"min": 0.0,
"max": 1.0,
"mode": "[0.0005]",
"median": 0.0008,
"sum": 342.0,
"mean": 0.3838,
"variance": 0.2368,
"stddev": 0.4866,
"skewness": 0.4785,
"kurtosis": -1.775,
"quantiles": {
"0": 0.0004,
"1": 0.0008,
"2": 0.9993
},
"median_abs_deviation": 0,
"num_zeros": 549,
"num_negatives": 0,
"unique_count": 2,
"unique_ratio": 0.0022,
"categories": "['0', '1']",
"gini_impurity": 0.473,
"unalikeability": 0.4735,
"categorical_count": {
"0": 549,
"1": 342
},
"sample_size": 891,
"null_count": 0,
"null_types": "[]",
"data_type_representation": {
"datetime": 0.0,
"int": 1.0,
"float": 1.0,
"string": 1.0
}
}
},
{
"column_name": "Pclass",
"data_type": "int",
"categorical": true,
"order": "random",
"samples": "['2', '2', '1', '1', '3']",
"statistics": {
"min": 1.0,
"max": 3.0,
"mode": "[2.999]",
"median": 2.9982,
"sum": 2057.0,
"mean": 2.3086,
"variance": 0.699,
"stddev": 0.8361,
"skewness": -0.6305,
"kurtosis": -1.28,
"quantiles": {
"0": 2.0001,
"1": 2.9982,
"2": 2.9991
},
"median_abs_deviation": 0.0016,
"num_zeros": 0,
"num_negatives": 0,
"unique_count": 3,
"unique_ratio": 0.0034,
"categories": "['3', '1', '2']",
"gini_impurity": 0.5949,
"unalikeability": 0.5956,
"categorical_count": {
"3": 491,
"1": 216,
"2": 184
},
"sample_size": 891,
"null_count": 0,
"null_types": "[]",
"data_type_representation": {
"datetime": 0.0,
"int": 1.0,
"float": 1.0,
"string": 1.0
}
}
},
{
"column_name": "Name",
"data_type": "string",
"categorical": false,
"order": "random",
"samples": "['Smiljanic, Mr. Mile', 'Isham, Miss. Ann Elizabeth',\n 'Petranec, Miss. Matilda', 'Ling, Mr. Lee',\n 'Kirkland, Rev. Charles Leonard']",
"statistics": {
"min": 12.0,
"max": 82.0,
"mode": "[19.035]",
"median": 25.0041,
"sum": 24026.0,
"mean": 26.9652,
"variance": 86.1482,
"stddev": 9.2816,
"skewness": 1.3926,
"kurtosis": 2.5594,
"quantiles": {
"0": 20.0137,
"1": 25.0041,
"2": 30.0586
},
"median_abs_deviation": 5.0216,
"vocab": "['O', ' ', 'k', 'a', 'P', ... , 'K', 'j', 'd', 'Q', 'x']",
"unique_count": 891,
"unique_ratio": 1.0,
"sample_size": 891,
"null_count": 0,
"null_types": "[]",
"data_type_representation": {
"datetime": 0.0,
"int": 0.0,
"float": 0.0,
"string": 1.0
}
}
},
{
"column_name": "Sex",
"data_type": "string",
"categorical": true,
"order": "random",
"samples": "['male', 'male', 'female', 'male', 'male']",
"statistics": {
"min": 4.0,
"max": 6.0,
"mode": "[4.001]",
"median": 4.0015,
"sum": 4192.0,
"mean": 4.7048,
"variance": 0.9139,
"stddev": 0.956,
"skewness": 0.6189,
"kurtosis": -1.6206,
"quantiles": {
"0": 4.0008,
"1": 4.0015,
"2": 5.9986
},
"median_abs_deviation": 0,
"vocab": "['e', 'm', 'a', 'f', 'l']",
"unique_count": 2,
"unique_ratio": 0.0022,
"categories": "['male', 'female']",
"gini_impurity": 0.4564,
"unalikeability": 0.4569,
"categorical_count": {
"male": 577,
"female": 314
},
"sample_size": 891,
"null_count": 0,
"null_types": "[]",
"data_type_representation": {
"datetime": 0.0,
"int": 0.0,
"float": 0.0,
"string": 1.0
}
}
},
{
"column_name": "Age",
"data_type": "float",
"categorical": true,
"order": "random",
"samples": "['28', '34', '27', '26', '63']",
"statistics": {
"min": 0.42,
"max": 80.0,
"mode": "[24.01547]",
"median": 28.0183,
"sum": 21205.17,
"mean": 29.6991,
"variance": 211.0191,
"stddev": 14.5265,
"skewness": 0.3891,
"kurtosis": 0.1783,
"quantiles": {
"0": 20.0736,
"1": 28.0183,
"2": 38.0505
},
"median_abs_deviation": 8.9421,
"num_zeros": 0,
"num_negatives": 0,
"precision": {
"min": 1,
"max": 3,
"mean": 1.85,
"var": 0.18,
"std": 0.424,
"sample_size": 714,
"margin_of_error": 0.0523,
"confidence_level": 0.999
},
"unique_count": 88,
"unique_ratio": 0.1232,
"categories": "['24', '22', '18', '19', ... , '55.5', '0.92', '23.5', '74']",
"gini_impurity": 0.978,
"unalikeability": 0.9794,
"categorical_count": {
"24": 30,
"22": 27,
"18": 26,
"19": 25,
"28": 25,
"30": 25,
"21": 24,
"25": 23,
"36": 22,
"29": 20,
"32": 18,
"35": 18,
"27": 18,
"26": 18,
"16": 17,
"31": 17,
"20": 15,
"34": 15,
"33": 15,
"23": 15,
"39": 14,
"40": 13,
"17": 13,
"42": 13,
"45": 12,
"38": 11,
"50": 10,
"2": 10,
"4": 10,
"44": 9,
"48": 9,
"47": 9,
"54": 8,
"9": 8,
"1": 7,
"51": 7,
"14": 6,
"52": 6,
"37": 6,
"49": 6,
"41": 6,
"3": 6,
"58": 5,
"15": 5,
"43": 5,
"62": 4,
"56": 4,
"5": 4,
"11": 4,
"60": 4,
"8": 4,
"6": 3,
"46": 3,
"61": 3,
"65": 3,
"7": 3,
"10": 2,
"64": 2,
"13": 2,
"63": 2,
"30.5": 2,
"57": 2,
"70": 2,
"0.75": 2,
"71": 2,
"59": 2,
"0.83": 2,
"40.5": 2,
"55": 2,
"32.5": 2,
"28.5": 2,
"45.5": 2,
"34.5": 1,
"0.42": 1,
"0.67": 1,
"66": 1,
"24.5": 1,
"80": 1,
"20.5": 1,
"53": 1,
"14.5": 1,
"70.5": 1,
"12": 1,
"36.5": 1,
"55.5": 1,
"0.92": 1,
"23.5": 1,
"74": 1
},
"sample_size": 891,
"null_count": 177,
"null_types": "['']",
"data_type_representation": {
"datetime": 0.0,
"int": 0.965,
"float": 1.0,
"string": 1.0
}
}
},
{
"column_name": "SibSp",
"data_type": "int",
"categorical": true,
"order": "random",
"samples": "['1', '0', '0', '5', '0']",
"statistics": {
"min": 0.0,
"max": 8.0,
"mode": "[0.004]",
"median": 0.0059,
"sum": 466.0,
"mean": 0.523,
"variance": 1.216,
"stddev": 1.1027,
"skewness": 3.6954,
"kurtosis": 17.8804,
"quantiles": {
"0": 0.0029,
"1": 0.0059,
"2": 1.0023
},
"median_abs_deviation": 0,
"num_zeros": 608,
"num_negatives": 0,
"unique_count": 7,
"unique_ratio": 0.0079,
"categories": "['0', '1', '2', '4', '3', '8', '5']",
"gini_impurity": 0.4775,
"unalikeability": 0.4781,
"categorical_count": {
"0": 608,
"1": 209,
"2": 28,
"4": 18,
"3": 16,
"8": 7,
"5": 5
},
"sample_size": 891,
"null_count": 0,
"null_types": "[]",
"data_type_representation": {
"datetime": 0.0,
"int": 1.0,
"float": 1.0,
"string": 1.0
}
}
},
{
"column_name": "Parch",
"data_type": "int",
"categorical": true,
"order": "random",
"samples": "['1', '0', '0', '0', '0']",
"statistics": {
"min": 0.0,
"max": 6.0,
"mode": "[0.003]",
"median": 0.0039,
"sum": 340.0,
"mean": 0.3816,
"variance": 0.6497,
"stddev": 0.8061,
"skewness": 2.7491,
"kurtosis": 9.7781,
"quantiles": {
"0": 0.002,
"1": 0.0039,
"2": 0.0059
},
"median_abs_deviation": 0,
"num_zeros": 678,
"num_negatives": 0,
"unique_count": 7,
"unique_ratio": 0.0079,
"categories": "['0', '1', '2', '5', '3', '4', '6']",
"gini_impurity": 0.3953,
"unalikeability": 0.3957,
"categorical_count": {
"0": 678,
"1": 118,
"2": 80,
"5": 5,
"3": 5,
"4": 4,
"6": 1
},
"sample_size": 891,
"null_count": 0,
"null_types": "[]",
"data_type_representation": {
"datetime": 0.0,
"int": 1.0,
"float": 1.0,
"string": 1.0
}
}
},
{
"column_name": "Ticket",
"data_type": "string",
"categorical": false,
"order": "random",
"samples": "['113781', '315097', '371110', 'C.A. 17248', '36947']",
"statistics": {
"min": 3.0,
"max": 18.0,
"mode": "[6.0075]",
"median": 6.0076,
"sum": 6015.0,
"mean": 6.7508,
"variance": 7.5379,
"stddev": 2.7455,
"skewness": 2.211,
"kurtosis": 5.1754,
"quantiles": {
"0": 5.0087,
"1": 6.0076,
"2": 6.9985
},
"median_abs_deviation": 0.9972,
"vocab": "['H', '5', '7', 'Q', '3', ... , '0', 'P', 'e', 'L', 'N']",
"unique_count": 681,
"unique_ratio": 0.7643,
"sample_size": 891,
"null_count": 0,
"null_types": "[]",
"data_type_representation": {
"datetime": 0.4837,
"int": 0.7419,
"float": 0.7419,
"string": 1.0
}
}
},
{
"column_name": "Fare",
"data_type": "float",
"categorical": false,
"order": "random",
"samples": "['7.8958', '9.5', '0', '34.375', '227.525']",
"statistics": {
"min": 0.0,
"max": 512.3292,
"mode": "[7.9411026]",
"median": 14.5475,
"sum": 28693.9493,
"mean": 32.2042,
"variance": 2469.4368,
"stddev": 49.6934,
"skewness": 4.7873,
"kurtosis": 33.3981,
"quantiles": {
"0": 8.0222,
"1": 14.5475,
"2": 31.124
},
"median_abs_deviation": 6.945,
"num_zeros": 15,
"num_negatives": 0,
"precision": {
"min": 0,
"max": 7,
"mean": 3.899,
"var": 2.3898,
"std": 1.5459,
"sample_size": 891,
"margin_of_error": 0.1704,
"confidence_level": 0.999
},
"unique_count": 248,
"unique_ratio": 0.2783,
"sample_size": 891,
"null_count": 0,
"null_types": "[]",
"data_type_representation": {
"datetime": 0.0,
"int": 0.1807,
"float": 1.0,
"string": 1.0
}
}
},
{
"column_name": "Cabin",
"data_type": "string",
"categorical": false,
"order": "random",
"samples": "['B42', 'E25', 'D', 'E33', 'A23']",
"statistics": {
"min": 1.0,
"max": 15.0,
"mode": "[2.995]",
"median": 2.9961,
"sum": 732.0,
"mean": 3.5882,
"variance": 4.3025,
"stddev": 2.0743,
"skewness": 3.1847,
"kurtosis": 11.7603,
"quantiles": {
"0": 2.9905,
"1": 2.9961,
"2": 3.0017
},
"median_abs_deviation": 0.0056,
"vocab": "['5', '7', '3', 'T', 'E', ... , '1', 'D', '4', ' ', '0']",
"unique_count": 147,
"unique_ratio": 0.7206,
"sample_size": 891,
"null_count": 687,
"null_types": "['']",
"data_type_representation": {
"datetime": 0.0,
"int": 0.0,
"float": 0.0,
"string": 1.0
}
}
},
{
"column_name": "Embarked",
"data_type": "string",
"categorical": true,
"order": "random",
"samples": "['S', 'S', 'S', 'S', 'S']",
"statistics": {
"min": 1.0,
"max": 1.0,
"mode": "[1.]",
"median": 1.0,
"sum": 889.0,
"mean": 1.0,
"variance": 0.0,
"stddev": 0.0,
"skewness": 0.0,
"kurtosis": -3.0102,
"quantiles": {
"0": 1.0,
"1": 1.0,
"2": 1.0
},
"median_abs_deviation": 0,
"vocab": "['S', 'Q', 'C']",
"unique_count": 3,
"unique_ratio": 0.0034,
"categories": "['S', 'C', 'Q']",
"gini_impurity": 0.432,
"unalikeability": 0.4325,
"categorical_count": {
"S": 644,
"C": 168,
"Q": 77
},
"sample_size": 891,
"null_count": 2,
"null_types": "['']",
"data_type_representation": {
"datetime": 0.0,
"int": 0.0,
"float": 0.0,
"string": 1.0
}
}
}
]
}
Lux(追記)
Install
pip install lux-api
Usage
import lux
import pandas as pd
df = pd.read_csv("train.csv")
df
# df.save_as_html("lux.html")
Polymersearch(追記)
Web上でスプレッドシートやCSVを読み込んでEDAを行うサービスです
本記事の執筆時点では、14日間の無料トライアルがあります