はじめに
CSVから列をドロップするだけのアプリつくりまーす
開発環境
- Windows 10 PC
- Python 3.10
- PyQt5
導入
1.ライブラリのインストール
pip install pandas
pip install seaborn
pip install PyQt5
使用しているライブラリはこんな感じ
contourpy==1.1.0
cycler==0.11.0
fonttools==4.42.0
kiwisolver==1.4.4
matplotlib==3.7.2
numpy==1.25.2
packaging==23.1
pandas==2.0.3
Pillow==10.0.0
pyparsing==3.0.9
PyQt5==5.15.9
PyQt5-Qt5==5.15.2
PyQt5-sip==12.12.2
python-dateutil==2.8.2
pytz==2023.3
seaborn==0.12.2
six==1.16.0
tzdata==2023.3
pip freeze > requirements.txt
上記のコマンドで、インストール済みライブラリの一覧を requirements.txt に書き出せます。
2.コードを作成。ChatGPTに聞きまくれ!
import sys
import pandas as pd
import seaborn as sns
from PyQt5 import QtWidgets
from PyQt5.QtWidgets import QFileDialog, QTabWidget
from PyQt5.QtCore import Qt
from PyQt5.QtGui import QIcon
from matplotlib.backends.backend_qt5agg import FigureCanvasQTAgg as FigureCanvas
from matplotlib.figure import Figure
class MplCanvas(FigureCanvas):
    """Qt canvas widget that owns a Matplotlib figure with a single subplot.

    The subplot is exposed as ``self.axes`` so callers can draw on it directly.
    """

    def __init__(self, parent=None, width=10, height=8, dpi=100):
        """Build the figure and its single axes, then initialise the canvas.

        Args:
            parent: Accepted for call-site compatibility; it is not forwarded
                to the canvas base class.
            width: Figure width in inches.
            height: Figure height in inches.
            dpi: Figure resolution in dots per inch.
        """
        figure = Figure(figsize=(width, height), dpi=dpi)
        # 111 = one row, one column, first (and only) subplot.
        self.axes = figure.add_subplot(111)
        super().__init__(figure)
class FeatureDropperApp(QtWidgets.QWidget):
    """Main window: load a CSV, inspect it, drop columns, and save the result.

    The left side holds the load/drop/save buttons and the column list. The
    right side is a tab widget with a row preview, summary statistics,
    per-column histograms, a correlation heatmap, and a missing-value chart.
    The loaded table is kept in ``self.data`` (``None`` until a file is loaded).
    """

    # Number of leading rows shown in the "First 100 Rows" tab.
    PREVIEW_ROWS = 100

    def __init__(self):
        """Create all widgets, wire the button signals, and lay out the window."""
        super().__init__()

        # Load Button
        self.load_button = QtWidgets.QPushButton("Load CSV File")
        self.load_button.clicked.connect(self.load_csv)

        # Features List (multi-selection so several columns can be dropped at once)
        self.features_list = QtWidgets.QListWidget()
        self.features_list.setSelectionMode(QtWidgets.QAbstractItemView.MultiSelection)

        # Drop Button
        self.drop_button = QtWidgets.QPushButton("Drop Selected Features")
        self.drop_button.clicked.connect(self.drop_features)

        # Save Button
        self.save_button = QtWidgets.QPushButton("Save As CSV")
        self.save_button.clicked.connect(self.save_csv)

        # Layout for left side
        left_layout = QtWidgets.QVBoxLayout()
        left_layout.addWidget(self.load_button)
        left_layout.addWidget(self.features_list)
        left_layout.addWidget(self.drop_button)
        left_layout.addWidget(self.save_button)

        # Tab Widget (right side)
        self.tab_widget = QTabWidget()
        self.distribution_scroll_area = QtWidgets.QScrollArea()
        self.distribution_scroll_area.setHorizontalScrollBarPolicy(Qt.ScrollBarAsNeeded)
        self.correlation_canvas = MplCanvas(self, width=6, height=6, dpi=100)
        self.missing_values_canvas = MplCanvas(self, width=6, height=6, dpi=100)

        # Read-only table for the row preview
        self.first_100_rows_table = QtWidgets.QTableWidget()
        self.first_100_rows_table.setEditTriggers(
            QtWidgets.QAbstractItemView.NoEditTriggers
        )

        # Statistics Tab with Numerical and Categorical sub-tabs
        self.statistics_tab_widget = QTabWidget()
        self.numerical_statistics_layout = QtWidgets.QVBoxLayout()
        self.numerical_statistics_widget = QtWidgets.QWidget()
        self.numerical_statistics_widget.setLayout(self.numerical_statistics_layout)
        self.categorical_statistics_layout = QtWidgets.QVBoxLayout()
        self.categorical_statistics_widget = QtWidgets.QWidget()
        self.categorical_statistics_widget.setLayout(self.categorical_statistics_layout)
        self.statistics_tab_widget.addTab(self.numerical_statistics_widget, "Numerical")
        self.statistics_tab_widget.addTab(
            self.categorical_statistics_widget, "Categorical"
        )

        self.tab_widget.addTab(self.first_100_rows_table, "First 100 Rows")
        self.tab_widget.addTab(self.statistics_tab_widget, "Statistics")
        self.tab_widget.addTab(self.distribution_scroll_area, "Distribution")
        self.tab_widget.addTab(self.correlation_canvas, "Correlation")
        self.tab_widget.addTab(self.missing_values_canvas, "Missing Values")

        # Main layout: stretch factors 1:3 give the tabs most of the width
        main_layout = QtWidgets.QHBoxLayout()
        main_layout.addLayout(left_layout, 1)
        main_layout.addWidget(self.tab_widget, 3)
        self.setLayout(main_layout)

        self.data = None
        self.resize(800, 600)  # adjust the numbers as you see fit
        self.setWindowTitle("Feature Dropper")
        self.setWindowIcon(QIcon("favicon.ico"))

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    def _refresh_views(self):
        """Rebuild the column list and every tab from the current ``self.data``."""
        # Clear first so columns from a previously loaded file do not linger.
        self.features_list.clear()
        self.features_list.addItems(self.data.columns.tolist())
        self.show_first_100_rows()
        self.show_statistics()
        self.show_distribution()
        self.show_correlation()
        self.show_missing_values()

    @staticmethod
    def _clear_layout(layout):
        """Remove every item from *layout* and schedule its widgets for deletion."""
        while layout.count():
            widget = layout.takeAt(0).widget()
            if widget is not None:
                widget.setParent(None)
                widget.deleteLater()

    @staticmethod
    def _show_placeholder(axes, message):
        """Show *message* centered on *axes* with the ticks hidden."""
        axes.set_xticks([])
        axes.set_yticks([])
        axes.text(
            0.5, 0.5, message, ha="center", va="center", transform=axes.transAxes
        )

    @staticmethod
    def _build_numerical_table(numerical_data):
        """Return a read-only table of mean/median/min/max per numerical column."""
        table = QtWidgets.QTableWidget()
        table.setColumnCount(len(numerical_data.columns))
        table.setRowCount(4)
        table.setHorizontalHeaderLabels([str(c) for c in numerical_data.columns])
        table.setVerticalHeaderLabels(["Mean", "Median", "Min", "Max"])
        table.setEditTriggers(QtWidgets.QAbstractItemView.NoEditTriggers)
        for col_index, (_, series) in enumerate(numerical_data.items()):
            stats = (series.mean(), series.median(), series.min(), series.max())
            for row_index, value in enumerate(stats):
                table.setItem(
                    row_index, col_index, QtWidgets.QTableWidgetItem(str(value))
                )
        return table

    @staticmethod
    def _build_categorical_table(categorical_data):
        """Return a read-only table of unique count and mode per categorical column."""
        table = QtWidgets.QTableWidget()
        table.setColumnCount(len(categorical_data.columns))
        table.setRowCount(2)
        table.setHorizontalHeaderLabels([str(c) for c in categorical_data.columns])
        table.setVerticalHeaderLabels(["Unique values", "Most common value"])
        table.setEditTriggers(QtWidgets.QAbstractItemView.NoEditTriggers)
        for col_index, (_, series) in enumerate(categorical_data.items()):
            table.setItem(
                0, col_index, QtWidgets.QTableWidgetItem(str(series.nunique()))
            )
            # mode() is empty when the column has no non-null values; indexing
            # it unconditionally would raise IndexError.
            modes = series.mode()
            most_common = str(modes.iat[0]) if not modes.empty else ""
            table.setItem(1, col_index, QtWidgets.QTableWidgetItem(most_common))
        return table

    # ------------------------------------------------------------------
    # Tab renderers
    # ------------------------------------------------------------------
    def show_first_100_rows(self):
        """Fill the preview table with the first ``PREVIEW_ROWS`` rows of the data."""
        if self.data is None:
            return
        preview = self.data.head(self.PREVIEW_ROWS)
        table = self.first_100_rows_table
        table.setRowCount(len(preview))
        table.setColumnCount(len(preview.columns))
        table.setHorizontalHeaderLabels([str(c) for c in preview.columns])
        # itertuples keeps each column's own dtype; building a Series per row
        # would upcast mixed int/float rows and show integers as "1.0".
        for row_index, row in enumerate(preview.itertuples(index=False, name=None)):
            for col_index, value in enumerate(row):
                table.setItem(
                    row_index, col_index, QtWidgets.QTableWidgetItem(str(value))
                )
        table.resizeColumnsToContents()

    def show_statistics(self):
        """Rebuild the numerical and categorical statistics tables."""
        if self.data is None:
            return
        numerical_data = self.data.select_dtypes(include=["number"])
        categorical_data = self.data.select_dtypes(include=["object", "category"])

        # Drop the tables from the previous refresh before adding new ones.
        self._clear_layout(self.numerical_statistics_layout)
        self._clear_layout(self.categorical_statistics_layout)

        self.numerical_statistics_layout.addWidget(
            self._build_numerical_table(numerical_data)
        )
        self.categorical_statistics_layout.addWidget(
            self._build_categorical_table(categorical_data)
        )

    def show_distribution(self):
        """Draw one histogram per numerical column inside the scroll area."""
        if self.data is None:
            return
        numerical_data = self.data.select_dtypes(include=["number"])

        # Container widget with a horizontal layout: histograms sit side by
        # side and the scroll area scrolls across them.
        distribution_widget = QtWidgets.QWidget()
        distribution_layout = QtWidgets.QHBoxLayout()
        for col in numerical_data.columns:
            # Each histogram gets its own canvas
            canvas = MplCanvas(self, width=5, height=4, dpi=100)
            numerical_data[col].plot(kind="hist", ax=canvas.axes)
            canvas.axes.set_title(col)
            # Rotate x-axis labels
            for label in canvas.axes.get_xticklabels():
                label.set_rotation(45)
            distribution_layout.addWidget(canvas)
            # Adjust layout to make sure everything fits
            canvas.figure.tight_layout()

        # Installing the new widget replaces the one from the previous refresh.
        distribution_widget.setLayout(distribution_layout)
        self.distribution_scroll_area.setWidget(distribution_widget)

    def show_correlation(self):
        """Draw the correlation heatmap of the numerical columns."""
        if self.data is None:
            return
        canvas = self.correlation_canvas
        # sns.heatmap adds a separate colorbar axes on every call. Clearing
        # only the main axes would leave the old colorbars in the figure, so
        # reset the whole figure and start from a fresh subplot.
        canvas.figure.clear()
        canvas.axes = canvas.figure.add_subplot(111)

        corr = self.data.select_dtypes(include=["number"]).corr()
        if corr.empty:
            # heatmap cannot draw an empty matrix (no numerical columns left).
            self._show_placeholder(canvas.axes, "No numerical columns")
        else:
            sns.heatmap(corr, ax=canvas.axes)
            canvas.figure.tight_layout()
        canvas.draw()

    def show_missing_values(self):
        """Draw a bar chart of missing-value counts for columns that have any."""
        if self.data is None:
            return
        canvas = self.missing_values_canvas
        canvas.axes.clear()
        missing_values = self.data.isnull().sum()
        missing_values = missing_values[missing_values > 0]
        if missing_values.empty:
            # pandas cannot draw a bar plot from an empty Series.
            self._show_placeholder(canvas.axes, "No missing values")
        else:
            missing_values.plot(kind="bar", ax=canvas.axes)
        canvas.figure.tight_layout()
        canvas.draw()

    # ------------------------------------------------------------------
    # Button slots
    # ------------------------------------------------------------------
    def load_csv(self):
        """Ask for a CSV file, load it into ``self.data``, and refresh all views."""
        options = QFileDialog.Options()
        file_name, _ = QFileDialog.getOpenFileName(
            self,
            "Load CSV File",
            "",
            "CSV Files (*.csv);;All Files (*)",
            options=options,
        )
        if not file_name:
            return  # dialog cancelled
        try:
            data = pd.read_csv(file_name)
        except (OSError, ValueError) as error:
            # ValueError covers pandas' parser/empty-data errors and decode
            # errors; report the problem instead of letting the slot raise.
            QtWidgets.QMessageBox.critical(
                self, "Load CSV File", f"Could not load the file:\n{error}"
            )
            return
        self.data = data
        self._refresh_views()

    def drop_features(self):
        """Drop the columns selected in the list and refresh all views."""
        if self.data is None:
            return
        selected_features = [
            item.text() for item in self.features_list.selectedItems()
        ]
        if not selected_features:
            return  # nothing selected: skip the redraw
        self.data = self.data.drop(columns=selected_features)
        self._refresh_views()

    def save_csv(self):
        """Ask for a destination and write ``self.data`` to it without the index."""
        if self.data is None:
            return
        options = QFileDialog.Options()
        file_name, _ = QFileDialog.getSaveFileName(
            self,
            "Save CSV File",
            "",
            "CSV Files (*.csv);;All Files (*)",
            options=options,
        )
        if not file_name:
            return  # dialog cancelled
        try:
            self.data.to_csv(file_name, index=False)
        except OSError as error:
            # e.g. permission denied or the target is open in another program
            QtWidgets.QMessageBox.critical(
                self, "Save CSV File", f"Could not save the file:\n{error}"
            )
def main():
    """Create the Qt application, show the main window, and run the event loop.

    Exits the process with the event loop's return code.
    """
    app = QtWidgets.QApplication(sys.argv)
    window = FeatureDropperApp()
    window.show()
    sys.exit(app.exec_())


# Guarded so that importing this module does not launch the GUI.
if __name__ == "__main__":
    main()
実行
1.実行すると以下のように表示されます
python app.py
2.Load CSV FileからCSVファイルを読み込みましょう
CSVファイルはKaggleのAsteroid Datasetを用います。
ファイルサイズは435MBもあります。Visual Studio Codeの拡張機能「Edit csv」では、50MBを超えるファイルは編集できません。
3.First 100 Rows:最初の100行を表示します
4.Statistics:数値データとカテゴリデータの統計データを表示します
| Numerical | Categorical |
|---|---|
| ![]() | ![]() |
5.Distribution:数値データのヒストグラムを表示します
6.Correlation:数値データ同士の相関をヒートマップで表示します
7.Missing Values:欠損値の個数を表示します
8.欠損値の多いnameとprefixを選択してドロップします
Drop Selected Featuresをクリックするとグラフも更新されます
9.Save As CSVで保存します
EXE化して配布
アプリのアイコン(favicon.ico)を用意します。
アプリ名はFeatureDropperにしました。
pip install pyinstaller
pyinstaller --onefile --noconsole --icon=favicon.ico --name=FeatureDropper app.py
実行するとdistフォルダの中にFeatureDropper.exeが生成されます。
配布して実行してもらうと不明な発行元の警告が出ます。
この警告を解消するには、コード署名証明書を取得して実行ファイルに署名する必要があるようです。
お疲れ様でした。
Powered by ChatGPT