pythonの基礎を備忘録として残しておく
summary
pd.read_csv('./data/test.csv', sep=',') # CSVデータ読み込み
df[['job','age']] # 'job','age'列
df['job'] # 'job'列のシリーズデータ
df[df['age']>=40] # 'age'が40以上
df[(df['age'] <=30) & (df['marital'] == 'married')] # 'age'が30以下かつ'marital'が'married'
df['over_35'] = 'yes' # 'over_35'列を作成し値は全て'yes'
df.loc[df['age'] <=35, 'over_35'] ='no' # 'age'が35以下の'over_35'は全て'no'
df.drop(columns = 'age') # 'age'列を削除
df.loc[len(df), :] = df.loc[len(df) -1 , :] # 最終行を最後に追加
df[df == "yes"] = np.nan # 'yes'のセルを欠損値に
df.isnull().any() # 各列の欠損値の有無
df[df.isnull().any(axis = 1)] # 欠損値のある行表示
df.dropna() # 欠損値のある行の削除
df.dropna(axis = 1) # 欠損値のある列を削除
df.fillna("unknown") # 欠損値を'unknown'に置き換え
df.drop_duplicates("job") # 'job'列の重複行を削除
(df == "unknown").sum() # 列毎の'unknown'の個数
(df == "unknown").sum().sum() # 'unknown'の総個数
((df["age"] >= 30) & (df["age"] < 35)).sum() # 'age'が30以上35未満の個数
df.rename(columns={"job":"work"}) # 'job'を'work'にリネーム
df.index = ["No." + str(i) for i in range(len(df))] # indexを'No.'に
df = df.reset_index(drop=True) # indexをリセット
df[["job", "age"]].groupby("job").max() # 'job'毎の'age'の最大
df[["job", "age", "marital"]].groupby(["job","marital"]).mean() # 'job','marital'毎の'age'の平均
* groupbyの項目が複数の場合はリストで
df.pivot_table(values='age', index=['marital', 'job'], aggfunc=np.mean) # 'marital','job'をインデックスに'age'をピボットテーブル
df['age'] = df['age'].astype(int) # 'age'列をint型に
df.sort_values("age", ascending=False) # 'age'列を降順に
new_df = pd.DataFrame(data=[[1,2],[3,4],[5,6]], columns=['A','B']) # データフレーム作成
df.merge(new_df, on='marital') # dfにnew_dfを'marital'でマージ
df.append(new_df, ignore_index=True) # new_dfを追加し、indexリセット
pd.get_dummies(df['marital']) # 'marital'をワンホットエンコーディング
pandsのインポート
import pandas as pd
csvファイルの内容
age,job,marital,housing,loan
30,blue-collar,married,yes,no
39,services,single,no,no
25,services,married,yes,no
38,services,married,unknown,unknown
47,admin.,married,yes,no
32,services,single,no,no
32,admin.,single,yes,no
41,entrepreneur,married,yes,no
31,services,divorced,no,no
35,blue-collar,married,no,no
csvファイルの読み込み (セパレータは sep=''で指定。デフォルトは',')
df = pd.read_csv('./data/test.csv', sep=',') # sep=','は省略化
print(df)
# 出力
age job marital housing loan
0 30 blue-collar married yes no
1 39 services single no no
2 25 services married yes no
3 38 services married unknown unknown
4 47 admin. married yes no
5 32 services single no no
6 32 admin. single yes no
7 41 entrepreneur married yes no
8 31 services divorced no no
9 35 blue-collar married no no
'job','age'の列をデータフレーム形式で表示
tmp = df[['job','age']]
print(tmp.head())
# 出力
job age
0 blue-collar 30
1 services 39
2 services 25
3 services 38
4 admin. 47
'job'の列をシリーズ形式で表示
tmp = df['job']
print(tmp.head())
# 出力
0 blue-collar
1 services
2 services
3 services
4 admin.
Name: job, dtype: object
'age'の列が40以上の行を表示
tmp = df[df['age']>=40]
print(tmp)
# 出力
age job marital housing loan
4 47 admin. married yes no
7 41 entrepreneur married yes no
'age'列が30以下かつ、'marital'列が'married'の行を表示する
tmp = df[(df['age'] <=30) & (df['marital'] == 'married')]
print(tmp)
# 出力
age job marital housing loan
0 30 blue-collar married yes no
2 25 services married yes no
'over_35'列を追加し、値は全て'yes'にする
df['over_35'] = 'yes'
print(df.head())
# 出力
age job marital housing loan over_35
0 30 blue-collar married yes no yes
1 39 services single no no yes
2 25 services married yes no yes
3 38 services married unknown unknown yes
4 47 admin. married yes no yes
over_35が、'age'が35以下の場合は値を'no'、それ以外は'yes'にする
df['over_35'] = 'yes'
df.loc[df['age'] <=35, 'over_35'] ='no'
print(df.head())
# 出力
age job marital housing loan over_35
0 30 blue-collar married yes no no
1 39 services single no no yes
2 25 services married yes no no
3 38 services married unknown unknown yes
4 47 admin. married yes no yes
'age'列を削除して表示する
tmp = df.drop(columns = 'age')
print(tmp.head())
# 出力
job marital housing loan over_35
0 blue-collar married yes no no
1 services single no no yes
2 services married yes no no
3 services married unknown unknown yes
4 admin. married yes no yes
最終行と同じ行を最終行の後に追加する
df.loc[len(df), :] = df.loc[len(df) -1 , :]
print(df.tail())
# 出力
age job marital housing loan over_35
6 32.0 admin. single yes no no
7 41.0 entrepreneur married yes no yes
8 31.0 services divorced no no no
9 35.0 blue-collar married no no no
10 35.0 blue-collar married no no no
セルの値が'yes'の場合は全て'NaN'(欠損値)に置き換える
import numpy as np
df[df == "yes"] = np.nan
print(df.head())
# 出力
age job marital housing loan over_35
0 30.0 blue-collar married NaN no no
1 39.0 services single no no NaN
2 25.0 services married NaN no no
3 38.0 services married unknown unknown NaN
4 47.0 admin. married NaN no NaN
各列に欠損値があるかどうかを表示する
tmp = df.isnull().any()
print(tmp)
# 出力
age False
job False
marital False
housing True
loan False
over_35 True
dtype: bool
欠損値がある行を全て表示する
tmp = df[df.isnull().any(axis = 1)]
print(tmp)
# 出力
age job marital housing loan over_35
0 30.0 blue-collar married NaN no no
1 39.0 services single no no NaN
2 25.0 services married NaN no no
3 38.0 services married unknown unknown NaN
4 47.0 admin. married NaN no NaN
6 32.0 admin. single NaN no no
7 41.0 entrepreneur married NaN no NaN
欠損値がある行を削除して表示
tmp = df.dropna()
print(tmp)
# 出力
age job marital housing loan over_35
5 32.0 services single no no no
8 31.0 services divorced no no no
9 35.0 blue-collar married no no no
10 35.0 blue-collar married no no no
欠損値がある列を削除して表示
tmp = df.dropna(axis = 1)
print(tmp.head())
# 出力
age job marital loan
0 30.0 blue-collar married no
1 39.0 services single no
2 25.0 services married no
3 38.0 services married unknown
4 47.0 admin. married no
欠損値('NaN')を'unknown'に置き換える
df = df.fillna("unknown")
print(df.head())
# 出力
age job marital housing loan over_35
0 30.0 blue-collar married unknown no no
1 39.0 services single no no unknown
2 25.0 services married unknown no no
3 38.0 services married unknown unknown unknown
4 47.0 admin. married unknown no unknown
'job'列の重複行を削除して表示する
tmp = df.drop_duplicates("job")
print(tmp)
# 出力
age job marital housing loan over_35
0 30.0 blue-collar married unknown no no
1 39.0 services single no no unknown
4 47.0 admin. married unknown no unknown
7 41.0 entrepreneur married unknown no unknown
'age'列の最大、最小、平均値
print(df['age'].max())
print(df['age'].min())
print(df['age'].mean())
# 出力
47.0
25.0
35.0
列毎に'unknown'の個数を表示する
tmp = (df == "unknown").sum()
print(tmp)
# 出力
age 0
job 0
marital 0
housing 6
loan 1
over_35 4
dtype: int64
'unknown'の総個数を表示する
tmp = (df == "unknown").sum().sum()
print(tmp)
# 出力
11
'age' が30以上かつ、35未満を表示する
tmp = ((df["age"] >= 30) & (df["age"] < 35)).sum()
print(tmp)
# 出力
4
列名'job'を'work'に変更する
tmp = df.rename(columns={"job":"work"})
print(tmp)
# 出力
age work marital housing loan over_35
0 30.0 blue-collar married unknown no no
1 39.0 services single no no unknown
2 25.0 services married unknown no no
3 38.0 services married unknown unknown unknown
4 47.0 admin. married unknown no unknown
indexを「No.{数字}」に変更
df.index = ["No." + str(i) for i in range(len(df))]
print(df.head())
# 出力
age job marital housing loan over_35
No.0 30.0 blue-collar married unknown no no
No.1 39.0 services single no no unknown
No.2 25.0 services married unknown no no
No.3 38.0 services married unknown unknown unknown
No.4 47.0 admin. married unknown no unknown
index をリセット
df = df.reset_index(drop=True)
print(df.head())
# 出力
age job marital housing loan over_35
0 30.0 blue-collar married unknown no no
1 39.0 services single no no unknown
2 25.0 services married unknown no no
3 38.0 services married unknown unknown unknown
4 47.0 admin. married unknown no unknown
'job'毎の最大値を表示
tmp = df[["job", "age"]].groupby("job").max()
print(tmp)
# 出力
job
admin. 47.0
blue-collar 35.0
entrepreneur 41.0
services 39.0
'job'毎、'marital'毎の'ageの平均
tmp = df[["job", "age", "marital"]].groupby(["job","marital"]).mean()
print(tmp)
# 出力
age
job marital
admin. married 47.000000
single 32.000000
blue-collar married 33.333333
entrepreneur married 41.000000
services divorced 31.000000
married 31.500000
single 35.500000
'marital'、'job'をインデックス、'age'を値としてピボットテーブルを表示する
tmp = df.pivot_table(values='age', index=['marital', 'job'], aggfunc=np.mean)
print(tmp)
# 出力
divorced services 31.000000
married admin. 47.000000
blue-collar 33.333333
entrepreneur 41.000000
services 31.500000
single admin. 32.000000
services 35.500000
'age'列をint型に変更する
df['age'] = df['age'].astype(int)
print(df.head())
# 出力
age job marital housing loan over_35
0 30 blue-collar married unknown no no
1 39 services single no no unknown
2 25 services married unknown no no
3 38 services married unknown unknown unknown
4 47 admin. married unknown no unknown
'age'列を降順に
tmp = df.sort_values("age", ascending=False)
print(tmp.head())
# 出力
age job marital housing loan over_35
4 47 admin. married unknown no unknown
7 41 entrepreneur married unknown no unknown
1 39 services single no no unknown
3 38 services married unknown unknown unknown
9 35 blue-collar married no no no
以下のデータフレームを作る
A, B
1, 2
3, 4
5, 6
new_df = pd.DataFrame(data=[[1,2],[3,4],[5,6]], columns=['A','B'])
print(new_df)
# 出力
A B
0 1 2
1 3 4
2 5 6
以下のデータフレームを作成し、'marital'列でmergeする
marital, m_index
single, 1
married, 2
divorce, 3
new_df = pd.DataFrame(data=[['single',1],
['married',2],
['divorced',3]],
columns=['marital','m_index'])
tmp =df.merge(new_df, on='marital')
print(tmp)
# 出力
age job marital housing loan over_35 m_index
0 30 blue-collar married unknown no no 2
1 25 services married unknown no no 2
2 38 services married unknown unknown unknown 2
3 47 admin. married unknown no unknown 2
4 41 entrepreneur married unknown no unknown 2
5 35 blue-collar married no no no 2
6 35 blue-collar married no no no 2
7 39 services single no no unknown 1
8 32 services single no no no 1
9 32 admin. single unknown no no 1
10 31 services divorced no no no 3
1-3 行をコピー、最終行に追加し、インデックスは初期化する
new_df = df.iloc[:3, :]
tmp = df.append(new_df, ignore_index=True)
print(tmp)
# 出力
age job marital housing loan over_35
0 30 blue-collar married unknown no no
1 39 services single no no unknown
2 25 services married unknown no no
3 38 services married unknown unknown unknown
4 47 admin. married unknown no unknown
5 32 services single no no no
6 32 admin. single unknown no no
7 41 entrepreneur married unknown no unknown
8 31 services divorced no no no
9 35 blue-collar married no no no
10 35 blue-collar married no no no
11 30 blue-collar married unknown no no
12 39 services single no no unknown
13 25 services married unknown no no
'marital'をone-hotエンコーディングしたデータフレームを表示する
tmp = pd.get_dummies(df['marital'])
print(tmp.head())
# 出力
divorced married single
0 0 1 0
1 0 0 1
2 0 1 0
3 0 1 0
4 0 1 0