More than 5 years have passed since last update.

【随時更新】Pythonライブラリの小技集

Python

Last updated at 2019-10-09 / Posted at 2019-05-18

前略

一つの記事にするまでもない小技を、五月雨式に記載していきます。

データフレームのアンパック

データフレームの各列を配列としてアンパックする。

import pandas as pd
import io

# Sample CSV: one timestamp column followed by two value columns.
data = """date,val_1,val_2
2018-11-01 09:00:00,65,d g
2018-11-01 09:01:00,26,e h
2018-11-01 09:02:00,47,h w
2018-11-01 09:03:00,20,k d
2018-11-01 09:04:00,65,8 d
2018-11-01 09:05:00,4,l d
2018-11-01 09:06:00,31,w d
2018-11-01 09:07:00,21,s s
2018-11-01 09:08:00,98,a b
2018-11-01 09:09:00,48,f f
2018-11-01 09:10:00,18,g 4
2018-11-01 09:11:00,86,s f"""

# Load the in-memory CSV, parsing the first column as datetimes.
csv_buffer = io.StringIO(data)
df = pd.read_csv(csv_buffer, parse_dates=[0])

# Transpose the cell matrix so that each row of the result is one column of
# the frame, then unpack those rows into plain Python lists.
column1, column2, column3 = df.to_numpy().T.tolist()

軸ラベルの回転

# State-based (pyplot) interface: rotate the x tick labels of the current axes.
plt.xticks(rotation=90)

# Object-oriented interface: rotate through the tick parameters so the axes'
# locator/formatter keep generating the labels. Re-assigning the labels with
# set_xticklabels(get_xticklabels()) would freeze their current text instead.
ax.tick_params(axis="x", labelrotation=90)

メモリ圧縮

以下のコードは、FabienDanielさんがApache 2.0ライセンスで公開しているコードを一部改変したものです。
https://www.kaggle.com/kernels/scriptcontent/8421466/download

floatの扱いが雑（値の範囲だけを見て精度の低い型へ変換する）なので、場合によっては値が変わってしまうことに注意してください。
また、もともとは引数のデータフレームを直接書き換える実装でしたが、deepcopyでコピーしてから変換するようにしたため、元のデータフレームは変更されません。

import time
import numpy as np
from tqdm import tqdm
from copy import deepcopy

def _smallest_fitting_dtype(c_min, c_max, candidates, limits):
    """Return the first dtype in *candidates* whose range holds [c_min, c_max].

    *limits* is ``np.iinfo`` or ``np.finfo``. Returns ``None`` when no
    candidate fits (including when the bounds are NaN).
    """
    for candidate in candidates:
        bounds = limits(candidate)
        if bounds.min <= c_min and c_max <= bounds.max:
            return candidate
    return None


def reduce_mem_usage(df, use_feather=False):
    """Return a copy of *df* whose numeric columns use the narrowest dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is deep-copied first, so the caller's frame is
        left untouched.
    use_feather : bool, optional
        When True, ``np.float16`` is never chosen (use this when the result
        will be written to feather); floats become float32 or float64.

    Returns
    -------
    pandas.DataFrame
        The converted copy.

    Notes
    -----
    * Signed integers stay signed and unsigned integers stay unsigned.
      Columns that are not plain NumPy int/uint/float (bool, datetime,
      timedelta, object/string, categorical, nullable extension dtypes, ...)
      are left unchanged.
    * Float narrowing only checks the value *range*, not the precision, so
      values may be rounded when a column is cast to float16/float32.
    * Start/end memory usage and the reduction are printed to stdout.
    """
    # The progress bar is purely cosmetic; fall back to plain iteration when
    # tqdm is not installed.
    try:
        from tqdm import tqdm as progress
    except ImportError:
        progress = iter

    df = deepcopy(df)
    start_mem = df.memory_usage().sum() / 1024**2
    # Flush so this line is emitted before the progress bar starts drawing.
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem), flush=True)

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    if use_feather:
        float_types = (np.float32, np.float64)
    else:
        float_types = (np.float16, np.float32, np.float64)

    for col in progress(df.columns):
        col_type = df[col].dtype

        # Only plain NumPy signed/unsigned/float columns are candidates.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'i':
            target = _smallest_fitting_dtype(c_min, c_max, signed_types, np.iinfo)
        elif col_type.kind == 'u':
            target = _smallest_fitting_dtype(c_min, c_max, unsigned_types, np.iinfo)
        else:
            target = _smallest_fitting_dtype(c_min, c_max, float_types, np.finfo)
            if target is None:
                # Out-of-range or NaN bounds (e.g. an all-NaN column).
                target = np.float64

        if target is not None:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard against a zero-byte frame to avoid dividing by zero.
    decreased = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decreased))

    return df

外れ値を判断し削除する

四分位範囲（IQR）を基準に外れ値を削除する場合。

def delete_outlier(series_in, bias=1.5):
    """Return a copy of *series_in* with IQR-based outliers replaced by NaN.

    A value is an outlier when it lies below ``Q1 - bias * IQR`` or above
    ``Q3 + bias * IQR``, where Q1/Q3 are the 25 % / 75 % quantiles and
    ``IQR = Q3 - Q1``. The two thresholds are printed to stdout.

    Parameters
    ----------
    series_in : pandas.Series
        Numeric series to filter; it is not modified.
    bias : float, optional
        Multiplier applied to the IQR (1.5 by default).

    Returns
    -------
    pandas.Series
        New series with the same index; outliers are NaN.
    """
    # Quartiles and inter-quartile range.
    q1 = series_in.quantile(.25)
    q3 = series_in.quantile(.75)
    iqr = q3 - q1

    # Thresholds beyond which a value counts as an outlier.
    outlier_min = q1 - iqr * bias
    outlier_max = q3 + iqr * bias

    print("outlier_min :" + str(outlier_min) + ", outlier_max :" + str(outlier_max))

    # Series.mask returns a new object and upcasts integer data to float as
    # needed, avoiding the incompatible-dtype item assignment of None.
    is_outlier = (series_in < outlier_min) | (series_in > outlier_max)
    return series_in.mask(is_outlier)

削除する値を指定する場合。

# Remove outliers using caller-supplied thresholds.
def custom_delete_outlier(series_in, outlier_min, outlier_max, left_eq=False, right_eq=False):
    """Return a copy of *series_in* with values outside the thresholds set to NaN.

    The thresholds are printed to stdout.

    Parameters
    ----------
    series_in : pandas.Series
        Series to filter; it is not modified.
    outlier_min, outlier_max
        Lower and upper thresholds.
    left_eq : bool, optional
        When True, values equal to ``outlier_min`` are removed as well.
    right_eq : bool, optional
        When True, values equal to ``outlier_max`` are removed as well.

    Returns
    -------
    pandas.Series
        New series with the same index; removed values are NaN.
    """
    print("outlier_min :" + str(outlier_min) + ", outlier_max :" + str(outlier_max))

    if left_eq:
        too_low = series_in <= outlier_min
    else:
        too_low = series_in < outlier_min

    if right_eq:
        too_high = series_in >= outlier_max
    else:
        too_high = series_in > outlier_max

    # Series.mask returns a new object and upcasts integer data to float as
    # needed, avoiding the incompatible-dtype item assignment of None.
    return series_in.mask(too_low | too_high)

表固定

jupyterに表を埋め込む。
https://stackoverflow.com/questions/28778668/freeze-header-in-pandas-dataframe

from ipywidgets import interact, IntSlider
from IPython.display import display

def freeze_header(df, num_rows=30, num_columns=10, step_rows=1,
                  step_columns=1):
    """
    Show a sliding window of a DataFrame, keeping its headers visible.

    Two sliders select the last row and the last column of the window; the
    window itself spans at most ``num_rows`` rows and ``num_columns`` columns
    and is re-displayed whenever a slider moves.

    Parameters
    ----------
    df : Pandas DataFrame
        DataFrame to display
    num_rows : int, optional
        Number of rows to display
    num_columns : int, optional
        Number of columns to display
    step_rows : int, optional
        Step of the row slider
    step_columns : int, optional
        Step of the column slider

    Returns
    -------
    None. The DataFrame window and the sliders are displayed as a side effect.
    """
    total_rows, total_columns = df.shape

    def _make_slider(window, total, step, label):
        # The slider value is the exclusive end of the window, so it starts
        # at the window size (capped by the frame size) and runs to the end.
        return IntSlider(min=min(window, total),
                         max=total,
                         step=step,
                         description=label,
                         readout=False,
                         disabled=False,
                         continuous_update=True,
                         orientation='horizontal',
                         slider_color='purple')

    row_slider = _make_slider(num_rows, total_rows, step_rows, 'rows')
    column_slider = _make_slider(num_columns, total_columns, step_columns,
                                 'columns')

    def _freeze_header(last_row, last_column):
        first_row = max(0, last_row - num_rows)
        first_column = max(0, last_column - num_columns)
        display(df.iloc[first_row:last_row, first_column:last_column])

    # Equivalent to decorating _freeze_header with @interact(...).
    interact(_freeze_header, last_row=row_slider, last_column=column_slider)
        
# 使いかた
# freeze_header(df=df, num_rows=10)

datetime64の変換

from datetime import datetime

# The instant is 02:00:05.453 at UTC-04:00. NumPy stores datetime64 values in
# UTC and deprecates timezone offsets in the literal, so it is written
# directly as the equivalent UTC time.
pred_time = np.datetime64('2012-06-18T06:00:05.453000000')

# datetime cannot hold nanoseconds (astype(datetime) on a [ns] value yields a
# raw integer), so truncate to microseconds first; the result is then a naive
# UTC datetime and no epoch-seconds round trip is needed.
formatted = pred_time.astype('datetime64[us]').astype(datetime).strftime("%Y年%m月%d日 %H:%M")
formatted

シフト

groupbyしてそのグループの中で特定列をシフトする。

import pandas as pd
import io

# Sample CSV: val_2 is the group key ("a" for the first six rows, then "b").
data = """date,val_1,val_2
2018-11-01 09:00:00,1,a
2018-11-01 09:01:00,2,a
2018-11-01 09:02:00,3,a
2018-11-01 09:03:00,4,a
2018-11-01 09:04:00,5,a
2018-11-01 09:05:00,6,a
2018-11-01 09:06:00,7,b
2018-11-01 09:07:00,8,b
2018-11-01 09:08:00,9,b
2018-11-01 09:09:00,10,b
2018-11-01 09:10:00,11,b
2018-11-01 09:11:00,12,b"""

df = pd.read_csv(io.StringIO(data), parse_dates=[0])

# Shift val_1 by one row inside each val_2 group. The built-in grouped shift
# gives the same index-aligned result as transform(lambda x: x.shift())
# without calling a Python function per group.
shifted = df.groupby('val_2')['val_1'].shift()
shifted

# output
# 0      NaN
# 1      1.0
# 2      2.0
# 3      3.0
# 4      4.0
# 5      5.0
# 6      NaN
# 7      7.0
# 8      8.0
# 9      9.0
# 10    10.0
# 11    11.0
# Name: val_1, dtype: float64

軸の刻み幅変更

# Read the current drawing range of the y axis.
current_axes = plt.gca()
start, end = current_axes.get_ylim()
# Place a y tick every `step` units, starting at the lower limit
# (np.arange excludes the upper limit itself).
step = 10
tick_positions = np.arange(start, end, step)
current_axes.yaxis.set_ticks(tick_positions)

列の値に含まれるスペースで列を分割する

import pandas as pd
import io

# Sample CSV: val_2 holds two tokens separated by a single space.
data = """date,val_1,val_2
2018-11-01 09:00:00,65,d g
2018-11-01 09:01:00,26,e h
2018-11-01 09:02:00,47,h w
2018-11-01 09:03:00,20,k d
2018-11-01 09:04:00,65,8 d
2018-11-01 09:05:00,4,l d
2018-11-01 09:06:00,31,w d
2018-11-01 09:07:00,21,s s
2018-11-01 09:08:00,98,a b
2018-11-01 09:09:00,48,f f
2018-11-01 09:10:00,18,g 4
2018-11-01 09:11:00,86,s f"""

csv_buffer = io.StringIO(data)
df = pd.read_csv(csv_buffer, parse_dates=[0])

# Split val_2 on the space; expand=True returns one column per token.
spaced_values = df['val_2']
spaced_values.str.split(pat=' ', expand=True)

int型の列のバイトを変換する

import pandas as pd
import io

data = """date,val_1,val_2
2018-11-01 09:00:00,65,d g
2018-11-01 09:01:00,26,e h
2018-11-01 09:02:00,47,h w
2018-11-01 09:03:00,20,k d
2018-11-01 09:04:00,65,8 d
2018-11-01 09:05:00,4,l d
2018-11-01 09:06:00,31,w d
2018-11-01 09:07:00,21,s s
2018-11-01 09:08:00,98,a b
2018-11-01 09:09:00,48,f f
2018-11-01 09:10:00,18,g 4
2018-11-01 09:11:00,86,s f"""

df = pd.read_csv(io.StringIO(data), parse_dates=[0])


def type_judge(col):
    """Pick the narrowest signed integer dtype name for one column.

    Parameters
    ----------
    col : sequence
        ``(key, max_value, min_value)``: the column name followed by the
        column's maximum and minimum. A tuple, list or pandas Series works;
        the first three items are read by position.

    Returns
    -------
    tuple
        ``(key, dtype_name)`` where ``dtype_name`` is one of ``'int8'``,
        ``'int16'``, ``'int32'`` or ``'int64'``.
    """
    # Unpack by iteration rather than col[0]/col[1]/col[2]: integer keys on a
    # label-indexed Series are no longer treated as positions by newer pandas.
    key, max_value, min_value = tuple(col)[:3]
    if -128 <= min_value <= max_value <= 127:
        return key, 'int8'
    if -32768 <= min_value <= max_value <= 32767:
        return key, 'int16'
    if -2147483648 <= min_value <= max_value <= 2147483647:
        return key, 'int32'
    return key, 'int64'


# Only integer columns are considered; selecting every numeric column would
# also pick up floats, which the integer cast below would truncate.
int_columns = df.select_dtypes(include='integer')

# One row per integer column: its name, maximum and minimum.
df_type = int_columns.max().reset_index()
df_type.columns = ['key', 'max_value']
df_type['min_value'] = int_columns.min().values
df_type

# Map each column name to its target dtype name.
d = dict(type_judge(row) for row in df_type.itertuples(index=False, name=None))

# Convert the dtypes.
df = df.astype(d)

df.info()

デバッグプリント

変数名と変数の中身を同時にprintする関数です。
こちらに手法が解説されています。

from inspect import currentframe
def chkprint(*args):
    """Print every argument as ``name = repr(value)``, comma-separated.

    Names are recovered by matching each argument's ``id()`` against the
    local variables of the calling frame. An argument that is not bound to
    any local of the caller is labelled ``???``; when several locals refer
    to the same object, the one listed last in the caller's locals wins.
    """
    caller_locals = currentframe().f_back.f_locals

    # Reverse lookup: object identity -> variable name in the caller.
    name_by_id = {}
    for var_name, var_value in caller_locals.items():
        name_by_id[id(var_value)] = var_name

    labelled = []
    for arg in args:
        labelled.append(name_by_id.get(id(arg), '???') + ' = ' + repr(arg))
    print(', '.join(labelled))

Python 3.8以降では、f文字列の`=`指定子（例: `print(f"{x=}")`）として同等の機能が標準で利用できます。

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up