LoginSignup
3
7

More than 3 years have passed since the last update.

日付の特徴量生成のメモ

Posted at

featuretoolsを使わずに、日付型のデータから特徴量や目的変数を生成するためのコードのメモ

datefeaturetool.py
import datetime
import numpy as np
import pandas as pd

class DeltaDate():
    """
    Generate elapsed-time features from date columns relative to a cutoff date.

    Dates may be given either as datetime64[ns] values (with a
    pandas.Timestamp / datetime.date cutoff) or as YYYYMMDD numbers
    (with an int / float cutoff such as 20200202).

    Month and year differences count *completed* periods: a month is
    complete once the day-of-month of the later date reaches the
    day-of-month of the earlier one; a year is complete once the
    month/day anniversary has been reached.
    """

    # Accepted spellings for each unit of the `freq` argument.
    _DAY_ALIASES = ('d', 'D', 'day', 'Day')
    _MONTH_ALIASES = ('m', 'M', 'month', 'Month')
    _YEAR_ALIASES = ('y', 'Y', 'year', 'Year')

    def __init__(self, cutoff_date):
        """
        cutoff_date: datetime.date(2020, 2, 2) or pandas.Timestamp('2020-02-02')
                     int, numpy.int64, float, numpy.float64 (YYYYMMDD, e.g. 20200202)
        """
        # pandas.Timestamp is itself a datetime.date subclass, so it is excluded
        # explicitly; every other date/datetime is normalised to a Timestamp.
        if isinstance(cutoff_date, datetime.date) and not isinstance(cutoff_date, pd.Timestamp):
            self.cutoff_date = pd.to_datetime(cutoff_date)
            print('cutoff_date converted from datetime.date type to pandas.Timestamp type.')
        else:
            self.cutoff_date = cutoff_date

    @classmethod
    def _unit_of(cls, freq):
        """Map a `freq` alias to 'day', 'month' or 'year'; raise ValueError otherwise."""
        if freq in cls._DAY_ALIASES:
            return 'day'
        if freq in cls._MONTH_ALIASES:
            return 'month'
        if freq in cls._YEAR_ALIASES:
            return 'year'
        raise ValueError("freq must be 'day', 'month' or 'year'")

    @staticmethod
    def _label(name):
        """Return a Series/column name as text; an unnamed Series contributes ''."""
        return '' if name is None else str(name)

    @staticmethod
    def _split_yyyymmdd(values):
        """Split YYYYMMDD number(s) into (year, month, day) by integer arithmetic."""
        return values // 10000, values // 100 % 100, values % 100

    @staticmethod
    def _parse_yyyymmdd(dates):
        """Convert a Series of YYYYMMDD numbers to datetime64[ns]; missing values become NaT."""
        # Going through int() drops the '.0' suffix a float64 column would
        # otherwise carry into the string, which breaks '%Y%m%d' parsing.
        as_text = dates.map(lambda v: None if pd.isna(v) else str(int(v))).astype(object)
        return pd.to_datetime(as_text, format='%Y%m%d')

    @staticmethod
    def _whole_periods(start, end, unit):
        """
        Count completed months or years from `start` to `end`.

        start, end: (year, month, day) tuples; at least one side must hold
                    pandas.Series components so the result is a Series.
        unit: 'month' or 'year'
        """
        start_y, start_m, start_d = start
        end_y, end_m, end_d = end
        month_gap = end_m - start_m
        if unit == 'month':
            periods = (end_y - start_y) * 12 + month_gap
            # The last month is incomplete only if the day-of-month has not been reached.
            unfinished = end_d < start_d
        else:
            periods = end_y - start_y
            # The last year is incomplete until the month/day anniversary is reached.
            unfinished = (month_gap < 0) | ((month_gap == 0) & (end_d < start_d))
        return periods.mask(unfinished, periods - 1)

    def delta_date_1d(self, dates, freq='d', past_or_future='past'):
        """
        Elapsed days / months / years between one date column and the cutoff.

        dates: pandas.Series
            dtype: datetime64[ns] (Timestamp cutoff)
                   int64, float64 in YYYYMMDD form (numeric cutoff)
        freq: 'day', 'month' or 'year' (aliases 'd'/'D'/'Day', 'm'/..., 'y'/...)
        past_or_future: 'past' measures from each date up to the cutoff;
                        'f' or 'future' measures from the cutoff up to each date.
                        Dates on the wrong side of the cutoff yield negative values.

        return pandas.Series named 'elapsed_<original name>'

        raises ValueError for an unknown freq, TypeError for an unsupported
        cutoff_date type.
        """
        unit = self._unit_of(freq)
        toward_future = past_or_future in ('f', 'future')
        cutoff = self.cutoff_date

        if isinstance(cutoff, pd.Timestamp):
            if unit == 'day':
                gap = dates - cutoff if toward_future else cutoff - dates
                delta = gap.dt.days
            else:
                date_parts = (dates.dt.year, dates.dt.month, dates.dt.day)
                cutoff_parts = (cutoff.year, cutoff.month, cutoff.day)
                # Swap the roles instead of negating: negating a floored count
                # would over-count partially elapsed periods in the future.
                if toward_future:
                    delta = self._whole_periods(cutoff_parts, date_parts, unit)
                else:
                    delta = self._whole_periods(date_parts, cutoff_parts, unit)
        elif isinstance(cutoff, (int, np.integer, float, np.floating)):
            # int() so that a float cutoff still yields integer year/month/day.
            cutoff_parts = self._split_yyyymmdd(int(cutoff))
            if unit == 'day':
                year, month, day = cutoff_parts
                cutoff_ts = pd.Timestamp(year=year, month=month, day=day)
                parsed = self._parse_yyyymmdd(dates)
                gap = parsed - cutoff_ts if toward_future else cutoff_ts - parsed
                delta = gap.dt.days
            else:
                date_parts = self._split_yyyymmdd(dates)
                if toward_future:
                    delta = self._whole_periods(cutoff_parts, date_parts, unit)
                else:
                    delta = self._whole_periods(date_parts, cutoff_parts, unit)
        else:
            raise TypeError(
                'cutoff_date must be datetime.date, pandas.Timestamp, int or float.')

        if toward_future:
            print('delta for the future.')

        delta.name = 'elapsed_' + self._label(dates.name)

        return delta

    def delta_date(self, dates, freq='d', past_or_future='past'):
        """
        Elapsed days / months / years for one or several date columns.

        dates: pandas.Series or pandas.DataFrame
            dtype: datetime64[ns]
                   int64, float64 (YYYYMMDD)
        freq: 'day', 'month' or 'year'
        past_or_future: 'past' or 'future'

        return pandas.Series for a Series input, pandas.DataFrame (one
               'elapsed_<column>' column per input column) for a DataFrame input

        raises TypeError if dates is neither a Series nor a DataFrame.
        """
        if isinstance(dates, pd.Series):
            return self.delta_date_1d(dates, freq, past_or_future)
        if isinstance(dates, pd.DataFrame):
            per_column = [self.delta_date_1d(dates[col], freq, past_or_future)
                          for col in dates]
            if not per_column:
                # pd.concat rejects an empty list; keep the index for a column-less frame.
                return pd.DataFrame(index=dates.index)
            return pd.concat(per_column, axis=1)
        raise TypeError('dates must be pandas.Series or pandas.DataFrame.')

    def _within_flags(self, delta, limit, dates):
        """Turn elapsed values into 0 (over limit) / 1 (within limit) / NaN (negative) flags."""
        flags = delta.mask(delta > limit, 0)
        flags = flags.mask(delta <= limit, 1)
        flags = flags.mask(delta < 0)
        if isinstance(flags, pd.DataFrame):
            flags.columns = ['within' + str(limit) + self._label(c) for c in dates.columns]
        else:
            flags.name = 'within' + str(limit) + self._label(dates.name)
        return flags

    def within_date(self, dates, within, freq='d', past_or_future='past'):
        """
        Flag whether each date lies within `within` days / months / years of the cutoff.

        dates: pandas.Series or pandas.DataFrame
            dtype: datetime64[ns]
                   int64, float64 (YYYYMMDD)
        within: int, or list/tuple of int (within n days, n months or n years)
        freq: 'day', 'month' or 'year'
        past_or_future: 'past' or 'future'

        return pandas.Series or pandas.DataFrame (0: over, 1: within, np.nan: minus),
               named 'within<n><original name>'; a list of limits always
               produces a DataFrame with one block of columns per limit.
        """
        # The elapsed values do not depend on the limit, so compute them once.
        delta = self.delta_date(dates, freq, past_or_future)
        if isinstance(within, (list, tuple)):
            flag_sets = [self._within_flags(delta, n, dates) for n in within]
            return pd.concat(flag_sets, axis=1)
        return self._within_flags(delta, within, dates)

if __name__ == '__main__':
    # Small demo: two date columns, parsed from text into datetime64[ns].
    df = pd.DataFrame({'date1': ['2017-8-1', '2020-2-2'],
                       'date2': ['2018-12-15', '2019-3-31']})
    df = df.apply(lambda column: pd.to_datetime(column, format='%Y-%m-%d'))

    # Cutoff given as datetime.date; DeltaDate converts it to a Timestamp.
    deltadate = DeltaDate(datetime.date(2020, 2, 28))

    # Elapsed days per column, then 12- and 24-month "within" flags.
    result = deltadate.delta_date(df, freq='d')
    within = deltadate.within_date(df, [12, 24], freq='m')
3
7
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do by signing up
3
7