More than 5 years have passed since last update.

自然言語処理以外でのトピックモデルによる特徴抽出

Last updated at 2019-02-28 / Posted at 2019-02-28

はじめに

2019年2月27日に終了したKaggleのコンペ「Elo Merchant Category Recommendation」で非文章のカテゴリ変数からトピックモデルを用いて特徴量を生成したところ、スコアが大きく伸びたので紹介したいと思います。

利用したモデル

LDA(gensim)
FM_FTRL(WordBatch)

コンペの上位陣が公開してくれている解法でもトピックモデルによる特徴量生成は行われており、Word2Vecを利用している人が多かったようです。
とは言え、LDAとFM_FTRLで作成した特徴量でもPrivate LBでスコアが0.012改善していました(シングルモデル)。

利用方法

「Elo Merchant Category Recommendation」コンペでは、カード会員の購買履歴からターゲット変数としてロイヤリティを算出するのですがカード会員を特定するcard_idに対して複数レコードの情報が紐づいています。
このような場合、card_idをグループとして何らかのカラム(単数、複数の組み合わせ)でカウントや最小値、最大値、平均を取ったり、One Hot Encodingで特徴量を生成するのがまずは考えられます。
当然、私もそのような特徴量生成から始めたのですがOne Hot Encodingの問題点であるデータのカーディナリティが高いとカラム数が爆発して手に負えなくなるという状況に陥りました。
そこでカーディナリティの高いカテゴリ変数を中心にcard_idと対象のカテゴリ変数でグループ化したカウントをスペース区切りで次々と繋げて一行の文章のようにします。
後はそれをトピックモデルに与えて分類して各トピックのスコアを特徴量として利用します。
この際、どのような順番で繋げて文章を作成するかや、分類するトピックの数をいくつにするかで精度が変わってくるので、CVを頼りに調整していきます。
実際のコードはこのような感じです。
コンペでは、merchant_id以外にもcity_id、state_id、subsector_id、merchant_category_idで同様にトピックを生成しました。

def create_merchant_topic(df):
    """Derive merchant-based topic score columns for every card_id.

    The rows of ``df`` are counted per (card_id, merchant_id) pair, each pair
    is packed into a ``"card_id,merchant_id,count"`` string and handed to
    ``create_merchant_texts``, which accumulates one text per card in the
    module-level ``merchant_texts`` dict.  The collected texts are then scored
    by ``calc_topic_score``.

    Args:
        df: DataFrame with string ``card_id`` and ``merchant_id`` columns.

    Returns:
        The DataFrame returned by ``calc_topic_score`` with the intermediate
        ``merchant_id_topic`` text column removed.
    """
    global merchant_texts
    merchant_texts = {}  # reset the accumulator on every call

    pair_counts = (
        df.groupby(["card_id", "merchant_id"]).size().reset_index(name="size")
    )
    packed = (
        pair_counts["card_id"]
        + ","
        + pair_counts["merchant_id"]
        + ","
        + pair_counts["size"].astype(str)
    )
    # Called for its side effect on ``merchant_texts``; the result is unused.
    packed.progress_apply(create_merchant_texts)

    topic_frame = pd.DataFrame.from_dict(merchant_texts, orient="index").reset_index()
    topic_frame = topic_frame.rename(
        columns={"index": "card_id", 0: "merchant_id_topic"}
    )

    scored = calc_topic_score(topic_frame, "merchant_id")
    return scored.drop(columns="merchant_id_topic")

def create_merchant_texts(x):
    """Append one merchant's occurrences to its card's text.

    Args:
        x: A ``"card_id,merchant_id,count"`` string.

    Side effects:
        Extends ``merchant_texts[card_id]`` (module-level dict) with
        ``" " + merchant_id`` repeated ``count`` times, creating the entry as
        an empty string first when the card has not been seen yet.
    """
    fields = x.split(",")
    card_id = fields[0]
    merchant_texts.setdefault(card_id, "")
    merchant_texts[card_id] += (" " + fields[1]) * int(fields[2])

def calc_topic_score(df, prefix, num_topics=11):
    """Fit an LDA model on the ``<prefix>_topic`` texts and attach topic scores.

    Each value of ``df[prefix + "_topic"]`` is lower-cased, split on
    whitespace and treated as one document.  An LDA model with ``num_topics``
    topics is trained on the resulting bag-of-words corpus, and the per-document
    topic probabilities are written to ``df`` as
    ``<prefix>_topic_0 .. <prefix>_topic_<num_topics - 1>``.

    One column is created for every topic of the model so that any topic id
    the model reports has a slot to be stored in (previously only topics 0-5
    had a slot although the model was trained with 11 topics, which raised
    ``KeyError`` for topic ids 6-10).

    Args:
        df: DataFrame holding a ``<prefix>_topic`` column of space-separated
            tokens.  It is modified in place.
        prefix: Column-name prefix, e.g. ``"merchant_id"``.
        num_topics: Number of LDA topics and therefore of score columns.

    Returns:
        The same ``df`` object with the score columns added.  A value of -1
        means the model reported no probability for that topic on that row
        (presumably because it fell below gensim's minimum-probability
        threshold -- confirm against the gensim documentation).
    """
    documents = [document.lower().split() for document in df[prefix + "_topic"].values]
    dictionary = corpora.Dictionary(documents)
    bow_corpus = [dictionary.doc2bow(tokens) for tokens in documents]
    lda = models.LdaModel(bow_corpus, id2word=dictionary, num_topics=num_topics)

    # -1 is the placeholder for topics that are not reported for a document.
    scores = {topic: [-1] * len(df) for topic in range(num_topics)}
    for row_idx, doc_topics in enumerate(lda[bow_corpus]):
        for topic_num, prop_topic in doc_topics:
            scores[topic_num][row_idx] = prop_topic

    for topic in range(num_topics):
        df[prefix + "_topic_" + str(topic)] = scores[topic]

    return df

一方で、FM_FTRLでは複数のカテゴリ変数の組み合わせでカウントを繋ぎ合わせた文章として学習させて、target変数に対する予測値を特徴量として生成しました。

変数を繋げたもの
A3 B3 C3 D2 E2 F2 G3 H2 I-1 A3 B3 C3 D2 E2 F2

実際のコードはこのような感じです。

# Size of the hashed feature space shared by the extractor and the model.
D = 2 ** 20
batchsize = 201000

# ``global`` is a no-op at module level; kept to mirror the original notebook.
global WB, FM_CLS

# Hashing bag-of-words extractor: unigrams only, case preserved, binary values.
WB = wordbatch.WordBatch(
    None,
    extractor=(
        WordHash,
        {
            "ngram_range": (1, 1),
            "analyzer": "word",
            "lowercase": False,
            "n_features": D,
            "norm": None,
            "binary": True,
        },
    ),
    minibatch_size=batchsize // 100,
    procs=8,
    freeze=True,
    timeout=1800,
    verbose=0,
)

# Factorization-machine regressor (identity link) trained on the hashed features.
FM_CLS = FM_FTRL(
    alpha=0.0001,
    beta=0.001,
    iters=20,
    L1=0.0,
    L2=0.0,
    D=D,
    alpha_fm=0.02,
    L2_fm=0.0,
    init_fm=0.01,
    weight_fm=1.0,
    D_fm=8,
    e_noise=0.0,
    inv_link="identity",
    e_clip=1.0,
    use_avx=1,
    verbose=0,
)

def create_wordbatch_topic(df):
    """Build one space-separated "sentence" of count tokens per card_id.

    For every transaction row nine integer features ``count_1 .. count_9`` are
    computed (group sizes per card and month for several categorical keys, the
    number of authorized transactions, and the truncated mean ``duration`` per
    card) and rendered as the tokens ``A<count_1> B<count_2> ... I<count_9>``.
    The rows are then ordered by ``month_diff`` and the token strings of each
    card are joined with spaces.

    Args:
        df: Transaction DataFrame with the columns ``card_id``,
            ``category_1``, ``category_2``, ``category_3``, ``merchant_id``,
            ``purchase_date``, ``month_lag``, ``authorized_flag`` (``'Y'`` /
            ``'N'``), ``purchase_amount``, ``state_id``, ``city_id``,
            ``merchant_category_id``, ``subsector_id`` and ``installments``.
            NOTE: ``df`` is modified in place -- missing values are filled,
            ``authorized_flag`` and ``purchase_amount`` are overwritten and
            the helper columns are added.  Because ``authorized_flag`` is
            re-mapped from ``'Y'``/``'N'``, the function is expected to be
            called once on raw data.

    Returns:
        DataFrame with the columns ``card_id`` and ``wordbatch_topic`` (the
        joined sentence), one row per card, in order of first appearance after
        sorting by ``month_diff``.

    Side effects:
        Rebinds the module-level ``wordbatch_topic_texts`` dict to the
        ``card_id -> sentence`` mapping.  Reads the module-level
        ``RECENT_DATETIME``.
    """
    global wordbatch_topic_texts

    # Fill missing keys first: rows whose group key is NaN would otherwise get
    # no group and the integer casts below would fail.
    df["category_2"] = df["category_2"].fillna(-1)
    df["category_3"] = df["category_3"].fillna("A")
    df["merchant_id"] = df["merchant_id"].fillna("M_ID_00a6ca8a8a")

    df["purchase_date"] = pd.to_datetime(df["purchase_date"])
    df["month_diff"] = (RECENT_DATETIME - df["purchase_date"]).dt.days // 30
    df["month_diff"] += df["month_lag"]
    df["authorized_flag"] = df["authorized_flag"].map({"Y": 1, "N": 0}).astype(int)
    # Cap the purchase amount at 0.8 (vectorised instead of a per-row lambda).
    df["purchase_amount"] = df["purchase_amount"].clip(upper=0.8)
    df["duration"] = df["purchase_amount"] * df["month_diff"]

    # count_1 .. count_7: number of rows per (card_id, month_diff, <extra keys>).
    extra_group_keys = [
        ["state_id", "city_id"],
        ["category_1", "category_2", "category_3"],
        [],
        ["merchant_id"],
        ["merchant_category_id"],
        ["subsector_id"],
        ["installments"],
    ]
    for number, extra_keys in enumerate(extra_group_keys, start=1):
        keys = ["card_id", "month_diff"] + extra_keys
        df["count_%d" % number] = df.groupby(keys)["card_id"].transform("count")
    df["count_8"] = df.groupby(["card_id", "month_diff"])["authorized_flag"].transform("sum")
    df["count_9"] = df.groupby(["card_id"])["duration"].transform("mean")

    count_columns = ["count_%d" % number for number in range(1, 10)]
    for column in count_columns:
        # Truncate to int so each value renders as a compact token suffix.
        df[column] = df[column].astype(int)

    # "A<count_1> B<count_2> ... I<count_9>"
    sentence = None
    for letter, column in zip("ABCDEFGHI", count_columns):
        token = letter + df[column].astype(str)
        sentence = token if sentence is None else sentence + " " + token
    df["wordbatch_topic"] = sentence

    # Join each card's tokens in month_diff order.  ``sort=False`` keeps the
    # cards in order of first appearance, matching row-by-row accumulation.
    ordered = df.sort_values("month_diff")
    joined = ordered.groupby("card_id", sort=False)["wordbatch_topic"].agg(" ".join)
    wordbatch_topic_texts = joined.to_dict()

    return pd.DataFrame(
        {
            "card_id": list(wordbatch_topic_texts.keys()),
            "wordbatch_topic": list(wordbatch_topic_texts.values()),
        }
    )

def create_wordbatch_topic_texts(x):
    """Accumulate one token string into its card's sentence.

    Args:
        x: A ``"card_id,tokens"`` string.

    Side effects:
        Stores ``tokens`` under ``card_id`` in the module-level
        ``wordbatch_topic_texts`` dict, or appends it after a single space
        when the card already has an entry.
    """
    parts = x.split(",")
    card_id = parts[0]
    if card_id in wordbatch_topic_texts:
        wordbatch_topic_texts[card_id] += " " + parts[1]
    else:
        wordbatch_topic_texts[card_id] = parts[1]

# Build the per-card sentences from the transaction history and attach them to
# the combined train/test frame ``df`` (defined earlier in the notebook).
hist_df = pd.read_csv('../input/historical_transactions.csv')
# NOTE(review): the outer join keeps cards that appear on only one side; such
# rows would have a missing ``wordbatch_topic`` or ``target`` -- confirm that
# every card has history before relying on this.
df = pd.merge(df, create_wordbatch_topic(hist_df), on='card_id', how='outer')

# Rows with a target are training data, rows without are test data.  Take
# explicit copies so the column assignments below act on independent frames
# instead of slices of ``df`` (avoids pandas' SettingWithCopyWarning).
train = df[df['target'].notnull()].copy()
test = df[df['target'].isnull()].copy()

# ``X`` holds the hashed training features, ``y`` the hashed TEST features
# (not labels).
X = WB.transform(train["wordbatch_topic"].values)
FM_CLS.fit(X, train["target"].values)
y = WB.transform(test["wordbatch_topic"].values)

# NOTE(review): the training predictions are made on the same rows the model
# was fitted on, so ``topic_predict`` is in-sample for ``train``; consider
# out-of-fold predictions if this feature feeds another model.
train["topic_predict"] = np.round(FM_CLS.predict(X), 5)
test["topic_predict"] = np.round(FM_CLS.predict(y), 5)

最後に

以上が、トピックモデルによる非文章のカテゴリ変数からの特徴抽出の紹介です。
カテゴリ変数の繋げ方やハイパーパラメータのチューニングなど大いに改善の余地があるとは思いますが、
そのような状態でもかなりのスコア改善が実現できたため、今後も大いに活用していきたいと思います。

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up