次の課題に submit するまでの流れです。
Real or Not? NLP with Disaster Tweets
次のページを参考にしました。
NLP Getting Started Tutorial
realornot01.py
#! /usr/bin/python
#
# realornot01.py
#
# Feb/24/2020
# --------------------------------------------------------------------------
import sys
import numpy as np
import pandas as pd
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
# --------------------------------------------------------------------------
def dump_frame_proc(df):
    """Print the row count and column count of *df* on one line of stdout.

    This is a best-effort diagnostic: if *df* does not expose ``index`` and
    ``columns`` (or taking their length fails), the error is reported on
    stderr and the function returns normally instead of raising.

    Args:
        df: Object exposing ``index`` and ``columns`` attributes, such as a
            pandas DataFrame.

    Returns:
        None.
    """
    try:
        print(len(df.index), len(df.columns))
    except Exception as ee:
        # Deliberately broad: a failed diagnostic dump must not abort the run.
        sys.stderr.write("*** error *** dump_frame_proc ***\n")
        sys.stderr.write(str(ee) + "\n")
# --------------------------------------------------------------------------
# Start marker goes to stderr; the results below are printed to stdout.
sys.stderr.write("*** 開始 ***\n")

# Load both CSV files, then report the row/column counts of each frame.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
for frame in (train_df, test_df):
    dump_frame_proc(frame)

# Demonstration only: fit the vectorizer on the first five "text" entries
# and print the shape and contents of the first resulting row.
count_vectorizer = feature_extraction.text.CountVectorizer()
sample_texts = train_df["text"][0:5]
example_train_vectors = count_vectorizer.fit_transform(sample_texts)
first_row_dense = example_train_vectors[0].todense()
print(first_row_dense.shape)
print(first_row_dense)

# Refit on the full training "text" column; the test text is only
# transformed, so it is encoded with the vocabulary learned from training.
train_vectors = count_vectorizer.fit_transform(train_df["text"])
test_vectors = count_vectorizer.transform(test_df["text"])

clf = linear_model.RidgeClassifier()

# Print the per-fold F1 scores from 3-fold cross-validation.
train_labels = train_df["target"]
scores = model_selection.cross_val_score(
    clf, train_vectors, train_labels, cv=3, scoring="f1"
)
print(scores)

# Fit on the complete training set before predicting the test set.
clf.fit(train_vectors, train_labels)

# Overwrite the "target" column of the sample submission with the
# predictions and write the result without the index column.
sample_submission = pd.read_csv("sample_submission.csv")
sample_submission["target"] = clf.predict(test_vectors)
sample_submission.to_csv("submission.csv", index=False)

sys.stderr.write("*** 終了 ***\n")
# --------------------------------------------------------------------------
スコアは、 0.77096 でした。