Environment
- Google Colaboratory
- Python
- BERT
Checking the data
As we'll see later, you have to specify max_length when tokenizing. max_length is the maximum sequence length used for tokenization: anything longer is truncated, and anything shorter is padded out to that length. Choosing this value well therefore matters (although there is no single "correct" answer).
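For intuition, here is a minimal sketch of what max_length does (the example strings are made up for illustration and are not from the dataset):

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# A short text gets padded up to max_length; a long one gets truncated down to it
enc = tokenizer(
    ["a short sentence", "a much much longer sentence " * 20],
    max_length=10,
    padding="max_length",
    truncation=True,
)
print([len(ids) for ids in enc["input_ids"]])  # -> [10, 10]

With that in mind, the script below tokenizes the training text and plots the token-count distribution so we can pick a sensible value.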
from transformers import BertTokenizer, TFBertForSequenceClassification
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
data = pd.read_csv('train.csv')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Plot the token-count distribution to help choose a good max length
tokens = data['text'].apply(tokenizer.tokenize)
token_counts = tokens.apply(len)
plt.hist(token_counts, bins=30, color='skyblue', edgecolor='black')
plt.title('Distribution of Token Counts')
plt.xlabel('Number of Tokens')
plt.ylabel('Frequency')
plt.show()
# Maximum token length that covers 95% of the examples
percentile = 95
percentile_token_length = np.percentile(token_counts, percentile)
print(f"{percentile} Percentile Token Length:", percentile_token_length)
# Mean token length plus two standard deviations
mean_length = np.mean(token_counts)
std_deviation = np.std(token_counts)
max_token_length = int(mean_length + (2 * std_deviation))
print("Maximum Token Length:", max_token_length)
This plots the distribution of token lengths, and then we estimate a suitable maximum in two ways. The output is formatted so the numbers are easy to read. ↓
95 Percentile Token Length: 51.0
Maximum Token Length: 55
Hard to call this a normal distribution...? We'll go with 55.
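One caveat worth noting: tokenizer.tokenize does not add the [CLS] and [SEP] special tokens, while encode_plus with add_special_tokens=True does, so the effective budget for the text itself is max_length - 2. A minimal sketch of accounting for that, reusing the variables from the script above (the +2 adjustment and the final choice are my own assumption, not part of the original write-up):

# Take the larger of the two heuristics, then leave room for [CLS] and [SEP]
candidate = max(percentile_token_length, mean_length + 2 * std_deviation)
suggested_max_length = int(candidate) + 2
print("Suggested max_length:", suggested_max_length)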
Training
!pip install torch transformers pandas scikit-learn matplotlib
DT_train.py
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import files
df = pd.read_csv("train.csv")
train_df, test_df = train_test_split(df, test_size=0.2, random_state=1)
# Define the model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
# Define the tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# Custom dataset that tokenizes each example
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text = self.data.iloc[index]["text"]
        target = self.data.iloc[index]["target"]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        inputs = {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "labels": torch.tensor(target, dtype=torch.long)
        }
        return inputs
batch_size = 16
max_length = 55
learning_rate = 2e-5
epochs = 8
train_dataset = CustomDataset(train_df, tokenizer, max_length)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
criterion = torch.nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=learning_rate)
# Training
losses = []
for epoch in range(epochs):
    model.train()
    epoch_loss = 0
    for batch in train_loader:
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    epoch_loss /= len(train_loader)
    losses.append(epoch_loss)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss}")
# Plot how the loss moves over epochs
plt.plot(range(1, epochs + 1), losses)
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.title("Training Loss")
plt.show()
# Prediction (on the held-out split of train.csv)
test_dataset = CustomDataset(test_df, tokenizer, max_length)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
model.eval()
predictions = []
true_labels = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"].numpy()
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=1)
        predicted_labels = torch.argmax(probabilities, dim=1).cpu().numpy()
        predictions.extend(predicted_labels)
        true_labels.extend(labels)
# Evaluate the model
accuracy = accuracy_score(true_labels, predictions)
print(f"Accuracy: {accuracy}")
print(classification_report(true_labels, predictions))
# Training takes a very long time, so save the trained weights to a file
torch.save(model.state_dict(), "bert_model_epochs_8.pth")
files.download("bert_model_epochs_8.pth")
※ In the end training took so long that I couldn't sit and watch it, and there were times I couldn't re-run things to check the results, so I bought Google Colab Pro+! It is extremely comfortable.
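A likely reason training is this slow: the script above never moves the model or the batches to the GPU, so everything runs on the CPU even when the Colab runtime has a GPU attached. A minimal sketch of adding device placement (this is my own addition and assumes a CUDA runtime is available; it is not part of the original script):

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(epochs):
    model.train()
    for batch in train_loader:
        # Move every tensor in the batch onto the same device as the model
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()
        outputs = model(**batch)  # the batch already holds input_ids, attention_mask, labels
        outputs.loss.backward()
        optimizer.step()

If the weights are then saved from a GPU model and later loaded on a CPU-only runtime, torch.load("bert_model_epochs_8.pth", map_location="cpu") avoids a device mismatch.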
Prediction
The procedure is basically the same as before; the only difference is that it uses the trained weights saved above. That also means training on all of the data should give somewhat better results...
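As a side note, Hugging Face also provides save_pretrained / from_pretrained as an alternative to torch.save for persisting the fine-tuned model together with its config; a minimal sketch (the directory name "bert_disaster_tweets" is made up):

# Save the fine-tuned weights and config to a directory
model.save_pretrained("bert_disaster_tweets")
tokenizer.save_pretrained("bert_disaster_tweets")

# Reload later without specifying num_labels or a separate state_dict file
from transformers import BertForSequenceClassification, BertTokenizer
model = BertForSequenceClassification.from_pretrained("bert_disaster_tweets")
tokenizer = BertTokenizer.from_pretrained("bert_disaster_tweets")

The prediction script below sticks with torch.load, matching how the weights were saved.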
DT_test.py
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from google.colab import files
test_df = pd.read_csv("test.csv")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
# Load the trained weights
model.load_state_dict(torch.load("bert_model_epochs_8.pth"))
model.eval()
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text = self.data.iloc[index]["text"]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        inputs = {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten()
        }
        return inputs
max_length = 55
batch_size = 16
test_dataset = CustomDataset(test_df, tokenizer, max_length)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
predictions = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=1)
        predicted_labels = torch.argmax(probabilities, dim=1).cpu().numpy()
        predictions.extend(predicted_labels)
test_df["target"] = predictions
test_df[["id", "target"]].to_csv("prediction1.csv", index=False)
files.download("prediction1.csv")
Results
The run with 9 epochs performed best, giving a score of 0.82224.
Next, let's try putting all of the data into training.
Training and prediction on all of the data
!pip install torch transformers pandas scikit-learn matplotlib
DT_expect.py
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
# Load train.csv
train_df = pd.read_csv("train.csv")
# Define the model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
# Define the tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# Custom dataset that handles the tokenized data
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text = self.data.iloc[index]["text"]
        target = self.data.iloc[index]["target"] if "target" in self.data.columns else None
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        inputs = {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
        }
        if target is not None:
            inputs["labels"] = torch.tensor(target, dtype=torch.long)
        return inputs
batch_size = 16
max_length = 55
learning_rate = 2e-5
epochs = 10
train_dataset = CustomDataset(train_df, tokenizer, max_length)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
criterion = torch.nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=learning_rate)
# Training
for epoch in range(epochs):
    model.train()
    epoch_loss = 0
    for batch in train_loader:
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    epoch_loss /= len(train_loader)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss}")
# Save the trained model
torch.save(model.state_dict(), "bert_model_epochs_8.pth")
# Predict on test.csv
test_df = pd.read_csv("test.csv")
# Build the test dataset
test_dataset = CustomDataset(test_df, tokenizer, max_length)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
# Load the trained weights back into a fresh model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.load_state_dict(torch.load("bert_model_epochs_8.pth"))
model.eval()
# Prediction
predictions = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=1)
        predicted_labels = torch.argmax(probabilities, dim=1).cpu().numpy()
        predictions.extend(predicted_labels)
# Save the predictions
test_df["target"] = predictions
test_df[["id", "target"]].to_csv("prediction2.csv", index=False)
from google.colab import files
files.download("prediction2.csv")
8 epochs: 0.80968
10 epochs: 0.81887
Training on 80% of the data gave a better result than training on all of it.