
Taking on NLP with Disaster Tweets (0.82224)

Posted at 2024-06-20

Environment

  • Google Colaboratory
  • Python
  • BERT

Checking the data

As we will see later, you have to declare something called max_length when tokenizing.
max_length is the maximum sequence length at tokenization time: tokens beyond this length are truncated, and shorter sequences are padded up to it. Choosing this value well is therefore important. (There is no single "correct" answer for what is optimal.)
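As a quick illustration of how max_length behaves (a minimal sketch with made-up example text, not part of the competition code), anything longer than max_length is cut off and anything shorter is padded:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Made-up example text; the real tweets come from train.csv
enc = tokenizer.encode_plus(
    "Example tweet about a disaster",
    add_special_tokens=True,   # adds [CLS] and [SEP]
    max_length=10,             # longer sequences are truncated to 10 tokens
    padding="max_length",      # shorter sequences are padded with [PAD]
    truncation=True,
)
print(enc["input_ids"])        # always length 10
print(enc["attention_mask"])   # 1 for real tokens, 0 for padding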

from transformers import BertTokenizer, TFBertForSequenceClassification
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

data = pd.read_csv('train.csv')

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Plot the distribution of token counts to help choose max_length
tokens = data['text'].apply(tokenizer.tokenize)
token_counts = tokens.apply(len)

plt.hist(token_counts, bins=30, color='skyblue', edgecolor='black')
plt.title('Distribution of Token Counts')
plt.xlabel('Number of Tokens')
plt.ylabel('Frequency')
plt.show()

# Maximum token length that covers 95% of the tweets
percentile = 95
percentile_token_length = np.percentile(token_counts, percentile)
print(f"{percentile} Percentile Token Length:", percentile_token_length)

# Maximum token length as the mean plus two standard deviations
mean_length = np.mean(token_counts)
std_deviation = np.std(token_counts)
max_token_length = int(mean_length + (2 * std_deviation))
print("Maximum Token Length:", max_token_length)

This visualizes the distribution of token lengths, and then derives a candidate max_length in two different ways.
(Figure: histogram of token counts in the training tweets)
The numbers are aligned so they are easier to compare. ↓

95 Percentile Token Length: 51.0
Maximum Token Length:       55

Not exactly a normal distribution...? I'll go with 55.
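To sanity-check the choice, you can also look at what fraction of the tweets fit without truncation (a small sketch that reuses token_counts from the snippet above):

# Fraction of tweets whose token count fits within each candidate max_length
for cand in (51, 55):
    coverage = (token_counts <= cand).mean()
    print(f"max_length={cand}: {coverage:.2%} of tweets fit without truncation")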

Training

!pip install torch transformers pandas scikit-learn matplotlib
DT_train.py
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
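# Note: AdamW is deprecated in recent versions of transformers; if this import
# fails, torch.optim.AdamW can be used as a replacement.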
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import files

df = pd.read_csv("train.csv")
train_df, test_df = train_test_split(df, test_size=0.2, random_state=1)

# Define the model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Define the tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Custom dataset that tokenizes each example
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text = self.data.iloc[index]["text"]
        target = self.data.iloc[index]["target"]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        inputs = {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "labels": torch.tensor(target, dtype=torch.long)
        }

        return inputs

batch_size = 16
max_length = 55
learning_rate = 2e-5
epochs = 8

train_dataset = CustomDataset(train_df, tokenizer, max_length)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

criterion = torch.nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=learning_rate)

# Training loop
losses = []
for epoch in range(epochs):
    model.train()
    epoch_loss = 0
    for batch in train_loader:
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    epoch_loss /= len(train_loader)
    losses.append(epoch_loss)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss}")

# Plot how the loss moves over the epochs
plt.plot(range(1, epochs + 1), losses)
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.title("Training Loss")
plt.show()

# Predict on the held-out split of train.csv
test_dataset = CustomDataset(test_df, tokenizer, max_length)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
model.eval()
predictions = []
true_labels = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"].numpy()

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=1)
        predicted_labels = torch.argmax(probabilities, dim=1).cpu().numpy()

        predictions.extend(predicted_labels)
        true_labels.extend(labels)

# Evaluate the model
accuracy = accuracy_score(true_labels, predictions)
print(f"Accuracy: {accuracy}")
print(classification_report(true_labels, predictions))

# Training takes a very long time, so write the trained weights out.
torch.save(model.state_dict(), "bert_model_epochs_8.pth")
files.download("bert_model_epochs_8.pth")

Note: Training ended up taking so long that I couldn't sit and watch it, and there were times I couldn't check the results as often as I wanted, so I bought Google Colab Pro+. It is extremely comfortable.
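One likely reason it is so slow is that the script above never moves the model or the batches onto the GPU, so it runs on the CPU even on a GPU runtime. A minimal sketch of the change (assuming a CUDA runtime is selected in Colab; it reuses model, train_loader, and optimizer from the script above and is not what I actually ran):

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for batch in train_loader:
    # Move every tensor in the batch onto the same device as the model
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    labels = batch["labels"].to(device)

    optimizer.zero_grad()
    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    outputs.loss.backward()
    optimizer.step()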

Prediction

What happens here is basically the same. The only difference is that it uses the trained weights written out above. That is also why training on all of the data might give an even better result...

DT_test.py
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from google.colab import files

test_df = pd.read_csv("test.csv")

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
# Use the trained weights.
model.load_state_dict(torch.load("bert_model_epochs_8.pth"))
model.eval()

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text = self.data.iloc[index]["text"]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        inputs = {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten()
        }

        return inputs

max_length = 55
batch_size = 16

test_dataset = CustomDataset(test_df, tokenizer, max_length)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

predictions = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=1)
        predicted_labels = torch.argmax(probabilities, dim=1).cpu().numpy()

        predictions.extend(predicted_labels)

test_df["target"] = predictions
test_df[["id", "target"]].to_csv("prediction1.csv", index=False)

files.download("prediction1.csv")

Results

The run with 9 epochs gave the better result, scoring 0.82224.
Next, I'll try training on all of the data.

Training and predicting on all the data

!pip install torch transformers pandas scikit-learn matplotlib
DT_expect.py
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

# Load train.csv
train_df = pd.read_csv("train.csv")

# Define the model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Define the tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Custom dataset that handles the tokenized data
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text = self.data.iloc[index]["text"]
        target = self.data.iloc[index]["target"] if "target" in self.data.columns else None

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        inputs = {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
        }

        if target is not None:
            inputs["labels"] = torch.tensor(target, dtype=torch.long)

        return inputs

batch_size = 16
max_length = 55
learning_rate = 2e-5
epochs = 10

train_dataset = CustomDataset(train_df, tokenizer, max_length)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

criterion = torch.nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(epochs):
    model.train()
    epoch_loss = 0
    for batch in train_loader:
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    epoch_loss /= len(train_loader)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss}")

# Save the trained model
torch.save(model.state_dict(), "bert_model_epochs_8.pth")

# Predict on test.csv
test_df = pd.read_csv("test.csv")

# Build the dataset and loader for the test data
test_dataset = CustomDataset(test_df, tokenizer, max_length)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Load the trained weights back
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.load_state_dict(torch.load("bert_model_epochs_8.pth"))
model.eval()

# Prediction
predictions = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=1)
        predicted_labels = torch.argmax(probabilities, dim=1).cpu().numpy()

        predictions.extend(predicted_labels)

# Save the predictions
test_df["target"] = predictions
test_df[["id", "target"]].to_csv("prediction2.csv", index=False)

from google.colab import files
files.download("prediction2.csv")
8epochs:  0.80968
10epochs: 0.81887

Training on 80% of the data actually gave the better result.
