Environment
- Google Colaboratory
- Python
- BERT
Checking the data
As we'll see later, you have to specify max_length when tokenizing. max_length is the maximum sequence length used for tokenization: anything longer is truncated, and anything shorter is padded out to that length. Choosing this value well therefore matters (although there is no single "correct" answer).
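For intuition, here is a minimal sketch of what max_length does (the example strings are made up for illustration and are not from the dataset):

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# A short text gets padded up to max_length; a long one gets truncated down to it
enc = tokenizer(
    ["a short sentence", "a much much longer sentence " * 20],
    max_length=10,
    padding="max_length",
    truncation=True,
)
print([len(ids) for ids in enc["input_ids"]])  # -> [10, 10]

With that in mind, the script below tokenizes the training text and plots the token-count distribution so we can pick a sensible value.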
from transformers import BertTokenizer, TFBertForSequenceClassification
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
data = pd.read_csv('train.csv')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Plot the token-count distribution to help choose a good max length
tokens = data['text'].apply(tokenizer.tokenize)
token_counts = tokens.apply(len)
plt.hist(token_counts, bins=30, color='skyblue', edgecolor='black')
plt.title('Distribution of Token Counts')
plt.xlabel('Number of Tokens')
plt.ylabel('Frequency')
plt.show()
# Maximum token length that covers 95% of the examples
percentile = 95
percentile_token_length = np.percentile(token_counts, percentile)
print(f"{percentile} Percentile Token Length:", percentile_token_length)
# Mean token length plus two standard deviations
mean_length = np.mean(token_counts)
std_deviation = np.std(token_counts)
max_token_length = int(mean_length + (2 * std_deviation))
print("Maximum Token Length:", max_token_length)
This plots the distribution of token lengths, and then we estimate a suitable maximum in two ways. The output is formatted so the numbers are easy to read. ↓
95 Percentile Token Length: 51.0
Maximum Token Length: 55
Hard to call this a normal distribution...? We'll go with 55.
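One caveat worth noting: tokenizer.tokenize does not add the [CLS] and [SEP] special tokens, while encode_plus with add_special_tokens=True does, so the effective budget for the text itself is max_length - 2. A minimal sketch of accounting for that, reusing the variables from the script above (the +2 adjustment and the final choice are my own assumption, not part of the original write-up):

# Take the larger of the two heuristics, then leave room for [CLS] and [SEP]
candidate = max(percentile_token_length, mean_length + 2 * std_deviation)
suggested_max_length = int(candidate) + 2
print("Suggested max_length:", suggested_max_length)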
Training
!pip install torch transformers pandas scikit-learn matplotlib
DT_train.py
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import files
df = pd.read_csv("train.csv")
train_df, test_df = train_test_split(df, test_size=0.2, random_state=1)
# Define the model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
# Define the tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# Custom dataset that tokenizes each example
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text = self.data.iloc[index]["text"]
        target = self.data.iloc[index]["target"]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        inputs = {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "labels": torch.tensor(target, dtype=torch.long)
        }
        return inputs
batch_size = 16
max_length = 55
learning_rate = 2e-5
epochs = 8
train_dataset = CustomDataset(train_df, tokenizer, max_length)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
criterion = torch.nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=learning_rate)
# Training
losses = []
for epoch in range(epochs):
    model.train()
    epoch_loss = 0
    for batch in train_loader:
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    epoch_loss /= len(train_loader)
    losses.append(epoch_loss)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss}")
# Plot how the loss moves over epochs
plt.plot(range(1, epochs + 1), losses)
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.title("Training Loss")
plt.show()
# Prediction (on the held-out split of train.csv)
test_dataset = CustomDataset(test_df, tokenizer, max_length)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
model.eval()
predictions = []
true_labels = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"].numpy()
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=1)
        predicted_labels = torch.argmax(probabilities, dim=1).cpu().numpy()
        predictions.extend(predicted_labels)
        true_labels.extend(labels)
# Evaluate the model
accuracy = accuracy_score(true_labels, predictions)
print(f"Accuracy: {accuracy}")
print(classification_report(true_labels, predictions))
# Training takes a very long time, so save the trained weights to a file
torch.save(model.state_dict(), "bert_model_epochs_8.pth")
files.download("bert_model_epochs_8.pth")
※ In the end training took so long that I couldn't sit and watch it, and there were times I couldn't re-run things to check the results, so I bought Google Colab Pro+! It is extremely comfortable.
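A likely reason training is this slow: the script above never moves the model or the batches to the GPU, so everything runs on the CPU even when the Colab runtime has a GPU attached. A minimal sketch of adding device placement (this is my own addition and assumes a CUDA runtime is available; it is not part of the original script):

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(epochs):
    model.train()
    for batch in train_loader:
        # Move every tensor in the batch onto the same device as the model
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()
        outputs = model(**batch)  # the batch already holds input_ids, attention_mask, labels
        outputs.loss.backward()
        optimizer.step()

If the weights are then saved from a GPU model and later loaded on a CPU-only runtime, torch.load("bert_model_epochs_8.pth", map_location="cpu") avoids a device mismatch.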
Prediction
The procedure is basically the same as before; the only difference is that it uses the trained weights saved above. That also means training on all of the data should give somewhat better results...
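As a side note, Hugging Face also provides save_pretrained / from_pretrained as an alternative to torch.save for persisting the fine-tuned model together with its config; a minimal sketch (the directory name "bert_disaster_tweets" is made up):

# Save the fine-tuned weights and config to a directory
model.save_pretrained("bert_disaster_tweets")
tokenizer.save_pretrained("bert_disaster_tweets")

# Reload later without specifying num_labels or a separate state_dict file
from transformers import BertForSequenceClassification, BertTokenizer
model = BertForSequenceClassification.from_pretrained("bert_disaster_tweets")
tokenizer = BertTokenizer.from_pretrained("bert_disaster_tweets")

The prediction script below sticks with torch.load, matching how the weights were saved.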
DT_test.py
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from google.colab import files
test_df = pd.read_csv("test.csv")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
# Load the trained weights
model.load_state_dict(torch.load("bert_model_epochs_8.pth"))
model.eval()
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text = self.data.iloc[index]["text"]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        inputs = {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten()
        }
        return inputs
max_length = 55
batch_size = 16
test_dataset = CustomDataset(test_df, tokenizer, max_length)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
predictions = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=1)
        predicted_labels = torch.argmax(probabilities, dim=1).cpu().numpy()
        predictions.extend(predicted_labels)
test_df["target"] = predictions
test_df[["id", "target"]].to_csv("prediction1.csv", index=False)
files.download("prediction1.csv")
Results
The run with 9 epochs performed best, giving a score of 0.82224.
Next, let's try putting all of the data into training.
Training and prediction on all of the data
!pip install torch transformers pandas scikit-learn matplotlib
DT_expect.py
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
# Load train.csv
train_df = pd.read_csv("train.csv")
# Define the model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
# Define the tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# Custom dataset that handles the tokenized data
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text = self.data.iloc[index]["text"]
        target = self.data.iloc[index]["target"] if "target" in self.data.columns else None
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        inputs = {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
        }
        if target is not None:
            inputs["labels"] = torch.tensor(target, dtype=torch.long)
        return inputs
batch_size = 16
max_length = 55
learning_rate = 2e-5
epochs = 10
train_dataset = CustomDataset(train_df, tokenizer, max_length)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
criterion = torch.nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=learning_rate)
# Training
for epoch in range(epochs):
    model.train()
    epoch_loss = 0
    for batch in train_loader:
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    epoch_loss /= len(train_loader)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss}")
# Save the trained model
torch.save(model.state_dict(), "bert_model_epochs_8.pth")
# Predict on test.csv
test_df = pd.read_csv("test.csv")
# Build the test dataset
test_dataset = CustomDataset(test_df, tokenizer, max_length)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
# Load the trained weights back into a fresh model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.load_state_dict(torch.load("bert_model_epochs_8.pth"))
model.eval()
# Prediction
predictions = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=1)
        predicted_labels = torch.argmax(probabilities, dim=1).cpu().numpy()
        predictions.extend(predicted_labels)
# Save the predictions
test_df["target"] = predictions
test_df[["id", "target"]].to_csv("prediction2.csv", index=False)
from google.colab import files
files.download("prediction2.csv")
8 epochs: 0.80968
10 epochs: 0.81887
Training on 80% of the data gave a better result than training on all of it.