NeoBERTForTokenClassificationにdense層は必要なのか

Posted at 2025-03-03

今朝未明の記事にどうも納得がいかなかったので、NeoBERTForTokenClassificationをざっと書き直してみた。端的には、付替部をdropout層とclassifier層だけにして、dense層は捨ててみた。Google Colaboratory (GPU版)だと、こんな感じ。

!pip install -U transformers datasets evaluate seqeval accelerate xformers flash_attn torchvision
!test -d NeoBERT || git clone --depth=1 https://huggingface.co/chandar-lab/NeoBERT
with open("NeoBERT/token_classification.py","w",encoding="utf-8") as w:
  w.write('''
import torch
from typing import Optional
from .model import NeoBERTConfig, NeoBERTPreTrainedModel, NeoBERT
from transformers.modeling_outputs import TokenClassifierOutput

class NeoBERTForTokenClassification(NeoBERTPreTrainedModel):
    config_class = NeoBERTConfig

    def __init__(self, config: NeoBERTConfig):
        super().__init__(config)

        self.config = config

        self.num_labels = getattr(config, "num_labels", 2)
        self.classifier_dropout = getattr(config, "classifier_dropout", 0.1)
        self.classifier_init_range = getattr(config, "classifier_init_range", 0.02)

        self.model = NeoBERT(config)

        self.dropout = torch.nn.Dropout(self.classifier_dropout)
        self.classifier = torch.nn.Linear(self.config.hidden_size, self.num_labels)

        self.post_init()

    def _init_weights(self, module):
        if isinstance(module, torch.nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.classifier_init_range)
            if module.bias is not None:
                module.bias.data.zero_()

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor = None,
        max_seqlen: int = None,
        cu_seqlens: torch.Tensor = None,
        attention_mask: torch.Tensor = None,
        output_hidden_states: bool = False,
        output_attentions: bool = False,
        labels: Optional[torch.Tensor] = None,
        return_dict: Optional[bool] = None,
    ):
        output = self.model.forward(
            input_ids,
            position_ids,
            max_seqlen,
            cu_seqlens,
            attention_mask,
            output_hidden_states,
            output_attentions,
        )
        x = output.last_hidden_state
        x = self.dropout(x)
        logits = self.classifier(x)

        loss = None
        if labels is not None:
            loss_fct = torch.nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            result = (logits,) + output[1:]
            return ((loss,) + result) if loss is not None else result

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=output.hidden_states if output_hidden_states else None,
            attentions=output.attentions if output_attentions else None,
        )
''')
import json
with open("NeoBERT/config.json","r",encoding="utf-8") as r:
  d=json.load(r)
d["auto_map"]["AutoModelForTokenClassification"]="token_classification.NeoBERTForTokenClassification"
with open("NeoBERT/config.json","w",encoding="utf-8") as w:
  json.dump(d,w,indent=2)
s='$1=="transformers"{printf("-b v%s",$2)}'
!test -d transformers || git clone `pip3 list | awk '{s}'` https://github.com/huggingface/transformers
!test -d UD_English-EWT || git clone --depth=1 https://github.com/UniversalDependencies/UD_English-EWT
def makejson(conllu_file,json_file):
  with open(conllu_file,"r",encoding="utf-8") as r, open(json_file,"w",encoding="utf-8") as w:
    d={"tokens":["[CLS]"],"tags":["SYM"]}
    for s in r:
      if s.strip()=="":
        if len(d["tokens"])>1:
          d["tokens"].append("[SEP]")
          d["tags"].append("SYM")
          print(json.dumps(d),file=w)
        d={"tokens":["[CLS]"],"tags":["SYM"]}
      else:
        t=s.split("\t")
        if len(t)==10 and t[0].isdecimal():
          d["tokens"].append(t[1])
          d["tags"].append(t[3])
makejson("UD_English-EWT/en_ewt-ud-train.conllu","train.json")
makejson("UD_English-EWT/en_ewt-ud-dev.conllu","dev.json")
makejson("UD_English-EWT/en_ewt-ud-test.conllu","test.json")
!env WANDB_DISABLED=true python3 transformers/examples/pytorch/token-classification/run_ner.py --task_name pos --model_name_or_path NeoBERT --trust_remote_code --train_file train.json --validation_file dev.json --test_file test.json --output_dir ./neobert-english-upos --overwrite_output_dir --do_train --do_eval --do_predict
!cp NeoBERT/*.py neobert-english-upos

私(安岡孝一)の手元では、25分ほどで以下のmetricsが出力されて、neobert-english-uposが出来上がった。

***** train metrics *****
  epoch                    =        3.0
  total_flos               =  1902228GF
  train_loss               =     0.0571
  train_runtime            = 0:21:30.63
  train_samples            =      12544
  train_samples_per_second =     29.158
  train_steps_per_second   =      3.645

***** eval metrics *****
  epoch                   =        3.0
  eval_accuracy           =      0.981
  eval_f1                 =     0.9783
  eval_loss               =     0.0945
  eval_precision          =      0.978
  eval_recall             =     0.9786
  eval_runtime            = 0:00:12.63
  eval_samples            =       2001
  eval_samples_per_second =     158.31
  eval_steps_per_second   =     19.858

***** predict metrics *****
  predict_accuracy           =     0.9816
  predict_f1                 =     0.9783
  predict_loss               =     0.0859
  predict_precision          =      0.977
  predict_recall             =     0.9796
  predict_runtime            = 0:00:13.69
  predict_samples_per_second =    151.618
  predict_steps_per_second   =      18.98

dense層を削っても、F1値はほとんど変わらない。ちょっと動かしてみよう。

from transformers import AutoTokenizer,AutoModelForTokenClassification,TokenClassificationPipeline
tkz=AutoTokenizer.from_pretrained("neobert-english-upos")
mdl=AutoModelForTokenClassification.from_pretrained("neobert-english-upos",trust_remote_code=True)
nlp=TokenClassificationPipeline(model=mdl,tokenizer=tkz,device=0)
print(nlp("It don't mean a thing if it ain't got that swing"))

出来立てのneobert-english-uposで「It don't mean a thing if it ain't got that swing」に品詞付与してみたところ、私の手元では以下の結果が得られた。

[{'entity': 'PRON', 'score': 0.99999654, 'index': 1, 'word': 'it', 'start': 0, 'end': 2}, {'entity': 'AUX', 'score': 0.99999964, 'index': 2, 'word': 'don', 'start': 3, 'end': 6}, {'entity': 'AUX', 'score': 0.9726723, 'index': 3, 'word': "'", 'start': 6, 'end': 7}, {'entity': 'PART', 'score': 0.99999917, 'index': 4, 'word': 't', 'start': 7, 'end': 8}, {'entity': 'VERB', 'score': 0.99999845, 'index': 5, 'word': 'mean', 'start': 9, 'end': 13}, {'entity': 'DET', 'score': 0.9999993, 'index': 6, 'word': 'a', 'start': 14, 'end': 15}, {'entity': 'NOUN', 'score': 0.99999774, 'index': 7, 'word': 'thing', 'start': 16, 'end': 21}, {'entity': 'SCONJ', 'score': 0.99999785, 'index': 8, 'word': 'if', 'start': 22, 'end': 24}, {'entity': 'PRON', 'score': 0.999987, 'index': 9, 'word': 'it', 'start': 25, 'end': 27}, {'entity': 'AUX', 'score': 0.999863, 'index': 10, 'word': 'ain', 'start': 28, 'end': 31}, {'entity': 'AUX', 'score': 0.9569846, 'index': 11, 'word': "'", 'start': 31, 'end': 32}, {'entity': 'PART', 'score': 0.99999964, 'index': 12, 'word': 't', 'start': 32, 'end': 33}, {'entity': 'VERB', 'score': 0.99953616, 'index': 13, 'word': 'got', 'start': 34, 'end': 37}, {'entity': 'DET', 'score': 0.9995009, 'index': 14, 'word': 'that', 'start': 38, 'end': 42}, {'entity': 'NOUN', 'score': 0.9997925, 'index': 15, 'word': 'swing', 'start': 43, 'end': 48}]

「don't」と「ain't」がうまくいってないことは同様で、まあ、dense層があってもなくてもダメらしい。しかしながら、dense層は4096出力に渡っており、不要なら削るに越したことはない。さて、このあたり、どう合意を取ればいいかな。

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up