This is my model code. I want to fine-tune a pretrained language model, XLM from Facebook, for NER tasks, so I stacked a BiLSTM and a CRF on top of it.
import torch
import torch.nn as nn

# TransformerModel comes from the facebookresearch/XLM repo; CRF is
# torchcrf.CRF from the pytorch-crf package (my assumption based on its call
# signature -- adjust the imports if you use different implementations).
from src.model.transformer import TransformerModel
from torchcrf import CRF


class XLM_BiLSTM_CRF(nn.Module):
    def __init__(self, config, num_labels, params, dico, reloaded):
        super().__init__()
        self.config = config
        self.num_labels = num_labels
        self.batch_size = config.batch_size
        self.hidden_dim = config.hidden_dim

        # Build the XLM encoder and load the pretrained weights.
        self.xlm = TransformerModel(params, dico, True, True)
        self.xlm.eval()
        self.xlm.load_state_dict(reloaded['model'])

        # BiLSTM over the XLM representations; each direction gets half the
        # hidden size, so the concatenated output is hidden_dim wide.
        self.lstm = nn.LSTM(config.embedding_dim, config.hidden_dim // 2,
                            num_layers=1, bidirectional=True)
        self.dropout = nn.Dropout(config.dropout)
        self.classifier = nn.Linear(config.hidden_dim, config.num_class)

        # Note: self.apply() recurses into every submodule of this model.
        self.apply(self.init_bert_weights)
        self.crf = CRF(config.num_class)

    def forward(self, word_ids, lengths, langs=None, causal=False):
        # XLM's 'fwd' mode returns representations of shape (slen, bs, dim).
        sequence_output = self.xlm('fwd', x=word_ids, lengths=lengths, causal=False).contiguous()
        sequence_output, _ = self.lstm(sequence_output)
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)
        # CRF decoding returns the best label sequence per batch element.
        return self.crf.decode(logits)

    def log_likelihood(self, word_ids, lengths, tags):
        sequence_output = self.xlm('fwd', x=word_ids, lengths=lengths, causal=False).contiguous()
        sequence_output, _ = self.lstm(sequence_output)
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)
        # torchcrf expects (slen, bs) tags by default; negate to get a loss.
        return -self.crf(logits, tags.transpose(0, 1))

    def init_bert_weights(self, module):
        """Initialize the weights."""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()
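For reference, a minimal shape check of the tensor layout I believe the CRF expects (this assumes the CRF above is torchcrf.CRF from pytorch-crf, which defaults to batch_first=False, i.e. time-major input like the rest of my code):

import torch
from torchcrf import CRF

slen, bs, num_tags = 7, 2, 5
crf = CRF(num_tags)                            # batch_first=False by default
emissions = torch.randn(slen, bs, num_tags)    # same layout as `logits` above
tags = torch.randint(num_tags, (slen, bs))     # (slen, bs), like tags.transpose(0, 1)
nll = -crf(emissions, tags)                    # scalar negative log-likelihood
paths = crf.decode(emissions)                  # bs sequences, each slen labels long
print(nll.item(), len(paths), len(paths[0]))   # -> some positive loss, 2, 7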
And this is my training code.
import logging

import torch
import torch.optim as optim
from tqdm import trange


def train(model, train_iter, dev_iter, params):
    # Freeze the XLM encoder; only the BiLSTM, classifier and CRF are trained.
    for param in model.xlm.parameters():
        param.requires_grad = False
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),
                           lr=0.003, betas=(0.9, 0.999), eps=1e-08, weight_decay=1e-4)

    iteration, best_f1 = 0, 0
    for epoch in trange(params.n_epochs):
        for sentence, tags in train_iter:
            model.train()
            iteration += 1
            optimizer.zero_grad()

            # XLM expects time-major input, so transpose to (slen, bs).
            # `device` and save() are defined elsewhere in my script.
            sentence = sentence.long().transpose(0, 1).to(device)
            tags = tags.long().to(device)
            # Every sentence is padded to max_len, so all lengths are equal.
            lengths = torch.LongTensor([params.max_len] * sentence.size(1)).to(device)

            loss = model.log_likelihood(sentence, lengths, tags)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=2)
            optimizer.step()

            if iteration % 20 == 0:
                logging.info('\rEpoch[{}] - Iteration[{}] - loss: {}'
                             .format(epoch, iteration, loss.item()))
                _, _, eval_f1 = evaluate(model, dev_iter, params)
                if eval_f1 > best_f1:
                    best_f1 = eval_f1
                    save(model, "./dumped", iteration)
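To double-check that the freezing behaves as intended, I can count which parameters still require gradients (a quick sketch; count_trainable is a hypothetical helper I added for illustration, and `model` is an XLM_BiLSTM_CRF instance as above):

def count_trainable(model):
    # After freezing, the XLM encoder should contribute zero trainable
    # parameters; only the BiLSTM, classifier and CRF should remain.
    frozen = sum(p.numel() for p in model.parameters() if not p.requires_grad)
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('frozen: {:,} | trainable: {:,}'.format(frozen, trainable))

count_trainable(model)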
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score


def evaluate(model, dev_iter, params):
    model.eval()
    aver_loss = 0
    preds, labels = [], []
    with torch.no_grad():
        for sentence, tags in dev_iter:
            sentence = sentence.long().transpose(0, 1).to(device)
            tags = tags.long().to(device)
            lengths = torch.LongTensor([params.max_len] * sentence.size(1)).to(device)

            # pred is a list of bs decoded label sequences, each max_len long.
            pred = model(sentence, lengths)
            loss = model.log_likelihood(sentence, lengths, tags)
            aver_loss += loss.item()

            # Flatten predictions and gold tags into one long label list each.
            for i in pred:
                preds += i
            for i in tags.tolist():
                labels += i

    aver_loss /= (len(dev_iter) * params.batch_size)
    precision = precision_score(labels, preds, average='macro')
    recall = recall_score(labels, preds, average='macro')
    f1 = f1_score(labels, preds, average='macro')
    report = classification_report(labels, preds)
    print(report)
    logging.info('\nEvaluation - loss: {:.6f} precision: {:.4f} recall: {:.4f} f1: {:.4f}\n'
                 .format(aver_loss, precision, recall, f1))
    return precision, recall, f1
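As a side note on the metric itself, here is a toy check of the macro averaging I'm using, just to show the input format sklearn expects (two flat integer label lists; the numbers are made up for illustration):

from sklearn.metrics import f1_score

# 3 classes; macro F1 averages per-class F1 scores without class weighting.
labels = [0, 0, 1, 2, 2, 2]
preds  = [0, 0, 1, 2, 1, 2]
print(f1_score(labels, preds, average='macro'))  # -> 0.8222...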
During training the model's loss decreases, but the F1 score stays stuck at 0.073, so it looks like the decreasing loss isn't helping the model predict the correct entity labels.
I'm confused and don't understand why this happens. Could anyone help? Thanks a lot.