Overview
This post records some of my experience and methods for using the BERT model (paper link). The paper describes two ways to use BERT. The first is fine-tuning; since I work in PyTorch rather than TensorFlow, I build on the PyTorch-Bert implementation on GitHub. The second is the feature-extraction approach, which treats BERT like pretrained word embeddings. I used to save the extracted features to disk, but extraction was too slow, the files were too large, and the feature files were missing at actual test time. To avoid these problems, I now load the BERT model directly and extract features on the fly during training.
Modularization
For ease of use, I wrapped the BERT model in module classes, where the ScalarMix class computes a weighted sum of several vectors. Please install the pytorch-pretrained-bert package and download the model and vocabulary in advance (links are given at the end of this post). The two main classes are described briefly below.
The Bert_Embedding class: its initialization arguments are the pretrained BERT model, the number of layers to extract features from, counted from the end (e.g. 4 means the last four layers, which are combined by a weighted sum and scaled by a coefficient, as in ELMo), and the dimension of the BERT vectors (e.g. 768). The last argument controls whether to fine-tune; the feature-extraction approach does not fine-tune by default. This class is used for the as-feature method.
The Bert_Encoder class: its initialization arguments are similar, so I will not repeat them; the difference is that it outputs only the last layer. This class is used for the fine-tuning method.
The forward inputs and outputs of the two classes are alike. subword_idxs holds each subword's index in the BERT vocabulary, subword_masks is the mask over the subword sequence, and token_starts_masks marks which subword each original word corresponds to; currently only mapping each word to its first subword is implemented. The output of forward is one vector per original word, with shape [batch, maxlen, bert_dim].
import torch
import torch.nn.functional as F
from torch.nn import Parameter


class ScalarMix(torch.nn.Module):
    """Softmax-weighted sum of several tensors, scaled by a learnable
    coefficient gamma (the same trick ELMo uses to mix its layers)."""

    def __init__(self, mixture_size=3):
        super(ScalarMix, self).__init__()
        self.mixture_size = mixture_size
        # one unnormalized weight per layer; softmax-normalized in forward
        self.scalar_parameters = Parameter(torch.zeros(mixture_size))
        # global scaling coefficient
        self.gamma = Parameter(torch.tensor(1.0))

    def forward(self, layers):
        normed_weights = F.softmax(self.scalar_parameters, dim=0)
        return self.gamma * sum(
            weight * tensor for weight, tensor in zip(normed_weights, layers)
        )
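For instance, a quick sanity check of ScalarMix (the shapes here are purely illustrative):

mix = ScalarMix(mixture_size=4)
# four layer outputs, each of shape [batch, seq_len, dim]
layers = [torch.randn(2, 5, 768) for _ in range(4)]
mixed = mix(layers)  # [2, 5, 768]: softmax-weighted sum of the layers, scaled by gamma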
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from pytorch_pretrained_bert.modeling import BertModel

from scalarmix import ScalarMix


class Bert_Embedding(nn.Module):
    def __init__(self, bert_path, bert_layer, bert_dim, freeze=True):
        super(Bert_Embedding, self).__init__()
        self.bert_layer = bert_layer
        self.bert = BertModel.from_pretrained(bert_path)
        self.scalar_mix = ScalarMix(bert_layer)
        if freeze:
            self.freeze()

    def forward(self, subword_idxs, subword_masks, token_starts_masks):
        # keep BERT in eval mode (dropout off): we only extract features here
        self.eval()
        sen_lens = token_starts_masks.sum(dim=1)
        bert_outs, _ = self.bert(
            subword_idxs,
            token_type_ids=None,
            attention_mask=subword_masks,
            output_all_encoded_layers=True,
        )
        # take the last `bert_layer` layers and mix them
        bert_outs = bert_outs[-self.bert_layer:]
        bert_outs = self.scalar_mix(bert_outs)
        # keep only the first subword of each original word, then re-pad per sentence
        bert_outs = torch.split(bert_outs[token_starts_masks], sen_lens.tolist())
        bert_outs = pad_sequence(bert_outs, batch_first=True)
        return bert_outs

    def freeze(self):
        for para in self.bert.parameters():
            para.requires_grad = False
class Bert_Encoder(nn.Module):
    def __init__(self, bert_path, bert_dim, freeze=False):
        super(Bert_Encoder, self).__init__()
        self.bert_dim = bert_dim
        self.bert = BertModel.from_pretrained(bert_path)
        if freeze:
            self.freeze()

    def forward(self, subword_idxs, subword_masks, token_starts_masks):
        sen_lens = token_starts_masks.sum(dim=1)
        # only the last encoder layer is returned here
        bert_outs, _ = self.bert(
            subword_idxs,
            token_type_ids=None,
            attention_mask=subword_masks,
            output_all_encoded_layers=False,
        )
        # keep only the first subword of each original word, then re-pad per sentence
        bert_outs = torch.split(bert_outs[token_starts_masks], sen_lens.tolist())
        bert_outs = pad_sequence(bert_outs, batch_first=True)
        return bert_outs

    def freeze(self):
        for para in self.bert.parameters():
            para.requires_grad = False
Preparing the tokenizer
Because BERT operates on subwords, it splits some words into multiple word pieces, so the inputs and outputs no longer align with the original tokens. (Chinese probably does not have this problem, but I have not tried it.) Following the paper, when a word is split we take the output of its first subword as its representation (taking the last one, or averaging, should also work). We use the built-in BertTokenizer class, imported as shown below, mainly to split words into pieces and convert them to indices.
from pytorch_pretrained_bert.tokenization import BertTokenizer
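As a quick illustration of the misalignment (a sketch only; the vocabulary path is a placeholder, and the exact split depends on the vocabulary file):

tokenizer = BertTokenizer.from_pretrained(
    "/path/to/bert-base-cased-vocab.txt", do_lower_case=False
)
pieces = tokenizer.tokenize("jiangwei")  # one out-of-vocabulary word becomes several pieces,
print(pieces)                            # e.g. something like ['jiang', '##wei']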
Another pitfall I ran into is that BERT ignores certain whitespace and special characters entirely; I hit many of these in German and French corpora. To preserve the alignment, we insert a "[PAD]" token at such positions. Below is an example Vocab class for producing BERT's inputs.
class Vocab(object):
    def __init__(self, bert_vocab_path):
        self.tokenizer = BertTokenizer.from_pretrained(
            bert_vocab_path, do_lower_case=False
        )

    def numericalize(self, seqs, training=True):
        subwords, masks, starts = [], [], []
        for seq in seqs:
            seq = [self.tokenizer.tokenize(token) for token in seq]
            # tokens the tokenizer drops entirely become "[PAD]" to keep alignment
            seq = [piece if piece else ["[PAD]"] for piece in seq]
            seq = [["[CLS]"]] + seq + [["[SEP]"]]
            lengths = [0] + [len(piece) for piece in seq]
            # flatten the word pieces
            tokens = sum(seq, [])
            # subword indexes
            token_idx = torch.tensor(self.tokenizer.convert_tokens_to_ids(tokens))
            subwords.append(token_idx)
            # subword masks
            mask = torch.ones(len(tokens))
            masks.append(mask)
            # record the start position of each original word;
            # [1:-2] drops the [CLS] position and the trailing [SEP] entries
            start_idxs = torch.tensor(lengths).cumsum(0)[1:-2]
            # subword start masks
            start_mask = torch.zeros(len(tokens), dtype=torch.uint8)
            start_mask[start_idxs] = 1
            starts.append(start_mask)
        return subwords, masks, starts
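A small sanity check of numericalize (a sketch; the vocabulary path is again a placeholder, and the number of subwords depends on the vocabulary file):

vocab = Vocab("/path/to/bert-base-cased-vocab.txt")
subwords, masks, starts = vocab.numericalize([["I", "am", "jiangwei"]])
# subwords[0]: indices of [CLS] + all word pieces + [SEP]
# starts[0]:   1 at the first piece of each original word, 0 at [CLS], [SEP],
#              and continuation pieces, so starts[0].sum() == 3 (the sentence length)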
Fine-tuning method
The code below implements a simple sequence labeling model based on BERT fine-tuning; the snippet corresponds to one forward pass of the whole model.
class Fine_Tuning_Tagger(nn.Module):
    def __init__(self, bert_path, bert_dim, n_tag):
        super(Fine_Tuning_Tagger, self).__init__()
        self.bert_encoder = Bert_Encoder(bert_path, bert_dim)
        self.classifier = nn.Linear(bert_dim, n_tag)

    def forward(self, subword_idxs, subword_masks, token_starts):
        bert_outs = self.bert_encoder(subword_idxs, subword_masks, token_starts)
        out = self.classifier(bert_outs)
        sen_lens = token_starts.sum(dim=1)
        max_len = max(sen_lens)
        # this mask may be needed to compute the loss or to make predictions
        mask = torch.arange(max_len).to(sen_lens.device) < sen_lens.unsqueeze(-1)
        return out
if __name__ == "__main__":
    bert_path = "/data/wjiang/data/bert/bert-base-cased.tar.gz"
    bert_vocab_path = "/data/wjiang/data/bert/bert-base-cased-vocab.txt"
    bert_dim = 768
    n_tag = 10  # assume 10 tags
    vocab = Vocab(bert_vocab_path)
    examples = [["I", "am", "jiangwei"], ["Hello"]]
    subword_idxs, subword_masks, token_starts_masks = vocab.numericalize(examples)
    subword_idxs = pad_sequence(subword_idxs, batch_first=True)
    subword_masks = pad_sequence(subword_masks, batch_first=True)
    token_starts_masks = pad_sequence(token_starts_masks, batch_first=True)
    model = Fine_Tuning_Tagger(bert_path, bert_dim, n_tag)
    out = model(subword_idxs, subword_masks, token_starts_masks)
    print(out.shape)
    # compute the loss or make predictions from `out` ...
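For instance, a hedged sketch of turning `out` and the length mask into a token-level cross-entropy loss, continuing from the block above (`gold_tags` here is hypothetical random data standing in for real gold labels):

import torch.nn.functional as F

gold_tags = torch.randint(n_tag, out.shape[:2])  # hypothetical stand-in for gold labels
sen_lens = token_starts_masks.sum(dim=1)
mask = torch.arange(out.size(1)).unsqueeze(0) < sen_lens.unsqueeze(-1)
# average cross-entropy over the real (non-padded) positions only
loss = F.cross_entropy(out[mask], gold_tags[mask])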
As-feature method
If you want to extract the features to disk instead, see the BERT documentation for details; I may add this later.
The program below implements a simple sequence labeling model based on the as-feature method. It first obtains the BERT feature vectors, concatenates them after the word embeddings, and finally feeds the result into an LSTM.
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


class As_Feature_Tagger(nn.Module):
    def __init__(
        self,
        bert_path,
        bert_dim,
        bert_layer,
        word_dim,
        n_word,
        lstm_dim,
        lstm_layer,
        n_tag,
    ):
        super(As_Feature_Tagger, self).__init__()
        self.bert_embedding = Bert_Embedding(bert_path, bert_layer, bert_dim)
        self.word_embedding = nn.Embedding(n_word, word_dim)
        self.lstm = nn.LSTM(
            input_size=word_dim + bert_dim,
            hidden_size=lstm_dim // 2,
            bidirectional=True,
            num_layers=lstm_layer,
        )
        self.classifier = nn.Linear(lstm_dim, n_tag)

    def forward(self, subword_idxs, subword_masks, token_starts, word_idxs):
        bert_outs = self.bert_embedding(subword_idxs, subword_masks, token_starts)
        word_emb = self.word_embedding(word_idxs)
        emb = torch.cat([word_emb, bert_outs], dim=-1)
        sen_lens = token_starts.sum(dim=1)
        max_len = max(sen_lens)
        # this mask may be needed to compute the loss or to make predictions
        mask = torch.arange(max_len).to(sen_lens.device) < sen_lens.unsqueeze(-1)
        # sort by length for pack_padded_sequence (no longer needed as of
        # PyTorch 1.1, which accepts unsorted input via enforce_sorted=False)
        sorted_lens, sorted_idx = torch.sort(sen_lens, dim=0, descending=True)
        reverse_idx = torch.sort(sorted_idx, dim=0)[1]
        emb = emb[sorted_idx]
        lstm_input = pack_padded_sequence(emb, sorted_lens, batch_first=True)
        lstm_out, _ = self.lstm(lstm_input, None)
        lstm_out, _ = pad_packed_sequence(lstm_out, batch_first=True)
        # restore the original batch order
        lstm_out = lstm_out[reverse_idx]
        out = self.classifier(lstm_out)
        return out
if __name__ == "__main__":
    bert_path = "/data/wjiang/data/bert/bert-base-cased.tar.gz"
    bert_vocab_path = "/data/wjiang/data/bert/bert-base-cased-vocab.txt"
    bert_dim = 768
    bert_layer = 4
    word_dim = 50
    lstm_dim = 200
    lstm_layer = 2
    n_tag = 10  # assume 10 tags
    vocab = Vocab(bert_vocab_path)
    examples = [["I", "am", "jiangwei"], ["Hello"]]
    words = sorted(set(sum(examples, [])))
    words = ["<PAD>"] + words
    word2id = {w: i for i, w in enumerate(words)}
    subword_idxs, subword_masks, token_starts_masks = vocab.numericalize(examples)
    word_idxs = [torch.tensor([word2id[w] for w in seq]) for seq in examples]
    subword_idxs = pad_sequence(subword_idxs, batch_first=True)
    subword_masks = pad_sequence(subword_masks, batch_first=True)
    token_starts_masks = pad_sequence(token_starts_masks, batch_first=True)
    word_idxs = pad_sequence(word_idxs, batch_first=True)
    model = As_Feature_Tagger(
        bert_path,
        bert_dim,
        bert_layer,
        word_dim,
        len(words),
        lstm_dim,
        lstm_layer,
        n_tag,
    )
    out = model(subword_idxs, subword_masks, token_starts_masks, word_idxs)
    print(out.shape)
    # compute the loss or make predictions from `out` ...
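Similarly, a hedged sketch of masked prediction from `out`, continuing from the block above:

sen_lens = token_starts_masks.sum(dim=1)
preds = out.argmax(dim=-1)  # [batch, maxlen] predicted tag indices
for pred, length in zip(preds, sen_lens):
    print(pred[:length].tolist())  # drop each sentence's padded tail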
Fine-tuning on unlabeled data
See the official documentation for details. For the PyTorch version, refer to https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/lm_finetuning; I have not actually tested it.
Practical experience
I tried both methods on my UCCA Parser (4,100 training sentences, batch size 10), and the speed is acceptable. Training without BERT takes a bit under 6 minutes; with base BERT and the as-feature method it takes about 7 minutes, with no noticeable increase in GPU memory. With fine-tuning, GPU memory usage is considerably higher and training takes about 8.5 minutes; I have no good solution for this yet.
As for final results, the as-feature method is essentially on par with my earlier save-to-disk approach: 0.4% higher on one test set and 0.2% lower on the other. The fine-tuning method is about 0.2% lower than as-feature; the learning rate should be kept as small as possible (e.g. 2e-5), and I have not yet tuned the batch size due to GPU memory constraints.
References
PRETRAINED_MODEL_ARCHIVE_MAP = {
    "bert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz",
    "bert-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased.tar.gz",
    "bert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased.tar.gz",
    "bert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased.tar.gz",
    "bert-base-multilingual-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased.tar.gz",
    "bert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased.tar.gz",
    "bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz",
}
PRETRAINED_VOCAB_ARCHIVE_MAP = {
    "bert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
    "bert-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
    "bert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt",
    "bert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt",
    "bert-base-multilingual-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt",
    "bert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
    "bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt",
}