二、用于预训练BERT的数据集( 四 )

< 0.5:is_next = Trueelse:# 先随机选取一段,再从这段中随机选取一个句子next_sentence = random.choice(random.choice(paragraphs))is_next = Falsereturn sentence, next_sentence, is_nextdef _get_nsp_data_from_paragraph(paragraph, paragraphs, vocab, max_len):"""生成用于下一句预测的训练样本"""nsp_data_from_paragraph = []for i in range(len(paragraph) - 1):tokens_a, tokens_b, is_next = _get_next_sentence(paragraph[i], paragraph[i + 1], paragraphs)# 考虑1个''词元和2个''词元if len(tokens_a) + len(tokens_b) + 3 > max_len:continuetokens, segments = d2l.get_tokens_and_segments(tokens_a, tokens_b)nsp_data_from_paragraph.append((tokens, segments, is_next))return nsp_data_from_paragraph
3.生成遮蔽语言模型任务的数据
函数_replace_mlm_tokens的输入:tokens是表示BERT输入序列的词元的列表,candidate_pred_positions是不包括特殊词元的BERT输入序列的词元索引的列表(特殊词元在遮蔽语言模型任务中不被预测),num_mlm_preds指预测的数量(选择15%要预测的随机词元) 。
函数返回替换后的输入词元、发生预测的词元索引和这些预测的标签(被替换掉的真实词元的索引和词元) 。
def _replace_mlm_tokens(tokens, candidate_pred_positions, num_mlm_preds, vocab):
    """Corrupt a token sequence for the masked language modeling (MLM) task.

    Candidate positions are visited in random order until ``num_mlm_preds``
    of them have been selected. Each selected token is then:

    * replaced by the ``'<mask>'`` token with probability 0.8,
    * left unchanged with probability 0.1,
    * replaced by a random token from ``vocab.idx_to_token`` with
      probability 0.1.

    Args:
        tokens: List of string tokens forming the input sequence.
        candidate_pred_positions: Indices into ``tokens`` that are eligible
            for prediction. The caller's list is not modified.
        num_mlm_preds: Maximum number of positions to select.
        vocab: Object exposing ``idx_to_token``, a sequence of tokens from
            which random replacements are drawn.

    Returns:
        A pair ``(mlm_input_tokens, pred_positions_and_labels)``:
        ``mlm_input_tokens`` is a new list with the replacements applied, and
        ``pred_positions_and_labels`` is a list of
        ``(position, original_token)`` tuples in selection order.
    """
    # Work on copies so neither of the caller's lists is mutated.
    mlm_input_tokens = list(tokens)
    shuffled_positions = list(candidate_pred_positions)
    pred_positions_and_labels = []
    # Shuffling gives a uniformly random subset when we take a prefix.
    random.shuffle(shuffled_positions)
    for mlm_pred_position in shuffled_positions:
        if len(pred_positions_and_labels) >= num_mlm_preds:
            break
        if random.random() < 0.8:
            # 80% of the time: replace the token with the mask token.
            masked_token = '<mask>'
        elif random.random() < 0.5:
            # 10% of the time: keep the token unchanged.
            masked_token = tokens[mlm_pred_position]
        else:
            # 10% of the time: replace the token with a random one.
            masked_token = random.choice(vocab.idx_to_token)
        mlm_input_tokens[mlm_pred_position] = masked_token
        # The label is always the original (uncorrupted) token.
        pred_positions_and_labels.append(
            (mlm_pred_position, tokens[mlm_pred_position]))
    return mlm_input_tokens, pred_positions_and_labels

二、用于预训练BERT的数据集

文章插图
def _get_mlm_data_from_tokens(tokens, vocab):
    """Build masked language modeling (MLM) inputs and labels for one sequence.

    Args:
        tokens: List of string tokens, including the special ``'<cls>'`` and
            ``'<sep>'`` tokens.
        vocab: Vocabulary object. It is indexed with a list of tokens here
            (presumably returning the matching list of ids -- confirm against
            the vocabulary class) and is passed on to ``_replace_mlm_tokens``.

    Returns:
        A triple ``(input_ids, pred_positions, label_ids)``:
        ``vocab[...]`` of the corrupted tokens, the prediction positions in
        ascending order, and ``vocab[...]`` of the original tokens at those
        positions.
    """
    # Special tokens are never predicted in the MLM task.
    special_tokens = ('<cls>', '<sep>')
    candidate_pred_positions = [
        i for i, token in enumerate(tokens) if token not in special_tokens
    ]
    # Predict 15% of the tokens, but always at least one.
    num_mlm_preds = max(1, round(len(tokens) * 0.15))
    mlm_input_tokens, pred_positions_and_labels = _replace_mlm_tokens(
        tokens, candidate_pred_positions, num_mlm_preds, vocab)
    # Selection order is random; sort by position for a stable layout.
    pred_positions_and_labels = sorted(pred_positions_and_labels,
                                       key=lambda x: x[0])
    pred_positions = [v[0] for v in pred_positions_and_labels]
    mlm_pred_labels = [v[1] for v in pred_positions_and_labels]
    return vocab[mlm_input_tokens], pred_positions, vocab[mlm_pred_labels]
4.将文本转换为预训练数据集
定义辅助函数_pad_bert_inputs,将特殊的“<pad>”词元附加到输入。其参数examples包含来自两个预训练任务的辅助函数_get_nsp_data_from_paragraph和_get_mlm_data_from_tokens的输出 。
def _pad_bert_inputs(examples, max_len, vocab):
    """Pad BERT pretraining examples to fixed lengths and convert to tensors.

    Token ids and segment ids are padded to ``max_len``. The MLM prediction
    positions, weights and labels are padded to ``round(max_len * 0.15)``.

    Args:
        examples: Iterable of
            ``(token_ids, pred_positions, mlm_pred_label_ids, segments,
            is_next)`` tuples, where the first four items are lists of ints
            and ``is_next`` is the next-sentence label.
        max_len: Target length of the padded token and segment sequences.
        vocab: Mapping from token to id; ``vocab['<pad>']`` is used as the
            padding token id.

    Returns:
        A 7-tuple of lists with one tensor per example:
        ``(all_token_ids, all_segments, valid_lens, all_pred_positions,
        all_mlm_weights, all_mlm_labels, nsp_labels)``.
    """
    max_num_mlm_preds = round(max_len * 0.15)
    pad_id = vocab['<pad>']
    all_token_ids, all_segments, valid_lens = [], [], []
    all_pred_positions, all_mlm_weights, all_mlm_labels = [], [], []
    nsp_labels = []
    for (token_ids, pred_positions, mlm_pred_label_ids, segments,
         is_next) in examples:
        num_labels = len(mlm_pred_label_ids)
        all_token_ids.append(torch.tensor(
            token_ids + [pad_id] * (max_len - len(token_ids)),
            dtype=torch.long))
        all_segments.append(torch.tensor(
            segments + [0] * (max_len - len(segments)), dtype=torch.long))
        # Number of real (non-padding) tokens in this example.
        valid_lens.append(torch.tensor(len(token_ids), dtype=torch.float32))
        all_pred_positions.append(torch.tensor(
            pred_positions + [0] * (max_num_mlm_preds - len(pred_positions)),
            dtype=torch.long))
        # Predictions at padded slots are filtered out of the loss by
        # multiplying with a zero weight. Weights and labels use the same
        # length so they always line up.
        all_mlm_weights.append(torch.tensor(
            [1.0] * num_labels + [0.0] * (max_num_mlm_preds - num_labels),
            dtype=torch.float32))
        all_mlm_labels.append(torch.tensor(
            mlm_pred_label_ids + [0] * (max_num_mlm_preds - num_labels),
            dtype=torch.long))
        nsp_labels.append(torch.tensor(is_next, dtype=torch.long))
    return (all_token_ids, all_segments, valid_lens, all_pred_positions,
            all_mlm_weights, all_mlm_labels, nsp_labels)