Hugging Face Series | Sentence-Pair Classification


Task: use a BERT-family model for sentence-pair classification.

Dataset: MRPC from the GLUE benchmark.

!pip install datasets==1.0.1
!pip install transformers==3.1.0
import torch
import torch.nn as nn
import os
import matplotlib.pyplot as plt
import copy
import torch.optim as optim
import random
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup
from datasets import load_dataset, load_metric

Download the dataset

dataset = load_dataset('glue', 'mrpc')
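The returned DatasetDict bundles the MRPC train/validation/test splits together with their column features; a quick print is enough to inspect them (a minimal sketch):

print(dataset)                     # shows the three splits and their sizes
print(dataset['train'].features)   # sentence1, sentence2, label, idx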

Split the dataset

split = dataset['train'].train_test_split(test_size=0.1, seed=1)
train = split['train']  # 90 % of the original training data
val = split['test']   # 10 % of the original training data
test = dataset['validation']
# Transform data into pandas dataframes
df_train = pd.DataFrame(train)
df_val = pd.DataFrame(val)
df_test = pd.DataFrame(test)
df_train.head()

As df_train.head() shows, each row contains sentence1, sentence2, and label (plus the idx column added by the GLUE loader, which accounts for the four columns below).

print(df_train.shape)  # (3301, 4)
print(df_val.shape)  # (367, 4)
print(df_test.shape)  # (408, 4)
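As a quick sanity check before training, it is worth looking at the label balance of the splits built above (a minimal sketch; in MRPC, label 1 marks a paraphrase pair):

# Fraction of each label in the train and validation splits
print(df_train['label'].value_counts(normalize=True))
print(df_val['label'].value_counts(normalize=True))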

Define the hyperparameters

bert_model = "albert-base-v2"  # 'albert-base-v2', 'albert-large-v2', 'albert-xlarge-v2', 'albert-xxlarge-v2', 'bert-base-uncased', ...
freeze_bert = False  # if True, freeze the BERT parameters and train only the classification head
maxlen = 128
bs = 16  # batch size
iters_to_accumulate = 2  # number of mini-batches over which gradients are accumulated
lr = 2e-5
epochs = 4
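Because gradients are accumulated over iters_to_accumulate mini-batches before each optimizer step, the optimizer effectively sees a larger batch than bs; a one-line sketch makes this explicit:

# Effective batch size per optimizer update with gradient accumulation
effective_bs = bs * iters_to_accumulate  # 16 * 2 = 32
print("Effective batch size:", effective_bs)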

Define the dataset with a Dataset subclass

class CustomDataset(Dataset):

    def __init__(self, data, maxlen, with_labels=True, bert_model='albert-base-v2'):
        self.data = data  # pandas dataframe
        self.tokenizer = AutoTokenizer.from_pretrained(bert_model)  # initialize the tokenizer
        self.maxlen = maxlen
        self.with_labels = with_labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        sent1 = str(self.data.loc[index, 'sentence1'])
        sent2 = str(self.data.loc[index, 'sentence2'])
        # Tokenize the sentence pair to get token ids, attention mask and token type ids
        encoded_pair = self.tokenizer(sent1,
                                      sent2,
                                      padding='max_length',    # pad to max_length
                                      truncation=True,         # truncate to max_length
                                      max_length=self.maxlen,
                                      return_tensors='pt')     # return torch.Tensor objects
        token_ids = encoded_pair['input_ids'].squeeze(0)             # tensor of token ids
        attn_masks = encoded_pair['attention_mask'].squeeze(0)       # 0 at padding positions
        token_type_ids = encoded_pair['token_type_ids'].squeeze(0)   # 0 for sentence1 tokens, 1 for sentence2 tokens
        if self.with_labels:  # True when the dataset carries labels
            label = self.data.loc[index, 'label']
            return token_ids, attn_masks, token_type_ids, label
        else:
            return token_ids, attn_masks, token_type_ids

train_set = CustomDataset(df_train, maxlen, bert_model=bert_model)
val_set = CustomDataset(df_val, maxlen, bert_model=bert_model)
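To verify the dataset wiring, one can pull a single example and check the tensor shapes; with maxlen=128, each of the three tensors should have shape (128,) (a minimal sketch):

sample_ids, sample_mask, sample_types, sample_label = train_set[0]
print(sample_ids.shape, sample_mask.shape, sample_types.shape, sample_label)
# expected: torch.Size([128]) torch.Size([128]) torch.Size([128]) plus an integer label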

Define the model

class SentencePairClassifier(nn.Module):

    def __init__(self, bert_model="albert-base-v2", freeze_bert=False):
        super(SentencePairClassifier, self).__init__()
        self.bert_layer = AutoModel.from_pretrained(bert_model)

        # Hidden size of the pooler output for each backbone
        if bert_model == "albert-base-v2":       # 12M parameters
            hidden_size = 768
        elif bert_model == "albert-large-v2":    # 18M parameters
            hidden_size = 1024
        elif bert_model == "albert-xlarge-v2":   # 60M parameters
            hidden_size = 2048
        elif bert_model == "albert-xxlarge-v2":  # 235M parameters
            hidden_size = 4096
        elif bert_model == "bert-base-uncased":  # 110M parameters
            hidden_size = 768

        # If BERT is frozen, disable gradients for all of its parameters
        # so that only the classification head is trained
        if freeze_bert:
            for p in self.bert_layer.parameters():
                p.requires_grad = False

        self.cls_layer = nn.Linear(hidden_size, 1)
        self.dropout = nn.Dropout(p=0.1)

    @autocast()  # automatic mixed precision
    def forward(self, input_ids, attn_masks, token_type_ids):
        # transformers 3.1.0 returns a tuple: (sequence_output, pooler_output)
        cont_reps, pooler_output = self.bert_layer(input_ids, attn_masks, token_type_ids)
        logits = self.cls_layer(self.dropout(pooler_output))
        return logits
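Before training, it can be useful to confirm how many parameters will actually receive gradients, especially when experimenting with freeze_bert=True (a minimal sketch; tmp_net is a throwaway instance used only for this check):

tmp_net = SentencePairClassifier(bert_model, freeze_bert=freeze_bert)
n_trainable = sum(p.numel() for p in tmp_net.parameters() if p.requires_grad)
print("Trainable parameters: {:,}".format(n_trainable))
del tmp_net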

Set the random seeds

def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

Evaluation code

def evaluate_loss(net, device, criterion, dataloader):
    net.eval()
    mean_loss = 0
    count = 0
    with torch.no_grad():
        for it, (seq, attn_masks, token_type_ids, labels) in enumerate(tqdm(dataloader)):
            seq, attn_masks, token_type_ids, labels = \
                seq.to(device), attn_masks.to(device), token_type_ids.to(device), labels.to(device)
            logits = net(seq, attn_masks, token_type_ids)
            mean_loss += criterion(logits.squeeze(-1), labels.float()).item()
            count += 1
    return mean_loss / count

!mkdir models

Training code

def train_bert(net, criterion, opti, lr, lr_scheduler, train_loader, val_loader, epochs, iters_to_accumulate):

    best_loss = np.Inf
    best_ep = 1
    nb_iterations = len(train_loader)
    print_every = nb_iterations // 5  # print the loss 5 times per epoch
    iters = []
    train_losses = []
    val_losses = []

    scaler = GradScaler()

    for ep in range(epochs):

        net.train()
        running_loss = 0.0
        for it, (seq, attn_masks, token_type_ids, labels) in enumerate(tqdm(train_loader)):

            # Move tensors to the GPU
            seq, attn_masks, token_type_ids, labels = \
                seq.to(device), attn_masks.to(device), token_type_ids.to(device), labels.to(device)

            with autocast():
                logits = net(seq, attn_masks, token_type_ids)

                # Compute the loss
                loss = criterion(logits.squeeze(-1), labels.float())
                loss = loss / iters_to_accumulate  # Normalize the loss because it is averaged

            scaler.scale(loss).backward()

            if (it + 1) % iters_to_accumulate == 0:
                # Optimization step
                # scaler.step() first unscales the gradients of the optimizer's assigned params.
                # If these gradients do not contain infs or NaNs, opti.step() is then called,
                # otherwise, opti.step() is skipped.
                scaler.step(opti)
                # Updates the scale for next iteration.
                scaler.update()
                # Adjust the learning rate based on the number of iterations.
                lr_scheduler.step()
                # Clear gradients
                opti.zero_grad()

            running_loss += loss.item()

            if (it + 1) % print_every == 0:
                print()
                print("Iteration {}/{} of epoch {} complete. Loss : {} "
                      .format(it + 1, nb_iterations, ep + 1, running_loss / print_every))
                running_loss = 0.0

        val_loss = evaluate_loss(net, device, criterion, val_loader)  # Compute validation loss
        print()
        print("Epoch {} complete! Validation Loss : {}".format(ep + 1, val_loss))

        if val_loss < best_loss:
            print("Best validation loss improved from {} to {}".format(best_loss, val_loss))
            print()
            net_copy = copy.deepcopy(net)  # save a copy of the best model so far
            best_loss = val_loss
            best_ep = ep + 1

    # Saving the best model after all epochs
    path_to_model = 'models/{}_lr_{}_val_loss_{}_ep_{}.pt'.format(bert_model, lr, round(best_loss, 5), best_ep)
    torch.save(net_copy.state_dict(), path_to_model)
    print("The model has been saved in {}".format(path_to_model))

    del loss
    torch.cuda.empty_cache()
set_seed(1)

# Build the datasets and dataloaders
train_set = CustomDataset(df_train, maxlen, bert_model=bert_model)
val_set = CustomDataset(df_val, maxlen, bert_model=bert_model)
train_loader = DataLoader(train_set, batch_size=bs, num_workers=5)
val_loader = DataLoader(val_set, batch_size=bs, num_workers=5)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
net = SentencePairClassifier(bert_model, freeze_bert=freeze_bert)

if torch.cuda.device_count() > 1:  # if multiple GPUs are available
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    net = nn.DataParallel(net)
net.to(device)

criterion = nn.BCEWithLogitsLoss()
opti = AdamW(net.parameters(), lr=lr, weight_decay=1e-2)
num_warmup_steps = 0  # number of warmup steps for the scheduler
t_total = (len(train_loader) // iters_to_accumulate) * epochs  # total number of optimizer steps, accounting for gradient accumulation
lr_scheduler = get_linear_schedule_with_warmup(optimizer=opti, num_warmup_steps=num_warmup_steps, num_training_steps=t_total)

train_bert(net, criterion, opti, lr, lr_scheduler, train_loader, val_loader, epochs, iters_to_accumulate)

Epoch 4 complete! Validation Loss : 0.36528593172197754

The model has been saved in models/albert-base-v2_lr_2e-05_val_loss_0.35007_ep_3.pt
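For reference, with 3301 training rows, bs=16 and iters_to_accumulate=2, each epoch yields ceil(3301/16) = 207 mini-batches, and the scheduler is stepped (207 // 2) * 4 = 412 times over the four epochs; this can be confirmed directly:

print(len(train_loader))  # 207 mini-batches per epoch
print(t_total)            # 412 scheduler/optimizer steps in total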

def get_probs_from_logits(logits):
    """
    Converts a tensor of logits into an array of probabilities by applying the sigmoid function
    """
    probs = torch.sigmoid(logits.unsqueeze(-1))
    return probs.detach().cpu().numpy()

def test_prediction(net, device, dataloader, with_labels=True, result_file="results/output.txt"):
    """
    Predict the probabilities on a dataset with or without labels and write the results to a file
    """
    net.eval()
    w = open(result_file, 'w')
    probs_all = []

    with torch.no_grad():
        if with_labels:
            for seq, attn_masks, token_type_ids, _ in tqdm(dataloader):
                seq, attn_masks, token_type_ids = seq.to(device), attn_masks.to(device), token_type_ids.to(device)
                logits = net(seq, attn_masks, token_type_ids)
                probs = get_probs_from_logits(logits.squeeze(-1)).squeeze(-1)
                probs_all += probs.tolist()
        else:
            for seq, attn_masks, token_type_ids in tqdm(dataloader):
                seq, attn_masks, token_type_ids = seq.to(device), attn_masks.to(device), token_type_ids.to(device)
                logits = net(seq, attn_masks, token_type_ids)
                probs = get_probs_from_logits(logits.squeeze(-1)).squeeze(-1)
                probs_all += probs.tolist()

    w.writelines(str(prob) + '\n' for prob in probs_all)
    w.close()
!mkdir results

path_to_model = 'models/albert-base-v2_lr_2e-05_val_loss_0.35007_ep_3.pt'
path_to_output_file = 'results/output.txt'

print("Reading test data...")
test_set = CustomDataset(df_test, maxlen, bert_model=bert_model)
test_loader = DataLoader(test_set, batch_size=bs, num_workers=5)

model = SentencePairClassifier(bert_model)
if torch.cuda.device_count() > 1:  # if multiple GPUs are available
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    model = nn.DataParallel(model)

print()
print("Loading the weights of the model...")
model.load_state_dict(torch.load(path_to_model))
model.to(device)

print("Predicting on test data...")
test_prediction(net=model, device=device, dataloader=test_loader, with_labels=True,  # set with_labels=False if you want predictions on a dataset without labels
                result_file=path_to_output_file)
print()
print("Predictions are available in : {}".format(path_to_output_file))
path_to_output_file = 'results/output.txt'  # path to the file with prediction probabilities
labels_test = df_test['label']  # true labels
probs_test = pd.read_csv(path_to_output_file, header=None)[0]  # prediction probabilities

threshold = 0.5  # you can adjust this threshold for your own dataset
preds_test = (probs_test >= threshold).astype('uint8')  # predicted labels using the fixed threshold above

metric = load_metric("glue", "mrpc")  # GLUE MRPC metric: accuracy and F1
metric._compute(predictions=preds_test, references=labels_test)

{'accuracy': 0.875, 'f1': 0.911917098445596}
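The 0.5 cutoff above is a default rather than a tuned value. One option is to pick the threshold that maximizes F1 on the validation split and only then apply it to the test set. The sketch below assumes the validation probabilities have first been written to a hypothetical results/val_output.txt by running test_prediction on val_loader:

from sklearn.metrics import f1_score

probs_val = pd.read_csv('results/val_output.txt', header=None)[0]  # hypothetical file from test_prediction on val_loader
labels_val = df_val['label']
candidate_thresholds = np.arange(0.30, 0.71, 0.05)
best_threshold = max(candidate_thresholds,
                     key=lambda t: f1_score(labels_val, (probs_val >= t).astype('uint8')))
print("Best threshold on the validation split:", best_threshold)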

