Commit d80c5a59 authored by Se Park's avatar Se Park

Need to bugfix

parent 7d305f80
@@ -52,24 +52,26 @@ class LoadData(Dataset):
# Inserting the CLS and SEP tokens at the beginning and end of the sentence
tokens = ["[CLS]"] + src_tokens + ["[SEP]"] + mt_tokens + ["[SEP]"]
# Obtaining the indices of the tokens in the BERT Vocabulary
tokens_ids = self.tokenizer.convert_tokens_to_ids(tokens)
segment_ids = [0]*(len(src_tokens)+2) + [1]*(len(mt_tokens)+1)
attn_mask = [1]*len(tokens_ids)
if len(tokens) < self.maxlen:
# Padding sentences
tokens = tokens + ['[PAD]' for _ in range(self.maxlen - len(tokens))]
segment_ids = segment_ids + [0] * (self.maxlen - len(segment_ids))
padding = [0] * (self.maxlen-len(tokens_ids))
tokens_ids += padding
segment_ids += padding
attn_mask += padding
else:
# Pruning the list to the specified max length
tokens = tokens[:self.maxlen-1] + ['[SEP]']
tokens_ids = tokens_ids[:self.maxlen]
segment_ids = segment_ids[:self.maxlen]
attn_mask = attn_mask[:self.maxlen]
# Obtaining the indices of the tokens in the BERT Vocabulary
tokens_ids = self.tokenizer.convert_tokens_to_ids(tokens)
# Converting the list to a pytorch tensor
tokens_ids_tensor = torch.tensor(tokens_ids)
segment_ids_tensor = torch.tensor(segment_ids, dtype=torch.long)
# Obtaining the attention mask, i.e. a tensor containing 1s for non-padded tokens and 0s for padded ones
attn_mask = (tokens_ids_tensor!=0).long()
return tokens_ids_tensor, segment_ids_tensor, attn_mask, score
tokens_ids = torch.tensor(tokens_ids, dtype=torch.long)
segment_ids = torch.tensor(segment_ids, dtype=torch.long)
attn_mask = torch.tensor(attn_mask, dtype=torch.long)
return tokens_ids, segment_ids, attn_mask, score  # order matches the (token_ids, segment_ids, attn_masks, labels) unpacking in train/evaluate
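
For reference, a minimal sketch of the revised encoding path above, assuming the Hugging Face transformers BertTokenizer; the helper name encode_pair and the example sentences are illustrative only, not part of the repository.

import torch
from transformers import BertTokenizer

def encode_pair(tokenizer, src, mt, maxlen=64):
    # Tokenize source and MT output, then join them BERT-style.
    src_tokens = tokenizer.tokenize(src)
    mt_tokens = tokenizer.tokenize(mt)
    tokens = ["[CLS]"] + src_tokens + ["[SEP]"] + mt_tokens + ["[SEP]"]
    tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
    segment_ids = [0] * (len(src_tokens) + 2) + [1] * (len(mt_tokens) + 1)
    attn_mask = [1] * len(tokens_ids)
    if len(tokens_ids) < maxlen:
        # Pad ids, segment ids and attention mask together with zeros.
        padding = [0] * (maxlen - len(tokens_ids))
        tokens_ids += padding
        segment_ids += padding
        attn_mask += padding
    else:
        # Truncate all three sequences to maxlen.
        tokens_ids = tokens_ids[:maxlen]
        segment_ids = segment_ids[:maxlen]
        attn_mask = attn_mask[:maxlen]
    return (torch.tensor(tokens_ids, dtype=torch.long),
            torch.tensor(segment_ids, dtype=torch.long),
            torch.tensor(attn_mask, dtype=torch.long))

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
ids, segs, mask = encode_pair(tokenizer, "This is the source.", "Dies ist die Quelle.")
print(ids.shape, segs.shape, mask.shape)  # torch.Size([64]) each
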
@@ -21,7 +21,7 @@ def evaluate(model, loss_fn, dataloader, device):
for token_ids, segment_ids, attn_masks, labels in dataloader:
token_ids, segment_ids, attn_masks, labels = token_ids.to(device), segment_ids.to(device), attn_masks.to(device), labels.to(device)
qe_scores = model(token_ids, segment_ids, attn_masks)
loss = loss_fn(qe_scores, labels)
loss = loss_fn(qe_scores.view(-1), labels.float())
qe_scores = qe_scores.detach().cpu().numpy()
qe_scores = qe_scores.reshape((qe_scores.shape[0],))
@@ -29,7 +29,7 @@ def evaluate(model, loss_fn, dataloader, device):
pred = np.concatenate((pred, qe_scores))
ref = np.concatenate((ref, labels))
eval_loss += loss.item()
count += 1
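
A condensed sketch of evaluate() with the reshaped loss above; model.eval(), torch.no_grad() and the pred/ref initialisation are assumed from the parts of the function not shown in this diff.

import numpy as np
import torch

def evaluate(model, loss_fn, dataloader, device):
    model.eval()
    eval_loss, count = 0.0, 0
    pred, ref = np.array([]), np.array([])
    with torch.no_grad():
        for token_ids, segment_ids, attn_masks, labels in dataloader:
            token_ids, segment_ids, attn_masks, labels = (
                token_ids.to(device), segment_ids.to(device),
                attn_masks.to(device), labels.to(device))
            qe_scores = model(token_ids, segment_ids, attn_masks)
            # Flatten the (batch, 1) predictions and cast labels to float so
            # MSELoss compares two vectors of identical shape and dtype.
            loss = loss_fn(qe_scores.view(-1), labels.float())
            scores = qe_scores.detach().cpu().numpy().reshape(-1)
            pred = np.concatenate((pred, scores))
            ref = np.concatenate((ref, labels.cpu().numpy()))
            eval_loss += loss.item()
            count += 1
    return eval_loss / max(count, 1), pred, ref
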
@@ -44,6 +44,7 @@ def train(model, loss_fn, optimizer, train_loader, val_loader, num_epoch, device
for ep in range(num_epoch):
print('======= Epoch {:} ======='.format(ep))
for it, (token_ids, segment_ids, attn_masks, labels) in enumerate(train_loader):
model.train()
# Clear gradients
optimizer.zero_grad()
# Converting these to cuda tensors
@@ -51,7 +52,7 @@ def train(model, loss_fn, optimizer, train_loader, val_loader, num_epoch, device
# Obtaining scores from the model
qe_scores = model(token_ids, segment_ids, attn_masks)
# Computing loss
loss = loss_fn(qe_scores, labels)
loss = loss_fn(qe_scores.view(-1), labels.float())
# Backpropagating the gradients
loss.backward()
nn.utils.clip_grad_norm_(model.parameters(), 1.0)
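
The training change above, isolated into a single hypothetical train_step() for clarity; optimizer.step() is assumed from the lines that follow the clipping call but are not shown here.

import torch

def train_step(model, loss_fn, optimizer, batch, device):
    token_ids, segment_ids, attn_masks, labels = (t.to(device) for t in batch)
    model.train()          # added in this commit so dropout stays active during training
    optimizer.zero_grad()  # clear gradients from the previous iteration
    qe_scores = model(token_ids, segment_ids, attn_masks)
    # Same reshaping as in evaluate(): flat float predictions vs. float labels.
    loss = loss_fn(qe_scores.view(-1), labels.float())
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # cap the gradient norm at 1.0
    optimizer.step()
    return loss.item()
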
@@ -73,6 +74,7 @@ if __name__ == "__main__":
PATH = Path("/vol/bitbucket/shp2918/nlp")
use_cuda = torch.cuda.is_available()
# use_cuda = False
device = torch.device('cuda' if use_cuda else 'cpu')
print("Using GPU: {}".format(use_cuda))
@@ -81,13 +83,13 @@ if __name__ == "__main__":
model.cuda()
loss_fn = nn.MSELoss()
optimizer = optim.AdamW(model.parameters(), lr = 2e-5)
optimizer = optim.Adam(model.parameters(), lr=5e-5)
MAX_LEN = 128
MAX_LEN = 64
train_set = LoadData(src_file=PATH/'data/train.ende.src', mt_file=PATH/'data/train.ende.mt', score_file=PATH/'data/train.ende.scores', maxlen=MAX_LEN)
val_set = LoadData(src_file=PATH/'data/dev.ende.src', mt_file=PATH/'data/dev.ende.mt', score_file=PATH/'data/dev.ende.scores', maxlen=MAX_LEN)
train_loader = DataLoader(train_set, batch_size=32, num_workers=5)
val_loader = DataLoader(val_set, batch_size=32, num_workers=5)
train_loader = DataLoader(train_set, batch_size=32)
val_loader = DataLoader(val_set, batch_size=32)
num_epoch = 4
train(model, loss_fn, optimizer, train_loader, val_loader, num_epoch, device)
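
Putting the revised setup together, roughly; this assumes QualityEstimation, LoadData and train are importable from this repo, and the hidden_dim value is an illustrative guess, since the constructor call is not shown in the diff.

import torch
import torch.nn as nn
import torch.optim as optim
from pathlib import Path
from torch.utils.data import DataLoader

PATH = Path("/vol/bitbucket/shp2918/nlp")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = QualityEstimation(hidden_dim=128).to(device)  # hidden_dim not shown in the diff
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=5e-5)   # was AdamW with lr=2e-5

MAX_LEN = 64                                          # was 128
train_set = LoadData(src_file=PATH/'data/train.ende.src', mt_file=PATH/'data/train.ende.mt',
                     score_file=PATH/'data/train.ende.scores', maxlen=MAX_LEN)
val_set = LoadData(src_file=PATH/'data/dev.ende.src', mt_file=PATH/'data/dev.ende.mt',
                   score_file=PATH/'data/dev.ende.scores', maxlen=MAX_LEN)
train_loader = DataLoader(train_set, batch_size=32)   # num_workers dropped
val_loader = DataLoader(val_set, batch_size=32)

train(model, loss_fn, optimizer, train_loader, val_loader, num_epoch=4, device=device)
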
@@ -10,16 +10,15 @@ class QualityEstimation(nn.Module):
# Instantiating BERT model object
config = BertConfig()
self.bert = BertModel(config)
self.bert = BertModel.from_pretrained('bert-base-multilingual-cased')
self.dropout = nn.Dropout(0.25)
# LSTM and classification layers
self.lstm = nn.LSTM(input_size=config.hidden_size,hidden_size=self.hidden_dim,
num_layers=1,batch_first=True,
dropout=0,bidirectional=False)
self.lstm = nn.LSTM(input_size=768, hidden_size=self.hidden_dim,
num_layers=1, batch_first=True,
dropout=0, bidirectional=False)
self.fc1 = nn.Linear(self.hidden_dim, self.hidden_dim)
self.fc2 = nn.Linear(self.hidden_dim, 1)
self.loss = nn.MSELoss()
def forward(self, token_ids, segment_ids=None, attention_mask=None):
@@ -29,6 +28,7 @@ class QualityEstimation(nn.Module):
flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1))
encoded_layers, _ = self.bert(flat_token_ids, flat_segment_ids, flat_attention_mask)
# encoded_layers, _ = self.bert(input_ids=token_ids, token_type_ids=segment_ids, attention_mask=attention_mask)
encoded_layers = self.dropout(encoded_layers)
output, _ = self.lstm(encoded_layers)
output = torch.tanh(self.fc1(output[:,-1,:]))
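For completeness, a sketch of the full module after this commit, assuming the Hugging Face transformers BertModel. The final self.fc2 call is inferred from the layers defined in __init__, since the diff is truncated before the end of forward, and keyword arguments are used in the BERT call so the positional argument order cannot be confused across library versions.

import torch
import torch.nn as nn
from transformers import BertModel

class QualityEstimation(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        # Pretrained multilingual BERT encoder
        self.bert = BertModel.from_pretrained('bert-base-multilingual-cased')
        self.dropout = nn.Dropout(0.25)
        # LSTM over the BERT token representations (768 = bert-base hidden size)
        self.lstm = nn.LSTM(input_size=768, hidden_size=self.hidden_dim,
                            num_layers=1, batch_first=True,
                            dropout=0, bidirectional=False)
        self.fc1 = nn.Linear(self.hidden_dim, self.hidden_dim)
        self.fc2 = nn.Linear(self.hidden_dim, 1)

    def forward(self, token_ids, segment_ids=None, attention_mask=None):
        # Keyword arguments avoid relying on the positional order of BertModel.forward
        encoded_layers = self.bert(input_ids=token_ids,
                                   token_type_ids=segment_ids,
                                   attention_mask=attention_mask)[0]
        encoded_layers = self.dropout(encoded_layers)
        output, _ = self.lstm(encoded_layers)
        # Regress a single quality score from the last LSTM time step
        output = torch.tanh(self.fc1(output[:, -1, :]))
        return self.fc2(output)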