# Commit 2b770646 authored by Park, Se
# parent 5638e85b
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset

from transformers import BertTokenizer, AlbertTokenizer
class Data(object):
"""A single training/test example for the dataset."""
def __init__(self, src, mt, score):
self.src = src = mt
self.score = score
def __str__(self):
return self.__repr__()
def __repr__(self):
l = ["src: {}".format(self.src), "mt: {}".format(, "label: {}".format(self.score)]
return ", ".join(l)
class LoadData(Dataset):
def __init__(self, src_file, mt_file, score_file, maxlen):
with open(src_file, 'r', encoding='utf-8') as f:
src_sentences = f.readlines()
with open(mt_file, 'r', encoding='utf-8') as f:
mt_sentences = f.readlines()
with open(score_file, 'r', encoding='utf-8') as f:
scores = f.readlines() = [Data(src=s.strip(), mt=m.strip(), score=float(h.strip()))
for s, m, h in zip(src_sentences, mt_sentences, scores)]
self.tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=True)
self.maxlen = maxlen
def __len__(self):
return len(
def __getitem__(self, index):
# Selecting the sentence and label at the specified index in the data frame
src =[index].src
mt =[index].mt
score =[index].score
# Preprocessing the text to be suitable for BERT
# Tokenize the sentence
src_tokens = self.tokenizer.tokenize(src)
mt_tokens = self.tokenizer.tokenize(mt)
# Insering the CLS and SEP token in the beginning and end of the sentence
tokens = ["[CLS]"] + src_tokens + ["[SEP]"] + mt_tokens + ["[SEP]"]
# Obtaining the indices of the tokens in the BERT Vocabulary
tokens_ids = self.tokenizer.convert_tokens_to_ids(tokens)
segment_ids = [0]*(len(src_tokens)+2) + [1]*(len(mt_tokens)+1)
attn_mask = [1]*len(tokens_ids)
if len(tokens) < self.maxlen:
# Padding sentences
padding = [0] * (self.maxlen-len(tokens_ids))
tokens_ids += padding
segment_ids += padding
attn_mask += padding
# Prunning the list to be of specified max length
tokens_ids = tokens_ids[:self.maxlen]
segment_ids = segment_ids[:self.maxlen]
attn_mask = attn_mask[:self.maxlen]
# Converting the list to a pytorch tensor
tokens_ids = torch.tensor(tokens_ids, dtype=torch.long)
segment_ids = torch.tensor(segment_ids, dtype=torch.long)
attn_mask = torch.tensor(attn_mask, dtype=torch.long)
score = torch.tensor(score, dtype=torch.float)
# score = torch.tanh(score)
return tokens_ids, attn_mask, segment_ids, score