Commit b84c4eb6 authored by Joel Oksanen

Experimentation with further improving feature and relation extraction

parent 71a5c117
@@ -5,6 +5,7 @@ server/agent/amazon_data/
 server/agent/SA/data/
 server/agent/target_extraction/data/
 server/agent/target_extraction/BERT/data/
+server/agent/target_extraction/eval/qa/
 .DS_Store
 *.pickle
 *.wv
\ No newline at end of file
@@ -10,7 +10,7 @@ from sklearn import metrics
 import statistics
 from transformers import get_linear_schedule_with_warmup
 from agent.target_extraction.BERT.entity_extractor.entity_dataset import EntityDataset, generate_batch, generate_production_batch
-from agent.target_extraction.BERT.entity_extractor.entitybertnet import NUM_CLASSES, EntityBertNet
+from agent.target_extraction.BERT.entity_extractor.entitybertnet import NUM_CLASSES, EntityBertNet, BATCH_SIZE
 
 device = torch.device('cuda')
@@ -21,7 +21,6 @@ MAX_GRAD_NORM = 1.0
 # training
 N_EPOCHS = 3
-BATCH_SIZE = 32
 WARM_UP_FRAC = 0.05
 
 # loss
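For context, WARM_UP_FRAC is the fraction of training steps spent warming the learning rate up before the linear decay. A minimal sketch of how it presumably feeds into the imported get_linear_schedule_with_warmup; the model, optimiser, and step count below are illustrative stand-ins, not from this commit:

import torch
from transformers import get_linear_schedule_with_warmup

N_EPOCHS, WARM_UP_FRAC = 3, 0.05
model = torch.nn.Linear(4, 2)                       # stand-in for the BERT network
optimiser = torch.optim.Adam(model.parameters(), lr=2e-5)
batches_per_epoch = 100                             # illustrative; really len(train_loader)
total_steps = batches_per_epoch * N_EPOCHS
scheduler = get_linear_schedule_with_warmup(optimiser,
                                            num_warmup_steps=int(WARM_UP_FRAC * total_steps),
                                            num_training_steps=total_steps)
# scheduler.step() is then called once per batch, after optimiser.step().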
@@ -61,8 +60,7 @@ class BertEntityExtractor:
         else:
             train_size = int(size * (1 - valid_frac)) if size is not None else None
             train_data, _ = EntityDataset.from_file(file_path, size=train_size)
-            valid_size = int(size * valid_frac) if size is not None else int(len(train_data) * valid_frac)
-            valid_data, _ = EntityDataset.from_file(valid_file_path, size=valid_size)
+            valid_data, _ = EntityDataset.from_file(valid_file_path)
 
         train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, num_workers=4,
                                   collate_fn=generate_batch)
@@ -119,11 +117,11 @@ class BertEntityExtractor:
             print('epoch done')
 
+            torch.save(self.net.state_dict(), '{}_epoch_{}.pt'.format(save_file, epoch_idx + 1))
+
             if valid_data is not None:
                 self.evaluate(data=valid_data)
 
-        torch.save(self.net.state_dict(), '{}.pt'.format(save_file))
-
         end = time.time()
         print('Training took', end - start, 'seconds')
@@ -207,3 +205,7 @@ class BertEntityExtractor:
             probs[ins.entity].append(score)
 
         return {t: statistics.mean(t_probs) if len(t_probs) > 0 else None for t, t_probs in probs.items()}
+
+
+BertEntityExtractor.train_and_validate('all_reviews_features.tsv', 'feature_extractor',
+                                       valid_file_path='annotated_watch_review_features.tsv')
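One side effect of this addition: the call sits at module level, so training starts whenever the module is imported. A minimal sketch of the usual guard, in case that is unintended (hypothetical, not part of the commit):

if __name__ == '__main__':
    BertEntityExtractor.train_and_validate('all_reviews_features.tsv', 'feature_extractor',
                                           valid_file_path='annotated_watch_review_features.tsv')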
@@ -8,58 +8,22 @@ import os.path
 from agent.target_extraction.BERT.relation_extractor.pairbertnet import TRAINED_WEIGHTS, HIDDEN_OUTPUT_FEATURES
 
 MAX_SEQ_LEN = 128
-LABELS = ['ASPECT', 'NAN']
-LABEL_MAP = {'ASPECT': 1, 'NAN': 0, None: None}
 MASK_TOKEN = '[MASK]'
 tokenizer = BertTokenizer.from_pretrained(TRAINED_WEIGHTS)
 
-
-def generate_batch(batch):
-    encoded = tokenizer.batch_encode_plus([instance.tokens for instance in batch], add_special_tokens=True,
-                                          max_length=MAX_SEQ_LEN, pad_to_max_length=True, is_pretokenized=True,
-                                          return_tensors='pt')
-    input_ids = encoded['input_ids']
-    attn_mask = encoded['attention_mask']
-    labels = torch.tensor([instance.label for instance in batch])
-    entity_indices = indices_for_entity_ranges([instance.entity_range for instance in batch])
-    return input_ids, attn_mask, entity_indices, labels
-
-
-def generate_production_batch(batch):
-    encoded = tokenizer.batch_encode_plus([instance.tokens for instance in batch], add_special_tokens=True,
-                                          max_length=MAX_SEQ_LEN, pad_to_max_length=True, is_pretokenized=True,
-                                          return_tensors='pt')
-    input_ids = encoded['input_ids']
-    attn_mask = encoded['attention_mask']
-    entity_indices = indices_for_entity_ranges([instance.entity_range for instance in batch])
-    return input_ids, attn_mask, entity_indices, batch
-
-
-def indices_for_entity_ranges(ranges):
-    max_e_len = max(end - start for start, end in ranges)
-    indices = torch.tensor([[[min(t, end)] * HIDDEN_OUTPUT_FEATURES
-                             for t in range(start, start + max_e_len + 1)]
-                            for start, end in ranges])
-    return indices
-
-
 class EntityDataset(Dataset):
-    def __init__(self, df, size=None):
-        # filter inapplicable rows
-        self.df = df[df.apply(lambda x: EntityDataset.instance_from_row(x) is not None, axis=1)]
+    def __init__(self, df, training=True, size=None):
+        self.df = df
+        self.training = training
         # sample data if a size is specified
         if size is not None and size < len(self):
            self.df = self.df.sample(size, replace=False)
 
     @staticmethod
-    def from_df(df, size=None):
-        dataset = EntityDataset(df, size=size)
+    def for_extraction(df):
+        dataset = EntityDataset(df, training=False)
         print('Obtained dataset of size', len(dataset))
         return dataset
@@ -83,80 +47,60 @@ class EntityDataset(Dataset):
         print('Obtained train set of size', len(dataset), 'and validation set of size', len(validset))
         return dataset, validset
 
-    @staticmethod
-    def instance_from_row(row):
-        unpacked_arr = literal_eval(row['entityMentions']) if type(row['entityMentions']) is str else row['entityMentions']
-        rms = [rm for rm in unpacked_arr if 'label' not in rm or rm['label'] in LABELS]
-        if len(rms) == 1:
-            entity, label = rms[0]['text'], (rms[0]['label'] if 'label' in rms[0] else None)
-        else:
-            return None  # raise AttributeError('Instances must have exactly one relation')
-        text = row['sentText']
-        return EntityDataset.get_instance(text, entity, label=label)
-
-    @staticmethod
-    def get_instance(text, entity, label=None):
-        tokens = tokenizer.tokenize(text)
-        i = 0
-        found_entity = False
-        entity_range = None
-        while i < len(tokens):
-            match_length = EntityDataset.token_entity_match(i, entity.lower(), tokens)
-            if match_length is not None:
-                if found_entity:
-                    return None  # raise AttributeError('Entity {} appears twice in text {}'.format(entity, text))
-                found_entity = True
-                tokens[i:i + match_length] = [MASK_TOKEN] * match_length
-                entity_range = (i + 1, i + match_length)  # + 1 taking into account the [CLS] token
-                i += match_length
-            else:
-                i += 1
-        if found_entity:
-            return PairRelInstance(tokens, entity, entity_range, LABEL_MAP[label], text)
-        else:
-            return None
-
-    @staticmethod
-    def token_entity_match(first_token_idx, entity, tokens):
-        token_idx = first_token_idx
-        remaining_entity = entity
-        while remaining_entity:
-            if remaining_entity == entity or remaining_entity.lstrip() != remaining_entity:
-                # start of new word
-                remaining_entity = remaining_entity.lstrip()
-                if token_idx < len(tokens) and tokens[token_idx] == remaining_entity[:len(tokens[token_idx])]:
-                    remaining_entity = remaining_entity[len(tokens[token_idx]):]
-                    token_idx += 1
-                else:
-                    break
-            else:
-                # continuing same word
-                if (token_idx < len(tokens) and tokens[token_idx].startswith('##')
-                        and tokens[token_idx][2:] == remaining_entity[:len(tokens[token_idx][2:])]):
-                    remaining_entity = remaining_entity[len(tokens[token_idx][2:]):]
-                    token_idx += 1
-                else:
-                    break
-        if remaining_entity:
-            return None
-        else:
-            return token_idx - first_token_idx
+    def instance_from_row(self, row):
+        if self.training:
+            return EntityInstance(literal_eval(row['tokens']),
+                                  row['entity_idx'],
+                                  label=row['label'])
+        else:
+            return EntityInstance(row['tokens'],
+                                  row['entity_idx'],
+                                  entity=row['entity'])
 
     def __len__(self):
         return len(self.df.index)
 
     def __getitem__(self, idx):
-        return EntityDataset.instance_from_row(self.df.iloc[idx])
+        return self.instance_from_row(self.df.iloc[idx])
 
 
-class PairRelInstance:
-    def __init__(self, tokens, entity, entity_range, label, text):
+class EntityInstance:
+    def __init__(self, tokens, entity_idx, label=None, entity=None):
         self.tokens = tokens
-        self.entity = entity
-        self.entity_range = entity_range
+        self.entity_idx = entity_idx
         self.label = label
-        self.text = text
+        self.entity = entity
+
+
+def generate_batch(instances: [EntityInstance]):
+    encoded = tokenizer.batch_encode_plus([instance.tokens for instance in instances], add_special_tokens=True,
+                                          max_length=MAX_SEQ_LEN, pad_to_max_length=True, is_pretokenized=True,
+                                          return_tensors='pt')
+    input_ids = encoded['input_ids']
+    attn_mask = encoded['attention_mask']
+    entity_indices = torch.tensor([instance.entity_idx for instance in instances])
+    labels = torch.tensor([instance.label for instance in instances])
+    return input_ids, attn_mask, entity_indices, labels
+
+
+def generate_production_batch(instances: [EntityInstance]):
+    encoded = tokenizer.batch_encode_plus([instance.tokens for instance in instances], add_special_tokens=True,
+                                          max_length=MAX_SEQ_LEN, pad_to_max_length=True, is_pretokenized=True,
+                                          return_tensors='pt')
+    input_ids = encoded['input_ids']
+    attn_mask = encoded['attention_mask']
+    entity_indices = torch.tensor([instance.entity_idx for instance in instances])
+    return input_ids, attn_mask, entity_indices, instances
+
+
+# def indices_for_entity_ranges(ranges):
+#     max_e_len = max(end - start for start, end in ranges)
+#     indices = torch.tensor([[[min(t, end)] * HIDDEN_OUTPUT_FEATURES
+#                              for t in range(start, start + max_e_len + 1)]
+#                             for start, end in ranges])
+#     return indices
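After this refactor the dataset no longer tokenizes raw sentences or searches for entity spans; each row is assumed to already carry WordPiece tokens and a single entity token index. A hedged sketch of the row format implied by the new instance_from_row and generate_batch (the column values below are invented for illustration, and the entity_idx offset convention is assumed to be handled upstream):

import pandas as pd
from agent.target_extraction.BERT.entity_extractor.entity_dataset import EntityDataset

# Extraction-mode rows: 'tokens' is a pre-tokenized sentence, 'entity_idx' the
# position of the entity token in the encoded sequence, 'entity' its surface form.
df = pd.DataFrame([{'tokens': ['the', 'watch', 'strap', 'broke'],
                    'entity_idx': 3,
                    'entity': 'strap'}])
dataset = EntityDataset.for_extraction(df)

# Training-mode rows are read from file, so 'tokens' arrives as a stringified
# list that instance_from_row recovers with literal_eval, with a 'label' column.
train_row = {'tokens': "['the', 'watch', 'strap', 'broke']", 'entity_idx': 3, 'label': 1}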
@@ -5,6 +5,7 @@ from transformers import *
 HIDDEN_OUTPUT_FEATURES = 768
 TRAINED_WEIGHTS = 'bert-base-uncased'
 NUM_CLASSES = 2  # entity, not entity
+BATCH_SIZE = 32
 
 class EntityBertNet(nn.Module):
@@ -20,14 +21,9 @@ class EntityBertNet(nn.Module):
         bert_output, _ = self.bert_base(input_ids=input_ids, attention_mask=attn_mask)
 
         # max pooling at entity locations
-        entity_pooled_output = EntityBertNet.pooled_output(bert_output, entity_indices)
+        entity_pooled_output = bert_output[torch.arange(0, bert_output.shape[0]), entity_indices]
 
         # fc layer (softmax activation done in loss function)
         x = self.fc(entity_pooled_output)
         return x
-
-    @staticmethod
-    def pooled_output(bert_output, indices):
-        outputs = torch.gather(bert_output, dim=1, index=indices)
-        pooled_output, _ = torch.max(outputs, dim=1)
-        return pooled_output
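Note that the new line is no longer max pooling over an entity range (the comment above it is left over from the old version): it uses advanced indexing to pick exactly one hidden state per example, the one at entity_indices. A standalone sketch of the semantics, with shapes assumed to be [batch, seq, hidden]:

import torch

bert_output = torch.randn(4, 128, 768)           # stand-in for BERT hidden states
entity_indices = torch.tensor([5, 12, 7, 33])    # one entity token index per example
pooled = bert_output[torch.arange(0, bert_output.shape[0]), entity_indices]
assert pooled.shape == (4, 768)                  # one hidden vector per example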
@@ -8,8 +8,10 @@ import time
 import numpy as np
 from sklearn import metrics
 from transformers import get_linear_schedule_with_warmup
-from agent.target_extraction.BERT.relation_extractor.pair_rel_dataset import PairRelDataset, generate_batch, generate_production_batch
-from agent.target_extraction.BERT.relation_extractor.pairbertnet import NUM_CLASSES, PairBertNet
+# from agent.target_extraction.BERT.relation_extractor.pair_rel_dataset import PairRelDataset, generate_batch, generate_production_batch
+from agent.target_extraction.BERT.relation_extractor.rel_dataset import PairRelDataset, generate_batch, generate_production_batch, RelInstance
+# from agent.target_extraction.BERT.relation_extractor.pairbertnet import NUM_CLASSES, PairBertNet
+from agent.target_extraction.BERT.relation_extractor.relbertnet import NUM_CLASSES, RelBertNet
 
 device = torch.device('cuda')
@@ -30,12 +32,12 @@ loss_criterion = CrossEntropyLoss()
 class BertRelExtractor:
 
     def __init__(self):
-        self.net = PairBertNet()
+        self.net = RelBertNet()
 
     @staticmethod
     def load_saved(path):
         extr = BertRelExtractor()
-        extr.net = PairBertNet()
+        extr.net = RelBertNet()
         extr.net.load_state_dict(torch.load(path))
         extr.net.eval()
         return extr
@@ -60,8 +62,7 @@ class BertRelExtractor:
         else:
             train_size = int(size * (1 - valid_frac)) if size is not None else None
             train_data, _ = PairRelDataset.from_file(file_path, size=train_size)
-            valid_size = int(size * valid_frac) if size is not None else int(len(train_data) * valid_frac)
-            valid_data, _ = PairRelDataset.from_file(valid_file_path, size=valid_size)
+            valid_data, _ = PairRelDataset.from_file(valid_file_path)
 
         train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, num_workers=4,
                                   collate_fn=generate_batch)
@@ -87,16 +88,16 @@ class BertRelExtractor:
         for batch_idx, batch in enumerate(train_loader):
             # send batch to gpu
-            input_ids, attn_mask, masked_indices, fst_indices, snd_indices, target_labels = tuple(i.to(device) for i in batch)
+            input_ids, attn_mask, entity_indices, entity_mask, labels = tuple(i.to(device) for i in batch)
 
             # zero param gradients
             optimiser.zero_grad()
 
             # forward pass
-            output_scores = self.net(input_ids, attn_mask, masked_indices, fst_indices, snd_indices)
+            output_scores = self.net(input_ids, attn_mask, entity_indices, entity_mask)
 
             # backward pass
-            loss = loss_criterion(output_scores, target_labels)
+            loss = loss_criterion(output_scores, labels)
             loss.backward()
 
             # clip gradient norm
@@ -117,12 +118,11 @@ class BertRelExtractor:
                 batch_loss = 0.0
 
             print('epoch done')
 
+            torch.save(self.net.state_dict(), '{}_epoch_{}.pt'.format(save_file, epoch_idx + 1))
+
             if valid_data is not None:
                 self.evaluate(data=valid_data)
 
-        torch.save(self.net.state_dict(), '{}.pt'.format(save_file))
-
         end = time.time()
         print('Training took', end - start, 'seconds')
@@ -147,15 +147,14 @@ class BertRelExtractor:
         with torch.no_grad():
             for batch in test_loader:
                 # send batch to gpu
-                input_ids, attn_mask, masked_indices, fst_indices, snd_indices, target_labels = tuple(i.to(device)
-                                                                                                      for i in batch)
+                input_ids, attn_mask, entity_indices, entity_mask, labels = tuple(i.to(device) for i in batch)
 
                 # forward pass
-                output_scores = self.net(input_ids, attn_mask, masked_indices, fst_indices, snd_indices)
+                output_scores = self.net(input_ids, attn_mask, entity_indices, entity_mask)
                 _, output_labels = torch.max(output_scores.data, 1)
 
                 outputs += output_labels.tolist()
-                targets += target_labels.tolist()
+                targets += labels.tolist()
 
         assert len(outputs) == len(targets)
@@ -176,25 +175,24 @@ class BertRelExtractor:
         recall = metrics.recall_score(targets, outputs, average=None)
         print('recall:', recall)
 
-    def extract_single_relation(self, text, e1, e2):
-        ins = PairRelDataset.get_instance(text, e1, e2)
-        input_ids, attn_mask, masked_indices, prod_indices, feat_indices, instances = generate_production_batch([ins])
+    def extract_single_relation(self, text, entities):
+        ins = RelInstance.from_sentence(text, entities)
+        input_ids, attn_mask, entity_indices, entity_mask, _ = generate_production_batch([ins])
 
         self.net.cuda()
         self.net.eval()
 
         with torch.no_grad():
             # send batch to gpu
-            input_ids, attn_mask, masked_indices, prod_indices, feat_indices = tuple(i.to(device) for i in
-                                                                                     [input_ids, attn_mask,
-                                                                                      masked_indices, prod_indices,
-                                                                                      feat_indices])
+            input_ids, attn_mask, entity_indices, entity_mask = tuple(i.to(device) for i in [input_ids, attn_mask,
+                                                                                             entity_indices,
+                                                                                             entity_mask])
 
             # forward pass
-            output_scores = softmax(self.net(input_ids, attn_mask, masked_indices, prod_indices, feat_indices), dim=1)
+            output_scores = softmax(self.net(input_ids, attn_mask, entity_indices, entity_mask), dim=1)
             _, output_labels = torch.max(output_scores.data, 1)
 
-            print(instances[0].get_relation_for_label(output_labels[0]))
+            ins.print_results_for_labels(output_labels)
 
     def extract_relations(self, n_aspects, aspect_index_map, aspect_counts, file_path=None, dataset=None, size=None):
         # load data
@@ -215,15 +213,14 @@ class BertRelExtractor:
         count_matrix = np.zeros((n_aspects, n_aspects))
 
         with torch.no_grad():
-            for input_ids, attn_mask, masked_indices, prod_indices, feat_indices, instances in loader:
+            for input_ids, attn_mask, prod_indices, feat_indices, instances in loader:
                 # send batch to gpu
-                input_ids, attn_mask, masked_indices, prod_indices, feat_indices = tuple(i.to(device) for i in
-                                                                                         [input_ids, attn_mask,
-                                                                                          masked_indices, prod_indices,
-                                                                                          feat_indices])
+                input_ids, attn_mask, prod_indices, feat_indices = tuple(i.to(device) for i in [input_ids, attn_mask,
+                                                                                                prod_indices,
+                                                                                                feat_indices])
 
                 # forward pass
-                output_scores = softmax(self.net(input_ids, attn_mask, masked_indices, prod_indices, feat_indices), dim=1)
+                output_scores = softmax(self.net(input_ids, attn_mask, prod_indices, feat_indices), dim=1)
                 rel_scores = output_scores.narrow(1, 1, 2)
 
                 for ins, scores in zip(instances, rel_scores.tolist()):
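Here narrow(1, 1, 2) keeps a length-2 slice of the class dimension starting at column 1, i.e. it drops the first class's scores and keeps the next two per instance; presumably the first column is the no-relation class and the other two the directed relation classes, though this commit does not show RelBertNet's class layout. A minimal sketch of the semantics:

import torch

scores = torch.tensor([[0.70, 0.20, 0.10],
                       [0.10, 0.30, 0.60]])
rel_scores = scores.narrow(1, 1, 2)  # dim=1, start=1, length=2
print(rel_scores)                    # tensor([[0.2000, 0.1000], [0.3000, 0.6000]])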
@@ -236,4 +233,38 @@ class BertRelExtractor:
 
         return prob_matrix, count_matrix
 
+    def extract_relations2(self, n_aspects, dataset):
+        loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4,
+                            collate_fn=generate_production_batch)
+
+        self.net.cuda()
+        self.net.eval()
+
+        prob_matrix = np.zeros((n_aspects, n_aspects))
+        count_matrix = np.zeros((n_aspects, n_aspects))
+
+        with torch.no_grad():
+            for input_ids, attn_mask, entity_indices, combination_indices, instances in loader:
+                # send batch to gpu
+                input_ids, attn_mask, entity_indices, combination_indices = tuple(i.to(device) for i in
+                                                                                  [input_ids, attn_mask,
+                                                                                   entity_indices, combination_indices])