Commit 34e90902 authored by Joel Oksanen's avatar Joel Oksanen

Integrated entity and relation extraction with BERT into target_extractor, with good results on unseen guitar reviews.
parent d9b522d6
......@@ -7,6 +7,7 @@ from torch.optim import Adam
import time
import numpy as np
from sklearn import metrics
import statistics
from transformers import get_linear_schedule_with_warmup
from agent.target_extraction.BERT.entity_extractor.entity_dataset import EntityDataset, generate_batch, generate_production_batch
from agent.target_extraction.BERT.entity_extractor.entitybertnet import NUM_CLASSES, EntityBertNet
......@@ -175,8 +176,34 @@ class BertEntityExtractor:
recall = metrics.recall_score(targets, outputs, average=None)
print('recall:', recall)
def extract_entity_probabilities(self, terms, file_path=None, dataset=None, size=None):
    """Score how likely each candidate term is a true entity.

    Runs the trained entity net over every instance in the dataset and, for
    each term, averages the softmax probability of the positive (entity)
    class over all of that term's occurrences.

    :param terms: iterable of candidate entity terms to score
    :param file_path: path to a dataset file to load via EntityDataset.from_file
        (exactly one of file_path / dataset must be provided)
    :param dataset: an already-constructed dataset, used when file_path is None
    :param size: optional cap on the number of instances loaded from file
    :return: dict mapping each term to its mean entity probability, or None
        for terms with no occurrences in the data
    :raises AttributeError: if both file_path and dataset are None
    """
    # load data: from file if a path is given, otherwise use the provided dataset
    if file_path is not None:
        data, _ = EntityDataset.from_file(file_path, size=size)
    elif dataset is not None:
        data = dataset
    else:
        raise AttributeError('file_path and data cannot both be None')

    loader = DataLoader(data, batch_size=BATCH_SIZE, shuffle=False, num_workers=4,
                        collate_fn=generate_production_batch)

    self.net.cuda()
    self.net.eval()  # inference mode: disable dropout etc.

    probs = {term: [] for term in terms}

    with torch.no_grad():
        for input_ids, attn_mask, entity_indices, instances in loader:
            # send batch to gpu
            input_ids, attn_mask, entity_indices = tuple(i.to(device) for i in [input_ids, attn_mask,
                                                                                entity_indices])

            # forward pass; column 1 of the softmax output is the positive
            # (entity) class probability for each instance in the batch
            output_scores = softmax(self.net(input_ids, attn_mask, entity_indices), dim=1)
            entity_scores = output_scores.narrow(1, 1, 1).flatten()

            # NOTE(review): assumes every instance's entity appears in `terms`;
            # an unseen entity would raise KeyError here — confirm upstream.
            for ins, score in zip(instances, entity_scores.tolist()):
                probs[ins.entity].append(score)

    # mean probability per term; None for terms never observed in the data
    return {t: statistics.mean(t_probs) if t_probs else None for t, t_probs in probs.items()}
......@@ -9,7 +9,7 @@ from agent.target_extraction.BERT.relation_extractor.pairbertnet import TRAINED_
# maximum number of wordpiece tokens per encoded sequence
MAX_SEQ_LEN = 128
LABELS = ['ASPECT', 'NAN']
# label name -> class index; the None -> None entry lets unlabelled
# (production) instances pass through without a class index —
# presumably consumed where labels may be absent, TODO confirm at call sites.
# (A stale duplicate assignment without the None entry was removed here.)
LABEL_MAP = {'ASPECT': 1, 'NAN': 0, None: None}
MASK_TOKEN = '[MASK]'
tokenizer = BertTokenizer.from_pretrained(TRAINED_WEIGHTS)
......@@ -34,7 +34,7 @@ def generate_production_batch(batch):
input_ids = encoded['input_ids']
attn_mask = encoded['attention_mask']
entity_indices = indices_for_entity_ranges([instance.range for instance in batch])
entity_indices = indices_for_entity_ranges([instance.entity_range for instance in batch])
return input_ids, attn_mask, entity_indices, batch
......
......@@ -196,7 +196,7 @@ class BertRelExtractor:
print(instances[0].get_relation_for_label(output_labels[0]))
def extract_relations(self, file_path=None, dataset=None, size=None):
def extract_relations(self, n_aspects, aspect_index_map, aspect_counts, file_path=None, dataset=None, size=None):
# load data
if file_path is not None:
data, _ = PairRelDataset.from_file(file_path, size=size)
......@@ -211,7 +211,8 @@ class BertRelExtractor:
self.net.cuda()
self.net.eval()
outputs = []
prob_matrix = np.zeros((n_aspects, n_aspects))
count_matrix = np.zeros((n_aspects, n_aspects))
with torch.no_grad():
for input_ids, attn_mask, masked_indices, prod_indices, feat_indices, instances in loader:
......@@ -223,24 +224,18 @@ class BertRelExtractor:
# forward pass
output_scores = softmax(self.net(input_ids, attn_mask, masked_indices, prod_indices, feat_indices), dim=1)
_, output_labels = torch.max(output_scores.data, 1)
outputs += map(lambda x: x[0].get_relation_for_label(x[1]), zip(instances, output_labels.tolist()))
for ins, scores, out in zip(instances, output_scores.tolist(), output_labels.tolist()):
print(ins.text)
print(ins.tokens)
print(scores)
print(ins.get_relation_for_label(out))
print('---')
rel_scores = output_scores.narrow(1, 1, 2)
return outputs
for ins, scores in zip(instances, rel_scores.tolist()):
forward_score, backward_score = scores
fst_idx, snd_idx = aspect_index_map[ins.fst], aspect_index_map[ins.snd]
prob_matrix[snd_idx][fst_idx] += forward_score
prob_matrix[fst_idx][snd_idx] += backward_score
count_matrix[snd_idx][fst_idx] += 1
count_matrix[fst_idx][snd_idx] += 1
prob_matrix = (prob_matrix.T / aspect_counts).T # scale rows by aspect counts
extr: BertRelExtractor = BertRelExtractor.train_and_validate('../data/camera_backpack_laptop_review_pairs_no_nan.tsv',
'trained_bert_rel_extractor_camera_backpack_laptop_no_nan.pt',
size=10000,
valid_frac=0.05,
valid_file_path='data/annotated_acoustic_guitar_review_pairs.tsv')
return prob_matrix
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment