Commit 34e90902 authored by Joel Oksanen

Integrated entity and relation extraction with BERT into target_extractor, with good results on unseen guitar reviews.
parent d9b522d6
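For orientation, here is a rough, non-authoritative sketch of how the pieces touched by this commit appear to fit together, assembled only from the diff below; the product and review file are the ones already hard-coded at the bottom of target_extractor.py, and the step numbering is illustrative.

# Hypothetical usage sketch (not part of the commit):
extractor = TargetExtractor('backpack', 'data/verified_backpack_reviews.tsv', 'reviewText')
# Inside the constructor, roughly:
#   1. terms, counts = the N_ASPECTS most frequent candidate terms mined from the reviews
#   2. probs = BertEntityExtractor.load_saved(entity_extractor_path)
#                  .extract_entity_probabilities(terms, dataset=entity_dataset)
#      aspects = [t for t in terms
#                 if probs[t] is not None and probs[t] >= TargetExtractor.ENTITY_PROB_THRESHOLD]
#   3. relatedness_matrix = BertRelExtractor.load_saved(rel_extractor_path)
#                  .extract_relations(len(aspects), aspect_index_map, aspect_counts, dataset=pair_dataset)
#   4. tree = get_product_tree()  # hierarchy built from relatedness_matrix via the SUBFEATURE_MULT rule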
@@ -7,6 +7,7 @@ from torch.optim import Adam
import time
import numpy as np
from sklearn import metrics
import statistics
from transformers import get_linear_schedule_with_warmup
from agent.target_extraction.BERT.entity_extractor.entity_dataset import EntityDataset, generate_batch, generate_production_batch
from agent.target_extraction.BERT.entity_extractor.entitybertnet import NUM_CLASSES, EntityBertNet
@@ -175,8 +176,34 @@ class BertEntityExtractor:
recall = metrics.recall_score(targets, outputs, average=None)
print('recall:', recall)
def extract_entity_probabilities(self, terms, file_path=None, dataset=None, size=None):
# load data
if file_path is not None:
data, _ = EntityDataset.from_file(file_path, size=size)
else:
if dataset is None:
raise AttributeError('file_path and dataset cannot both be None')
data = dataset
loader = DataLoader(data, batch_size=BATCH_SIZE, shuffle=False, num_workers=4,
collate_fn=generate_production_batch)
self.net.cuda()
self.net.eval()
probs = {term: [] for term in terms}
with torch.no_grad():
for input_ids, attn_mask, entity_indices, instances in loader:
# send batch to gpu
input_ids, attn_mask, entity_indices = tuple(i.to(device) for i in [input_ids, attn_mask,
entity_indices])
# forward pass
output_scores = softmax(self.net(input_ids, attn_mask, entity_indices), dim=1)
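# column index 1 of the softmax output is taken below as the probability that the marked term
# really is an entity/aspect (hedged reading: class 1 of the NUM_CLASSES entity labels)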
entity_scores = output_scores.narrow(1, 1, 1).flatten()
for ins, score in zip(instances, entity_scores.tolist()):
probs[ins.entity].append(score)
extr: BertEntityExtractor = BertEntityExtractor.train_and_validate('camera_backpack_laptop_review_entities.tsv',
'trained_bert_entity_extractor_camera_backpack_laptop.pt',
valid_frac=0.05,
valid_file_path='annotated_acoustic_guitar_review_entities.tsv')
\ No newline at end of file
return {t: statistics.mean(t_probs) if len(t_probs) > 0 else None for t, t_probs in probs.items()}
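# the result maps each candidate term to its mean entity probability, or None if the term never
# occurred in the dataset; get_aspects() in target_extractor.py (below) filters these against
# TargetExtractor.ENTITY_PROB_THRESHOLD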
@@ -9,7 +9,7 @@ from agent.target_extraction.BERT.relation_extractor.pairbertnet import TRAINED_
MAX_SEQ_LEN = 128
LABELS = ['ASPECT', 'NAN']
LABEL_MAP = {'ASPECT': 1, 'NAN': 0}
LABEL_MAP = {'ASPECT': 1, 'NAN': 0, None: None}
MASK_TOKEN = '[MASK]'
tokenizer = BertTokenizer.from_pretrained(TRAINED_WEIGHTS)
@@ -34,7 +34,7 @@ def generate_production_batch(batch):
input_ids = encoded['input_ids']
attn_mask = encoded['attention_mask']
entity_indices = indices_for_entity_ranges([instance.range for instance in batch])
entity_indices = indices_for_entity_ranges([instance.entity_range for instance in batch])
return input_ids, attn_mask, entity_indices, batch
@@ -196,7 +196,7 @@ class BertRelExtractor:
print(instances[0].get_relation_for_label(output_labels[0]))
def extract_relations(self, file_path=None, dataset=None, size=None):
def extract_relations(self, n_aspects, aspect_index_map, aspect_counts, file_path=None, dataset=None, size=None):
# load data
if file_path is not None:
data, _ = PairRelDataset.from_file(file_path, size=size)
@@ -211,7 +211,8 @@ class BertRelExtractor:
self.net.cuda()
self.net.eval()
outputs = []
prob_matrix = np.zeros((n_aspects, n_aspects))
count_matrix = np.zeros((n_aspects, n_aspects))
with torch.no_grad():
for input_ids, attn_mask, masked_indices, prod_indices, feat_indices, instances in loader:
@@ -223,24 +224,18 @@
# forward pass
output_scores = softmax(self.net(input_ids, attn_mask, masked_indices, prod_indices, feat_indices), dim=1)
_, output_labels = torch.max(output_scores.data, 1)
outputs += map(lambda x: x[0].get_relation_for_label(x[1]), zip(instances, output_labels.tolist()))
for ins, scores, out in zip(instances, output_scores.tolist(), output_labels.tolist()):
print(ins.text)
print(ins.tokens)
print(scores)
print(ins.get_relation_for_label(out))
print('---')
rel_scores = output_scores.narrow(1, 1, 2)
return outputs
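# hedged reading of the loop below: columns 1 and 2 of the softmax output are treated as the two
# directed relation scores, roughly P(fst has snd as a feature) and P(snd has fst as a feature),
# accumulated into prob_matrix per ordered aspect pair and normalised after the loop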
for ins, scores in zip(instances, rel_scores.tolist()):
forward_score, backward_score = scores
fst_idx, snd_idx = aspect_index_map[ins.fst], aspect_index_map[ins.snd]
prob_matrix[snd_idx][fst_idx] += forward_score
prob_matrix[fst_idx][snd_idx] += backward_score
count_matrix[snd_idx][fst_idx] += 1
count_matrix[fst_idx][snd_idx] += 1
prob_matrix = (prob_matrix.T / aspect_counts).T # scale rows by aspect counts
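# i.e. row i (scores for aspect i as the potential subfeature) is divided by the corpus count of
# aspect i, presumably so that frequently mentioned aspects do not dominate the relatedness matrix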
extr: BertRelExtractor = BertRelExtractor.train_and_validate('../data/camera_backpack_laptop_review_pairs_no_nan.tsv',
'trained_bert_rel_extractor_camera_backpack_laptop_no_nan.pt',
size=10000,
valid_frac=0.05,
valid_file_path='data/annotated_acoustic_guitar_review_pairs.tsv')
return prob_matrix
@@ -2,49 +2,50 @@ import pandas as pd
from collections import Counter
from nltk import pos_tag
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
from gensim.models.phrases import Phrases, Phraser
from anytree import Node
from anytree import Node, RenderTree
import numpy as np
import re
from gensim.models import Word2Vec
import pickle
from agent.target_extraction.product import Product
from agent.target_extraction.BERT.entity_extractor.entity_dataset import EntityDataset
from agent.target_extraction.BERT.entity_extractor.bert_entity_extractor import BertEntityExtractor
from agent.target_extraction.BERT.relation_extractor.pair_rel_dataset import PairRelDataset
from agent.target_extraction.BERT.relation_extractor.bert_rel_extractor import BertRelExtractor
from pathos.multiprocessing import ProcessingPool as Pool
import itertools
np.set_printoptions(precision=4, threshold=np.inf, suppress=False)
np.set_printoptions(precision=3, threshold=np.inf, suppress=True)
stop_words = stopwords.words('english')
wnl = WordNetLemmatizer()
sentiment_lexicon = pd.read_csv('data/NRC-Sentiment-Lexicon-Wordlevel-v0.92.tsv', sep='\t', index_col=0)
bert_extractor_path = 'BERT/trained_bert_rel_extractor_camera_and_backpack_with_nan.pt'
pool = Pool(4)
entity_extractor_path = 'BERT/entity_extractor/trained_bert_entity_extractor_camera_backpack_laptop.pt'
rel_extractor_path = 'BERT/relation_extractor/trained_bert_rel_extractor_camera_backpack_laptop_no_nan.pt'
pool = Pool(2)
class TargetExtractor:
PHRASE_THRESHOLD = 4
MIN_RELATEDNESS = 0.3
N_ASPECTS = 100
MIN_DIRECT_GAIN = 0.1
DEPTH_COST = 0.3
FREQ_OVER_PARENT = 10 # target must appear x times more frequently than in parent
OUTLIER_COEFFICIENT = 5
N_DIRECT_FEATURES = 3 # top N_DIRECT_FEATURES features will be direct children of the product (not subfeatures)
PARENT_COUNT_FRAC = 0.5 # feature f1 will only be considered as a subfeature of f2 if c(f1) / c(f2) > this value
WV_SIZE = 100
WV_WINDOW = 9
# phraser
PHRASE_THRESHOLD = 4
# tree
SUBFEATURE_MULT = 2 # for z to be a subfeature of x, matrix(z, x) > matrix(z, f) * SUBFEATURE_MULT for all other f
# word2vec
MIN_TERM_COUNT = 0
MIN_SIMILARITY = 0
SYNONYM_SIMILARITY = 0.1
MIN_TERM_COUNT = 100
SYNONYM_SIMILARITY = 0.10
WV_SIZE = 100
WV_WINDOW = 7
# bert
MAX_BERT_DATASET_SIZE = 300000
ENTITY_PROB_THRESHOLD = 0.5
# parent is a TargetExtractor of a parent category, e.g. > electronics > camera
def __init__(self, product, file_path, text_column):
@@ -53,7 +54,7 @@ class TargetExtractor:
print('tokenizing phrases...')
# tokenize and normalize phrases
texts = TargetExtractor.obtain_texts(file_path, text_column, n=50000)
texts = TargetExtractor.obtain_texts(file_path, text_column)
self.sentences = list(itertools.chain.from_iterable(pool.map(sent_tokenize, texts)))
self.sentences = pool.map(lambda s: s.replace('_', ' ').lower(), self.sentences)
self.phrases = pool.map(word_tokenize, self.sentences)
@@ -81,7 +82,7 @@ class TargetExtractor:
print('mining aspects...')
# mine aspects
aspects, counts = self.get_related_nouns(self.counter)
aspects, counts = self.get_aspects(self.counter)
print('extracting synonyms...')
# obtain synonyms
@@ -97,21 +98,22 @@ class TargetExtractor:
self.save()
print('extracting relatedness matrix...')
# extract relationships between aspects
self.relatedness_matrix = self.get_bert_relations()
print(self.aspects)
print(self.syn_dict)
print(self.relatedness_matrix)
self.save()
# print('extracting aspect tree...')
# # extract aspect tree
# self.tree = self.get_product_tree()
print('extracting aspect tree...')
self.tree = self.get_product_tree()
# print('saving...')
print('saving...')
self.save()
#
# print('done:')
print('done:')
print(self.aspects)
print(self.syn_dict)
print(self.relatedness_matrix)
print(RenderTree(self.tree))
def extract_relatedness_matrix(self):
print('extracting relatedness matrix...')
@@ -155,7 +157,7 @@ class TargetExtractor:
if len(term) > 1 and (any(not re.compile('NN|JJ').match(tag) for _, tag in term)
or any(tag.startswith('JJ') and polar_adjective(t) for t, tag in term)):
return [subterm for subterm, _ in term]
return ['_'.join([subterm for subterm, _ in term])]
return [' '.join([subterm for subterm, _ in term])]
def polar_adjective(adj):
return adj in sentiment_lexicon.index and (sentiment_lexicon.loc[adj]['positive'] == 1 or
@@ -164,48 +166,21 @@ class TargetExtractor:
return [subterm for term in tagged_unfiltered for subterm in filter_ngram(term)]
def get_bert_relations(self):
print(' select phrases for BERT...')
print(' select phrases for relation extraction...')
pair_texts = [rel for rel in pool.map(self.pair_relations_for_text, range(len(self.phrases))) if rel is not None]
bert_df = pd.DataFrame(pair_texts, columns=['sentText', 'relationMentions'])
df = pd.DataFrame(pair_texts, columns=['sentText', 'relationMentions'])
print(' extracting relations with BERT...')
dataset = PairRelDataset.from_df(bert_df, size=TargetExtractor.MAX_BERT_DATASET_SIZE)
bert_extractor = BertRelExtractor.load_saved(bert_extractor_path)
rels = bert_extractor.extract_relations(dataset=dataset)
print(' calculating relatedness matrix...')
positive_matrix = np.zeros((len(self.aspects), len(self.aspects)))
negative_matrix = np.zeros((len(self.aspects), len(self.aspects)))
for e1, rel, e2 in rels:
idx_super = self.index_for_aspect(e1)
idx_sub = self.index_for_aspect(e2)
if idx_super is None or idx_sub is None:
raise AttributeError('Could not find an aspect for term', e1 if idx_super is None else e2)
if rel not in ['/has_feature', '/no_relation']:
raise AttributeError('Could not recognize relation', rel)
if rel == '/has_feature':
positive_matrix[idx_sub][idx_super] += 1
else:
negative_matrix[idx_sub][idx_super] += 1
relatedness_matrix = (positive_matrix - negative_matrix) / (positive_matrix + negative_matrix)
# scale values by counts of aspects
# counts_arr = np.zeros(len(self.aspects))
# for idx, aspect in enumerate(self.aspects):
# counts_arr[idx] = self.counts[aspect]
# relatedness_matrix = (relatedness_matrix.T / counts_arr).T
# scale linearly to be more readable
# relatedness_matrix = relatedness_matrix / relatedness_matrix.max()
dataset = PairRelDataset.from_df(df, size=TargetExtractor.MAX_BERT_DATASET_SIZE)
bert_extractor = BertRelExtractor.load_saved(rel_extractor_path)
aspect_counts = np.array([self.counts[aspect] for aspect in self.aspects])
relatedness_matrix = bert_extractor.extract_relations(len(self.aspects), self.aspect_index_map(), aspect_counts,
dataset=dataset)
return relatedness_matrix
def index_for_aspect(self, aspect):
return next(idx for idx, a in enumerate(self.aspects) if aspect.lower().replace(' ', '_') in self.syn_dict[a])
def aspect_index_map(self):
return {syn: idx for idx, aspect in enumerate(self.aspects) for syn in self.syn_dict[aspect]}
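# e.g. {'screen': 2, 'display': 2, 'battery': 5, ...}: every synonym maps to the index of its
# canonical aspect in self.aspects (values here are purely illustrative)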
def pair_relations_for_text(self, idx):
text = self.sentences[idx]
@@ -218,15 +193,14 @@ class TargetExtractor:
found_aspects = []
for aspect in self.aspects:
found_aspect = False
found_form = False
for form in self.syn_dict[aspect]:
if form in ngrams:
form_with_space = form.replace('_', ' ')
if len(found_aspects) > 1 or found_aspect or overlapping_terms(found_aspects, form_with_space):
if len(found_aspects) > 1 or found_form or overlapping_terms(found_aspects, form):
# cannot have more than two aspects, or two forms of the same aspect, or overlapping terms
return None
found_aspects.append(form_with_space)
found_aspect = True
found_aspects.append(form)
found_form = True
return (text, [{'em1Text': found_aspects[0], 'em2Text': found_aspects[1]}]) if len(found_aspects) == 2 else None
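# e.g. ('the zipper on this backpack broke', [{'em1Text': 'backpack', 'em2Text': 'zipper'}]);
# a hypothetical illustration of the returned shape, since only sentences mentioning exactly two
# distinct, non-overlapping aspects survive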
@@ -251,8 +225,8 @@ class TargetExtractor:
nouns = []
word_idx = 0
for token in ngrams:
if '_' in token:
words = token.split('_')
if ' ' in token:
words = token.split(' ')
word_range = range(word_idx, word_idx + len(words))
has_noun = any(is_noun(pos_tags[i]) for i in word_range)
all_terms_valid = all(is_valid_term(pos_tags[i]) for i in word_range)
@@ -267,30 +241,61 @@ class TargetExtractor:
word_idx += 1
return nouns
def get_related_nouns(self, counter):
def get_aspects(self, counter):
# take N_ASPECTS most common terms
term_counts = counter.most_common()[:TargetExtractor.N_ASPECTS]
terms = [term for term, count in term_counts]
# bring product to front of list
if self.product in terms:
terms.remove(self.product)
else:
terms.pop()
terms.insert(0, self.product)
return terms, {term: count for term, count in term_counts}
print(' preparing entity texts for BERT...')
entity_texts = [t for t in pool.map(lambda i: self.entity_mentions_in_text(i, terms),
range(len(self.sentences)))
if t is not None]
df = pd.DataFrame(entity_texts, columns=['sentText', 'entityMentions'])
print(' extracting entities with BERT...')
dataset = EntityDataset.from_df(df, size=TargetExtractor.MAX_BERT_DATASET_SIZE)
entity_extractor = BertEntityExtractor.load_saved(entity_extractor_path)
probs = entity_extractor.extract_entity_probabilities(terms, dataset=dataset)
aspects = [term for term in terms if probs[term] is not None and probs[term] >= TargetExtractor.ENTITY_PROB_THRESHOLD]
def is_related_to_product(self, term, wv):
return wv.similarity(self.product, term) > TargetExtractor.MIN_SIMILARITY
# bring product to front of list
if self.product in aspects:
aspects.remove(self.product)
aspects.insert(0, self.product)
return aspects, {term: count for term, count in term_counts if term in aspects}
def entity_mentions_in_text(self, text_idx, entities):
text = self.sentences[text_idx]
all_tokens = set().union(*[self.phrases[text_idx], self.ngram_phrases[text_idx]])
entity_mention = None
for entity in entities:
n_mentions = sum(1 for token in all_tokens if entity == token.lower())
if n_mentions > 1:
# many mentions of same entity
return None
if n_mentions == 1:
if entity_mention is None:
entity_mention = entity
elif entity_mention in entity:
entity_mention = entity
elif entity not in entity_mention:
# text cannot have more than one entity mention, unless one is a subset of the other,
# in which case the longer one is taken
return None
if entity_mention is not None:
return text, [{'text': entity_mention}]
return None
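# e.g. ('this backpack fits everything i need', [{'text': 'backpack'}]); hypothetical illustration,
# since sentences mentioning more than one distinct candidate term (unless nested) are discarded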
@staticmethod
def get_syn_pairs(terms, model):
return {frozenset((t1, t2)) for t1 in terms for t2 in terms
if t1 != t2 and model.relative_cosine_similarity(t1, t2) > TargetExtractor.SYNONYM_SIMILARITY}
def frequency_for_term(self, term):
return self.counter[term] / self.total_count
def get_word2vec_model(self, size, window, min_count):
model = Word2Vec(self.ngrams(self.phrases), size=size, window=window, min_count=min_count).wv
return model
@@ -307,83 +312,24 @@ class TargetExtractor:
f.close()
return extractor
@staticmethod
def wordnet_relatedness(t1, t2):
fst = wordnet.synset(t1 + '.n.01')
snd = wordnet.synset(t2 + '.n.01')
return fst.wup_similarity(snd)
@staticmethod
def spanning_tree_from_root(vertices, weights, root_idx=0):
root = Node(vertices[root_idx])
for idx in np.flip(np.argsort(weights[root_idx])):
if idx == root_idx:
continue
gain = max(TargetExtractor.MIN_DIRECT_GAIN, weights[root_idx][idx])
parent = root
for branch_node in root.descendants:
min_scaled_weight = min(weights[n.idx][idx] * pow(TargetExtractor.DEPTH_COST, branch_node.depth)
for n in (branch_node,) + branch_node.anchestors if n != root)
if min_scaled_weight > gain:
gain = min_scaled_weight
parent = branch_node
node = Node(vertices[idx], parent=parent)
node.idx = idx
return root
def get_product_tree(self):
root = Node(self.aspects[0])
root.idx = 0
for idx in range(1, TargetExtractor.N_DIRECT_FEATURES + 1):
node = Node(self.aspects[idx], parent=root)
node.idx = idx
unassigned = {idx for idx in range(TargetExtractor.N_DIRECT_FEATURES + 1, len(self.aspects))}
for idx in range(1, len(self.aspects)): # for each feature in order from highest to lowest count
print(self.aspects[idx])
# create node for aspect with parent root if it is unassigned
node = next((n for n in root.descendants if n.idx == idx), None)
if not node:
node = Node(self.aspects[idx], parent=root)
node.idx = idx
unassigned.remove(idx)
# get highest dependant from unassigned aspects if there exists a significant one
dep_idx = self.get_dependant(idx, [], unassigned)
while dep_idx is not None:
print(' ', self.aspects[dep_idx])
# assign dep as subfeature of t
dep_node = Node(self.aspects[dep_idx], parent=node)
dep_node.idx = dep_idx
unassigned.remove(dep_idx)
# get highest dependant from remaining targets if there exists a significant one
dep_idx = self.get_dependant(idx, [child.idx for child in node.children], unassigned)
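# hedged reading of the new attachment rule below: aspect idx becomes a child of its most related,
# more frequent aspect max_idx1 only if that score beats the runner-up max_idx2 by a factor of
# SUBFEATURE_MULT; otherwise it attaches directly to the product root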
row = self.relatedness_matrix[idx]
max_idx1, max_idx2 = row[1:].argsort()[-2:][::-1] + 1
if max_idx1 < idx and row[max_idx1] >= row[max_idx2] * TargetExtractor.SUBFEATURE_MULT:
parent = next(n for n in root.descendants if n.idx == max_idx1)
else:
parent = root
node = Node(self.aspects[idx], parent=parent)
node.idx = idx
self.node_map = {n.idx: n for n in (root,) + root.descendants}
return root
@staticmethod
def kruskal(vertices, edges):
result = set()
groups = {vertex: i for i, vertex in enumerate(vertices)}
for u, v in sorted(edges, key=edges.get, reverse=True):
if groups[u] != groups[v]:
result.add((u, v))
TargetExtractor.join_groups(groups, groups[u], groups[v])
return result
@staticmethod
def join_groups(groups, i, j):
for v in groups:
if groups[v] == j:
groups[v] = i
@staticmethod
def singular(word):
return wnl.lemmatize(word)
@staticmethod
def print_relations(target_indices, dep_matrix, targets):
idx_pairs = {frozenset((idx1, idx2)) for idx1 in target_indices for idx2 in target_indices if idx1 != idx2}
@@ -400,57 +346,6 @@ class TargetExtractor:
for rel_idx in sorted(range(len(self.aspects)), key=lambda i: rels[i], reverse=True):
print(' {:.4f}'.format(rels[rel_idx]), self.aspects[rel_idx])
def get_dependant(self, idx, child_indices, unassigned_indices):
ignore_idx = [0] + child_indices
max_dependant = (None, 0)
for u_idx in unassigned_indices:
# print(' ', self.aspects[u_idx])
dependence = self.get_significant_dependence(u_idx, ignore_idx=ignore_idx)
if dependence is not None:
dep_idx, score = dependence
# print(' ', self.aspects[dep_idx], score)
if dep_idx == idx and score > max_dependant[1]:
max_dependant = (u_idx, score)
return max_dependant[0]
def get_significant_dependence(self, idx, ignore_idx=None):
if not ignore_idx:
ignore_idx = [0]
deps = self.relatedness_matrix[idx].copy()
for i in ignore_idx:
deps[i] = np.ma.masked
fst_high_outlier = TargetExtractor.high_outlier_idx(deps)
# print(' ', fst_high_outlier)
if fst_high_outlier is not None:
deps[fst_high_outlier[0]] = np.ma.masked
snd_high_outlier = TargetExtractor.high_outlier_idx(deps) # np.delete(deps, fst_high_outlier)
# print(' ', snd_high_outlier)
if snd_high_outlier is None:
return fst_high_outlier
return None
@staticmethod
def high_outlier_idx(arr):
q1 = np.nanquantile(np.ma.filled(arr, np.NaN), 0.25)
q3 = np.nanquantile(np.ma.filled(arr, np.NaN), 0.75)
max_idx = np.nanargmax(np.ma.filled(arr, np.NaN))
lim = q3 + TargetExtractor.OUTLIER_COEFFICIENT * (q3 - q1)
# print(' ', arr, arr[max_idx], q1, q3, lim)
return (max_idx, arr[max_idx] / lim) if arr[max_idx] > lim else None
@staticmethod
def are_correlated(idx1, idx2, dep_matrix, ignore_idx=None):
if not ignore_idx:
ignore_idx = []
mask = [i in ignore_idx for i in range(len(dep_matrix[idx1]))]
deps1 = np.ma.masked_array(dep_matrix[idx1], mask=mask)
deps2 = np.ma.masked_array(dep_matrix[idx2], mask=mask)
return TargetExtractor.in_q3(deps1, idx2) and TargetExtractor.in_q3(deps2, idx1)
@staticmethod
def in_q3(arr, idx):
return arr[idx] >= np.quantile(arr, 0.75)
class Synset:
@@ -509,7 +404,4 @@ class Synset:
return None
extr: TargetExtractor = TargetExtractor('backpack', 'data/verified_backpack_reviews.tsv', 'reviewText')
# extr.print_relations_from('keys')