Commit 258c1a0b authored by Joel Oksanen

Cleaned up code

parent ecb23284
@@ -2,8 +2,8 @@ import pandas as pd
 class DataLoader:
-    data_location = 'agent/amazon_data/reviews.tsv'
-    reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False)
+    # data_location = 'agent/amazon_data/reviews.tsv'
+    # reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False)
     @staticmethod
     def get_reviews(product_id):
@@ -8,42 +8,34 @@ from agent.argument import Argument
 import pickle
 import re
 from time import time
+import os
 class Framework:
     HIGH_SENTIMENT_THRESHOLD = 0.99
-    bert_analyzer = BertAnalyzer.load_saved('agent/SA/semeval_2014_both_unmasked_6.pt')
+    bert_analyzer = BertAnalyzer.load_saved(os.path.dirname(__file__) + '/SA/semeval_2014_both_unmasked_6.pt')
-    def __init__(self, product_type, product_id):
+    def __init__(self, product_category, product_id, review_df=None, wordnet=False):
         self.product_id = product_id
-        self.product = Product.get_product(product_type)
+        self.product = Product.get_product(product_category, wordnet=wordnet)
         self.product_node = self.product.root
         self.arguments = self.product.argument_nodes
         self.features = self.product.feature_nodes
-        ts = time()
         # get reviews
-        review_csv = DataLoader.get_reviews(product_id)
-        reviews = [Review(row, self.product) for _, row in review_csv.head(1000).iterrows()]
-        t_feature = time()
-        print('Feature detection took {} seconds'.format(t_feature - ts))
+        review_df = review_df if review_df is not None else DataLoader.get_reviews(product_id)
+        reviews = [Review(row, self.product) for _, row in review_df.iterrows()]
         # extract augmented votes
         self.extract_votes(reviews)
-        voting_reviews = list(filter(lambda r: r.is_voting(), reviews))
-        if len(voting_reviews) / len(reviews) < 0.33:
-            print('warning: only a small fraction of reviews generated votes')
-        t_sa = time()
-        print('Sentiment analysis took {} seconds'.format(t_sa - t_feature))
         # get aggregates
         ra, self.vote_sum, self.vote_phrases = self.get_aggregates(reviews)
-        t_ra = time()
-        print('Review aggregation took {} seconds'.format(t_ra - t_sa))
+        # voting_reviews = list(filter(lambda r: r.is_voting(), reviews))
+        # voting_frac = len(voting_reviews) / len(reviews)
+        # if voting_frac < 0.33:
+        #     print('warning: only a small fraction of reviews ({:.3f}) generated votes'.format(voting_frac))
         # get qbaf from ra
         self.qbaf, self.argument_polarities = self.get_qbaf(ra, len(reviews))
@@ -51,16 +43,6 @@ class Framework:
         # apply gradual semantics
         self.strengths = self.get_strengths(self.qbaf)
-        te = time()
-        print('QBAF construction took {} seconds'.format(te - t_ra))
-        print('Process took {} seconds'.format(te - ts))
-        # save
-        self.save()
-        # print results
-        self.print()
     def print(self):
         print('qbaf:')
         print(self.qbaf)
@@ -148,7 +130,6 @@ class Framework:
         return base_score
     # apply DF-QUAD gradual semantics to qbaf
-    # CHANGES TO INTERIM REPORT METHOD
     def get_strengths(self, qbaf):
         strengths = {}
         arguments = [node for node in PostOrderIter(self.product_node)]
@@ -237,3 +218,7 @@ class Framework:
         attackers = [self.create_arg(att_node, size - 20) for att_node in self.qbaf['attackers'][arg_node]]
         phrase = self.best_supporting_phrase(arg_node) if self.argument_polarities[arg_node] else self.best_attacking_phrase(arg_node)
         return Argument(arg_node.name, self.argument_polarities[arg_node], supporters, attackers, phrase, size)
+    # evaluation
+    def get_product_strength_percentage(self):
+        return self.strengths[self.product_node] * 100
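
Note: with DataLoader's module-level read commented out and the timing/printing removed from the constructor, callers now pass reviews in explicitly (or fall back to DataLoader.get_reviews). A minimal usage sketch, assuming an agent.framework module path and a reviews TSV with the columns Review expects (both assumptions, not confirmed by this commit):

    import pandas as pd
    from agent.framework import Framework  # assumed module path

    # load reviews explicitly instead of relying on DataLoader's eager module-level read
    review_df = pd.read_csv('agent/amazon_data/reviews.tsv', sep='\t', error_bad_lines=False)
    fw = Framework('watch', 'B00ABC1234', review_df=review_df, wordnet=False)  # hypothetical category and product id
    print(fw.get_product_strength_percentage())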
@@ -54,7 +54,7 @@ class Review:
         self.votes[argument] = 1 if polar_sum > 0 else -1
     def is_voting(self):
-        return len(self.votes) > 0
+        return any(len(p.votes) > 0 for p in self.phrases)
 class Phrase:
@@ -176,7 +176,7 @@ class BertEntityExtractor:
         recall = metrics.recall_score(targets, outputs, average=None)
         print('recall:', recall)
-    def extract_entity_probabilities(self, terms, file_path=None, dataset=None, size=None):
+    def extract_aspect_probabilities(self, entities, file_path=None, dataset=None, size=None):
         # load data
         if file_path is not None:
             data, _ = EntityDataset.from_file(file_path, size=size)
@@ -191,7 +191,7 @@ class BertEntityExtractor:
         self.net.cuda()
         self.net.eval()
-        probs = {term: [] for term in terms}
+        probs = {entity: [] for entity in entities}
         with torch.no_grad():
             for input_ids, attn_mask, entity_indices, instances in loader:
@@ -207,3 +207,8 @@ class BertEntityExtractor:
                 probs[ins.entity].append(score)
         return {t: statistics.mean(t_probs) if len(t_probs) > 0 else None for t, t_probs in probs.items()}
+# BertEntityExtractor.train_and_validate('all_reviews_entities.tsv', 'entity_extractor_new_5_products', valid_frac=0.05)
+# extr: BertEntityExtractor = BertEntityExtractor.load_saved('entity_extractor_new_5_products.pt')
+# extr.evaluate(file_path='watch_entities_test.tsv', size=50000)
\ No newline at end of file
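
The rename from extract_entity_probabilities(terms, ...) to extract_aspect_probabilities(entities, ...) aligns the API with the aspect terminology used elsewhere. A minimal call sketch, assuming the saved-weights name from the commented lines above (the module path and aspect terms are hypothetical):

    from agent.target_extraction.BERT.entity_extractor.bert_entity_extractor import BertEntityExtractor  # assumed path

    extractor = BertEntityExtractor.load_saved('entity_extractor_new_5_products.pt')
    # returns {entity: mean ASPECT probability, or None if the entity never appears in the data}
    probs = extractor.extract_aspect_probabilities(['strap', 'dial'], file_path='watch_entities_test.tsv')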
@@ -5,61 +5,25 @@ import pandas as pd
 import numpy as np
 from ast import literal_eval
 import os.path
-from agent.target_extraction.BERT.relation_extractor.pairbertnet import TRAINED_WEIGHTS, HIDDEN_OUTPUT_FEATURES
+from agent.target_extraction.BERT.entity_extractor.entitybertnet import TRAINED_WEIGHTS
 MAX_SEQ_LEN = 128
-LABELS = ['ASPECT', 'NAN']
-LABEL_MAP = {'ASPECT': 1, 'NAN': 0, None: None}
-MASK_TOKEN = '[MASK]'
 tokenizer = BertTokenizer.from_pretrained(TRAINED_WEIGHTS)
-def generate_batch(batch):
-    encoded = tokenizer.batch_encode_plus([instance.tokens for instance in batch], add_special_tokens=True,
-                                          max_length=MAX_SEQ_LEN, pad_to_max_length=True, is_pretokenized=True,
-                                          return_tensors='pt')
-    input_ids = encoded['input_ids']
-    attn_mask = encoded['attention_mask']
-    labels = torch.tensor([instance.label for instance in batch])
-    entity_indices = indices_for_entity_ranges([instance.entity_range for instance in batch])
-    return input_ids, attn_mask, entity_indices, labels
-def generate_production_batch(batch):
-    encoded = tokenizer.batch_encode_plus([instance.tokens for instance in batch], add_special_tokens=True,
-                                          max_length=MAX_SEQ_LEN, pad_to_max_length=True, is_pretokenized=True,
-                                          return_tensors='pt')
-    input_ids = encoded['input_ids']
-    attn_mask = encoded['attention_mask']
-    entity_indices = indices_for_entity_ranges([instance.entity_range for instance in batch])
-    return input_ids, attn_mask, entity_indices, batch
-def indices_for_entity_ranges(ranges):
-    max_e_len = max(end - start for start, end in ranges)
-    indices = torch.tensor([[[min(t, end)] * HIDDEN_OUTPUT_FEATURES
-                             for t in range(start, start + max_e_len + 1)]
-                            for start, end in ranges])
-    return indices
 class EntityDataset(Dataset):
-    def __init__(self, df, size=None):
-        # filter inapplicable rows
-        self.df = df[df.apply(lambda x: EntityDataset.instance_from_row(x) is not None, axis=1)]
+    def __init__(self, df, training=True, size=None):
+        self.df = df
+        self.training = training
         # sample data if a size is specified
         if size is not None and size < len(self):
             self.df = self.df.sample(size, replace=False)
     @staticmethod
-    def from_df(df, size=None):
-        dataset = EntityDataset(df, size=size)
+    def for_extraction(df):
+        dataset = EntityDataset(df, training=False)
         print('Obtained dataset of size', len(dataset))
         return dataset
@@ -83,80 +47,52 @@ class EntityDataset(Dataset):
             print('Obtained train set of size', len(dataset), 'and validation set of size', len(validset))
             return dataset, validset
-    @staticmethod
-    def instance_from_row(row):
-        unpacked_arr = literal_eval(row['entityMentions']) if type(row['entityMentions']) is str else row['entityMentions']
-        rms = [rm for rm in unpacked_arr if 'label' not in rm or rm['label'] in LABELS]
-        if len(rms) == 1:
-            entity, label = rms[0]['text'], (rms[0]['label'] if 'label' in rms[0] else None)
+    def instance_from_row(self, row):
+        if self.training:
+            return EntityInstance(literal_eval(row['tokens']),
+                                  row['entity_idx'],
+                                  label=row['label'])
         else:
-            return None  # raise AttributeError('Instances must have exactly one relation')
-        text = row['sentText']
-        return EntityDataset.get_instance(text, entity, label=label)
-    @staticmethod
-    def get_instance(text, entity, label=None):
-        tokens = tokenizer.tokenize(text)
-        i = 0
-        found_entity = False
-        entity_range = None
-        while i < len(tokens):
-            match_length = EntityDataset.token_entity_match(i, entity.lower(), tokens)
-            if match_length is not None:
-                if found_entity:
-                    return None  # raise AttributeError('Entity {} appears twice in text {}'.format(entity, text))
-                found_entity = True
-                tokens[i:i + match_length] = [MASK_TOKEN] * match_length
-                entity_range = (i + 1, i + match_length)  # + 1 taking into account the [CLS] token
-                i += match_length
-            else:
-                i += 1
-        if found_entity:
-            return PairRelInstance(tokens, entity, entity_range, LABEL_MAP[label], text)
-        else:
-            return None
-    @staticmethod
-    def token_entity_match(first_token_idx, entity, tokens):
-        token_idx = first_token_idx
-        remaining_entity = entity
-        while remaining_entity:
-            if remaining_entity == entity or remaining_entity.lstrip() != remaining_entity:
-                # start of new word
-                remaining_entity = remaining_entity.lstrip()
-                if token_idx < len(tokens) and tokens[token_idx] == remaining_entity[:len(tokens[token_idx])]:
-                    remaining_entity = remaining_entity[len(tokens[token_idx]):]
-                    token_idx += 1
-                else:
-                    break
-            else:
-                # continuing same word
-                if (token_idx < len(tokens) and tokens[token_idx].startswith('##')
-                        and tokens[token_idx][2:] == remaining_entity[:len(tokens[token_idx][2:])]):
-                    remaining_entity = remaining_entity[len(tokens[token_idx][2:]):]
-                    token_idx += 1
-                else:
-                    break
-        if remaining_entity:
-            return None
-        else:
-            return token_idx - first_token_idx
+            return EntityInstance(row['tokens'],
+                                  row['entity_idx'],
+                                  entity=row['entity'])
     def __len__(self):
         return len(self.df.index)
     def __getitem__(self, idx):
-        return EntityDataset.instance_from_row(self.df.iloc[idx])
+        return self.instance_from_row(self.df.iloc[idx])
-class PairRelInstance:
-    def __init__(self, tokens, entity, entity_range, label, text):
+class EntityInstance:
+    def __init__(self, tokens, entity_idx, label=None, entity=None):
         self.tokens = tokens
-        self.entity = entity
-        self.entity_range = entity_range
+        self.entity_idx = entity_idx
         self.label = label
-        self.text = text
+        self.entity = entity
+def generate_batch(instances: [EntityInstance]):
+    encoded = tokenizer.batch_encode_plus([instance.tokens for instance in instances], add_special_tokens=True,
+                                          max_length=MAX_SEQ_LEN, pad_to_max_length=True, is_pretokenized=True,
+                                          return_tensors='pt')
+    input_ids = encoded['input_ids']
+    attn_mask = encoded['attention_mask']
+    entity_indices = torch.tensor([instance.entity_idx for instance in instances])
+    labels = torch.tensor([instance.label for instance in instances])
+    return input_ids, attn_mask, entity_indices, labels
+def generate_production_batch(instances: [EntityInstance]):
+    encoded = tokenizer.batch_encode_plus([instance.tokens for instance in instances], add_special_tokens=True,
+                                          max_length=MAX_SEQ_LEN, pad_to_max_length=True, is_pretokenized=True,
+                                          return_tensors='pt')
+    input_ids = encoded['input_ids']
+    attn_mask = encoded['attention_mask']
+    entity_indices = torch.tensor([instance.entity_idx for instance in instances])
+    return input_ids, attn_mask, entity_indices, instances
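
The rewritten dataset no longer tokenizes or masks on the fly: each row already carries WordPiece tokens and the index of the entity token, so the collate functions just stack entity_idx into a tensor. A minimal wiring sketch, assuming a DataFrame with the tokens/entity_idx/label columns used above (the example row is hypothetical):

    import pandas as pd
    from torch.utils.data import DataLoader

    # one hypothetical training row: stringified token list, entity token index (offset for [CLS]), binary label
    df = pd.DataFrame({'tokens': ["['the', 'strap', 'broke']"], 'entity_idx': [2], 'label': [1]})
    dataset = EntityDataset(df)  # training=True: rows are parsed with literal_eval and carry labels
    loader = DataLoader(dataset, batch_size=32, collate_fn=generate_batch)
    for input_ids, attn_mask, entity_indices, labels in loader:
        pass  # feed into EntityBertNet.forward(input_ids, attn_mask, entity_indices)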
@@ -4,12 +4,12 @@ from transformers import BertTokenizer
 import pandas as pd
 import numpy as np
 from ast import literal_eval
-from agent.target_extraction.BERT.relation_extractor.pairbertnet import TRAINED_WEIGHTS, HIDDEN_OUTPUT_FEATURES
 import os
 import os.path
+from agent.target_extraction.BERT.relation_extractor.relbertnet import TRAINED_WEIGHTS, HIDDEN_OUTPUT_FEATURES
 MAX_SEQ_LEN = 128
-RELATIONS = ['/has_feature', '/no_relation']
-RELATION_LABEL_MAP = {None: None, '/has_feature': 1, '/no_relation': 0}
+LABELS = ['ASPECT', 'NAN']
+LABEL_MAP = {'ASPECT': 1, 'NAN': 0, None: None}
 MASK_TOKEN = '[MASK]'
 tokenizer = BertTokenizer.from_pretrained(TRAINED_WEIGHTS)
@@ -22,12 +22,9 @@ def generate_batch(batch):
     attn_mask = encoded['attention_mask']
     labels = torch.tensor([instance.label for instance in batch])
-    both_ranges = [instance.ranges for instance in batch]
-    masked_indices = torch.tensor([[ins_idx, token_idx] for ins_idx, ranges in enumerate(both_ranges)
-                                   for start, end in ranges for token_idx in range(start, end + 1)])
-    fst_indices, snd_indices = map(indices_for_entity_ranges, zip(*both_ranges))
-    return input_ids, attn_mask, masked_indices, fst_indices, snd_indices, labels
+    entity_indices = indices_for_entity_ranges([instance.entity_range for instance in batch])
+    return input_ids, attn_mask, entity_indices, labels
 def generate_production_batch(batch):
@@ -37,12 +34,9 @@ def generate_production_batch(batch):
     input_ids = encoded['input_ids']
     attn_mask = encoded['attention_mask']
-    both_ranges = [instance.ranges for instance in batch]
-    masked_indices = torch.tensor([[ins_idx, token_idx] for ins_idx, ranges in enumerate(both_ranges)
-                                   for start, end in ranges for token_idx in range(start, end + 1)])
-    fst_indices, snd_indices = map(indices_for_entity_ranges, zip(*both_ranges))
-    return input_ids, attn_mask, masked_indices, fst_indices, snd_indices, batch
+    entity_indices = indices_for_entity_ranges([instance.entity_range for instance in batch])
+    return input_ids, attn_mask, entity_indices, batch
 def indices_for_entity_ranges(ranges):
@@ -53,11 +47,11 @@ def indices_for_entity_ranges(ranges):
     return indices
-class PairRelDataset(Dataset):
+class EntityDataset(Dataset):
     def __init__(self, df, size=None):
         # filter inapplicable rows
-        self.df = df[df.apply(lambda x: PairRelDataset.instance_from_row(x) is not None, axis=1)]
+        self.df = df[df.apply(lambda x: EntityDataset.instance_from_row(x) is not None, axis=1)]
         # sample data if a size is specified
         if size is not None and size < len(self):
@@ -65,7 +59,7 @@ class PairRelDataset(Dataset):
     @staticmethod
     def from_df(df, size=None):
-        dataset = PairRelDataset(df, size=size)
+        dataset = EntityDataset(df, size=size)
         print('Obtained dataset of size', len(dataset))
         return dataset
@@ -73,9 +67,9 @@ class PairRelDataset(Dataset):
     def from_file(file_name, valid_frac=None, size=None):
         f = open(os.path.dirname(__file__) + '/../data/' + file_name)
         if file_name.endswith('.json'):
-            dataset = PairRelDataset(pd.read_json(f, lines=True), size=size)
+            dataset = EntityDataset(pd.read_json(f, lines=True), size=size)
         elif file_name.endswith('.tsv'):
-            dataset = PairRelDataset(pd.read_csv(f, sep='\t', error_bad_lines=False), size=size)
+            dataset = EntityDataset(pd.read_csv(f, sep='\t', error_bad_lines=False), size=size)
         else:
             raise AttributeError('Could not recognize file type')
@@ -85,60 +79,45 @@ class PairRelDataset(Dataset):
         else:
             split_idx = int(len(dataset) * (1 - valid_frac))
             dataset.df, valid_df = np.split(dataset.df, [split_idx], axis=0)
-            validset = PairRelDataset(valid_df)
+            validset = EntityDataset(valid_df)
             print('Obtained train set of size', len(dataset), 'and validation set of size', len(validset))
             return dataset, validset
     @staticmethod
     def instance_from_row(row):
-        unpacked_arr = literal_eval(row['relationMentions']) if type(row['relationMentions']) is str else row['relationMentions']
-        rms = [rm for rm in unpacked_arr if 'label' not in rm or rm['label'] in RELATIONS]
+        unpacked_arr = literal_eval(row['entityMentions']) if type(row['entityMentions']) is str else row['entityMentions']
+        rms = [rm for rm in unpacked_arr if 'label' not in rm or rm['label'] in LABELS]
         if len(rms) == 1:
-            prod, feat, rel = rms[0]['em1Text'], rms[0]['em2Text'], (rms[0]['label'] if 'label' in rms[0] else None)
+            entity, label = rms[0]['text'], (rms[0]['label'] if 'label' in rms[0] else None)
         else:
             return None  # raise AttributeError('Instances must have exactly one relation')
         text = row['sentText']
-        return PairRelDataset.get_instance(text, [prod, feat], relation=rel)
+        return EntityDataset.get_instance(text, entity, label=label)
     @staticmethod
-    def get_instance(text, entities, relation=None):
+    def get_instance(text, entity, label=None):
         tokens = tokenizer.tokenize(text)
         i = 0
-        found_entities = []
-        ranges = []
+        found_entity = False
+        entity_range = None
         while i < len(tokens):
-            match = False
-            for entity in entities:
-                match_length = PairRelDataset.token_entity_match(i, entity.lower(), tokens)
-                if match_length is not None:
-                    if entity in found_entities:
-                        return None  # raise AttributeError('Entity {} appears twice in text {}'.format(entity, text))
-                    match = True
-                    found_entities.append(entity)
-                    tokens[i:i+match_length] = [MASK_TOKEN] * match_length
-                    ranges.append((i + 1, i + match_length))  # + 1 taking into account the [CLS] token
-                    i += match_length
-                    break
-            if not match:
-                i += 1
-        if len(found_entities) != 2:
-            return None  # raise AttributeError('Could not find entities {} and {} in {}. Found entities {}'.format(e1, e2, text, found_entities))
-        if relation is None:
-            return PairRelInstance(tokens, found_entities[0], found_entities[1], ranges, None, text)
-        if relation == '/has_feature':
-            if found_entities == entities:
-                return PairRelInstance(tokens, found_entities[0], found_entities[1], ranges, 1, text)
-            else:
-                assert found_entities == entities[::-1]
-                return PairRelInstance(tokens, found_entities[0], found_entities[1], ranges, 2, text)
-        assert relation == '/no_relation'
-        return PairRelInstance(tokens, found_entities[0], found_entities[1], ranges, 0, text)
+            match_length = EntityDataset.token_entity_match(i, entity.lower(), tokens)
+            if match_length is not None:
+                if found_entity:
+                    return None  # raise AttributeError('Entity {} appears twice in text {}'.format(entity, text))
+                found_entity = True
+                tokens[i:i + match_length] = [MASK_TOKEN] * match_length
+                entity_range = (i + 1, i + match_length)  # + 1 taking into account the [CLS] token
+                i += match_length
+            else:
+                i += 1
+        if found_entity:
+            return PairRelInstance(tokens, entity, entity_range, LABEL_MAP[label], text)
+        else:
+            return None
     @staticmethod
     def token_entity_match(first_token_idx, entity, tokens):
@@ -170,23 +149,14 @@ class PairRelDataset(Dataset):
         return len(self.df.index)
     def __getitem__(self, idx):
-        return PairRelDataset.instance_from_row(self.df.iloc[idx])
+        return EntityDataset.instance_from_row(self.df.iloc[idx])
 class PairRelInstance:
-    def __init__(self, tokens, fst, snd, ranges, label, text):
+    def __init__(self, tokens, entity, entity_range, label, text):
         self.tokens = tokens
-        self.fst = fst
-        self.snd = snd
-        self.ranges = ranges
+        self.entity = entity
+        self.entity_range = entity_range
         self.label = label
         self.text = text
-    def get_relation_for_label(self, label):
-        if label == 0:
-            return self.fst, '/no_relation', self.snd
-        if label == 1:
-            return self.fst, '/has_feature', self.snd
-        if label == 2:
-            return self.snd, '/has_feature', self.fst
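
For reference, the token_entity_match helper elided above aligns a lower-cased entity string with WordPiece tokens so that get_instance can mask the matched span. A small worked illustration (tokenization assumed from bert-base-uncased; illustration only):

    tokens = ['the', 'watch', 'strap', 'broke']
    # token_entity_match(1, 'watch strap', tokens) returns 2: 'watch' consumes tokens[1],
    # the space starts a new word, and 'strap' consumes tokens[2]
    i, match_length = 1, 2
    tokens[i:i + match_length] = ['[MASK]'] * match_length  # ['the', '[MASK]', '[MASK]', 'broke']
    entity_range = (i + 1, i + match_length)  # (2, 3): shifted by +1 for the [CLS] token added at encoding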
@@ -20,14 +20,9 @@ class EntityBertNet(nn.Module):
         bert_output, _ = self.bert_base(input_ids=input_ids, attention_mask=attn_mask)
         # max pooling at entity locations
-        entity_pooled_output = EntityBertNet.pooled_output(bert_output, entity_indices)
+        entity_pooled_output = bert_output[torch.arange(0, bert_output.shape[0]), entity_indices]
         # fc layer (softmax activation done in loss function)
         x = self.fc(entity_pooled_output)
         return x
-    @staticmethod
-    def pooled_output(bert_output, indices):
-        outputs = torch.gather(bert_output, dim=1, index=indices)
-        pooled_output, _ = torch.max(outputs, dim=1)
-        return pooled_output
import torch
import torch.nn as nn
from transformers import *

HIDDEN_OUTPUT_FEATURES = 768
TRAINED_WEIGHTS = 'bert-base-uncased'
NUM_CLASSES = 2  # entity, not entity

class EntityBertNet(nn.Module):

    def __init__(self):
        super(EntityBertNet, self).__init__()
        config = BertConfig.from_pretrained(TRAINED_WEIGHTS)
        self.bert_base = BertModel.from_pretrained(TRAINED_WEIGHTS, config=config)
        self.fc = nn.Linear(HIDDEN_OUTPUT_FEATURES, NUM_CLASSES)

    def forward(self, input_ids, attn_mask, entity_indices):
        # BERT
        bert_output, _ = self.bert_base(input_ids=input_ids, attention_mask=attn_mask)
        # max pooling at entity locations
        entity_pooled_output = EntityBertNet.pooled_output(bert_output, entity_indices)
        # fc layer (softmax activation done in loss function)
        x = self.fc(entity_pooled_output)
        return x

    @staticmethod
    def pooled_output(bert_output, indices):
        outputs = torch.gather(bert_output, dim=1, index=indices)
        pooled_output, _ = torch.max(outputs, dim=1)
        return pooled_output
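
The forward-pass change above replaces range-based pooling with direct indexing: the old pooled_output gathers every hidden state in the (masked) entity span and max-pools over it, while the new code selects the single hidden state at each instance's entity index. A minimal sketch of the two strategies side by side, on dummy tensors, assuming hidden size 768 as above:

    import torch

    batch_size, seq_len, hidden = 2, 8, 768
    bert_output = torch.randn(batch_size, seq_len, hidden)

    # new style: one token index per instance, pick out that hidden state
    entity_indices = torch.tensor([3, 5])
    new_pooled = bert_output[torch.arange(0, batch_size), entity_indices]  # shape (2, 768)

    # old style: max-pool over an inclusive token range per instance
    ranges = [(2, 4), (5, 5)]
    max_len = max(end - start for start, end in ranges)
    indices = torch.tensor([[[min(t, end)] * hidden for t in range(start, start + max_len + 1)]
                            for start, end in ranges])  # as in indices_for_entity_ranges
    old_pooled, _ = torch.max(torch.gather(bert_output, dim=1, index=indices), dim=1)  # shape (2, 768)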
@@ -8,8 +8,8 @@ import time
 import numpy as np
 from sklearn import metrics
 from transformers import get_linear_schedule_with_warmup
-from agent.target_extraction.BERT.relation_extractor.pair_rel_dataset import PairRelDataset, generate_batch, generate_production_batch
-from agent.target_extraction.BERT.relation_extractor.pairbertnet import NUM_CLASSES, PairBertNet
+from agent.target_extraction.BERT.relation_extractor.relation_dataset import RelationDataset, generate_batch, generate_production_batch
+from agent.target_extraction.BERT.relation_extractor.relbertnet import NUM_CLASSES, RelBertNet
 device = torch.device('cuda')