Commit e38035ff authored by Joel Oksanen

Major refactorings, changes to agent. Implemented text highlighting in back-end.

parent 4ac66e6b
-----BEGIN RSA PRIVATE KEY-----
MIIEowIBAAKCAQEAr1eu+bmGaX3IJ+duCqPw+WIlXrvGOJuAEH/6j/HZIYTUliuEKG71OGocqvXO
/4x4BuQQx5+BA+6kWIfHcW7LzdEsUZlHDgRxO+WuRK+A6F5H5EnvlAvBSNpg4+ernabuEcnXa5f4
0+wem3t+RSOYCympAxpbuWZP33UkH5ucl8HAesztOGxUVMIj8VynET4ADpcdDwt+B2rD7OFO9QJP
hS5gcEw7lIYiOctPc2hJ3DvKFcdk7ZyER/l5g06VfnboV4OBzCLvd526eUZxrq8YLQvbTM3eu9qE
gJV1Bg+ZOLfbCZV+sKpVZL6ZyJxuVoSOjQIrXMSj0Wp9i1l1qZG9jwIDAQABAoIBAH3db1kLeBTZ
mYgrdK5mqGAbt7+dAtk3pmIxu+cAMGEDPKbfbeqoW5a3dQSzlneSmcY4iGMDeFUeRRLXxK8EXX/M
mweoA31oavAYG1RqtxoWM30IJdYb6g8l009FycdNQK+8N8qgOJnHcOEjcKyotEevDAeSMC2R59v7
9oZJ6cp0tcPEyEJzUQz8K4SzE137n0ieBpPXE/SHqRek53PGzDFG01SstaDECMSbPlcXyZw63CQs
U09r1O7CG6KcqLjgBEB+RtglNld56INLuSVMn29zTjWFzmKaLUj9A6PnTvq1UIM6hUsCNa6+2E1c
by6tZXRRujPKCMyXDFIiw8/W0cECgYEA4OeBsdVNx2/+M1nT9crLQWfbGz8YV/eAogngJhLeJEYJ
Ui2nBk1FbGRpivtfStMDRjRYzwzuCt5YvHFv+qiLFh3bFamqm5a9XnGtyqOrCB0JM1r0NJNeTc4A
+rxTuVNk0JG9cK4Ei/lKaz35vAi16pAI+NxkP4b0AJkB6X2w9HcCgYEAx5XvgZx6MQksuQQzpiWF
PVrZF4wX749iJV+LKp76cSlRZDhqKfOX/K5gKZXB0JXgKVXyifj6X5VA/rFlsY1H+daPuHrHIcRy
kUlf3EHm+m0G6c6FxpVcJtWGabJZ4uN+jpydCryrfGdFj/gIIIk/09ga1cnF6L/q6f3+AoaAPakC
gYEAlz4YEacH7x55K780sw31Mb7Nsbz0efOPJWjOu8vkZzBSBFl/ANXl4caE01nH82GDyipZrqNb
USPFOLRwQXgmUHEza5X1jTwJjQGVhbgaxBICpe58Wv7fZCMstXEAPJjAKyJW6vebRyYVzZiX4MNa
0qqZ3gEXyUDPxbcfeg10enECgYAik81MrSlWOXqwCxRPoox01rppo/G5pSOhye24Upac6EWbkVKD
NhqMwkESs0lCs9SEFMnSMXpe+OKrvVwa6JkhpZR8p3i8VuZqWUXsSNaVaDIca5UtlE7Ew5Vn1mhQ
MI2Kc4OZippDETkx+gyeZmjScMwlYbGR4z31j/Vjxp2NEQKBgAmp+i4yGQY+sstBLXDBPCNq1uEo
FMPSTbwYoz6ezVFRe5B+QnL/pW5hdH60KEez0qoQGTYkdgLW5bwTI+K2PzG3BPTPvW0eOvWRJBXF
I0Rj6KAnGdcrONYB0Mp++X4so1tJ5y2iZAPeArZcLcoXGNozSfsnuC3GKxXAJaFPC+u+
-----END RSA PRIVATE KEY-----
\ No newline at end of file
@@ -8,10 +8,8 @@ import time
import numpy as np
from sklearn import metrics
semeval_2014_train_path = 'agent/SA/data/SemEval-2014/Laptop_Train_v2.xml'
semeval_2014_test_path = 'agent/SA/data/SemEval-2014/Laptops_Test_Gold.xml'
amazon_test_path = 'agent/SA/data/Amazon/annotated_amazon_laptop_reviews.xml'
trained_model_path = 'agent/SA/semeval_2014_2.pt'
semeval_2014_test_path = 'data/SemEval-2014/Laptops_Test_Gold.xml'
amazon_test_path = 'data/Amazon/annotated_amazon_laptop_reviews.xml'
BATCH_SIZE = 32
MAX_EPOCHS = 6
@@ -25,19 +23,22 @@ def loss(outputs, labels):
class BertAnalyzer:
# @staticmethod
# def default():
# sa = BertAnalyzer()
# sa.load_saved(trained_model_path)
# return sa
@staticmethod
def default():
def load_saved(path):
sa = BertAnalyzer()
sa.load_saved(trained_model_path)
sa.net = TDBertNet(len(polarity_indices))
sa.net.load_state_dict(torch.load(path))
sa.net.eval()
return sa
def load_saved(self, path):
self.net = TDBertNet(len(polarity_indices))
self.net.load_state_dict(torch.load(path))
self.net.eval()
def train(self, data_file):
train_data = BertDataset.from_file(data_file)
def train(self, data_file, save_path, mask_target=False):
train_data = BertDataset.from_file(data_file, mask_target=mask_target)
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, num_workers=4,
collate_fn=generate_batch)
@@ -71,10 +72,10 @@ class BertAnalyzer:
end = time.time()
print('Training took', end - start, 'seconds')
torch.save(self.net.state_dict(), trained_model_path)
torch.save(self.net.state_dict(), save_path)
def evaluate(self, data_file):
test_data = BertDataset.from_file(data_file)
def evaluate(self, data_file, mask_target=False):
test_data = BertDataset.from_file(data_file, mask_target=mask_target)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False, num_workers=4,
collate_fn=generate_batch)
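For orientation, a minimal sketch of how the reworked BertAnalyzer API above might be driven end to end; the dataset paths and the model filename are illustrative, not taken from this commit:
sa = BertAnalyzer()
# train with the new explicit save_path and optional target masking
sa.train('data/SemEval-2014/Laptop_Train_v2.xml', save_path='semeval_2014_masked.pt', mask_target=True)
# reload via the new static constructor and evaluate with matching masking
sa = BertAnalyzer.load_saved('semeval_2014_masked.pt')
sa.evaluate('data/SemEval-2014/Laptops_Test_Gold.xml', mask_target=True)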
@@ -113,7 +114,7 @@ class BertAnalyzer:
def get_sentiment_polarity(self, text, char_from, char_to):
instance = Instance(text, char_from, char_to)
tokens, tg_from, tg_to = instance.get()
tokens, tg_from, tg_to = instance.get(mask_target=False)
text, target_indices = instance.to_tensor()
with torch.no_grad():
@@ -147,4 +148,4 @@ class BertAnalyzer:
return -val
else:
# neutral or conflicted
return 0
\ No newline at end of file
return 0
@@ -8,6 +8,7 @@ import re
MAX_SEQ_LEN = 128
polarity_indices = {'positive': 0, 'negative': 1, 'neutral': 2, 'conflict': 3}
tokenizer = BertTokenizer.from_pretrained(TRAINED_WEIGHTS)
MASK_TOKEN = '[MASK]'
def generate_batch(batch):
@@ -42,12 +43,14 @@ class BertDataset(Dataset):
def __init__(self):
self.data = []
self.mask_target = False
@staticmethod
def from_file(file):
def from_file(file, mask_target=False):
dataset = BertDataset()
tree = ET.parse(file)
dataset.data = []
dataset.mask_target = mask_target
for sentence in tree.getroot():
text = sentence.find('text').text
aspect_terms = sentence.find('aspectTerms')
@@ -71,7 +74,7 @@ class BertDataset(Dataset):
def __getitem__(self, idx):
instance, polarity_str = self.data[idx]
tokens, idx_from, idx_to = instance.get()
tokens, idx_from, idx_to = instance.get(self.mask_target)
polarity = polarity_index(polarity_str)
return {'tokens': tokens, 'from': idx_from, 'to': idx_to, 'polarity': polarity}
@@ -84,14 +87,16 @@ class Instance:
self.char_from = char_from
self.char_to = char_to
def get(self):
def get(self, mask_target):
tokens = tokenizer.tokenize(self.text)
idx_from = token_for_char(self.char_from, self.text, tokens)
idx_to = token_for_char(self.char_to-1, self.text, tokens)
return tokens, idx_from, idx_to
idx_to = token_for_char(self.char_to-1, self.text, tokens) + 1
if mask_target:
tokens[idx_from:idx_to] = [MASK_TOKEN] * (idx_to - idx_from)
return tokens, idx_from + 1, idx_to # +1 for [CLS] token
def to_tensor(self):
tokens, idx_from, idx_to = self.get()
tokens, idx_from, idx_to = self.get(mask_target=False)
text = tokenizer.encode_plus(tokens, add_special_tokens=True, max_length=MAX_SEQ_LEN,
is_pretokenized=True, return_tensors='pt')
target_indices = torch.tensor([[[t] * HIDDEN_OUTPUT_FEATURES for t in range(idx_from, idx_to + 1)]])
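A small, hypothetical illustration of the new mask_target behaviour in Instance.get; it assumes an uncased BERT vocabulary and that token_for_char maps the character offsets onto the expected tokens:
inst = Instance('The battery life is great', 4, 16)   # characters 4..16 span 'battery life'
tokens, tg_from, tg_to = inst.get(mask_target=True)
# tokenizer.tokenize yields ['the', 'battery', 'life', 'is', 'great'];
# with mask_target=True the target span is overwritten in place, giving
# ['the', '[MASK]', '[MASK]', 'is', 'great'],
# and the returned from-index is shifted by +1 to account for the [CLS] token
# prepended when the tokens are later encoded with add_special_tokens=True.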
@@ -2034,7 +2034,7 @@ Overall a great value, Samsung is definitely the way to go.</review_body>
</sentence>
</sentences>
</review>
<review annotated="true">
<review annotated="false">
<review_id>A3150C7E8XF6MM</review_id>
<review_body>The chains were broken, but the cross is good. Wearing it next to my heart.</review_body>
<sentences>
@@ -299,5 +299,5 @@ def prepare_annotated_reviews():
# prepare_reviews()
# annotate_reviews()
prepare_annotated_reviews()
\ No newline at end of file
annotate_reviews()
# prepare_annotated_reviews()
\ No newline at end of file
<?xml version="1.0" ?>
<data>
<review>
<review_id>R32MKKAKXKKW0C</review_id>
<text>I bought the same Sling pack but smaller 7x, after buying a couple more lenses I quickly grew out of it. This 9x is large, but perfect for me. I still use the smaller one too. I will take the larger one on vacations.</text>
<sentences>
<sentence>
<tokens>I bought the same Sling pack but smaller 7x , after buying a couple more lenses I quickly grew out of it .</tokens>
<annotations>
<annotation>
<range>4,5</range>
<sentiment>neutral</sentiment>
</annotation>
<annotation>
<range>15,15</range>
<sentiment>neutral</sentiment>
</annotation>
<annotation>
<range>21,21</range>
<sentiment>negative</sentiment>
</annotation>
</annotations>
</sentence>
<sentence>
<tokens>This 9x is large , but perfect for me .</tokens>
<annotations>
<annotation>
<range>0,1</range>
<sentiment>positive</sentiment>
</annotation>
</annotations>
</sentence>
<sentence>
<tokens>I still use the smaller one too .</tokens>
<annotations>
<annotation>
<range>3,5</range>
<sentiment>positive</sentiment>
</annotation>
</annotations>
</sentence>
<sentence>
<tokens>I will take the larger one on vacations .</tokens>
<annotations>
<annotation>
<range>3,5</range>
<sentiment>neutral</sentiment>
</annotation>
</annotations>
</sentence>
</sentences>
</review>
<review>
<review_id>R1S1HWFSIF618Z</review_id>
<text>Instructions were written in very disjointed English. Was expecting much more from this $100 product. Very disappointed!</text>
<sentences>
<sentence>
<tokens>Instructions were written in very disjointed English .</tokens>
<annotations>
<annotation>
<range>0,0</range>
<sentiment>negative</sentiment>
</annotation>
</annotations>
</sentence>
<sentence>
<tokens>Was expecting much more from this $ 100 product .</tokens>
<annotations>
<annotation>
<range>5,8</range>
<sentiment>negative</sentiment>
</annotation>
</annotations>
</sentence>
<sentence>
<tokens>Very disappointed !</tokens>
<annotations/>
</sentence>
</sentences>
</review>
</data>
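As a quick sanity check on the annotation format above, the <range> values appear to be inclusive, 0-based indices into the whitespace-separated <tokens> string; this interpretation is inferred from the data itself:
tokens = 'I bought the same Sling pack but smaller 7x , after buying a couple more lenses I quickly grew out of it .'.split()
start, end = 4, 5                # first <range> in the first sentence
print(tokens[start:end + 1])     # ['Sling', 'pack'] -> annotated as 'neutral'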
from agent.argumentquery import ArgumentQuery
from agent.agent import Agent
from agent.target_extraction.product import Product
from threading import Thread
class ADAMessage:
def __init__(self, text, arguments):
self.text = text
self.arguments = arguments
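ADAText, imported alongside ADAMessage below, is not shown in this diff; a rough, hypothetical sketch of its shape, inferred purely from how Communicator uses it ('*' placeholders filled by highlighted argument names, an optional style, and add() for appending), might look like this. The real class may well differ:
class ADAText:
    # hypothetical reconstruction; inferred from usage in communicator.py, not from the source
    def __init__(self, template, args, style='DEFAULT'):
        self.template = template   # e.g. 'The * was highly rated because the * was good'
        self.args = args           # strings substituted for the '*' placeholders (highlighted in the UI)
        self.style = style         # e.g. 'QUOT' for quoted review phrases

    def add(self, template, args=None):
        # append further template text and its placeholder arguments
        self.template += template
        self.args += args if args else []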
from agent.framework import Framework
from agent.message import ADAMessage, ADAText
class Communicator:
@@ -22,31 +14,14 @@ class Communicator:
ArgumentQuery(5, 'What did users say about the {arg} being poor?'),
]
def __init__(self, dl):
self.dl = dl
self.product = None
self.agent = None
self.loading = False
def has_loaded_product(self, product_id):
return self.product is not None and self.product.id == product_id and not self.loading
def load_product(self, product_id, product_type): # product_type e.g. 'camera'
if self.product is None or product_id != self.product.id:
self.loading = True
self.product = Product.get_product(product_type)
self.product.id = product_id
Thread(target=self.load_product_bg).start()
def load_product_bg(self):
self.agent = Agent(self.product)
self.agent.analyze_reviews(self.dl.get_reviews(self.product.id))
self.loading = False
def __init__(self, product_id):
self.framework = Framework.load_saved(product_id)
self.product = self.framework.product
def get_init_message(self):
prod_node = self.product.root
prod = self.product.argument_for_node(prod_node)
text = 'What would you like to know about the {}?'.format(prod.name)
text = ADAText('What would you like to know about the *?', [prod.name])
queries = self.get_queries(prod_node)
args = [prod.with_queries(queries)]
return ADAMessage(text, args)
@@ -56,53 +31,53 @@ class Communicator:
q_arg = self.product.argument_for_id(arg_id)
if query_id == 0:
supp_node = self.agent.get_strongest_supporting_subfeature(q_arg_node)
att_node = self.agent.get_strongest_attacking_subfeature(q_arg_node)
supp_node = self.framework.get_strongest_supporting_subfeature(q_arg_node)
att_node = self.framework.get_strongest_attacking_subfeature(q_arg_node)
supp_name = self.product.argument_for_node(supp_node).name
text = 'The {} was highly rated because the {} {} good'.format(q_arg.name, supp_name,
self.was_were(supp_node))
text = ADAText('The * was highly rated because the * {} good'.format(self.was_were(supp_node)),
[q_arg.name, supp_name])
if att_node:
att_name = self.product.argument_for_node(att_node).name
text += ', although the {} {} poor.'.format(att_name, self.was_were(att_node))
text.add(', although the * {} poor.'.format(self.was_were(att_node)), [att_name])
args = [q_arg_node, supp_node, att_node]
else:
text += '.'
text.add('.')
args = [q_arg_node, supp_node]
if query_id == 2:
supp_node = self.agent.get_strongest_supporting_subfeature(q_arg_node)
att_node = self.agent.get_strongest_attacking_subfeature(q_arg_node)
supp_node = self.framework.get_strongest_supporting_subfeature(q_arg_node)
att_node = self.framework.get_strongest_attacking_subfeature(q_arg_node)
supp_name = self.product.argument_for_node(supp_node).name
text = 'The {} was considered to be good because the {} {} good'.format(q_arg.name, supp_name,
self.was_were(supp_node))
text = ADAText('The * was considered to be good because the * {} good'.format(self.was_were(supp_node)),
[q_arg.name, supp_name])
if att_node:
att_name = self.product.argument_for_node(att_node).name
text += ', although the {} {} poor.'.format(att_name, self.was_were(att_node))
text.add(', although the * {} poor.'.format(self.was_were(att_node)), [att_name])
args = [q_arg_node, supp_node, att_node]
else:
text += '.'
text.add('.')
args = [q_arg_node, supp_node]
if query_id == 3:
supp_node = self.agent.get_strongest_supporting_subfeature(q_arg_node)
att_node = self.agent.get_strongest_attacking_subfeature(q_arg_node)
att_name = self.product.argument_for_node(att_node).name
text = 'The {} was considered to be poor because the {} {} poor'.format(q_arg.name, att_name,
self.was_were(att_node))
supp_node = self.framework.get_strongest_supporting_subfeature(q_arg_node)
supp_name = self.product.argument_for_node(supp_node).name
att_node = self.framework.get_strongest_attacking_subfeature(q_arg_node)
text = ADAText('The * was considered to be poor because the * {} poor'.format(self.was_were(supp_node)),
[q_arg.name, supp_name])
if supp_node:
supp_name = self.product.argument_for_node(supp_node).name
text += ', although the {} {} good.'.format(supp_name, self.was_were(supp_node))
args = [q_arg_node, att_node, supp_node]
att_name = self.product.argument_for_node(att_node).name
text.add(', although the * {} good.'.format(self.was_were(att_node)), [att_name])
args = [q_arg_node, supp_node, att_node]
else:
text += '.'
args = [q_arg_node, att_node]
text.add('.')
args = [q_arg_node, supp_node]
if query_id == 4 or query_id == 5:
phrase = (self.agent.best_supporting_phrase(q_arg_node) if query_id == 4
else self.agent.best_attacking_phrase(q_arg_node))
phrase = (self.framework.best_supporting_phrase(q_arg_node) if query_id == 4
else self.framework.best_attacking_phrase(q_arg_node))
while phrase[-1] == '.':
phrase = phrase[:-1]
text = '\"...{}...\"'.format(phrase)
text = ADAText('\"...*...\"', [phrase], style='QUOT')
args = [q_arg_node]
args = [self.product.argument_for_node(arg).with_queries(self.get_queries(arg)) for arg in args]
@@ -114,16 +89,16 @@ class Communicator:
base = 0 if arg.id == 0 else 2
if self.agent.liked_argument(arg_node):
if self.agent.supported_argument(arg_node):
if self.framework.liked_argument(arg_node):
if self.framework.supported_argument(arg_node):
queries.append(self.queries[base].with_argument(arg))
supp_phrase = self.agent.best_supporting_phrase(arg_node)
supp_phrase = self.framework.best_supporting_phrase(arg_node)
if supp_phrase:
queries.append(self.queries[4].with_argument(arg))
else:
if self.agent.attacked_argument(arg_node):
if self.framework.attacked_argument(arg_node):
queries.append(self.queries[base + 1].with_argument(arg))
att_phrase = self.agent.best_attacking_phrase(arg_node)
att_phrase = self.framework.best_attacking_phrase(arg_node)
if att_phrase:
queries.append(self.queries[5].with_argument(arg))
@@ -2,14 +2,17 @@ import pandas as pd
class DataLoader:
data_location = 'agent/amazon_data/reviews_for_watches.tsv'
data_location = 'agent/amazon_data/top_5_mixer_reviews_subset.tsv'
reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False)
def get_reviews(self, product_id):
return self.reviews[self.reviews['product_id'] == product_id].reset_index(drop=True)
@staticmethod
def get_reviews(product_id):
return DataLoader.reviews[DataLoader.reviews['product_id'] == product_id].reset_index(drop=True)
def get_product_name(self, product_id):
return self.get_reviews(product_id)['product_title'][0]
@staticmethod
def get_product_name(product_id):
return DataLoader.get_reviews(product_id)['product_title'][0]
def get_avg_star_rating(self, product_id):
return float(self.get_reviews(product_id)['star_rating'].mean())
@staticmethod
def get_avg_star_rating(product_id):
return float(DataLoader.get_reviews(product_id)['star_rating'].mean())
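With the loader methods now static and the TSV read once at class level, callers no longer construct a DataLoader; a brief usage sketch with a made-up product id:
reviews = DataLoader.get_reviews('B00ABC1234')        # hypothetical product_id from the TSV
name = DataLoader.get_product_name('B00ABC1234')
stars = DataLoader.get_avg_star_rating('B00ABC1234')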
from anytree import PostOrderIter
from functools import reduce
from agent.SA.bert_analyzer import BertAnalyzer
from agent.target_extraction.product import Product
from agent.review import Review
from agent.dataloader import DataLoader
import pickle
import re
class Agent:
class Framework:
HIGH_SENTIMENT_THRESHOLD = 0.99
bert_analyzer = BertAnalyzer.load_saved('agent/SA/semeval_2014_both_unmasked_6.pt')
bert_analyzer = BertAnalyzer.default()
def __init__(self, product_type, product_id):
self.product_id = product_id
self.product = Product.get_product(product_type)
self.product_node = self.product.root
self.arguments = self.product.argument_nodes
self.features = self.product.feature_nodes
def __init__(self, product):
self.product = product
self.product_node = product.root
self.arguments = product.argument_nodes
self.features = product.feature_nodes
# get reviews
review_csv = DataLoader.get_reviews(product_id)
reviews = [Review(row, self.product) for _, row in review_csv.iterrows()]
# extract augmented votes
self.extract_votes(reviews)
voting_reviews = list(filter(lambda r: r.is_voting(), reviews))
if len(voting_reviews) / len(reviews) < 0.33:
print('warning: only a small fraction of reviews generated votes')
# get aggregates
ra, self.vote_sum, self.vote_phrases = self.get_aggregates(reviews)
# get qbaf from ra
self.qbaf, self.argument_polarities = self.get_qbaf(ra, len(reviews))
# apply gradual semantics
self.strengths = self.get_strengths(self.qbaf)
# save
self.save()
# print results
self.print()
def print(self):
print('qbaf:')
print(self.qbaf)
for argument in self.arguments:
print(argument.name)
print(' strength:', self.strengths[argument])
print(' polarity:', 'positive' if self.argument_polarities[argument] else 'negative')
print(' votes:')
print(' direct: {} positive, {} negative'.format(len(self.supporting_phrases(argument)),
len(self.attacking_phrases(argument))))
print(' augmented sum: {}'.format(self.vote_sum[argument]))
def get_bert_sentiments(self, data):
return list(self.bert_analyzer.get_batch_sentiment_polarity(data))
@@ -36,7 +78,7 @@ class Agent:
for review in reviews:
for phrase in review.phrases:
for arg, sentiment in phrase.get_votes().items():
vote_phrases[arg].append({'phrase': phrase.text, 'sentiment': sentiment})
vote_phrases[arg].append({'phrase': phrase.text, 'sentiment': sentiment, 'n_args': len(phrase.args)})
for arg, sentiment in review.get_votes().items():
ra.append({'review_id': review.id, 'argument': arg, 'vote': sentiment})
vote_sum[arg] += sentiment
@@ -51,25 +93,26 @@ class Agent:
if r['argument'] == argument:
argument_sums[argument] += r['vote']
# calculate attack/support relations for camera
# calculate attack/support relations
argument_polarities = {}
supporters = {r: [] for r in self.arguments}
attackers = {r: [] for r in self.arguments}
for r in self.arguments:
argument_polarities[r] = argument_sums[r] > 0
for subf in r.children:
if argument_sums[subf] > 0:
if (argument_sums[r] > 0 and argument_sums[subf] > 0) or (argument_sums[r] < 0 and argument_sums[subf] < 0):
supporters[r].append(subf)
elif argument_sums[subf] < 0:
elif (argument_sums[r] > 0 and argument_sums[subf] < 0) or (argument_sums[r] < 0 and argument_sums[subf] > 0):
attackers[r].append(subf)
# calculate base scores for arguments: CHANGES TO INTERIM REPORT METHOD
# calculate base scores for arguments
base_strengths = {self.product_node: 0.5 + 0.5 * argument_sums[self.product_node] / review_count}
for feature in self.features:
base_strengths[feature] = abs(argument_sums[feature]) / review_count
base_scores = {arg: 0.5 + 0.5 * argument_sums[arg] / review_count for arg in self.arguments}
qbaf = {'supporters': supporters, 'attackers': attackers,
'base_strengths': base_strengths, 'base_scores': base_scores}
return qbaf
'base_strengths': base_strengths}
return qbaf, argument_polarities
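The sign-agreement rule above replaces the old test on the sub-feature's sum alone: a child now supports its parent when their vote sums point the same way and attacks it otherwise, with each argument's polarity taken from the sign of its own sum. A tiny illustration with made-up sums:
argument_sums = {'mixer': 6, 'motor': 4, 'bowl': -3}   # illustrative values only
# 'motor' has the same sign as its parent 'mixer'  -> supporter of 'mixer'
# 'bowl'  has the opposite sign                    -> attacker of 'mixer'
# argument_polarities: 'mixer' and 'motor' positive, 'bowl' negative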
@staticmethod
def combined_strength(args):
@@ -79,8 +122,8 @@
@staticmethod
def argument_strength(base_score, attacker_strengths, supporter_strengths):
attack = Agent.combined_strength(attacker_strengths)
support = Agent.combined_strength(supporter_strengths)
attack = Framework.combined_strength(attacker_strengths)
support = Framework.combined_strength(supporter_strengths)
if attack > support:
return base_score - (base_score * abs(attack - support))
elif attack < support:
@@ -91,7 +134,6 @@
# CHANGES TO INTERIM REPORT METHOD
def get_strengths(self, qbaf):
strengths = {}
scores = {}
arguments = [node for node in PostOrderIter(self.product_node)]
for argument in arguments:
attacker_strengths = []
@@ -101,36 +143,21 @@
attacker_strengths.append(strengths[child])
elif child in qbaf['supporters'][argument]:
supporter_strengths.append(strengths[child])
strengths[argument] = Agent.argument_strength(qbaf['base_strengths'][argument], attacker_strengths,
supporter_strengths)
scores[argument] = Agent.argument_strength(qbaf['base_scores'][argument], attacker_strengths, supporter_strengths)