Commit e38035ff authored by Joel Oksanen

Major refactorings, changes to agent. Implemented text highlighting in back-end.

parent 4ac66e6b
-----BEGIN RSA PRIVATE KEY-----
MIIEowIBAAKCAQEAr1eu+bmGaX3IJ+duCqPw+WIlXrvGOJuAEH/6j/HZIYTUliuEKG71OGocqvXO
/4x4BuQQx5+BA+6kWIfHcW7LzdEsUZlHDgRxO+WuRK+A6F5H5EnvlAvBSNpg4+ernabuEcnXa5f4
0+wem3t+RSOYCympAxpbuWZP33UkH5ucl8HAesztOGxUVMIj8VynET4ADpcdDwt+B2rD7OFO9QJP
hS5gcEw7lIYiOctPc2hJ3DvKFcdk7ZyER/l5g06VfnboV4OBzCLvd526eUZxrq8YLQvbTM3eu9qE
gJV1Bg+ZOLfbCZV+sKpVZL6ZyJxuVoSOjQIrXMSj0Wp9i1l1qZG9jwIDAQABAoIBAH3db1kLeBTZ
mYgrdK5mqGAbt7+dAtk3pmIxu+cAMGEDPKbfbeqoW5a3dQSzlneSmcY4iGMDeFUeRRLXxK8EXX/M
mweoA31oavAYG1RqtxoWM30IJdYb6g8l009FycdNQK+8N8qgOJnHcOEjcKyotEevDAeSMC2R59v7
9oZJ6cp0tcPEyEJzUQz8K4SzE137n0ieBpPXE/SHqRek53PGzDFG01SstaDECMSbPlcXyZw63CQs
U09r1O7CG6KcqLjgBEB+RtglNld56INLuSVMn29zTjWFzmKaLUj9A6PnTvq1UIM6hUsCNa6+2E1c
by6tZXRRujPKCMyXDFIiw8/W0cECgYEA4OeBsdVNx2/+M1nT9crLQWfbGz8YV/eAogngJhLeJEYJ
Ui2nBk1FbGRpivtfStMDRjRYzwzuCt5YvHFv+qiLFh3bFamqm5a9XnGtyqOrCB0JM1r0NJNeTc4A
+rxTuVNk0JG9cK4Ei/lKaz35vAi16pAI+NxkP4b0AJkB6X2w9HcCgYEAx5XvgZx6MQksuQQzpiWF
PVrZF4wX749iJV+LKp76cSlRZDhqKfOX/K5gKZXB0JXgKVXyifj6X5VA/rFlsY1H+daPuHrHIcRy
kUlf3EHm+m0G6c6FxpVcJtWGabJZ4uN+jpydCryrfGdFj/gIIIk/09ga1cnF6L/q6f3+AoaAPakC
gYEAlz4YEacH7x55K780sw31Mb7Nsbz0efOPJWjOu8vkZzBSBFl/ANXl4caE01nH82GDyipZrqNb
USPFOLRwQXgmUHEza5X1jTwJjQGVhbgaxBICpe58Wv7fZCMstXEAPJjAKyJW6vebRyYVzZiX4MNa
0qqZ3gEXyUDPxbcfeg10enECgYAik81MrSlWOXqwCxRPoox01rppo/G5pSOhye24Upac6EWbkVKD
NhqMwkESs0lCs9SEFMnSMXpe+OKrvVwa6JkhpZR8p3i8VuZqWUXsSNaVaDIca5UtlE7Ew5Vn1mhQ
MI2Kc4OZippDETkx+gyeZmjScMwlYbGR4z31j/Vjxp2NEQKBgAmp+i4yGQY+sstBLXDBPCNq1uEo
FMPSTbwYoz6ezVFRe5B+QnL/pW5hdH60KEez0qoQGTYkdgLW5bwTI+K2PzG3BPTPvW0eOvWRJBXF
I0Rj6KAnGdcrONYB0Mp++X4so1tJ5y2iZAPeArZcLcoXGNozSfsnuC3GKxXAJaFPC+u+
-----END RSA PRIVATE KEY-----
\ No newline at end of file
......@@ -8,10 +8,8 @@ import time
import numpy as np
from sklearn import metrics
semeval_2014_train_path = 'agent/SA/data/SemEval-2014/Laptop_Train_v2.xml'
semeval_2014_test_path = 'agent/SA/data/SemEval-2014/Laptops_Test_Gold.xml'
amazon_test_path = 'agent/SA/data/Amazon/annotated_amazon_laptop_reviews.xml'
trained_model_path = 'agent/SA/semeval_2014_2.pt'
semeval_2014_test_path = 'data/SemEval-2014/Laptops_Test_Gold.xml'
amazon_test_path = 'data/Amazon/annotated_amazon_laptop_reviews.xml'
BATCH_SIZE = 32
MAX_EPOCHS = 6
......@@ -25,19 +23,22 @@ def loss(outputs, labels):
class BertAnalyzer:
# @staticmethod
# def default():
# sa = BertAnalyzer()
# sa.load_saved(trained_model_path)
# return sa
@staticmethod
def default():
def load_saved(path):
sa = BertAnalyzer()
sa.load_saved(trained_model_path)
sa.net = TDBertNet(len(polarity_indices))
sa.net.load_state_dict(torch.load(path))
sa.net.eval()
return sa
def load_saved(self, path):
self.net = TDBertNet(len(polarity_indices))
self.net.load_state_dict(torch.load(path))
self.net.eval()
def train(self, data_file):
train_data = BertDataset.from_file(data_file)
def train(self, data_file, save_path, mask_target=False):
train_data = BertDataset.from_file(data_file, mask_target=mask_target)
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, num_workers=4,
collate_fn=generate_batch)
......@@ -71,10 +72,10 @@ class BertAnalyzer:
end = time.time()
print('Training took', end - start, 'seconds')
torch.save(self.net.state_dict(), trained_model_path)
torch.save(self.net.state_dict(), save_path)
def evaluate(self, data_file):
test_data = BertDataset.from_file(data_file)
def evaluate(self, data_file, mask_target=False):
test_data = BertDataset.from_file(data_file, mask_target=mask_target)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False, num_workers=4,
collate_fn=generate_batch)
......@@ -113,7 +114,7 @@ class BertAnalyzer:
def get_sentiment_polarity(self, text, char_from, char_to):
instance = Instance(text, char_from, char_to)
tokens, tg_from, tg_to = instance.get()
tokens, tg_from, tg_to = instance.get(mask_target=False)
text, target_indices = instance.to_tensor()
with torch.no_grad():
......@@ -147,4 +148,4 @@ class BertAnalyzer:
return -val
else:
# neutral or conflicted
return 0
\ No newline at end of file
return 0
......@@ -8,6 +8,7 @@ import re
MAX_SEQ_LEN = 128
polarity_indices = {'positive': 0, 'negative': 1, 'neutral': 2, 'conflict': 3}
tokenizer = BertTokenizer.from_pretrained(TRAINED_WEIGHTS)
MASK_TOKEN = '[MASK]'
def generate_batch(batch):
......@@ -42,12 +43,14 @@ class BertDataset(Dataset):
def __init__(self):
self.data = []
self.mask_target = False
@staticmethod
def from_file(file):
def from_file(file, mask_target=False):
dataset = BertDataset()
tree = ET.parse(file)
dataset.data = []
dataset.mask_target = mask_target
for sentence in tree.getroot():
text = sentence.find('text').text
aspect_terms = sentence.find('aspectTerms')
......@@ -71,7 +74,7 @@ class BertDataset(Dataset):
def __getitem__(self, idx):
instance, polarity_str = self.data[idx]
tokens, idx_from, idx_to = instance.get()
tokens, idx_from, idx_to = instance.get(self.mask_target)
polarity = polarity_index(polarity_str)
return {'tokens': tokens, 'from': idx_from, 'to': idx_to, 'polarity': polarity}
......@@ -84,14 +87,16 @@ class Instance:
self.char_from = char_from
self.char_to = char_to
def get(self):
def get(self, mask_target):
tokens = tokenizer.tokenize(self.text)
idx_from = token_for_char(self.char_from, self.text, tokens)
idx_to = token_for_char(self.char_to-1, self.text, tokens)
return tokens, idx_from, idx_to
idx_to = token_for_char(self.char_to-1, self.text, tokens) + 1
if mask_target:
tokens[idx_from:idx_to] = [MASK_TOKEN] * (idx_to - idx_from)
return tokens, idx_from + 1, idx_to # +1 for [CLS] token
def to_tensor(self):
tokens, idx_from, idx_to = self.get()
tokens, idx_from, idx_to = self.get(mask_target=False)
text = tokenizer.encode_plus(tokens, add_special_tokens=True, max_length=MAX_SEQ_LEN,
is_pretokenized=True, return_tensors='pt')
target_indices = torch.tensor([[[t] * HIDDEN_OUTPUT_FEATURES for t in range(idx_from, idx_to + 1)]])
......
......@@ -2034,7 +2034,7 @@ Overall a great value, Samsung is definitely the way to go.</review_body>
</sentence>
</sentences>
</review>
<review annotated="true">
<review annotated="false">
<review_id>A3150C7E8XF6MM</review_id>
<review_body>The chains were broken, but the cross is good. Wearing it next to my heart.</review_body>
<sentences>
......
......@@ -299,5 +299,5 @@ def prepare_annotated_reviews():
# prepare_reviews()
# annotate_reviews()
prepare_annotated_reviews()
\ No newline at end of file
annotate_reviews()
# prepare_annotated_reviews()
\ No newline at end of file
<?xml version="1.0" ?>
<data>
<review>
<review_id>R32MKKAKXKKW0C</review_id>
<text>I bought the same Sling pack but smaller 7x, after buying a couple more lenses I quickly grew out of it. This 9x is large, but perfect for me. I still use the smaller one too. I will take the larger one on vacations.</text>
<sentences>
<sentence>
<tokens>I bought the same Sling pack but smaller 7x , after buying a couple more lenses I quickly grew out of it .</tokens>
<annotations>
<annotation>
<range>4,5</range>
<sentiment>neutral</sentiment>
</annotation>
<annotation>
<range>15,15</range>
<sentiment>neutral</sentiment>
</annotation>
<annotation>
<range>21,21</range>
<sentiment>negative</sentiment>
</annotation>
</annotations>
</sentence>
<sentence>
<tokens>This 9x is large , but perfect for me .</tokens>
<annotations>
<annotation>
<range>0,1</range>
<sentiment>positive</sentiment>
</annotation>
</annotations>
</sentence>
<sentence>
<tokens>I still use the smaller one too .</tokens>
<annotations>
<annotation>
<range>3,5</range>
<sentiment>positive</sentiment>
</annotation>
</annotations>
</sentence>
<sentence>
<tokens>I will take the larger one on vacations .</tokens>
<annotations>
<annotation>
<range>3,5</range>
<sentiment>neutral</sentiment>
</annotation>
</annotations>
</sentence>
</sentences>
</review>
<review>
<review_id>R1S1HWFSIF618Z</review_id>
<text>Instructions were written in very disjointed English. Was expecting much more from this $100 product. Very disappointed!</text>
<sentences>
<sentence>
<tokens>Instructions were written in very disjointed English .</tokens>
<annotations>
<annotation>
<range>0,0</range>
<sentiment>negative</sentiment>
</annotation>
</annotations>
</sentence>
<sentence>
<tokens>Was expecting much more from this $ 100 product .</tokens>
<annotations>
<annotation>
<range>5,8</range>
<sentiment>negative</sentiment>
</annotation>
</annotations>
</sentence>
<sentence>
<tokens>Very disappointed !</tokens>
<annotations/>
</sentence>
</sentences>
</review>
</data>
from agent.argumentquery import ArgumentQuery
from agent.agent import Agent
from agent.target_extraction.product import Product
from threading import Thread
class ADAMessage:
    """Pair a message text with the arguments that accompany it."""

    def __init__(self, text, arguments):
        # Both values are stored exactly as given; nothing is copied or validated.
        self.text, self.arguments = text, arguments
from agent.framework import Framework
from agent.message import ADAMessage, ADAText
class Communicator:
......@@ -22,31 +14,14 @@ class Communicator:
ArgumentQuery(5, 'What did users say about the {arg} being poor?'),
]
def __init__(self, dl):
self.dl = dl
self.product = None
self.agent = None
self.loading = False
def has_loaded_product(self, product_id):
return self.product is not None and self.product.id == product_id and not self.loading
def load_product(self, product_id, product_type): # product_type e.g. 'camera'
if self.product is None or product_id != self.product.id:
self.loading = True
self.product = Product.get_product(product_type)
self.product.id = product_id
Thread(target=self.load_product_bg).start()
def load_product_bg(self):
self.agent = Agent(self.product)
self.agent.analyze_reviews(self.dl.get_reviews(self.product.id))
self.loading = False
def __init__(self, product_id):
self.framework = Framework.load_saved(product_id)
self.product = self.framework.product
def get_init_message(self):
prod_node = self.product.root
prod = self.product.argument_for_node(prod_node)
text = 'What would you like to know about the {}?'.format(prod.name)
text = ADAText('What would you like to know about the *?', [prod.name])
queries = self.get_queries(prod_node)
args = [prod.with_queries(queries)]
return ADAMessage(text, args)
......@@ -56,53 +31,53 @@ class Communicator:
q_arg = self.product.argument_for_id(arg_id)
if query_id == 0:
supp_node = self.agent.get_strongest_supporting_subfeature(q_arg_node)
att_node = self.agent.get_strongest_attacking_subfeature(q_arg_node)
supp_node = self.framework.get_strongest_supporting_subfeature(q_arg_node)
att_node = self.framework.get_strongest_attacking_subfeature(q_arg_node)
supp_name = self.product.argument_for_node(supp_node).name
text = 'The {} was highly rated because the {} {} good'.format(q_arg.name, supp_name,
self.was_were(supp_node))
text = ADAText('The * was highly rated because the * {} good'.format(self.was_were(supp_node)),
[q_arg.name, supp_name])
if att_node:
att_name = self.product.argument_for_node(att_node).name
text += ', although the {} {} poor.'.format(att_name, self.was_were(att_node))
text.add(', although the * {} poor.'.format(self.was_were(att_node)), [att_name])
args = [q_arg_node, supp_node, att_node]
else:
text += '.'
text.add('.')
args = [q_arg_node, supp_node]
if query_id == 2:
supp_node = self.agent.get_strongest_supporting_subfeature(q_arg_node)
att_node = self.agent.get_strongest_attacking_subfeature(q_arg_node)
supp_node = self.framework.get_strongest_supporting_subfeature(q_arg_node)
att_node = self.framework.get_strongest_attacking_subfeature(q_arg_node)
supp_name = self.product.argument_for_node(supp_node).name
text = 'The {} was considered to be good because the {} {} good'.format(q_arg.name, supp_name,
self.was_were(supp_node))
text = ADAText('The * was considered to be good because the * {} good'.format(self.was_were(supp_node)),
[q_arg.name, supp_name])
if att_node:
att_name = self.product.argument_for_node(att_node).name
text += ', although the {} {} poor.'.format(att_name, self.was_were(att_node))
text.add(', although the * {} poor.'.format(self.was_were(att_node)), [att_name])
args = [q_arg_node, supp_node, att_node]
else:
text += '.'
text.add('.')
args = [q_arg_node, supp_node]
if query_id == 3:
supp_node = self.agent.get_strongest_supporting_subfeature(q_arg_node)
att_node = self.agent.get_strongest_attacking_subfeature(q_arg_node)
att_name = self.product.argument_for_node(att_node).name
text = 'The {} was considered to be poor because the {} {} poor'.format(q_arg.name, att_name,
self.was_were(att_node))
supp_node = self.framework.get_strongest_supporting_subfeature(q_arg_node)
supp_name = self.product.argument_for_node(supp_node).name
att_node = self.framework.get_strongest_attacking_subfeature(q_arg_node)
text = ADAText('The * was considered to be poor because the * {} poor'.format(self.was_were(supp_node)),
[q_arg.name, supp_name])
if supp_node:
supp_name = self.product.argument_for_node(supp_node).name
text += ', although the {} {} good.'.format(supp_name, self.was_were(supp_node))
args = [q_arg_node, att_node, supp_node]
att_name = self.product.argument_for_node(att_node).name
text.add(', although the * {} good.'.format(self.was_were(att_node)), [att_name])
args = [q_arg_node, supp_node, att_node]
else:
text += '.'
args = [q_arg_node, att_node]
text.add('.')
args = [q_arg_node, supp_node]
if query_id == 4 or query_id == 5:
phrase = (self.agent.best_supporting_phrase(q_arg_node) if query_id == 4
else self.agent.best_attacking_phrase(q_arg_node))
phrase = (self.framework.best_supporting_phrase(q_arg_node) if query_id == 4
else self.framework.best_attacking_phrase(q_arg_node))
while phrase[-1] == '.':
phrase = phrase[:-1]
text = '\"...{}...\"'.format(phrase)
text = ADAText('\"...*...\"', [phrase], style='QUOT')
args = [q_arg_node]
args = [self.product.argument_for_node(arg).with_queries(self.get_queries(arg)) for arg in args]
......@@ -114,16 +89,16 @@ class Communicator:
base = 0 if arg.id == 0 else 2
if self.agent.liked_argument(arg_node):
if self.agent.supported_argument(arg_node):
if self.framework.liked_argument(arg_node):
if self.framework.supported_argument(arg_node):
queries.append(self.queries[base].with_argument(arg))
supp_phrase = self.agent.best_supporting_phrase(arg_node)
supp_phrase = self.framework.best_supporting_phrase(arg_node)
if supp_phrase:
queries.append(self.queries[4].with_argument(arg))
else:
if self.agent.attacked_argument(arg_node):
if self.framework.attacked_argument(arg_node):
queries.append(self.queries[base + 1].with_argument(arg))
att_phrase = self.agent.best_attacking_phrase(arg_node)
att_phrase = self.framework.best_attacking_phrase(arg_node)
if att_phrase:
queries.append(self.queries[5].with_argument(arg))
......
......@@ -2,14 +2,17 @@ import pandas as pd
class DataLoader:
data_location = 'agent/amazon_data/reviews_for_watches.tsv'
data_location = 'agent/amazon_data/top_5_mixer_reviews_subset.tsv'
reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False)
def get_reviews(self, product_id):
return self.reviews[self.reviews['product_id'] == product_id].reset_index(drop=True)
@staticmethod
def get_reviews(product_id):
return DataLoader.reviews[DataLoader.reviews['product_id'] == product_id].reset_index(drop=True)
def get_product_name(self, product_id):
return self.get_reviews(product_id)['product_title'][0]
@staticmethod
def get_product_name(product_id):
return DataLoader.get_reviews(product_id)['product_title'][0]
def get_avg_star_rating(self, product_id):
return float(self.get_reviews(product_id)['star_rating'].mean())
@staticmethod
def get_avg_star_rating(product_id):
return float(DataLoader.get_reviews(product_id)['star_rating'].mean())
class ADAMessage:
    """Pair a message text with the arguments that accompany it."""

    def __init__(self, text, arguments):
        # Both values are stored exactly as given; nothing is copied or validated.
        self.text, self.arguments = text, arguments
class ADAText:
    """A text template together with the arguments that belong to it.

    The class only stores and concatenates; it performs no substitution
    itself. Callers visible in this change write ``*`` in the template where
    an argument is meant to appear (presumably rendered downstream -- confirm
    against the consumer).

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, template, arguments, style='ARG'):
        """Create a text from an initial template and its arguments.

        :param template: initial template string.
        :param arguments: iterable of arguments for the template.
        :param style: style tag stored alongside the text; defaults to 'ARG'.
        """
        self.template = template
        # Take a private copy: add() extends this list in place, and without
        # the copy it would silently grow the list object owned by the caller.
        self.arguments = list(arguments)
        self.style = style

    def add(self, template_add, arguments_add=None):
        """Append a template fragment and, optionally, its arguments.

        :param template_add: string appended to the end of the template.
        :param arguments_add: arguments appended after the existing ones;
            ``None`` (the default) appends nothing.
        """
        self.template += template_add
        if arguments_add is not None:
            self.arguments.extend(arguments_add)
This diff is collapsed.
......@@ -21,20 +21,21 @@ def save_reviews(category, meta_file, review_file, output_file):
reviews.to_csv(output_file, sep='\t', index=False)
def save_top_reviewed_products(n, category, meta_file, review_file, output_file, product_title):
reviews = get_reviews(category, meta_file, review_file)
def save_top_reviewed_products(n, output_file, product_title, category=None, review_file=None, meta_file=None,
product_file=None):
if product_file:
reviews = pd.read_csv(product_file, sep='\t')
else:
reviews = get_reviews(category, meta_file, review_file)
top_reviewed = reviews.groupby(['asin'], sort=False).size().sort_values(ascending=False).head(n)
reviews = reviews[reviews['asin'].apply(lambda asin: asin in top_reviewed)]
reviews = reviews.rename(columns={'overall': 'star_rating', 'asin': 'product_id', 'reviewerID': 'review_id',
'reviewText': 'review_body'})
reviews = reviews[reviews['review_body'].apply(lambda b: b is not None and len(b) > 0)]
reviews = reviews[reviews['review_body'].apply(lambda b: not pd.isna(b) and len(b) > 0)]
reviews = reviews[reviews['star_rating'].apply(lambda r: type(r) is int or r.isdigit())]
reviews['product_title'] = product_title
reviews.to_csv(output_file, sep='\t', index=False)
# save_top_reviewed_products(3, 'Wrist Watches', 'amazon_data/meta_Clothing_Shoes_and_Jewelry.json',
# 'amazon_data/Clothing_Shoes_and_Jewelry.json', 'amazon_data/reviews_for_watches.tsv',
# 'watch')
save_reviews('Necklaces', 'agent/amazon_data/meta_Clothing_Shoes_and_Jewelry.json', 'agent/amazon_data/Clothing_Shoes_and_Jewelry.json', 'agent/target_extraction/data/verified_necklace_reviews.tsv')
save_top_reviewed_products(5, 'amazon_data/top_5_mixer_reviews.tsv', 'KitchenAid KSM150PSGR Artisan Series 5-Qt. Stand Mixer with Pouring Shield - Imperial Grey',
product_file='target_extraction/data/verified_stand_mixer_reviews.tsv')
This diff is collapsed.
......@@ -37,7 +37,7 @@ class Review:
# normalize
for arg in self.votes:
self.votes[arg] = 1 if self.votes[arg] > 0 else -1
# self.augment_votes()
self.augment_votes()
return self.votes
# augment votes (Definition 4.3) obtained for a single critic
......@@ -61,7 +61,7 @@ class Phrase:
def __init__(self, text, product):
self.product = product
self.text = text
self.tokens = [wnl.lemmatize(word.lower()) for word in word_tokenize(text)]
self.tokens = [word.lower() for word in word_tokenize(text)]
self.args = self.get_args()
self.votes = {}
......@@ -72,7 +72,8 @@ class Phrase:
while len(arguments) > 0:
arg = arguments.pop(0)
for term in self.product.glossary[arg]:
matches = [Arg(arg, start, end) for start, end in Phrase.matching_subsequences(term, self.tokens)]
matches = [Arg(arg, start, end)
for start, end in Phrase.matching_subsequences(term, self.tokens)]
if matches:
argument_matches += matches
self.remove_ancestors(arg, arguments)
......
This diff is collapsed.
......@@ -40,7 +40,7 @@ class EntityAnnotator:
print('obtaining counter...')
counter, phraser = EntityAnnotator.count_nouns(texts)
print('finished initialising annotator')
ann = EntityAnnotator(file_path, counter, phraser, name + '.pickle')
ann = EntityAnnotator(file_path, counter, phraser, 'annotators/{}.pickle'.format(name))
ann.save()
return ann
......@@ -358,5 +358,6 @@ class EntityAnnotator:
return text, rels
ea = EntityAnnotator.load_saved('example_annotator.pickle')
ea.annotate()
ea: EntityAnnotator = EntityAnnotator.load_saved('annotators/watch_annotator.pickle')
ea.save_annotated_pairs('BERT/data/annotated_watch_review_pairs.tsv')
ea.save_annotated_entities('BERT/data/annotated_watch_review_entities.tsv')
true false
3 0
3 0
3 0
3 0
3 0
3 0
3 0
\ No newline at end of file
true false
\ No newline at end of file
true false
1 2
1 2
0 3
2 1
0 3
\ No newline at end of file
true false
3 0
2 1
1 2
3 0
3 0
3 0
3 0
3 0
3 0
3 0
3 0
3 0
3 0
3 0
0 3
3 0
3 0
\ No newline at end of file
true false
3 0
3 0
1 2
2 1
3 0
3 0
\ No newline at end of file
true false
2 1
2 1
1 2
1 2
1 2
\ No newline at end of file