Commit ce393701 authored by Joel Oksanen

Improved integration between target_extractor and server

parent 64448925
......@@ -140,16 +140,17 @@ class Agent:
         return max(attacker_strengths, key=attacker_strengths.get)
 
     def liked_argument(self, argument):
-        return self.vote_sum[
-            argument] >= 0  # len(self.supporting_phrases(argument)) >= len(self.attacking_phrases(argument))
+        return self.vote_sum[argument] >= 0
+        # self.strengths[argument] > 0.5
+        # len(self.supporting_phrases(argument)) >= len(self.attacking_phrases(argument))
 
     def supported_argument(self, argument):
-        return (self.get_strongest_supporting_subfeature(argument) is not None and
-                self.strengths[self.get_strongest_supporting_subfeature(argument)] > 0)
+        supp = self.get_strongest_supporting_subfeature(argument)
+        return supp is not None and self.strengths[supp] > 0
 
     def attacked_argument(self, argument):
-        return (self.get_strongest_attacking_subfeature(argument) is not None and
-                self.strengths[self.get_strongest_attacking_subfeature(argument)] > 0)
+        att = self.get_strongest_attacking_subfeature(argument)
+        return att is not None and self.strengths[att] > 0
 
     def best_supporting_phrase(self, argument):
         phrases = {vp['phrase']: vp['sentiment'] for vp in self.supporting_phrases(argument)}
......
......@@ -88,6 +88,20 @@ class Communicator:
                 text += '.'
             args = [q_arg_node, supp_node]
 
+        if query_id == 3:
+            supp_node = self.agent.get_strongest_supporting_subfeature(q_arg_node)
+            att_node = self.agent.get_strongest_attacking_subfeature(q_arg_node)
+            att_name = self.product.argument_for_node(att_node).name
+            text = 'The {} was considered to be poor because the {} {} poor'.format(q_arg.name, att_name,
+                                                                                    self.was_were(att_name))
+            if supp_node:
+                supp_name = self.product.argument_for_node(supp_node).name
+                text += ', although the {} {} good.'.format(supp_name, self.was_were(supp_name))
+                args = [q_arg_node, att_node, supp_node]
+            else:
+                text += '.'
+                args = [q_arg_node, att_node]
+
         if query_id == 4 or query_id == 5:
             phrase = (self.agent.best_supporting_phrase(q_arg_node) if query_id == 4
                       else self.agent.best_attacking_phrase(q_arg_node))
......
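For context, a minimal standalone sketch of the sentence template the new query_id == 3 branch produces. The helper and argument names below are simplified stand-ins; the real code resolves names through the product tree and its own was_were helper:

def was_were(name):
    # naive number agreement, mirroring what the was_were helper appears to do
    return 'were' if name.endswith('s') else 'was'

def explain_poor(feature, attacker, supporter=None):
    text = 'The {} was considered to be poor because the {} {} poor'.format(
        feature, attacker, was_were(attacker))
    if supporter:
        text += ', although the {} {} good.'.format(supporter, was_were(supporter))
    else:
        text += '.'
    return text

print(explain_poor('camera', 'photos'))
# The camera was considered to be poor because the photos were poor.
print(explain_poor('battery', 'charging speed', 'capacity'))
# The battery was considered to be poor because the charging speed was poor,
# although the capacity was good.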
 import pandas as pd
 
 
 class DataLoader:
-    data_location = 'agent/amazon_data/amazon_reviews_us_Camera_v1_00.tsv'
+    data_location = 'agent/amazon_data/amazon_reviews_us_pc.tsv'
     reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False)
 
     def get_reviews(self, product_id):
......
 import pandas as pd
 
-data_location = 'amazon_data/amazon_reviews_us_Camera_v1_00.tsv'
+data_location = 'amazon_data/amazon_reviews_us_camera.tsv'
 training_data_location = 'amazon_data/reviews_trained_on.tsv'
 output_location = 'amazon_data/camera_prepared_data.tsv'
 min_reviews = 50
......
......@@ -25,13 +25,10 @@ def get_df(path):
 pd.set_option('display.max_colwidth', None)
 
 category = 'Backpacks'
-metadata = pd.read_json('amazon_data/meta_Clothing_Shoes_and_Jewelry.json', lines=True)
-for col in metadata.columns:
-    print(col)
-metadata = metadata[metadata['category'].apply(lambda cats: category in cats)]
+metadata_iter = pd.read_json('amazon_data/meta_Clothing_Shoes_and_Jewelry.json', lines=True, chunksize=1000)
+metadata = pd.concat([metadata[metadata['category'].apply(lambda cl: category in cl)] for metadata in metadata_iter])
 print(metadata['category'][:5])
 print(len(metadata.index))
 
 review_iter = pd.read_json('amazon_data/Clothing_Shoes_and_Jewelry.json', lines=True, chunksize=1000)
......
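The change above swaps a whole-file pd.read_json for a chunked read, filtering each 1000-row chunk before concatenating, so peak memory is bounded by the chunk size rather than the file size. A minimal sketch of the pattern (paths and category taken from the script):

import pandas as pd

category = 'Backpacks'
meta_path = 'amazon_data/meta_Clothing_Shoes_and_Jewelry.json'

# stream the JSON-lines file in 1000-row chunks, keeping only matching rows
chunk_iter = pd.read_json(meta_path, lines=True, chunksize=1000)
metadata = pd.concat(
    chunk[chunk['category'].apply(lambda cats: category in cats)]
    for chunk in chunk_iter
)
print(len(metadata.index))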
 import pandas as pd
 
-data_location = 'amazon_reviews_us_Camera_v1_00.tsv'
+data_location = 'amazon_reviews_us_camera.tsv'
 training_data_location = 'reviews_trained_on.tsv'
 min_characters = 50
 
 reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False)
......
 import re
-from nltk.tokenize import sent_tokenize
+from nltk.tokenize import word_tokenize, sent_tokenize
 from agent.SA.bert_dataset import MAX_SEQ_LEN
 from anytree import PostOrderIter
+from nltk.stem import WordNetLemmatizer
+
+wnl = WordNetLemmatizer()
class Review:
......@@ -34,7 +37,7 @@ class Review:
         # normalize
         for arg in self.votes:
             self.votes[arg] = 1 if self.votes[arg] > 0 else -1
-        self.augment_votes()
+        # self.augment_votes()
         return self.votes
 
     # augment votes (Definition 4.3) obtained for a single critic
......@@ -58,20 +61,21 @@ class Phrase:
     def __init__(self, text, product):
         self.product = product
         self.text = text
-        self.args = self.get_args(text)
+        self.tokens = [wnl.lemmatize(word.lower()) for word in word_tokenize(text)]
+        self.args = self.get_args()
         self.votes = {}
 
     # get argument(s) that match phrase
-    def get_args(self, phrase):
+    def get_args(self):
         argument_matches = []
         arguments = [node for node in PostOrderIter(self.product.root)]
         while len(arguments) > 0:
-            f = arguments.pop(0)
-            for word in self.product.glossary[f]:
-                matches = [Arg(f, m.start(), m.end()) for m in re.finditer(word, phrase)]
+            arg = arguments.pop(0)
+            for term in self.product.glossary[arg]:
+                matches = [Arg(arg, start, end) for start, end in Phrase.matching_subsequences(term, self.tokens)]
                 if matches:
                     argument_matches += matches
-                    self.remove_ancestors(f, arguments)
+                    self.remove_ancestors(arg, arguments)
                     break
         return argument_matches
......@@ -97,6 +101,15 @@ class Phrase:
             self.votes[arg.node] = arg.sentiment
         return self.votes
 
+    @staticmethod
+    def matching_subsequences(l_sub, l):
+        sub_idxs = []
+        len_sub = len(l_sub)
+        for i in range(len(l)):
+            if l[i:i+len_sub] == l_sub:
+                sub_idxs.append((i, i+len_sub))
+        return sub_idxs
+
 
 class Arg:
......
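A self-contained sketch (made-up sentence, same NLTK stack) of why get_args now matches glossary terms as lemmatized token subsequences via matching_subsequences instead of re.finditer: substring search fires inside unrelated words, whole-token search does not, and lemmatization also catches plural forms:

import re
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

wnl = WordNetLemmatizer()

def matching_subsequences(l_sub, l):
    # (start, end) pairs of every occurrence of l_sub as a contiguous run in l
    return [(i, i + len(l_sub)) for i in range(len(l)) if l[i:i + len(l_sub)] == l_sub]

text = 'Great storage capacity and two solid lens caps'
tokens = [wnl.lemmatize(w.lower()) for w in word_tokenize(text)]
# tokens: ['great', 'storage', 'capacity', 'and', 'two', 'solid', 'lens', 'cap']

print(len(list(re.finditer('cap', text.lower()))))  # 2 -- also fires inside 'capacity'
print(matching_subsequences(['cap'], tokens))       # [(7, 8)] -- the real 'caps' only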
......@@ -12,7 +12,7 @@ import readchar
 from sty import fg, bg, ef, rs
 from wcwidth import wcswidth
 
-data_location = 'amazon_data/amazon_reviews_us_PC_v1_00.tsv'
+data_location = 'amazon_data/amazon_reviews_us_pc.tsv'
 selected_reviews_location = 'pc_reviews_to_be_annotated.xml'
 min_characters = 0
 max_characters = 200
......
......@@ -12,9 +12,10 @@ class Product:
         self.root = root
         self.feature_nodes = [n for n in root.descendants]
         self.argument_nodes = [root] + self.feature_nodes
-        self.glossary = {a_node: syns for a, syns in syn_dict.items() for a_node in self.argument_nodes
-                         if a_node.name == a}
-        self.arguments = {a_node: Argument(a_idx, a_node.name) for a_idx, a_node in enumerate(self.argument_nodes)}
+        self.glossary = {a_node: [syn.split('_') for syn in syns]
+                         for a, syns in syn_dict.items() for a_node in self.argument_nodes if a_node.name == a}
+        self.arguments = {a_node: Argument(a_idx, a_node.name.replace('_', ' '))
+                          for a_idx, a_node in enumerate(self.argument_nodes)}
 
     def argument_node_for_id(self, id):
         return self.argument_nodes[id]
......
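The glossary now stores each synonym as a list of tokens, splitting on the '_' that the phraser inserts into multi-word terms; this is exactly the shape Phrase.matching_subsequences compares against a tokenized sentence, while display names get the underscores turned back into spaces. A tiny illustration with a hypothetical syn_dict entry:

syn_dict = {'battery': ['battery', 'battery_life']}

glossary_terms = [syn.split('_') for syn in syn_dict['battery']]
print(glossary_terms)                     # [['battery'], ['battery', 'life']]
print('battery_life'.replace('_', ' '))   # 'battery life' -- human-readable name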
......@@ -50,20 +50,26 @@ class TargetExtractor:
     SYNONYM_SIMILARITY = 0.1
 
     # parent is a TargetExtractor of a parent category, e.g. electronics > camera
-    def __init__(self, product, texts, parent=None):
+    def __init__(self, product, texts, parent_texts=None, phraser=None):
         self.product = product
-        self.parent = parent
 
         print('tokenizing phrases...')
         # tokenize and normalize phrases
         self.phrases = [[w.lower() for w in word_tokenize(phrase.replace('_', ' '))]
                         for text in texts for phrase in sent_tokenize(text)]
 
-        print('obtaining bigrams...')
-        # train bigram map
-        bigram = Phrases(self.phrases, threshold=TargetExtractor.PHRASE_THRESHOLD)
-        trigram = Phrases(bigram[self.phrases], threshold=TargetExtractor.PHRASE_THRESHOLD)
-        self.phraser = Phraser(trigram)
+        if phraser is None:
+            print('obtaining n-grams...')
+            # train bigram map
+            bigram = Phrases(self.phrases, threshold=TargetExtractor.PHRASE_THRESHOLD)
+            trigram = Phrases(bigram[self.phrases], threshold=TargetExtractor.PHRASE_THRESHOLD)
+            self.phraser = Phraser(trigram)
+        else:
+            self.phraser = phraser
+
+        if parent_texts is not None:
+            print('analysing parent reviews...')
+            self.parent = TargetExtractor(product=None, texts=parent_texts, phraser=self.phraser)
 
         print('counting terms...')
         # count terms
......@@ -71,7 +77,7 @@ class TargetExtractor:
         self.total_count = sum(self.counter.values())
 
     def save_product_representation(self):
-        f = open(Product.FILE_DIR + self.product + Product.FILE_EXTENSION, 'wb')
+        f = open('extracted_products/' + self.product + Product.FILE_EXTENSION, 'wb')
         p = Product(self.tree, self.syn_dict)
         pickle.dump(p, f)
         f.close()
......@@ -105,7 +111,7 @@ class TargetExtractor:
         print('extracting aspect tree...')
         # extract aspect tree
-        self.tree = self.get_product_tree2()
+        self.tree = self.get_product_tree()
 
         return self.tree, self.syn_dict
......@@ -179,7 +185,9 @@ class TargetExtractor:
         nouns = []
         for phrase in self.phrases:
             pos_tags = pos_tag(phrase)
-            bigrams = [re.sub('_*' + self.product + '_*', '', bigram) if bigram != self.product else bigram
+            bigrams = [re.sub('_*' + self.product + '_*', '', bigram)
+                       if self.product and bigram != self.product
+                       else bigram
                        for bigram in self.phraser[phrase]]
             word_idx = 0
             for token in bigrams:
......@@ -189,13 +197,13 @@ class TargetExtractor:
                     has_noun = any(TargetExtractor.is_noun(pos_tags[i]) for i in word_range)
                     all_terms_valid = all(TargetExtractor.is_valid_term(pos_tags[i]) for i in word_range)
                     if has_noun and all_terms_valid:
-                        nouns.append(TargetExtractor.singular(token))
+                        nouns.append(token)
                     word_idx += len(words)
                 else:
                     is_noun = TargetExtractor.is_noun(pos_tags[word_idx])
                     is_valid = TargetExtractor.is_valid_term(pos_tags[word_idx])
                     if len(token) > 1 and is_noun and is_valid:
-                        nouns.append(TargetExtractor.singular(token))
+                        nouns.append(token)
                     word_idx += 1
 
         return Counter(nouns)
......@@ -279,42 +287,7 @@ class TargetExtractor:
         return root
 
     # product has to be at idx 0
     # targets have to be sorted in descending order based on counts (excluding product)
-    @staticmethod
-    def get_product_tree(dep_matrix, targets):
-        remaining_targets = [idx for idx in range(len(targets))]
-        root = Node(targets[remaining_targets.pop(0)])
-
-        n_null = 3
-        dependencies = [None] * n_null + [TargetExtractor.get_significant_dependence(
-            idx, dep_matrix, ignore_idx=[0]+list(range(idx+1, len(targets)))) for idx in range(n_null, len(targets))]
-        print(dependencies)
-
-        while remaining_targets:
-            idx = remaining_targets.pop(0)
-            t_node = Node(targets[idx], parent=root)
-            t_node.idx = idx
-            dependants = [(d_idx, dep[1]) for d_idx, dep in enumerate(dependencies) if dep and dep[0] == idx]
-            print(t_node, [targets[i] for i, _ in dependants])
-            for d_idx, _ in sorted(dependants, key=lambda p: p[1], reverse=True):
-                if d_idx not in remaining_targets:
-                    continue
-                # parent = root
-                # if not t_node.children or any(TargetExtractor.are_correlated(d_idx, c.idx, dep_matrix, ignore_idx=[0, idx]) for c in t_node.children):
-                #     parent = t_node
-                d_node = Node(targets[d_idx], parent=t_node)
-                d_node.idx = d_idx
-                remaining_targets.remove(d_idx)
-
-        # for idx, t in enumerate(targets):
-        #     if idx == 0:
-        #         continue
-        #     dep_idx =
-        #     parent = next((d for d in root.descendants if d.idx == dep_idx), root)
-        #     node = Node(t, parent=parent)
-        #     node.idx = idx
-
-        return root
-
-    def get_product_tree2(self):
+    def get_product_tree(self):
         root = Node(self.aspects[0])
         for idx in range(1, TargetExtractor.N_DIRECT_FEATURES + 1):
             node = Node(self.aspects[idx], parent=root)
......@@ -500,16 +473,16 @@ class Synset:
         return None
 
 
-# electronics_texts = obtain_texts('data/electronics_reviews.tsv', 'review_body')[:300000]
-# electronics_extractor = TargetExtractor('device', electronics_texts)
+# parent_texts = obtain_texts('data/electronics_reviews.tsv', 'review_body')[:300000]
 # texts = obtain_texts('data/verified_laptop_reviews.tsv', 'reviewText')
-# extractor = TargetExtractor('laptop', texts, parent=electronics_extractor)
+# extractor = TargetExtractor(product='laptop', texts=texts, parent_texts=parent_texts)
 # tree, syns = extractor.get_tree_and_synonyms()
 # print(RenderTree(tree))
 # extractor.save()
 
-extractor: TargetExtractor = TargetExtractor.load_saved('camera')
+extractor: TargetExtractor = TargetExtractor.load_saved('laptop')
 extractor.save_product_representation()
 
 # tree, syns = extractor.get_tree_and_synonyms()
 # print(RenderTree(tree))
 # print(extractor.syn_dict)
......
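With parent_texts, the extractor now builds its parent internally and reuses the child's already-trained Phraser, so the parent and child corpora are segmented into the same n-grams. A minimal sketch of the underlying gensim Phrases/Phraser pattern on a toy corpus (the real code uses TargetExtractor.PHRASE_THRESHOLD and adds a trigram pass):

from gensim.models.phrases import Phrases, Phraser

sentences = [['battery', 'life', 'is', 'great'],
             ['the', 'battery', 'life', 'was', 'poor'],
             ['great', 'camera', 'overall'],
             ['the', 'screen', 'is', 'sharp']] * 5

bigram = Phrases(sentences, min_count=5, threshold=1)
phraser = Phraser(bigram)  # frozen model: cheap to apply and to hand to a second extractor

print(phraser[['battery', 'life', 'is', 'great']])
# ['battery_life', 'is', 'great'] -- only 'battery life' co-occurs often enough
# to clear the threshold in this toy corpus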
......@@ -4,7 +4,7 @@ import random
 from nltk import classify, NaiveBayesClassifier
 import pickle
 
-data_location = 'amazon_reviews_us_Camera_v1_00.tsv'
+data_location = 'amazon_reviews_us_camera.tsv'
 selected_reviews_output_location = 'reviews_trained_on.tsv'
 classifier_location = 'camera_review_classifier.pickle'
 min_characters = 0
......
......@@ -18,7 +18,6 @@ def product(request):
     product_type = request.GET.get('type', '')
 
     if not communicators:
-        print(1)
         communicators.append(Communicator(dl))
     communicator = communicators[0]
......