Commit 64448925 authored by Joel Oksanen
Browse files

Target extraction integrated to server

parent ef388c59
import re
from agent.review_tokenizer import ReviewTokenizer
from anytree import PostOrderIter
import pickle
from agent.argument import *
from functools import reduce
from agent.SA.bert_analyzer import BertAnalyzer
from agent.review import Review
......@@ -10,23 +6,13 @@ from agent.review import Review
class Agent:
review_tokenizer = ReviewTokenizer()
bert_analyzer = BertAnalyzer.default()
def __init__(self):
# load classifier
f = open('agent/camera_review_classifier.pickle', 'rb')
self.classifier = pickle.load(f)
f.close()
# analyze sentiment
def get_bayes_sentiment(self, phrase):
# get classification
tokens = self.review_tokenizer.tokenize_review(phrase)
prob_classification = self.classifier.prob_classify(dict([token, True] for token in tokens))
classification = prob_classification.max()
strength = (prob_classification.prob(classification) - 0.5) * 2
return strength if classification == '+' else -strength
def __init__(self, product):
self.product = product
self.product_node = product.root
self.arguments = product.argument_nodes
self.features = product.feature_nodes
def get_bert_sentiments(self, data):
return list(self.bert_analyzer.get_batch_sentiment_polarity(data))
......@@ -39,20 +25,14 @@ class Agent:
for review in reviews:
for phrase in review.phrases:
bayes_sentiment = self.get_bayes_sentiment(phrase.text)
for arg in phrase.args:
sentiment = sentiments.pop(0)
print(phrase.text)
print('arg:', arg.start, '-', arg.end)
print('bert:', sentiment)
print('bayes:', bayes_sentiment)
arg.set_sentiment(sentiment)
@staticmethod
def get_aggregates(reviews):
def get_aggregates(self, reviews):
ra = []
vote_sum = {arg: 0 for arg in arguments}
vote_phrases = {arg: [] for arg in arguments}
vote_sum = {arg: 0 for arg in self.arguments}
vote_phrases = {arg: [] for arg in self.arguments}
for review in reviews:
for phrase in review.phrases:
for arg, sentiment in phrase.get_votes().items():
......@@ -62,20 +42,19 @@ class Agent:
vote_sum[arg] += sentiment
return ra, vote_sum, vote_phrases
@staticmethod
def get_qbaf(ra, review_count):
def get_qbaf(self, ra, review_count):
# sums of all positive and negative votes for arguments
argument_sums = {}
for argument in arguments:
for argument in self.arguments:
argument_sums[argument] = 0
for r in ra:
if r['argument'] == argument:
argument_sums[argument] += r['vote']
# calculate attack/support relations for camera
supporters = {r: [] for r in arguments}
attackers = {r: [] for r in arguments}
for r in arguments:
supporters = {r: [] for r in self.arguments}
attackers = {r: [] for r in self.arguments}
for r in self.arguments:
for subf in r.children:
if argument_sums[subf] > 0:
supporters[r].append(subf)
......@@ -83,22 +62,23 @@ class Agent:
attackers[r].append(subf)
# calculate base scores for arguments
base_scores = {}
base_scores[camera] = 0.5 + 0.5 * argument_sums[camera] / review_count
for feature in features:
base_scores = {self.product_node: 0.5 + 0.5 * argument_sums[self.product_node] / review_count}
for feature in self.features:
base_scores[feature] = abs(argument_sums[feature]) / review_count
qbaf = {'supporters': supporters, 'attackers': attackers, 'base_scores': base_scores}
return qbaf
def combined_strength(self, args):
@staticmethod
def combined_strength(args):
if len(args) != 0:
return 1 - reduce(lambda x, y: x * y, map(lambda v: 1 - v, args))
return 0
def argument_strength(self, base_score, attacker_strengths, supporter_strengths):
attack = self.combined_strength(attacker_strengths)
support = self.combined_strength(supporter_strengths)
@staticmethod
def argument_strength(base_score, attacker_strengths, supporter_strengths):
attack = Agent.combined_strength(attacker_strengths)
support = Agent.combined_strength(supporter_strengths)
if attack > support:
return base_score - (base_score * abs(attack - support))
elif attack < support:
......@@ -108,7 +88,7 @@ class Agent:
# apply DF-QUAD gradual semantics to qbaf
def get_strengths(self, qbaf):
strengths = {}
arguments = [node for node in PostOrderIter(camera)]
arguments = [node for node in PostOrderIter(self.product_node)]
for argument in arguments:
attacker_strengths = []
supporter_strengths = []
......@@ -117,19 +97,19 @@ class Agent:
attacker_strengths.append(strengths[child])
elif child in qbaf['supporters'][argument]:
supporter_strengths.append(strengths[child])
strengths[argument] = self.argument_strength(qbaf['base_scores'][argument], attacker_strengths,
supporter_strengths)
strengths[argument] = Agent.argument_strength(qbaf['base_scores'][argument], attacker_strengths,
supporter_strengths)
return strengths
def analyze_reviews(self, csv):
reviews = [Review(row) for _, row in csv.iterrows()]
reviews = [Review(row, self.product) for _, row in csv.iterrows()]
# extract augmented votes
self.extract_votes(reviews)
voting_reviews = list(filter(lambda r: r.is_voting(), reviews))
if len(voting_reviews) / len(reviews) < 0.33:
print('warning: only a small fraction of reviews generated votes')
# get aggregates
ra, self.vote_sum, self.vote_phrases = Agent.get_aggregates(reviews)
ra, self.vote_sum, self.vote_phrases = self.get_aggregates(reviews)
# get qbaf from ra
self.qbaf = self.get_qbaf(ra, len(reviews))
# apply gradual semantics
......@@ -140,7 +120,7 @@ class Agent:
print('strengths:')
print(self.strengths)
print('votes:')
for argument in arguments:
for argument in self.arguments:
print(argument, 'direct: {} positive, {} negative'.format(len(self.supporting_phrases(argument)),
len(self.attacking_phrases(argument))))
print(argument, 'augmented sum: {}'.format(self.vote_sum[argument]))
......@@ -164,11 +144,11 @@ class Agent:
argument] >= 0 # len(self.supporting_phrases(argument)) >= len(self.attacking_phrases(argument))
def supported_argument(self, argument):
return (self.get_strongest_supporting_subfeature(argument) != None and
return (self.get_strongest_supporting_subfeature(argument) is not None and
self.strengths[self.get_strongest_supporting_subfeature(argument)] > 0)
def attacked_argument(self, argument):
return (self.get_strongest_attacking_subfeature(argument) != None and
return (self.get_strongest_attacking_subfeature(argument) is not None and
self.strengths[self.get_strongest_attacking_subfeature(argument)] > 0)
def best_supporting_phrase(self, argument):
......
from anytree import Node
# Hard-coded feature tree for the camera product. `camera` is the root; every
# other node is a feature attached to it, either directly or via `lens`.
camera = Node('camera')
images = Node('images', parent=camera)
video = Node('video', parent=camera)
battery = Node('battery', parent=camera)
flash = Node('flash', parent=camera)
audio = Node('audio', parent=camera)
price = Node('price', parent=camera)
shipping = Node('shipping', parent=camera)
lens = Node('lens', parent=camera)
# zoom and autofocus are sub-features of the lens, not of the camera itself.
zoom = Node('zoom', parent=lens)
af = Node('autofocus', parent=lens)
# All nodes, root first. NOTE(review): code that looks nodes up by integer id
# appears to rely on this ordering (root at index 0) - confirm before reordering.
arguments = [camera, images, video, battery, flash, audio, price, shipping, lens, zoom, af]
# Same list without the root product node.
features = [images, video, battery, flash, audio, price, shipping, lens, zoom, af]
# Maps each node to the terms that count as a mention of it.
# NOTE(review): terms look like they are matched as substrings/regex patterns
# against phrase text (e.g. 'ship' would also cover 'shipping'/'shipped') -
# confirm against the phrase-matching code before editing entries.
glossary = {
camera: ['camera', 'device', 'product'],
# ' pic ' is padded with spaces, presumably so it does not match inside longer words.
images: ['image', 'picture', ' pic '],
video: ['video'],
battery: ['battery'],
flash: ['flash'],
audio: ['audio', 'sound'],
price: ['price', 'value', 'cost'],
shipping: ['ship'],
lens: ['lens'],
zoom: ['zoom'],
af: ['autofocus', 'auto-focus']
}
class Argument:
    """An argument (product or feature) together with the queries offered for it.

    Attributes:
        id: identifier of the argument.
        name: display name of the argument.
        queries: queries attached to the argument; empty on construction.
    """

    def __init__(self, id, name):
        # `id` shadows the builtin, but it is part of the public signature
        # and is therefore kept as-is.
        self.id = id
        self.name = name
        self.queries = []

    def with_queries(self, queries):
        """Return a new Argument with the same id and name carrying `queries`.

        The receiver is left untouched; `queries` is stored by reference,
        not copied.
        """
        arg = Argument(self.id, self.name)
        arg.queries = queries
        return arg

    def withQueries(self, queries):
        """Backward-compatible camelCase alias for `with_queries`."""
        return self.with_queries(queries)
......@@ -4,7 +4,7 @@ class ArgumentQuery:
self.queryID = queryID
self.text = text
def withArgument(self, argument):
def with_argument(self, argument):
query = ArgumentQuery(self.queryID, self.text)
query.argumentID = argument.id
query.text = query.text.format(arg=argument.name)
......
from agent.argumentquery import ArgumentQuery
from agent.argument import *
from agent.agent import Agent
from agent.target_extraction.product import Product
import inflect
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from threading import Thread
class ADAMessage:
......@@ -10,6 +12,7 @@ class ADAMessage:
self.text = text
self.arguments = arguments
class Communicator:
queries = [
......@@ -21,43 +24,51 @@ class Communicator:
ArgumentQuery(5, 'What did users say about the {arg} being poor?'),
]
agent = Agent()
inflect = inflect.engine()
stemmer = SnowballStemmer("english")
wnl = WordNetLemmatizer()
def __init__(self, dl):
self.dl = dl
self.product_id = None
self.product = None
self.agent = None
self.loading = False
def has_loaded_product(self, product_id):
return self.product_id == product_id
return self.product is not None and self.product.id == product_id and not self.loading
def load_product(self, product_id):
self.product_id = product_id
self.arguments = {arguments[i] : Argument(i, arguments[i].name) for i in range(len(arguments))}
self.argument_nodes = arguments
self.agent.analyze_reviews(self.dl.get_reviews(self.product_id))
def load_product(self, product_id, product_type): # product_type e.g. 'camera'
if self.product is None or product_id != self.product.id:
self.loading = True
self.product = Product.get_product(product_type)
self.product.id = product_id
Thread(target=self.load_product_bg).start()
def load_product_bg(self):
self.agent = Agent(self.product)
self.agent.analyze_reviews(self.dl.get_reviews(self.product.id))
self.loading = False
def get_init_message(self):
prod_node = self.argument_nodes[0]
prod = self.arguments[prod_node]
prod_node = self.product.root
prod = self.product.argument_for_node(prod_node)
text = 'What would you like to know about the {}?'.format(prod.name)
queries = self.get_queries(prod_node)
args = [prod.withQueries(queries)]
args = [prod.with_queries(queries)]
return ADAMessage(text, args)
def get_response(self, query_id, arg_id):
q_arg_node = self.argument_nodes[arg_id]
q_arg = self.arguments[q_arg_node]
q_arg_node = self.product.argument_node_for_id(arg_id)
q_arg = self.product.argument_for_id(arg_id)
if query_id == 0:
supp_node = self.agent.get_strongest_supporting_subfeature(q_arg_node)
att_node = self.agent.get_strongest_attacking_subfeature(q_arg_node)
text = 'The {} was highly rated because the {} {} good'.format(
q_arg.name, self.arguments[supp_node].name, self.was_were(self.arguments[supp_node]))
supp_name = self.product.argument_for_node(supp_node).name
text = 'The {} was highly rated because the {} {} good'.format(q_arg.name, supp_name,
self.was_were(supp_name))
if att_node:
text += ', although the {} {} poor.'.format(
self.arguments[att_node].name, self.was_were(self.arguments[att_node]))
att_name = self.product.argument_for_node(att_node).name
text += ', although the {} {} poor.'.format(att_name, self.was_were(att_name))
args = [q_arg_node, supp_node, att_node]
else:
text += '.'
......@@ -66,47 +77,48 @@ class Communicator:
if query_id == 2:
supp_node = self.agent.get_strongest_supporting_subfeature(q_arg_node)
att_node = self.agent.get_strongest_attacking_subfeature(q_arg_node)
text = 'The {} was considered to be good because the {} {} good'.format(
q_arg.name, self.arguments[supp_node].name, self.was_were(self.arguments[supp_node]))
supp_name = self.product.argument_for_node(supp_node).name
text = 'The {} was considered to be good because the {} {} good'.format(q_arg.name, supp_name,
self.was_were(supp_name))
if att_node:
text += ', although the {} {} poor.'.format(
self.arguments[att_node].name, self.was_were(self.arguments[att_node]))
att_name = self.product.argument_for_node(att_node).name
text += ', although the {} {} poor.'.format(att_name, self.was_were(att_name))
args = [q_arg_node, supp_node, att_node]
else:
text += '.'
args = [q_arg_node, supp_node]
if query_id == 4 or query_id == 5:
phrase = self.agent.best_supporting_phrase(q_arg_node) if query_id == 4 else self.agent.best_attacking_phrase(q_arg_node)
phrase = (self.agent.best_supporting_phrase(q_arg_node) if query_id == 4
else self.agent.best_attacking_phrase(q_arg_node))
while phrase[-1] == '.':
phrase = phrase[:-1]
text = '\"...{}...\"'.format(phrase)
args = [q_arg_node]
args = [self.arguments[arg].withQueries(self.get_queries(arg)) for arg in args]
args = [self.product.argument_for_node(arg).with_queries(self.get_queries(arg)) for arg in args]
return ADAMessage(text, args)
def get_queries(self, arg_node):
arg = self.arguments[arg_node]
arg = self.product.argument_for_node(arg_node)
queries = []
base = 0 if arg.id == 0 else 2
if self.agent.liked_argument(arg_node):
if self.agent.supported_argument(arg_node):
queries.append(self.queries[base].withArgument(arg))
queries.append(self.queries[base].with_argument(arg))
supp_phrase = self.agent.best_supporting_phrase(arg_node)
if supp_phrase:
queries.append(self.queries[4].withArgument(arg))
queries.append(self.queries[4].with_argument(arg))
else:
if self.agent.attacked_argument(arg_node):
queries.append(self.queries[base + 1].withArgument(arg))
queries.append(self.queries[base + 1].with_argument(arg))
att_phrase = self.agent.best_attacking_phrase(arg_node)
if att_phrase:
queries.append(self.queries[5].withArgument(arg))
queries.append(self.queries[5].with_argument(arg))
return queries
def was_were(self, arg):
return 'was' if self.stemmer.stem(arg.name) == arg.name else 'were'
def was_were(self, term):
return 'was' if self.wnl.lemmatize(term) == term else 'were'
from anytree import Node
# Hard-coded feature tree for the camera product. `camera` is the root; every
# other node is a feature attached to it, either directly or via `lens`.
camera = Node('camera')
image = Node('image', parent=camera)
video = Node('video', parent=camera)
battery = Node('battery', parent=camera)
flash = Node('flash', parent=camera)
audio = Node('audio', parent=camera)
price = Node('price', parent=camera)
shipping = Node('shipping', parent=camera)
lens = Node('lens', parent=camera)
# zoom and af are sub-features of the lens, not of the camera itself.
zoom = Node('zoom', parent=lens)
af = Node('af', parent=lens)
# All nodes, root first.
reviewables = [camera, image, video, battery, flash, audio, price, shipping, lens, zoom, af]
# Same list without the root product node.
features = [image, video, battery, flash, audio, price, shipping, lens, zoom, af]
# Maps a node to the terms that count as a mention of it.
# NOTE(review): lens, zoom and af have no entry here, so `glossary[node]`
# raises KeyError for them - confirm that consumers only iterate the values.
glossary = {
camera: ['camera', 'device', 'product'],
# ' pic ' is padded with spaces, presumably so it does not match inside longer words.
image: ['image', 'picture', ' pic '],
video: ['video'],
battery: ['battery'],
flash: ['flash'],
audio: ['audio', 'sound'],
price: ['price', 'value', 'cost'],
shipping: ['ship']
}
......@@ -24,8 +24,8 @@ def get_df(path):
pd.set_option('display.max_colwidth', None)
category = 'Laptops'
metadata = pd.read_json('amazon_data/meta_Electronics.json', lines=True)# get_df('amazon_data/meta_Electronics.json.gz')
category = 'Backpacks'
metadata = pd.read_json('amazon_data/meta_Clothing_Shoes_and_Jewelry.json', lines=True)
for col in metadata.columns:
print(col)
......@@ -34,12 +34,12 @@ metadata = metadata[metadata['category'].apply(lambda cats: category in cats)]
print(metadata['category'][:5])
print(len(metadata.index))
review_iter = pd.read_json('amazon_data/Electronics.json', lines=True, chunksize=1000)
review_iter = pd.read_json('amazon_data/Clothing_Shoes_and_Jewelry.json', lines=True, chunksize=1000)
reviews = pd.concat([reviews[reviews['asin'].isin(metadata['asin'])] for reviews in review_iter])
print(len(reviews.index))
reviews.to_csv('target_extraction/data/verified_laptop_reviews.tsv', sep='\t', index=False)
reviews.to_csv('target_extraction/data/verified_backpack_reviews.tsv', sep='\t', index=False)
# child_product = 'speaker'
# reviews = pd.read_csv('amazon_data/amazon_reviews_us_Electronics_v1_00.tsv.gz', sep='\t', error_bad_lines=False,
......
......@@ -2,29 +2,29 @@ import re
from nltk.tokenize import sent_tokenize
from agent.SA.bert_dataset import MAX_SEQ_LEN
from anytree import PostOrderIter
from agent.argument import *
class Review:
SENTIMENT_THRESHOLD = 0.95
PHRASE_MAX_WORDS = MAX_SEQ_LEN * 0.3
def __init__(self, data):
def __init__(self, data, product):
self.product = product
self.id = data['review_id']
self.body = data['review_body']
self.phrases = Review.extract_phrases(self.body)
self.phrases = Review.extract_phrases(self.body, product)
self.votes = {}
# extract phrases
@staticmethod
def extract_phrases(review_body):
def extract_phrases(review_body, product):
sentences = sent_tokenize(review_body)
texts = []
for sentence in sentences:
texts += re.split(' but | although | though | otherwise | however | unless | whereas | despite |<br />',
sentence)
texts = filter(lambda t: len(t.split()) < Review.PHRASE_MAX_WORDS, texts)
phrases = [Phrase(text) for text in texts]
phrases = [Phrase(text, product) for text in texts]
return phrases
def get_votes(self):
......@@ -39,7 +39,7 @@ class Review:
# augment votes (Definition 4.3) obtained for a single critic
def augment_votes(self):
arguments = [node for node in PostOrderIter(camera)]
arguments = [node for node in PostOrderIter(self.product.root)]
for argument in arguments:
if argument not in self.votes:
polar_sum = 0
......@@ -55,7 +55,8 @@ class Review:
class Phrase:
def __init__(self, text):
def __init__(self, text, product):
self.product = product
self.text = text
self.args = self.get_args(text)
self.votes = {}
......@@ -63,10 +64,10 @@ class Phrase:
# get argument(s) that match phrase
def get_args(self, phrase):
argument_matches = []
arguments = [node for node in PostOrderIter(camera)]
arguments = [node for node in PostOrderIter(self.product.root)]
while len(arguments) > 0:
f = arguments.pop(0)
for word in glossary[f]:
for word in self.product.glossary[f]:
matches = [Arg(f, m.start(), m.end()) for m in re.finditer(word, phrase)]
if matches:
argument_matches += matches
......@@ -76,7 +77,7 @@ class Phrase:
# remove all ancestors of node in list l
def remove_ancestors(self, node, l):
if node.parent != None:
if node.parent is not None:
try:
l.remove(node.parent)
except ValueError:
......
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from agent.item import glossary
import string
class ReviewTokenizer:
def __init__(self, product):
self.flat_glossary = [val for sublist in list(product.glossary.values()) for val in sublist]
tokenizer = TweetTokenizer()
stop_words = stopwords.words('english')
flat_glossary = [val for sublist in list(glossary.values()) for val in sublist]
def tokenize_review(self, review):
return self.reduce_noise(self.tokenizer.tokenize(review))
......
class Argument:
    """A product or feature argument and the queries that can be asked about it."""

    def __init__(self, id, name):
        # A freshly built argument starts with no queries attached.
        self.id, self.name, self.queries = id, name, []

    def with_queries(self, queries):
        """Build a copy of this argument (same id and name) that holds `queries`.

        The receiver is not modified, and `queries` is stored as given
        (no copy is made).
        """
        clone = Argument(self.id, self.name)
        clone.queries = queries
        return clone
from anytree import Node
import pickle
from os.path import isfile
from agent.target_extraction.argument import Argument
class Product:
    """Feature tree of a product plus the lookup tables derived from it.

    Attributes:
        root: root node of the tree (the product itself).
        feature_nodes: every descendant of `root`.
        argument_nodes: `root` followed by `feature_nodes`; a node's position
            in this list is the id of its `Argument`.
        glossary: node -> synonym list, for nodes whose name is a key of the
            `syn_dict` passed to the constructor.
        arguments: node -> `Argument` built from the node's index and name.
    """

    FILE_DIR = 'agent/target_extraction/extracted_products/'
    FILE_EXTENSION = '.pickle'

    def __init__(self, root: Node, syn_dict):
        """Build the lookup tables for the tree rooted at `root`.

        Args:
            root: root node; must expose `descendants`, and every node
                must expose `name`.
            syn_dict: mapping of node name -> list of synonyms.
        """
        self.root = root
        self.feature_nodes = list(root.descendants)
        self.argument_nodes = [root] + self.feature_nodes
        # Nodes whose name has no entry in syn_dict get no glossary entry.
        self.glossary = {
            a_node: syns
            for a, syns in syn_dict.items()
            for a_node in self.argument_nodes
            if a_node.name == a
        }
        # Argument ids are the positions in argument_nodes (root is id 0).
        self.arguments = {
            a_node: Argument(a_idx, a_node.name)
            for a_idx, a_node in enumerate(self.argument_nodes)
        }

    def argument_node_for_id(self, id):
        """Return the tree node whose argument id is `id` (an index into `argument_nodes`)."""
        return self.argument_nodes[id]

    def argument_for_id(self, id):
        """Return the `Argument` whose id is `id`."""
        return self.argument_for_node(self.argument_node_for_id(id))

    def argument_for_node(self, n):
        """Return the `Argument` built for tree node `n`.

        Raises:
            KeyError: if `n` is not one of this product's nodes.
        """
        return self.arguments[n]

    @staticmethod
    def get_product(name):
        """Load the pickled representation stored for product type `name`.

        The file is looked up at FILE_DIR + name + FILE_EXTENSION.

        Raises:
            FileNotFoundError: if no such file exists. This is a subclass of
                Exception, so callers catching Exception keep working.
        """
        path = Product.FILE_DIR + name + Product.FILE_EXTENSION
        if not isfile(path):
            raise FileNotFoundError('No representation found for product {} at {}'.format(name, path))
        # SECURITY: pickle.load can execute arbitrary code; only files produced
        # by this project's own extraction step must ever be placed in FILE_DIR.
        # The context manager closes the file even if unpickling fails.
        with open(path, 'rb') as f:
            product: Product = pickle.load(f)
        return product