Commit 8416bbe7 authored by Joel Oksanen

Implementation complete

parent e38035ff
@@ -8,6 +8,8 @@ import time
import numpy as np
from sklearn import metrics
device = torch.device('cuda')
semeval_2014_test_path = 'data/SemEval-2014/Laptops_Test_Gold.xml'
amazon_test_path = 'data/Amazon/annotated_amazon_laptop_reviews.xml'
@@ -43,13 +45,20 @@ class BertAnalyzer:
collate_fn=generate_batch)
self.net = TDBertNet(len(polarity_indices))
# move the model to the GPU
self.net.cuda()
optimiser = optim.Adam(self.net.parameters(), lr=LEARNING_RATE)
start = time.time()
for epoch in range(MAX_EPOCHS):
batch_loss = 0.0
for i, (texts, target_indices, labels) in enumerate(train_loader):
for i, batch in enumerate(train_loader):
# send batch to gpu
texts, target_indices, labels = tuple(i.to(device) for i in batch)
# zero param gradients
optimiser.zero_grad()
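The training loop now unpacks each batch generically and moves every tensor to the GPU in a single expression. A minimal sketch of the pattern, assuming each batch is a tuple of tensors as produced by generate_batch:

import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def to_device(batch, device):
    # move every tensor in an (input_ids, attn_mask, target_indices, labels)
    # tuple onto the target device in one pass
    return tuple(t.to(device) for t in batch)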
@@ -103,10 +112,14 @@ class BertAnalyzer:
dataset = BertDataset.from_data(data)
loader = DataLoader(dataset, batch_size=128, shuffle=False, num_workers=8, collate_fn=generate_batch)
self.net.cuda()
self.net.eval()
predicted = []
with torch.no_grad():
for texts, target_indices, _ in loader:
outputs, attentions = self.net(texts, target_indices)
for input_ids, attn_mask, target_indices, _ in loader:
input_ids, attn_mask, target_indices = tuple(i.to(device) for i in [input_ids, attn_mask, target_indices])
outputs = self.net(input_ids, attn_mask, target_indices)
batch_val, batch_pred = torch.max(outputs.data, 1)
predicted += [BertAnalyzer.get_polarity(val, pred) for val, pred in zip(batch_val, batch_pred)]
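At evaluation time the network output is a (batch, num_classes) tensor of class probabilities; torch.max over dimension 1 returns both the winning probability and its index, which get_polarity then maps to a label. A condensed sketch of the inference loop under those assumptions:

import torch

def predict(net, loader, device):
    net.eval()                    # disable dropout etc.
    predictions = []
    with torch.no_grad():         # no gradients needed for inference
        for input_ids, attn_mask, target_indices, _ in loader:
            batch = tuple(t.to(device) for t in (input_ids, attn_mask, target_indices))
            outputs = net(*batch)
            values, classes = torch.max(outputs, dim=1)
            predictions += classes.tolist()
    return predictions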
@@ -12,9 +12,11 @@ MASK_TOKEN = '[MASK]'
def generate_batch(batch):
texts = tokenizer.batch_encode_plus([entry['tokens'] for entry in batch], add_special_tokens=True,
encoded = tokenizer.batch_encode_plus([entry['tokens'] for entry in batch], add_special_tokens=True,
max_length=MAX_SEQ_LEN, pad_to_max_length=True, is_pretokenized=True,
return_tensors='pt')
input_ids = encoded['input_ids']
attn_mask = encoded['attention_mask']
max_tg_len = max(entry['to'] - entry['from'] for entry in batch)
target_indices = torch.tensor([[[min(t, entry['to'])] * HIDDEN_OUTPUT_FEATURES
@@ -23,7 +25,7 @@ def generate_batch(batch):
polarity_labels = torch.tensor([entry['polarity'] for entry in batch])
return texts, target_indices, polarity_labels
return input_ids, attn_mask, target_indices, polarity_labels
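batch_encode_plus returns a dictionary rather than a single tensor, so the batch function now pulls out input_ids and attention_mask explicitly; this lets each tensor be moved to the GPU on its own. A sketch of the call as it behaves in the transformers 2.x API used here (later releases rename pad_to_max_length/is_pretokenized to padding/is_split_into_words):

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
encoded = tokenizer.batch_encode_plus(
    [['the', 'screen', 'is', 'great']],      # one pre-tokenized entry
    add_special_tokens=True, max_length=128,
    pad_to_max_length=True, is_pretokenized=True, return_tensors='pt')
input_ids = encoded['input_ids']             # (batch, seq_len) token ids
attn_mask = encoded['attention_mask']        # 1 for real tokens, 0 for padding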
def token_for_char(char_idx, text, tokens):
@@ -11,18 +11,17 @@ class TDBertNet(nn.Module):
def __init__(self, num_class):
super(TDBertNet, self).__init__()
config = BertConfig.from_pretrained(TRAINED_WEIGHTS, output_attentions=True)
config = BertConfig.from_pretrained(TRAINED_WEIGHTS)
self.bert_base = BertModel.from_pretrained(TRAINED_WEIGHTS, config=config)
self.bert_base.config.output_attentions = True
self.fc = nn.Linear(HIDDEN_OUTPUT_FEATURES, num_class) # n of hidden features, n of output labels
def forward(self, texts, target_indices):
def forward(self, input_ids, attn_mask, target_indices):
# BERT
bert_output, _, attentions = self.bert_base(**texts)
bert_output, _ = self.bert_base(input_ids=input_ids, attention_mask=attn_mask)
# max pooling at target locations
target_outputs = torch.gather(bert_output, dim=1, index=target_indices)
pooled_output = torch.max(target_outputs, dim=1)[0]
# fc layer with softmax activation
x = F.softmax(self.fc(pooled_output), 1)
return x, attentions[-1]
return x
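The forward pass gathers the BERT hidden states at the target-term positions and max-pools over them before the classification layer. A self-contained sketch of the gather-then-pool step; the shapes follow generate_batch above, where each target token index is repeated across the hidden dimension:

import torch

batch, seq_len, hidden = 2, 6, 4
bert_output = torch.randn(batch, seq_len, hidden)               # (B, L, H)
target_indices = torch.tensor([[[1] * hidden, [2] * hidden],
                               [[4] * hidden, [4] * hidden]])   # (B, T, H)
target_outputs = torch.gather(bert_output, dim=1, index=target_indices)
pooled = torch.max(target_outputs, dim=1)[0]                    # (B, H)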
class Argument:
def __init__(self, text, polarity, supporters, attackers, phrase, size):
self.text = text
self.polarity = 'POS' if polarity else 'NEG'
self.supporters = supporters
self.attackers = attackers
self.phrase = phrase.text if phrase else '-'
self.size = size
@@ -21,7 +21,7 @@ class Communicator:
def get_init_message(self):
prod_node = self.product.root
prod = self.product.argument_for_node(prod_node)
text = ADAText('What would you like to know about the *?', [prod.name])
text = ADAText('What would you like to know about this *?', [prod.name])
queries = self.get_queries(prod_node)
args = [prod.with_queries(queries)]
return ADAMessage(text, args)
@@ -75,9 +75,20 @@ class Communicator:
if query_id == 4 or query_id == 5:
phrase = (self.framework.best_supporting_phrase(q_arg_node) if query_id == 4
else self.framework.best_attacking_phrase(q_arg_node))
while phrase[-1] == '.':
phrase = phrase[:-1]
text = ADAText('\"...*...\"', [phrase], style='QUOT')
template = ''
args = []
i = 0
for form, start, end in phrase.get_arg_mentions(q_arg_node):
template += phrase.text[i:start] + '*'
i = end
args.append(form)
template += phrase.text[i:len(phrase.text)]
while template[-1] == '.':
template = template[:-1]
text = ADAText('\"...{}...\"'.format(template), args, style='QUOT')
args = [q_arg_node]
args = [self.product.argument_for_node(arg).with_queries(self.get_queries(arg)) for arg in args]
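The new response builder replaces the flat '"...*..."' quote with a template in which every mention of the queried argument becomes a '*' placeholder, so the front end can render mentions as tappable arguments. A worked sketch on assumed inputs:

text = 'the battery life is great'
mentions = [('battery life', 4, 16)]    # (surface form, char start, char end)

template, args, i = '', [], 0
for form, start, end in mentions:
    template += text[i:start] + '*'
    args.append(form)
    i = end
template += text[i:]
# template == 'the * is great', args == ['battery life']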
@@ -106,3 +117,6 @@ class Communicator:
def was_were(self, arg_n):
return 'was' if self.product.singularities[arg_n] else 'were'
def get_argument_graph(self):
return self.framework.get_argument_graph()
@@ -2,7 +2,7 @@ import pandas as pd
class DataLoader:
data_location = 'agent/amazon_data/top_5_mixer_reviews_subset.tsv'
data_location = 'agent/amazon_data/reviews.tsv'
reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False)
@staticmethod
from anytree import PostOrderIter
from anytree import PostOrderIter, PreOrderIter
from functools import reduce
from agent.SA.bert_analyzer import BertAnalyzer
from agent.target_extraction.product import Product
from agent.review import Review
from agent.dataloader import DataLoader
from agent.argument import Argument
import pickle
import re
from time import time
class Framework:
@@ -19,9 +21,14 @@ class Framework:
self.arguments = self.product.argument_nodes
self.features = self.product.feature_nodes
ts = time()
# get reviews
review_csv = DataLoader.get_reviews(product_id)
reviews = [Review(row, self.product) for _, row in review_csv.iterrows()]
reviews = [Review(row, self.product) for _, row in review_csv.head(1000).iterrows()]
t_feature = time()
print('Feature detection took {} seconds'.format(t_feature - ts))
# extract augmented votes
self.extract_votes(reviews)
@@ -29,15 +36,25 @@ class Framework:
if len(voting_reviews) / len(reviews) < 0.33:
print('warning: only a small fraction of reviews generated votes')
t_sa = time()
print('Sentiment analysis took {} seconds'.format(t_sa - t_feature))
# get aggregates
ra, self.vote_sum, self.vote_phrases = self.get_aggregates(reviews)
t_ra = time()
print('Review aggregation took {} seconds'.format(t_ra - t_sa))
# get qbaf from ra
self.qbaf, self.argument_polarities = self.get_qbaf(ra, len(reviews))
# apply gradual semantics
self.strengths = self.get_strengths(self.qbaf)
te = time()
print('QBAF construction took {} seconds'.format(te - t_ra))
print('Process took {} seconds'.format(te - ts))
# save
self.save()
@@ -78,7 +95,7 @@ class Framework:
for review in reviews:
for phrase in review.phrases:
for arg, sentiment in phrase.get_votes().items():
vote_phrases[arg].append({'phrase': phrase.text, 'sentiment': sentiment, 'n_args': len(phrase.args)})
vote_phrases[arg].append(phrase) # {'phrase': phrase.text, 'sentiment': sentiment, 'n_args': len(phrase.args)}
for arg, sentiment in review.get_votes().items():
ra.append({'review_id': review.id, 'argument': arg, 'vote': sentiment})
vote_sum[arg] += sentiment
@@ -187,31 +204,36 @@ class Framework:
return att is not None and self.strengths[att] > 0
def best_supporting_phrase(self, argument):
phrases = {vp['phrase']: vp['sentiment'] for vp in self.supporting_phrases(argument)
if vp['n_args'] == 1 and Framework.is_well_formatted(vp['phrase'])}
phrases = [phrase for phrase in self.supporting_phrases(argument)
if phrase.n_args() == 1 and Framework.is_well_formatted(phrase.text)]
if len(phrases) == 0:
return None
top_5 = list(sorted(phrases.keys(), key=lambda k: phrases[k], reverse=True))[:5]
print(top_5)
return max(top_5, key=lambda p: len(p))
top_5 = list(sorted(phrases, key=lambda p: p.get_vote(argument), reverse=True))[:5]
return max(top_5, key=lambda p: len(p.text))
def best_attacking_phrase(self, argument):
phrases = {vp['phrase']: vp['sentiment'] for vp in self.attacking_phrases(argument)
if vp['n_args'] == 1 and Framework.is_well_formatted(vp['phrase'])}
phrases = [phrase for phrase in self.attacking_phrases(argument)
if phrase.n_args() == 1 and Framework.is_well_formatted(phrase.text)]
if len(phrases) == 0:
return None
top_5 = list(sorted(phrases.keys(), key=lambda k: phrases[k]))[:5]
print(top_5)
return max(top_5, key=lambda p: len(p))
top_5 = list(sorted(phrases, key=lambda p: p.get_vote(argument)))[:5]
return max(top_5, key=lambda p: len(p.text))
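Both selectors now work on Phrase objects directly: rank the well-formatted single-argument candidates by vote strength, keep the five strongest, and return the longest of those, trading a little polarity for informativeness. Condensed into one hypothetical helper:

def best_phrase(phrases, argument, supporting=True):
    # strongest five by vote, then the most detailed (longest) of them
    top_5 = sorted(phrases, key=lambda p: p.get_vote(argument), reverse=supporting)[:5]
    return max(top_5, key=lambda p: len(p.text)) if top_5 else None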
@staticmethod
def is_well_formatted(phrase):
if not re.match('^[-a-zA-Z0-9();,./!?\'" ]*$', phrase):
print(phrase)
return re.match('^[-a-zA-Z0-9();,./!?\'" ]*$', phrase)
def supporting_phrases(self, argument):
return list(filter(lambda vp: vp['sentiment'] > 0, self.vote_phrases[argument]))
return list(filter(lambda phrase: phrase.get_vote(argument) > 0, self.vote_phrases[argument]))
def attacking_phrases(self, argument):
return list(filter(lambda vp: vp['sentiment'] < 0, self.vote_phrases[argument]))
return list(filter(lambda phrase: phrase.get_vote(argument) < 0, self.vote_phrases[argument]))
def get_argument_graph(self):
return self.create_arg(self.product_node, 120)
def create_arg(self, arg_node, size):
supporters = [self.create_arg(supp_node, size - 20) for supp_node in self.qbaf['supporters'][arg_node]]
attackers = [self.create_arg(att_node, size - 20) for att_node in self.qbaf['attackers'][arg_node]]
phrase = self.best_supporting_phrase(arg_node) if self.argument_polarities[arg_node] else self.best_attacking_phrase(arg_node)
return Argument(arg_node.name, self.argument_polarities[arg_node], supporters, attackers, phrase, size)
import pandas as pd
pd.set_option('display.max_colwidth', None)
all_reviews_file = 'amazon_data/reviews.tsv'
def get_reviews(category, meta_file, review_file):
@@ -21,7 +22,7 @@ def save_reviews(category, meta_file, review_file, output_file):
reviews.to_csv(output_file, sep='\t', index=False)
def save_top_reviewed_products(n, output_file, product_title, category=None, review_file=None, meta_file=None,
def save_top_reviewed_products(n, category=None, review_file=None, meta_file=None,
product_file=None):
if product_file:
reviews = pd.read_csv(product_file, sep='\t')
@@ -33,9 +34,35 @@ def save_top_reviewed_products(n, output_file, product_title, category=None, rev
'reviewText': 'review_body'})
reviews = reviews[reviews['review_body'].apply(lambda b: not pd.isna(b) and len(b) > 0)]
reviews = reviews[reviews['star_rating'].apply(lambda r: type(r) is int or r.isdigit())]
reviews['product_title'] = product_title
reviews.to_csv(output_file, sep='\t', index=False)
titles = {asin: input('Product title for {}: '.format(asin)) for asin in reviews['product_id'].unique()}
reviews['product_title'] = reviews['product_id'].apply(lambda asin: titles[asin])
all_reviews = pd.read_csv(all_reviews_file, sep='\t')
all_reviews = pd.concat([all_reviews, reviews])
all_reviews.to_csv(all_reviews_file, sep='\t', index=False)
def get_top_products_by_brand(n, brand, meta_file, review_file):
metadata_iter = pd.read_json(meta_file, lines=True, chunksize=1000)
metadata = pd.concat([metadata[metadata['brand'].apply(lambda b: type(b) is str and b == brand)]
for metadata in metadata_iter])
print(len(metadata.index))
print(metadata.head())
print(metadata.columns)
save_top_reviewed_products(5, 'amazon_data/top_5_mixer_reviews.tsv', 'KitchenAid KSM150PSGR Artisan Series 5-Qt. Stand Mixer with Pouring Shield - Imperial Grey',
product_file='target_extraction/data/verified_stand_mixer_reviews.tsv')
review_iter = pd.read_json(review_file, lines=True, chunksize=1000)
reviews = pd.concat([reviews[reviews['asin'].isin(metadata['asin'])] for reviews in review_iter])
print(len(reviews.index))
top_reviewed = reviews.groupby(['asin'], sort=False).size().sort_values(ascending=False).head(n)
return top_reviewed
def get_product_reviews_for_asin(asin, review_file, output_file):
review_iter = pd.read_json(review_file, lines=True, chunksize=1000)
reviews = pd.concat([reviews[reviews['asin'].apply(lambda p_asin: p_asin == asin)] for reviews in review_iter])
print(len(reviews.index))
reviews.to_csv(output_file, sep='\t', index=False)
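Both helpers stream the large Amazon review dumps through pd.read_json(..., lines=True, chunksize=1000), filtering each chunk before concatenating, so the full file never has to fit in memory. The pattern in isolation (file path hypothetical):

import pandas as pd

def reviews_for_asin(review_file, asin):
    chunks = pd.read_json(review_file, lines=True, chunksize=1000)
    # keep only the matching rows of each 1000-line chunk
    return pd.concat(chunk[chunk['asin'] == asin] for chunk in chunks)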
import re
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tokenize import TreebankWordTokenizer, sent_tokenize
from agent.SA.bert_dataset import MAX_SEQ_LEN
from anytree import PostOrderIter
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
tokenizer = TreebankWordTokenizer()
class Review:
@@ -61,7 +62,8 @@ class Phrase:
def __init__(self, text, product):
self.product = product
self.text = text
self.tokens = [word.lower() for word in word_tokenize(text)]
self.spans = list(tokenizer.span_tokenize(text))
self.tokens = [text[start:end] for start, end in self.spans]
self.args = self.get_args()
self.votes = {}
@@ -72,7 +74,7 @@ class Phrase:
while len(arguments) > 0:
arg = arguments.pop(0)
for term in self.product.glossary[arg]:
matches = [Arg(arg, start, end)
matches = [Arg(arg, ' '.join(term), start, end)
for start, end in Phrase.matching_subsequences(term, self.tokens)]
if matches:
argument_matches += matches
@@ -89,8 +91,8 @@ class Phrase:
pass
self.remove_ancestors(node.parent, l)
def add_arg(self, arg):
self.args.append(arg)
# def add_arg(self, arg):
# self.args.append(arg)
def num_args(self):
return len(self.args)
@@ -102,6 +104,20 @@ class Phrase:
self.votes[arg.node] = arg.sentiment
return self.votes
def get_vote(self, node):
return self.votes[node]
def get_arg_mentions(self, node):
mentions = []
for arg in self.args:
if arg.node == node:
start, end = self.spans[arg.start][0], self.spans[arg.end - 1][1]
mentions.append((arg.form, start, end))
return mentions
def n_args(self):
return len(self.args)
@staticmethod
def matching_subsequences(l_sub, l):
sub_idxs = []
@@ -114,8 +130,9 @@ class Phrase:
class Arg:
def __init__(self, node, start, end):
def __init__(self, node, form, start, end):
self.node = node
self.form = form
self.start = start
self.end = end
self.sentiment = None
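The switch from word_tokenize to TreebankWordTokenizer.span_tokenize keeps a character span for every token, which get_arg_mentions needs in order to map token-level argument matches back to character offsets in the original text. A quick sketch:

from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()
text = 'The battery life is great.'
spans = list(tokenizer.span_tokenize(text))    # [(0, 3), (4, 11), ...]
tokens = [text[start:end] for start, end in spans]
# token i maps back to text[spans[i][0]:spans[i][1]]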
@@ -14,6 +14,14 @@ def get_votes(df):
return df.apply(lambda row: row['true'] > row['false'], axis=1)
def get_votes_for_product(product):
for i in range(n_raters):
votes = get_votes(get_df(product, i))
print(methods[i])
print(votes)
print('')
def get_accuracy(df):
votes = get_votes(df)
return sum(1 if vote else 0 for vote in votes) / len(votes) if len(votes) > 0 else None
@@ -45,10 +53,11 @@ def get_kappa():
return (p - pe) / (1 - pe)
# for i in range(3):
# print(i)
# for prod in products:
# print(' ', prod, len(get_df(prod, i)))
# print(' ', sum(len(get_df(prod, i)) for prod in products))
def get_agreement():
df = pd.concat(get_df(prod, i) for prod in products for i in range(3)).reset_index(drop=True)
agreed = df[df.apply(lambda row: row['true'] == n_raters or row['false'] == n_raters, axis=1)]
return len(agreed) / len(df)
print_accuracies()
\ No newline at end of file
for p in products:
print(len(get_df(p, 0)))
\ No newline at end of file
@@ -6,7 +6,7 @@ from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
from gensim.models.phrases import Phrases, Phraser
from anytree import Node, RenderTree
from anytree import Node, RenderTree, PreOrderIter
import numpy as np
import re
from gensim.models import Word2Vec
@@ -18,6 +18,7 @@ from agent.target_extraction.BERT.relation_extractor.pair_rel_dataset import Pai
from agent.target_extraction.BERT.relation_extractor.bert_rel_extractor import BertRelExtractor
from pathos.multiprocessing import ProcessingPool as Pool
import itertools
from time import time
np.set_printoptions(precision=4, threshold=np.inf, suppress=True)
stop_words = stopwords.words('english')
@@ -160,9 +161,11 @@ class TargetExtractor:
self.product = product
self.file_path = file_path
ts = time()
print('tokenizing phrases...')
# tokenize and normalize phrases
texts = TargetExtractor.obtain_texts(file_path, text_column, n=200000)
texts = TargetExtractor.obtain_texts(file_path, text_column, n=500000)
self.sentences = list(itertools.chain.from_iterable(pool.map(sent_tokenize, texts)))
self.sentences = pool.map(lambda s: s.replace('_', ' ').lower(), self.sentences)
self.phrases = pool.map(word_tokenize, self.sentences)
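Sentence splitting, normalisation, and word tokenisation are farmed out to a pathos ProcessingPool, which (unlike the standard multiprocessing pool) can map lambdas. The pipeline in isolation, on hypothetical inputs:

import itertools
from nltk.tokenize import sent_tokenize, word_tokenize
from pathos.multiprocessing import ProcessingPool as Pool

pool = Pool()
texts = ['First sentence. Second one.', 'Another review_text here.']
sentences = list(itertools.chain.from_iterable(pool.map(sent_tokenize, texts)))
sentences = pool.map(lambda s: s.replace('_', ' ').lower(), sentences)
phrases = pool.map(word_tokenize, sentences)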
@@ -179,19 +182,21 @@ class TargetExtractor:
self.counter = self.count_nouns()
self.total_count = sum(self.counter.values())
self.save()
t_noun = time()
print('Noun extraction took {} seconds'.format(t_noun - ts))
print('mining aspects...')
# mine aspects
self.aspects, self.counts = self.get_aspects(self.counter)
t_feature = time()
print('Feature extraction took {} seconds'.format(t_feature - t_noun))
print('training word2vec model...')
# train word2vec model
self.wv = self.get_word2vec_model(size=TargetExtractor.WV_SIZE, window=TargetExtractor.WV_WINDOW,
min_count=TargetExtractor.MIN_TERM_COUNT)
self.save()
print('mining aspects...')
# mine aspects
self.aspects, self.counts = self.get_aspects(self.counter)
print('extracting synonyms...')
# obtain synonyms
self.syn_dict = self.get_syn_dict()
@@ -201,18 +206,19 @@ class TargetExtractor:
self.counts = {aspect: sum(self.counts[syn] for syn in self.syn_dict[aspect]) for aspect in self.aspects}
self.aspects = sorted(self.aspects, key=self.counts.get, reverse=True)
print(self.syn_dict)
self.save()
t_syn = time()
print('Synonym extraction took {} seconds'.format(t_syn - t_feature))
print('extracting relatedness matrix...')
self.relatedness_matrix = self.get_bert_relations()
self.save()
print('extracting aspect tree...')
self.tree = self.get_product_tree3()
te = time()
print('Ontology extraction took {} seconds'.format(te - t_syn))
print('Full process took {} seconds'.format(te - ts))
print('saving...')
self.save()
@@ -508,8 +514,3 @@ class Synset:
if w in group:
return group
return None
# for p in ['mixer', 'game', 'necklace', 'watch', 'tv']:
# extr = TargetExtractor.load_saved(p)
# extr.save_product_representation()
@@ -4,6 +4,8 @@ from . import views
urlpatterns = [
path('', views.index, name='index'),
path('products/', views.products, name='products'),
path('product/', views.product, name='product'),
path('message/', views.message, name='message')
path('message/', views.message, name='message'),
path('arguments/', views.arguments, name='arguments')
]
@@ -4,46 +4,79 @@ import jsonpickle
from django.views.decorators.csrf import csrf_exempt
from agent.dataloader import DataLoader
from agent.communicator import Communicator
from time import time
communicators = []  # TODO: change into a dict keyed by session cookie to support several concurrent connections
product_ids = ['B0000TIKK8', 'B0000TIIPK', 'B000AYW0M2', 'B000AYW0KO', 'B004J30ERI', 'B004VR9HP2',
'B00005UP2N', 'B0001HLTTI', 'B00063ULMI', 'B00791QYMQ']
class Empty:
pass
def index(request):
return HttpResponse("OK")
def products(request):
product_infos = []
for product_id in product_ids:
product_title = DataLoader.get_product_name(product_id)
star_rating = DataLoader.get_avg_star_rating(product_id)
image_url = 'https://ws-na.amazon-adsystem.com/widgets/q?_encoding=UTF8&MarketPlace=US&ASIN=' + product_id + '&ServiceVersion=20070822&ID=AsinImage&WS=1&Format=SL250'
product_info = Empty()
product_info.id = product_id
product_info.name = product_title
product_info.starRating = star_rating
product_info.imageURL = image_url
product_infos.append(product_info)
return HttpResponse(jsonpickle.encode(product_infos, unpicklable=False), content_type="application/json")
def product(request):
ts = time()
product_id = request.GET.get('id', '')
if not communicators:
communicators.append(Communicator(product_id))
if communicators:
communicators.pop()
communicators.append(Communicator(product_id))
communicator = communicators[0]
init_message = communicator.get_init_message()
print('Request took {} seconds'.format(time() - ts))
return HttpResponse(jsonpickle.encode(init_message, unpicklable=False), content_type="application/json")
@csrf_exempt
def message(request):
parsed = json.loads(request.body)
query_id = parsed['queryID']
arg_id = parsed['argumentID']
response = communicators[0].get_response(query_id, arg_id)
return HttpResponse(jsonpickle.encode(response, unpicklable=False), content_type="application/json")
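All endpoints serialise plain Python objects with jsonpickle.encode(..., unpicklable=False), which drops jsonpickle's py/object type tags and emits ordinary JSON the front end can consume directly. A minimal sketch:

import jsonpickle

class Empty:
    pass

info = Empty()
info.id = 'B0000TIKK8'
info.starRating = 4.5
jsonpickle.encode(info, unpicklable=False)   # '{"id": "B0000TIKK8", "starRating": 4.5}'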
def arguments(request):
product_id = request.GET.get('id', '')
comm = Communicator(product_id)
root = comm.get_argument_graph()
product_title = DataLoader.get_product_name(product_id)
star_rating = DataLoader.get_avg_star_rating(product_id)
image_url = 'https://ws-na.amazon-adsystem.com/widgets/q?_encoding=UTF8&MarketPlace=US&ASIN=' + product_id + '&ServiceVersion=20070822&ID=AsinImage&WS=1&Format=SL250'
class Empty:
pass
product_info = Empty()
product_info.id = product_id
product_info.name = product_title
product_info.starRating = star_rating
product_info.imageURL = image_url
init_response = Empty()