Commit c516942e authored by Joel Oksanen

Cleaned up code

parent 21fae9ef
@@ -4,7 +4,6 @@ __pycache__/
server/agent/amazon_data/
server/agent/SA/data/
server/agent/target_extraction/data/
server/agent/target_extraction/stanford-corenlp-full-2018-10-05
server/agent/target_extraction/BERT/data/
.DS_Store
*.pickle
class FeatureCounter:
def __init__(self):
self.dep_features = {} # init as empty
self.n_dep_features = 0
def indexof(self, feature, learning):
if feature in self.dep_features:
return self.dep_features[feature]
elif learning:
self.dep_features[feature] = self.n_dep_features
self.n_dep_features += 1
return self.dep_features[feature]
else:
return None
def count(self):
return self.n_dep_features
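# Illustrative usage sketch (not part of the original module): while learning,
# unseen features receive increasing indices; at prediction time an unseen
# feature maps to None.
# >>> fc = FeatureCounter()
# >>> fc.indexof('nsubj', learning=True)
# 0
# >>> fc.indexof('dobj', learning=True)
# 1
# >>> fc.indexof('nsubj', learning=False)
# 0
# >>> fc.indexof('amod', learning=False) is None
# True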
from nltk.tree import ParentedTree as Tree
class Instance:
def __init__(self, xml):
self.text = xml.find('text').text
self.opinion = xml.find('opinion').text
self.tree = Tree.fromstring(xml.find('tree').text)
import xml.etree.ElementTree as ET
from nltk.tree import ParentedTree as Tree
from sklearn import svm
from vectorizer import Vectorizer
from gensim.test.utils import get_tmpfile
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.metrics import confusion_matrix
from instance import Instance
from sklearn.feature_extraction.text import CountVectorizer
import os
import math
def resample_data(instances, labels):
label_instances = {label: [instance for instance in instances if instance.opinion == label] for label in labels}
max_n_instances = max([len(v) for v in label_instances.values()])
resampled_data = []
for label in labels:
m = math.ceil(max_n_instances / len(label_instances[label]))
label_instances[label] = (label_instances[label] * m)[:max_n_instances]
resampled_data += label_instances[label]
print(len(resampled_data))
return resampled_data
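# Worked example of the oversampling above (assumed label counts, for
# illustration only): with 100 'positive', 40 'neutral' and 25 'negative'
# instances, max_n_instances = 100, so 'neutral' is repeated ceil(100/40) = 3
# times and truncated to 100 and 'negative' is repeated ceil(100/25) = 4 times,
# leaving every label with exactly 100 instances in resampled_data.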
class SentimentAnalyzer:
expr_clf = svm.SVC() # determines if sentence expresses sentiment towards ARG
def train_expr_clf(self, instances):
fvs = [instance.vector for instance in instances]
targets = [instance.opinion for instance in instances]
self.expr_clf.fit(fvs, targets)
def get_feature_vector(self, instance):
return FeatureVector(instance, None)
# in: sentence parse tree with labelled argument ARG
# out: true if sentence expresses sentiment towards ARG else false
def expresses_sentiment(self, instances):
return self.expr_clf.predict([instance.vector for instance in instances])
semeval_2014_train_path = 'server/agent/SA/data/SemEval-2014/SemEval_2014_Laptop_Train_with_labelled_parse_trees.xml'
semeval_2014_test_path = 'server/agent/SA/data/SemEval-2014/SemEval_2014_Laptop_Test_with_labelled_parse_trees.xml'
amazon_train_path = 'data/Amazon/amazon_camera_train.xml'
amazon_test_path = 'data/Amazon/amazon_camera_test2.xml' # 'data/Amazon/prepared_amazon_camera_reviews.xml'
semeval_train_path = 'server/agent/SA/data/SemEval-2016/ABSA16_Laptops_Train_SB1_v2_with_labelled_parse_trees.xml'
semeval_test_path = 'server/agent/SA/data/SemEval-2016/ABSA16_Laptops_Test_with_labelled_parse_trees.xml'
tweet_train_path = 'server/agent/SA/data/acl-14-short-data/tweet_train_with_labelled_parse_trees.xml'
tweet_test_path = 'server/agent/SA/data/acl-14-short-data/tweet_test_with_labelled_parse_trees.xml'
train_path = semeval_2014_train_path
test_path = semeval_2014_test_path
labels = ['positive', 'neutral', 'negative', 'conflict']
sa = SentimentAnalyzer()
train_tree = ET.parse(train_path)
train_instances = [Instance(instance) for instance in train_tree.getroot()]
# train_instances = resample_data(train_instances, labels)
# create and train vectorizer model
vec = Vectorizer(train_instances)
# train classifier for sentiment expression
sa.train_expr_clf(train_instances)
test_tree = ET.parse(test_path)
test_instances = [Instance(instance) for instance in test_tree.getroot()]
# obtain feature vectors and targets for test set
vec.vectorize(test_instances)
# predict test set values
pred = sa.expresses_sentiment(test_instances)
targets = [instance.opinion for instance in test_instances]
# evaluate results
cm = confusion_matrix(targets, pred, labels=labels)
acc = len([i for i in range(len(targets)) if targets[i] == pred[i]]) / len(targets)
print(cm)
print('accuracy:', acc)
@@ -13,7 +13,7 @@ from sty import fg, bg, ef, rs
from wcwidth import wcswidth
data_location = 'data/reviews/5_products_reviews.tsv'
-selected_reviews_location = 'product_reviews_to_be_annotated.xml'
+selected_reviews_location = 'reviews_to_be_annotated.xml'
min_characters = 0
max_characters = 200
n = 500
@@ -23,7 +23,7 @@ ann_fgs = {'positive': fg.green, 'neutral': fg.blue, 'negative': fg.red, 'confli
annotated_reviews_location = 'annotated_camera_reviews.xml'
included_labels = ['NN', 'NNS', 'NP', 'NNP', 'NNPS', 'DT', 'CD', 'FW', 'PRP$']
nouns = ['NN', 'NNS', 'NP', 'NNP', 'NNPS']
-prepared_reviews_location = 'annotated_5_products_reviews_2.xml'
+prepared_reviews_location = 'annotated_reviews.xml'
tokenizer = TweetTokenizer()
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
from nltk.tokenize import sent_tokenize
import pandas as pd
import re
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from anytree import PostOrderIter
from functools import reduce
from matplotlib import pyplot
from scipy.stats import pearsonr
from sklearn.metrics import mean_absolute_error
import pickle
from server.agent.review_tokenizer import tokenize_review, reduce_noise
from server.agent.argument import *
reviewables = [camera, image, video, battery, flash, audio, price, shipping, lens, zoom, af]
features = [image, video, battery, flash, audio, price, shipping, lens, zoom, af]
glossary = {
camera: ['camera', 'device', 'product'],
image: ['image', 'picture', ' pic '],
video: ['video'],
battery: ['battery'],
flash: ['flash'],
audio: ['audio', 'sound'],
price: ['price', 'value', 'cost'],
shipping: ['ship']
}
sentiment_threshold = 0.95
f = open('camera_review_classifier.pickle', 'rb')
classifier = pickle.load(f)
f.close()
# extract phrases
def extract_phrases(review_body):
sentences = sent_tokenize(review_body)
phrases = []
for sentence in sentences:
phrases += re.split(' but | although | though | otherwise | however | unless | whereas | despite |<br />', sentence)
return phrases
# analyze sentiment
analyzer = SentimentIntensityAnalyzer()
def get_sentiment(phrase):
# get vader score
# vader_s = analyzer.polarity_scores(phrase)
# compound_s = vader_s['compound']
# get classification
tokens = reduce_noise(tokenize_review(phrase))
prob_classification = classifier.prob_classify(dict([token, True] for token in tokens))
# return compound_s if the two agree
# if (classification == '+' and compound_s > 0) or (classification == '-' and compound_s < 0):
# return compound_s
# else:
# return 0
classification = prob_classification.max()
strength = (prob_classification.prob(classification) - 0.5) * 2
return strength if classification == '+' else -strength
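# Worked example (assumed classifier output, for illustration): if the loaded
# classifier assigns P('+') = 0.9 to a phrase, the classification is '+' and
# get_sentiment returns (0.9 - 0.5) * 2 = +0.8; if it assigns P('-') = 0.95,
# the result is -(0.95 - 0.5) * 2 = -0.9.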
# remove all ancestors of node in list l
def remove_ancestors(node, l):
if node.parent is not None:
try:
l.remove(node.parent)
except ValueError:
pass
remove_ancestors(node.parent, l)
# get reviewable(s) that match phrase
def get_reviewables(phrase):
reviewable_matches = []
reviewables = [node for node in PostOrderIter(camera)]
while len(reviewables) > 0:
f = reviewables.pop(0)
for word in glossary.get(f, []):  # nodes without a glossary entry (e.g. lens, zoom, af) are skipped rather than raising a KeyError
if word in phrase:
reviewable_matches.append(f)
remove_ancestors(f, reviewables)
break
return reviewable_matches
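# Illustrative trace (assuming image is a descendant of camera in the argument
# tree): for the phrase "great picture quality from this camera", 'picture'
# matches image first because PostOrderIter visits children before the root;
# remove_ancestors then drops camera from the candidate list, so only image is
# returned even though 'camera' also occurs in the phrase.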
def extract_votes(phrases):
votes = {}
for phrase in phrases:
reviewables = get_reviewables(phrase)
sentiment = get_sentiment(phrase)
if abs(sentiment) > sentiment_threshold:
for reviewable in reviewables:
if (reviewable not in votes) or (abs(votes[reviewable]) < abs(sentiment)):
votes[reviewable] = sentiment  # if several phrases mention the same reviewable, keep the strongest sentiment
# normalize votes to 1 (+) or -1 (-)
for reviewable in votes:
votes[reviewable] = 1 if votes[reviewable] > 0 else -1
return votes
# augment votes (Definition 4.3) obtained for a single critic
def augment_votes(votes):
reviewables = [node for node in PostOrderIter(camera)]
for reviewable in reviewables:
if reviewable not in votes:
polar_sum = 0
for subfeat in reviewable.children:
if subfeat in votes:
polar_sum += votes[subfeat]
if polar_sum != 0:
votes[reviewable] = 1 if polar_sum > 0 else -1
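# Worked example of vote augmentation (Definition 4.3), assuming image, battery
# and zoom are direct children of camera in the argument tree: if a review
# votes image: +1, battery: -1 and zoom: +1 but never mentions the camera
# itself, the polar sum over camera's voted children is +1, so camera receives
# an augmented vote of +1.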
def get_qbaf(ra, review_count):
# sums of all positive and negative votes for reviewables
reviewable_sums = {}
for reviewable in reviewables:
reviewable_sums[reviewable] = 0
for r in ra:
if r['reviewable'] == reviewable:
reviewable_sums[reviewable] += r['vote']
# calculate attack/support relations for camera
supporters = {r: [] for r in reviewables}
attackers = {r: [] for r in reviewables}
for r in reviewables:
for subf in r.children:
if reviewable_sums[subf] > 0:
supporters[r].append(subf)
elif reviewable_sums[subf] < 0:
attackers[r].append(subf)
# calculate base scores for reviewables
base_scores = {}
base_scores[camera] = 0.5 + 0.5 * reviewable_sums[camera] / review_count
for feature in features:
base_scores[feature] = abs(reviewable_sums[feature]) / review_count
qbaf = {"supporters": supporters, "attackers": attackers, "base_scores": base_scores}
return qbaf
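# Worked example of the base scores above: with review_count = 10, a camera
# vote sum of +4 gives base_scores[camera] = 0.5 + 0.5 * 4 / 10 = 0.7, and a
# feature with vote sum -3 gets base_scores[feature] = |-3| / 10 = 0.3; the
# sign of a feature's sum is reflected only in whether it attacks or supports
# its parent in the QBAF.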
def combined_strength(args):
if len(args) != 0:
return 1 - reduce(lambda x, y: x * y, map(lambda v: 1 - v, args))
return 0
def argument_strength(base_score, attacker_strengths, supporter_strengths):
attack = combined_strength(attacker_strengths)
support = combined_strength(supporter_strengths)
if attack > support:
return base_score - (base_score * abs(attack - support))
elif attack < support:
return base_score + ((1 - base_score) * abs(attack - support))
return base_score
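# Worked example of the DF-QUAD combination above: combined_strength([0.5, 0.5])
# = 1 - (1 - 0.5) * (1 - 0.5) = 0.75, so for base_score = 0.7 with one attacker
# of strength 0.5 and two supporters of strength 0.5 each, attack = 0.5,
# support = 0.75 and argument_strength returns 0.7 + (1 - 0.7) * 0.25 = 0.775.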
# apply DF-QUAD gradual semantics to qbaf
def get_strengths(qbaf):
strengths = {}
reviewables = [node for node in PostOrderIter(camera)]
for reviewable in reviewables:
attacker_strengths = []
supporter_strengths = []
for child in reviewable.children:
if child in qbaf["attackers"][reviewable]:
attacker_strengths.append(strengths[child])
elif child in qbaf["supporters"][reviewable]:
supporter_strengths.append(strengths[child])
strengths[reviewable] = argument_strength(qbaf["base_scores"][reviewable], attacker_strengths, supporter_strengths)
return strengths
#############
all_reviews = pd.read_csv('target_extraction/data/camera_prepared_data.tsv', sep='\t', error_bad_lines=False)
camera_strengths = []
star_rating_averages = []
products_analyzed = 0
grouped = all_reviews.groupby('product_id')
for product_id, reviews in grouped:
# get ra
ra = []
voting_reviews = 0
review_count = 0
star_rating_sum = 0
for _, review in reviews.iterrows():
review_id = review['review_id']
review_count += 1
star_rating_sum += review['star_rating']
phrases = extract_phrases(review['review_body'])
votes = extract_votes(phrases)
augment_votes(votes)
voting_reviews += 1 if len(votes) > 0 else 0
# add final vote tuples to ra with simplified polarity in {+ (true), - (false)}
for reviewable in votes:
ra.append({'review_id': review_id, 'reviewable': reviewable, 'vote': votes[reviewable]})
# only consider items that obtained votes from at least 33% of reviewers
if voting_reviews / review_count < 0.33:
continue
# get qbaf from ra
qbaf = get_qbaf(ra, review_count)
# apply gradual semantics
strengths = get_strengths(qbaf)
# store results
camera_strengths.append(strengths[camera])
star_rating_averages.append(star_rating_sum / review_count)
products_analyzed += 1
print(products_analyzed)
# calculate Pearson's correlation
correlation, _ = pearsonr(camera_strengths, star_rating_averages)
print("pearson correlation: ", correlation)
# calculate MAE
scaled_star_rating_avgs = list(map(lambda x: (x - 1) / 4, star_rating_averages))
mae = mean_absolute_error(scaled_star_rating_avgs, camera_strengths)
print("mae: ", mae)
# plot result correlation
pyplot.scatter(camera_strengths, scaled_star_rating_avgs)
pyplot.show()
# vs = [{camera: 1, image: 1, zoom: -1},
# {camera: 1, image: 1, battery: -1},
# {image: 1, battery: 1, af: 1},
# {af: 1},
# {camera: -1, zoom: -1},
# {camera: -1, image: -1, af: 1},
# {battery: -1}]
#
# ra = []
# for v in vs:
# print(v)
# augment_votes(v)
# print(v)
# for reviewable in v:
# ra.append({'reviewable': reviewable, 'vote': v[reviewable]})
#
# qbaf = get_qbaf(ra, len(vs))
# strengths = get_strengths(qbaf)
# print(qbaf)
# print(strengths)
import pandas as pd
data_location = 'amazon_data/amazon_reviews_us_camera.tsv'
training_data_location = 'amazon_data/reviews_trained_on.tsv'
output_location = 'amazon_data/camera_prepared_data.tsv'
min_reviews = 50
min_characters = 50
n = 500
reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False)
# drop reviews used for training
training_reviews = pd.read_csv(training_data_location, sep='\t', error_bad_lines=False)
reviews = pd.concat([reviews, training_reviews])
reviews = reviews.drop_duplicates(keep=False)
# drop reviews with empty review body
reviews = reviews[~reviews['review_body'].isnull()]
# try to filter out reviews for camera accessories
filter_words = ['accessor', 'battery', 'charger', 'tripod', 'strap', 'case', 'bag',
'backpack', 'kit', 'printer', 'adapter', 'album', 'surveillance', 'security']
filter_pat = ''
for word in filter_words:
word_filter = '[' + word[0].upper() + word[0].lower() + ']' + word[1:]
filter_pat += word_filter + '|'
filter_pat = filter_pat[:-1]
reviews = reviews[~reviews['product_title'].str.contains(pat=filter_pat, regex=True)]
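# For illustration, the loop above builds a pattern of the form
# '[Aa]ccessor|[Bb]attery|[Cc]harger|...', so each filter word is matched in
# product titles regardless of the capitalisation of its first letter.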
# drop reviews with less than min_characters characters
reviews = reviews[reviews['review_body'].apply(lambda x: len(str(x)) >= min_characters)]
# drop reviews for products with less than min_reviews reviews
reviews = reviews.groupby('product_id').filter(lambda x: len(x.index) >= min_reviews)
# choose reviews for n first items
reviews = reviews[reviews['product_id'].isin(reviews['product_id'].unique()[:n])]
reviews.to_csv(output_location, sep='\t', index=False)
print("Successfully prepared reviews for", reviews.groupby('product_id').ngroups, "products", sep=" ")
import pandas as pd
pd.set_option('display.max_colwidth', None)
all_reviews_file = 'amazon_data/reviews.tsv'
def get_reviews(category, meta_file, review_file):
metadata_iter = pd.read_json(meta_file, lines=True, chunksize=1000)
metadata = pd.concat([metadata[metadata['category'].apply(lambda cl: type(cl) is list and category in cl)]
for metadata in metadata_iter])
print(len(metadata.index))
review_iter = pd.read_json(review_file, lines=True, chunksize=1000)
reviews = pd.concat([reviews[reviews['asin'].isin(metadata['asin'])] for reviews in review_iter])
print(len(reviews.index))
return reviews
def save_reviews(category, meta_file, review_file, output_file):
reviews = get_reviews(category, meta_file, review_file)
reviews.to_csv(output_file, sep='\t', index=False)
def save_top_reviewed_products(n, category=None, review_file=None, meta_file=None,
product_file=None):
if product_file:
reviews = pd.read_csv(product_file, sep='\t')
else:
reviews = get_reviews(category, meta_file, review_file)
top_reviewed = reviews.groupby(['asin'], sort=False).size().sort_values(ascending=False).head(n)
reviews = reviews[reviews['asin'].apply(lambda asin: asin in top_reviewed)]
reviews = reviews.rename(columns={'overall': 'star_rating', 'asin': 'product_id', 'reviewerID': 'review_id',
'reviewText': 'review_body'})
reviews = reviews[reviews['review_body'].apply(lambda b: not pd.isna(b) and len(b) > 0)]
reviews = reviews[reviews['star_rating'].apply(lambda r: type(r) is int or r.isdigit())]
titles = {asin: input('Product title for {}: '.format(asin)) for asin in reviews['product_id'].unique()}
reviews['product_title'] = reviews['product_id'].apply(lambda asin: titles[asin])
all_reviews = pd.read_csv(all_reviews_file, sep='\t')
all_reviews = pd.concat([all_reviews, reviews])
all_reviews.to_csv(all_reviews_file, sep='\t', index=False)
def get_top_products_by_brand(n, brand, meta_file, review_file):
metadata_iter = pd.read_json(meta_file, lines=True, chunksize=1000)
metadata = pd.concat([metadata[metadata['brand'].apply(lambda b: type(b) is str and b == brand)]
for metadata in metadata_iter])
review_iter = pd.read_json(review_file, lines=True, chunksize=1000)
reviews = pd.concat([reviews[reviews['asin'].isin(metadata['asin'])] for reviews in review_iter])
top_reviewed = reviews.groupby(['asin'], sort=False).size().sort_values(ascending=False).head(n)
return top_reviewed
def get_product_reviews_for_asin(asin, review_file, output_file):
review_iter = pd.read_json(review_file, lines=True, chunksize=1000)
reviews = pd.concat([reviews[reviews['asin'].apply(lambda p_asin: p_asin == asin)] for reviews in review_iter])
reviews.to_csv(output_file, sep='\t', index=False)
\ No newline at end of file
import pandas as pd
data_location = 'amazon_reviews_us_camera.tsv'
training_data_location = 'reviews_trained_on.tsv'
min_characters = 50
reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False)
# drop reviews used for training
training_reviews = pd.read_csv(training_data_location, sep='\t', error_bad_lines=False)
reviews = pd.concat([reviews, training_reviews])
reviews = reviews.drop_duplicates(keep=False)
# drop reviews with empty review body
reviews = reviews[~reviews['review_body'].isnull()]
# try to filter out reviews for camera accessories
filter_words = ['accessor', 'battery', 'charger', 'tripod', 'strap', 'case', 'bag',
'backpack', 'kit', 'printer', 'adapter', 'album', 'surveillance', 'security']
filter_pat = ''
for word in filter_words:
word_filter = '[' + word[0].upper() + word[0].lower() + ']' + word[1:]
filter_pat += word_filter + '|'
filter_pat = filter_pat[:-1]
reviews = reviews[~reviews['product_title'].str.contains(pat=filter_pat, regex=True)]
# drop reviews with less than min_characters characters
reviews = reviews[reviews['review_body'].apply(lambda x: len(str(x)) >= min_characters)]
# count reviews per product and print the 50 most-reviewed products
grouped = reviews.groupby(['product_id', 'product_title'], sort=False).size().sort_values(ascending=False)
print(grouped.head(50))
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
import string
class ReviewTokenizer:
def __init__(self, product):
self.flat_glossary = [val for sublist in list(product.glossary.values()) for val in sublist]
tokenizer = TweetTokenizer()
stop_words = stopwords.words('english')
def tokenize_review(self, review):
return self.reduce_noise(self.tokenizer.tokenize(review))
def reduce_noise(self, tokens):
lowercase_tokens = list(map(lambda s: s.lower(), tokens))
return list(filter(lambda s: len(s) > 0 and s not in string.punctuation and s not in self.stop_words and s not in self.flat_glossary, lowercase_tokens))
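# Illustrative usage sketch (assumes a product object whose glossary maps
# features to term lists and includes 'battery', e.g. the camera argument from
# server/agent/argument.py):
# >>> rt = ReviewTokenizer(camera)
# >>> rt.tokenize_review("The battery life is great!")
# ['life', 'great']
# 'the' and 'is' are dropped as stop words, '!' as punctuation, and 'battery'
# because it appears in the product glossary.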
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.functional import cross_entropy
from torch.utils.data import DataLoader
import numpy as np
from sklearn import metrics
import time
from rel_dataset import RelInstance, RelDataset, generate_train_batch, generate_batch, tokenizer
from relbertnet import RelBertNet, NUM_RELS, BATCH_SIZE
train_data_path = 'data/train.json'
test_data_path = 'data/test.json'
trained_model_path = 'bert_extractor2.pt'
MAX_EPOCHS = 3
LEARNING_RATE = 0.00002
def loss(ner_loss, rc_output, target_relation_labels):
if rc_output is None:
return ner_loss
else:
return sum([ner_loss, cross_entropy(rc_output, target_relation_labels)])
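# For illustration: when rc_output is None (presumably no candidate entity
# pairs were produced for the batch), the joint loss falls back to the NER loss
# alone; otherwise the relation-classification cross entropy is added to it,
# weighting the two objectives equally.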
class BertExtractor:
@staticmethod
def default():
sa = BertExtractor()
sa.load_saved(trained_model_path)
return sa
def load_saved(self, path):
self.net = RelBertNet()
self.net.load_state_dict(torch.load(path))
self.net.eval()
def train(self, data_file):
train_data = RelDataset.from_file(data_file, n_instances=40000)
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, num_workers=4,
collate_fn=generate_train_batch)
self.net = RelBertNet()
optimiser = optim.Adam(self.net.parameters(), lr=LEARNING_RATE)
start = time.time()
for epoch in range(MAX_EPOCHS):
batch_loss = 0.0
for idx, (batch, true_ner_tags, instances) in enumerate(train_loader):
# zero param gradients
optimiser.zero_grad()
# forward pass
_, ner_loss, rc_output, target_relation_labels, _ = self.net(batch, instances, ner_tags=true_ner_tags)
# backward pass
l = loss(ner_loss, rc_output, target_relation_labels)
l.backward()