Commit 87c4c359 authored by Joel Oksanen
Browse files

1) Implemented gradual semantics and correlation checking. 2) Added more data/review filtering

parent 4d919498
......@@ -4,6 +4,9 @@ import pandas as pd
import re
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from anytree import Node, PostOrderIter
from functools import reduce
from matplotlib import pyplot
from scipy.stats import pearsonr
sentiment_threshold = 0.3
......@@ -99,15 +102,15 @@ def get_qbaf(ra, review_count):
reviewable_sums[reviewable] += r['vote']
# if there are sub-features, calculate attack/support relations here
supporters = []
attackers = []
# calculate attack/support relations for camera
supports = []
attacks = []
for feature in camera.children:
if reviewable_sums[feature] > 0:
supports.append((feature, camera))
supporters.append(feature)
elif reviewable_sums[feature] < 0:
attacks.append((feature, camera))
attackers.append(feature)
# calculate base scores for reviewables
base_scores = {}
......@@ -115,29 +118,78 @@ def get_qbaf(ra, review_count):
for feature in features:
base_scores[feature] = abs(reviewable_sums[feature]) / review_count
qbaf = {"supports": supports, "attacks": attacks, "base_scores": base_scores}
qbaf = {"supporters": supporters, "attackers": attackers, "base_scores": base_scores}
return qbaf
def combined_strength(args):
    """Aggregate a list of argument strengths per the DF-QUAD semantics.

    Uses the probabilistic sum (co-norm): 1 - prod(1 - v for v in args).
    An empty list yields 0, which leaves the base score untouched when
    fed into argument_strength.

    :param args: iterable of strengths, each expected in [0, 1]
    :return: combined strength in [0, 1]; 0 if args is empty
    """
    # Idiomatic emptiness check (was `len(args) != 0`); explicit product
    # loop replaces the harder-to-read reduce(lambda..., map(lambda...)).
    if not args:
        return 0
    product = 1
    for strength in args:
        product *= 1 - strength
    return 1 - product
def argument_strength(base_score, attacker_strengths, supporter_strengths):
    """DF-QUAD combination of a base score with attacker/supporter strengths.

    Moves base_score toward 1 when aggregated support outweighs aggregated
    attack, and toward 0 when attack outweighs support; unchanged on a tie.

    :param base_score: intrinsic score of the argument, in [0, 1]
    :param attacker_strengths: list of strengths of attacking children
    :param supporter_strengths: list of strengths of supporting children
    :return: adjusted strength in [0, 1]
    """
    delta = combined_strength(supporter_strengths) - combined_strength(attacker_strengths)
    if delta > 0:
        # net support: close a delta-sized fraction of the gap up to 1
        return base_score + (1 - base_score) * delta
    if delta < 0:
        # net attack: shrink the score proportionally (delta is negative)
        return base_score + base_score * delta
    return base_score
# apply DF-QUAD gradual semantics to qbaf
def get_strengths(qbaf):
    """Score every node of the (module-global) camera tree under DF-QUAD.

    Walks the tree bottom-up so each child's strength is available before
    its parent is evaluated, classifying each child as attacker or
    supporter according to the qbaf relations.

    :param qbaf: dict with "attackers", "supporters" and "base_scores"
    :return: dict mapping each tree node to its computed strength
    """
    strengths = {}
    attackers = qbaf["attackers"]
    supporters = qbaf["supporters"]
    # post-order iteration guarantees children are scored before parents
    for node in PostOrderIter(camera):
        att, sup = [], []
        for child in node.children:
            # attacker classification takes precedence, as in get_qbaf
            if child in attackers:
                att.append(strengths[child])
            elif child in supporters:
                sup.append(strengths[child])
        strengths[node] = argument_strength(qbaf["base_scores"][node], att, sup)
    return strengths
#############
# Script: build a QBAF per product from its reviews, evaluate it with the
# DF-QUAD gradual semantics, and correlate the root (camera) strength with
# the product's average star rating.
all_reviews = pd.read_csv('prepared_data.tsv', sep='\t', error_bad_lines=False)

camera_strengths = []
star_rating_averages = []

grouped = all_reviews.groupby('product_id')
for product_id, reviews in grouped:
    # collect vote tuples (the "ra") for this product's reviews
    ra = []
    voting_reviews = 0
    review_count = 0
    star_rating_sum = 0
    for _, review in reviews.iterrows():
        review_id = review['review_id']
        review_count += 1
        star_rating_sum += review['star_rating']
        phrases = extract_phrases(review['review_body'])
        votes = extract_votes(phrases)
        augment_votes(votes)
        if votes:
            voting_reviews += 1
        # final vote tuples with simplified polarity in {+ (true), - (false)}
        for reviewable, vote in votes.items():
            ra.append({'review_id': review_id, 'reviewable': reviewable, 'vote': vote})
    # only consider products where at least 33% of reviewers cast votes
    if voting_reviews / review_count < 0.33:
        continue
    # build the QBAF and report it
    qbaf = get_qbaf(ra, review_count)
    print(reviews['product_title'].iloc[0])
    print(qbaf)
    # evaluate with the gradual semantics and record results
    strengths = get_strengths(qbaf)
    camera_strengths.append(strengths[camera])
    star_rating_averages.append(star_rating_sum / review_count)

# Pearson's correlation between root strengths and average star ratings
correlation, _ = pearsonr(camera_strengths, star_rating_averages)
print(correlation)
# visualise the relationship
pyplot.scatter(camera_strengths, star_rating_averages)
pyplot.show()
......@@ -5,11 +5,18 @@ import re
# Preprocessing: filter the raw Amazon camera-review dump into prepared_data.tsv.
data_location = 'amazon_reviews_us_Camera_v1_00.tsv'
output_location = 'prepared_data.tsv'
# NOTE(review): min_reviews and n are each assigned twice below — this looks
# like interleaved old/new lines from a diff (old: 10/10, new: 30/100).
# As written, the later assignments win; confirm against the repository.
min_reviews = 10
n = 10
min_reviews = 30
min_characters = 25
n = 100
reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False)
# drop reviews with empty review body
reviews = reviews[~reviews['review_body'].isnull()]
# drop reviews with less than min_characters characters
reviews = reviews[reviews['review_body'].apply(lambda x: len(str(x)) >= min_characters)]
# drop reviews for products with less than min_reviews reviews
reviews = reviews.groupby('product_id').filter(lambda x: len(x.index) >= min_reviews)
......
This diff is collapsed.
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment