Commit 4d919498 authored by Joel Oksanen
Browse files

1. Created python script for prepping review data. 2. Python script for...

1. Created python script for prepping review data. 2. Python script for analysing data now creates RA and QBAF for reviews. 3. Created .gitignore
parent 0c8ed9ff
amazon_reviews_us_Camera_v1_00.tsv
amazon_reviews_us_Camera_v1_00.tsv.gz
.DS_Store
import nltk
from nltk.tokenize import sent_tokenize
import pandas as pd
import re
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from anytree import Node, PostOrderIter
# Minimum absolute VADER compound score for a phrase's sentiment to count as a vote
# (used by extract_votes below).
sentiment_threshold = 0.3

# Feature ontology for camera products: a root node with one child per feature.
# NOTE(review): anytree Nodes — the tree is only one level deep here, but the
# traversal code (PostOrderIter, remove_ancestors) supports deeper hierarchies.
camera = Node('camera')
image = Node('image', parent=camera)
video = Node('video', parent=camera)
battery = Node('battery', parent=camera)
flash = Node('flash', parent=camera)
audio = Node('audio', parent=camera)

# All nodes that can receive votes, root included.
reviewables = [camera, image, video, battery, flash, audio]
# Non-root feature nodes only (used for base scores in get_qbaf).
features = [image, video, battery, flash, audio]

# Keyword lists used to detect a mention of each reviewable in a phrase
# (matched by substring containment in get_reviewables).
glossary = {
    camera: ['camera', 'device'],
    image: ['image', 'picture'],
    video: ['video'],
    battery: ['battery'],
    flash: ['flash'],
    audio: ['audio', 'sound']
}
def extract_phrases(review_body):
    """Break a review body into opinion phrases.

    The body is first tokenized into sentences, then each sentence is split
    at contrastive conjunctions (but, although, ...) and HTML line breaks, so
    that each resulting phrase tends to carry a single sentiment.
    """
    splitter = ' but | although | though | otherwise | however | unless | whereas | despite |<br />'
    phrases = []
    for sentence in sent_tokenize(review_body):
        phrases.extend(re.split(splitter, sentence))
    return phrases
# Single shared VADER analyzer instance (stateless across calls).
analyzer = SentimentIntensityAnalyzer()

def get_sentiment(phrase):
    """Return the VADER compound sentiment score for *phrase*, in [-1, 1]."""
    return analyzer.polarity_scores(phrase)['compound']
# remove all ancestors of node in list l
def remove_ancestors(node, l):
if node.parent != None:
try:
l.remove(node.parent)
except ValueError:
pass
remove_ancestors(node.parent, l)
def get_reviewables(phrase):
    """Return the reviewable nodes whose glossary terms occur in *phrase*.

    Candidates are visited in post-order (leaves before the root), so when a
    feature matches, its ancestors are dropped from consideration and the most
    specific mention wins.
    """
    matches = []
    candidates = list(PostOrderIter(camera))
    while candidates:
        candidate = candidates.pop(0)
        # substring containment against every glossary term for this node
        if any(term in phrase for term in glossary[candidate]):
            matches.append(candidate)
            remove_ancestors(candidate, candidates)
    return matches
def extract_votes(phrases):
    """Map each reviewable mentioned in *phrases* to a vote in {+1, -1}.

    For every phrase whose sentiment magnitude exceeds sentiment_threshold,
    each matched reviewable keeps the strongest-magnitude sentiment seen so
    far; the raw scores are then collapsed to +1 (positive) or -1 (negative).
    """
    votes = {}
    for phrase in phrases:
        sentiment = get_sentiment(phrase)
        if abs(sentiment) <= sentiment_threshold:
            continue  # too weak to count as a vote
        for reviewable in get_reviewables(phrase):
            current = votes.get(reviewable)
            if current is None or abs(current) < abs(sentiment):
                votes[reviewable] = sentiment
    # normalize votes to 1 (+) or -1 (-)
    for reviewable in votes:
        votes[reviewable] = 1 if votes[reviewable] > 0 else -1
    return votes
# augment votes (Definition 4.3) obtained for a single critic
def augment_votes(votes):
    """Propagate children's votes upward to reviewables with no direct vote.

    For each reviewable that received no explicit vote, take the sign of the
    sum of its children's votes. Traversal is post-order so children are
    settled before their parent. Mutates *votes* in place.
    """
    for reviewable in PostOrderIter(camera):
        if reviewable in votes:
            continue
        polar_sum = sum(votes.get(subfeat, 0) for subfeat in reviewable.children)
        if polar_sum != 0:
            # BUG FIX: a negative aggregate previously produced a 0 vote
            # instead of -1, breaking the {+1, -1} convention established in
            # extract_votes and silently dropping negative propagation.
            votes[reviewable] = 1 if polar_sum > 0 else -1
def get_qbaf(ra, review_count):
    """Build a QBAF from a review aggregation.

    ra: list of {'review_id', 'reviewable', 'vote'} dicts with votes in
    {+1, -1}; review_count: number of reviews contributing to ra.
    Returns {"supports", "attacks", "base_scores"}.
    """
    # Sum of votes per reviewable. Single pass over ra instead of the
    # original O(len(reviewables) * len(ra)) nested loops; votes for unknown
    # reviewables are ignored, exactly as before.
    reviewable_sums = {reviewable: 0 for reviewable in reviewables}
    for r in ra:
        if r['reviewable'] in reviewable_sums:
            reviewable_sums[r['reviewable']] += r['vote']
    # if there are sub-features, calculate attack/support relations here
    # calculate attack/support relations for camera: a feature with a positive
    # vote sum supports the root, a negative sum attacks it, zero does neither.
    supports = []
    attacks = []
    for feature in camera.children:
        if reviewable_sums[feature] > 0:
            supports.append((feature, camera))
        elif reviewable_sums[feature] < 0:
            attacks.append((feature, camera))
    # Base scores: the root is centred at 0.5 and shifted by its mean vote;
    # features use the magnitude of their mean vote.
    base_scores = {camera: 0.5 + 0.5 * reviewable_sums[camera] / review_count}
    for feature in features:
        base_scores[feature] = abs(reviewable_sums[feature]) / review_count
    return {"supports": supports, "attacks": attacks, "base_scores": base_scores}
#############
# Pipeline: load the prepared review data, then for each product build the
# review aggregation (RA), derive its QBAF, and print the result.
all_reviews = pd.read_csv('prepared_data.tsv', sep='\t', error_bad_lines=False)
for product_id, product_reviews in all_reviews.groupby('product_id'):
    ra = []
    review_count = 0
    for _, review in product_reviews.iterrows():
        review_count += 1
        votes = extract_votes(extract_phrases(review['review_body']))
        augment_votes(votes)
        # add final vote tuples to ra with simplified polarity in {+ (true), - (false)}
        for reviewable, vote in votes.items():
            ra.append({'review_id': review['review_id'],
                       'reviewable': reviewable,
                       'vote': vote})
    qbaf = get_qbaf(ra, review_count)
    # print results
    print(product_reviews['product_title'].iloc[0])
    print(qbaf)
import nltk
import pandas as pd
# Quick sanity check: load the small sample dataset and preview its first rows.
reviews = pd.read_csv('sample_us.tsv', sep='\t')
print(reviews.head())
import nltk
from nltk.tokenize import sent_tokenize
import pandas as pd
import re
# Data preparation: keep only products with enough reviews, take the first n
# such products, and write their reviews to a new TSV for the analysis script.
data_location = 'amazon_reviews_us_Camera_v1_00.tsv'
output_location = 'prepared_data.tsv'
min_reviews = 10  # a product needs at least this many reviews to be kept
n = 10  # number of products to keep

reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False)
# drop reviews for products with less than min_reviews reviews
reviews = reviews.groupby('product_id').filter(lambda group: len(group.index) >= min_reviews)
# choose reviews for the n first products
selected_products = reviews['product_id'].unique()[:n]
reviews = reviews[reviews['product_id'].isin(selected_products)]
reviews.to_csv(output_location, sep='\t', index=False)
This source diff could not be displayed because it is too large. You can view the blob instead.
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment