import pickle
import re
from functools import reduce

import pandas as pd
from anytree import PostOrderIter
from matplotlib import pyplot
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import sent_tokenize
from scipy.stats import pearsonr
from sklearn.metrics import mean_absolute_error

from server.agent.review_tokenizer import tokenize_review, reduce_noise
from server.agent.argument import *

# camera, image, etc. are argument-tree nodes defined in server.agent.argument
# (imported above via *); camera is the root, the rest are features in its subtree
reviewables = [camera, image, video, battery, flash, audio, price, shipping, lens, zoom, af]
features = [image, video, battery, flash, audio, price, shipping, lens, zoom, af]

glossary = {
    camera: ['camera', 'device', 'product'],
    image: ['image', 'picture', ' pic '],
    video: ['video'],
    battery: ['battery'],
    flash: ['flash'],
    audio: ['audio', 'sound'],
    price: ['price', 'value', 'cost'],
    shipping: ['ship'],
    # assumed search terms for lens, zoom and af; get_reviewables() looks up
    # every node under camera, so each reviewable needs an entry here
    lens: ['lens'],
    zoom: ['zoom'],
    af: ['autofocus', ' af ']
}

# only phrases whose sentiment magnitude exceeds this threshold cast a vote
sentiment_threshold = 0.95

# load the pre-trained review sentiment classifier
with open('camera_review_classifier.pickle', 'rb') as f:
    classifier = pickle.load(f)

# split each review into phrases: sentence-tokenize, then break sentences at
# contrast conjunctions so clauses with opposing sentiment are scored separately
def extract_phrases(review_body):
    sentences = sent_tokenize(review_body)
    phrases = []
    for sentence in sentences:
        phrases += re.split(' but | although | though | otherwise | however | unless | whereas | despite |<br />', sentence)
    return phrases
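
# illustrative example (assumed input, not from the dataset):
# extract_phrases("The image quality is great but the battery dies fast.")
# -> ['The image quality is great', 'the battery dies fast.']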

# analyze sentiment
analyzer = SentimentIntensityAnalyzer()

def get_sentiment(phrase):
    # get vader score
    # vader_s = analyzer.polarity_scores(phrase)
    # compound_s = vader_s['compound']

    # get classification
    tokens = reduce_noise(tokenize_review(phrase))
    prob_classification = classifier.prob_classify(dict([token, True] for token in tokens))

    # return compound_s if the two agree
    # if (classification == '+' and compound_s > 0) or (classification == '-' and compound_s < 0):
    #     return compound_s
    # else:
    #     return 0

    # map the classifier's confidence onto [-1, 1]: certainty of '+' gives 1,
    # certainty of '-' gives -1, an even 0.5 split gives 0
    classification = prob_classification.max()
    strength = (prob_classification.prob(classification) - 0.5) * 2
    return strength if classification == '+' else -strength
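
# illustrative mapping (assumed probabilities): P('+') = 0.9 yields a strength
# of (0.9 - 0.5) * 2 = 0.8; P('-') = 0.9 yields -0.8; an even split yields 0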

# remove all ancestors of node from list l
def remove_ancestors(node, l):
    if node.parent is not None:
        try:
            l.remove(node.parent)
        except ValueError:
            pass
        remove_ancestors(node.parent, l)

# get reviewable(s) that match phrase; descendants are tried before their
# ancestors (post-order), and a match removes its ancestors from the candidate
# list so that only the most specific reviewable is kept
def get_reviewables(phrase):
    reviewable_matches = []
    candidates = [node for node in PostOrderIter(camera)]
    while len(candidates) > 0:
        f = candidates.pop(0)
        for word in glossary[f]:
            if word in phrase:
                reviewable_matches.append(f)
                remove_ancestors(f, candidates)
                break
    return reviewable_matches
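
# illustrative example (assumed phrase): 'great picture quality for the price'
# would match image (via 'picture') and price, but not camera, whose glossary
# terms do not occur in the phrase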

def extract_votes(phrases):
    votes = {}
    for phrase in phrases:
        matched = get_reviewables(phrase)
        sentiment = get_sentiment(phrase)
        if abs(sentiment) > sentiment_threshold:
            for reviewable in matched:
                # when several phrases mention the same reviewable, keep the
                # vote with the strongest sentiment
                if (reviewable not in votes) or (abs(votes[reviewable]) < abs(sentiment)):
                    votes[reviewable] = sentiment
    # normalize votes to 1 (+) or -1 (-)
    for reviewable in votes:
        votes[reviewable] = 1 if votes[reviewable] > 0 else -1
    return votes
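
# illustrative run (assumed sentiment scores): two phrases about battery scoring
# +0.96 and -0.99 both clear the 0.95 threshold; the stronger negative vote wins
# and is normalized, so extract_votes returns {battery: -1}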

# augment votes (Definition 4.3) obtained for a single critic: a reviewable
# with no direct vote inherits the majority polarity of its children's votes
def augment_votes(votes):
    reviewables = [node for node in PostOrderIter(camera)]
    for reviewable in reviewables:
        if reviewable not in votes:
            polar_sum = 0
            for subfeat in reviewable.children:
                if subfeat in votes:
                    polar_sum += votes[subfeat]
            if polar_sum != 0:
                votes[reviewable] = 1 if polar_sum > 0 else -1
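
# illustrative example: given votes {image: 1, zoom: 1, battery: -1} and
# assuming all three are children of camera, the polar sum is +1, so camera
# receives the augmented vote 1; a zero sum would leave camera unvoted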

def get_qbaf(ra, review_count):
    # sums of all positive and negative votes for reviewables
    reviewable_sums = {}
    for reviewable in reviewables:
        reviewable_sums[reviewable] = 0
        for r in ra:
            if r['reviewable'] == reviewable:
                reviewable_sums[reviewable] += r['vote']

    # calculate attack/support relations: a child whose vote sum is positive
    # supports its parent, a negative sum makes it an attacker
    supporters = {r: [] for r in reviewables}
    attackers = {r: [] for r in reviewables}
    for r in reviewables:
        for subf in r.children:
            if reviewable_sums[subf] > 0:
                supporters[r].append(subf)
            elif reviewable_sums[subf] < 0:
                attackers[r].append(subf)

    # calculate base scores for reviewables: the camera's score is centred on
    # 0.5 and shifted by its net votes; features get the absolute fraction of
    # reviews voting on them
    base_scores = {}
    base_scores[camera] = 0.5 + 0.5 * reviewable_sums[camera] / review_count
    for feature in features:
        base_scores[feature] = abs(reviewable_sums[feature]) / review_count

    qbaf = {"supporters": supporters, "attackers": attackers, "base_scores": base_scores}
    return qbaf
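
# schematic of the returned QBAF (illustrative, assuming image supports and
# battery attacks camera):
# {"supporters": {camera: [image], ...}, "attackers": {camera: [battery], ...},
#  "base_scores": {camera: 0.6, image: 0.4, battery: 0.2, ...}}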

# probabilistic sum of a list of strengths (the DF-QUAD aggregation function)
def combined_strength(args):
    if len(args) != 0:
        return 1 - reduce(lambda x, y: x * y, map(lambda v: 1 - v, args))
    return 0

# DF-QUAD influence function: move the base score towards 0 or 1 depending on
# whether the aggregated attack or support is stronger
def argument_strength(base_score, attacker_strengths, supporter_strengths):
    attack = combined_strength(attacker_strengths)
    support = combined_strength(supporter_strengths)
    if attack > support:
        return base_score - (base_score * abs(attack - support))
    elif attack < support:
        return base_score + ((1 - base_score) * abs(attack - support))
    return base_score
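
# worked example (illustrative values): base score 0.5, attacker strengths
# [0.2, 0.2], no supporters: attack = 1 - 0.8 * 0.8 = 0.36, support = 0,
# giving 0.5 - 0.5 * 0.36 = 0.32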

# apply DF-QUAD gradual semantics to qbaf; post-order traversal guarantees
# children are scored before their parents
def get_strengths(qbaf):
    strengths = {}
    reviewables = [node for node in PostOrderIter(camera)]
    for reviewable in reviewables:
        attacker_strengths = []
        supporter_strengths = []
        for child in reviewable.children:
            if child in qbaf["attackers"][reviewable]:
                attacker_strengths.append(strengths[child])
            elif child in qbaf["supporters"][reviewable]:
                supporter_strengths.append(strengths[child])
        strengths[reviewable] = argument_strength(qbaf["base_scores"][reviewable], attacker_strengths, supporter_strengths)
    return strengths

#############

# skip malformed rows; pandas >= 1.3 uses on_bad_lines in place of error_bad_lines
all_reviews = pd.read_csv('amazon_data/camera_prepared_data.tsv', sep='\t', on_bad_lines='skip')

camera_strengths = []
star_rating_averages = []

products_analyzed = 0
grouped = all_reviews.groupby('product_id')
for product_id, reviews in grouped:
    # collect the review aggregation (ra): one vote tuple per review/reviewable
    ra = []
    voting_reviews = 0
    review_count = 0
    star_rating_sum = 0

    for _, review in reviews.iterrows():
        review_id = review['review_id']
        review_count += 1
        star_rating_sum += review['star_rating']

        phrases = extract_phrases(review['review_body'])
        votes = extract_votes(phrases)
        augment_votes(votes)
        voting_reviews += 1 if len(votes) > 0 else 0

        # add final vote tuples to ra with polarity simplified to +1 / -1
        for reviewable in votes:
            ra.append({'review_id': review_id, 'reviewable': reviewable, 'vote': votes[reviewable]})

    # only consider products where at least 33% of reviews produced votes
    if voting_reviews / review_count < 0.33:
        continue

    # get qbaf from ra
    qbaf = get_qbaf(ra, review_count)
    # apply gradual semantics
    strengths = get_strengths(qbaf)
    # store results
    camera_strengths.append(strengths[camera])
    star_rating_averages.append(star_rating_sum / review_count)
    products_analyzed += 1
    print('products analyzed:', products_analyzed)

# calculate Pearson's correlation
correlation, _ = pearsonr(camera_strengths, star_rating_averages)
print("pearson correlation: ", correlation)

# calculate MAE between strengths and star averages rescaled from [1, 5] to [0, 1]
scaled_star_rating_avgs = list(map(lambda x: (x - 1) / 4, star_rating_averages))
mae = mean_absolute_error(scaled_star_rating_avgs, camera_strengths)
print("mae: ", mae)

# plot the resulting correlation
pyplot.scatter(camera_strengths, scaled_star_rating_avgs)
pyplot.show()
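
# manual sanity check of augment_votes / get_qbaf / get_strengths on a toy set
# of critic votes (uncomment to run)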

# vs = [{camera: 1, image: 1, zoom: -1},
# {camera: 1, image: 1, battery: -1},
# {image: 1, battery: 1, af: 1},
# {af: 1},
# {camera: -1, zoom: -1},
# {camera: -1, image: -1, af: 1},
# {battery: -1}]
#
# ra = []
# for v in vs:
#     print(v)
#     augment_votes(v)
#     print(v)
#     for reviewable in v:
#         ra.append({'reviewable': reviewable, 'vote': v[reviewable]})
#
# qbaf = get_qbaf(ra, len(vs))
# strengths = get_strengths(qbaf)
# print(qbaf)
# print(strengths)