Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Joel Oksanen
individual_project
Commits
87c4c359
Commit
87c4c359
authored
Jan 20, 2020
by
Joel Oksanen
Browse files
1) Implemented gradual semantics and correlation checking. 2) Added more data/review filtering
parent
4d919498
Changes
3
Expand all
Hide whitespace changes
Inline
Side-by-side
analyze_data.py
View file @
87c4c359
...
...
@@ -4,6 +4,9 @@ import pandas as pd
import
re
from
nltk.sentiment.vader
import
SentimentIntensityAnalyzer
from
anytree
import
Node
,
PostOrderIter
from
functools
import
reduce
from
matplotlib
import
pyplot
from
scipy.stats
import
pearsonr
sentiment_threshold
=
0.3
...
...
@@ -99,15 +102,15 @@ def get_qbaf(ra, review_count):
reviewable_sums
[
reviewable
]
+=
r
[
'vote'
]
# if there are sub-features, calculate attack/support relations here
supporters
=
[]
attackers
=
[]
# calculate attack/support relations for camera
supports
=
[]
attacks
=
[]
for
feature
in
camera
.
children
:
if
reviewable_sums
[
feature
]
>
0
:
supports
.
append
(
(
feature
,
camera
)
)
support
er
s
.
append
(
feature
)
elif
reviewable_sums
[
feature
]
<
0
:
attacks
.
append
(
(
feature
,
camera
)
)
attack
er
s
.
append
(
feature
)
# calculate base scores for reviewables
base_scores
=
{}
...
...
@@ -115,29 +118,78 @@ def get_qbaf(ra, review_count):
for
feature
in
features
:
base_scores
[
feature
]
=
abs
(
reviewable_sums
[
feature
])
/
review_count
qbaf
=
{
"supports"
:
supports
,
"attacks"
:
attacks
,
"base_scores"
:
base_scores
}
qbaf
=
{
"support
er
s"
:
support
er
s
,
"attack
er
s"
:
attack
er
s
,
"base_scores"
:
base_scores
}
return
qbaf
def combined_strength(args):
    """Combine a list of argument strengths per DF-QUAD aggregation.

    Implements 1 - prod(1 - v for v in args): the combined effect of several
    attackers (or supporters) grows toward 1 as more/stronger arguments are
    added, and a single strength-1 argument saturates the result at 1.

    :param args: list of strengths, each expected in [0, 1]
    :return: combined strength in [0, 1]; 0 when args is empty
    """
    # Empty list of attackers/supporters exerts no force at all.
    if not args:
        return 0
    # Plain accumulation loop instead of reduce(lambda...)+map(lambda...):
    # same arithmetic, far easier to read.
    product = 1.0
    for v in args:
        product *= 1 - v
    return 1 - product
def argument_strength(base_score, attacker_strengths, supporter_strengths):
    """DF-QUAD influence function for a single argument.

    Aggregates the attacker and supporter strengths (via combined_strength)
    and moves the base score toward 0 when attack dominates, toward 1 when
    support dominates, leaving it untouched on a tie.

    :param base_score: intrinsic strength of the argument, in [0, 1]
    :param attacker_strengths: strengths of the argument's attackers
    :param supporter_strengths: strengths of the argument's supporters
    :return: final dialectical strength in [0, 1]
    """
    attack = combined_strength(attacker_strengths)
    support = combined_strength(supporter_strengths)
    delta = abs(attack - support)
    if attack > support:
        # Attackers dominate: shrink the score proportionally toward 0.
        return base_score - base_score * delta
    if attack < support:
        # Supporters dominate: grow the score proportionally toward 1.
        return base_score + (1 - base_score) * delta
    # Perfectly balanced forces leave the base score unchanged.
    return base_score
# apply DF-QUAD gradual semantics to qbaf
def get_strengths(qbaf):
    """Score every reviewable node in the tree under DF-QUAD semantics.

    Walks the tree rooted at the module-level `camera` node in post-order,
    so each node's children already have strengths when the node itself is
    evaluated. A child contributes as an attacker or supporter according to
    its membership in qbaf["attackers"] / qbaf["supporters"].

    :param qbaf: dict with "attackers", "supporters" and "base_scores" keys
    :return: dict mapping each node to its computed strength
    """
    strengths = {}
    # Post-order traversal guarantees children are scored before parents.
    for node in PostOrderIter(camera):
        att, sup = [], []
        for child in node.children:
            if child in qbaf["attackers"]:
                att.append(strengths[child])
            elif child in qbaf["supporters"]:
                sup.append(strengths[child])
            # NOTE(review): a child in neither list is silently ignored —
            # presumably it received no votes; confirm against get_qbaf.
        strengths[node] = argument_strength(qbaf["base_scores"][node], att, sup)
    return strengths
#############
# Top-level driver: for each product, extract vote tuples from its reviews,
# build a QBAF, apply DF-QUAD gradual semantics, and finally check how well
# the root ("camera") strength correlates with the average star rating.

# NOTE(review): error_bad_lines is a legacy pandas flag (skips malformed
# rows when False would raise; here it suppresses hard failures).
all_reviews = pd.read_csv('prepared_data.tsv', sep='\t', error_bad_lines=False)

camera_strengths = []        # per-product DF-QUAD strength of the root node
star_rating_averages = []    # per-product mean star rating
grouped = all_reviews.groupby('product_id')

for product_id, reviews in grouped:
    # get ra
    ra = []                  # vote tuples for this product's reviews
    voting_reviews = 0       # number of reviews that produced >= 1 vote
    review_count = 0
    star_rating_sum = 0
    for _, review in reviews.iterrows():
        review_id = review['review_id']
        review_count += 1
        star_rating_sum += review['star_rating']
        # extract_phrases / extract_votes / augment_votes are defined
        # earlier in this file (not visible in this chunk).
        phrases = extract_phrases(review['review_body'])
        votes = extract_votes(phrases)
        augment_votes(votes)
        voting_reviews += 1 if len(votes) > 0 else 0
        # add final vote tuples to ra with simplified polarity in {+ (true), - (false)}
        for reviewable in votes:
            ra.append({'review_id': review_id, 'reviewable': reviewable, 'vote': votes[reviewable]})
    # only consider items that obtained votes from at least 33% of reviewers
    if voting_reviews / review_count < 0.33:
        continue
    # get qbaf from ra
    qbaf = get_qbaf(ra, review_count)
    # print results
    print(reviews['product_title'].iloc[0])
    print(qbaf)
    # apply gradual semantics
    strengths = get_strengths(qbaf)
    # store results
    # `camera` is the module-level root node of the feature tree —
    # its strength summarises the whole product.
    camera_strengths.append(strengths[camera])
    star_rating_averages.append(star_rating_sum / review_count)

# calculate Pearson's correlation
correlation, _ = pearsonr(camera_strengths, star_rating_averages)
print(correlation)

# plot result correlation
pyplot.scatter(camera_strengths, star_rating_averages)
pyplot.show()
prep_data.py
View file @
87c4c359
...
...
@@ -5,11 +5,18 @@ import re
data_location
=
'amazon_reviews_us_Camera_v1_00.tsv'
output_location
=
'prepared_data.tsv'
min_reviews
=
10
n
=
10
# Filtering thresholds for the raw Amazon camera-review dump.
min_reviews = 30       # keep only products with at least this many reviews
min_characters = 25    # keep only review bodies at least this long
n = 100                # NOTE(review): sample size? used further down — confirm

# NOTE(review): error_bad_lines is a legacy pandas flag (False = skip
# malformed rows instead of raising). data_location is defined above.
reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False)
# drop reviews with empty review body
reviews = reviews[~reviews['review_body'].isnull()]
# drop reviews with less than min_characters characters
reviews = reviews[reviews['review_body'].apply(lambda x: len(str(x)) >= min_characters)]
# drop reviews for products with less than min_reviews reviews
reviews = reviews.groupby('product_id').filter(lambda x: len(x.index) >= min_reviews)
...
...
prepared_data.tsv
View file @
87c4c359
This diff is collapsed.
Click to expand it.
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment