Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Joel Oksanen
individual_project
Commits
7cc83e97
Commit
7cc83e97
authored
Jan 22, 2020
by
Joel Oksanen
Browse files
Implemented classifier in analyze_data.py
parent
7ce1a50e
Changes
6
Hide whitespace changes
Inline
Side-by-side
.gitignore
View file @
7cc83e97
...
...
@@ -3,3 +3,4 @@ amazon_reviews_us_Camera_v1_00.tsv.gz
amazon_reviews_us_Digital_Video_Games_v1_00.tsv
*.tsv
.DS_Store
__pycache__/
analyze_data.py
View file @
7cc83e97
...
...
@@ -7,6 +7,8 @@ from anytree import Node, PostOrderIter
from
functools
import
reduce
from
matplotlib
import
pyplot
from
scipy.stats
import
pearsonr
import
pickle
from
review_tokenizer
import
tokenize_review
# Minimum |compound| sentiment magnitude for a phrase to be treated as
# opinionated -- presumably compared against VADER scores downstream; confirm.
sentiment_threshold = 0.3
...
...
@@ -33,6 +35,10 @@ price: ['price', 'value'],
shipping
:
[
'ship'
]
}
# Load the trained review classifier once at import time.
# Fix: the original open()/load()/close() sequence leaked the file handle if
# pickle.load raised; a with-block guarantees the file is closed.
# NOTE(review): pickle.load is unsafe on untrusted input -- this assumes the
# pickle was produced locally (see train_classifier.py in the same repo).
with open('camera_review_classifier.pickle', 'rb') as f:
    classifier = pickle.load(f)
# extract phrases
def
extract_phrases
(
review_body
):
sentences
=
sent_tokenize
(
review_body
)
...
...
@@ -44,9 +50,17 @@ def extract_phrases(review_body):
# analyze sentiment
analyzer
=
SentimentIntensityAnalyzer
()
def get_sentiment(phrase):
    """Score the sentiment of *phrase*, cross-checked against the classifier.

    Returns the VADER compound score when the pickled classifier agrees with
    its polarity ('+' with a positive score, '-' with a negative score), and
    0 (neutral) when the two models disagree.
    """
    # get vader score
    vader_s = analyzer.polarity_scores(phrase)
    compound_s = vader_s['compound']
    # Fix: removed a stray unconditional `return compound_s` here (a leftover
    # of the pre-classifier version) that made everything below unreachable.
    # get classification from the trained Naive Bayes classifier, which
    # expects a {token: True} feature mapping
    tokens = tokenize_review(phrase)
    classification = classifier.classify({token: True for token in tokens})
    # return compound_s only if the two models agree on polarity
    if (classification == '+' and compound_s > 0) or \
            (classification == '-' and compound_s < 0):
        return compound_s
    else:
        return 0
# remove all ancestors of node in list l
def
remove_ancestors
(
node
,
l
):
...
...
@@ -161,6 +175,7 @@ all_reviews = pd.read_csv('camera_prepared_data.tsv', sep='\t', error_bad_lines=
# Per-product accumulators for the correlation analysis: computed strength
# scores, mean star ratings, and a running count of products processed.
camera_strengths, star_rating_averages = [], []
products_analyzed = 0
# analyze each product's reviews as one group
grouped = all_reviews.groupby('product_id')
for
product_id
,
reviews
in
grouped
:
# get ra
...
...
@@ -189,6 +204,8 @@ for product_id, reviews in grouped:
# store results
camera_strengths
.
append
(
strengths
[
camera
])
star_rating_averages
.
append
(
star_rating_sum
/
review_count
)
products_analyzed
+=
1
print
(
products_analyzed
)
# calculate Pearson's correlation
correlation
,
_
=
pearsonr
(
camera_strengths
,
star_rating_averages
)
...
...
camera_review_classifier.pickle
View file @
7cc83e97
No preview for this file type
prep_data.py
View file @
7cc83e97
...
...
@@ -4,12 +4,18 @@ import pandas as pd
import
re
# Input/output locations for the preparation pipeline.
data_location = 'amazon_reviews_us_Camera_v1_00.tsv'
training_data_location = 'reviews_trained_on.tsv'
output_location = 'camera_prepared_data.tsv'

# Filtering parameters.
# Fix: the scraped diff interleaved the removed old values (min_characters = 25,
# n = 100) with the added new ones; only the post-commit values are kept.
min_reviews = 50      # presumably minimum reviews required per product -- confirm
min_characters = 50   # presumably minimum review-body length -- confirm
n = 500               # presumably sample/product count cap -- confirm downstream use
# Load the full camera review dataset.
# Fix: the diff scrape contained the identical read_csv line twice (a
# removed/added pair) plus a garbled 'ß' character that would be a
# SyntaxError; a single load is kept.
# NOTE(review): error_bad_lines is deprecated in modern pandas
# (use on_bad_lines='skip'); kept as-is to match the file's pandas version.
reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False)

# drop reviews used for training: appending the training set and dropping
# ALL duplicates (keep=False) removes every review that appears in both
training_reviews = pd.read_csv(training_data_location, sep='\t', error_bad_lines=False)
reviews = pd.concat([reviews, training_reviews])
reviews = reviews.drop_duplicates(keep=False)

# drop reviews with empty review body
reviews = reviews[~reviews['review_body'].isnull()]
...
...
review_tokenizer.py
0 → 100644
View file @
7cc83e97
from nltk.tokenize import TweetTokenizer

# One shared tokenizer instance, reused across all calls.
tokenizer = TweetTokenizer()


def tokenize_review(review):
    """Split a review string into tokens with NLTK's TweetTokenizer."""
    return tokenizer.tokenize(review)
train_classifier.py
View file @
7cc83e97
import
pandas
as
pd
from
nltk.tokenize
import
sent_tokenize
import
re
from
nltk.
tokenize
import
TweetT
okenize
r
from
review_
tokenize
r
import
t
okenize
_review
import
random
from
nltk
import
classify
,
NaiveBayesClassifier
import
pickle
...
...
@@ -14,10 +14,6 @@ max_characters = 100
# Training-run parameters.
n = 50000           # presumably number of reviews sampled for training -- confirm
train_factor = 0.7  # presumably train/test split fraction -- confirm
# Contrast markers -- presumably a regex alternation used to split reviews
# into clauses whose sentiment may oppose each other; confirm at the use site.
separators = ' but |although|though|otherwise|however|unless|whereas|despite|<br />'
tokenizer
=
TweetTokenizer
()
def
tokenize_review
(
review
):
return
tokenizer
.
tokenize
(
review
)
def
get_tokens
(
tokenized_reviews_list
):
for
review
in
tokenized_reviews_list
:
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment