Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Joel Oksanen
individual_project
Commits
c26a85bc
Commit
c26a85bc
authored
Jan 22, 2020
by
Joel Oksanen
Browse files
prep_data.py now filters out reviews for camera accessories, added shipping as a feature
parent
b2aa259d
Changes
4
Expand all
Hide whitespace changes
Inline
Side-by-side
.gitignore
View file @
c26a85bc
amazon_reviews_us_Camera_v1_00.tsv
amazon_reviews_us_Camera_v1_00.tsv.gz
amazon_reviews_us_Digital_Video_Games_v1_00.tsv
*.tsv
.DS_Store
analyze_data.py
View file @
c26a85bc
...
...
@@ -17,18 +17,20 @@ battery = Node('battery', parent=camera)
flash
=
Node
(
'flash'
,
parent
=
camera
)
audio
=
Node
(
'audio'
,
parent
=
camera
)
price
=
Node
(
'price'
,
parent
=
camera
)
shipping
=
Node
(
'shipping'
,
parent
=
camera
)
reviewables
=
[
camera
,
image
,
video
,
battery
,
flash
,
audio
,
price
]
features
=
[
image
,
video
,
battery
,
flash
,
audio
,
price
]
reviewables
=
[
camera
,
image
,
video
,
battery
,
flash
,
audio
,
price
,
shipping
]
features
=
[
image
,
video
,
battery
,
flash
,
audio
,
price
,
shipping
]
glossary
=
{
camera
:
[
'camera'
,
'device'
],
image
:
[
'image'
,
'picture'
],
camera
:
[
'camera'
,
'device'
,
'product'
],
image
:
[
'image'
,
'picture
'
,
' pic
'
],
video
:
[
'video'
],
battery
:
[
'battery'
],
flash
:
[
'flash'
],
audio
:
[
'audio'
,
'sound'
],
price
:
[
'price'
,
'value'
]
price
:
[
'price'
,
'value'
],
shipping
:
[
'ship'
]
}
# extract phrases
...
...
@@ -154,7 +156,7 @@ def get_strengths(qbaf):
#############
all_reviews
=
pd
.
read_csv
(
'prepared_data.tsv'
,
sep
=
'
\t
'
,
error_bad_lines
=
False
)
all_reviews
=
pd
.
read_csv
(
'
camera_
prepared_data.tsv'
,
sep
=
'
\t
'
,
error_bad_lines
=
False
)
camera_strengths
=
[]
star_rating_averages
=
[]
...
...
prep_data.py
View file @
c26a85bc
...
...
@@ -4,8 +4,8 @@ import pandas as pd
import
re
data_location
=
'amazon_reviews_us_Camera_v1_00.tsv'
output_location
=
'prepared_data.tsv'
min_reviews
=
3
0
output_location
=
'
camera_
prepared_data.tsv'
min_reviews
=
5
0
min_characters
=
25
n
=
100
...
...
@@ -14,6 +14,16 @@ reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False)
# drop reviews with empty review body
reviews
=
reviews
[
~
reviews
[
'review_body'
].
isnull
()]
# try to filter out reviews for camera accessories
filter_words
=
[
'accessor'
,
'battery'
,
'charger'
,
'tripod'
,
'strap'
,
'case'
,
'bag'
,
'backpack'
,
'kit'
,
'printer'
,
'adapter'
,
'album'
,
'surveillance'
,
'security'
]
filter
=
''
for
word
in
filter_words
:
word_filter
=
'['
+
word
[
0
].
upper
()
+
word
[
0
].
lower
()
+
']'
+
word
[
1
:]
filter
+=
word_filter
+
'|'
filter
=
filter
[:
-
1
]
reviews
=
reviews
[
~
reviews
[
'product_title'
].
str
.
contains
(
pat
=
filter
,
regex
=
True
)]
# drop reviews with less than min_characters characters
reviews
=
reviews
[
reviews
[
'review_body'
].
apply
(
lambda
x
:
len
(
str
(
x
))
>=
min_characters
)]
...
...
@@ -24,3 +34,5 @@ reviews = reviews.groupby('product_id').filter(lambda x: len(x.index) >= min_rev
reviews
=
reviews
[
reviews
[
'product_id'
].
isin
(
reviews
[
'product_id'
].
unique
()[:
n
])]
reviews
.
to_csv
(
output_location
,
sep
=
'
\t
'
,
index
=
False
)
print
(
"Successfully prepared reviews for"
,
reviews
.
groupby
(
'product_id'
).
ngroups
,
"products"
,
sep
=
" "
)
prepared_data.tsv
deleted
100644 → 0
View file @
b2aa259d
This diff is collapsed.
Click to expand it.
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment