Commit c26a85bc authored by Joel Oksanen
Browse files

prep_data.py now filters out reviews for camera accessories, added shipping as a feature

parent b2aa259d
amazon_reviews_us_Camera_v1_00.tsv
amazon_reviews_us_Camera_v1_00.tsv.gz
amazon_reviews_us_Digital_Video_Games_v1_00.tsv
*.tsv
.DS_Store
......@@ -17,18 +17,20 @@ battery = Node('battery', parent=camera)
flash = Node('flash', parent=camera)
audio = Node('audio', parent=camera)
price = Node('price', parent=camera)
shipping = Node('shipping', parent=camera)
reviewables = [camera, image, video, battery, flash, audio, price]
features = [image, video, battery, flash, audio, price]
reviewables = [camera, image, video, battery, flash, audio, price, shipping]
features = [image, video, battery, flash, audio, price, shipping]
glossary = {
camera: ['camera', 'device'],
image: ['image', 'picture'],
camera: ['camera', 'device', 'product'],
image: ['image', 'picture', ' pic '],
video: ['video'],
battery: ['battery'],
flash: ['flash'],
audio: ['audio', 'sound'],
price: ['price', 'value']
price: ['price', 'value'],
shipping: ['ship']
}
# extract phrases
......@@ -154,7 +156,7 @@ def get_strengths(qbaf):
#############
all_reviews = pd.read_csv('prepared_data.tsv', sep='\t', error_bad_lines=False)
all_reviews = pd.read_csv('camera_prepared_data.tsv', sep='\t', error_bad_lines=False)
camera_strengths = []
star_rating_averages = []
......
......@@ -4,8 +4,8 @@ import pandas as pd
import re
data_location = 'amazon_reviews_us_Camera_v1_00.tsv'
output_location = 'prepared_data.tsv'
min_reviews = 30
output_location = 'camera_prepared_data.tsv'
min_reviews = 50
min_characters = 25
n = 100
......@@ -14,6 +14,16 @@ reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False)
# drop reviews with empty review body
reviews = reviews[~reviews['review_body'].isnull()]
# Try to filter out reviews for camera accessories: drop every review whose
# product title contains one of these words. Matching is by substring, so
# 'accessor' also covers 'accessory' and 'accessories'.
filter_words = ['accessor', 'battery', 'charger', 'tripod', 'strap', 'case', 'bag',
                'backpack', 'kit', 'printer', 'adapter', 'album', 'surveillance', 'security']
# Build one alternation in which only the first letter of each word may be
# either case, e.g. '[Bb]attery|[Cc]harger|...'.
accessory_pattern = '|'.join(
    '[' + word[0].upper() + word[0].lower() + ']' + word[1:]
    for word in filter_words
)
# na=False makes rows with a missing product_title count as "no match", so they
# are kept and the mask stays purely boolean; without it the NaN entries make
# the `~` negation raise a TypeError.
reviews = reviews[~reviews['product_title'].str.contains(accessory_pattern, regex=True, na=False)]
# drop reviews with less than min_characters characters
reviews = reviews[reviews['review_body'].apply(lambda x: len(str(x)) >= min_characters)]
......@@ -24,3 +34,5 @@ reviews = reviews.groupby('product_id').filter(lambda x: len(x.index) >= min_rev
reviews = reviews[reviews['product_id'].isin(reviews['product_id'].unique()[:n])]
reviews.to_csv(output_location, sep='\t', index=False)
print("Successfully prepared reviews for", reviews.groupby('product_id').ngroups, "products", sep=" ")
This diff is collapsed.
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment