product_finder.py 1.3 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import pandas as pd

# Input files (tab-separated) and the minimum review length, in characters,
# that a review body must have to be kept further down in the script.
data_location = 'amazon_reviews_us_Camera_v1_00.tsv'
training_data_location = 'reviews_trained_on.tsv'
min_characters = 50


def _read_reviews(path):
    """Load a tab-separated review file, skipping malformed lines with a warning.

    ``error_bad_lines`` was deprecated in pandas 1.3 and removed in pandas 2.0;
    ``on_bad_lines='warn'`` is its replacement with the same skip-and-warn
    behaviour. The new keyword is tried first and the old one is used as a
    fallback so the script runs on both old and current pandas releases.
    """
    try:
        return pd.read_csv(path, sep='\t', on_bad_lines='warn')
    except TypeError:
        # pandas < 1.3 does not accept ``on_bad_lines``.
        return pd.read_csv(path, sep='\t', error_bad_lines=False)


reviews = _read_reviews(data_location)

# drop reviews used for training
# Stacking both frames and removing *every* row that occurs more than once
# (keep=False) discards the rows the two files have in common.
# NOTE(review): this also removes rows that are duplicated inside the main
# file itself, and keeps rows that exist only in the training file; it assumes
# both files share the same columns and parse to the same values -- confirm.
training_reviews = _read_reviews(training_data_location)
reviews = pd.concat([reviews, training_reviews])
reviews = reviews.drop_duplicates(keep=False)

# keep only the reviews whose review_body is present (not NaN/None)
has_body = reviews['review_body'].notnull()
reviews = reviews[has_body]

# try to filter out reviews for camera accessories
# A review is dropped when its product_title contains any of these keywords.
filter_words = ['accessor', 'battery', 'charger', 'tripod', 'strap', 'case', 'bag',
                'backpack', 'kit', 'printer', 'adapter', 'album', 'surveillance', 'security']
# Each keyword becomes an alternative that accepts an upper- or lower-case
# first letter, e.g. 'battery' -> '[Bb]attery'; the alternatives are OR-ed.
accessory_pattern = '|'.join(
    '[' + word[0].upper() + word[0].lower() + ']' + word[1:]
    for word in filter_words
)
# na=False treats a missing product_title as "no match". Without it the mask
# contains NaN and the `~` negation below raises a TypeError.
is_accessory = reviews['product_title'].str.contains(
    accessory_pattern, regex=True, na=False
)
reviews = reviews[~is_accessory]

# keep only reviews whose body, converted to a string, has at least
# min_characters characters
is_long_enough = reviews['review_body'].map(
    lambda body: min_characters <= len(str(body))
)
reviews = reviews[is_long_enough]

# count the remaining reviews per (product_id, product_title) pair and print
# the 50 products with the most reviews
# NOTE(review): nothing is dropped here and no min_reviews threshold exists in
# this script; the counts are only displayed.
product_keys = ['product_id', 'product_title']
review_counts = reviews.groupby(product_keys, sort=False).size()
grouped = review_counts.sort_values(ascending=False)
print(grouped.head(50))