# prep_data.py
#
# Prepares the Amazon camera review TSV for later use: removes reviews that
# were already used for training, filters out accessory products, short
# reviews and rarely-reviewed products, and writes the result to a new TSV.
import pandas as pd

# Input: tab-separated review dump that this script prepares.
data_location = 'amazon_data/amazon_reviews_us_Camera_v1_00.tsv'
# Input: tab-separated reviews already used for training; they are removed
# from the prepared data.
training_data_location = 'amazon_data/reviews_trained_on.tsv'
# Output: path the prepared reviews are written to (tab-separated, no index).
output_location = 'amazon_data/camera_prepared_data.tsv'
# Products left with fewer reviews than this are dropped.
min_reviews = 50
# Reviews whose body has fewer characters than this are dropped.
min_characters = 50
# Maximum number of distinct products kept in the output.
n = 500

# Load the full review dump. on_bad_lines='warn' skips malformed rows and
# reports each one, which is what error_bad_lines=False (with the default
# warn_bad_lines=True) used to do. That keyword was removed in pandas 2.0;
# on_bad_lines requires pandas >= 1.3.
reviews = pd.read_csv(data_location, sep='\t', on_bad_lines='warn')

# drop reviews used for training
training_reviews = pd.read_csv(training_data_location, sep='\t', on_bad_lines='warn')

# Anti-join: keep only the rows of `reviews` that have no identical row in the
# training file. A left merge with an indicator column is used instead of
# concat + drop_duplicates(keep=False), because that approach also
#   * kept training rows that never occurred in `reviews`, and
#   * removed rows that were merely duplicated inside `reviews` itself.
# Rows are compared on the columns both files share; de-duplicating the
# training rows first guarantees the left merge cannot multiply rows.
shared_columns = reviews.columns.intersection(training_reviews.columns)
training_rows = training_reviews[shared_columns].drop_duplicates()
reviews = reviews.merge(training_rows, how='left', indicator=True)
reviews = reviews[reviews['_merge'] == 'left_only'].drop(columns='_merge')

# keep only the rows that actually have a review body
has_body = reviews['review_body'].notnull()
reviews = reviews[has_body]

# try to filter out reviews for camera accessories
filter_words = ['accessor', 'battery', 'charger', 'tripod', 'strap', 'case', 'bag',
                'backpack', 'kit', 'printer', 'adapter', 'album', 'surveillance', 'security']
# Build one alternation in which each word matches with its first letter in
# either case, e.g. '[Bb]attery|[Cc]harger|...'. The rest of each word is
# matched case-sensitively, and words match anywhere inside the title.
filter_pat = '|'.join(
    '[' + word[0].upper() + word[0].lower() + ']' + word[1:]
    for word in filter_words
)
# na=False makes a missing product_title count as "no match". Without it the
# mask contains NaN and negating it with ~ raises a TypeError.
is_accessory = reviews['product_title'].str.contains(pat=filter_pat, regex=True, na=False)
reviews = reviews[~is_accessory]

# keep only reviews whose body, as text, has at least min_characters characters
long_enough = reviews['review_body'].map(lambda body: len(str(body)) >= min_characters)
reviews = reviews[long_enough]

# keep only products that still have at least min_reviews reviews
reviews_by_product = reviews.groupby('product_id')
reviews = reviews_by_product.filter(lambda group: len(group.index) >= min_reviews)

# restrict the data to the first n distinct products, in order of appearance
selected_products = reviews['product_id'].unique()[:n]
is_selected = reviews['product_id'].isin(selected_products)
reviews = reviews[is_selected]

# write the prepared reviews as a tab-separated file without the index
reviews.to_csv(output_location, sep='\t', index=False)

# report how many distinct products ended up in the output
product_count = reviews.groupby('product_id').ngroups
print("Successfully prepared reviews for", product_count, "products", sep=" ")