Commit ef388c59 authored by Joel Oksanen
Browse files

A lot of changes to the target extractor. Performs decently on cameras and laptops now.

parent 8c3b320d
...@@ -4,3 +4,4 @@ __pycache__/ ...@@ -4,3 +4,4 @@ __pycache__/
server/agent/amazon_data/ server/agent/amazon_data/
server/agent/target_extraction/data/ server/agent/target_extraction/data/
.DS_Store .DS_Store
*.pickle
\ No newline at end of file
...@@ -22,25 +22,44 @@ def get_df(path): ...@@ -22,25 +22,44 @@ def get_df(path):
return pd.DataFrame.from_dict(df, orient='index') return pd.DataFrame.from_dict(df, orient='index')
child_product = 'speaker' pd.set_option('display.max_colwidth', None)
reviews = pd.read_csv('amazon_data/amazon_reviews_us_Electronics_v1_00.tsv.gz', sep='\t', error_bad_lines=False,
compression='gzip')
parent_output = 'target_extraction/data/electronics_reviews.tsv'
child_output = 'target_extraction/data/' + child_product + '_reviews.tsv'
for col in reviews.columns: category = 'Laptops'
metadata = pd.read_json('amazon_data/meta_Electronics.json', lines=True)# get_df('amazon_data/meta_Electronics.json.gz')
for col in metadata.columns:
print(col) print(col)
c_reviews = reviews[reviews['product_title'].str.contains(child_product, case=False, na=False)] metadata = metadata[metadata['category'].apply(lambda cats: category in cats)]
p_reviews = reviews[~reviews['product_title'].str.contains(child_product, case=False, na=False)]
c_reviews = c_reviews.sample(frac=1).reset_index(drop=True)
c_reviews = c_reviews.head(MAX_ITEMS)
p_reviews = p_reviews.sample(frac=1).reset_index(drop=True)
p_reviews = p_reviews.head(MAX_ITEMS)
p_reviews.to_csv(parent_output, sep='\t', index=False) print(metadata['category'][:5])
c_reviews.to_csv(child_output, sep='\t', index=False) print(len(metadata.index))
print('Successfully prepared data for', len(p_reviews.index), 'parent and', len(c_reviews.index), 'child reviews')
review_iter = pd.read_json('amazon_data/Electronics.json', lines=True, chunksize=1000)
reviews = pd.concat([reviews[reviews['asin'].isin(metadata['asin'])] for reviews in review_iter])
print(len(reviews.index))
reviews.to_csv('target_extraction/data/verified_laptop_reviews.tsv', sep='\t', index=False)
# child_product = 'speaker'
# reviews = pd.read_csv('amazon_data/amazon_reviews_us_Electronics_v1_00.tsv.gz', sep='\t', error_bad_lines=False,
# compression='gzip')
# parent_output = 'target_extraction/data/electronics_reviews.tsv'
# child_output = 'target_extraction/data/' + child_product + '_reviews.tsv'
#
# for col in reviews.columns:
# print(col)
#
# c_reviews = reviews[reviews['product_title'].str.contains(child_product, case=False, na=False)]
# p_reviews = reviews[~reviews['product_title'].str.contains(child_product, case=False, na=False)]
# c_reviews = c_reviews.sample(frac=1).reset_index(drop=True)
# c_reviews = c_reviews.head(MAX_ITEMS)
# p_reviews = p_reviews.sample(frac=1).reset_index(drop=True)
# p_reviews = p_reviews.head(MAX_ITEMS)
#
# p_reviews.to_csv(parent_output, sep='\t', index=False)
# c_reviews.to_csv(child_output, sep='\t', index=False)
# print('Successfully prepared data for', len(p_reviews.index), 'parent and', len(c_reviews.index), 'child reviews')
# # get metadata for sunglasses # # get metadata for sunglasses
# metadata = metadata[metadata['title'].str.contains(pat='[G]uitar', na=False, regex=True)] # metadata = metadata[metadata['title'].str.contains(pat='[G]uitar', na=False, regex=True)]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment