Commit b74782c2 authored by Joel Oksanen
Browse files

Annotated laptop reviews

parent 59076bb3
This diff is collapsed.
This diff is collapsed.
...@@ -12,8 +12,8 @@ import readchar ...@@ -12,8 +12,8 @@ import readchar
from sty import fg, bg, ef, rs from sty import fg, bg, ef, rs
from wcwidth import wcswidth from wcwidth import wcswidth
data_location = 'amazon_reviews_us_Camera_v1_00.tsv' data_location = 'amazon_data/amazon_reviews_us_PC_v1_00.tsv'
selected_reviews_location = 'reviews_to_be_annotated.xml' selected_reviews_location = 'pc_reviews_to_be_annotated.xml'
min_characters = 0 min_characters = 0
max_characters = 200 max_characters = 200
n = 500 n = 500
...@@ -22,7 +22,7 @@ ann_bgs = {'positive': bg.green, 'neutral': bg.blue, 'negative': bg.red, 'confli ...@@ -22,7 +22,7 @@ ann_bgs = {'positive': bg.green, 'neutral': bg.blue, 'negative': bg.red, 'confli
annotated_reviews_location = 'annotated_camera_reviews.xml' annotated_reviews_location = 'annotated_camera_reviews.xml'
included_labels = ['NN', 'NNS', 'NP', 'NNP', 'NNPS', 'DT', 'CD', 'FW', 'PRP$'] included_labels = ['NN', 'NNS', 'NP', 'NNP', 'NNPS', 'DT', 'CD', 'FW', 'PRP$']
nouns = ['NN', 'NNS', 'NP', 'NNP', 'NNPS'] nouns = ['NN', 'NNS', 'NP', 'NNP', 'NNPS']
prepared_reviews_location = 'prepared_amazon_camera_reviews.xml' prepared_reviews_location = 'annotated_amazon_laptop_reviews.xml'
tokenizer = TweetTokenizer() tokenizer = TweetTokenizer()
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
...@@ -80,7 +80,10 @@ def prepare_reviews(): ...@@ -80,7 +80,10 @@ def prepare_reviews():
# drop reviews with empty review body # drop reviews with empty review body
reviews = reviews[~reviews['review_body'].isnull()] reviews = reviews[~reviews['review_body'].isnull()]
# try to filter out reviews for camera accessories # laptop reviews
reviews = reviews[reviews['product_title'].str.contains('laptop', case=False, na=False)]
# try to filter out reviews for accessories
filter_words = ['accessor', 'batter', 'charger', 'tripod', 'strap', 'case', 'bag', 'filter', filter_words = ['accessor', 'batter', 'charger', 'tripod', 'strap', 'case', 'bag', 'filter',
'backpack', 'kit', 'printer', 'adapter', 'album', 'surveillance', 'security'] 'backpack', 'kit', 'printer', 'adapter', 'album', 'surveillance', 'security']
filter_pat = '' filter_pat = ''
...@@ -88,7 +91,7 @@ def prepare_reviews(): ...@@ -88,7 +91,7 @@ def prepare_reviews():
word_filter = '[' + word[0].upper() + word[0].lower() + ']' + word[1:] word_filter = '[' + word[0].upper() + word[0].lower() + ']' + word[1:]
filter_pat += word_filter + '|' filter_pat += word_filter + '|'
filter_pat = filter_pat[:-1] filter_pat = filter_pat[:-1]
reviews = reviews[~reviews['product_title'].str.contains(pat=filter_pat, regex=True)] reviews = reviews[~reviews['product_title'].str.contains(pat=filter_pat, regex=True, case=False, na=False)]
# shuffle reviews # shuffle reviews
reviews = reviews.sample(frac=1).reset_index(drop=True) reviews = reviews.sample(frac=1).reset_index(drop=True)
...@@ -325,7 +328,7 @@ def prepare_annotated_reviews(): ...@@ -325,7 +328,7 @@ def prepare_annotated_reviews():
xmlstr = minidom.parseString(tostring(prepared_root)).toprettyxml(indent=' ') xmlstr = minidom.parseString(tostring(prepared_root)).toprettyxml(indent=' ')
xmlstr = os.linesep.join([s for s in xmlstr.splitlines() if s.strip()]) xmlstr = os.linesep.join([s for s in xmlstr.splitlines() if s.strip()])
with open('amazon_camera_test.xml', 'w') as f: with open(prepared_reviews_location, 'w') as f:
f.write(xmlstr) f.write(xmlstr)
......
Supports Markdown
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment