Commit b74782c2 authored by Joel Oksanen
Browse files

Annotated laptop reviews

parent 59076bb3
This diff is collapsed.
This diff is collapsed.
......@@ -12,8 +12,8 @@ import readchar
from sty import fg, bg, ef, rs
from wcwidth import wcswidth
data_location = 'amazon_reviews_us_Camera_v1_00.tsv'
selected_reviews_location = 'reviews_to_be_annotated.xml'
data_location = 'amazon_data/amazon_reviews_us_PC_v1_00.tsv'
selected_reviews_location = 'pc_reviews_to_be_annotated.xml'
min_characters = 0
max_characters = 200
n = 500
......@@ -22,7 +22,7 @@ ann_bgs = {'positive': bg.green, 'neutral': bg.blue, 'negative': bg.red, 'confli
annotated_reviews_location = 'annotated_camera_reviews.xml'
included_labels = ['NN', 'NNS', 'NP', 'NNP', 'NNPS', 'DT', 'CD', 'FW', 'PRP$']
nouns = ['NN', 'NNS', 'NP', 'NNP', 'NNPS']
prepared_reviews_location = 'prepared_amazon_camera_reviews.xml'
prepared_reviews_location = 'annotated_amazon_laptop_reviews.xml'
tokenizer = TweetTokenizer()
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
......@@ -80,7 +80,10 @@ def prepare_reviews():
# drop reviews with empty review body
reviews = reviews[~reviews['review_body'].isnull()]
# try to filter out reviews for camera accessories
# laptop reviews
reviews = reviews[reviews['product_title'].str.contains('laptop', case=False, na=False)]
# try to filter out reviews for accessories
filter_words = ['accessor', 'batter', 'charger', 'tripod', 'strap', 'case', 'bag', 'filter',
'backpack', 'kit', 'printer', 'adapter', 'album', 'surveillance', 'security']
filter_pat = ''
......@@ -88,7 +91,7 @@ def prepare_reviews():
word_filter = '[' + word[0].upper() + word[0].lower() + ']' + word[1:]
filter_pat += word_filter + '|'
filter_pat = filter_pat[:-1]
reviews = reviews[~reviews['product_title'].str.contains(pat=filter_pat, regex=True)]
reviews = reviews[~reviews['product_title'].str.contains(pat=filter_pat, regex=True, case=False, na=False)]
# shuffle reviews
reviews = reviews.sample(frac=1).reset_index(drop=True)
......@@ -325,7 +328,7 @@ def prepare_annotated_reviews():
xmlstr = minidom.parseString(tostring(prepared_root)).toprettyxml(indent=' ')
xmlstr = os.linesep.join([s for s in xmlstr.splitlines() if s.strip()])
with open('amazon_camera_test.xml', 'w') as f:
with open(prepared_reviews_location, 'w') as f:
f.write(xmlstr)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment