Commit 4ac66e6b authored by Joel Oksanen
Browse files

Improved sentiment annotation

parent f0285f85
This diff is collapsed.
......@@ -12,17 +12,18 @@ import readchar
from sty import fg, bg, ef, rs
from wcwidth import wcswidth
data_location = 'amazon_data/amazon_reviews_us_pc.tsv'
selected_reviews_location = 'pc_reviews_to_be_annotated.xml'
data_location = 'data/reviews/5_products_reviews.tsv'
selected_reviews_location = 'product_reviews_to_be_annotated.xml'
min_characters = 0
max_characters = 200
n = 500
sentiment_mappings = {'+': 'positive', '0': 'neutral', '-': 'negative', 'c': 'conflict'}
ann_bgs = {'positive': bg.green, 'neutral': bg.blue, 'negative': bg.red, 'conflict': bg.yellow}
ann_fgs = {'positive': fg.green, 'neutral': fg.blue, 'negative': fg.red, 'conflict': fg.yellow}
annotated_reviews_location = 'annotated_camera_reviews.xml'
included_labels = ['NN', 'NNS', 'NP', 'NNP', 'NNPS', 'DT', 'CD', 'FW', 'PRP$']
nouns = ['NN', 'NNS', 'NP', 'NNP', 'NNPS']
prepared_reviews_location = 'annotated_amazon_laptop_reviews.xml'
prepared_reviews_location = 'annotated_5_products_reviews_2.xml'
tokenizer = TweetTokenizer()
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
......@@ -77,42 +78,16 @@ def is_opinion_target(tree):
def prepare_reviews():
reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False)
# drop reviews with empty review body
reviews = reviews[~reviews['review_body'].isnull()]
# laptop reviews
reviews = reviews[reviews['product_title'].str.contains('laptop', case=False, na=False)]
# try to filter out reviews for accessories
filter_words = ['accessor', 'batter', 'charger', 'tripod', 'strap', 'case', 'bag', 'filter',
'backpack', 'kit', 'printer', 'adapter', 'album', 'surveillance', 'security']
filter_pat = ''
for word in filter_words:
word_filter = '[' + word[0].upper() + word[0].lower() + ']' + word[1:]
filter_pat += word_filter + '|'
filter_pat = filter_pat[:-1]
reviews = reviews[~reviews['product_title'].str.contains(pat=filter_pat, regex=True, case=False, na=False)]
# shuffle reviews
reviews = reviews.sample(frac=1).reset_index(drop=True)
# pick first n reviews
reviews = reviews.head(n)
nlp = StanfordCoreNLP(r'/Users/joeloksanen/stanford-corenlp-full-2018-10-05')
root = Element('reviews')
for _, review in reviews.iterrows():
review_node = SubElement(root, 'review')
review_node.set('annotated', 'false')
id_node = SubElement(review_node, 'review_id')
id_node.text = review['review_id']
title_node = SubElement(review_node, 'product_title')
title_node.text = review['product_title']
id_node.text = review['reviewerID']
text_node = SubElement(review_node, 'review_body')
# reformat text
text = review['review_body']
text = review['reviewText']
text = text.replace('<br />', '\n')
text = re.sub('[.][.]+', '...', text)
text = text.replace('&#34;', '"')
......@@ -122,21 +97,11 @@ def prepare_reviews():
sentences_node = SubElement(review_node, 'sentences')
sentences = sent_tokenizer.tokenize(text)
phrase_indices = []
for sentence in sentences:
sentence_node = SubElement(sentences_node, 'sentence')
sentence_text_node = SubElement(sentence_node, 'text')
sentence_text_node.text = sentence
parse_tree_str = nlp.parse(sentence)
parse_tree = Tree.fromstring(parse_tree_str)
parse_tree_node = SubElement(sentence_node, 'parse_tree')
parse_tree_node.text = parse_tree_str
tokenized_text_node = SubElement(sentence_node, 'tokenized_text')
tokenized_text_node.text = ' '.join(parse_tree.leaves()).replace('``', '""')
# save tree to file
xmlstr = minidom.parseString(tostring(root)).toprettyxml(indent=' ')
xmlstr = os.linesep.join([s for s in xmlstr.splitlines() if s.strip()])
......@@ -177,8 +142,10 @@ def annotate_reviews():
print(bcolors.OKBLUE + 'quit: \'q\'' + bcolors.ENDC)
print('')
product_title = review.find('product_title').text
print(bcolors.OKGREEN + product_title + bcolors.ENDC)
sent_str = ''
for c, sent in sentiment_mappings.items():
sent_str += '{}{}: {}{}, '.format(ann_fgs[sent], sent, c, bcolors.ENDC)
print(sent_str[:-2])
print('')
text_row = ''
......@@ -310,7 +277,6 @@ def prepare_annotated_reviews():
for review in annotated:
for sentence in review.find('sentences'):
text = sentence.find('text').text
tree_str = sentence.find('parse_tree').text
sentence_node = SubElement(prepared_root, 'sentence')
text_node = SubElement(sentence_node, 'text')
text_node.text = text
......@@ -334,4 +300,4 @@ def prepare_annotated_reviews():
# prepare_reviews()
# annotate_reviews()
prepare_annotated_reviews()
prepare_annotated_reviews()
\ No newline at end of file
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment