Commit 4ac66e6b authored by Joel Oksanen

Improved sentiment annotation

parent f0285f85
@@ -12,17 +12,18 @@ import readchar
from sty import fg, bg, ef, rs
from wcwidth import wcswidth
-data_location = 'amazon_data/amazon_reviews_us_pc.tsv'
-selected_reviews_location = 'pc_reviews_to_be_annotated.xml'
+data_location = 'data/reviews/5_products_reviews.tsv'
+selected_reviews_location = 'product_reviews_to_be_annotated.xml'
min_characters = 0
max_characters = 200
n = 500
sentiment_mappings = {'+': 'positive', '0': 'neutral', '-': 'negative', 'c': 'conflict'}
ann_bgs = {'positive': bg.green, 'neutral': bg.blue, 'negative': bg.red, 'conflict': bg.yellow}
ann_fgs = {'positive': fg.green, 'neutral': fg.blue, 'negative': fg.red, 'conflict': fg.yellow}
annotated_reviews_location = 'annotated_camera_reviews.xml'
included_labels = ['NN', 'NNS', 'NP', 'NNP', 'NNPS', 'DT', 'CD', 'FW', 'PRP$']
nouns = ['NN', 'NNS', 'NP', 'NNP', 'NNPS']
-prepared_reviews_location = 'annotated_amazon_laptop_reviews.xml'
+prepared_reviews_location = 'annotated_5_products_reviews_2.xml'
tokenizer = TweetTokenizer()
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
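For context, a minimal sketch of how the keypress-to-sentiment tables above can be used together with readchar and sty. This is not the script's actual annotate_reviews loop; the prompt handling and the 'q'-to-skip behaviour are illustrative assumptions.

```python
import readchar
from sty import fg, rs

sentiment_mappings = {'+': 'positive', '0': 'neutral', '-': 'negative', 'c': 'conflict'}
ann_fgs = {'positive': fg.green, 'neutral': fg.blue, 'negative': fg.red, 'conflict': fg.yellow}

def read_sentiment():
    # Block until one of the sentiment keys (or 'q' to skip) is pressed.
    while True:
        key = readchar.readkey()
        if key == 'q':
            return None  # hypothetical skip behaviour, not taken from the script
        if key in sentiment_mappings:
            sentiment = sentiment_mappings[key]
            print(ann_fgs[sentiment] + sentiment + rs.all)  # echo the label in its annotation colour
            return sentiment
```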
@@ -77,42 +78,16 @@ def is_opinion_target(tree):
def prepare_reviews():
    reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False)
    # drop reviews with empty review body
    reviews = reviews[~reviews['review_body'].isnull()]
    # laptop reviews
    reviews = reviews[reviews['product_title'].str.contains('laptop', case=False, na=False)]
    # try to filter out reviews for accessories
    filter_words = ['accessor', 'batter', 'charger', 'tripod', 'strap', 'case', 'bag', 'filter',
                    'backpack', 'kit', 'printer', 'adapter', 'album', 'surveillance', 'security']
    filter_pat = ''
    for word in filter_words:
        word_filter = '[' + word[0].upper() + word[0].lower() + ']' + word[1:]
        filter_pat += word_filter + '|'
    filter_pat = filter_pat[:-1]
    reviews = reviews[~reviews['product_title'].str.contains(pat=filter_pat, regex=True, case=False, na=False)]
    # shuffle reviews
    reviews = reviews.sample(frac=1).reset_index(drop=True)
    # pick first n reviews
    reviews = reviews.head(n)
    nlp = StanfordCoreNLP(r'/Users/joeloksanen/stanford-corenlp-full-2018-10-05')
    root = Element('reviews')
    for _, review in reviews.iterrows():
        review_node = SubElement(root, 'review')
        review_node.set('annotated', 'false')
        id_node = SubElement(review_node, 'review_id')
-       id_node.text = review['review_id']
        title_node = SubElement(review_node, 'product_title')
        title_node.text = review['product_title']
+       id_node.text = review['reviewerID']
        text_node = SubElement(review_node, 'review_body')
        # reformat text
-       text = review['review_body']
+       text = review['reviewText']
        text = text.replace('<br />', '\n')
        text = re.sub('[.][.]+', '...', text)
        text = text.replace('&#34;', '"')
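An aside on the accessory filter shown above: because str.contains is already called with case=False, the per-word '[Aa]ccessor'-style character classes are redundant, and the pattern can be built with a single join. A hedged sketch, not a drop-in change to the script (the words contain no regex metacharacters, so no escaping is needed):

```python
import pandas as pd

filter_words = ['accessor', 'batter', 'charger', 'tripod', 'strap', 'case', 'bag', 'filter',
                'backpack', 'kit', 'printer', 'adapter', 'album', 'surveillance', 'security']
filter_pat = '|'.join(filter_words)  # 'accessor|batter|charger|...'

def drop_accessories(reviews: pd.DataFrame) -> pd.DataFrame:
    # True where the product title mentions one of the accessory words (case-insensitive)
    mask = reviews['product_title'].str.contains(filter_pat, regex=True, case=False, na=False)
    return reviews[~mask]
```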
@@ -122,21 +97,11 @@ def prepare_reviews():
        sentences_node = SubElement(review_node, 'sentences')
        sentences = sent_tokenizer.tokenize(text)
        phrase_indices = []
        for sentence in sentences:
            sentence_node = SubElement(sentences_node, 'sentence')
            sentence_text_node = SubElement(sentence_node, 'text')
            sentence_text_node.text = sentence
            parse_tree_str = nlp.parse(sentence)
            parse_tree = Tree.fromstring(parse_tree_str)
            parse_tree_node = SubElement(sentence_node, 'parse_tree')
            parse_tree_node.text = parse_tree_str
            tokenized_text_node = SubElement(sentence_node, 'tokenized_text')
            tokenized_text_node.text = ' '.join(parse_tree.leaves()).replace('``', '""')
    # save tree to file
    xmlstr = minidom.parseString(tostring(root)).toprettyxml(indent=' ')
    xmlstr = os.linesep.join([s for s in xmlstr.splitlines() if s.strip()])
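A standalone sketch of the per-sentence parsing step above, assuming the stanfordcorenlp Python wrapper and a local CoreNLP install at the path used in the script; the example text is made up.

```python
import nltk
from nltk.tree import Tree
from stanfordcorenlp import StanfordCoreNLP

nlp = StanfordCoreNLP(r'/Users/joeloksanen/stanford-corenlp-full-2018-10-05')
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

text = 'The battery lasts all day. The screen could be brighter.'
for sentence in sent_tokenizer.tokenize(text):
    parse_tree_str = nlp.parse(sentence)          # bracketed constituency parse as a string
    parse_tree = Tree.fromstring(parse_tree_str)  # NLTK tree built from that string
    print(' '.join(parse_tree.leaves()).replace('``', '""'))  # tokenized_text as stored in the XML

nlp.close()  # shut down the wrapper's CoreNLP server
```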
@@ -177,8 +142,10 @@ def annotate_reviews():
        print(bcolors.OKBLUE + 'quit: \'q\'' + bcolors.ENDC)
        print('')
        product_title = review.find('product_title').text
        print(bcolors.OKGREEN + product_title + bcolors.ENDC)
        sent_str = ''
        for c, sent in sentiment_mappings.items():
            sent_str += '{}{}: {}{}, '.format(ann_fgs[sent], sent, c, bcolors.ENDC)
        print(sent_str[:-2])
        print('')
        text_row = ''
@@ -310,7 +277,6 @@ def prepare_annotated_reviews():
    for review in annotated:
        for sentence in review.find('sentences'):
            text = sentence.find('text').text
            tree_str = sentence.find('parse_tree').text
            sentence_node = SubElement(prepared_root, 'sentence')
            text_node = SubElement(sentence_node, 'text')
            text_node.text = text
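For downstream use, the prepared file written by prepare_annotated_reviews can be read back with ElementTree. A minimal sketch, assuming only the sentence/text layout built above (no other child elements are relied on); the file name is the prepared_reviews_location value defined at the top of the script.

```python
from xml.etree.ElementTree import parse

root = parse('annotated_5_products_reviews_2.xml').getroot()
texts = [sentence.find('text').text for sentence in root.findall('sentence')]
print(len(texts), 'prepared sentences')
```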
@@ -334,4 +300,4 @@ def prepare_annotated_reviews():
# prepare_reviews()
# annotate_reviews()
-prepare_annotated_reviews()
+prepare_annotated_reviews()
\ No newline at end of file