Commit 6c348770 authored by Joel Oksanen

Lots of changes: started working on ConceptNet and CoreNLP in order to prepare SemEval data for training the classifier.
parent 9e151325
-----BEGIN RSA PRIVATE KEY-----
MIIEowIBAAKCAQEAr1eu+bmGaX3IJ+duCqPw+WIlXrvGOJuAEH/6j/HZIYTUliuEKG71OGocqvXO
/4x4BuQQx5+BA+6kWIfHcW7LzdEsUZlHDgRxO+WuRK+A6F5H5EnvlAvBSNpg4+ernabuEcnXa5f4
0+wem3t+RSOYCympAxpbuWZP33UkH5ucl8HAesztOGxUVMIj8VynET4ADpcdDwt+B2rD7OFO9QJP
hS5gcEw7lIYiOctPc2hJ3DvKFcdk7ZyER/l5g06VfnboV4OBzCLvd526eUZxrq8YLQvbTM3eu9qE
gJV1Bg+ZOLfbCZV+sKpVZL6ZyJxuVoSOjQIrXMSj0Wp9i1l1qZG9jwIDAQABAoIBAH3db1kLeBTZ
mYgrdK5mqGAbt7+dAtk3pmIxu+cAMGEDPKbfbeqoW5a3dQSzlneSmcY4iGMDeFUeRRLXxK8EXX/M
mweoA31oavAYG1RqtxoWM30IJdYb6g8l009FycdNQK+8N8qgOJnHcOEjcKyotEevDAeSMC2R59v7
9oZJ6cp0tcPEyEJzUQz8K4SzE137n0ieBpPXE/SHqRek53PGzDFG01SstaDECMSbPlcXyZw63CQs
U09r1O7CG6KcqLjgBEB+RtglNld56INLuSVMn29zTjWFzmKaLUj9A6PnTvq1UIM6hUsCNa6+2E1c
by6tZXRRujPKCMyXDFIiw8/W0cECgYEA4OeBsdVNx2/+M1nT9crLQWfbGz8YV/eAogngJhLeJEYJ
Ui2nBk1FbGRpivtfStMDRjRYzwzuCt5YvHFv+qiLFh3bFamqm5a9XnGtyqOrCB0JM1r0NJNeTc4A
+rxTuVNk0JG9cK4Ei/lKaz35vAi16pAI+NxkP4b0AJkB6X2w9HcCgYEAx5XvgZx6MQksuQQzpiWF
PVrZF4wX749iJV+LKp76cSlRZDhqKfOX/K5gKZXB0JXgKVXyifj6X5VA/rFlsY1H+daPuHrHIcRy
kUlf3EHm+m0G6c6FxpVcJtWGabJZ4uN+jpydCryrfGdFj/gIIIk/09ga1cnF6L/q6f3+AoaAPakC
gYEAlz4YEacH7x55K780sw31Mb7Nsbz0efOPJWjOu8vkZzBSBFl/ANXl4caE01nH82GDyipZrqNb
USPFOLRwQXgmUHEza5X1jTwJjQGVhbgaxBICpe58Wv7fZCMstXEAPJjAKyJW6vebRyYVzZiX4MNa
0qqZ3gEXyUDPxbcfeg10enECgYAik81MrSlWOXqwCxRPoox01rppo/G5pSOhye24Upac6EWbkVKD
NhqMwkESs0lCs9SEFMnSMXpe+OKrvVwa6JkhpZR8p3i8VuZqWUXsSNaVaDIca5UtlE7Ew5Vn1mhQ
MI2Kc4OZippDETkx+gyeZmjScMwlYbGR4z31j/Vjxp2NEQKBgAmp+i4yGQY+sstBLXDBPCNq1uEo
FMPSTbwYoz6ezVFRe5B+QnL/pW5hdH60KEez0qoQGTYkdgLW5bwTI+K2PzG3BPTPvW0eOvWRJBXF
I0Rj6KAnGdcrONYB0Mp++X4so1tJ5y2iZAPeArZcLcoXGNozSfsnuC3GKxXAJaFPC+u+
-----END RSA PRIVATE KEY-----
\ No newline at end of file
......@@ -2,7 +2,7 @@ import pandas as pd
class DataLoader:
-   data_location = 'amazon_reviews_us_Camera_v1_00.tsv'
+   data_location = 'camera_prepared_data.tsv'
    reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False)

    def get_reviews(self, product_id):
......
......@@ -144,6 +144,7 @@ class Agent:
    def analyze_reviews(self, reviews):
        # get ra
        self.ra = []
+       self.vote_sum = {argument : 0 for argument in arguments}
        self.vote_phrases = {argument : [] for argument in arguments}
        voting_reviews = 0
        review_count = 0
......@@ -157,6 +158,7 @@ class Agent:
        # add final vote tuples to ra with simplified polarity in {+ (true), - (false)}
        for argument in votes:
            self.ra.append({'review_id': review_id, 'argument': argument, 'vote': votes[argument]})
+           self.vote_sum[argument] += votes[argument]
        for argument in vote_phrases:
            self.vote_phrases[argument].append(vote_phrases[argument])
        # only consider items that obtained votes from at least 33% of reviewers
......@@ -171,6 +173,10 @@ class Agent:
        print(self.qbaf)
        print('strengths:')
        print(self.strengths)
+       print('votes:')
+       for argument in arguments:
+           print(argument, 'direct: {} positive, {} negative'.format(len(self.supporting_phrases(argument)), len(self.attacking_phrases(argument))))
+           print(argument, 'augmented sum: {}'.format(self.vote_sum[argument]))

    def get_strongest_supporting_subfeature(self, argument):
        supporters = self.qbaf['supporters'][argument]
......@@ -187,7 +193,7 @@ class Agent:
        return max(attacker_strengths, key=attacker_strengths.get)

    def liked_argument(self, argument):
-       return len(self.supporting_phrases(argument)) >= len(self.attacking_phrases(argument))
+       return self.vote_sum[argument] >= 0 # len(self.supporting_phrases(argument)) >= len(self.attacking_phrases(argument))

    def supported_argument(self, argument):
        return (self.get_strongest_supporting_subfeature(argument) != None and
......
......@@ -10,7 +10,7 @@ price = Node('price', parent=camera)
shipping = Node('shipping', parent=camera)
lens = Node('lens', parent=camera)
zoom = Node('zoom', parent=lens)
-af = Node('af', parent=lens)
+af = Node('autofocus', parent=lens)
arguments = [camera, images, video, battery, flash, audio, price, shipping, lens, zoom, af]
features = [images, video, battery, flash, audio, price, shipping, lens, zoom, af]
......
import xml.etree.ElementTree as ET
from stanfordcorenlp import StanfordCoreNLP

tree = ET.parse('ABSA16_Laptops_Train_SB1_v2.xml')
reviews = tree.getroot()

nlp = StanfordCoreNLP(r'/Users/joeloksanen/stanford-corenlp-full-2018-10-05')

for review in reviews:
    sentences = review[0]
    assert sentences.tag == 'sentences'
    for sentence in sentences:
        opinions = sentence.find('Opinions')
        if opinions is None:
            continue
        text = sentence.find('text').text
        parse_tree_str = nlp.parse(text)
        parse_tree = ET.Element('tree')
        parse_tree.text = parse_tree_str
        sentence.append(parse_tree)

tree.write('ABSA16_Laptops_Train_SB1_v2_with_parse_trees.xml')
import xml.etree.ElementTree as ET
from nltk.tokenize import word_tokenize
import string
from nltk.tree import Tree

included_labels = ['NN', 'NNS', 'NNP', 'NNPS', 'DT', 'CD']

# Returns the words of a noun phrase subtree, replacing any subtree whose label is not
# in included_labels (i.e. a descriptive modifier) with the marker 'DESC'
def get_np_words(np):
    w = []
    for np_sub in np:
        if type(np_sub) is Tree:
            if np_sub.label() not in included_labels:
                w += ['DESC']
            else:
                w += get_np_words(np_sub)
        else:
            return [np_sub]
    return w

def filtered_np(np):
    w = get_np_words(np)
    # drop everything up to and including the last 'DESC' marker
    i = (len(w) - w[::-1].index('DESC')) if 'DESC' in w else 0
    w = w[i:]
    return ' '.join(w).replace('-LRB-', '(').replace('-RRB-', ')')
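# Illustrative example (hypothetical NP, not from the dataset): for an NP parsed as
# (NP (DT the) (JJ long) (NN battery) (NN life)), get_np_words returns
# ['the', 'DESC', 'battery', 'life'], so filtered_np drops everything up to and
# including the last 'DESC' and yields 'battery life'.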
def extract_extended_nouns(tree_str):
    phrases = []
    trees = Tree.fromstring(tree_str)
    for tree in trees:
        for subtree in tree.subtrees():
            if subtree.label() == 'NP':
                # np = ' '.join(np.leaves())
                phrases.append(filtered_np(subtree))
    return phrases
tree = ET.parse('ABSA16_Laptops_Train_SB1_v2_with_parse_trees.xml')
reviews = tree.getroot()

glossary = {
    'laptop': ['computer', 'price', 'cost', 'macbook'],
    'display': ['monitor', 'screen'],
    'cpu': ['processor'],
    'hard disc': ['storage'],
    'memory': ['ram'],
    'power supply': ['charger', 'power supply cord', 'power adapter'],
    'keyboard': ['keys', 'numpad'],
    'mouse': ['mouse pad'],
    'fans cooling': ['fan', 'cooling', 'heat sink'],
    'optical drives': ['cd players', 'dvd drive', 'disc drive', 'dvd burner'],
    'ports': ['usb port', 'hdmi port', 'vga port', 'card reader', 'firewire port', 'sd card slot', 'dvi port', 'thunderbolt port'],
    'graphics': ['graphics card', 'video card', 'graphics chip', 'gpu'],
    'multimedia devices': ['sound', 'audio', 'microphone', 'camera', 'webcam', 'speakers', 'headphone'],
    'os': ['os x', 'windows', 'linux', 'start menu', 'safe mode', 'boot manager', 'drag and drop feature'],
    'software': ['office', 'iwork', 'word processor', 'microsoft word', 'powerpoint', 'browser', 'skype', 'iphoto', 'ilife', 'pages', 'keynote', 'antivirus program', 'firewall', 'games', 'facial recognition']
}

n = len(reviews)
i = 0
prepped_opinions = 0
total_opinions = 0

def parse_opinion(opinion):
    category = opinion.attrib['category']
    feature = category[:category.index("#")].lower()
    polarity = opinion.attrib['polarity']
    return (feature.replace('_', ' '), polarity)

def get_glossary(feature):
    return [feature] + glossary[feature] if feature in glossary.keys() else [feature]
# def replace_feature(feature, text):
#     tokens = word_tokenize(text)
#     for i in range(len(tokens)):
#         token = tokens[i]
#         if token.lower() in get_glossary(feature):
#             text = text.replace(token, '$T$')
#     return text if '$T$' in text else None

def replace_feature_nps(feature, text, nps):
    np_tokens = [word_tokenize(np) for np in nps]
    detected_nps = set()
    for np_i in range(len(nps)):
        tokens = np_tokens[np_i]
        for i in range(len(tokens)):
            token = tokens[i]
            if token.lower() in get_glossary(feature):
                detected_nps.add(nps[np_i])
    if len(detected_nps) == 0:
        return None
    print(nps)
    print(detected_nps)
    # keep only the detected NPs that do not contain another detected NP as a substring
    unique_nps = list(filter(lambda np: not any(other_np in np for other_np in detected_nps.difference({np})), detected_nps))
    print(unique_nps)
    for unique_np in unique_nps:
        text = text.replace(unique_np, '$T$')
    return text
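# Illustrative example (hypothetical sentence, not from the dataset): for feature 'display',
# text 'The screen on this laptop is great' and nps ['The screen', 'this laptop'],
# 'screen' is in the glossary for 'display', so 'The screen' is detected and the
# function returns '$T$ on this laptop is great'.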
for review in reviews:
    sentences = review[0]
    assert sentences.tag == 'sentences'
    for sentence in sentences:
        opinions = sentence.find('Opinions')
        if opinions is None:
            continue
        opinions = set([parse_opinion(opinion) for opinion in opinions])
        text = sentence.find('text').text
        parse_tree = sentence.find('tree').text
        nps = extract_extended_nouns(parse_tree)
        for opinion in opinions:
            total_opinions += 1
            replaced_text = replace_feature_nps(opinion[0], text, nps)
            if replaced_text:
                prepped_opinions += 1
                print('---')
                print(text)
                print(replaced_text)
                print(opinion)
                print('---')
            else:
                pass
                # print('---')
                # print(text)
                # print(nps)
                # print(opinion)
    i += 1
    print('{}/{}'.format(i, n))

print('{}/{} opinions prepared'.format(prepped_opinions, total_opinions))
=====Description=====
As people tend to post comments about celebrities, products, and companies, we use such
keywords (e.g. ``\textit{bill gates}'', ``\textit{taylor swift}'', ``\textit{xbox}'',
``\textit{windows 7}'', ``\textit{google}'') to query the Twitter API. After obtaining the
tweets, we manually annotate the sentiment label (negative, neutral, or positive) towards
each target. To reduce the effects of class imbalance, we randomly sample the tweets so
that the data is balanced: the negative, neutral, and positive classes account for 25\%,
50\%, and 25\% of the instances, respectively. The training data consists of 6,248 tweets,
and the test data of 692 tweets.
=====Format=====
Each instance consists of three lines:
- sentence (the target is replaced with $T$)
- target
- polarity label (0: neutral, 1:positive, -1:negative)
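An illustrative instance (constructed here for clarity, not taken from the data) would
therefore look like:
i love my new $T$ !
windows 7
1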
=====Citation=====
@inproceedings{dong2014adaptive,
title={Adaptive Recursive Neural Network for Target-dependent Twitter Sentiment Classification},
author={Dong, Li and Wei, Furu and Tan, Chuanqi and Tang, Duyu and Zhou, Ming and Xu, Ke},
booktitle={The 52nd Annual Meeting of the Association for Computational Linguistics (ACL)},
year={2014},
organization={ACL}
}
=====Contact=====
donglixp@gmail.com
......@@ -3,6 +3,7 @@ from dataloader import DataLoader
from argument import *
from agent import Agent
import inflect
+from nltk.stem.snowball import SnowballStemmer

class ADAMessage:
......@@ -23,6 +24,7 @@ class Communicator:
    agent = Agent()
    inflect = inflect.engine()
+   stemmer = SnowballStemmer("english")

    def __init__(self, dl):
        self.dl = dl
......@@ -104,4 +106,4 @@ class Communicator:
        return queries

    def was_were(self, arg):
-       return 'were' if self.inflect.singular_noun(arg.name) else 'was'
+       return 'was' if self.stemmer.stem(arg.name) == arg.name else 'were'
import requests
import threading
from anytree import Node

# rel = '/r/DefinedAs'
# rel2 = '/r/MadeOf'
# uri = '/query?start=/c/en/{feature}&rel={rel}&rel={rel2}'.format(feature=feature, rel=rel, rel2=rel2) # '/related/c/en/cpu?filter=/c/en'
#
# uri = '/c/en/{feature}'.format(feature=feature)
#
# print([(obj['edges'][i]['rel']['label'],obj['edges'][i]['end']['label']) for i in range(len(obj['edges']))])

class ConceptNet:
    url = 'http://api.conceptnet.io'
    limit = 5

    def find_rels(self, feature, rel):
        uri = '/query?node=/c/en/{feature}&other=/c/en&rel=/r/{rel}&limit={limit}'.format(feature=feature, rel=rel, limit=self.limit)
        obj = requests.get(self.url + uri).json()
        unique = set([obj['edges'][i]['end']['label'] for i in range(len(obj['edges']))])
        return unique

    def get_relatedness(self, f1, f2):
        uri = '/relatedness?node1=/c/en/{f1}&node2=/c/en/{f2}'.format(f1=f1.replace(' ','_'), f2=f2.replace(' ','_'))
        obj = requests.get(self.url + uri).json()
        return obj['value']

    def append_synonyms(self, feature, rel, synonyms, lock):
        rels = self.find_rels(feature, rel)
        lock.acquire()
        synonyms.update(rels)
        lock.release()
    # Filters the candidate synonyms against the node's ancestors: a candidate is kept only
    # if its relatedness to both the node and the ancestor is at least min_r, and its
    # relatedness to the node is at least `ratio` times its relatedness to the ancestor;
    # the check is repeated recursively for every ancestor up the tree
    def parent_check(self, node, parent, synonyms):
        if parent == None:
            return
        min_r = 0.1
        ratio = 1.2  # relatedness for the child has to be at least 1.2 times as high as for the parent
        rm = set()
        for s in synonyms:
            r_child = self.get_relatedness(node.name, s)
            r_parent = self.get_relatedness(parent.name, s)
            if (r_child < min_r) or (r_parent < min_r) or (r_parent != 0 and r_child / r_parent < ratio):
                rm.add(s)
        synonyms.difference_update(rm)
        self.parent_check(node, parent.parent, synonyms)
    def synonyms_for_node(self, node):
        rels = ['DefinedAs', 'Synonym', 'IsA', 'RelatedTo']
        synonyms = set()
        lock = threading.Lock()
        threads = []
        for rel in rels:
            t = threading.Thread(target=self.append_synonyms, args=(node.name, rel, synonyms, lock))
            t.start()
            threads.append(t)
        for t in threads:
            t.join()
        self.parent_check(node, node.parent, synonyms)
        return synonyms

net = ConceptNet()
computer = Node('display')
cpu = Node('quality', parent=computer)
syns = net.synonyms_for_node(cpu)
print(syns)

# print(requests.get('http://54.243.2.221' + '/c/en/computer').json())
# print(requests.get('http://54.243.2.221' + '/c/en/cpu').json())
annotators = tokenize,ssplit,pos,lemma,ner,parse,depparse,coref
outputFormat = serialized
serializer = edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer
......@@ -20,24 +20,24 @@ reviews = reviews.drop_duplicates(keep=False)
# drop reviews with empty review body
reviews = reviews[~reviews['review_body'].isnull()]

-# try to filter out reviews for camera accessories
-filter_words = ['accessor', 'battery', 'charger', 'tripod', 'strap', 'case', 'bag',
-                'backpack', 'kit', 'printer', 'adapter', 'album', 'surveillance', 'security']
-filter = ''
-for word in filter_words:
-    word_filter = '[' + word[0].upper() + word[0].lower() + ']' + word[1:]
-    filter += word_filter + '|'
-filter = filter[:-1]
-reviews = reviews[~reviews['product_title'].str.contains(pat = filter, regex = True)]
-# drop reviews with less than min_characters characters
-reviews = reviews[reviews['review_body'].apply(lambda x: len(str(x)) >= min_characters)]
-# drop reviews for products with less than min_reviews reviews
-reviews = reviews.groupby('product_id').filter(lambda x: len(x.index) >= min_reviews)
-# choose reviews for n first items
-reviews = reviews[reviews['product_id'].isin(reviews['product_id'].unique()[:n])]
+# # try to filter out reviews for camera accessories
+# filter_words = ['accessor', 'battery', 'charger', 'tripod', 'strap', 'case', 'bag',
+#                 'backpack', 'kit', 'printer', 'adapter', 'album', 'surveillance', 'security']
+# filter = ''
+# for word in filter_words:
+#     word_filter = '[' + word[0].upper() + word[0].lower() + ']' + word[1:]
+#     filter += word_filter + '|'
+# filter = filter[:-1]
+# reviews = reviews[~reviews['product_title'].str.contains(pat = filter, regex = True)]
+#
+# # drop reviews with less than min_characters characters
+# reviews = reviews[reviews['review_body'].apply(lambda x: len(str(x)) >= min_characters)]
+#
+# # drop reviews for products with less than min_reviews reviews
+# reviews = reviews.groupby('product_id').filter(lambda x: len(x.index) >= min_reviews)
+#
+# # choose reviews for n first items
+# reviews = reviews[reviews['product_id'].isin(reviews['product_id'].unique()[:n])]

reviews.to_csv(output_location, sep='\t', index=False)
......
......@@ -25,7 +25,7 @@ SECRET_KEY = 'z)tj_b=**v@b5-l6s!$*+_0=nzmor8dc#y$-%4%45kt8e8q@-f'
# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True
-ALLOWED_HOSTS = ['192.168.0.13']
+ALLOWED_HOSTS = ['192.168.0.13', '146.169.222.109', '146.169.218.37']
# Application definition
......
from stanfordnlp.server import CoreNLPClient
from nltk.tree import *

pos_dict = {
    'CC': 'coordinating conjunction', 'CD': 'cardinal digit', 'DT': 'determiner',
    'EX': 'existential there',
    'FW': 'foreign word', 'IN': 'preposition/subordinating conjunction', 'JJ': 'adjective',
    'JJR': 'adjective, comparative', 'JJS': 'adjective, superlative',
    'LS': 'list marker', 'MD': 'modal', 'NN': 'noun, singular',
    'NNS': 'noun, plural', 'NNP': 'proper noun, singular',
    'NNPS': 'proper noun, plural', 'PDT': 'predeterminer',
    'POS': 'possessive ending', 'PRP': 'personal pronoun',
    'PRP$': 'possessive pronoun', 'RB': 'adverb',
    'RBR': 'adverb, comparative', 'RBS': 'adverb, superlative',
    'RP': 'particle (give up)', 'TO': 'to (go \'to\' the store)', 'UH': 'interjection (errrrrrrrm)',
    'VB': 'verb, base form (take)', 'VBD': 'verb, past tense (took)',
    'VBG': 'verb, gerund/present participle (taking)', 'VBN': 'verb, past participle (taken)',
    'VBP': 'verb, sing. present, non-3rd person (take)', 'VBZ': 'verb, 3rd person sing. present (takes)',
    'WDT': 'wh-determiner (which)', 'WP': 'wh-pronoun (who, what)', 'WP$': 'possessive wh-pronoun (whose)',
    'WRB': 'wh-adverb (where, when)', 'QF': 'quantifier, bahut, thoda, kam (Hindi)', 'VM': 'main verb',
    'PSP': 'postposition, common in indian langs', 'DEM': 'demonstrative, common in indian langs'
}

sentence = 'Also the battery life on this camera is dismal even if you are not using the GPS function or autofocus mode.'

with CoreNLPClient(annotators=['tokenize','ssplit','pos','lemma','ner','parse','depparse','coref'], timeout=60000, memory='16G') as client:
    print('annotating...')
    ann = client.annotate(sentence)
    # print('tags:')
    # print([[(w.text, pos_dict[w.pos] if w.pos in pos_dict.keys() else '-') for w in sent.words] for sent in ann.sentence])
    # print('dependencies:')
    # [sent.print_dependencies() for sent in doc.sentences]
    print('tree:')
    print(ann.sentence[0].parseTree)