Commit fcfce2f4 authored by Joel Oksanen
Browse files

Add mbp local changes

parent f5e7e588
......@@ -8,7 +8,7 @@ import time
import numpy as np
from sklearn import metrics
device = torch.device('cuda')
device = torch.device('cpu')
semeval_2014_test_path = 'data/SemEval-2014/Laptops_Test_Gold.xml'
amazon_test_path = 'data/Amazon/annotated_amazon_laptop_reviews.xml'
......@@ -47,7 +47,7 @@ class BertAnalyzer:
self.net = TDBertNet(len(polarity_indices))
# initialise GPU
self.net.cuda()
# self.net.cuda()
optimiser = optim.Adam(self.net.parameters(), lr=LEARNING_RATE)
......@@ -112,7 +112,7 @@ class BertAnalyzer:
dataset = BertDataset.from_data(data)
loader = DataLoader(dataset, batch_size=128, shuffle=False, num_workers=8, collate_fn=generate_batch)
self.net.cuda()
# self.net.cuda()
self.net.eval()
predicted = []
......
......@@ -15,9 +15,9 @@ class Framework:
HIGH_SENTIMENT_THRESHOLD = 0.99
bert_analyzer = BertAnalyzer.load_saved(os.path.dirname(__file__) + '/SA/semeval_2014_both_unmasked_6.pt')
def __init__(self, product_category, product_id, review_df=None, wordnet=False):
def __init__(self, product_category, product_id, review_df=None, method=None):
self.product_id = product_id
self.product = Product.get_product(product_category, wordnet=wordnet)
self.product = Product.get_product(product_category, method=method)
self.product_node = self.product.root
self.arguments = self.product.argument_nodes
self.features = self.product.feature_nodes
......
from agent.target_extraction.BERT.entity_extractor.bert_entity_extractor import BertEntityExtractor
from agent.target_extraction.BERT.relation_extractor.bert_rel_extractor import BertRelExtractor
# Earlier training run on the watch-category mix, kept commented for reference:
# BertRelExtractor.train_and_validate('all_relation_instances.tsv',
# 'relation_extractor_camera_backpack_cardigan_laptop_watch', valid_frac=0.05)
# BertEntityExtractor.train_and_validate('all_entity_instances.tsv',
# 'entity_extractor_camera_backpack_cardigan_laptop_watch', valid_frac=0.05)
# Train the relation extractor on the combined five-category instance file,
# holding out 5% of the instances for validation.
BertRelExtractor.train_and_validate('all_relation_instances.tsv',
                                    'relation_extractor_camera_backpack_cardigan_laptop_watch', valid_frac=0.05)
# Train the entity extractor on the matching entity instance file, same split.
BertEntityExtractor.train_and_validate('all_entity_instances.tsv',
                                       'entity_extractor_camera_backpack_cardigan_laptop_watch', valid_frac=0.05)
# Evaluation runs against the held-out watch instances, kept for reference:
# BertEntityExtractor.load_saved('entity_extractor/entity_extractor_camera_backpack_cardigan_laptop_game.pt').evaluate('watch_entity_instances.tsv')
# BertRelExtractor.load_saved('relation_extractor/relation_extractor_camera_backpack_cardigan_laptop_game.pt').evaluate('watch_relation_instances.tsv')
......
import requests
import threading
import time
class ConceptNet:
    """Thin client for the ConceptNet HTTP API (api.conceptnet.io).

    Used to gather candidate synonyms and sub-features for ontology nodes.
    All query methods perform blocking HTTP requests.
    """

    url = 'http://api.conceptnet.io'
    limit = 50  # maximum number of edges returned per query

    def find_related(self, feature, rel):
        """Return the set of unique end-node labels linked to *feature* by *rel*."""
        uri = '/query?node=/c/en/{feature}&other=/c/en&rel=/r/{rel}&limit={limit}'.format(
            feature=feature, rel=rel, limit=self.limit)
        obj = requests.get(self.url + uri).json()
        return {edge['end']['label'] for edge in obj['edges']}

    def find_relations(self, f1, f2):
        """Return the raw JSON response listing edges between concepts *f1* and *f2*."""
        uri = '/query?node=/c/en/{f1}&other=/c/en/{f2}'.format(f1=f1, f2=f2)
        return requests.get(self.url + uri).json()

    def get_relatedness(self, f1, f2):
        """Return the ConceptNet relatedness score between *f1* and *f2*.

        Spaces in concept names are mapped to underscores as required by the API.
        """
        uri = '/relatedness?node1=/c/en/{f1}&node2=/c/en/{f2}'.format(
            f1=f1.replace(' ', '_'), f2=f2.replace(' ', '_'))
        obj = requests.get(self.url + uri).json()
        time.sleep(0.5)  # rate limit: only 3600 requests allowed / hour
        return obj['value']

    def append_result(self, feature, rel, result_set, lock):
        """Fetch labels related to *feature* by *rel* and merge them into *result_set*.

        Thread worker: *lock* guards the shared *result_set* across workers.
        """
        rels = self.find_related(feature, rel)
        with lock:
            result_set.update(rels)

    def parent_check(self, node, parent, synonyms):
        """Recursively prune *synonyms* that are not clearly closer to *node* than to an ancestor."""
        if parent is None:
            return
        min_r = 0.1
        # relatedness for the child has to be at least 1.2x as high as for the parent
        ratio = 1.2
        rm = set()
        for s in synonyms:
            r_child = self.get_relatedness(node.name, s)
            r_parent = self.get_relatedness(parent.name, s)
            if (r_child < min_r) or (r_parent < min_r) or (r_parent != 0 and r_child / r_parent < ratio):
                rm.add(s)
        synonyms.difference_update(rm)
        self.parent_check(node, parent.parent, synonyms)

    def sem_synonyms_for_node(self, node):
        """Gather candidate synonyms for *node* from several ConceptNet relations.

        Queries run in parallel threads, then ancestor-based pruning is applied.
        The node's own name is always included in the result.
        """
        rels = ['DefinedAs', 'Synonym', 'IsA', 'RelatedTo']  # SimilarTo? FormOf?
        synonyms = set()
        lock = threading.Lock()
        threads = []
        for rel in rels:
            t = threading.Thread(target=self.append_result, args=(node.name, rel, synonyms, lock))
            t.start()
            threads.append(t)
        for t in threads:
            t.join()
        self.parent_check(node, node.parent, synonyms)
        synonyms.add(node.name)
        return synonyms

    def sub_features_for_argument(self, argument):
        """Gather candidate sub-features of *argument* from part/function relations."""
        rels = ['UsedFor', 'HasA', 'CapableOf', 'Causes', 'HasSubevent', 'HasProperty', 'MadeOf']
        features = set()
        lock = threading.Lock()
        threads = []
        for rel in rels:
            t = threading.Thread(target=self.append_result, args=(argument, rel, features, lock))
            t.start()
            threads.append(t)
        for t in threads:
            t.join()
        return features
......@@ -8,12 +8,13 @@ import os
import readchar
from sty import fg, bg
from anytree import Node, RenderTree, LevelOrderIter, PreOrderIter
from itertools import repeat, chain
from itertools import repeat, chain, combinations
from pathos.multiprocessing import ProcessingPool
from multiprocessing import Pool
from transformers import BertTokenizer
from agent.target_extraction.BERT.entity_extractor.entitybertnet import TRAINED_WEIGHTS
from agent.target_extraction.target_extractor import TargetExtractor, ngrams, get_nouns
from agent.target_extraction.ontology_extractor import OntologyExtractor, ngrams, get_nouns
from agent.target_extraction.product import Product
from collections import Counter
PHRASE_THRESHOLD = 4
......@@ -46,8 +47,8 @@ class EntityAnnotator:
phrases = pool.map(word_tokenize, sentences)
print('obtaining phraser...')
bigram = Phrases(phrases, threshold=TargetExtractor.PHRASE_THRESHOLD)
trigram = Phrases(bigram[phrases], threshold=TargetExtractor.PHRASE_THRESHOLD)
bigram = Phrases(phrases, threshold=OntologyExtractor.PHRASE_THRESHOLD)
trigram = Phrases(bigram[phrases], threshold=OntologyExtractor.PHRASE_THRESHOLD)
phraser = Phraser(trigram)
ngram_phrases = pool.starmap(ngrams, zip(phrases, repeat(phraser, len(phrases))))
......@@ -62,7 +63,7 @@ class EntityAnnotator:
@staticmethod
def get_sentences(file, pool, n=None):
df = pd.read_csv(file, sep='\t', error_bad_lines=False)
texts = TargetExtractor.obtain_texts(df, 'reviewText', n=n)
texts = OntologyExtractor.obtain_texts(df, 'reviewText', n=n)
sentences = list(chain.from_iterable(pool.map(sent_tokenize, texts)))
sentences = list(chain.from_iterable(pool.map(str.splitlines, sentences)))
sentences = pool.map(filter_underscore, sentences)
......@@ -270,8 +271,9 @@ class EntityAnnotator:
if sentences is None:
sentences = self.sentences
all_entities = {(e, True) for e in self.get_annotated_entities()}.union(
{(e, False) for e in self.get_nan_entities()})
all_entities = {(e, 0) for e in self.get_nan_entities()}.union(
{(e, 1) for e in self.get_features()}).union(
{(e, 2) for e in self.get_product_entities()})
instances = []
idx = 0
......@@ -279,22 +281,31 @@ class EntityAnnotator:
texts_sub = sentences[idx:idx + 20000]
idx += 20000
instances += filter(lambda i: i is not None, map(entity_instances_for_text,
repeat(tokenizer, len(texts_sub)),
repeat(all_entities, len(texts_sub)),
texts_sub))
repeat(tokenizer, len(texts_sub)),
repeat(all_entities, len(texts_sub)),
texts_sub))
print(len(instances))
instances = instances[:n]
df = pd.DataFrame(instances, columns=['tokens', 'entity_idx', 'label'])
df = pd.DataFrame(instances, columns=['text', 'tokens', 'entity_idx', 'label'])
df.to_csv(save_path, sep='\t', index=False)
def get_annotated_entities(self):
    """Return the lower-cased synonyms of every node in the annotated tree."""
    entities = set()
    for node in PreOrderIter(self.root):
        entities.update(syn.lower() for syn in node.synset)
    return entities
def get_product_entities(self):
    """Return the lower-cased synonyms of the root (product) node."""
    return set(map(str.lower, self.root.synset))
def get_features(self):
    """Return the lower-cased synonyms of every non-root (feature) node."""
    feats = set()
    for node in self.root.descendants:
        feats.update(syn.lower() for syn in node.synset)
    return feats
def get_nan_entities(self):
    """Return frequent counted terms that are NOT known aspects.

    Takes the ``n_annotated`` most common terms from the counter, normalises
    them (underscores to spaces, lower case) and keeps those absent from the
    union of feature and product synonyms. Used as negative (label 0)
    instances for entity extraction.

    Note: this resolves a merge artifact where both the old filter
    (``not in annotated``) and the new one (``not in aspects``) were present;
    the new aspect-based filter is the intended behavior, and the now-unused
    ``annotated`` local is dropped.
    """
    aspects = self.get_features().union(self.get_product_entities())
    return {t.replace('_', ' ').lower() for t, _ in self.counter.most_common(self.n_annotated)
            if t.replace('_', ' ').lower() not in aspects}
def save_product(self, product):
    """Pickle this annotator's tree as a Product.

    Writes to ``extracted_products/<product>_manual<ext>``. Uses a context
    manager so the file handle is closed even if pickling raises (the
    original leaked the handle on error).
    """
    path = 'extracted_products/{}_manual{}'.format(product, Product.FILE_EXTENSION)
    with open(path, 'wb') as f:
        pickle.dump(Product(self.root), f)
def entity_instances_for_text(tokenizer, all_entities, text):
......@@ -367,7 +378,7 @@ def entity_instances_for_text(tokenizer, all_entities, text):
entity_mention = None
for i in range(len(tokens)):
for entity, is_aspect in all_entities:
for entity, label in all_entities:
match_length = token_entity_match(i, entity, tokens)
if match_length is not None:
e_range = (i + 1, i + match_length)
......@@ -377,14 +388,14 @@ def entity_instances_for_text(tokenizer, all_entities, text):
continue
elif not e_range[0] <= entity_mention[0][0] and e_range[1] >= entity_mention[0][1]:
return None
entity_mention = ((i + 1, i + match_length), is_aspect) # + 1 taking into account the [CLS] token
entity_mention = ((i + 1, i + match_length), label) # + 1 taking into account the [CLS] token
if entity_mention is None or not is_noun(tokens, entity_mention):
return None
e_range, is_aspect = entity_mention
e_range, label = entity_mention
tokens = mask_tokens(tokens, e_range) # mask entity mention
return tokens, e_range[0], 1 if is_aspect else 0
return text, tokens, e_range[0], label
def relation_instances_for_text(tokenizer, root, text):
......@@ -494,18 +505,24 @@ def relation_instances_for_text(tokenizer, root, text):
return tokens, aspect_indices, get_rel_label(entity_mentions[0], entity_mentions[1])
# EntityAnnotator.new_from_tsv('data/verified_necklace_reviews.tsv', 'necklace')
# print(1)
# ea: EntityAnnotator = EntityAnnotator.load_saved('game')
# ea.root = None
# ea.n_annotated = 0
# ea.save()
# ea.save_entity_instances('BERT/data/guitar_product_entity_instances.tsv', 50000)
# ea: EntityAnnotator = EntityAnnotator.load_saved('watch')
# ea.update_tree_indices()
# ea.save()
if __name__ == '__main__':
with Pool(4) as pool:
for p in ['necklace', 'headphones']: # 'guitar', 'cardigan', 'backpack', 'watch', 'camera', 'laptop', 'game', 'headphones', 'necklace'
ea: EntityAnnotator = EntityAnnotator.load_saved(p)
sentences = EntityAnnotator.get_sentences('data/verified_{}_reviews.tsv'.format(p), pool)
ea.save_entity_instances('BERT/data/{}_entity_instances.tsv'.format(p), 42313,
sentences=sentences)
ea.save_relation_instances('BERT/data/{}_relation_instances.tsv'.format(p), 21131,
sentences=sentences)
for p in ['camera', 'backpack', 'cardigan', 'guitar', 'laptop']:
ea: EntityAnnotator = EntityAnnotator.load_saved(p)
ea.save_product(p)
# if __name__ == '__main__':
# with Pool(4) as pool:
# for p in ['game']:
# ea: EntityAnnotator = EntityAnnotator.load_saved(p)
# sentences = EntityAnnotator.get_sentences('data/verified_{}_reviews.tsv'.format(p), pool)
# ea.save_entity_instances('BERT/data/{}_product_entity_instances.tsv'.format(p), 42313)
from anytree import Node, RenderTree, PreOrderIter
from anytree.dotexport import RenderTreeGraph
from agent.target_extraction.product import Product
import pickle
# Based on HasA relation of COMeT with sampling method topk-24

def _tree_from(root_name, child_names):
    """Build a root Node and attach one child Node per name, in order."""
    root = Node(root_name)
    for child_name in child_names:
        Node(child_name, parent=root)
    return root

tv = _tree_from('television', [
    'screen', 'remote control', 'channel', 'program', 'vcr', 'sound',
    'volume control', 'live feed', 'flat screen', 'monitor', 'lot of channel',
    'many channel', 'dial', 'control', 'menu', 'cable',
    'button to turn it off', 'low resolution than real tv', 'be turn off',
    'picture on them', 'entertainment center', 'tape',
    'effect of make person feel happy and happy when not', 'frame',
])

watch = _tree_from('watch', [
    'hand', 'face', 'time', 'dial', 'second hand', 'number', 'finger',
    'wrist', 'needle', 'digital display', 'arm', 'read hand', 'tick and beat',
    'numeric scale', 'five finger and thumb', 'watch face', 'chain', 'button',
    'four arm and five hand', 'many chronometer', 'two hand', 'wire',
    'green hand', 'display time',
])

mixer = _tree_from('stand mixer', [
    'handle', 'many different utensil', 'two handle', 'four turn top',
    'long handle', 'three pronged handle', 'wheel', 'hand', '5 foot radius',
    'six side', '2 wheel', 'effect of improve efficiency', 'short range',
    'five side', 'metal arm', 'become popular activity over time', 'tool',
    'turn crank', 'be used in bar', 'pedal', 'screw', 'eight side', 'lever',
    'very high power power',
])

game = _tree_from('video game', [
    'win or lose', 'change', 'winner', 'fun',
    'effect of entertain other person', 'outcome', 'result',
    'lose their appeal over time', 'be for entertainment', 'solve problem',
    'many program', 'different game plan', 'play game', 'complicate program',
    'theme', 'high resolution', 'channel to person', 'lot of fun', 'program',
    'player', 'discard number', 'game console', 'entertain result',
    'improve over time',
])

necklace = _tree_from('necklace', [
    'many piece', 'two jewel', 'celtic design', 'chain', 'charm on it',
    'design on top', 'four finger', '2 link', 'three - sided design',
    'butterfly on it', 'jewel in it', 'small star on it', 'sharp point',
    'five side', 'button', 'hole', 'six thread', 'string',
    'fleur - de - lis on it', 'star at their centre', 'gold and silver chain',
    'tiny bird', 'flower on it', '5 link',
])
def save(tree, product):
    """Pickle *tree* as a Product under ``<product>_comet<ext>``.

    Each node's synset is first set to the singleton of its own name, since
    COMeT trees carry no synonyms. Uses a context manager so the file handle
    is closed even if pickling raises (the original leaked it on error).
    """
    for n in PreOrderIter(tree):
        n.synset = {n.name}
    with open('{}_comet{}'.format(product, Product.FILE_EXTENSION), 'wb') as f:
        pickle.dump(Product(tree), f)
# Persist any of the hand-built COMeT trees as pickled Product files
# (uncomment the relevant line to regenerate):
# save(mixer, 'mixer')
# save(necklace, 'necklace')
# save(tv, 'television')
# save(game, 'game')
# save(watch, 'watch')
# Render the watch tree to stdout for visual inspection.
print(RenderTree(watch))
true false
1 2
1 2
0 3
2 1
0 3
\ No newline at end of file
true false
0 3
0 3
1 2
0 3
2 1
3 0
3 0
0 3
0 3
0 3
0 3
0 3
0 3
0 3
3 0
2 1
0 3
0 3
0 3
1 2
0 3
0 3
0 3
0 3
\ No newline at end of file
true false
3 0
3 0
0 3
3 0
0 3
3 0
3 0
0 3
0 3
0 3
3 0
2 1
3 0
3 0
3 0
1 2
\ No newline at end of file
true false
0 3
0 3
0 3
0 3
0 3
0 3
2 1
0 3
0 3
0 3
3 0
0 3
0 3
0 3
0 3
0 3
2 1
3 0
0 3
0 3
3 0
0 3
0 3
3 0
0 3
0 3
2 1
0 3
0 3
0 3
3 0
3 0
0 3
2 1
0 3
0 3
0 3
0 3
3 0
0 3
0 3
0 3
0 3
0 3
0 3
2 1
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment