Commit 29ed5986 authored by Joel Oksanen

Implemented entity annotation

parent cbd56512
import pandas as pd
from xml.etree.ElementTree import ElementTree, parse, tostring, Element, SubElement
from gensim.models.phrases import Phrases, Phraser
from nltk import pos_tag
from nltk.tokenize import word_tokenize, sent_tokenize
import string
from nltk.corpus import stopwords
import re
from collections import Counter
import pickle
import os
import readchar
from sty import fg, bg
from anytree import Node, RenderTree, LevelOrderIter, PreOrderIter
# Score threshold handed to gensim's Phrases when detecting multi-word terms.
# NOTE(review): the definition of this constant is missing from this excerpt
# although the class below reads it; the value 4 is an assumption — confirm
# against the original file.
PHRASE_THRESHOLD = 4

# English stop words; terms in this list are never counted as nouns.
stop_words = stopwords.words('english')

# Background colours used when highlighting annotations, ordered (child, parent).
# NOTE(review): the list elements were lost in this excerpt (it read `[,]`,
# which is a syntax error); the two colours below are placeholders — confirm.
ann_bgs = [bg.blue, bg.red]  # child, parent
class EntityAnnotator:
    """Interactive helper for arranging frequent review nouns into a feature tree.

    Candidate terms are counted from the ``reviewText`` column of a TSV file
    and offered to the user one at a time, most frequent first.  Each term
    can become the new root of the tree, a sub-feature of an existing node,
    or be skipped.

    Attributes:
        text_file_path: path of the TSV file the candidate terms came from.
        counter: ``collections.Counter`` mapping candidate terms to counts.
        save_path: path the annotator is pickled to by ``save``.
        root: root ``anytree.Node`` of the feature tree, or None if empty.
        n_annotated: number of candidates already handled (skips included).
    """

    # A valid term consists solely of word characters.
    _WORD_PATTERN = re.compile(r'^\w+$')

    def __init__(self, text_file_path, counter, save_path):
        self.text_file_path = text_file_path
        self.counter = counter
        self.save_path = save_path
        self.root = None
        self.n_annotated = 0

    @staticmethod
    def _read_tsv(file_path):
        """Read a tab-separated file, skipping malformed rows."""
        try:
            return pd.read_csv(file_path, sep='\t', on_bad_lines='skip')
        except TypeError:
            # Older pandas releases only understand the legacy keyword.
            return pd.read_csv(file_path, sep='\t', error_bad_lines=False)

    @staticmethod
    def new_from_tsv(file_path, name):
        """Create an annotator from the reviews stored in a TSV file.

        Args:
            file_path: TSV file with a ``reviewText`` column.
            name: base name of the pickle file; ``'.pickle'`` is appended.

        Returns:
            A fresh ``EntityAnnotator`` with an empty tree.
        """
        df = EntityAnnotator._read_tsv(file_path)
        # Underscores are replaced because '_' is later used to recognise
        # the multi-word phrases joined by the phraser.
        texts = [text.replace('_', ' ')
                 for _, par in df['reviewText'].items() if not pd.isnull(par)
                 for text in sent_tokenize(par)]
        counter = EntityAnnotator.count_nouns(texts)
        return EntityAnnotator(file_path, counter, name + '.pickle')

    @staticmethod
    def load_saved(file_path):
        """Load an annotator previously written by ``save``.

        SECURITY: unpickling executes arbitrary code; only load files that
        were produced locally by ``save``.
        """
        with open(file_path, 'rb') as f:
            return pickle.load(f)

    def save(self):
        """Pickle this annotator to ``self.save_path``."""
        with open(self.save_path, 'wb') as f:
            pickle.dump(self, f)

    @staticmethod
    def count_nouns(texts):
        """Count noun terms (single words and detected phrases) in ``texts``.

        Args:
            texts: iterable of sentences, each either a string (tokenized
                here with ``word_tokenize``) or an already tokenized list.

        Returns:
            A ``Counter`` of terms; phrase words are joined with ``'_'``.
        """
        tokenized = [word_tokenize(text) if isinstance(text, str) else list(text)
                     for text in texts]
        if not tokenized:
            return Counter()

        # obtain phraser
        bigram = Phrases(tokenized, threshold=PHRASE_THRESHOLD)
        trigram = Phrases(bigram[tokenized], threshold=PHRASE_THRESHOLD)
        # NOTE(review): the phraser is applied to the raw tokens below, not
        # to bigram-transformed tokens — confirm this is intended.
        phraser = Phraser(trigram)

        # count nouns
        nouns = []
        for tokens in tokenized:
            pos_tags = pos_tag(tokens)
            word_idx = 0  # position of the current term in the unjoined tokens
            for token in phraser[tokens]:
                n_words = token.count('_') + 1
                tagged = pos_tags[word_idx:word_idx + n_words]
                word_idx += n_words
                if not tagged:
                    continue
                all_valid = all(EntityAnnotator.is_valid_term(t) for t in tagged)
                if n_words > 1:
                    # a phrase needs at least one noun and only valid words
                    keep = all_valid and any(EntityAnnotator.is_noun(t) for t in tagged)
                else:
                    keep = len(token) > 1 and all_valid and EntityAnnotator.is_noun(tagged[0])
                if keep:
                    nouns.append(token)
        return Counter(nouns)

    @staticmethod
    def is_noun(pos_tagged):
        """Return True if the ``(word, tag)`` pair is a non-stop-word noun."""
        word, tag = pos_tagged
        lowered = word.lower()
        # The stop-word list is lowercase, so compare the lowercased word.
        return (tag.startswith('NN')
                and lowered not in string.punctuation
                and lowered not in stop_words)

    # true if term is not a preposition and does not include special characters
    @staticmethod
    def is_valid_term(pos_tagged):
        """Return True if the ``(word, tag)`` pair is a usable term."""
        word, tag = pos_tagged
        return tag != 'IN' and EntityAnnotator._WORD_PATTERN.match(word) is not None

    def annotate(self):
        """Run the interactive annotation loop until quit or exhaustion.

        Keys: ``r`` makes the entity the new root, digits followed by ENTER
        attach it under the node with that number, ``s`` skips it and ``q``
        leaves the loop.

        NOTE(review): parts of this method were missing from the reviewed
        excerpt (the ends of the print calls, the tree display and the body
        of the quit branch) and are reconstructed here — confirm against the
        original.  Quitting does not save; call ``save`` explicitly.
        """
        while self.n_annotated < len(self.counter):
            entity = self.select_entity()
            self._print_prompt(entity)

            task = readchar.readkey()
            if task == 'q':
                break
            if task == 's':
                self.n_annotated += 1
            elif task == 'r':
                self._make_root(entity)
            elif task.isdigit():
                self._add_subfeature(entity, int(task))

    def _print_prompt(self, entity):
        """Print the progress, the key help, the numbered tree and ``entity``."""
        print(fg.li_blue + '{} entities annotated'.format(self.n_annotated) + fg.rs)
        print(fg.li_black + 'root: \'r\'' + fg.rs)
        print(fg.li_black + 'subfeat: [number of parent node][ENTER]' + fg.rs)
        print(fg.li_black + 'skip: \'s\'' + fg.rs)
        print(fg.li_black + 'quit: \'q\'' + fg.rs)
        if self.root is not None:
            # Refresh the numbers so they match what node_with_number expects.
            self.update_tree_indices()
            for prefix, _, node in RenderTree(self.root):
                print('{}[{}] {}'.format(prefix, node.n, node.name))
        print(entity)

    def _make_root(self, entity):
        """Make ``entity`` the new root, with the old root (if any) as its child."""
        old_root = self.root
        self.root = Node(entity)
        if old_root is not None:
            old_root.parent = self.root
        self.n_annotated += 1

    def _add_subfeature(self, entity, n):
        """Read the rest of a node number and attach ``entity`` under that node.

        ``n`` is the first digit already typed.  An unknown node number
        leaves the tree untouched so the entity is offered again.
        """
        while True:
            subtask = readchar.readkey()
            if subtask.isdigit():
                n = n * 10 + int(subtask)
            elif subtask == readchar.key.ENTER:
                break
        if self.root is None:
            return
        try:
            parent = self.node_with_number(n)
        except IndexError:
            return
        Node(entity, parent=parent)
        self.n_annotated += 1

    def select_entity(self):
        """Return the next candidate term with phrase underscores as spaces.

        Raises:
            IndexError: if every candidate has already been handled.
        """
        # most_common() yields (term, count) pairs; only the term is needed.
        term, _ = self.counter.most_common()[self.n_annotated]
        return term.replace('_', ' ')

    def node_with_number(self, n):
        """Return the ``n``-th node of the tree in level order (0 is the root)."""
        return list(LevelOrderIter(self.root))[n]

    def update_tree_indices(self):
        """Store each node's level-order position in its ``n`` attribute."""
        for i, node in enumerate(LevelOrderIter(self.root)):
            node.n = i

    # Sketch kept from the original (it was commented out and partly lost):
    # def get_relation_tuples(self):
    #     rels = []
    #     for e1 in LevelOrderIter(self.root):
    #         if e1.is_leaf:
    #             continue
    #         for e2 in e1.children:
    #             rels.append((e1.name, e2.name))  # e1 hasFeature e2
    #     return rels

    def get_annotated_texts(self, save_path):
        """Write the source reviews with a ``relations`` column to ``save_path``.

        Rows for which ``relations_for_text`` returns None are dropped.
        """
        df = EntityAnnotator._read_tsv(self.text_file_path)
        df['relations'] = df['reviewText'].apply(self.relations_for_text)
        df = df[~df['relations'].isnull()]
        df.to_csv(save_path, sep='\t', index=False)

    def relations_for_text(self, text):
        """Return the parent/child relations whose two entities occur in ``text``.

        Returns:
            A list of dicts with keys ``em1Text`` (parent name), ``em2Text``
            (child name) and ``label``; or None if ``text`` is not a string,
            the tree is empty, or an entity would be both parent and child.

        NOTE(review): the operands of the two ``in text`` checks and the
        filling of ``child_entities`` were missing from the reviewed excerpt;
        matching on node names is reconstructed — confirm.
        """
        if self.root is None or not isinstance(text, str):
            return None
        rels = []
        child_entities = []
        for e1 in PreOrderIter(self.root):
            if e1.is_leaf or e1.name not in text:
                continue
            for e2 in e1.children:
                if e2.name not in text:
                    continue
                # e1 is a parent of an entity in the text
                if e1 in child_entities:
                    # e1 cannot be a parent and a child
                    return None
                rels.append({'em1Text': e1.name, 'em2Text': e2.name, 'label': '/has_feature'})
                child_entities.append(e2)
        return rels
# Build the camera-review annotator at import time; its state is pickled to
# 'camera_entity_annotator.pickle' when save() is called.
ann = EntityAnnotator.new_from_tsv(
    file_path='data/verified_camera_reviews.tsv',
    name='camera_entity_annotator',
)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment