Commit ba2d99f3 authored by Joel Oksanen

Minor eval changes

parent a2ecd08d
......@@ -2,7 +2,7 @@ import pandas as pd
class DataLoader:
data_location = 'agent/amazon_data/reviews_for_backpack.tsv'
data_location = 'agent/amazon_data/reviews_for_watches.tsv'
reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False)
def get_reviews(self, product_id):
......
import pandas as pd
import gzip
import json
MAX_ITEMS = 200000
def parse(path):
g = gzip.open(path, 'rb')
for line in g:
yield json.loads(line)
def get_df(path):
i = 0
df = {}
for d in parse(path):
df[i] = d
i += 1
if i == MAX_ITEMS:
break
return pd.DataFrame.from_dict(df, orient='index')
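# Illustrative usage of the helpers above (a sketch; the .json.gz file name is hypothetical and not
# referenced in this commit):
# df = get_df('amazon_data/Clothing_Shoes_and_Jewelry.json.gz')
# print(df.shape)  # at most MAX_ITEMS rows, one per review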
pd.set_option('display.max_colwidth', None)
category = 'Cardigans'
metadata_iter = pd.read_json('amazon_data/meta_Clothing_Shoes_and_Jewelry.json', lines=True, chunksize=1000)
metadata = pd.concat([metadata[metadata['category'].apply(lambda cl: type(cl) is list and category in cl)]
for metadata in metadata_iter])
def get_reviews(category, meta_file, review_file):
metadata_iter = pd.read_json(meta_file, lines=True, chunksize=1000)
metadata = pd.concat([metadata[metadata['category'].apply(lambda cl: type(cl) is list and category in cl)]
for metadata in metadata_iter])
print(len(metadata.index))
print(len(metadata.index))
review_iter = pd.read_json(review_file, lines=True, chunksize=1000)
reviews = pd.concat([reviews[reviews['asin'].isin(metadata['asin'])] for reviews in review_iter])
print(len(reviews.index))
review_iter = pd.read_json('amazon_data/Clothing_Shoes_and_Jewelry.json', lines=True, chunksize=1000)
reviews = pd.concat([reviews[reviews['asin'].isin(metadata['asin'])] for reviews in review_iter])
return reviews
print(len(reviews.index))
reviews.to_csv('target_extraction/data/verified_cardigan_reviews.tsv', sep='\t', index=False)
def save_reviews(category, meta_file, review_file, output_file):
reviews = get_reviews(category, meta_file, review_file)
reviews.to_csv(output_file, sep='\t', index=False)
# child_product = 'speaker'
# reviews = pd.read_csv('amazon_data/amazon_reviews_us_Electronics_v1_00.tsv.gz', sep='\t', error_bad_lines=False,
# compression='gzip')
# parent_output = 'target_extraction/data/electronics_reviews.tsv'
# child_output = 'target_extraction/data/' + child_product + '_reviews.tsv'
#
# for col in reviews.columns:
# print(col)
#
# c_reviews = reviews[reviews['product_title'].str.contains(child_product, case=False, na=False)]
# p_reviews = reviews[~reviews['product_title'].str.contains(child_product, case=False, na=False)]
# c_reviews = c_reviews.sample(frac=1).reset_index(drop=True)
# c_reviews = c_reviews.head(MAX_ITEMS)
# p_reviews = p_reviews.sample(frac=1).reset_index(drop=True)
# p_reviews = p_reviews.head(MAX_ITEMS)
#
# p_reviews.to_csv(parent_output, sep='\t', index=False)
# c_reviews.to_csv(child_output, sep='\t', index=False)
# print('Successfully prepared data for', len(p_reviews.index), 'parent and', len(c_reviews.index), 'child reviews')
# # get metadata for sunglasses
# metadata = metadata[metadata['title'].str.contains(pat='[G]uitar', na=False, regex=True)]
def save_top_reviewed_products(n, category, meta_file, review_file, output_file, product_title):
reviews = get_reviews(category, meta_file, review_file)
top_reviewed = reviews.groupby(['asin'], sort=False).size().sort_values(ascending=False).head(n)
reviews = reviews[reviews['asin'].apply(lambda asin: asin in top_reviewed)]
reviews = reviews.rename(columns={'overall': 'star_rating', 'asin': 'product_id', 'reviewerID': 'review_id',
'reviewText': 'review_body'})
reviews = reviews[reviews['review_body'].apply(lambda b: b is not None and len(b) > 0)]
reviews = reviews[reviews['star_rating'].apply(lambda r: type(r) is int or r.isdigit())]
reviews['product_title'] = product_title
reviews.to_csv(output_file, sep='\t', index=False)
# # get metadata for camera products
# metadata = metadata[metadata['main_cat'] == 'Camera & Photo']
#
# # try to filter out camera accessories
# filter_words = ['accessor', 'battery', 'charger', 'tripod', 'strap', 'case', 'bag', 'book', 'filter', 'light', 'drive',
# 'backpack', 'kit', 'printer', 'adapter', 'album', 'surveillance', 'security', 'cctv', 'cassette']
# filter_pat = ''
# for word in filter_words:
# word_filter = '[' + word[0].upper() + word[0].lower() + ']' + word[1:]
# filter_pat += word_filter + '|'
# filter_pat = filter_pat[:-1]
# r = re.compile(filter_pat)
# metadata = metadata[~metadata['title'].str.contains(pat=filter_pat, na=False, regex=True)]
# metadata = metadata[~metadata['category'].apply(lambda cats: any(r.search(cat) for cat in cats))]
#
# for _, row in metadata.head(20).iterrows():
# print('features:', row['feature'])
# print('description:', row['description'])
# print('tech1:', row['tech1'])
# print('tech2:', row['tech2'])
# save_top_reviewed_products(3, 'Wrist Watches', 'amazon_data/meta_Clothing_Shoes_and_Jewelry.json',
# 'amazon_data/Clothing_Shoes_and_Jewelry.json', 'amazon_data/reviews_for_watches.tsv',
# 'watch')
save_reviews('Necklaces', 'agent/amazon_data/meta_Clothing_Shoes_and_Jewelry.json', 'agent/amazon_data/Clothing_Shoes_and_Jewelry.json', 'agent/target_extraction/data/verified_necklace_reviews.tsv')
......@@ -20,7 +20,7 @@ LEARNING_RATE = 0.00002
MAX_GRAD_NORM = 1.0
# training
N_EPOCHS = 2
N_EPOCHS = 3
BATCH_SIZE = 32
WARM_UP_FRAC = 0.05
......@@ -48,13 +48,13 @@ class BertEntityExtractor:
return extractor
@staticmethod
def train_and_validate(file_path, save_path, size=None, valid_frac=None, valid_file_path=None):
def train_and_validate(file_path, save_file, size=None, valid_frac=None, valid_file_path=None):
extractor = BertEntityExtractor()
extractor.train_with_file(file_path, save_path, size=size, valid_frac=valid_frac,
extractor.train_with_file(file_path, save_file, size=size, valid_frac=valid_frac,
valid_file_path=valid_file_path)
return extractor
def train_with_file(self, file_path, save_path, size=None, valid_frac=None, valid_file_path=None):
def train_with_file(self, file_path, save_file, size=None, valid_frac=None, valid_file_path=None):
# load training data
if valid_file_path is None:
train_data, valid_data = EntityDataset.from_file(file_path, size=size, valid_frac=valid_frac)
......@@ -122,11 +122,11 @@ class BertEntityExtractor:
if valid_data is not None:
self.evaluate(data=valid_data)
torch.save(self.net.state_dict(), '{}.pt'.format(save_file))
end = time.time()
print('Training took', end - start, 'seconds')
torch.save(self.net.state_dict(), save_path)
def evaluate(self, file_path=None, data=None, size=None):
# load eval data
if file_path is not None:
......
......@@ -19,7 +19,7 @@ LEARNING_RATE = 0.00002
MAX_GRAD_NORM = 1.0
# training
N_EPOCHS = 2
N_EPOCHS = 3
BATCH_SIZE = 16
WARM_UP_FRAC = 0.05
......@@ -47,13 +47,13 @@ class BertRelExtractor:
return extractor
@staticmethod
def train_and_validate(file_path, save_path, size=None, valid_frac=None, valid_file_path=None):
def train_and_validate(file_path, save_file, size=None, valid_frac=None, valid_file_path=None):
extractor = BertRelExtractor()
extractor.train_with_file(file_path, save_path, size=size, valid_frac=valid_frac,
extractor.train_with_file(file_path, save_file, size=size, valid_frac=valid_frac,
valid_file_path=valid_file_path)
return extractor
def train_with_file(self, file_path, save_path, size=None, valid_frac=None, valid_file_path=None):
def train_with_file(self, file_path, save_file, size=None, valid_frac=None, valid_file_path=None):
# load training data
if valid_file_path is None:
train_data, valid_data = PairRelDataset.from_file(file_path, size=size, valid_frac=valid_frac)
......@@ -121,11 +121,11 @@ class BertRelExtractor:
if valid_data is not None:
self.evaluate(data=valid_data)
torch.save(self.net.state_dict(), '{}.pt'.format(save_file))
end = time.time()
print('Training took', end - start, 'seconds')
torch.save(self.net.state_dict(), save_path)
def evaluate(self, file_path=None, data=None, size=None):
# load eval data
if file_path is not None:
......
......@@ -5,6 +5,7 @@ import pandas as pd
import numpy as np
from ast import literal_eval
from agent.target_extraction.BERT.relation_extractor.pairbertnet import TRAINED_WEIGHTS, HIDDEN_OUTPUT_FEATURES
import os
MAX_SEQ_LEN = 128
RELATIONS = ['/has_feature', '/no_relation']
......@@ -69,11 +70,12 @@ class PairRelDataset(Dataset):
return dataset
@staticmethod
def from_file(path, valid_frac=None, size=None):
if path.endswith('.json'):
dataset = PairRelDataset(pd.read_json(path, lines=True), size=size)
elif path.endswith('.tsv'):
dataset = PairRelDataset(pd.read_csv(path, sep='\t', error_bad_lines=False), size=size)
def from_file(file_name, valid_frac=None, size=None):
# resolve the data file relative to this module and let pandas manage the file handle
path = os.path.join(os.path.dirname(__file__), '..', 'data', file_name)
if file_name.endswith('.json'):
dataset = PairRelDataset(pd.read_json(path, lines=True), size=size)
elif file_name.endswith('.tsv'):
dataset = PairRelDataset(pd.read_csv(path, sep='\t', error_bad_lines=False), size=size)
else:
raise AttributeError('Could not recognize file type')
......
......@@ -111,7 +111,7 @@ class EntityAnnotator:
os.system('clear')
print(fg.li_green + '{} entities annotated'.format(self.n_annotated) + fg.rs)
print(fg.li_green + '{} nouns annotated'.format(self.n_annotated) + fg.rs)
print('')
print(fg.li_black + 'root: \'r\'' + fg.rs)
......@@ -249,11 +249,12 @@ class EntityAnnotator:
def pair_relations_for_text(self, text, nan_entities=None):
single_tokens = word_tokenize(text)
all_tokens = set().union(*[single_tokens, self.phraser[single_tokens]])
tagged_single = pos_tag(single_tokens)
tagged_all = set().union(*[tagged_single, pos_tag(self.phraser[single_tokens])])
entity_mentions = []
for n in PreOrderIter(self.root):
cont, mention = self.mention_in_text(all_tokens, node=n)
cont, mention = self.mention_in_text(tagged_all, node=n)
if not cont:
# many mentions of same entity
return None
......@@ -269,7 +270,7 @@ class EntityAnnotator:
if nan_entities is not None and len(entity_mentions) == 1:
nan_mention = None
for term in nan_entities:
cont, mention = self.mention_in_text(all_tokens, term=term)
cont, mention = self.mention_in_text(tagged_all, term=term)
if not cont:
# many mentions of term
return None
......@@ -286,10 +287,11 @@ class EntityAnnotator:
# returns True, (matched synonym of node / term / None) if there is exactly one or zero such occurrence,
# otherwise False, None
def mention_in_text(self, tokens, node=None, term=None):
def mention_in_text(self, tagged_tokens, node=None, term=None):
mention = None
for syn in ({syn.lower() for syn in self.synset[node]} if node is not None else {term}):
n_matches = sum(1 for token in tokens if syn == token.lower().replace('_', ' '))
n_matches = sum(1 for token, tag in tagged_tokens
if syn == token.lower().replace('_', ' ') and tag.startswith('NN'))
if n_matches > 1:
return False, None
if n_matches == 1:
......@@ -301,11 +303,12 @@ class EntityAnnotator:
def entity_mentions_in_text(self, text, all_entities):
single_tokens = word_tokenize(text)
all_tokens = set().union(*[single_tokens, self.phraser[single_tokens]])
tagged_single = pos_tag(single_tokens)
tagged_all = set().union(*[tagged_single, pos_tag(self.phraser[single_tokens])])
entity_mention = None
for entity, is_aspect in all_entities:
cont, mention = self.mention_in_text(all_tokens, term=entity)
cont, mention = self.mention_in_text(tagged_all, term=entity)
if not cont:
# many mentions of same entity
return None
......@@ -355,12 +358,5 @@ class EntityAnnotator:
return text, rels
ann: EntityAnnotator = EntityAnnotator.load_saved('acoustic_guitar_annotator.pickle')
ann.save_annotated_entities('BERT/data/annotated_acoustic_guitar_review_entities.tsv')
ann: EntityAnnotator = EntityAnnotator.load_saved('camera_entity_annotator.pickle')
ann.save_annotated_entities('BERT/data/annotated_camera_review_entities.tsv')
ann: EntityAnnotator = EntityAnnotator.load_saved('laptop_entity_annotator.pickle')
ann.save_annotated_entities('BERT/data/annotated_laptop_review_entities.tsv')
ann: EntityAnnotator = EntityAnnotator.load_saved('backpack_entity_annotator.pickle')
ann.save_annotated_entities('BERT/data/annotated_backpack_review_entities.tsv')
ea = EntityAnnotator.load_saved('example_annotator.pickle')
ea.annotate()
from agent.target_extraction.BERT.entity_extractor.bert_entity_extractor import BertEntityExtractor
from agent.target_extraction.BERT.relation_extractor.bert_rel_extractor import BertRelExtractor
from agent.target_extraction.BERT.data.combine_files import combine_files
# from agent.target_extraction.entity_annotation import EntityAnnotator
# ann: EntityAnnotator = EntityAnnotator.load_saved('acoustic_guitar_annotator.pickle')
# ann.save_annotated_pairs('BERT/data/annotated_acoustic_guitar_review_pairs.tsv')
# ann.save_annotated_entities('BERT/data/annotated_acoustic_guitar_review_entities.tsv')
# ann: EntityAnnotator = EntityAnnotator.load_saved('camera_entity_annotator.pickle')
# ann.save_annotated_pairs('BERT/data/annotated_camera_review_pairs.tsv')
# ann.save_annotated_entities('BERT/data/annotated_camera_review_entities.tsv')
# ann: EntityAnnotator = EntityAnnotator.load_saved('laptop_entity_annotator.pickle')
# ann.save_annotated_pairs('BERT/data/annotated_laptop_review_pairs.tsv')
# ann.save_annotated_entities('BERT/data/annotated_laptop_review_entities.tsv')
# ann: EntityAnnotator = EntityAnnotator.load_saved('backpack_entity_annotator.pickle')
# ann.save_annotated_pairs('BERT/data/annotated_backpack_review_pairs.tsv')
# ann.save_annotated_entities('BERT/data/annotated_backpack_review_entities.tsv')
#
# entity_files = [
# 'BERT/data/annotated_camera_review_entities.tsv',
# 'BERT/data/annotated_backpack_review_entities.tsv',
# 'BERT/data/annotated_laptop_review_entities.tsv',
# 'BERT/data/annotated_acoustic_guitar_review_entities.tsv',
# 'BERT/data/annotated_cardigan_review_entities.tsv'
# ]
# entity_output_file = 'BERT/data/camera_backpack_laptop_guitar_cardigan_review_entities_2.tsv'
#
# pair_files = [
# 'BERT/data/annotated_camera_review_pairs.tsv',
# 'BERT/data/annotated_backpack_review_pairs.tsv',
# 'BERT/data/annotated_laptop_review_pairs.tsv',
# 'BERT/data/annotated_acoustic_guitar_review_pairs.tsv',
# 'BERT/data/annotated_cardigan_review_pairs.tsv'
# ]
# pair_output_file = 'BERT/data/camera_backpack_laptop_guitar_cardigan_review_pairs_2.tsv'
#
# for n in range(1, 6):
# combine_files(entity_files[:n], 'BERT/data/review_entities_' + str(n) + '.tsv', total_size=50000)
# combine_files(pair_files[:n], 'BERT/data/review_pairs_' + str(n) + '.tsv', total_size=50000)
#
# combine_files(entity_files, entity_output_file)
# combine_files(pair_files, pair_output_file)
for n in range(1, 6):
BertRelExtractor.train_and_validate('review_pairs_' + str(n) + '.tsv',
'rel_extractor_' + str(n) + '_products')
BertEntityExtractor.train_and_validate('review_entities_' + str(n) + '.tsv',
'entity_extractor_' + str(n) + '_products')
......@@ -16,16 +16,117 @@ from agent.target_extraction.BERT.entity_extractor.entity_dataset import EntityD
from agent.target_extraction.BERT.entity_extractor.bert_entity_extractor import BertEntityExtractor
from agent.target_extraction.BERT.relation_extractor.pair_rel_dataset import PairRelDataset
from agent.target_extraction.BERT.relation_extractor.bert_rel_extractor import BertRelExtractor
from pathos.multiprocessing import ProcessingPool as Pool
import itertools
from pandarallel import pandarallel
np.set_printoptions(precision=3, threshold=np.inf, suppress=True)
np.set_printoptions(precision=4, threshold=np.inf, suppress=True)
stop_words = stopwords.words('english')
wnl = WordNetLemmatizer()
sentiment_lexicon = pd.read_csv('data/NRC-Sentiment-Lexicon-Wordlevel-v0.92.tsv', sep='\t', index_col=0)
entity_extractor_path = 'BERT/entity_extractor/trained_bert_entity_extractor_camera_backpack_laptop.pt'
rel_extractor_path = 'BERT/relation_extractor/trained_bert_rel_extractor_camera_backpack_laptop_no_nan.pt'
pool = Pool(2)
entity_extractor_path = 'BERT/entity_extractor/entity_extractor_five_products.pt'
rel_extractor_path = 'BERT/relation_extractor/rel_extractor_five_products.pt'
pandarallel.initialize()
# re-align POS tags with the phrased (underscore-joined) tokens produced by the phraser,
# then break each n-gram back into sub-terms where necessary
def ngrams(tagged_tokens, phraser):
tokens, tags = zip(*tagged_tokens)
unfiltered = [term.split('_') for term in phraser[tokens]]
tagged_unfiltered = []
n = 0
for term in unfiltered:
tagged_unfiltered.append([(subterm, tags[n + idx]) for idx, subterm in enumerate(term)])
n += len(term)
return [subterm for term in tagged_unfiltered for subterm in filter_ngram(term)]
# split a multi-word n-gram back into its sub-terms if any tag is not NN*/JJ* or if it contains a
# sentiment-bearing adjective; otherwise keep it as a single space-joined term
def filter_ngram(term):
if len(term) > 1 and (any(not re.compile('NN|JJ').match(tag) for _, tag in term)
or any(tag.startswith('JJ') and polar_adjective(t) for t, tag in term)):
return [subterm for subterm, _ in term]
return [' '.join([subterm for subterm, _ in term])]
# True if the adjective is marked positive or negative in the NRC sentiment lexicon
def polar_adjective(adj):
return adj in sentiment_lexicon.index and (sentiment_lexicon.loc[adj]['positive'] == 1 or
sentiment_lexicon.loc[adj]['negative'] == 1)
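# Sketch of how the two helpers above interact (illustrative values; assumes 'great' is marked
# positive in the NRC lexicon):
# filter_ngram([('battery', 'NN'), ('life', 'NN')])  -> ['battery life']     (kept as one term)
# filter_ngram([('great', 'JJ'), ('camera', 'NN')])  -> ['great', 'camera']  (split: polar adjective)
# filter_ngram([('in', 'IN'), ('camera', 'NN')])     -> ['in', 'camera']     (split: non-NN/JJ tag)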
# collect candidate aspect terms from a sentence: multi-word n-grams that contain at least one noun
# and only valid sub-terms, and single-word tokens that are non-stopword nouns
def count_phrase_nouns(tagged_tokens, tagged_ngrams):
def is_noun(pos_tagged):
word, tag = pos_tagged
return tag.startswith('NN') and word not in string.punctuation and word not in stop_words
# True if the term is not a preposition and consists only of alphanumeric characters
def is_valid_term(pos_tagged):
alpha_numeric_pat = r'^\w+$'
word, tag = pos_tagged
return tag != 'IN' and re.match(alpha_numeric_pat, word)
nouns = []
word_idx = 0
for token, _ in tagged_ngrams:
if ' ' in token:
words = token.split(' ')
word_range = range(word_idx, word_idx + len(words))
has_noun = any(is_noun(tagged_tokens[i]) for i in word_range)
all_terms_valid = all(is_valid_term(tagged_tokens[i]) for i in word_range)
if has_noun and all_terms_valid:
nouns.append(token)
word_idx += len(words)
else:
token_is_noun = is_noun(tagged_tokens[word_idx])
is_valid = is_valid_term(tagged_tokens[word_idx])
if len(token) > 1 and token_is_noun and is_valid:
nouns.append(token)
word_idx += 1
return nouns
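# Illustrative input and output for count_phrase_nouns (hypothetical tagging):
# tagged_tokens = [('battery', 'NN'), ('life', 'NN'), ('is', 'VBZ'), ('great', 'JJ')]
# tagged_ngrams = [('battery life', 'NN'), ('is', 'VBZ'), ('great', 'JJ')]
# count_phrase_nouns(tagged_tokens, tagged_ngrams)  -> ['battery life']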
# return (text, [{'text': mention}]) if exactly one of the given entities is mentioned as a noun in
# the text; if one mention contains another (e.g. 'battery' and 'battery life'), the longer is kept
def entity_mentions_in_text(text, tagged_tokens, tagged_ngrams, entities):
all_tokens = {t for t, tag in set().union(*[tagged_tokens, tagged_ngrams]) if tag.startswith('NN')}
entity_mention = None
for entity in entities:
n_mentions = sum(1 for token in all_tokens if entity == token.lower())
if n_mentions > 1:
# many mentions of same entity
return None
if n_mentions == 1:
if entity_mention is None:
entity_mention = entity
elif entity_mention in entity:
entity_mention = entity
elif entity not in entity_mention:
# text cannot have more than one entity mention, unless one is a subset of the other,
# in which case the longer one is taken
return None
if entity_mention is not None:
return text, [{'text': entity_mention}]
return None
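# Illustrative behaviour (hypothetical sentence, tagged as in the sketch above):
# entity_mentions_in_text('the battery life is great', tagged_tokens, tagged_ngrams,
#                         ['battery', 'battery life'])
# -> ('the battery life is great', [{'text': 'battery life'}])   (the longer mention wins)
# Two unrelated entity mentions, or repeated mentions of one entity, return None instead.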
# return (text, [{'em1Text': ..., 'em2Text': ...}]) if exactly two distinct, non-overlapping aspect
# forms are mentioned as nouns in the text; otherwise None
def pair_relations_for_text(text, tagged_ngrams, aspects, syn_dict):
# True if t and any already-found term contain one another at the word level
def overlapping_terms(ts, t):
if len(ts) == 0:
return False
return any(t in t2.split(' ') if len(t) < len(t2) else t2 in t.split(' ') for t2 in ts)
found_aspects = []
for aspect in aspects:
found_form = False
for form in syn_dict[aspect]:
if any(t == form and tag.startswith('NN') for t, tag in tagged_ngrams):
if len(found_aspects) > 1 or found_form or overlapping_terms(found_aspects, form):
# cannot have more than two aspects, or two forms of the same aspect, or overlapping terms
return None
found_aspects.append(form)
found_form = True
return (text, [{'em1Text': found_aspects[0], 'em2Text': found_aspects[1]}]) if len(found_aspects) == 2 else None
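# Illustrative behaviour (hypothetical aspects and synonym dictionary; assumes tagged_ngrams
# contains ('band', 'NN') and ('screen', 'NN')):
# syn_dict = {'strap': {'strap', 'band'}, 'display': {'display', 'screen'}}
# pair_relations_for_text('nice band but a dim screen', tagged_ngrams, ['strap', 'display'], syn_dict)
# -> ('nice band but a dim screen', [{'em1Text': 'band', 'em2Text': 'screen'}])
# Fewer or more than two aspect mentions, or overlapping terms, return None instead.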
class TargetExtractor:
......@@ -39,7 +140,7 @@ class TargetExtractor:
# word2vec
MIN_TERM_COUNT = 100
SYNONYM_SIMILARITY = 0.12
SYNONYM_SIMILARITY = 0.11
SYNONYM_SIMILARITY_PRODUCT = 0.09
WV_SIZE = 100
WV_WINDOW = 7
......@@ -54,36 +155,36 @@ class TargetExtractor:
self.file_path = file_path
print('tokenizing phrases...')
# tokenize and normalize phrases
texts = TargetExtractor.obtain_texts(file_path, text_column)
self.sentences = list(itertools.chain.from_iterable(pool.map(sent_tokenize, texts)))
self.sentences = pool.map(lambda s: s.replace('_', ' ').lower(), self.sentences)
self.phrases = pool.map(word_tokenize, self.sentences)
self.texts = TargetExtractor.obtain_texts(file_path, text_column, n=50000)
# obtain normalized sentences
self.texts = self.texts.rename(columns={text_column: 'sentence'})
self.texts['sentence'] = self.texts['sentence'].parallel_apply(sent_tokenize)
self.texts = self.texts.explode('sentence').reset_index(drop=True)
self.texts['sentence'] = self.texts['sentence'].parallel_apply(lambda s: s.replace('_', ' ').lower())
# obtain tokens and their pos tags in a new column
self.texts['tokens'] = self.texts['sentence'].parallel_apply(lambda s: pos_tag(word_tokenize(s)))
print('obtaining n-grams...')
# train bigram map
bigram = Phrases(self.phrases, threshold=TargetExtractor.PHRASE_THRESHOLD)
trigram = Phrases(bigram[self.phrases], threshold=TargetExtractor.PHRASE_THRESHOLD)
self.phraser = Phraser(trigram)
self.ngram_phrases = self.ngrams(self.phrases)
# train the phraser and store n-grams with their POS tags in a new column
tokens = [[t for t, tag in tagged_ts] for tagged_ts in self.texts['tokens']]
bigram = Phrases(tokens, threshold=TargetExtractor.PHRASE_THRESHOLD)
trigram = Phrases(bigram[tokens], threshold=TargetExtractor.PHRASE_THRESHOLD)
phraser = Phraser(trigram)
self.texts['ngrams'] = self.texts.apply(lambda row: ngrams(row.tokens, phraser), axis=1)
self.texts['ngrams'] = self.texts['ngrams'].parallel_apply(lambda t: pos_tag(t))
print('counting terms...')
# count terms
self.counter = self.count_nouns()
self.total_count = sum(self.counter.values())
self.save()
print('training word2vec model...')
# train word2vec model
self.wv = self.get_word2vec_model(size=TargetExtractor.WV_SIZE, window=TargetExtractor.WV_WINDOW,
min_count=TargetExtractor.MIN_TERM_COUNT)
self.save()
print('mining aspects...')
# mine aspects
self.aspects, self.counts = self.get_aspects(self.counter)
self.aspects, self.counts = self.get_aspects()
print('extracting synonyms...')
# obtain synonyms
......@@ -96,15 +197,9 @@ class TargetExtractor:
self.counts = {aspect: sum(self.counts[syn] for syn in self.syn_dict[aspect]) for aspect in self.aspects}
self.aspects = sorted(self.aspects, key=self.counts.get, reverse=True)
print(self.syn_dict)
self.save()
print('extracting relatedness matrix...')
self.relatedness_matrix = self.get_bert_relations()
self.save()
print('extracting aspect tree...')
self.tree = self.get_product_tree()
......@@ -126,52 +221,37 @@ class TargetExtractor:
@staticmethod
def obtain_texts(path, col, n=None):
file = pd.read_csv(path, sep='\t', error_bad_lines=False)
file = file[~file[col].isnull()]
if n and n < len(file.index):
file = file.sample(frac=1).reset_index(drop=True)
file = file.head(n)
texts = [text for _, text in file[col].items() if not pd.isnull(text)]
texts = pd.read_csv(path, usecols=[col], squeeze=False, sep='\t', error_bad_lines=False)
texts = texts[~texts[col].isnull()]
texts = texts[texts[col] != '']
if n and n < len(texts.index):
texts = texts.sample(frac=1).reset_index(drop=True)
texts = texts.head(n)
# texts = [text for _, text in df[col].items() if not pd.isnull(text)]
print('Obtained {} texts'.format(len(texts)))
return texts
def ngrams(self, text):
if any(isinstance(subtext, list) for subtext in text):