Commit ba2d99f3 authored by Joel Oksanen

Minor eval changes

parent a2ecd08d
@@ -2,7 +2,7 @@ import pandas as pd
class DataLoader:
data_location = 'agent/amazon_data/reviews_for_backpack.tsv'
data_location = 'agent/amazon_data/reviews_for_watches.tsv'
reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False)
def get_reviews(self, product_id):
......
import pandas as pd
import gzip
import json
MAX_ITEMS = 200000
def parse(path):
g = gzip.open(path, 'rb')
for line in g:
yield json.loads(line)
def get_df(path):
i = 0
df = {}
for d in parse(path):
df[i] = d
i += 1
if i == MAX_ITEMS:
break
return pd.DataFrame.from_dict(df, orient='index')
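# Illustrative sketch only: how parse()/get_df() might be used to load raw review data.
# The .json.gz path below is an assumed example, not a file referenced in this commit.
# sample_df = get_df('amazon_data/Clothing_Shoes_and_Jewelry.json.gz')
# print(len(sample_df.index))  # capped at MAX_ITEMS rows by the early break in get_df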
pd.set_option('display.max_colwidth', None)
category = 'Cardigans'
metadata_iter = pd.read_json('amazon_data/meta_Clothing_Shoes_and_Jewelry.json', lines=True, chunksize=1000)
metadata = pd.concat([metadata[metadata['category'].apply(lambda cl: type(cl) is list and category in cl)]
for metadata in metadata_iter])
def get_reviews(category, meta_file, review_file):
metadata_iter = pd.read_json(meta_file, lines=True, chunksize=1000)
metadata = pd.concat([metadata[metadata['category'].apply(lambda cl: type(cl) is list and category in cl)]
for metadata in metadata_iter])
print(len(metadata.index))
print(len(metadata.index))
review_iter = pd.read_json(review_file, lines=True, chunksize=1000)
reviews = pd.concat([reviews[reviews['asin'].isin(metadata['asin'])] for reviews in review_iter])
print(len(reviews.index))
review_iter = pd.read_json('amazon_data/Clothing_Shoes_and_Jewelry.json', lines=True, chunksize=1000)
reviews = pd.concat([reviews[reviews['asin'].isin(metadata['asin'])] for reviews in review_iter])
return reviews
print(len(reviews.index))
reviews.to_csv('target_extraction/data/verified_cardigan_reviews.tsv', sep='\t', index=False)
def save_reviews(category, meta_file, review_file, output_file):
reviews = get_reviews(category, meta_file, review_file)
reviews.to_csv(output_file, sep='\t', index=False)
# child_product = 'speaker'
# reviews = pd.read_csv('amazon_data/amazon_reviews_us_Electronics_v1_00.tsv.gz', sep='\t', error_bad_lines=False,
# compression='gzip')
# parent_output = 'target_extraction/data/electronics_reviews.tsv'
# child_output = 'target_extraction/data/' + child_product + '_reviews.tsv'
#
# for col in reviews.columns:
# print(col)
#
# c_reviews = reviews[reviews['product_title'].str.contains(child_product, case=False, na=False)]
# p_reviews = reviews[~reviews['product_title'].str.contains(child_product, case=False, na=False)]
# c_reviews = c_reviews.sample(frac=1).reset_index(drop=True)
# c_reviews = c_reviews.head(MAX_ITEMS)
# p_reviews = p_reviews.sample(frac=1).reset_index(drop=True)
# p_reviews = p_reviews.head(MAX_ITEMS)
#
# p_reviews.to_csv(parent_output, sep='\t', index=False)
# c_reviews.to_csv(child_output, sep='\t', index=False)
# print('Successfully prepared data for', len(p_reviews.index), 'parent and', len(c_reviews.index), 'child reviews')
# # get metadata for sunglasses
# metadata = metadata[metadata['title'].str.contains(pat='[G]uitar', na=False, regex=True)]
def save_top_reviewed_products(n, category, meta_file, review_file, output_file, product_title):
reviews = get_reviews(category, meta_file, review_file)
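# Descriptive note: the lines below keep only reviews of the n most-reviewed products.
# top_reviewed is a Series indexed by asin, so 'asin in top_reviewed' tests index membership;
# the star_rating filter then drops rows whose rating is neither an int nor a digit string.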
top_reviewed = reviews.groupby(['asin'], sort=False).size().sort_values(ascending=False).head(n)
reviews = reviews[reviews['asin'].apply(lambda asin: asin in top_reviewed)]
reviews = reviews.rename(columns={'overall': 'star_rating', 'asin': 'product_id', 'reviewerID': 'review_id',
'reviewText': 'review_body'})
reviews = reviews[reviews['review_body'].apply(lambda b: b is not None and len(b) > 0)]
reviews = reviews[reviews['star_rating'].apply(lambda r: type(r) is int or r.isdigit())]
reviews['product_title'] = product_title
reviews.to_csv(output_file, sep='\t', index=False)
# # get metadata for camera products
# metadata = metadata[metadata['main_cat'] == 'Camera & Photo']
#
# # try to filter out camera accessories
# filter_words = ['accessor', 'battery', 'charger', 'tripod', 'strap', 'case', 'bag', 'book', 'filter', 'light', 'drive',
# 'backpack', 'kit', 'printer', 'adapter', 'album', 'surveillance', 'security', 'cctv', 'cassette']
# filter_pat = ''
# for word in filter_words:
# word_filter = '[' + word[0].upper() + word[0].lower() + ']' + word[1:]
# filter_pat += word_filter + '|'
# filter_pat = filter_pat[:-1]
# r = re.compile(filter_pat)
# metadata = metadata[~metadata['title'].str.contains(pat=filter_pat, na=False, regex=True)]
# metadata = metadata[~metadata['category'].apply(lambda cats: any(r.search(cat) for cat in cats))]
#
# for _, row in metadata.head(20).iterrows():
# print('features:', row['feature'])
# print('description:', row['description'])
# print('tech1:', row['tech1'])
# print('tech2:', row['tech2'])
# save_top_reviewed_products(3, 'Wrist Watches', 'amazon_data/meta_Clothing_Shoes_and_Jewelry.json',
# 'amazon_data/Clothing_Shoes_and_Jewelry.json', 'amazon_data/reviews_for_watches.tsv',
# 'watch')
save_reviews('Necklaces', 'agent/amazon_data/meta_Clothing_Shoes_and_Jewelry.json', 'agent/amazon_data/Clothing_Shoes_and_Jewelry.json', 'agent/target_extraction/data/verified_necklace_reviews.tsv')
@@ -20,7 +20,7 @@ LEARNING_RATE = 0.00002
MAX_GRAD_NORM = 1.0
# training
N_EPOCHS = 2
N_EPOCHS = 3
BATCH_SIZE = 32
WARM_UP_FRAC = 0.05
@@ -48,13 +48,13 @@ class BertEntityExtractor:
return extractor
@staticmethod
def train_and_validate(file_path, save_path, size=None, valid_frac=None, valid_file_path=None):
def train_and_validate(file_path, save_file, size=None, valid_frac=None, valid_file_path=None):
extractor = BertEntityExtractor()
extractor.train_with_file(file_path, save_path, size=size, valid_frac=valid_frac,
extractor.train_with_file(file_path, save_file, size=size, valid_frac=valid_frac,
valid_file_path=valid_file_path)
return extractor
def train_with_file(self, file_path, save_path, size=None, valid_frac=None, valid_file_path=None):
def train_with_file(self, file_path, save_file, size=None, valid_frac=None, valid_file_path=None):
# load training data
if valid_file_path is None:
train_data, valid_data = EntityDataset.from_file(file_path, size=size, valid_frac=valid_frac)
@@ -122,11 +122,11 @@ class BertEntityExtractor:
if valid_data is not None:
self.evaluate(data=valid_data)
torch.save(self.net.state_dict(), '{}.pt'.format(save_file))
end = time.time()
print('Training took', end - start, 'seconds')
torch.save(self.net.state_dict(), save_path)
def evaluate(self, file_path=None, data=None, size=None):
# load eval data
if file_path is not None:
......
@@ -19,7 +19,7 @@ LEARNING_RATE = 0.00002
MAX_GRAD_NORM = 1.0
# training
N_EPOCHS = 2
N_EPOCHS = 3
BATCH_SIZE = 16
WARM_UP_FRAC = 0.05
@@ -47,13 +47,13 @@ class BertRelExtractor:
return extractor
@staticmethod
def train_and_validate(file_path, save_path, size=None, valid_frac=None, valid_file_path=None):
def train_and_validate(file_path, save_file, size=None, valid_frac=None, valid_file_path=None):
extractor = BertRelExtractor()
extractor.train_with_file(file_path, save_path, size=size, valid_frac=valid_frac,
extractor.train_with_file(file_path, save_file, size=size, valid_frac=valid_frac,
valid_file_path=valid_file_path)
return extractor
def train_with_file(self, file_path, save_path, size=None, valid_frac=None, valid_file_path=None):
def train_with_file(self, file_path, save_file, size=None, valid_frac=None, valid_file_path=None):
# load training data
if valid_file_path is None:
train_data, valid_data = PairRelDataset.from_file(file_path, size=size, valid_frac=valid_frac)
@@ -121,11 +121,11 @@ class BertRelExtractor:
if valid_data is not None:
self.evaluate(data=valid_data)
torch.save(self.net.state_dict(), '{}.pt'.format(save_file))
end = time.time()
print('Training took', end - start, 'seconds')
torch.save(self.net.state_dict(), save_path)
def evaluate(self, file_path=None, data=None, size=None):
# load eval data
if file_path is not None:
......
@@ -5,6 +5,7 @@ import pandas as pd
import numpy as np
from ast import literal_eval
from agent.target_extraction.BERT.relation_extractor.pairbertnet import TRAINED_WEIGHTS, HIDDEN_OUTPUT_FEATURES
import os
MAX_SEQ_LEN = 128
RELATIONS = ['/has_feature', '/no_relation']
@@ -69,11 +70,12 @@ class PairRelDataset(Dataset):
return dataset
@staticmethod
def from_file(path, valid_frac=None, size=None):
if path.endswith('.json'):
dataset = PairRelDataset(pd.read_json(path, lines=True), size=size)
elif path.endswith('.tsv'):
dataset = PairRelDataset(pd.read_csv(path, sep='\t', error_bad_lines=False), size=size)
def from_file(file_name, valid_frac=None, size=None):
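# Descriptive note: file names are now resolved relative to this module's ../data directory
# rather than being passed as full paths.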
f = open(os.path.dirname(__file__) + '/../data/' + file_name)
if file_name.endswith('.json'):
dataset = PairRelDataset(pd.read_json(f, lines=True), size=size)
elif file_name.endswith('.tsv'):
dataset = PairRelDataset(pd.read_csv(f, sep='\t', error_bad_lines=False), size=size)
else:
raise AttributeError('Could not recognize file type')
......
@@ -111,7 +111,7 @@ class EntityAnnotator:
os.system('clear')
print(fg.li_green + '{} entities annotated'.format(self.n_annotated) + fg.rs)
print(fg.li_green + '{} nouns annotated'.format(self.n_annotated) + fg.rs)
print('')
print(fg.li_black + 'root: \'r\'' + fg.rs)
@@ -249,11 +250,12 @@ class EntityAnnotator:
def pair_relations_for_text(self, text, nan_entities=None):
single_tokens = word_tokenize(text)
all_tokens = set().union(*[single_tokens, self.phraser[single_tokens]])
tagged_single = pos_tag(single_tokens)
tagged_all = set().union(*[tagged_single, pos_tag(self.phraser[single_tokens])])
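# Descriptive note: POS tags are kept for both the single tokens and the phraser-merged
# tokens, so mention matching below can require noun (NN*) tags.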
entity_mentions = []
for n in PreOrderIter(self.root):
cont, mention = self.mention_in_text(all_tokens, node=n)
cont, mention = self.mention_in_text(tagged_all, node=n)
if not cont:
# many mentions of same entity
return None
@@ -269,7 +270,7 @@ class EntityAnnotator:
if nan_entities is not None and len(entity_mentions) == 1:
nan_mention = None
for term in nan_entities:
cont, mention = self.mention_in_text(all_tokens, term=term)
cont, mention = self.mention_in_text(tagged_all, term=term)
if not cont:
# many mentions of term
return None
@@ -286,10 +287,11 @@ class EntityAnnotator:
# returns True, (synonym of node / term / None) if there is exactly one or zero such occurrence,
# otherwise False, None, None
def mention_in_text(self, tokens, node=None, term=None):
def mention_in_text(self, tagged_tokens, node=None, term=None):
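# Descriptive note: a synonym now counts as a mention only when its token is tagged as a
# noun (tag starts with 'NN'), so non-noun uses of the same word are ignored.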
mention = None
for syn in ({syn.lower() for syn in self.synset[node]} if node is not None else {term}):
n_matches = sum(1 for token in tokens if syn == token.lower().replace('_', ' '))
n_matches = sum(1 for token, tag in tagged_tokens
if syn == token.lower().replace('_', ' ') and tag.startswith('NN'))
if n_matches > 1:
return False, None
if n_matches == 1:
@@ -301,11 +303,12 @@ class EntityAnnotator:
def entity_mentions_in_text(self, text, all_entities):
single_tokens = word_tokenize(text)
all_tokens = set().union(*[single_tokens, self.phraser[single_tokens]])
tagged_single = pos_tag(single_tokens)
tagged_all = set().union(*[tagged_single, pos_tag(self.phraser[single_tokens])])
entity_mention = None
for entity, is_aspect in all_entities:
cont, mention = self.mention_in_text(all_tokens, term=entity)
cont, mention = self.mention_in_text(tagged_all, term=entity)
if not cont:
# many mentions of same entity
return None
@@ -355,12 +358,5 @@ class EntityAnnotator:
return text, rels
ann: EntityAnnotator = EntityAnnotator.load_saved('acoustic_guitar_annotator.pickle')
ann.save_annotated_entities('BERT/data/annotated_acoustic_guitar_review_entities.tsv')
ann: EntityAnnotator = EntityAnnotator.load_saved('camera_entity_annotator.pickle')
ann.save_annotated_entities('BERT/data/annotated_camera_review_entities.tsv')
ann: EntityAnnotator = EntityAnnotator.load_saved('laptop_entity_annotator.pickle')
ann.save_annotated_entities('BERT/data/annotated_laptop_review_entities.tsv')
ann: EntityAnnotator = EntityAnnotator.load_saved('backpack_entity_annotator.pickle')
ann.save_annotated_entities('BERT/data/annotated_backpack_review_entities.tsv')
ea = EntityAnnotator.load_saved('example_annotator.pickle')
ea.annotate()
from agent.target_extraction.BERT.entity_extractor.bert_entity_extractor import BertEntityExtractor
from agent.target_extraction.BERT.relation_extractor.bert_rel_extractor import BertRelExtractor
from agent.target_extraction.BERT.data.combine_files import combine_files
# from agent.target_extraction.entity_annotation import EntityAnnotator
# ann: EntityAnnotator = EntityAnnotator.load_saved('acoustic_guitar_annotator.pickle')
# ann.save_annotated_pairs('BERT/data/annotated_acoustic_guitar_review_pairs.tsv')
# ann.save_annotated_entities('BERT/data/annotated_acoustic_guitar_review_entities.tsv')
# ann: EntityAnnotator = EntityAnnotator.load_saved('camera_entity_annotator.pickle')
# ann.save_annotated_pairs('BERT/data/annotated_camera_review_pairs.tsv')
# ann.save_annotated_entities('BERT/data/annotated_camera_review_entities.tsv')
# ann: EntityAnnotator = EntityAnnotator.load_saved('laptop_entity_annotator.pickle')
# ann.save_annotated_pairs('BERT/data/annotated_laptop_review_pairs.tsv')
# ann.save_annotated_entities('BERT/data/annotated_laptop_review_entities.tsv')
# ann: EntityAnnotator = EntityAnnotator.load_saved('backpack_entity_annotator.pickle')
# ann.save_annotated_pairs('BERT/data/annotated_backpack_review_pairs.tsv')
# ann.save_annotated_entities('BERT/data/annotated_backpack_review_entities.tsv')
#
# entity_files = [
# 'BERT/data/annotated_camera_review_entities.tsv',
# 'BERT/data/annotated_backpack_review_entities.tsv',
# 'BERT/data/annotated_laptop_review_entities.tsv',
# 'BERT/data/annotated_acoustic_guitar_review_entities.tsv',
# 'BERT/data/annotated_cardigan_review_entities.tsv'
# ]
# entity_output_file = 'BERT/data/camera_backpack_laptop_guitar_cardigan_review_entities_2.tsv'
#
# pair_files = [
# 'BERT/data/annotated_camera_review_pairs.tsv',
# 'BERT/data/annotated_backpack_review_pairs.tsv',
# 'BERT/data/annotated_laptop_review_pairs.tsv',
# 'BERT/data/annotated_acoustic_guitar_review_pairs.tsv',
# 'BERT/data/annotated_cardigan_review_pairs.tsv'
# ]
# pair_output_file = 'BERT/data/camera_backpack_laptop_guitar_cardigan_review_pairs_2.tsv'
#
# for n in range(1, 6):
# combine_files(entity_files[:n], 'BERT/data/review_entities_' + str(n) + '.tsv', total_size=50000)
# combine_files(pair_files[:n], 'BERT/data/review_pairs_' + str(n) + '.tsv', total_size=50000)
#
# combine_files(entity_files, entity_output_file)
# combine_files(pair_files, pair_output_file)
for n in range(1, 6):
BertRelExtractor.train_and_validate('review_pairs_' + str(n) + '.tsv',
'rel_extractor_' + str(n) + '_products')
BertEntityExtractor.train_and_validate('review_entities_' + str(n) + '.tsv',
'entity_extractor_' + str(n) + '_products')
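# Illustrative sketch only: train_and_validate returns the fitted extractor, so a held-out
# file could be scored afterwards via evaluate(file_path=...); 'held_out_entities.tsv' is a
# hypothetical placeholder, not a file referenced in this commit.
# extractor = BertEntityExtractor.train_and_validate('review_entities_5.tsv', 'entity_extractor_5_products')
# extractor.evaluate(file_path='held_out_entities.tsv')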
This diff is collapsed.