Commit a56456f5 authored by Joel Oksanen

Created script for preparing metadata

parent af560174
@@ -2,4 +2,5 @@
 *.pt
 __pycache__/
 server/agent/amazon_data/
+server/agent/target_extraction/data/
 .DS_Store
@@ -8,10 +8,10 @@ import time
 import numpy as np
 from sklearn import metrics
 
-semeval_2014_train_path = 'data/SemEval-2014/Laptop_Train_v2.xml'
-semeval_2014_test_path = 'data/SemEval-2014/Laptops_Test_Gold.xml'
+semeval_2014_train_path = 'agent/SA/data/SemEval-2014/Laptop_Train_v2.xml'
+semeval_2014_test_path = 'agent/SA/data/SemEval-2014/Laptops_Test_Gold.xml'
 amazon_test_path = 'agent/SA/data/Amazon/annotated_amazon_laptop_reviews.xml'
-trained_model_path = 'semeval_2014_2.pt'
+trained_model_path = 'agent/SA/semeval_2014_2.pt'
 
 BATCH_SIZE = 32
 MAX_EPOCHS = 6
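Note: the rewritten paths assume the script is now launched from the server/ directory. A minimal alternative sketch, not part of this commit, that resolves the files relative to the module itself (assuming the script lives in server/agent/SA/, next to data/ and the checkpoint):

from pathlib import Path

# assumption: this file sits in server/agent/SA/
BASE_DIR = Path(__file__).resolve().parent

semeval_2014_train_path = str(BASE_DIR / 'data/SemEval-2014/Laptop_Train_v2.xml')
trained_model_path = str(BASE_DIR / 'semeval_2014_2.pt')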
@@ -19,10 +19,10 @@ class ConceptNet:
         obj = requests.get(self.url + uri).json()
         return obj['value']
 
-    def append_synonyms(self, feature, rel, synonyms, lock):
+    def append_result(self, feature, rel, result_set, lock):
         rels = self.find_rels(feature, rel)
         lock.acquire()
-        synonyms.update(rels)
+        result_set.update(rels)
         lock.release()
 
     def parent_check(self, node, parent, synonyms):
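The rename generalizes append_synonyms so the same worker can fill any shared result set, not just a synonym set. As a side note, the manual acquire/release pair could also use the lock's context-manager form, which releases the lock even if find_rels raises; a sketch, not part of this commit:

def append_result(self, feature, rel, result_set, lock):
    rels = self.find_rels(feature, rel)
    with lock:  # released automatically, even on exceptions
        result_set.update(rels)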
@@ -40,14 +40,14 @@ class ConceptNet:
             self.parent_check(node, parent.parent, synonyms)
 
     def sem_synonyms_for_node(self, node):
-        rels = ['DefinedAs', 'Synonym', 'IsA', 'RelatedTo']
+        rels = ['DefinedAs', 'Synonym', 'IsA', 'RelatedTo']  # SimilarTo? FormOf?
         synonyms = set()
         lock = threading.Lock()
         threads = []
         for rel in rels:
-            t = threading.Thread(target=self.append_synonyms, args=(node.name, rel, synonyms, lock))
+            t = threading.Thread(target=self.append_result, args=(node.name, rel, synonyms, lock))
             t.start()
             threads.append(t)
         for t in threads:
             t.join()
@@ -57,8 +57,26 @@ class ConceptNet:
         return synonyms
 
+    def sub_features_for_node(self, node):
+        rels = ['UsedFor', 'HasA', 'CapableOf', 'Causes', 'HasSubevent', 'HasProperty', 'MadeOf']
+        features = set()
+        lock = threading.Lock()
+        threads = []
+        for rel in rels:
+            t = threading.Thread(target=self.append_result, args=(node.name, rel, features, lock))
+            t.start()
+            threads.append(t)
+        for t in threads:
+            t.join()
+        return features
+
 net = ConceptNet()
-parent = Node(str(sys.argv[1]))
-child = Node(str(sys.argv[2]), parent=parent)
-syns = net.sem_synonyms_for_node(child)
-print(syns)
+# parent = Node(str(sys.argv[1]))
+# child = Node(str(sys.argv[2]), parent=parent)
+# syns = net.sem_synonyms_for_node(child)
+# print(syns)
+node = Node('camera')
+print(net.sub_features_for_node(node))
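The new sub_features_for_node mirrors sem_synonyms_for_node: one thread per ConceptNet relation, with a lock guarding the shared set. The same fan-out could be written with concurrent.futures, which avoids the explicit lock by unioning results in the calling thread; a sketch assuming the existing find_rels(name, rel) method, not part of this commit:

from concurrent.futures import ThreadPoolExecutor

def sub_features_for_node(self, node):
    rels = ['UsedFor', 'HasA', 'CapableOf', 'Causes', 'HasSubevent', 'HasProperty', 'MadeOf']
    features = set()
    # each worker returns its own result set; the union happens in this thread only,
    # so no lock is needed
    with ThreadPoolExecutor(max_workers=len(rels)) as executor:
        for found in executor.map(lambda rel: self.find_rels(node.name, rel), rels):
            features.update(found)
    return features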
import pandas as pd
import gzip
import json
import re

output_location = 'target_extraction/data/camera_metadata.tsv'

def parse(path):
    g = gzip.open(path, 'rb')
    for line in g:
        yield json.loads(line)

def get_df(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')
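Since the metadata file is gzipped JSON Lines, the parse/get_df pair could in principle be replaced by a single pandas call; a one-line alternative, untested against this particular dump:

metadata = pd.read_json('amazon_data/meta_Electronics.json.gz', lines=True, compression='gzip')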
metadata = get_df('amazon_data/meta_Electronics.json.gz')

for col in metadata.columns:
    print(col)

# get metadata for camera products
metadata = metadata[metadata['main_cat'] == 'Camera & Photo']
# try to filter out camera accessories
filter_words = ['accessor', 'battery', 'charger', 'tripod', 'strap', 'case', 'bag', 'book', 'filter', 'light', 'drive',
                'backpack', 'kit', 'printer', 'adapter', 'album', 'surveillance', 'security', 'cctv', 'cassette']

filter_pat = ''
for word in filter_words:
    word_filter = '[' + word[0].upper() + word[0].lower() + ']' + word[1:]
    filter_pat += word_filter + '|'
filter_pat = filter_pat[:-1]

r = re.compile(filter_pat)
metadata = metadata[~metadata['title'].str.contains(pat=filter_pat, na=False, regex=True)]
metadata = metadata[~metadata['category'].apply(lambda cats: any(r.search(cat) for cat in cats))]
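The loop builds a pattern of the form [Aa]ccessor|[Bb]attery|..., so only the first letter of each word is case-insensitive; fully upper-case titles (e.g. 'CCTV') slip through. A slightly broader construction using a case-insensitive flag instead; a sketch, not what the commit does:

filter_pat = '|'.join(filter_words)
r = re.compile(filter_pat, re.IGNORECASE)
metadata = metadata[~metadata['title'].str.contains(filter_pat, case=False, na=False)]
metadata = metadata[~metadata['category'].apply(lambda cats: any(r.search(cat) for cat in cats))]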
for _, row in metadata.head(20).iterrows():
    print('features:', row['feature'])
    print('description:', row['description'])
    print('tech1:', row['tech1'])
    print('tech2:', row['tech2'])
metadata.to_csv(output_location, sep='\t', index=False)
print('Successfully prepared data for', len(metadata.index), 'products')
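Downstream consumers can read the output back with the matching separator; note that list-valued columns such as category round-trip as plain strings through TSV:

camera_metadata = pd.read_csv('target_extraction/data/camera_metadata.tsv', sep='\t')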