Commit a56456f5 authored by Joel Oksanen

Created script for preparing metadata

parent af560174
@@ -2,4 +2,5 @@
 *.pt
 __pycache__/
 server/agent/amazon_data/
+server/agent/target_extraction/data/
 .DS_Store
@@ -8,10 +8,10 @@ import time
 import numpy as np
 from sklearn import metrics
-semeval_2014_train_path = 'data/SemEval-2014/Laptop_Train_v2.xml'
+semeval_2014_train_path = 'agent/SA/data/SemEval-2014/Laptop_Train_v2.xml'
-semeval_2014_test_path = 'data/SemEval-2014/Laptops_Test_Gold.xml'
+semeval_2014_test_path = 'agent/SA/data/SemEval-2014/Laptops_Test_Gold.xml'
 amazon_test_path = 'agent/SA/data/Amazon/annotated_amazon_laptop_reviews.xml'
-trained_model_path = 'semeval_2014_2.pt'
+trained_model_path = 'agent/SA/semeval_2014_2.pt'
 BATCH_SIZE = 32
 MAX_EPOCHS = 6
...
@@ -19,10 +19,10 @@ class ConceptNet:
         obj = requests.get(self.url + uri).json()
         return obj['value']

-    def append_synonyms(self, feature, rel, synonyms, lock):
+    def append_result(self, feature, rel, result_set, lock):
         rels = self.find_rels(feature, rel)
         lock.acquire()
-        synonyms.update(rels)
+        result_set.update(rels)
         lock.release()

     def parent_check(self, node, parent, synonyms):
@@ -40,14 +40,14 @@ class ConceptNet:
             self.parent_check(node, parent.parent, synonyms)

     def sem_synonyms_for_node(self, node):
-        rels = ['DefinedAs', 'Synonym', 'IsA', 'RelatedTo']
+        rels = ['DefinedAs', 'Synonym', 'IsA', 'RelatedTo']  # SimilarTo? FormOf?
         synonyms = set()
         lock = threading.Lock()
         threads = []
         for rel in rels:
-            t = threading.Thread(target=self.append_synonyms, args=(node.name, rel, synonyms, lock))
+            t = threading.Thread(target=self.append_result, args=(node.name, rel, synonyms, lock))
             t.start()
             threads.append(t)
         for t in threads:
@@ -57,8 +57,26 @@ class ConceptNet:
         return synonyms

+    def sub_features_for_node(self, node):
+        rels = ['UsedFor', 'HasA', 'CapableOf', 'Causes', 'HasSubevent', 'HasProperty', 'MadeOf']
+        features = set()
+        lock = threading.Lock()
+        threads = []
+        for rel in rels:
+            t = threading.Thread(target=self.append_result, args=(node.name, rel, features, lock))
+            t.start()
+            threads.append(t)
+        for t in threads:
+            t.join()
+        return features

 net = ConceptNet()
-parent = Node(str(sys.argv[1]))
-child = Node(str(sys.argv[2]), parent=parent)
-syns = net.sem_synonyms_for_node(child)
-print(syns)
+# parent = Node(str(sys.argv[1]))
+# child = Node(str(sys.argv[2]), parent=parent)
+# syns = net.sem_synonyms_for_node(child)
+# print(syns)
+node = Node('camera')
+print(net.sub_features_for_node(node))
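Note: find_rels itself is not part of this diff, but from the requests.get(self.url + uri) call above it appears to wrap a query against the public ConceptNet web API. A minimal sketch of such a query is given below; the endpoint, parameters and field names are assumptions about that API, not code from this commit.

import requests

def find_rels_sketch(concept, rel, limit=50):
    # Ask ConceptNet for edges of the given relation that start at the concept,
    # e.g. concept='camera', rel='UsedFor'.
    params = {'start': '/c/en/' + concept, 'rel': '/r/' + rel, 'limit': limit}
    obj = requests.get('http://api.conceptnet.io/query', params=params).json()
    # Keep the English labels at the far end of each returned edge.
    return {edge['end']['label'] for edge in obj.get('edges', [])
            if edge['end'].get('language') == 'en'}

Called with the relations listed in sub_features_for_node, such a query would yield a set of candidate sub-feature terms for the node.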
import pandas as pd
import gzip
import json
import re
output_location = 'target_extraction/data/camera_metadata.tsv'
def parse(path):
    g = gzip.open(path, 'rb')
    for line in g:
        yield json.loads(line)
def get_df(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')
metadata = get_df('amazon_data/meta_Electronics.json.gz')
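For reference, the gzipped JSON-lines metadata file can also be loaded in a single call with pandas' built-in reader; a sketch assuming a reasonably recent pandas version, equivalent in effect to parse/get_df above:

metadata = pd.read_json('amazon_data/meta_Electronics.json.gz', lines=True, compression='gzip')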
for col in metadata.columns:
    print(col)
# get metadata for camera products
metadata = metadata[metadata['main_cat'] == 'Camera & Photo']
# try to filter out camera accessories
filter_words = ['accessor', 'battery', 'charger', 'tripod', 'strap', 'case', 'bag', 'book', 'filter', 'light', 'drive',
                'backpack', 'kit', 'printer', 'adapter', 'album', 'surveillance', 'security', 'cctv', 'cassette']
filter_pat = ''
for word in filter_words:
    word_filter = '[' + word[0].upper() + word[0].lower() + ']' + word[1:]
    filter_pat += word_filter + '|'
filter_pat = filter_pat[:-1]
r = re.compile(filter_pat)
metadata = metadata[~metadata['title'].str.contains(pat=filter_pat, na=False, regex=True)]
metadata = metadata[~metadata['category'].apply(lambda cats: any(r.search(cat) for cat in cats))]
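To make the accessory filter concrete: the loop above produces a pattern of the form [Aa]ccessor|[Bb]attery|[Cc]harger|[Tt]ripod|..., i.e. each word matches with either case of its first letter only. A sketch of an alternative, fully case-insensitive construction (reusing the script's filter_words; a design alternative, not what the commit does):

r_alt = re.compile('|'.join(filter_words), re.IGNORECASE)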
for _, row in metadata.head(20).iterrows():
    print('features:', row['feature'])
    print('description:', row['description'])
    print('tech1:', row['tech1'])
    print('tech2:', row['tech2'])
metadata.to_csv(output_location, sep='\t', index=False)
print('Successfully prepared data for', len(metadata.index), 'products')
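A quick sanity check of the output (a usage sketch, not part of the commit) is to reload the TSV and confirm the row count; note that list-valued columns such as category are written out as their string representations:

check = pd.read_csv(output_location, sep='\t')
print(len(check.index), 'products reloaded from', output_location)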