Commit 5039567d authored by Joel Oksanen's avatar Joel Oksanen
Browse files

Created simple token-based metadata target extractor

parent a56456f5
import pandas as pd
import ast
from collections import Counter
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
from gensim.models.phrases import Phrases, Phraser
# Module-level NLP resources shared by TargetExtractor's static helpers:
# the English stopword list (used by is_noun to filter candidates) and a
# WordNet lemmatizer (used by singular to normalize plurals).
stop_words = stopwords.words('english')
wnl = WordNetLemmatizer()
class TargetExtractor:
    """Token-based extractor of candidate opinion targets from product metadata.

    Loads a TSV of product metadata, flattens its list-valued text columns
    ('feature', 'description', 'tech1', 'tech2'), and surfaces the most
    frequent nouns and noun bigrams as candidate targets.
    """

    def __init__(self, metadata_path, n=50):
        """Load metadata and print the top-n common nouns from 'feature'.

        :param metadata_path: path to a tab-separated metadata file whose
            text columns hold Python-literal lists serialized as strings.
        :param n: how many top candidates to report (default 50, matching
            the previous hard-coded value).
        """
        # NOTE(review): error_bad_lines is deprecated since pandas 1.3;
        # switch to on_bad_lines='skip' once the pandas floor allows it.
        self.metadata = pd.read_csv(metadata_path, sep='\t', error_bad_lines=False)
        self.features = self.get_all('feature')
        self.descriptions = self.get_all('description')
        self.tech1 = self.get_all('tech1')
        self.tech2 = self.get_all('tech2')
        # Debug output: corpus sizes, then the extracted candidates.
        print(len(self.features), len(self.descriptions), len(self.tech1), len(self.tech2))
        print('features:', TargetExtractor.get_common_nouns(self.features, n))
        # print('descriptions:', TargetExtractor.get_common_nouns(self.descriptions, n))
        # print('tech1:', TargetExtractor.get_common_nouns(self.tech1, n))

    @staticmethod
    def get_common_nouns(phrases, n):
        """Return the n most common noun unigrams/bigrams in `phrases`.

        :param phrases: iterable of raw phrase strings.
        :param n: number of (token, count) pairs to return.
        :return: list of (token, count) tuples, most frequent first;
            bigrams appear joined with '_' (gensim's Phraser convention).
        """
        # Tokenize, lowercase, and lemmatize every phrase; pre-existing
        # underscores are split so they can't collide with Phraser output.
        tokenized_normalized = [[TargetExtractor.singular(w.lower()) for w in word_tokenize(phrase.replace('_', ' '))]
                                for phrase in phrases]
        # Train a bigram detector on the normalized corpus itself.
        tokenized_phrases = Phrases(tokenized_normalized)
        bigrammer = Phraser(tokenized_phrases)
        nouns = []
        for phrase in tokenized_normalized:
            pos_tags = pos_tag(phrase)
            bigrams = bigrammer[phrase]
            # word_idx walks the untouched token stream in parallel with the
            # bigrammed stream; assumes Phraser only merges adjacent tokens
            # (never drops or reorders them) — TODO confirm for this gensim version.
            word_idx = 0
            for token in bigrams:
                if '_' in token:
                    # Merged n-gram: keep it if any constituent word is a noun.
                    words = token.split('_')
                    if any(TargetExtractor.is_noun(pos_tags[i]) for i in range(word_idx, word_idx + len(words))):
                        nouns.append(token)
                    word_idx += len(words)
                else:
                    if TargetExtractor.is_noun(pos_tags[word_idx]):
                        nouns.append(token)
                    word_idx += 1
        c = Counter(nouns)
        return c.most_common(n)

    @staticmethod
    def singular(word):
        """Lemmatize `word` (default WordNet POS is noun, so plurals collapse)."""
        return wnl.lemmatize(word)

    @staticmethod
    def is_noun(pos_tagged):
        """True if a (word, tag) pair is a noun that is neither punctuation nor a stopword."""
        word, tag = pos_tagged
        return tag.startswith('NN') and word.lower() not in string.punctuation and word not in stop_words

    def get_all(self, col):
        """Flatten the list-valued column `col`, skipping missing cells.

        Each non-null cell is a Python-literal list serialized as a string;
        ast.literal_eval parses it without executing code.
        """
        return [item for items in self.metadata[col].dropna()
                for item in ast.literal_eval(items)]

    def extract(self):
        # TODO: not yet implemented — extraction currently happens as a
        # side effect of __init__'s debug printing.
        pass
TargetExtractor('data/camera_metadata.tsv')
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment