# review_tokenizer.py
1
from nltk.tokenize import TweetTokenizer
2
from nltk.corpus import stopwords
3
from agent.item import glossary
4
import string
5

6
class ReviewTokenizer:
    """Split review text into tokens and discard the uninformative ones.

    Class attributes:
        tokenizer: shared ``TweetTokenizer`` used to split the raw text.
        stop_words: the English stop-word list from ``nltk.corpus.stopwords``.
        flat_glossary: every entry of every value in ``glossary``, flattened
            into a single list.
    """

    tokenizer = TweetTokenizer()
    stop_words = stopwords.words('english')
    flat_glossary = [term for terms in glossary.values() for term in terms]

    def tokenize_review(self, review):
        """Tokenize *review* and return the tokens kept by ``reduce_noise``."""
        return self.reduce_noise(self.tokenizer.tokenize(review))

    def reduce_noise(self, tokens):
        """Lowercase *tokens* and filter out the ones treated as noise.

        A lowercased token is dropped when it is empty, is a substring of
        ``string.punctuation``, is in ``stop_words``, or is in
        ``flat_glossary``.

        Returns:
            A new list with the surviving lowercased tokens, in input order.
        """
        # Build the exclusion set once per call so each token costs one hash
        # lookup instead of two linear list scans. It is rebuilt from the
        # attributes on every call so that overriding ``stop_words`` or
        # ``flat_glossary`` on an instance or subclass keeps working.
        # NOTE(review): assumes glossary entries are hashable (strings) - confirm.
        excluded = set(self.stop_words)
        excluded.update(self.flat_glossary)

        # NOTE(review): ``in string.punctuation`` is a substring test, so a
        # multi-character token such as "..." is NOT removed unless it happens
        # to appear contiguously in string.punctuation. Kept as-is to preserve
        # existing behavior - confirm whether that is intended.
        lowered = (token.lower() for token in tokens)
        return [
            token
            for token in lowered
            if token
            and token not in string.punctuation
            and token not in excluded
        ]