Commit f1cba1d6 authored by Joel Oksanen

Improved classifier noise reduction. Implemented pure classifier semantic analysis in analyze_data.py with Pearson coefficient 0.62
parent 7cc83e97
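A note on the "Pearson coefficient 0.62" in the message: the diff below imports scipy.stats.pearsonr, so the figure is presumably the correlation between get_sentiment output and the reviews' star ratings. The evaluation code itself is not part of this commit, so the following is only a sketch; the star_rating column name is an assumption (it is the standard field in the Amazon review TSVs that match the sep='\t' reads below):

    from scipy.stats import pearsonr

    # hypothetical evaluation: correlate predicted sentiment with star ratings
    predicted = [get_sentiment(body) for body in reviews['review_body']]
    stars = list(reviews['star_rating'])  # assumed column name

    r, p = pearsonr(predicted, stars)
    print('Pearson r = %.2f (p = %.3g)' % (r, p))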
@@ -8,9 +8,9 @@ from functools import reduce
 from matplotlib import pyplot
 from scipy.stats import pearsonr
 import pickle
-from review_tokenizer import tokenize_review
+from review_tokenizer import tokenize_review, reduce_noise
-sentiment_threshold = 0.3
+sentiment_threshold = 0.95
 camera = Node('camera')
 image = Node('image', parent=camera)
@@ -31,7 +31,7 @@ video: ['video'],
 battery: ['battery'],
 flash: ['flash'],
 audio: ['audio', 'sound'],
-price: ['price', 'value'],
+price: ['price', 'value', 'cost'],
 shipping: ['ship']
 }
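For context, the Node objects above appear to come from anytree (Node('image', parent=camera) matches its API), and this dict maps each aspect node to the surface tokens that signal it. A minimal sketch of resolving a token back to its aspect; the helper name and the price node's parent are assumptions, not code from the repository:

    from anytree import Node

    camera = Node('camera')
    price = Node('price', parent=camera)  # parent is an assumption; only image's parent is shown

    synonyms = {
        camera: ['camera'],
        price: ['price', 'value', 'cost'],
    }

    def aspect_for_token(token):
        # hypothetical helper: map a review token back to its aspect node
        for node, words in synonyms.items():
            if token in words:
                return node
        return None

    print(aspect_for_token('cost'))  # Node('/camera/price')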
@@ -51,16 +51,19 @@ def extract_phrases(review_body):
 analyzer = SentimentIntensityAnalyzer()
 def get_sentiment(phrase):
     # get vader score
-    vader_s = analyzer.polarity_scores(phrase)
-    compound_s = vader_s['compound']
+    # vader_s = analyzer.polarity_scores(phrase)
+    # compound_s = vader_s['compound']
     # get classification
-    tokens = tokenize_review(phrase)
-    classification = classifier.classify(dict([token, True] for token in tokens))
+    tokens = reduce_noise(tokenize_review(phrase))
+    prob_classification = classifier.prob_classify(dict([token, True] for token in tokens))
     # return compound_s if the two agree
-    if (classification == '+' and compound_s > 0) or (classification == '-' and compound_s < 0):
-        return compound_s
-    else:
-        return 0
+    # if (classification == '+' and compound_s > 0) or (classification == '-' and compound_s < 0):
+    #     return compound_s
+    # else:
+    #     return 0
+    classification = prob_classification.max()
+    strength = (prob_classification.prob(classification) - 0.5) * 2
+    return strength if classification == '+' else -strength
 # remove all ancestors of node in list l
 def remove_ancestors(node, l):
......
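The replacement scoring drops VADER entirely: prob_classify returns a probability distribution over the '+'/'-' labels, max() picks the winner, and (p - 0.5) * 2 rescales the winning probability from [0.5, 1] to a strength in [0, 1], negated for '-'. With sentiment_threshold raised to 0.95, only phrases the classifier assigns at least 0.975 probability survive. A minimal self-contained check of the mapping, using a toy classifier in place of the unpickled one:

    from nltk import NaiveBayesClassifier

    # toy training data in the same token-flag dict format as get_tokens
    train = [
        ({'great': True, 'camera': True}, '+'),
        ({'awful': True, 'battery': True}, '-'),
    ]
    clf = NaiveBayesClassifier.train(train)

    dist = clf.prob_classify({'great': True})
    label = dist.max()                        # '+' or '-'
    strength = (dist.prob(label) - 0.5) * 2   # rescale [0.5, 1] -> [0, 1]
    score = strength if label == '+' else -strength
    print(label, round(score, 2))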
@@ -10,7 +10,7 @@ min_reviews = 50
 min_characters = 50
 n = 500
-reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False)ß
+reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False)
 # drop reviews used for training
 training_reviews = pd.read_csv(training_data_location, sep='\t', error_bad_lines=False)
......
 from nltk.tokenize import TweetTokenizer
+from nltk.corpus import stopwords
+import string
 tokenizer = TweetTokenizer()
+stop_words = stopwords.words('english')
 def tokenize_review(review):
     return tokenizer.tokenize(review)
+def reduce_noise(tokens):
+    lowercase_tokens = list(map(lambda s: s.lower(), tokens))
+    return list(filter(lambda s: len(s) > 0 and s not in string.punctuation and s not in stop_words, lowercase_tokens))
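A quick check of the new tokenizer pipeline (assumes the NLTK stopwords corpus has been fetched via nltk.download('stopwords')):

    from review_tokenizer import tokenize_review, reduce_noise

    tokens = tokenize_review("The battery is not great, but the price was right!")
    print(reduce_noise(tokens))
    # ['battery', 'great', 'price', 'right']

Note that NLTK's English stop list includes negations such as 'not', so this filter discards them, which is worth keeping in mind for a sentiment classifier.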
 import pandas as pd
 from nltk.tokenize import sent_tokenize
 import re
-from review_tokenizer import tokenize_review
+from review_tokenizer import tokenize_review, reduce_noise
 import random
 from nltk import classify, NaiveBayesClassifier
 import pickle
@@ -17,7 +17,7 @@ separators = ' but |although|though|otherwise|however|unless|whereas|despite|<br
 def get_tokens(tokenized_reviews_list):
     for review in tokenized_reviews_list:
-        yield dict([token.lower(), True] for token in review)
+        yield dict([token, True] for token in review)
 #####
@@ -45,9 +45,9 @@ print("Obtained ", len(positive_reviews), " positive and ", len(negative_reviews
 # save reviews used for training
 pd.concat([positive_reviews, negative_reviews]).to_csv(selected_reviews_output_location, sep='\t', index=False)
-# tokenize reviews
-positive_reviews_list = list(map(tokenize_review, positive_reviews['review_body']))
-negative_reviews_list = list(map(tokenize_review, negative_reviews['review_body']))
+# tokenize reviews and reduce noise
+positive_reviews_list = list(map(lambda r: reduce_noise(tokenize_review(r)), positive_reviews['review_body']))
+negative_reviews_list = list(map(lambda r: reduce_noise(tokenize_review(r)), negative_reviews['review_body']))
 # obtain review tokens for model
 positive_tokens = get_tokens(positive_reviews_list)
......
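The rest of the training script is cut off above. For orientation, a sketch of how get_tokens output would typically feed NaiveBayesClassifier given the imports shown (classify, NaiveBayesClassifier, random, pickle); the labelling step, split ratio, negative_tokens name, and pickle path are assumptions, not the repository's actual code:

    # label the token dicts and shuffle (assumed; this step is not shown above)
    dataset = [(tokens, '+') for tokens in positive_tokens] + \
              [(tokens, '-') for tokens in negative_tokens]
    random.shuffle(dataset)

    split = int(0.8 * len(dataset))
    train_data, test_data = dataset[:split], dataset[split:]

    classifier = NaiveBayesClassifier.train(train_data)
    print('accuracy:', classify.accuracy(classifier, test_data))

    # persist for analyze_data.py to unpickle as `classifier` (hypothetical path)
    with open('classifier.pickle', 'wb') as f:
        pickle.dump(classifier, f)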