Commit 7cc83e97 authored by Joel Oksanen

Implemented classifier in analyze_data.py

parent 7ce1a50e
@@ -3,3 +3,4 @@ amazon_reviews_us_Camera_v1_00.tsv.gz
 amazon_reviews_us_Digital_Video_Games_v1_00.tsv
 *.tsv
 .DS_Store
+__pycache__/
@@ -7,6 +7,8 @@ from anytree import Node, PostOrderIter
 from functools import reduce
 from matplotlib import pyplot
 from scipy.stats import pearsonr
+import pickle
+from review_tokenizer import tokenize_review
 sentiment_threshold = 0.3
@@ -33,6 +35,10 @@ price: ['price', 'value'],
 shipping: ['ship']
 }
+f = open('camera_review_classifier.pickle', 'rb')
+classifier = pickle.load(f)
+f.close()
 # extract phrases
 def extract_phrases(review_body):
     sentences = sent_tokenize(review_body)
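As an aside, the pickle load added above can also be written with a context manager, so the file handle is closed even if unpickling raises:

with open('camera_review_classifier.pickle', 'rb') as f:
    classifier = pickle.load(f)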
@@ -44,9 +50,17 @@ def extract_phrases(review_body):
 # analyze sentiment
 analyzer = SentimentIntensityAnalyzer()
 def get_sentiment(phrase):
     # get vader score
     vader_s = analyzer.polarity_scores(phrase)
     compound_s = vader_s['compound']
-    return compound_s
+    # get classification
+    tokens = tokenize_review(phrase)
+    classification = classifier.classify(dict([token, True] for token in tokens))
+    # return compound_s if the two agree
+    if (classification == '+' and compound_s > 0) or (classification == '-' and compound_s < 0):
+        return compound_s
+    else:
+        return 0
 # remove all ancestors of node in list l
 def remove_ancestors(node, l):
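A quick sanity check of the new agreement rule in get_sentiment above (hypothetical phrases; actual scores depend on the pickled classifier):

print(get_sentiment('this camera takes wonderful pictures'))  # positive compound score if VADER and the classifier agree
print(get_sentiment('the battery died after two days'))       # negative compound score if they agree, 0 otherwise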
@@ -161,6 +175,7 @@ all_reviews = pd.read_csv('camera_prepared_data.tsv', sep='\t', error_bad_lines=
 camera_strengths = []
 star_rating_averages = []
+products_analyzed = 0
 grouped = all_reviews.groupby('product_id')
 for product_id, reviews in grouped:
     # get ra
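Note that all_reviews.groupby('product_id') yields one (product_id, sub-DataFrame) pair per distinct product, so the new products_analyzed counter counts products, not individual reviews.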
@@ -189,6 +204,8 @@ for product_id, reviews in grouped:
     # store results
     camera_strengths.append(strengths[camera])
     star_rating_averages.append(star_rating_sum / review_count)
+    products_analyzed += 1
+    print(products_analyzed)
 # calculate Pearson's correlation
 correlation, _ = pearsonr(camera_strengths, star_rating_averages)
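For reference, pearsonr also returns a p-value that the line above discards; a sketch that keeps and reports it (hypothetical formatting):

correlation, p_value = pearsonr(camera_strengths, star_rating_averages)
print('Pearson r = %.3f (p = %.3g)' % (correlation, p_value))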
......
@@ -4,12 +4,18 @@ import pandas as pd
 import re
 data_location = 'amazon_reviews_us_Camera_v1_00.tsv'
+training_data_location = 'reviews_trained_on.tsv'
 output_location = 'camera_prepared_data.tsv'
 min_reviews = 50
-min_characters = 25
-n = 100
+min_characters = 50
+n = 500
 reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False)
+# drop reviews used for training
+training_reviews = pd.read_csv(training_data_location, sep='\t', error_bad_lines=False)
+reviews = pd.concat([reviews, training_reviews])
+reviews = reviews.drop_duplicates(keep=False)
 # drop reviews with empty review body
 reviews = reviews[~reviews['review_body'].isnull()]
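The concat-then-drop_duplicates(keep=False) idiom added above removes every review that also appears in the training set: training rows occur twice after the concat, so both copies are dropped. A toy illustration with hypothetical frames:

import pandas as pd

a = pd.DataFrame({'review_id': [1, 2, 3]})   # all reviews
b = pd.DataFrame({'review_id': [2]})         # reviews used for training
# rows from b appear twice after the concat, so keep=False drops both copies
print(pd.concat([a, b]).drop_duplicates(keep=False))  # only review_ids 1 and 3 remain

Note this idiom also drops any duplicate rows already present within reviews itself.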
......
from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer()
def tokenize_review(review):
    return tokenizer.tokenize(review)
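For illustration, the shared tokenizer just wraps NLTK's TweetTokenizer, which keeps punctuation as separate tokens (hypothetical input):

print(tokenize_review("Great camera, love it!"))
# ['Great', 'camera', ',', 'love', 'it', '!']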
......
 import pandas as pd
 from nltk.tokenize import sent_tokenize
 import re
-from nltk.tokenize import TweetTokenizer
+from review_tokenizer import tokenize_review
 import random
 from nltk import classify, NaiveBayesClassifier
 import pickle
@@ -14,10 +14,6 @@ max_characters = 100
 n = 50000
 train_factor = 0.7
 separators = ' but |although|though|otherwise|however|unless|whereas|despite|<br />'
-tokenizer = TweetTokenizer()
-def tokenize_review(review):
-    return tokenizer.tokenize(review)
 def get_tokens(tokenized_reviews_list):
     for review in tokenized_reviews_list:
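The separators pattern is presumably applied with re.split to break a review into clauses whose sentiment can differ (an assumption; the splitting code is not shown in this hunk):

import re
separators = ' but |although|though|otherwise|however|unless|whereas|despite|<br />'
print(re.split(separators, 'nice photos but the battery drains fast'))
# ['nice photos', 'the battery drains fast']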
......
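Finally, a minimal sketch of how camera_review_classifier.pickle could be produced by this training script (assumed from the imports of NaiveBayesClassifier and pickle above; the real feature extraction, clause splitting, and train/test split are not shown here):

import pickle
from nltk import NaiveBayesClassifier
from review_tokenizer import tokenize_review

def to_features(review):
    # same bag-of-words features used by get_sentiment in analyze_data.py
    return dict((token, True) for token in tokenize_review(review))

# hypothetical labeled phrases; '+' and '-' match the labels checked in get_sentiment
train_set = [(to_features('love this camera'), '+'),
             (to_features('stopped working after a week'), '-')]

classifier = NaiveBayesClassifier.train(train_set)
with open('camera_review_classifier.pickle', 'wb') as f:
    pickle.dump(classifier, f)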