Commit 7ce1a50e authored by Joel Oksanen

Implemented script to train review classifier

parent c26a85bc
import pickle
import random
import re

import pandas as pd
from nltk import classify, NaiveBayesClassifier
from nltk.tokenize import sent_tokenize, TweetTokenizer
data_location = 'amazon_reviews_us_Camera_v1_00.tsv'
selected_reviews_output_location = 'reviews_trained_on.tsv'
classifier_location = 'camera_review_classifier.pickle'
min_characters = 0    # minimum review_body length
max_characters = 100  # maximum review_body length
n = 50000             # total number of reviews used for training and testing (half positive, half negative)
train_factor = 0.7    # fraction of the dataset used for training
separators = ' but |although|though|otherwise|however|unless|whereas|despite|<br />'  # patterns marking multi-phrase reviews
tokenizer = TweetTokenizer()
def tokenize_review(review):
    return tokenizer.tokenize(review)

def get_tokens(tokenized_reviews_list):
    for review in tokenized_reviews_list:
        yield dict([token.lower(), True] for token in review)
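# Example of the featureset format NLTK's NaiveBayesClassifier expects:
# tokenize_review("Great little camera!") -> ['Great', 'little', 'camera', '!'],
# for which get_tokens yields {'great': True, 'little': True, 'camera': True, '!': True}.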
#####
reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False)
# drop reviews with empty review body
reviews = reviews[~reviews['review_body'].isnull()]
# select reviews with specified review_body length
reviews = reviews[reviews['review_body'].apply(lambda x: min_characters <= len(str(x)) <= max_characters)]
# filter out reviews with more than one phrase
reviews = reviews[~reviews['review_body'].str.contains(pat=separators, regex=True)]
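# Example: a review body such as "Good pictures but the battery drains fast"
# matches the ' but ' separator and is dropped, keeping only single-phrase
# (and hence more likely single-sentiment) reviews.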
# pick out highly positive and negative reviews
positive_reviews = reviews[reviews['star_rating'] == 5]
negative_reviews = reviews[reviews['star_rating'] == 1]
# take first n/2 positive and negative reviews
positive_reviews = positive_reviews.head(round(n/2))
negative_reviews = negative_reviews.head(round(n/2))
print("Obtained ", len(positive_reviews), " positive and ", len(negative_reviews), " negative reviews")
# save reviews used for training
pd.concat([positive_reviews, negative_reviews]).to_csv(selected_reviews_output_location, sep='\t', index=False)
# tokenize reviews
positive_reviews_list = list(map(tokenize_review, positive_reviews['review_body']))
negative_reviews_list = list(map(tokenize_review, negative_reviews['review_body']))
# obtain review tokens for model
positive_tokens = get_tokens(positive_reviews_list)
negative_tokens = get_tokens(negative_reviews_list)
# obtain train and test data
positive_dataset = [(tokens, "+") for tokens in positive_tokens]
negative_dataset = [(tokens, "-") for tokens in negative_tokens]
dataset = positive_dataset + negative_dataset
random.shuffle(dataset)
split_index = round(train_factor * len(dataset))
train_data = dataset[:split_index]
test_data = dataset[split_index:]
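# With n = 50000 and train_factor = 0.7 this gives roughly 35000 training and
# 15000 test examples (fewer if the corpus contains fewer qualifying reviews).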
# train classifier
classifier = NaiveBayesClassifier.train(train_data)
print("Accuracy is:", classify.accuracy(classifier, test_data))
classifier.show_most_informative_features(10)
# save classifier
with open(classifier_location, 'wb') as f:
    pickle.dump(classifier, f)
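A minimal usage sketch, assuming the classifier_location path defined above: the pickled classifier can be reloaded and applied to a new review by rebuilding the same lowercased token dict used during training.

import pickle
from nltk.tokenize import TweetTokenizer

# Load the classifier saved by the training script.
with open('camera_review_classifier.pickle', 'rb') as f:
    classifier = pickle.load(f)

tokenizer = TweetTokenizer()

def classify_review(review):
    # Build the same lowercased bag-of-words feature dict used during training.
    features = dict([token.lower(), True] for token in tokenizer.tokenize(review))
    return classifier.classify(features)  # returns '+' or '-'

print(classify_review("The lens is sharp and the battery lasts all day"))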