train_classifier.py
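"""Train an NLTK Naive Bayes sentiment classifier on Amazon camera reviews.

Loads the review TSV, keeps short single-phrase 5-star (positive) and 1-star
(negative) reviews, tokenizes and noise-reduces them with the project's
review_tokenizer, trains the classifier, reports hold-out accuracy, and
pickles the trained model to disk.
"""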
import pandas as pd
from server.agent.review_tokenizer import tokenize_review, reduce_noise
import random
from nltk import classify, NaiveBayesClassifier
import pickle

data_location = 'amazon_reviews_us_Camera_v1_00.tsv'
selected_reviews_output_location = 'reviews_trained_on.tsv'
classifier_location = 'camera_review_classifier.pickle'
min_characters = 0
max_characters = 100
n = 50000
train_factor = 0.7
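# Reviews whose body matches any of these separators contain contrasting or
# multi-clause phrasing (e.g. "great zoom but poor battery") and are dropped
# below so each training example carries a single overall sentiment.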
separators = ' but |although|though|otherwise|however|unless|whereas|despite|<br />'

def get_tokens(tokenized_reviews_list):
    """Yield one NLTK-style featureset ({token: True, ...}) per tokenized review."""
    for review in tokenized_reviews_list:
        yield {token: True for token in review}

#####

# read the tab-separated review dump, skipping malformed rows
# (on_bad_lines='skip' replaces the error_bad_lines flag deprecated in pandas 1.3)
reviews = pd.read_csv(data_location, sep='\t', on_bad_lines='skip')

# drop reviews with empty review body
reviews = reviews[~reviews['review_body'].isnull()]

# select reviews with specified review_body length
reviews = reviews[reviews['review_body'].apply(lambda x: min_characters <= len(str(x)) <= max_characters)]

# filter out reviews with more than one phrase
# (filter on review_body, not product_title: the separators target the review text itself)
reviews = reviews[~reviews['review_body'].str.contains(pat=separators, regex=True, na=False)]

# pick out highly positive and negative reviews
positive_reviews = reviews[reviews['star_rating'] == 5]
negative_reviews = reviews[reviews['star_rating'] == 1]

# take first n/2 positive and negative reviews
positive_reviews = positive_reviews.head(round(n/2))
negative_reviews = negative_reviews.head(round(n/2))

print("Obtained ", len(positive_reviews), " positive and ", len(negative_reviews), " negative reviews")

# save reviews used for training
pd.concat([positive_reviews, negative_reviews]).to_csv(selected_reviews_output_location, sep='\t', index=False)

# tokenize reviews and reduce noise
positive_reviews_list = [reduce_noise(tokenize_review(r)) for r in positive_reviews['review_body']]
negative_reviews_list = [reduce_noise(tokenize_review(r)) for r in negative_reviews['review_body']]

# obtain review tokens for model
positive_tokens = get_tokens(positive_reviews_list)
negative_tokens = get_tokens(negative_reviews_list)

# obtain train and test data
positive_dataset = [(tokens, "+") for tokens in positive_tokens]
negative_dataset = [(tokens, "-") for tokens in negative_tokens]
dataset = positive_dataset + negative_dataset
random.shuffle(dataset)
# split on the actual dataset size (it may be smaller than n if fewer reviews matched the filters)
split_index = round(train_factor * len(dataset))
train_data = dataset[:split_index]
test_data = dataset[split_index:]

# train classifier
classifier = NaiveBayesClassifier.train(train_data)

print("Accuracy is:", classify.accuracy(classifier, test_data))
classifier.show_most_informative_features(10)  # prints directly; wrapping it in print() would just output None

# save classifier
with open(classifier_location, 'wb') as f:
    pickle.dump(classifier, f)
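
# Usage sketch (illustrative, not part of training): reload the pickled model and score a raw
# review string. Assumes tokenize_review/reduce_noise accept a single review body, as above.
#
# with open(classifier_location, 'rb') as f:
#     loaded_classifier = pickle.load(f)
# sample_tokens = reduce_noise(tokenize_review("Great camera, the pictures are crisp"))
# print(loaded_classifier.classify({token: True for token in sample_tokens}))  # "+" or "-"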