Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Joel Oksanen
individual_project
Commits
7ce1a50e
Commit
7ce1a50e
authored
Jan 22, 2020
by
Joel Oksanen
Browse files
Implemented script to train review classifier
parent
c26a85bc
Changes
2
Hide whitespace changes
Inline
Side-by-side
camera_review_classifier.pickle
0 → 100644
View file @
7ce1a50e
File added
train_classifier.py
0 → 100644
View file @
7ce1a50e
import
pandas
as
pd
from
nltk.tokenize
import
sent_tokenize
import
re
from
nltk.tokenize
import
TweetTokenizer
import
random
from
nltk
import
classify
,
NaiveBayesClassifier
import
pickle
# Input: tab-separated review dump that the script reads with pandas.
data_location = 'amazon_reviews_us_Camera_v1_00.tsv'
# Output: TSV copy of the positive and negative reviews selected by the script.
selected_reviews_output_location = 'reviews_trained_on.tsv'
# Output: file the trained classifier is pickled to.
classifier_location = 'camera_review_classifier.pickle'
# Inclusive bounds on the character length of a review body.
min_characters = 0
max_characters = 100
# Target total number of reviews; at most round(n / 2) are taken per class.
n = 50000
# Fraction of the data used for training; the remainder is held out for testing.
train_factor = 0.7
# Regex alternation of contrast words and HTML line breaks; rows matching it
# are dropped by the script's "more than one phrase" filter.
separators = ' but |although|though|otherwise|however|unless|whereas|despite|<br />'
# Shared tokenizer instance used by tokenize_review.
tokenizer = TweetTokenizer()
def tokenize_review(review):
    """Split a single review text into tokens using the shared tokenizer."""
    tokens = tokenizer.tokenize(review)
    return tokens
def get_tokens(tokenized_reviews_list):
    """Yield one bag-of-words feature dict per tokenized review.

    Args:
        tokenized_reviews_list: Iterable of reviews, each an iterable of
            string tokens.

    Yields:
        A dict mapping every lower-cased token of the review to ``True``.
    """
    for review_tokens in tokenized_reviews_list:
        # Only token presence is recorded: repeated tokens (including ones
        # differing only in case) collapse into a single key.
        yield {token.lower(): True for token in review_tokens}
#####

# Load the raw review dump. Malformed rows are skipped with a warning instead
# of aborting the run. `on_bad_lines` replaces `error_bad_lines=False`, which
# was deprecated in pandas 1.3 and removed in pandas 2.0.
reviews = pd.read_csv(data_location, sep='\t', on_bad_lines='warn')

# drop reviews with empty review body
reviews = reviews[reviews['review_body'].notnull()]

# Work on a string view of the body so numeric-looking cells cannot break the
# string operations below.
review_bodies = reviews['review_body'].astype(str)

# select reviews with specified review_body length (bounds are inclusive)
body_lengths = review_bodies.str.len()
has_valid_length = (body_lengths >= min_characters) & (body_lengths <= max_characters)

# filter out reviews with more than one phrase: the separators must be looked
# for in the review text itself, not in the product title
has_multiple_phrases = review_bodies.str.contains(separators, regex=True)

reviews = reviews[has_valid_length & ~has_multiple_phrases]
# Keep only the extremes of the rating scale: 5 stars counts as positive,
# 1 star as negative.
is_five_star = reviews['star_rating'] == 5
is_one_star = reviews['star_rating'] == 1

# Cap each class at half of the requested total, keeping the earliest rows.
per_class = round(n / 2)
positive_reviews = reviews[is_five_star].head(per_class)
negative_reviews = reviews[is_one_star].head(per_class)

print("Obtained ", len(positive_reviews), " positive and ", len(negative_reviews), " negative reviews")

# Persist the exact reviews used, positives first, so the run can be inspected later.
selected_reviews = pd.concat([positive_reviews, negative_reviews])
selected_reviews.to_csv(selected_reviews_output_location, sep='\t', index=False)
# tokenize reviews
positive_reviews_list = [tokenize_review(body) for body in positive_reviews['review_body']]
negative_reviews_list = [tokenize_review(body) for body in negative_reviews['review_body']]

# Turn each tokenized review into a (feature dict, label) pair for the model.
positive_dataset = [(features, "+") for features in get_tokens(positive_reviews_list)]
negative_dataset = [(features, "-") for features in get_tokens(negative_reviews_list)]

# Shuffle so both labels are mixed before the split.
dataset = positive_dataset + negative_dataset
random.shuffle(dataset)

# Split on the number of reviews actually collected, not on the configured n:
# if fewer than n reviews survive filtering, an n-based boundary would skew the
# train/test proportions and could leave the test set empty.
split_index = round(train_factor * len(dataset))
train_data = dataset[:split_index]
test_data = dataset[split_index:]
# train classifier
classifier = NaiveBayesClassifier.train(train_data)
print("Accuracy is:", classify.accuracy(classifier, test_data))

# show_most_informative_features prints its table itself and returns None, so
# wrapping it in print() would emit a stray "None" line.
classifier.show_most_informative_features(10)

# save classifier; the context manager closes the file even if pickling fails
with open(classifier_location, 'wb') as classifier_file:
    pickle.dump(classifier, classifier_file)
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment