Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Joel Oksanen
individual_project
Commits
56fb62b9
Commit
56fb62b9
authored
Apr 23, 2020
by
Joel Oksanen
Browse files
Major refactors in order to make server the main source folder
parent
57b5fe71
Changes
312
Show whitespace changes
Inline
Side-by-side
ADA/dataloader.py
→
ADA/
server/agent/
dataloader.py
View file @
56fb62b9
import
pandas
as
pd
class
DataLoader
:
data_location
=
'
camera_prepared_data
.tsv'
data_location
=
'
agent/amazon_data/amazon_reviews_us_Camera_v1_00
.tsv'
reviews
=
pd
.
read_csv
(
data_location
,
sep
=
'
\t
'
,
error_bad_lines
=
False
)
def
get_reviews
(
self
,
product_id
):
...
...
ADA/item.py
→
ADA/
server/agent/
item.py
View file @
56fb62b9
File moved
ADA/pc_reviews_to_be_annotated.xml
→
ADA/
server/agent/
pc_reviews_to_be_annotated.xml
View file @
56fb62b9
File moved
ADA/prep_data.py
→
ADA/
server/agent/
prep_data.py
View file @
56fb62b9
import
nltk
from
nltk.tokenize
import
sent_tokenize
import
pandas
as
pd
import
re
data_location
=
'amazon_reviews_us_Camera_v1_00.tsv'
training_data_location
=
'reviews_trained_on.tsv'
output_location
=
'camera_prepared_data.tsv'
data_location
=
'amazon_
data/amazon_
reviews_us_Camera_v1_00.tsv'
training_data_location
=
'
amazon_data/
reviews_trained_on.tsv'
output_location
=
'
amazon_data/
camera_prepared_data.tsv'
min_reviews
=
50
min_characters
=
50
n
=
500
...
...
@@ -14,30 +11,30 @@ reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False)
# drop reviews used for training
training_reviews
=
pd
.
read_csv
(
training_data_location
,
sep
=
'
\t
'
,
error_bad_lines
=
False
)
reviews
=
pd
.
concat
([
reviews
,
training_reviews
])
reviews
=
pd
.
concat
([
reviews
,
training_reviews
])
reviews
=
reviews
.
drop_duplicates
(
keep
=
False
)
# drop reviews with empty review body
reviews
=
reviews
[
~
reviews
[
'review_body'
].
isnull
()]
#
# try to filter out reviews for camera accessories
#
filter_words = ['accessor', 'battery', 'charger', 'tripod', 'strap', 'case', 'bag',
#
'backpack', 'kit', 'printer', 'adapter', 'album', 'surveillance', 'security']
#
filter = ''
#
for word in filter_words:
#
word_filter = '[' + word[0].upper() + word[0].lower() + ']' + word[1:]
#
filter += word_filter + '|'
#
filter = filter[:-1]
#
reviews = reviews[~reviews['product_title'].str.contains(pat
=
filter, regex
=
True)]
#
#
# drop reviews with less than min_characters characters
#
reviews = reviews[reviews['review_body'].apply(lambda x: len(str(x)) >= min_characters)]
#
#
# drop reviews for products with less than min_reviews reviews
#
reviews = reviews.groupby('product_id').filter(lambda x: len(x.index) >= min_reviews)
#
#
# choose reviews for n first items
#
reviews = reviews[reviews['product_id'].isin(reviews['product_id'].unique()[:n])]
# try to filter out reviews for camera accessories
filter_words
=
[
'accessor'
,
'battery'
,
'charger'
,
'tripod'
,
'strap'
,
'case'
,
'bag'
,
'backpack'
,
'kit'
,
'printer'
,
'adapter'
,
'album'
,
'surveillance'
,
'security'
]
filter
_pat
=
''
for
word
in
filter_words
:
word_filter
=
'['
+
word
[
0
].
upper
()
+
word
[
0
].
lower
()
+
']'
+
word
[
1
:]
filter
_pat
+=
word_filter
+
'|'
filter
_pat
=
filter
_pat
[:
-
1
]
reviews
=
reviews
[
~
reviews
[
'product_title'
].
str
.
contains
(
pat
=
filter
_pat
,
regex
=
True
)]
# drop reviews with less than min_characters characters
reviews
=
reviews
[
reviews
[
'review_body'
].
apply
(
lambda
x
:
len
(
str
(
x
))
>=
min_characters
)]
# drop reviews for products with less than min_reviews reviews
reviews
=
reviews
.
groupby
(
'product_id'
).
filter
(
lambda
x
:
len
(
x
.
index
)
>=
min_reviews
)
# choose reviews for n first items
reviews
=
reviews
[
reviews
[
'product_id'
].
isin
(
reviews
[
'product_id'
].
unique
()[:
n
])]
reviews
.
to_csv
(
output_location
,
sep
=
'
\t
'
,
index
=
False
)
...
...
ADA/prepared_amazon_camera_reviews.xml
→
ADA/
server/agent/
prepared_amazon_camera_reviews.xml
View file @
56fb62b9
File moved
ADA/product_finder.py
→
ADA/
server/agent/
product_finder.py
View file @
56fb62b9
File moved
ADA/review_annotation.py
→
ADA/
server/agent/
review_annotation.py
View file @
56fb62b9
File moved
ADA/review_tokenizer.py
→
ADA/
server/agent/
review_tokenizer.py
View file @
56fb62b9
from
nltk.tokenize
import
TweetTokenizer
from
nltk.corpus
import
stopwords
from
item
import
glossary
from
agent.
item
import
glossary
import
string
class
ReviewTokenizer
:
...
...
ADA/reviews_to_be_annotated.xml
→
ADA/
server/agent/
reviews_to_be_annotated.xml
View file @
56fb62b9
File moved
ADA/text_analyzer.py
→
ADA/
server/agent/
text_analyzer.py
View file @
56fb62b9
File moved
ADA/train_classifier.py
→
ADA/
server/agent/
train_classifier.py
View file @
56fb62b9
import
pandas
as
pd
from
nltk.tokenize
import
sent_tokenize
import
re
from
review_tokenizer
import
tokenize_review
,
reduce_noise
from
server.agent.review_tokenizer
import
tokenize_review
,
reduce_noise
import
random
from
nltk
import
classify
,
NaiveBayesClassifier
import
pickle
...
...
ADA/server/ios_server/views.py
View file @
56fb62b9
from
django.http
import
JsonResponse
,
HttpResponse
from
django.http
import
HttpResponse
import
json
import
jsonpickle
from
django.views.decorators.csrf
import
csrf_exempt
import
sys
sys
.
path
.
append
(
'/home/joel/individual_project/ADA'
)
from
dataloader
import
DataLoader
from
communicator
import
Communicator
from
agent.dataloader
import
DataLoader
from
agent.communicator
import
Communicator
dl
=
DataLoader
()
communicator
=
Communicator
(
dl
)
def
index
(
request
):
return
HttpResponse
(
"OK"
)
def
product
(
request
):
id
=
request
.
GET
.
get
(
'id'
,
''
)
...
...
@@ -38,6 +37,7 @@ def product(request):
return
HttpResponse
(
jsonpickle
.
encode
(
init_response
,
unpicklable
=
False
),
content_type
=
"application/json"
)
@
csrf_exempt
def
message
(
request
):
parsed
=
json
.
loads
(
request
.
body
)
...
...
Prev
1
…
12
13
14
15
16
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment