Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Joel Oksanen
individual_project
Commits
56fb62b9
Commit
56fb62b9
authored
Apr 23, 2020
by
Joel Oksanen
Browse files
Major refactors in order to make server the main source folder
parent
57b5fe71
Changes
312
Hide whitespace changes
Inline
Side-by-side
ADA/dataloader.py
→
ADA/
server/agent/
dataloader.py
View file @
56fb62b9
import
pandas
as
pd
import
pandas
as
pd
class
DataLoader
:
class
DataLoader
:
data_location
=
'
camera_prepared_data
.tsv'
data_location
=
'
agent/amazon_data/amazon_reviews_us_Camera_v1_00
.tsv'
reviews
=
pd
.
read_csv
(
data_location
,
sep
=
'
\t
'
,
error_bad_lines
=
False
)
reviews
=
pd
.
read_csv
(
data_location
,
sep
=
'
\t
'
,
error_bad_lines
=
False
)
def
get_reviews
(
self
,
product_id
):
def
get_reviews
(
self
,
product_id
):
...
...
ADA/item.py
→
ADA/
server/agent/
item.py
View file @
56fb62b9
File moved
ADA/pc_reviews_to_be_annotated.xml
→
ADA/
server/agent/
pc_reviews_to_be_annotated.xml
View file @
56fb62b9
File moved
ADA/prep_data.py
→
ADA/
server/agent/
prep_data.py
View file @
56fb62b9
import pandas as pd

# Input/output locations, all under the amazon_data/ folder after the
# refactor that made server/ the main source folder.
data_location = 'amazon_data/amazon_reviews_us_Camera_v1_00.tsv'
# Reviews already consumed for classifier training; they are dropped from
# the prepared dataset further down.
training_data_location = 'amazon_data/reviews_trained_on.tsv'
output_location = 'amazon_data/camera_prepared_data.tsv'

# Filtering thresholds for the preparation pipeline below.
min_reviews = 50     # keep only products with at least this many reviews
min_characters = 50  # keep only review bodies with at least this many characters
n = 500              # keep reviews for the first n products only
...
@@ -14,30 +11,30 @@ reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False)
...
@@ -14,30 +11,30 @@ reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False)
# drop reviews used for training: concatenate the training set onto the
# corpus, then drop_duplicates(keep=False) removes BOTH copies of every
# duplicated row, i.e. the training rows and their originals.
training_reviews = pd.read_csv(training_data_location, sep='\t', error_bad_lines=False)
reviews = pd.concat([reviews, training_reviews])
reviews = reviews.drop_duplicates(keep=False)

# drop reviews with empty review body
reviews = reviews[~reviews['review_body'].isnull()]

# try to filter out reviews for camera accessories by product title.
filter_words = ['accessor', 'battery', 'charger', 'tripod', 'strap', 'case', 'bag',
                'backpack', 'kit', 'printer', 'adapter', 'album', 'surveillance', 'security']
# Build one alternation pattern, e.g. '[Aa]ccessor|[Bb]attery|...'.
# Only the first letter of each keyword is case-folded; the rest must match
# exactly (preserves the original matching behaviour). Named filter_pat so
# the builtin `filter` is not shadowed.
filter_pat = '|'.join(
    '[' + word[0].upper() + word[0].lower() + ']' + word[1:] for word in filter_words
)
# NOTE(review): if product_title can be NaN, str.contains yields NaN for
# those rows — consider na=False; original behaviour preserved here.
reviews = reviews[~reviews['product_title'].str.contains(pat=filter_pat, regex=True)]

# drop reviews with less than min_characters characters
reviews = reviews[reviews['review_body'].apply(lambda x: len(str(x)) >= min_characters)]

# drop reviews for products with less than min_reviews reviews
reviews = reviews.groupby('product_id').filter(lambda x: len(x.index) >= min_reviews)

# choose reviews for the first n products (in file order)
reviews = reviews[reviews['product_id'].isin(reviews['product_id'].unique()[:n])]

reviews.to_csv(output_location, sep='\t', index=False)
...
...
ADA/prepared_amazon_camera_reviews.xml
→
ADA/
server/agent/
prepared_amazon_camera_reviews.xml
View file @
56fb62b9
File moved
ADA/product_finder.py
→
ADA/
server/agent/
product_finder.py
View file @
56fb62b9
File moved
ADA/review_annotation.py
→
ADA/
server/agent/
review_annotation.py
View file @
56fb62b9
File moved
ADA/review_tokenizer.py
→
ADA/
server/agent/
review_tokenizer.py
View file @
56fb62b9
from
nltk.tokenize
import
TweetTokenizer
from
nltk.tokenize
import
TweetTokenizer
from
nltk.corpus
import
stopwords
from
nltk.corpus
import
stopwords
from
item
import
glossary
from
agent.
item
import
glossary
import
string
import
string
class
ReviewTokenizer
:
class
ReviewTokenizer
:
...
...
ADA/reviews_to_be_annotated.xml
→
ADA/
server/agent/
reviews_to_be_annotated.xml
View file @
56fb62b9
File moved
ADA/text_analyzer.py
→
ADA/
server/agent/
text_analyzer.py
View file @
56fb62b9
File moved
ADA/train_classifier.py
→
ADA/
server/agent/
train_classifier.py
View file @
56fb62b9
import
pandas
as
pd
import
pandas
as
pd
from
nltk.tokenize
import
sent_tokenize
from
server.agent.review_tokenizer
import
tokenize_review
,
reduce_noise
import
re
from
review_tokenizer
import
tokenize_review
,
reduce_noise
import
random
import
random
from
nltk
import
classify
,
NaiveBayesClassifier
from
nltk
import
classify
,
NaiveBayesClassifier
import
pickle
import
pickle
...
...
ADA/server/ios_server/views.py
View file @
56fb62b9
from
django.http
import
JsonResponse
,
HttpResponse
from
django.http
import
HttpResponse
import
json
import
json
import
jsonpickle
import
jsonpickle
from
django.views.decorators.csrf
import
csrf_exempt
from
django.views.decorators.csrf
import
csrf_exempt
from
agent.dataloader
import
DataLoader
import
sys
from
agent.communicator
import
Communicator
sys
.
path
.
append
(
'/home/joel/individual_project/ADA'
)
from
dataloader
import
DataLoader
from
communicator
import
Communicator
# Module-level collaborators: a single DataLoader and a single Communicator
# instance are created at import time and shared by every request handler
# in this module.
dl = DataLoader()
communicator = Communicator(dl)
def index(request):
    """Health-check view: always responds with a plain 'OK' body."""
    return HttpResponse("OK")
def
product
(
request
):
def
product
(
request
):
id
=
request
.
GET
.
get
(
'id'
,
''
)
id
=
request
.
GET
.
get
(
'id'
,
''
)
...
@@ -38,6 +37,7 @@ def product(request):
...
@@ -38,6 +37,7 @@ def product(request):
return
HttpResponse
(
jsonpickle
.
encode
(
init_response
,
unpicklable
=
False
),
content_type
=
"application/json"
)
return
HttpResponse
(
jsonpickle
.
encode
(
init_response
,
unpicklable
=
False
),
content_type
=
"application/json"
)
@
csrf_exempt
@
csrf_exempt
def
message
(
request
):
def
message
(
request
):
parsed
=
json
.
loads
(
request
.
body
)
parsed
=
json
.
loads
(
request
.
body
)
...
...
Prev
1
…
12
13
14
15
16
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment