Commit 56fb62b9 authored by Joel Oksanen
Browse files

Major refactors in order to make server the main source folder

parent 57b5fe71
import pandas as pd
class DataLoader:
data_location = 'camera_prepared_data.tsv'
data_location = 'agent/amazon_data/amazon_reviews_us_Camera_v1_00.tsv'
reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False)
def get_reviews(self, product_id):
......
import nltk
from nltk.tokenize import sent_tokenize
import pandas as pd
import re
# Input/output locations and thresholds for the data-preparation script.
# NOTE(review): each of the three paths is assigned twice below; the later
# assignment (under amazon_data/) is the one in effect when the script runs.
# The first set looks like the pre-refactor values — confirm and remove.
data_location = 'amazon_reviews_us_Camera_v1_00.tsv'
training_data_location = 'reviews_trained_on.tsv'
output_location = 'camera_prepared_data.tsv'
data_location = 'amazon_data/amazon_reviews_us_Camera_v1_00.tsv'  # full review set (TSV) to read
training_data_location = 'amazon_data/reviews_trained_on.tsv'  # reviews to exclude from the output
output_location = 'amazon_data/camera_prepared_data.tsv'  # where the filtered TSV is written
min_reviews = 50  # products with fewer reviews than this are dropped
min_characters = 50  # review bodies shorter than this are dropped
n = 500  # number of distinct product ids kept (the first n encountered)
......@@ -14,30 +11,30 @@ reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False)
# drop reviews used for training
# Anti-join idiom: stack both frames, then drop every row that occurs more
# than once (keep=False removes all copies of a duplicated row, not just the
# extras).
# NOTE(review): error_bad_lines was deprecated in pandas 1.3 and removed in
# 2.0 in favour of on_bad_lines — confirm the pandas version this runs on.
training_reviews = pd.read_csv(training_data_location, sep='\t', error_bad_lines=False)
# NOTE(review): the concat below appears twice, differing only in whitespace;
# one of the two lines is presumably the pre-refactor version — confirm.
reviews = pd.concat([reviews,training_reviews])
reviews = pd.concat([reviews, training_reviews])
# NOTE(review): keep=False also removes rows that were already duplicated
# within the review set itself — confirm this is intended.
reviews = reviews.drop_duplicates(keep=False)
# drop reviews with empty review body
reviews = reviews[~reviews['review_body'].isnull()]
# # try to filter out reviews for camera accessories
# filter_words = ['accessor', 'battery', 'charger', 'tripod', 'strap', 'case', 'bag',
# 'backpack', 'kit', 'printer', 'adapter', 'album', 'surveillance', 'security']
# filter = ''
# for word in filter_words:
# word_filter = '[' + word[0].upper() + word[0].lower() + ']' + word[1:]
# filter += word_filter + '|'
# filter = filter[:-1]
# reviews = reviews[~reviews['product_title'].str.contains(pat = filter, regex = True)]
#
# # drop reviews with less than min_characters characters
# reviews = reviews[reviews['review_body'].apply(lambda x: len(str(x)) >= min_characters)]
#
# # drop reviews for products with less than min_reviews reviews
# reviews = reviews.groupby('product_id').filter(lambda x: len(x.index) >= min_reviews)
#
# # choose reviews for n first items
# reviews = reviews[reviews['product_id'].isin(reviews['product_id'].unique()[:n])]
# Try to filter out reviews for camera accessories: drop every review whose
# product title contains one of these stems.
filter_words = ['accessor', 'battery', 'charger', 'tripod', 'strap', 'case', 'bag',
                'backpack', 'kit', 'printer', 'adapter', 'album', 'surveillance', 'security']
# One alternative per stem, accepting either case for the first letter only
# (e.g. '[Bb]attery'), combined into a single regex with '|'.
filter_pat = '|'.join(
    '[' + word[0].upper() + word[0].lower() + ']' + word[1:]
    for word in filter_words
)
# na=False: a missing product title counts as "no match", so the result is a
# pure boolean mask. Without it, NaN entries break the `~` negation below.
reviews = reviews[~reviews['product_title'].str.contains(pat=filter_pat, regex=True, na=False)]

# drop reviews with less than min_characters characters
reviews = reviews[reviews['review_body'].apply(lambda x: len(str(x)) >= min_characters)]

# drop reviews for products with less than min_reviews reviews
reviews = reviews.groupby('product_id').filter(lambda x: len(x.index) >= min_reviews)

# choose reviews for n first items
reviews = reviews[reviews['product_id'].isin(reviews['product_id'].unique()[:n])]

# Write the prepared subset as a tab-separated file without the index column.
reviews.to_csv(output_location, sep='\t', index=False)
......
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from item import glossary
from agent.item import glossary
import string
class ReviewTokenizer:
......
import pandas as pd
from nltk.tokenize import sent_tokenize
import re
from review_tokenizer import tokenize_review, reduce_noise
from server.agent.review_tokenizer import tokenize_review, reduce_noise
import random
from nltk import classify, NaiveBayesClassifier
import pickle
......
from django.http import JsonResponse, HttpResponse
from django.http import HttpResponse
import json
import jsonpickle
from django.views.decorators.csrf import csrf_exempt
import sys
sys.path.append('/home/joel/individual_project/ADA')
from dataloader import DataLoader
from communicator import Communicator
from agent.dataloader import DataLoader
from agent.communicator import Communicator
# Instances created once, when this module is imported: the data loader is
# handed to the communicator at construction.
dl = DataLoader()
communicator = Communicator(dl)
def index(request):
    """Answer any request with a plain "OK" response body."""
    ok_response = HttpResponse("OK")
    return ok_response
def product(request):
id = request.GET.get('id', '')
......@@ -38,6 +37,7 @@ def product(request):
return HttpResponse(jsonpickle.encode(init_response, unpicklable=False), content_type="application/json")
@csrf_exempt
def message(request):
parsed = json.loads(request.body)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment