Commit 56fb62b9 authored by Joel Oksanen

Major refactors in order to make server the main source folder

parent 57b5fe71
 import pandas as pd
 class DataLoader:
-    data_location = 'camera_prepared_data.tsv'
+    data_location = 'agent/amazon_data/amazon_reviews_us_Camera_v1_00.tsv'
     reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False)
     def get_reviews(self, product_id):
...
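The get_reviews body is collapsed in this view. As a purely hypothetical sketch of what such a lookup could look like, assuming the dataset's usual product_id column:

# Hypothetical sketch; the actual get_reviews body is not shown in this diff.
def get_reviews(self, product_id):
    # Select all loaded reviews whose product_id matches the request.
    return self.reviews[self.reviews['product_id'] == product_id]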
-import nltk
-from nltk.tokenize import sent_tokenize
 import pandas as pd
-import re
-data_location = 'amazon_reviews_us_Camera_v1_00.tsv'
-training_data_location = 'reviews_trained_on.tsv'
-output_location = 'camera_prepared_data.tsv'
+data_location = 'amazon_data/amazon_reviews_us_Camera_v1_00.tsv'
+training_data_location = 'amazon_data/reviews_trained_on.tsv'
+output_location = 'amazon_data/camera_prepared_data.tsv'
 min_reviews = 50
 min_characters = 50
 n = 500
@@ -14,30 +11,30 @@ reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False)
 # drop reviews used for training
 training_reviews = pd.read_csv(training_data_location, sep='\t', error_bad_lines=False)
-reviews = pd.concat([reviews,training_reviews])
+reviews = pd.concat([reviews, training_reviews])
 reviews = reviews.drop_duplicates(keep=False)
 # drop reviews with empty review body
 reviews = reviews[~reviews['review_body'].isnull()]
-# # try to filter out reviews for camera accessories
-# filter_words = ['accessor', 'battery', 'charger', 'tripod', 'strap', 'case', 'bag',
-#                 'backpack', 'kit', 'printer', 'adapter', 'album', 'surveillance', 'security']
-# filter = ''
-# for word in filter_words:
-#     word_filter = '[' + word[0].upper() + word[0].lower() + ']' + word[1:]
-#     filter += word_filter + '|'
-# filter = filter[:-1]
-# reviews = reviews[~reviews['product_title'].str.contains(pat = filter, regex = True)]
-#
-# # drop reviews with less than min_characters characters
-# reviews = reviews[reviews['review_body'].apply(lambda x: len(str(x)) >= min_characters)]
-#
-# # drop reviews for products with less than min_reviews reviews
-# reviews = reviews.groupby('product_id').filter(lambda x: len(x.index) >= min_reviews)
-#
-# # choose reviews for n first items
-# reviews = reviews[reviews['product_id'].isin(reviews['product_id'].unique()[:n])]
+# try to filter out reviews for camera accessories
+filter_words = ['accessor', 'battery', 'charger', 'tripod', 'strap', 'case', 'bag',
+                'backpack', 'kit', 'printer', 'adapter', 'album', 'surveillance', 'security']
+filter_pat = ''
+for word in filter_words:
+    word_filter = '[' + word[0].upper() + word[0].lower() + ']' + word[1:]
+    filter_pat += word_filter + '|'
+filter_pat = filter_pat[:-1]
+reviews = reviews[~reviews['product_title'].str.contains(pat=filter_pat, regex=True)]
+# drop reviews with less than min_characters characters
+reviews = reviews[reviews['review_body'].apply(lambda x: len(str(x)) >= min_characters)]
+# drop reviews for products with less than min_reviews reviews
+reviews = reviews.groupby('product_id').filter(lambda x: len(x.index) >= min_reviews)
+# choose reviews for n first items
+reviews = reviews[reviews['product_id'].isin(reviews['product_id'].unique()[:n])]
 reviews.to_csv(output_location, sep='\t', index=False)
...
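The training-review exclusion above relies on a pandas anti-join idiom: concatenating both frames and dropping every duplicated row with keep=False removes exactly the reviews that also appear in the training file, provided the rows match exactly. A minimal, self-contained illustration with made-up data:

import pandas as pd

# Toy frames standing in for the full review set and the training subset.
all_reviews = pd.DataFrame({'review_id': [1, 2, 3],
                            'review_body': ['sharp lens', 'broke fast', 'great zoom']})
trained_on = pd.DataFrame({'review_id': [2],
                           'review_body': ['broke fast']})

# Rows present in both frames become duplicates; keep=False drops all copies,
# leaving only reviews that were never used for training.
remaining = pd.concat([all_reviews, trained_on]).drop_duplicates(keep=False)
print(remaining['review_id'].tolist())  # [1, 3]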
 from nltk.tokenize import TweetTokenizer
 from nltk.corpus import stopwords
-from item import glossary
+from agent.item import glossary
 import string
 class ReviewTokenizer:
...
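The imports suggest tokenization built on NLTK's TweetTokenizer with stopword and punctuation filtering; the class body itself is collapsed here. A hedged sketch of that kind of pipeline (the function name and details are assumptions, not the project's code):

import string
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords  # requires nltk.download('stopwords')

tokenizer = TweetTokenizer()
stop_words = set(stopwords.words('english'))

def tokenize(text):
    # Lowercase, tokenize, then drop stopwords and bare punctuation.
    tokens = tokenizer.tokenize(text.lower())
    return [t for t in tokens if t not in stop_words and t not in string.punctuation]

print(tokenize("The battery died fast, but the lens is great!"))
# ['battery', 'died', 'fast', 'lens', 'great']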
 import pandas as pd
-from nltk.tokenize import sent_tokenize
-import re
-from review_tokenizer import tokenize_review, reduce_noise
+from server.agent.review_tokenizer import tokenize_review, reduce_noise
 import random
 from nltk import classify, NaiveBayesClassifier
 import pickle
...
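The random, NaiveBayesClassifier, and pickle imports point at NLTK's standard train-and-persist loop. A minimal sketch under that assumption, with a trivial stand-in for the project's tokenize_review and reduce_noise feature extraction:

import random
import pickle
from nltk import classify, NaiveBayesClassifier

# Stand-in feature extractor; the real pipeline presumably uses
# tokenize_review and reduce_noise instead.
def features(text):
    return {word: True for word in text.lower().split()}

labelled = [(features('great camera, sharp photos'), '+'),
            (features('terrible autofocus, stopped working'), '-')]
random.shuffle(labelled)

classifier = NaiveBayesClassifier.train(labelled)
print(classify.accuracy(classifier, labelled))  # trivially 1.0 on its own training data

# Persist the trained model, as the pickle import suggests.
with open('classifier.pickle', 'wb') as f:
    pickle.dump(classifier, f)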
-from django.http import JsonResponse, HttpResponse
+from django.http import HttpResponse
 import json
 import jsonpickle
 from django.views.decorators.csrf import csrf_exempt
-import sys
-sys.path.append('/home/joel/individual_project/ADA')
-from dataloader import DataLoader
-from communicator import Communicator
+from agent.dataloader import DataLoader
+from agent.communicator import Communicator
 dl = DataLoader()
 communicator = Communicator(dl)
 def index(request):
     return HttpResponse("OK")
 def product(request):
     id = request.GET.get('id', '')
@@ -38,6 +37,7 @@ def product(request):
     return HttpResponse(jsonpickle.encode(init_response, unpicklable=False), content_type="application/json")
 @csrf_exempt
 def message(request):
     parsed = json.loads(request.body)
...
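The product view serialises its response object with jsonpickle, where unpicklable=False emits plain JSON without Python type metadata, which is what a browser client wants. A toy illustration of that call (the InitResponse class here is hypothetical, not the project's):

import jsonpickle

class InitResponse:
    def __init__(self, product_id, text):
        self.product_id = product_id
        self.text = text

resp = InitResponse('B00EXAMPLE', 'What would you like to know about this product?')
# unpicklable=False drops py/object metadata, yielding a plain JSON object
# with just the instance attributes.
print(jsonpickle.encode(resp, unpicklable=False))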