Commit a59058de authored by Zhao, Junchen

Initial commit

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 15 19:56:09 2020
@author: junchenzhao
"""
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd
from joblib import dump, load
"""
Define the numpy arrays for each of the source and target datasets.
"""
src_train = open("src-train.txt")
src_train_list = [i for i in src_train.readlines()]
tgt_train = open("tgt-train.txt")
tgt_train_list = [i for i in tgt_train.readlines()]
"""
Transform string in the lists to computable vectors so that we can do the unsupervised learning
"""
src_vectorizer = TfidfVectorizer(stop_words='english')
tgt_vectorizer = TfidfVectorizer(stop_words='english')
src_train_vector = src_vectorizer.fit_transform(src_train_list)
tgt_train_vector = tgt_vectorizer.fit_transform(tgt_train_list)
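# Each fitted vector is a sparse matrix of shape (number of lines, vocabulary size).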
"""
Define the Kmeans for the purpose of unsupervised learning
"""
kmeans_src = KMeans(n_clusters=26, n_init =300).fit(src_train_vector)
kmeans_tgt = KMeans(n_clusters=26, n_init =300).fit(tgt_train_vector)
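# n_init=300 reruns K-means with 300 different centroid seeds and keeps the run
# with the lowest inertia, making the clustering less sensitive to initialization.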
"""
Make the prediction on samples
"""
sample_src = ["derive the class CacheHandler from the object base class."]
sample_tgt = ["class CacheHandler ( object ) :"]
dump(kmeans_src, "model_src6.joblib")
dump(kmeans_tgt, "model_tgt6.joblib")
"""
Retreving the centroid and features for the unsupervised models
"""
src_order_centroids = kmeans_src.cluster_centers_.argsort()[:, ::-1]
src_terms = src_vectorizer.get_feature_names()
tgt_order_centroids = kmeans_tgt.cluster_centers_.argsort()[:, ::-1]
tgt_terms = tgt_vectorizer.get_feature_names()
print('Source training file clusters.\n')
for i in range(26):
    print("Cluster %d:" % i)
    for ind in src_order_centroids[i, :10]:
        print(' %s' % src_terms[ind])
    print('\n')
print('Target training file clusters.\n')
for i in range(26):
    print("Cluster %d:" % i)
    for ind in tgt_order_centroids[i, :10]:
        print(' %s' % tgt_terms[ind])
    print('\n')
print('Prediction on source')
src_pred_transform = src_vectorizer.transform(sample_src)
src_pred = kmeans_src.predict(src_pred_transform)
print(src_pred)
print('\n')
print('Prediction on target')
tgt_pred_transform = tgt_vectorizer.transform(sample_tgt)
tgt_pred = kmeans_tgt.predict(tgt_pred_transform)
print(tgt_pred)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 15 21:28:01 2020
@author: junchenzhao
"""
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd
from joblib import dump, load
model_src = load("model_src260.joblib")
model_tgt = load("model_tgt260.joblib")
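# NOTE: only the K-means models were persisted; the TF-IDF vectorizers are
# refit below on the same training files. Those files must be unchanged so the
# feature indices still line up with the loaded centroids.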
"""
Define the numpy arrays for each of the source and target datasets.
"""
src_train = open("src-train.txt")
src_train_list = [i for i in src_train.readlines()]
tgt_train = open("tgt-train.txt")
tgt_train_list = [i for i in tgt_train.readlines()]
"""
Transform string in the lists to computable vectors so that we can do the unsupervised learning
"""
src_vectorizer = TfidfVectorizer(stop_words='english')
tgt_vectorizer = TfidfVectorizer(stop_words='english')
src_train_vector = src_vectorizer.fit_transform(src_train_list)
tgt_train_vector = tgt_vectorizer.fit_transform(tgt_train_list)
"""
Retreving the centroid and features for the unsupervised models
"""
src_order_centroids = model_src.cluster_centers_.argsort()[:, ::-1]
src_terms = src_vectorizer.get_feature_names()
tgt_order_centroids = model_tgt.cluster_centers_.argsort()[:, ::-1]
tgt_terms = tgt_vectorizer.get_feature_names()
print('Source training file clusters.\n')
for i in range(6):
    print("Cluster %d:" % i)
    for ind in src_order_centroids[i, :10]:
        print(' %s' % src_terms[ind])
    print('\n')
print('Target training file clusters.\n')
for i in range(6):
    print("Cluster %d:" % i)
    for ind in tgt_order_centroids[i, :10]:
        print(' %s' % tgt_terms[ind])
"""
Make the prediction on samples
"""
sample_src = ["raise an ValidationError with 3 arguments: dictionary self.error_messages value under the key 'invalid_pk_value'","derive the class DefaultCacheProxy from the base class object.",
"define the function get_cache with backend and dictionary pair of elements kwargs as arguments.",
"define the function _create_cache with backend and dictionary pair of elements kwargs as arguments.",
"dest as a string 'symlinks', default as boolean False and help as a string 'Follows symlinks to directories when examining ' 'source code and templates for translation strings.'. call the method parser.add_argument with 7 arguments: string '--ignore', string '-i', action as a string 'append',",
"help set to a string 'The file extension(s) to examine (default: 'html,txt', or 'js' ' 'if the domain is 'djangojs'). Separate multiple extensions with commas, or use -e multiple times.' and action as a string 'append'. call the method parser.add_argument with 6 arguments: string '--symlinks', string '-s', action as a string 'store_true'",
"from threading import local into default name space."
]
sample_tgt = ["raise ValidationError ( self . error_messages [ 'invalid_pk_value' ] , code = 'invalid_pk_value' , params = { 'pk' : pk } , )",
"def get_cache ( backend , ** kwargs ) :",
"def _create_cache ( backend , ** kwargs ) :",
"parser . add_argument ( '--ignore' , '-i' , action = 'append' , dest = 'ignore_patterns' , default = [ ] , metavar = 'PATTERN' , help = 'Ignore files or directories matching this glob-style pattern. ' 'Use multiple times to ignore more.' )",
"parser . add_argument ( '--symlinks' , '-s' , action = 'store_true' , dest = 'symlinks' , default = False , help = 'Follows symlinks to directories when examining ' 'source code and templates for translation strings.' )",
"from threading import local"
]
print('\n')
print('Prediction on source')
src_pred_transform = src_vectorizer.transform(sample_src)
src_pred = model_src.predict(src_pred_transform)
print(src_pred)
print('\n')
print('Prediction on target')
tgt_pred_transform = tgt_vectorizer.transform(sample_tgt)
tgt_pred = model_tgt.predict(tgt_pred_transform)
print(tgt_pred)
print('\n')
"""
Constructing mapping.
In the list of src_list, tgt_list, there are sets (label, sentence document index)
"""
src_counter = 0
tgt_counter = 0
# (label, sentence/document index) tuples
src_list = []
tgt_list = []
# {label: [sentence/document indices]}
src_dict = {}
tgt_dict = {}
# predicted cluster labels for the full source training set
full_src_pred = model_src.predict(src_train_vector)
# predicted cluster labels for the full target training set
full_tgt_pred = model_tgt.predict(tgt_train_vector)
for i in full_src_pred:
    src_list.append((i, src_counter))
    src_counter += 1
for j in full_tgt_pred:
    tgt_list.append((j, tgt_counter))
    tgt_counter += 1
for i in src_list:
    if i[0] not in src_dict:
        src_dict[i[0]] = [i[1]]
    else:
        src_dict[i[0]].append(i[1])
for j in tgt_list:
    if j[0] not in tgt_dict:
        tgt_dict[j[0]] = [j[1]]
    else:
        tgt_dict[j[0]].append(j[1])
# Now that the dictionaries are built, we want to find the mapping between the
# sentences in each file that share similar semantic meaning.
# label_list: [(source label, target label), ...]
label_list = []
for a, b in src_dict.items():
    print("src_label:", a, "value length:", len(b))
print('\n')
for c, d in tgt_dict.items():
    print("tgt_label:", c, "value_length:", len(d))
'''
Now, we start to map the files between source and target.
'''
# Per-cluster file export for the code-line target file (kept for reference, currently disabled):
'''for key, val in tgt_dict.items():
    with open("tgt7_"+str(key)+".txt", "w") as out_files:
        for index in val:
            out_files.write(tgt_train_list[index])
for key, val in tgt_dict.items():
    with open("src7_"+str(key)+".txt", "w") as out_files:
        for index in val:
            out_files.write(src_train_list[index])'''
with open("X_train_super.csv", "w") as out_files:
for key, val in tgt_dict.items():
for index in val:
out_files.write("_label"+str(key)+"_,"+src_train_list[index])
with open("y_train_super.csv", "w") as out_files:
for key, val in tgt_dict.items():
for index in val:
out_files.write("_label"+str(key)+"_,"+tgt_train_list[index])
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 30 14:22:09 2020
@author: junchenzhao
"""
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import string
from nltk.stem import PorterStemmer
from nltk import word_tokenize
from sklearn.pipeline import Pipeline
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVC
from joblib import dump, load
# Open the held-out test file.
with open("final_test.txt") as test_file:
    predict_text = test_file.readlines()
#load dataframe
data = pd.read_csv('X_train_super.csv', sep=",", header=None).iloc[:,0:2]
data.columns = ['CATEGORY', 'TITLE']
print(data.head())
def stemming_tokenizer(text):
    stemmer = PorterStemmer()
    return [stemmer.stem(w) for w in word_tokenize(text)]
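# For example, stemming_tokenizer("derives the classes") returns
# ['deriv', 'the', 'class']: word_tokenize splits the text and the Porter
# stemmer collapses inflected forms, shrinking the TF-IDF vocabulary.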
def train(classifier, X, y, predict_text):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=33)
    supervised_model = classifier.fit(X_train, y_train)
    dump(supervised_model, "supervised_classification_model.joblib")
    prediction = supervised_model.predict(predict_text)
    print("Accuracy: %s" % supervised_model.score(X_test, y_test))
    print("predicted category: ", prediction)
    pred_range = range(len(predict_text))
    src_dict = {}
    for i in pred_range:
        if prediction[i] not in src_dict:
            src_dict[prediction[i]] = [predict_text[i]]
        else:
            src_dict[prediction[i]].append(predict_text[i])
    print(src_dict)
    for key, val in src_dict.items():
        with open("classified_" + str(key) + ".txt", "w") as out_files:
            for content in val:
                out_files.write(content)
# Decision tree model; observed accuracy: 0.7621782599446926
dec_tree_model = Pipeline([
    ('vectorizer', TfidfVectorizer(tokenizer=stemming_tokenizer,
                                   stop_words=stopwords.words('english') + list(string.punctuation))),
    ('classifier', DecisionTreeClassifier())])
# SVM model; observed accuracy: 0.8098276962348436
svm_model = Pipeline([
    ('vectorizer', TfidfVectorizer(tokenizer=stemming_tokenizer,
                                   stop_words=stopwords.words('english') + list(string.punctuation))),
    ('classifier', SVC(kernel='linear'))])
# Logistic regression model; observed accuracy: 0.7647309083173793
logistic_model = Pipeline([
    ('vectorizer', TfidfVectorizer(tokenizer=stemming_tokenizer,
                                   stop_words=stopwords.words('english') + list(string.punctuation))),
    ('classifier', LogisticRegression())])
# Naive Bayes model; observed accuracy: 0.7236758136566688
naive_model = Pipeline([
    ('vectorizer', TfidfVectorizer(tokenizer=stemming_tokenizer,
                                   stop_words=stopwords.words('english') + list(string.punctuation))),
    ('classifier', MultinomialNB(alpha=0.05))])
# After the experiments, we choose the SVM model, which has the highest accuracy.
train(svm_model, data['TITLE'], data['CATEGORY'], predict_text)
#train(naive_model, data['TITLE'], data['CATEGORY'], predict_text)
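# A minimal usage sketch: the dumped pipeline bundles the TF-IDF vectorizer,
# so a reloaded model accepts raw strings directly. The sample sentence is
# borrowed from the clustering script above.
reloaded_model = load("supervised_classification_model.joblib")
print(reloaded_model.predict(["derive the class CacheHandler from the object base class."]))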
testing_data @ fbafc516
Subproject commit fbafc51641033f64d41c7fd6d6dbfe2f7b47ed4e
training_data @ 40a94909
Subproject commit 40a94909e1f0a69c07c0e78abb86c9d39ece4567
if DEFAULT_CACHE_ALIAS is not contained in settings.CACHES,
substitute value under the backend key of settings.CACHES dictionary for conf.
if alias is not contained in setting.CACHES,
if features.needs_datetime_string_cast field of the object represented stored under the db key of connections dictionary is true,
if features.needs_datetime_string_cast field of object under the db key of connections dictionary is true and current_expires is not an instance of datetime object.
if settings.USE_TZ is true,
num_entries is length of filelist.
if server is an instance of six.string_types,
if call to the settings.is_overridden with string 'TEST_RUNNER' evaluates to boolean False.
settings.SITE_ID.
settings.BASE_DIR.
call the function settings.is_overridden with argument string 'TEMPLATE_LOADERS', if it evaluates to boolean True,
call the function settings.is_overridden with argument string 'MANAGERS', if it evaluates to boolean True,
call the function settings.is_overridden with argument string 'ADMINS', if it evaluates to boolean True,
call the function settings.is_overridden with argument string 'MIDDLEWARE_CLASSES', if the result is boolean False,
if level is not an integer raise an error with message string "The first argument should be level.".
if self.id is contained in settings.SILENCED_SYSTEM_CHECKS return boolean True, False otherwise.
if obj is instance of models.signals.ModelSignal class,
if receiver is an instance of types.FunctionType type,
call the method self.registered_checks.append with argument check.
if settings.DEBUG is true and value of the request.META dictionary under the 'REMOTE_ADDR' key is contained in settings.INTERNAL_IPS,
substitute settings.LANGUAGES for the value under the 'LANGUAGES' key of the context_extras dictionary.
return an dictionary containing 1 entry: settings.STATIC_URL for 'STATIC_URL'.
return an dictionary containing 1 entry: settings.MEDIA_URL for 'MEDIA_URL'.
if message is an instance of ValidationError class,
if message has an 'error_dict' attribute,
substitute message.error_dict for message.
substitute message.error_list for message.
substitute message.message, message.code, message.params for message, code and params, respectively.
if message is an instance of dict type,
if messages is not an instance of the ValidationError class,
for every message is message,
if message is not an instance of the ValidationError class,
message is an instance of the ValidationError class created with an argument message.
substitute message for self.message.
substitute error.message for message.
divide message by error.params, substitute the residue of the division for message.
substitute StringIO for stream_class if content is an instance of six.text_type, otherwise substitute BytesIO for stream_class.
substitute length of content for self.size.
substitute the settings.MEDIA_ROOT for location.
substitute settings.MEDIA_URL for base_url.
mode is a string 'wb' is chunk is instance of the type bytes, otherwise mode is a string 'wt'.
append entry to the directories list.
append entry to files list.
call the import_string with argument: import_path if exists or settings.DEFAULT_FILE_STORAGE, if not, return the result.
if length of name is greater than integer 255,
subtract length of the ext from the integer 255, take first elements of the name list up to the previous result index,
if settings.FILE_UPLOAD_TEMP_DIR is true,
call the __init__ method from the base class of the SimpleUploadedFile class with 7 arguments: BytesIO(content), None, name,
if content_length is greater than settings.FILE_UPLOAD_MAX_MEMORY_SIZE,
for every middleware_path in settings.MIDDLEWARE_CLASSES:
append mw_instance.process_request at the end of request_middleware.
append mw_instance.process_view at the end of self._view_middleware.
substitute settings.ROOT_URLCONF for urlconf.
if callback is of instance types.FunctionType,
and 'request with value request, respectively use the previous as the arguments for the call to the logger.warning function. if settings.DEBUG is True,
'status code' with integer value of 400 and 'request' with value request, respectively use the previous to call the function security_logger.error. if settings.DEBUG is True,
if settings.DEBUG_PROPAGATE_EXCEPTIONS is True,
extra is an dictionary with two pairs of elements 'status_code' with value 500, and 'request with value request, respectively use the previous as the arguments for the call to the logger.error function. if settings.DEBUG is True,
decrement self.remaining by length of result.
convert 'Set-Cookie' to a string and convert the return value of method c.output(header='') call to a string,
if settings.FORCE_SCRIPT_NAME is not None,
evaluate the force_text function with settings.FORCE_SCRIPT_NAME as argument, return the result.
take all but the length of path_info last elements of script_url, substitute it for script_name.
call the function import_string with backend if exists or settings.EMAIL_BACKEND if not, store the result into klass.
password set to auth_password, fail_silently set to fail_silently as arguments, assign the result to connection. call the EmailMessage function wit subject, message, sender, recipient and connection set to connection,
if settings.ADMINS is false,
skip this loop iteration.
skip this loop iteration.
skip this loop iteration.
skip this loop iteration.
skip this loop iteration.
skip this loop iteration.
skip this loop iteration.
skip this loop iteration.
skip this loop iteration,
skip this loop execution.
skip this loop iteration.
skip this loop iteration.
skip this loop iteration.
skip this loop iteration.
skip this loop iteration.
skip this loop iteration,
skip this loop iteration.
continue with the next iteration of the for loop.
skip this loop iteration.
skip this loop iteration,
skip this loop iteration.
skip this loop iteration,
skip this loop iteration,
skip this loop iteration,
skip this loop iteration.
skip this loop iteration.
skip this loop iteration.
skip this loop iteration.
skip this loop iteration,
skip this loop iteration.
skip this loop iteration.
skip this loop iteration.
skip this loop iteration.
decorator function register.filter with 2 arguments expects_localtime set to boolean True and is_safe set to boolean False.
decorator function register.filter with 2 arguments expects_localtime set to boolean True and is_safe set to boolean False.
decorator function register.filter with an argument is_safe set to boolean True.
decorator function register.filter with 2 arguments string "phone2numeric" and is_safe set to boolean True.
decorator function register.filter with an argument is_safe set to boolean True.
call the method var.resolve with 2 arguments: context and True, substitute the result for value.
call the method self.sequence.resolve with 2 arguments: context and boolean True, substitute the result for values.
pop_context is boolean True.
call the method self.var1.resolve with 2 arguments: context and boolean True, substitute the result for val2.
call the method self.var2.resolve with 2 arguments: context and boolean True, substitute the result for val2.
match is boolean True.
call the method self.expression.resolve with 2 arguments: context and boolean True, return the result.
call the method self.target.resolve with 2 arguments: context and boolean True, substitute it for obj_list.
return boolean True.
where '%s' is replaced with last element of args. as_form is boolean True
silent is boolean True.
as_form is boolean True.
call the method self.value.resolve with 2 arguments context and ignore_failures set to boolean True, return the result.
parsed is boolean True.
must_be_first is boolean True.
value is boolean True.
is_usable is boolean True.
is_usable is boolean True.
is_usable is boolean True.
load_template_source.is_usable is boolean True.
self._is_rendered is boolean True.
BM_compatible is boolean True.
supported is boolean True.
return boolean True.
USE_INOTIFY is boolean True.
RUN_RELOADER is boolean True.
pyinotify.IN_MOVED_FROM, pyinotify.IN_MOVED_TO and pyinotify.IN_CREATE, store the result in mask. for every path in return value of the call to the function gen_filenames with argument only_new set to boolean True,
endless loop,
value under the "RUN_MAIN" key of new_environ dictionary is a string 'true'.
if value under the "RUN_MAIN" key of the os.environ dictionary equals to string "true",
endless loop,
return first element of t converted to lowercase and boolean True, as a tuple.
if second element of t is boolean True,
return tuple, consisting of first element of t converted to lowercase and True.
using_sysrandom is boolean True.
return boolean True.
use_func is boolean True.
return boolean True.
call the method self.configure_logger with 3 arguments: name, value under name key of loggers dictionary and boolean True.
call the method self.configure_root with 2 arguments: root and boolean True.
logger.propagate is boolean True.
logger.disabled is boolean True.
author_name set to None, author_link set to None, subtitle set to None, categories set to None, feed_url set to None, feed_copyright set to None, feed_guid set to None, ttl set to None and unpacked dictionary kwargs. to_unicode is an lambda function with an argument s, result is the result of the function force_text,
to_unicode is an lambda function with an argument s, result is the result of the function force_text,
call the function formatdate with 2 arguments: epoch_seconds and usegmt set to boolean True, return the result.
endless loop,
return boolean True.
return boolean True.
return boolean True.
return boolean True.
call the method logging.captureWarnings with an argument boolean True.
call the function get_connection with 2 arguments: backend set to self.email_backend and fail_silently set to boolean True.
return boolean True.
return boolean True.
return boolean True.
return boolean True.
return boolean True.
consume_next is boolean True.
endless loop,
consume_next is boolean True.
return boolean True.
return boolean True.
want_unicode is boolean True.
want_unicode is boolean True.
want_unicode is boolean True.
call the method text.splitlines with an argument boolean True, for every line in the result,
evaluate the function timesince with d, now and reversed set to boolean true as arguments, return the result.
check_for_language is lambda function that returns boolean True for every argument x.
substitute it for localedir. use_null_fallback is boolean True.
to_locale called with an argument lang_code, if the result is not None, return boolean True.
inplural is boolean True.
intrans is boolean True.
incomment is boolean True.
where '%s' is replaced with name. if name is not contained in parser._namedCycleNodes,
get the index name, of the parser._namedCycleNodes, return the result.
call the function parser.compile_filter with an argument arg for every arg in args from the index 1 to the index -2,
if parser doesnt have an '_namedCycleNodes' attribute,
parser._namedCycleNodes is an empty dictionary.
substituet node for value under the name key of the parser._namedCycleNodes dictionary.
call the function parser.compile_filter with an argument arg for every arg in args from first element to the end,
define the function csrf_token with 2 arguments: parser and token.
define the function debug with 2 arguments: parser and token.
define the function do_filter with 2 arguments: parser and token.
call the method parser.compile_filter with an argument string "var|%s", where '%s' is replaced with rest, substitute the result for filter_expr.
where '%s' is replaced with filter_name. call the method parser.parse with an argument tuple with an element string 'endfilter', substitute the result for nodelist.
call the method parser.delete_first_token.
define the function firstof with 2 arguments: parser and token.
return an instance of FirstOfNode class, created with an argument, result of the method parser.compile_filter called with bit,
define the function do_for with 2 arguments: parser and token.
where '%s' is replaced with token.contents. call the method parser.compile_filter with an argument, in_index incremented by one, and used to index parser.compile_filter,
substitute the result for sequence. call the method parser.parse with an argument tuple with 2 elements strings 'empty' and 'endfor', substitute the result for nodelist_loop.
call the method parser.next_token, substitute the result for token.
call the method parser.parse with an argument tuple with an element string 'endfor', substitute the result for nodelist_empty.
call the method parser.delete_first_token.
define the function do_ifequal with 2 arguments: parser, token and negate.
call the method parser.parse with an argument tuple with an element string 'else' and end_tag, substitute the result for nodelist_true.
call the method parser.next_token, substitute the result for token.
call the method parser.parse with an argument tuple with an element end_tag, substitute the result for nodelist_false.
call the method parser.delete_first_token.
call the method parser.compile_filter with an argument, second element of bits, substitute the result for val1.
call the method parser.compile_filter with an argument, third element of bits, substitute the result for val2.
define the function ifequal with 2 arguments: parser and token.
call the function do_ifequal with 3 arguments: parser, token and boolean False, return the result.
define the function ifnotequal with 2 arguments: parser and token.
call the function do_ifequal with 3 arguments: parser, token and boolean False, return the result.
substitute parser for self.template_parser.
define the function do_if with 2 arguments: parser and token.
call the method parse from the instance of TemplateIfParser class, created with 2 arguments parser and bits, substitute the result for condition.
call the method parser.next_token, substitute the result for token.
call the method parse from the instance of TemplateIfParser class, created with 2 arguments parser and bits, substitute the result for condition.
call the method parser.next_token, substitute the result for token.
call the method parser.parse with an argument tuple with an element string 'endif', substitute the result for nodelist.
call the method parser.next_token, substitute the result for token.
define the function ifchanged with 2 arguments: parser and token.
call the method parser.parse with an argument tuple with 2 elements strings: 'else' and 'endfilter', substitute the result for nodelistelse.
call the method parser.next_token, substitute the result for token.
call the method parser.parse with an argument tuple with an element string 'endifchanged', substitute the result for nodelist_false.
call the method parser.delete_first_token.
for every bit in elements of bits without the first element, call the method parser.compile_filter, with an argument bit,
define the function ssi with 2 arguments: parser and token.
where '%s' is replaced with first element of bits. call the method parser.compile_filter with second element of bits as an argument, substitute the result for filepath.
define the function load with 2 arguments: parser and token.
where '%s' is replaced with name and taglib. call the method parser.add_library with an argument temp_lib.