I am trying to implement the k-fold cross-validation algorithm in python. I know SKLearn provides an implementation but still... This is my code as of right now.
from sklearn import metrics import numpy as np class Cross_Validation: @staticmethod def partition(vector, fold, k): size = vector.shape[0] start = (size/k)*fold end = (size/k)*(fold+1) validation = vector[start:end] if str(type(vector)) == "<class 'scipy.sparse.csr.csr_matrix'>": indices = range(start, end) mask = np.ones(vector.shape[0], dtype=bool) mask[indices] = False training = vector[mask] elif str(type(vector)) == "<type 'numpy.ndarray'>": training = np.concatenate((vector[:start], vector[end:])) return training, validation @staticmethod def Cross_Validation(learner, k, examples, labels): train_folds_score = [] validation_folds_score = [] for fold in range(0, k): training_set, validation_set = Cross_Validation.partition(examples, fold, k) training_labels, validation_labels = Cross_Validation.partition(labels, fold, k) learner.fit(training_set, training_labels) training_predicted = learner.predict(training_set) validation_predicted = learner.predict(validation_set) train_folds_score.append(metrics.accuracy_score(training_labels, training_predicted)) validation_folds_score.append(metrics.accuracy_score(validation_labels, validation_predicted)) return train_folds_score, validation_folds_score The learner parameter is a classifier from SKlearn library, k is the number of folds, examples is a sparse matrix produced by the CountVectorizer (again SKlearn) that is the representation of the bag of words. For example:
from sklearn.feature_extraction.text import CountVectorizer from sklearn.naive_bayes import MultinomialNB from Cross_Validation import Cross_Validation as cv vectorizer = CountVectorizer(stop_words='english', lowercase=True, min_df=2, analyzer="word") data = vectorizer.fit_transform("""textual data""") clfMNB = MultinomialNB(alpha=.0001) score = cv.Cross_Validation(clfMNB, 10, data, labels) print "Train score" + str(score[0]) print "Test score" + str(score[1]) I'm assuming there is some logic error somewhere since the scores are 95% on the training set (as expected) but practically 0 on the test test, but I can't find it.
I hope I was clear. Thanks in advance.
________________________________EDIT___________________________________
This is the code that loads the text into the vector that can be passed to the vectorizer. It also returns the label vector.
from nltk.tokenize import word_tokenize from Categories_Data import categories import numpy as np import codecs import glob import os import re class Data_Preprocessor: def tokenize(self, text): tokens = word_tokenize(text) alpha = [t for t in tokens if unicode(t).isalpha()] return alpha def header_not_fully_removed(self, text): if ":" in text.splitlines()[0]: return len(text.splitlines()[0].split(":")[0].split()) == 1 else: return False def strip_newsgroup_header(self, text): _before, _blankline, after = text.partition('\n\n') if len(after) > 0 and self.header_not_fully_removed(after): after = self.strip_newsgroup_header(after) return after def strip_newsgroup_quoting(self, text): _QUOTE_RE = re.compile(r'(writes in|writes:|wrote:|says:|said:'r'|^In article|^Quoted from|^\||^>)') good_lines = [line for line in text.split('\n') if not _QUOTE_RE.search(line)] return '\n'.join(good_lines) def strip_newsgroup_footer(self, text): lines = text.strip().split('\n') for line_num in range(len(lines) - 1, -1, -1): line = lines[line_num] if line.strip().strip('-') == '': break if line_num > 0: return '\n'.join(lines[:line_num]) else: return text def raw_to_vector(self, path, to_be_stripped=["header", "footer", "quoting"], noise_threshold=-1): base_dir = os.getcwd() train_data = [] label_data = [] for category in categories: os.chdir(base_dir) os.chdir(path+"/"+category[0]) for filename in glob.glob("*"): with codecs.open(filename, 'r', encoding='utf-8', errors='replace') as target: data = target.read() if "quoting" in to_be_stripped: data = self.strip_newsgroup_quoting(data) if "header" in to_be_stripped: data = self.strip_newsgroup_header(data) if "footer" in to_be_stripped: data = self.strip_newsgroup_footer(data) if len(data) > noise_threshold: train_data.append(data) label_data.append(category[1]) os.chdir(base_dir) return np.array(train_data), np.array(label_data) This is what "from Categories_Data import categories" imports...
categories = [ ('alt.atheism',0), ('comp.graphics',1), ('comp.os.ms-windows.misc',2), ('comp.sys.ibm.pc.hardware',3), ('comp.sys.mac.hardware',4), ('comp.windows.x',5), ('misc.forsale',6), ('rec.autos',7), ('rec.motorcycles',8), ('rec.sport.baseball',9), ('rec.sport.hockey',10), ('sci.crypt',11), ('sci.electronics',12), ('sci.med',13), ('sci.space',14), ('soc.religion.christian',15), ('talk.politics.guns',16), ('talk.politics.mideast',17), ('talk.politics.misc',18), ('talk.religion.misc',19) ]