```python
import textmining as txtm
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer

# Train
tdm = txtm.TermDocumentMatrix()
for doc in x_train:
    tdm.add_doc(doc)

# Push the TDM data to a list of lists, then make that an ndarray,
# which then becomes a DataFrame.
tdm_rows = []
for row in tdm.rows(cutoff=3):  # cutoff=3 keeps only words that appear in at least 3 documents
    tdm_rows.append(row)

tdm_array = np.array(tdm_rows[1:])
tdm_terms = tdm_rows[0]
TDM_df_train = pd.DataFrame(tdm_array, columns=tdm_terms)
TDM_df_train = TDM_df_train.reindex(columns=sorted(TDM_df_train.columns))  # Sort the columns alphabetically

# Test
tdm = txtm.TermDocumentMatrix()
for doc in x_test:
    tdm.add_doc(doc)

tdm_rows = []
for row in tdm.rows(cutoff=3):
    tdm_rows.append(row)

tdm_array = np.array(tdm_rows[1:])
tdm_terms = tdm_rows[0]
TDM_df_test = pd.DataFrame(tdm_array, columns=tdm_terms)

# Align the test matrix with the training vocabulary: reindexing on the
# sorted training columns drops words that are not in TDM_df_train and
# fills the missing ones with 0.
TDM_df_test = TDM_df_test.reindex(columns=sorted(TDM_df_train.columns), fill_value=0)

tfidf = TfidfTransformer()
tfidfRedTrain = tfidf.fit_transform(TDM_df_train.values)
tfidfRedTest = tfidf.transform(TDM_df_test.values)  # transform only, reusing the IDF weights fitted on the training set
```

This works perfectly fine with small data sets, but now I have a new problem: can this be done more efficiently? With another dataset of 2,000 documents it just keeps running and never finishes. Does anybody know a way to do this in Scala, Spark, Hadoop, or something else that is faster? Or can you recommend where to look?
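For reference, here is the direction I am considering on the Python side: a minimal sketch using scikit-learn's built-in `TfidfVectorizer`, which keeps everything in sparse matrices instead of dense DataFrames and replaces the whole TDM/alignment dance above. I am assuming `min_df=3` is the equivalent of `cutoff=3`, and that `x_train`/`x_test` are the same lists of raw document strings as above:

```python
from sklearn.feature_extraction.text import TfidfVectorizer

# min_df=3 plays the role of cutoff=3: keep only terms that appear
# in at least 3 training documents.
vectorizer = TfidfVectorizer(min_df=3)

# Fit the vocabulary and IDF weights on the training set only...
tfidfRedTrain = vectorizer.fit_transform(x_train)

# ...then reuse them on the test set; the columns automatically line
# up with the training vocabulary, and unseen words are ignored.
tfidfRedTest = vectorizer.transform(x_test)
```

Both results are scipy sparse matrices, so 2,000 documents should be no problem memory-wise, but I would still like to know about the distributed options.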
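On the Spark side, this is my rough sketch of what I think the equivalent `pyspark.ml` pipeline would look like; it is untested on my end, and the app name and column names (`text`, `words`, `tf`, `tfidf`) are placeholders I made up:

```python
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, CountVectorizer, IDF

spark = SparkSession.builder.appName("tfidf-sketch").getOrCreate()

# One column of raw document text per row.
train_df = spark.createDataFrame([(doc,) for doc in x_train], ["text"])
test_df = spark.createDataFrame([(doc,) for doc in x_test], ["text"])

tokenizer = Tokenizer(inputCol="text", outputCol="words")
# minDF=3.0 mirrors cutoff=3: drop terms seen in fewer than 3 documents.
cv = CountVectorizer(inputCol="words", outputCol="tf", minDF=3.0)
idf = IDF(inputCol="tf", outputCol="tfidf")

# Fit the vocabulary and the IDF weights on the training data only.
train_words = tokenizer.transform(train_df)
cv_model = cv.fit(train_words)
train_tf = cv_model.transform(train_words)
idf_model = idf.fit(train_tf)
tfidf_train = idf_model.transform(train_tf)

# Apply the same fitted models to the test data.
test_words = tokenizer.transform(test_df)
tfidf_test = idf_model.transform(cv_model.transform(test_words))
```

Is something along these lines the right way to go, or is there a better option?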