Source code for libpyhat.utils.folds

import numpy as np
from sklearn import model_selection


# This function assigns spectra to folds randomly, but keeps spectra with
# the same value in a user-defined column together.
# This ensures that multiple spectra of the same target end up in the same fold


def random(df, col, nfolds=5, seed=10, meta_label="meta"):
    """Assign every row of ``df`` to a fold, keeping groups together.

    Rows that share a value in ``col`` are passed to scikit-learn's
    ``GroupKFold`` as one group, so they always receive the same fold number.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split. It is modified in place: the column
        ``(meta_label, "Folds")`` is created or overwritten.
    col : column label
        Column whose values define the groups.
    nfolds : int, optional
        Number of folds, forwarded to ``GroupKFold`` as ``n_splits``.
    seed : int, optional
        Accepted for API compatibility; this function does not read it.
    meta_label : str, optional
        First element of the ``(meta_label, "Folds")`` column key.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with fold numbers ``1..nfolds`` stored in
        ``(meta_label, "Folds")``.
    """
    fold_column = (meta_label, "Folds")

    # Seed the column with a placeholder string, then pull it out as an array
    # that can be filled using the positional indices GroupKFold yields.
    df[fold_column] = "None"
    assignments = np.array(df[fold_column])

    splitter = model_selection.GroupKFold(n_splits=nfolds)
    splits = splitter.split(df, groups=df[col])

    # Only the held-out (test) rows of each split are needed: every row is in
    # the test set of exactly one split, and that split's number is its fold.
    for fold_number, (_, held_out) in enumerate(splits, start=1):
        assignments[held_out] = fold_number

    df[fold_column] = assignments
    return df
# This function divides the data up into a specified number of folds, using
# sorting to try to get folds that look similar to each other.
# This function keeps spectra with the same value in a user-defined column
# together.
# This ensures that multiple spectra of the same target end up in the same fold
def _is_nan(value):
    """Return True if ``value`` is a floating-point NaN, False otherwise."""
    try:
        return bool(np.isnan(value))
    except TypeError:
        # np.isnan rejects non-numeric scalars (e.g. strings); such values
        # are never NaN.
        return False


def _next_fold(fold_num, nfolds):
    """Return the fold after ``fold_num``, wrapping to 1 once past ``nfolds``."""
    fold_num = fold_num + 1
    return 1 if fold_num > nfolds else fold_num


def stratified_folds(df, nfolds=5, sortby=None, tiebreaker=None, meta_label="meta"):
    """Assign rows to folds by cycling through the sorted values of a column.

    The unique values of ``sortby`` are visited in ascending order and given
    fold numbers 1, 2, ..., ``nfolds``, 1, 2, ... in turn, so each fold
    samples the whole range of the sort column. Rows sharing a ``sortby``
    value share a fold unless ``tiebreaker`` is given; in that case each
    distinct ``tiebreaker`` value within the group gets the next fold number.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split. It is modified in place: the column
        ``(meta_label, "Folds")`` is created or overwritten, and the frame is
        left sorted by its index.
    nfolds : int, optional
        Number of folds to cycle through.
    sortby : column label
        Column whose values drive the fold assignment. Required in practice,
        despite the ``None`` default.
    tiebreaker : column label, optional
        Column used to spread rows with the same ``sortby`` value across
        different folds.
    meta_label : str, optional
        First element of the ``(meta_label, "Folds")`` column key.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. Rows whose ``sortby`` value is NaN get fold
        ``-1``; all other rows get a fold number in ``1..nfolds``.
    """
    fold_col = (meta_label, "Folds")

    # Use np.nan: the np.NaN alias was removed in NumPy 2.0.
    df[fold_col] = np.nan

    print("Sort by:" + str(sortby))
    if tiebreaker is not None:
        print("Tiebreaker:" + str(tiebreaker))

    # Sort the data frame by the column of interest.
    df.sort_values(by=sortby, inplace=True)
    sort_col = df[sortby]
    tie_col = df[tiebreaker] if tiebreaker is not None else None

    # Assign folds by stepping through the unique values. Rows are selected
    # with positional boolean masks so duplicate index labels cannot leak a
    # fold assignment onto unrelated rows.
    fold_num = 1
    for value in np.unique(sort_col):
        if _is_nan(value):
            # Rows with no usable sorting value are flagged, not assigned.
            df.loc[sort_col.isna().to_numpy(), fold_col] = -1
            continue

        in_group = (sort_col == value).to_numpy()

        if tie_col is None:
            df.loc[in_group, fold_col] = fold_num
            fold_num = _next_fold(fold_num, nfolds)
            continue

        for tie_value in np.unique(tie_col[in_group]):
            if _is_nan(tie_value):
                # A NaN tiebreaker is still usable as its own sub-group.
                tie_match = tie_col.isna().to_numpy()
            else:
                tie_match = (tie_col == tie_value).to_numpy()
            df.loc[in_group & tie_match, fold_col] = fold_num
            fold_num = _next_fold(fold_num, nfolds)

    # Sort by index so the rows are no longer in sort-column order.
    df.sort_index(inplace=True)
    return df