# Source code for libpyhat.utils.folds

import numpy as np
from sklearn import model_selection


# This function assigns spectra to folds randomly, but keeps spectra with
# the same value in a user-defined column together.
# This ensures that multiple spectra of the same target end up in the same fold



[docs]
def random(df, col, nfolds=5, seed=10, meta_label="meta"):
    """Assign each row of ``df`` to a fold, grouping rows by ``df[col]``.

    Fold numbers start at 1 and are written to the ``(meta_label, "Folds")``
    column. The split is produced by ``model_selection.GroupKFold`` with the
    values of ``df[col]`` passed as the groups.

    Parameters
    ----------
    df : data frame to label; it is modified in place and also returned.
    col : column key whose values are used as the grouping labels.
    nfolds : number of splits requested from ``GroupKFold``.
    seed : accepted for interface compatibility.
        NOTE(review): this value is never read in the function body, so it
        has no effect on the fold assignment -- confirm whether that is
        intended.
    meta_label : top-level column label under which "Folds" is stored.

    Returns
    -------
    The same ``df`` object with the "Folds" column filled in.
    """
    fold_col = (meta_label, "Folds")

    # Seed the column with a placeholder, then pull it out as an array that
    # can be filled by positional index.
    df[fold_col] = "None"
    assignments = np.array(df[fold_col])

    splitter = model_selection.GroupKFold(n_splits=nfolds)
    splits = splitter.split(df, groups=df[col])
    # Only the held-out indices of each split matter: they define the fold.
    for fold_id, (_, held_out) in enumerate(splits, start=1):
        assignments[held_out] = fold_id

    df[fold_col] = assignments
    return df



# This function divides the data into a specified number of folds, using
# sorting to try to get folds that look similar to each other.
# Spectra that share the same value in the user-defined sort column are kept
# together. This ensures that multiple spectra of the same target end up in
# the same fold.

[docs]
def stratified_folds(df, nfolds=5, sortby=None, tiebreaker=None, meta_label="meta"):
    """Assign rows of ``df`` to folds by cycling through sorted values.

    The unique values of ``df[sortby]`` are visited in ascending order and
    each one is given the next fold number (1, 2, ..., ``nfolds``, 1, ...),
    so rows sharing a ``sortby`` value land in the same fold. When
    ``tiebreaker`` is given, rows that share a ``sortby`` value are further
    split by their unique ``tiebreaker`` values, each of which takes the next
    fold number in the cycle.

    Parameters
    ----------
    df : data frame to label; it is modified in place and also returned.
    sortby : column key whose values drive the fold assignment.
    nfolds : number of folds to cycle through.
    tiebreaker : optional column key used to split rows with equal
        ``sortby`` values.
    meta_label : top-level column label under which "Folds" is stored.

    Returns
    -------
    The same ``df`` object with ``(meta_label, "Folds")`` filled in. Rows
    whose ``sortby`` value is NaN get fold -1. The frame is sorted by its
    index before being returned, which matches the input order only when the
    index was already sorted on entry.
    """
    fold_col = (meta_label, "Folds")

    # Use np.nan: the np.NaN alias was removed in NumPy 2.0 and raises
    # AttributeError there.
    df[fold_col] = np.nan

    print("Sort by:" + str(sortby))
    if tiebreaker is not None:
        print("Tiebreaker:" + str(tiebreaker))

    # Sort the data frame by the column of interest.
    df.sort_values(by=sortby, inplace=True)

    # Assign folds by stepping through the unique values of the sort column.
    fold_num = 1
    for value in np.unique(df[sortby]):
        if np.isnan(value):
            # Rows with no sort value cannot be stratified; flag them with -1
            # and leave the fold counter untouched.
            missing = np.isnan(df[sortby])
            df.loc[df.index[missing], fold_col] = -1
            continue

        # Rows of the data frame that match this unique value.
        in_group = df[sortby] == value

        if tiebreaker is None:
            df.loc[df.index[in_group], fold_col] = fold_num
            fold_num = _next_fold(fold_num, nfolds)
            continue

        for tie_value in np.unique(df[tiebreaker][in_group]):
            if np.isnan(tie_value):
                # A NaN tiebreaker is still usable as its own sub-group.
                tie_match = np.isnan(df[tiebreaker])
            else:
                tie_match = df[tiebreaker] == tie_value

            both = np.all([in_group, tie_match], axis=0)
            df.loc[df.index[both], fold_col] = fold_num
            fold_num = _next_fold(fold_num, nfolds)

    # Sort by index before returning the frame.
    df.sort_index(inplace=True)
    return df


def _next_fold(fold_num, nfolds):
    # Increment the fold number, resetting to 1 once it exceeds nfolds.
    fold_num = fold_num + 1
    if fold_num > nfolds:
        return 1
    return fold_num