import numpy as np
from sklearn import model_selection
# This function assigns spectra to folds randomly, but keeps spectra with
# the same value in a user-defined column together.
# This ensures that multiple spectra of the same target end up in the same fold
def random(df, col, nfolds=5, seed=10, meta_label="meta"):
    """Label every row of ``df`` with a fold number, keeping groups together.

    Rows are split with scikit-learn's ``GroupKFold`` using the values of
    ``df[col]`` as the groups, so rows sharing a value in ``col`` receive the
    same fold number.  Folds are numbered from 1 in the order the splitter
    yields them.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to label.  It is modified in place: the column
        ``(meta_label, "Folds")`` is created or overwritten.
    col : column label
        Column whose values define the groups.
    nfolds : int, default 5
        Number of folds, passed to ``GroupKFold`` as ``n_splits``.
    seed : int, default 10
        Not used by this function; kept in the signature for existing callers.
    meta_label : str, default "meta"
        First element of the column key under which the folds are stored.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the fold column filled in.
    """
    fold_key = (meta_label, "Folds")

    # Start from a placeholder column of the string "None" and pull it out as
    # an array so it can be filled using the splitter's positional indices.
    df[fold_key] = "None"
    assignments = np.array(df[fold_key])

    splitter = model_selection.GroupKFold(n_splits=nfolds)
    numbered_splits = enumerate(splitter.split(df, groups=df[col]), start=1)
    for fold_number, (_, held_out) in numbered_splits:
        # The held-out ("test") positions of each split form one fold.
        assignments[held_out] = fold_number

    df[fold_key] = assignments
    return df
# This function divides the data up into a specified number of folds, using
# sorting to try to get folds that look similar to each other.
# This function keeps spectra with the same value in a user-defined column
# together.
# This ensures that multiple spectra of the same target end up in the same fold
def stratified_folds(df, nfolds=5, sortby=None, tiebreaker=None, meta_label="meta"):
    """Assign rows of ``df`` to folds by cycling through sorted unique values.

    The unique values of the ``sortby`` column are visited in ascending order.
    Without a ``tiebreaker``, all rows sharing a ``sortby`` value get the
    current fold number, and the fold counter then advances (1, 2, ...,
    ``nfolds``, 1, 2, ...).  With a ``tiebreaker``, the rows of each ``sortby``
    value are further split by their unique ``tiebreaker`` values, and the
    counter advances once per (``sortby``, ``tiebreaker``) combination.

    Rows whose ``sortby`` value is NaN are assigned fold ``-1`` and do not
    advance the counter.  Rows whose ``tiebreaker`` value is NaN are treated
    as one ordinary tiebreaker group.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to label.  It is modified in place: the column
        ``(meta_label, "Folds")`` is created or overwritten, and the rows are
        sorted by ``sortby`` and finally by index.
    nfolds : int, default 5
        Number of folds to cycle through.
    sortby : column label
        Column whose values drive the assignment.  Must be supplied; the
        default of ``None`` is not a usable sort key.
    tiebreaker : column label, optional
        Column used to split rows that share a ``sortby`` value.
    meta_label : str, default "meta"
        First element of the column key under which the folds are stored.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, sorted by index, with the fold column filled.
        This matches the caller's original row order only if the index was
        already sorted on entry.
    """
    fold_col = (meta_label, "Folds")

    # Use np.nan: the np.NaN alias was removed in NumPy 2.0.
    df[fold_col] = np.nan

    print("Sort by:" + str(sortby))
    if tiebreaker is not None:
        print("Tiebreaker:" + str(tiebreaker))

    # Sort the data frame by the column of interest, then collect its unique
    # values (np.unique returns them in ascending order).
    df.sort_values(by=sortby, inplace=True)
    uniqvals = np.unique(df[sortby])

    # Rows are selected with plain boolean arrays rather than index labels so
    # that a data frame with duplicate index labels only has the matching
    # rows updated.
    fold_num = 1
    for i in uniqvals:
        if np.isnan(i):
            # Rows with no sorting value are flagged with -1, not a real fold.
            nan_rows = np.asarray(np.isnan(df[sortby]))
            df.loc[nan_rows, fold_col] = -1
            continue

        ind = np.asarray(df[sortby] == i)

        # Work out which groups of rows each receive their own fold number.
        if tiebreaker is None:
            row_groups = [ind]
        else:
            row_groups = []
            for j in np.unique(df[tiebreaker][ind]):
                if np.isnan(j):
                    # A NaN tiebreaker can still be used as a group.
                    ind2 = np.isnan(df[tiebreaker])
                else:
                    ind2 = df[tiebreaker] == j
                row_groups.append(ind & np.asarray(ind2))

        for rows in row_groups:
            df.loc[rows, fold_col] = fold_num
            # Increment the fold number, reset to 1 if it is greater than
            # the desired number of folds.
            fold_num = fold_num + 1
            if fold_num > nfolds:
                fold_num = 1

    # Sort by index to undo the value sort above.
    df.sort_index(inplace=True)
    return df