Source code for libpyhat.regression.cv

import copy
import warnings

import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from sklearn.model_selection import LeaveOneGroupOut

from libpyhat.regression import local_regression
from libpyhat.regression.regression import regression

# Suppressing warning from regression.
# TODO: Determine if this is really necessary.
warnings.filterwarnings("ignore")


[docs] def RMSE(ypred, y): return np.sqrt(np.mean((np.squeeze(ypred) - np.squeeze(y)) ** 2))
[docs] def cv_core(i, paramgrid, Train, xcols, ycol, method, yrange, meta_label="meta"): train_temp = copy.deepcopy(Train) try: train_temp = train_temp.drop("predict", axis=1) except: pass print("Permutation " + str(i + 1) + " of " + str(len(paramgrid))) paramstring = "" for key in paramgrid[i].keys(): paramstring = paramstring + key + ": " + str(paramgrid[i][key]) + "; " print(paramstring[:-2]) try: # create an iterator for cross validation based on the predefined folds cv_iterator = LeaveOneGroupOut().split( train_temp[xcols], train_temp[ycol], train_temp[(meta_label, "Folds")] ) n_folds = LeaveOneGroupOut().get_n_splits( groups=train_temp[(meta_label, "Folds")] ) except KeyError: print( "***No folds found! Did you remember to define folds before " "running cross validation?***" ) return 0 # create an empty output data frame to serve as template output_tmp = pd.DataFrame() # add columns for RMSEC, RMSECV, and RMSE for the folds output_tmp["RMSEC"] = 0 output_tmp["RMSECV"] = 0 # for f in np.array(range(n_folds)) + 1: for f in np.array(range(n_folds)) + 1: output_tmp["Fold " + str(f)] = 0 # fill in the output template based on the current permutation parameters for k in paramgrid[i].keys(): output_tmp.at[0, k] = paramgrid[i][k] output_tmp["Method"] = method cvcols = None foldcount = 1 for train, holdout in cv_iterator: # Iterate through each of the folds # in the training set cv_train = train_temp.iloc[ train ] # extract the data to be used to create the model cv_holdout = train_temp.iloc[ holdout ] # extract the data that will be held out of the model if method == "Local Regression": params = paramgrid[i] try: # on the first pass, pop off the n_neigbors parameter so it # can be passed correctly n_neighbors = params["n_neighbors"] params.pop("n_neighbors") verbose = params["verbose"] params.pop("verbose") except: pass cvcols = [ ( "predict", '"' + method + "- CV -" + str(paramgrid[i]) + " n_neighbors: " + str(n_neighbors) + '"', ) ] model = local_regression.LocalRegression( params, n_neighbors=n_neighbors, verbose=verbose ) y_pred_holdout, coeffs, intercepts = model.fit_predict( cv_train[xcols], cv_train[ycol], cv_holdout[xcols] ) else: cvcols = [("predict", '"' + method + "- CV -" + str(paramgrid[i]) + '"')] # fit the model and predict the held-out data model = regression([method], [paramgrid[i]]) model.fit(cv_train[xcols], cv_train[ycol]) if model.goodfit: y_pred_holdout = model.predict(cv_holdout[xcols]) else: y_pred_holdout = cv_holdout[ycol] * np.nan # add the predictions to the appropriate column in the training data train_temp.loc[train_temp.index[holdout], cvcols[0]] = y_pred_holdout # append the RMSECV to the list output_tmp["Fold " + str(foldcount)] = RMSE(y_pred_holdout, cv_holdout[ycol]) pass foldcount = foldcount + 1 # now that all the folds have been held out and predicted, calculate the # overall rmsecv and add it to the output rmsecv = [] if cvcols is not None: for col in cvcols: rmsecv.append(RMSE(train_temp[col], train_temp[ycol])) predictkeys = [col[-1]] output_tmp["RMSECV"] = rmsecv if method == "Local Regression": model = local_regression.LocalRegression( paramgrid[i], n_neighbors=n_neighbors, verbose=verbose ) modelkey = "{} - {} - ({}, {}) {} n_neighbors: {}".format( method, ycol[-1], yrange[0], yrange[1], paramgrid[i], n_neighbors ) else: model = regression([method], [paramgrid[i]]) modelkey = "{} - {} - ({}, {}) {}".format( method, ycol[-1], yrange[0], yrange[1], paramgrid[i] ) ypred_train = train_temp[ycol] * np.nan if method == "Local Regression": ypred_train, coeffs, intercepts = model.fit_predict( train_temp[xcols], train_temp[ycol], train_temp[xcols] ) else: model.fit(train_temp[xcols], train_temp[ycol]) # if the fit is good, then predict the training set if model.goodfit: ypred_train = model.predict(train_temp[xcols]) else: model = None modelkey = None # add the calibration predictions to the appropriate column if method == "Local Regression": calcol = ( "predict", '"' + method + "- Cal -" + str(paramgrid[i]) + " n_neighbors: " + str(n_neighbors) + '"', ) else: calcol = ("predict", '"' + method + "- Cal -" + str(paramgrid[i]) + '"') predictkeys.append(calcol[-1]) train_temp[calcol] = ypred_train # append the RMSEC for the current settings to the cllection of all RMSECs output_tmp["RMSEC"] = RMSE(ypred_train, train_temp[ycol]) output = output_tmp return output, model, modelkey, predictkeys, train_temp["predict"]
[docs] class cv: def __init__(self, paramgrid, meta_label="meta"): self.paramgrid = paramgrid self.meta_label = meta_label
[docs] def do_cv( self, Train, xcols="wvl", ycol=("comp", "SiO2"), method="PLS", yrange=None, n_jobs=-1, ): if yrange is None: yrange = [np.min(Train[ycol]), np.max(Train[ycol])] args = list(range(len(self.paramgrid))) results = Parallel(n_jobs=n_jobs)( delayed(cv_core)( i, self.paramgrid, Train, xcols, ycol, method, yrange, self.meta_label ) for i in args ) models = [] modelkeys = [] predictkeys = [] output = pd.DataFrame() for i in results: if i != 0: output = pd.concat((output, i[0])) if i[1] is not None: models.append(i[1]) if i[2] is not None: modelkeys.append(i[2]) if i[3] is not None: for j in i[3]: predictkeys.append(j) if i[4] is not None: # TODO: This seems broken. cv_predicts not defined previously. try: cv_predicts = pd.merge( cv_predicts, i[4], left_index=True, right_index=True ) except: cv_predicts = i[4] cv_predicts.columns = [("predict", i) for i in cv_predicts.columns] Train = pd.concat((Train, cv_predicts), axis=1) # make the columns of the output data frame multi-indexed cols = output.columns.values cols = [("cv", i) for i in cols] output.columns = pd.MultiIndex.from_tuples(cols) return Train, output, models, modelkeys, predictkeys