Source code for libpyhat.regression.cv

import copy
import warnings

import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from sklearn.model_selection import LeaveOneGroupOut

from libpyhat.regression import local_regression
from libpyhat.regression.regression import regression

# Suppressing warning from regression.
# NOTE: filterwarnings("ignore") with no category/module argument silences
# *every* warning, and because it runs at module level it takes effect as
# soon as this module is imported, not only while cross validation runs.
# TODO: Determine if this is really necessary.
warnings.filterwarnings("ignore")



[docs]
def RMSE(ypred, y):
    """Return the root-mean-square error between predictions and references.

    Both inputs are passed through ``np.squeeze`` before being subtracted, so
    length-1 dimensions (e.g. an ``(n, 1)`` prediction array) are removed
    first.

    :param ypred: predicted values (array-like).
    :param y: reference values (array-like).
    :return: square root of the mean squared difference.
    """
    residuals = np.squeeze(ypred) - np.squeeze(y)
    mean_squared_error = np.mean(residuals ** 2)
    return np.sqrt(mean_squared_error)




[docs]
def cv_core(i, paramgrid, Train, xcols, ycol, method, yrange, meta_label="meta"):
    """Cross validate and calibrate one parameter permutation.

    The training data are split with ``LeaveOneGroupOut`` using the
    ``(meta_label, "Folds")`` column as group labels. For every fold a model
    is fit on the remaining rows and used to predict the held-out rows; a
    final model is then fit on all rows.

    :param i: index into ``paramgrid`` of the permutation to evaluate.
    :param paramgrid: sequence of parameter dicts. For
        ``method == "Local Regression"`` the selected dict must contain
        ``"n_neighbors"`` and ``"verbose"``; they are passed to
        ``LocalRegression`` as keyword arguments rather than inside the
        parameter dict. The caller's dict is not modified.
    :param Train: training data frame (copied, never modified). Any existing
        ``"predict"`` columns are dropped from the copy.
        NOTE(review): presumably uses two-level column labels as elsewhere in
        libpyhat -- confirm against callers.
    :param xcols: column key selecting the predictor columns.
    :param ycol: column key of the response; ``ycol[-1]`` is used in the
        model key.
    :param method: regression method name.
    :param yrange: two-element sequence used only to label the model key.
    :param meta_label: top-level column label under which ``"Folds"`` lives.
    :return: ``0`` if the fold/x/y columns cannot be found, otherwise the
        tuple ``(output, model, modelkey, predictkeys, predictions)`` where
        ``output`` is a one-permutation summary frame (RMSEC, RMSECV, per-fold
        RMSE, parameters, method), ``model``/``modelkey`` are ``None`` when
        the final fit is not good, ``predictkeys`` lists the names of the
        prediction columns created, and ``predictions`` is
        ``train_temp["predict"]``.
    :raises KeyError: if Local Regression parameters lack ``"n_neighbors"`` or
        ``"verbose"``.
    """
    train_temp = copy.deepcopy(Train)
    try:
        train_temp = train_temp.drop("predict", axis=1)
    except KeyError:
        # Nothing to discard: Train carries no earlier predictions.
        pass

    print("Permutation " + str(i + 1) + " of " + str(len(paramgrid)))
    print(
        "; ".join(
            key + ": " + str(value) for key, value in paramgrid[i].items()
        )
    )

    try:
        fold_labels = train_temp[(meta_label, "Folds")]
        x_all = train_temp[xcols]
        y_all = train_temp[ycol]
    except KeyError:
        print(
            "***No folds found! Did you remember to define folds before "
            "running cross validation?***"
        )
        return 0

    # create an iterator for cross validation based on the predefined folds
    splitter = LeaveOneGroupOut()
    cv_iterator = splitter.split(x_all, y_all, fold_labels)
    n_folds = splitter.get_n_splits(groups=fold_labels)

    is_local = method == "Local Regression"
    if is_local:
        # Work on a copy so the caller's parameter grid keeps n_neighbors and
        # verbose and can be reused for another run.
        model_params = dict(paramgrid[i])
        try:
            n_neighbors = model_params.pop("n_neighbors")
            verbose = model_params.pop("verbose")
        except KeyError as err:
            raise KeyError(
                "Local Regression parameters must include 'n_neighbors' and "
                "'verbose'; missing {}".format(err)
            ) from err
        neighbor_suffix = " n_neighbors: " + str(n_neighbors)
    else:
        model_params = paramgrid[i]
        neighbor_suffix = ""

    # create an empty output data frame to serve as template
    output_tmp = pd.DataFrame()
    # add columns for RMSEC, RMSECV, and RMSE for the folds
    output_tmp["RMSEC"] = 0
    output_tmp["RMSECV"] = 0
    for f in range(1, n_folds + 1):
        output_tmp["Fold " + str(f)] = 0
    # fill in the output template based on the current permutation parameters
    for k in paramgrid[i].keys():
        output_tmp.at[0, k] = paramgrid[i][k]
    output_tmp["Method"] = method

    cvcol = None
    # Iterate through each of the folds in the training set
    for foldcount, (train, holdout) in enumerate(cv_iterator, start=1):
        cv_train = train_temp.iloc[train]  # rows used to build the model
        cv_holdout = train_temp.iloc[holdout]  # rows held out of the model

        cvcol = (
            "predict",
            '"' + method + "- CV -" + str(model_params) + neighbor_suffix + '"',
        )
        if is_local:
            model = local_regression.LocalRegression(
                model_params, n_neighbors=n_neighbors, verbose=verbose
            )
            y_pred_holdout, _, _ = model.fit_predict(
                cv_train[xcols], cv_train[ycol], cv_holdout[xcols]
            )
        else:
            # fit the model and predict the held-out data
            model = regression([method], [model_params])
            model.fit(cv_train[xcols], cv_train[ycol])
            if model.goodfit:
                y_pred_holdout = model.predict(cv_holdout[xcols])
            else:
                # keep the shape of the held-out response, filled with NaN
                y_pred_holdout = cv_holdout[ycol] * np.nan

        # add the predictions to the appropriate column in the training data
        train_temp.loc[train_temp.index[holdout], cvcol] = y_pred_holdout
        # record the RMSE of this fold
        output_tmp["Fold " + str(foldcount)] = RMSE(
            y_pred_holdout, cv_holdout[ycol]
        )

    # now that all the folds have been held out and predicted, calculate the
    # overall rmsecv and add it to the output
    predictkeys = []
    if cvcol is not None:
        output_tmp["RMSECV"] = [RMSE(train_temp[cvcol], train_temp[ycol])]
        predictkeys.append(cvcol[-1])

    # fit the calibration model on the full training set
    if is_local:
        model = local_regression.LocalRegression(
            model_params, n_neighbors=n_neighbors, verbose=verbose
        )
        modelkey = "{} - {} - ({}, {}) {} n_neighbors: {}".format(
            method, ycol[-1], yrange[0], yrange[1], model_params, n_neighbors
        )
        ypred_train, _, _ = model.fit_predict(
            train_temp[xcols], train_temp[ycol], train_temp[xcols]
        )
    else:
        model = regression([method], [model_params])
        modelkey = "{} - {} - ({}, {}) {}".format(
            method, ycol[-1], yrange[0], yrange[1], model_params
        )
        ypred_train = train_temp[ycol] * np.nan
        model.fit(train_temp[xcols], train_temp[ycol])
        # if the fit is good, then predict the training set
        if model.goodfit:
            ypred_train = model.predict(train_temp[xcols])
        else:
            model = None
            modelkey = None

    # add the calibration predictions to the appropriate column
    calcol = (
        "predict",
        '"' + method + "- Cal -" + str(model_params) + neighbor_suffix + '"',
    )
    predictkeys.append(calcol[-1])
    train_temp[calcol] = ypred_train
    # record the RMSEC for the current settings
    output_tmp["RMSEC"] = RMSE(ypred_train, train_temp[ycol])

    return output_tmp, model, modelkey, predictkeys, train_temp["predict"]




[docs]
class cv:
    """Run cross validation over a grid of regression parameters.

    :param paramgrid: sequence of parameter dicts, one per permutation.
    :param meta_label: top-level column label under which the ``"Folds"``
        column is looked up by ``cv_core``.
    """

    def __init__(self, paramgrid, meta_label="meta"):
        self.paramgrid = paramgrid
        self.meta_label = meta_label

    def do_cv(
        self,
        Train,
        xcols="wvl",
        ycol=("comp", "SiO2"),
        method="PLS",
        yrange=None,
        n_jobs=-1,
    ):
        """Evaluate every permutation in the grid with ``cv_core``.

        :param Train: training data frame; the returned frame is ``Train``
            with the cross-validation and calibration prediction columns
            appended under the ``"predict"`` label.
        :param xcols: column key selecting the predictor columns.
        :param ycol: column key of the response.
        :param method: regression method name passed to ``cv_core``.
        :param yrange: two-element range used to label models; defaults to
            the min and max of ``Train[ycol]``.
        :param n_jobs: number of parallel jobs handed to joblib.
        :return: ``(Train, output, models, modelkeys, predictkeys)``.
            ``output`` has one row per successful permutation and columns
            labelled ``("cv", <name>)``. If no permutation succeeds (or the
            grid is empty), ``Train`` is returned unchanged, ``output`` is an
            empty frame and the lists are empty.
        """
        if yrange is None:
            yrange = [np.min(Train[ycol]), np.max(Train[ycol])]

        results = Parallel(n_jobs=n_jobs)(
            delayed(cv_core)(
                i, self.paramgrid, Train, xcols, ycol, method, yrange, self.meta_label
            )
            for i in range(len(self.paramgrid))
        )

        models = []
        modelkeys = []
        predictkeys = []
        output_frames = []
        cv_predicts = None
        for result in results:
            # cv_core returns 0 instead of a tuple when it cannot find the
            # fold column; skip those permutations.
            if not isinstance(result, tuple):
                continue
            summary, model, modelkey, keys, predictions = result
            output_frames.append(summary)

            if model is not None:
                models.append(model)
            if modelkey is not None:
                modelkeys.append(modelkey)
            if keys is not None:
                predictkeys.extend(keys)
            if predictions is not None:
                if cv_predicts is None:
                    cv_predicts = predictions
                else:
                    cv_predicts = pd.merge(
                        cv_predicts, predictions, left_index=True, right_index=True
                    )

        if cv_predicts is not None:
            cv_predicts.columns = [("predict", name) for name in cv_predicts.columns]
            Train = pd.concat((Train, cv_predicts), axis=1)

        # Concatenate once; an empty grid or all-failed run yields an empty
        # frame rather than an error.
        output = pd.concat(output_frames) if output_frames else pd.DataFrame()

        # make the columns of the output data frame multi-indexed
        # (MultiIndex.from_tuples cannot infer levels from an empty list)
        if len(output.columns) > 0:
            output.columns = pd.MultiIndex.from_tuples(
                [("cv", name) for name in output.columns.values]
            )

        return Train, output, models, modelkeys, predictkeys