Source code for libpyhat.transform.dim_red

import numpy as np
from sklearn.decomposition import NMF
from sklearn.decomposition import PCA, FastICA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.manifold import TSNE

from libpyhat.transform.dim_reductions.jade import JADE
from libpyhat.transform.dim_reductions.lfda import LFDA
from libpyhat.transform.dim_reductions.mnf import MNF


# This function does dimensionality reduction on a data frame full of
# spectra. A number of different methods can be chosen.


def dim_red(df, xcol, method, params, kws, load_fit=None, ycol=None):
    """Reduce the dimensionality of ``df[xcol]`` and append the scores to ``df``.

    Parameters
    ----------
    df : data frame
        Source data. It is indexed with ``df[xcol]`` to obtain the input
        matrix, and one new column per output component is added to it in
        place, keyed ``(method + " (" + str(xcol) + ")", method + "-" + str(i))``
        with ``i`` counting from 1.
    xcol : column label
        Label selecting the columns of ``df`` to be reduced.
    method : str
        One of "PCA", "FastICA", "t-SNE", "LLE", "JADE-ICA", "LDA", "NNMF",
        "MNF" or "LFDA". Also used to build the names of the new columns.
    params : sequence
        Positional arguments forwarded to the estimator constructor.
    kws : dict
        Keyword arguments forwarded to the estimator constructor. For "NNMF"
        an optional "add_constant" entry is consumed here instead of being
        forwarded: when truthy and the data contain negative values, the
        global minimum is subtracted before fitting. The caller's dict is
        not modified.
    load_fit : object, optional
        An already fitted model. When truthy it is used as-is: no estimator
        is constructed or fitted, and ``load_fit.transform`` is applied to
        ``df[xcol]`` without any preprocessing. The model must therefore
        provide a ``transform`` method.
    ycol : optional
        When given (and ``method`` is not "t-SNE" or "MNF"), the first
        column label of ``df`` that contains ``ycol`` is used as the target
        passed to ``fit``.

    Returns
    -------
    tuple
        ``(df, model)`` where ``model`` is the fitted estimator, or
        ``load_fit`` itself when one was supplied.

    Raises
    ------
    ValueError
        If ``method`` is not a supported name and no ``load_fit`` is given.
    """
    xdata = df[xcol]
    params = tuple(params or ())
    # Work on a copy so that consuming "add_constant" never alters the
    # dictionary owned by the caller.
    kws = dict(kws or {})
    add_const = kws.pop("add_constant", None) if method == "NNMF" else None

    if load_fit:
        # A pre-fitted model only needs to be applied to the data.
        do_dim_red = load_fit
        dim_red_result = do_dim_red.transform(xdata)
    else:
        do_dim_red = _build_estimator(method, params, kws)
        if method in ("t-SNE", "MNF"):
            dim_red_result = do_dim_red.fit_transform(xdata)
        else:
            if ycol is not None:
                # find the multi-index that matches the specified single index
                ycol_tuple = [a for a in df.columns.values if ycol in a][0]
                ydata = df[ycol_tuple]
                if method == "LDA":
                    _cap_lda_components(do_dim_red, xdata, ydata)
                do_dim_red.fit(xdata, ydata)
            else:
                if method == "NNMF":
                    xdata = _prepare_nnmf_input(xdata, add_const)
                do_dim_red.fit(xdata)
            dim_red_result = do_dim_red.transform(xdata)

    # Need to revisit the loop below for methods that don't use n_components
    # to make sure column names makes sense. Also, this produces a
    # not-so-helpful error when user enters n_components that is too small
    top_label = method + " (" + str(xcol) + ")"
    for i in range(1, dim_red_result.shape[1] + 1):
        df[(top_label, method + "-" + str(i))] = dim_red_result[:, i - 1]

    return df, do_dim_red


def _build_estimator(method, params, kws):
    """Instantiate the unfitted estimator that corresponds to ``method``."""
    estimators = {
        "PCA": PCA,
        "FastICA": FastICA,
        "t-SNE": TSNE,
        "LLE": LocallyLinearEmbedding,
        "JADE-ICA": JADE,
        "LDA": LinearDiscriminantAnalysis,
        "NNMF": NMF,
        "MNF": MNF,
        "LFDA": LFDA,
    }
    try:
        estimator_cls = estimators[method]
    except KeyError:
        raise ValueError(
            "Unknown dimensionality reduction method: "
            + repr(method)
            + ". Choose one of: "
            + ", ".join(estimators)
        ) from None
    return estimator_cls(*params, **kws)


def _cap_lda_components(lda, xdata, ydata):
    """Lower ``lda.n_components`` to the largest value LDA can deliver."""
    n_features = len(xdata.columns)
    n_classes_minus_1 = len(np.unique(ydata)) - 1
    max_nc = np.min([n_classes_minus_1, n_features])
    # n_components may be None (estimator default); comparing None with an
    # integer would raise TypeError, and there is nothing to cap in that case.
    if lda.n_components is not None and lda.n_components > max_nc:
        print("n_components cannot be larger than min(n_features, n_classes - 1)")
        print("n_features = " + str(n_features))
        print("n_classes-1 = " + str(n_classes_minus_1))
        print(
            "Setting n_components from "
            + str(lda.n_components)
            + " to "
            + str(max_nc)
        )
        lda.n_components = max_nc


def _prepare_nnmf_input(xdata, add_const):
    """Optionally shift ``xdata`` to be non-negative, then warn if it is not."""
    if add_const:
        lowest = xdata.min().min()
        if lowest < 0:
            xdata = xdata - lowest
        else:
            print("Data is already positive: no need to add a constant!")
    check_positive(xdata)
    return xdata
def check_positive(data):
    """Print a warning when ``data`` contains at least one negative value.

    The overall minimum is obtained with ``data.min().min()``; nothing is
    returned and nothing is raised by this check itself.
    """
    overall_min = data.min().min()
    has_negative = overall_min < 0
    if not has_negative:
        return
    print("NNMF will not work with data containing negative values!")