# Source code for libpyhat.IO.io_ccam_pds

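# This module reads individual ChemCam files (.csv and IDL .SAV) and can
# combine a directory of them into a single data set.
# Header data and fields parsed from the file name are stored in the data
# frame under the "meta" column label; spectra are stored under "wvl".
# White space is stripped from the column names.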
import datetime
import gc
import os
import pickle

import numpy as np
import pandas as pd
import scipy.io as io

from libpyhat.spectral_data import SpectralData
from libpyhat.utils.file_search import file_search



# [docs]
def CCAM_CSV(input_data, ave=True):
    """Read one ChemCam CSV file into a spectra-plus-metadata data frame.

    The file is expected to begin with ``#``-prefixed header lines of the
    form ``# KEY = value``. The last ``#`` line holds the column names and
    must include ``wave``.

    Parameters
    ----------
    input_data : str
        Path to the CSV file. The file is read three times (header preview,
        spectra, metadata) and its base name is sliced to obtain ``sclock``
        (characters 4-12), ``seqid`` (25-33, upper-cased) and ``Pversion``
        (34-35).
    ave : bool, optional
        If truthy (default) only the ``mean`` column is kept. Otherwise every
        column except ``mean`` and ``median`` is kept.

    Returns
    -------
    pandas.DataFrame
        One row per spectrum. Columns are a two-level index: ``("meta", key)``
        for header/file-name metadata and ``("wvl", wavelength)`` for the
        spectrum, with wavelengths rounded to 4 decimals.

    Raises
    ------
    KeyError
        If the file has no ``wave`` column, or ``ave`` is truthy and there is
        no ``mean`` column.
    """
    # Peek at the top of the file to find out how long the header is.
    preview = pd.read_csv(input_data, nrows=20, engine="c", header=None)

    # Every header line contains "#". The last one is the column-name row, so
    # the number of "KEY = value" lines is one less than the count.
    header_rows = int(preview[0].str.contains("#").sum()) - 1

    df = pd.read_csv(
        input_data, header=header_rows, engine="c", delimiter=",", index_col=False
    )
    # Strip whitespace (and the leading "# ") from the column names.
    df.columns = [name.strip().replace("# ", "") for name in df.columns]
    df = df.set_index("wave")  # use wavelengths as indices

    # Re-read only the header lines, splitting on "=". Treating "," as the
    # comment character discards everything after the first comma on a line.
    metadata = pd.read_csv(
        input_data,
        sep="=",
        nrows=header_rows,
        comment=",",
        engine="c",
        index_col=0,
        header=None,
    )

    if ave:
        df = df[["mean"]]
    else:
        # Drop each summary column independently: a missing "mean" must not
        # prevent "median" from being removed.
        df = df.drop(columns=["mean", "median"], errors="ignore")

    # Two-level index so that all spectra can be extracted with the single
    # key "wvl"; wavelengths are rounded to avoid float-noise mismatches.
    df.index = pd.MultiIndex.from_arrays(
        [["wvl"] * len(df.index), df.index.values.round(4)]
    )
    df = df.T  # transpose so that each spectrum is a row

    # Remove extraneous stuff from the metadata keys.
    metadata.index = [
        key.strip().strip("# ").replace(" FLOAT", "").lower()
        for key in metadata.index.values
    ]
    metadata = metadata.T

    # Extract info from the file name.
    fname = os.path.basename(input_data)
    metadata["sclock"] = fname[4:13]
    metadata["seqid"] = fname[25:34].upper()
    metadata["Pversion"] = fname[34:36]

    # Duplicate the metadata for each spectrum row. pd.concat replaces
    # DataFrame.append, which was removed in pandas 2.0.
    if not ave:
        metadata = pd.concat([metadata] * len(df.index), ignore_index=True)
    metadata.index = df.index  # make the indices match
    metadata.columns = pd.MultiIndex.from_arrays(
        [["meta"] * len(metadata.columns), metadata.columns.values]
    )

    # Combine the spectra with the metadata.
    return pd.concat([metadata, df], axis=1)




# [docs]
def CCAM_SAV(input_data, ave=True):
    """Read one ChemCam IDL ``.SAV`` file into a spectra-plus-metadata frame.

    The UV, VIS and VNIR spectra (keys ``uv``/``vis``/``vnir``), their
    averages (``auv``/``avis``/``avnir``) and medians (``muv``/``mvis``/
    ``mvnir``) are stacked along the wavelength axis, using ``defuv``/
    ``defvis``/``defvnir`` as the wavelength values.

    Parameters
    ----------
    input_data : str
        Path to the ``.SAV`` file. The base name is sliced to obtain
        ``sclock`` (characters 4-12), ``seqid`` (25-33, upper-cased) and
        ``Pversion`` (34-35).
    ave : bool, optional
        If exactly ``True`` (default) only the ``average`` row is returned.
        Otherwise all rows (``shot1`` ... ``shotN``, ``average``, ``median``)
        are returned.

    Returns
    -------
    pandas.DataFrame
        One row per spectrum. Columns are a two-level index: ``("meta", key)``
        for metadata and ``("wvl", wavelength)`` for the spectrum, with
        wavelengths rounded to 4 decimals.
    """
    # Read the IDL .SAV file.
    data = io.readsav(input_data, python_dict=True)

    detectors = ("uv", "vis", "vnir")

    def stack(prefix, label=None):
        # Build one frame per detector and stack them along the wavelength
        # axis. `label` names the single column of the average/median frames.
        columns = None if label is None else [label]
        return pd.concat(
            [
                pd.DataFrame(data[prefix + det], index=data["def" + det], columns=columns)
                for det in detectors
            ]
        )

    df_spect = stack("")
    # Add 1 to the column numbers so they correspond to shot number.
    df_spect.columns = ["shot" + str(i + 1) for i in df_spect.columns]
    df_ave = stack("a", "average")
    df_med = stack("m", "median")

    df = pd.concat([df_spect, df_ave, df_med], axis=1)
    # Create a multiindex to access wavelength values, and round the
    # wavelengths to a more reasonable level of precision.
    df.index = pd.MultiIndex.from_arrays(
        [["wvl"] * len(df.index), df.index.values.round(4)]
    )
    # Transpose so that spectra are rows rather than columns.
    df = df.T
    df[("meta", "Shot Number")] = df.index

    # Extract metadata from the file name; all metadata uses the "meta" label.
    pathname, fname = os.path.split(input_data)

    # Some ChemCam files have the 'darkname' key, others call it 'darkspec'.
    # Normalise to 'darkname', falling back to an empty string.
    if "darkname" not in data:
        data["darkname"] = data.get("darkspec", "")

    meta_names = ["file", "filepath", "sclock", "seqid", "Pversion"]
    meta_values = [fname, pathname, fname[4:13], fname[25:34].upper(), fname[34:36]]
    optional_keys = [
        "continuumvismin",
        "continuumvnirmin",
        "continuumuvmin",
        "continuumvnirend",
        "distt",
        "darkname",
        "nshots",
        "dnoiseiter",
        "dnoisesig",
        "matchedfilter",
    ]
    # Only keep the optional fields this particular file provides.
    for key in optional_keys:
        if key in data:
            meta_names.append(key)
            meta_values.append(data[key])

    # Repeat the metadata once per spectrum row.
    meta_table = np.tile(meta_values, (len(df.index), 1))
    # One ("meta", name) label per metadata FIELD. Sizing the "meta" list by
    # the number of rows truncated the labels for files with few shots, and
    # the metadata was then silently discarded.
    meta_cols = pd.MultiIndex.from_arrays([["meta"] * len(meta_names), meta_names])
    try:
        meta_frame = pd.DataFrame(meta_table, columns=meta_cols, index=df.index)
        df = pd.concat([meta_frame, df], axis=1)
    except ValueError as err:
        # Best effort: return the spectra without the file-level metadata
        # rather than failing, but say so instead of hiding it.
        print("Could not attach metadata for " + fname + ": " + str(err))

    if ave is True:
        # Keep only the averaged spectrum, as a one-row frame.
        df = df.loc["average"].to_frame().T

    return df




# [docs]
def ccam_batch(
    directory,
    searchstring="*ccs*.csv",
    to_csv=None,
    ave=True,
    versioncheck=True,
    data_name="ChemCam",
    to_pickle=False,
    outpath="",
    outfile=None,
):
    """Read every matching ChemCam file in a directory into one data set.

    Parameters
    ----------
    directory : str
        Directory passed to ``file_search``.
    searchstring : str, optional
        Pattern passed to ``file_search``. If it contains ``"sav"``
        (case-insensitive) files are read with ``CCAM_SAV``, otherwise with
        ``CCAM_CSV``.
    to_csv : bool, optional
        If exactly ``True``, write the combined frame to
        ``<outpath>/<outfile>.csv``.
    ave : bool, optional
        Passed to the reader: ``True`` keeps the averaged spectrum per file,
        ``False`` keeps the individual shots.
    versioncheck : bool, optional
        If exactly ``True``, keep only the file(s) with the highest version
        digit (the character just before the 4-character extension) for each
        sclock (characters 4-12 of the base name).
    data_name : str, optional
        Name given to the returned ``SpectralData`` and used as the prefix of
        the default output file name.
    to_pickle : bool, optional
        If exactly ``True``, pickle the combined frame to
        ``<outpath>/<outfile>.pkl``.
    outpath : str, optional
        Output directory. An empty string means the current directory.
    outfile : str, optional
        Output base name without extension. Defaults to
        ``<data_name>_<YYYY_MM_DD>`` with ``_shots`` appended when ``ave`` is
        not ``True``.

    Returns
    -------
    SpectralData or None
        The combined data, or ``None`` when no files match.

    Notes
    -----
    Files whose wavelength set differs from that of the first file are
    skipped with a printed message. Frames are collected and concatenated
    once at the end. The earlier approach of dumping the partial result to
    ``temporary_data_files_*.csv`` every 50 files never read those files
    back, so the data in them was lost.
    """
    # Determine if the files are .csv or .SAV.
    is_sav = "sav" in searchstring.lower()
    reader = CCAM_SAV if is_sav else CCAM_CSV

    # Boolean-mask indexing below needs an array, whatever file_search returns.
    filelist = np.asarray(file_search(directory, searchstring))
    if len(filelist) == 0:
        print("No files found in " + directory + " using search string " + searchstring)
        return None

    if versioncheck is True:
        # Keep one file per sclock: the one with the highest version number.
        basenames = [os.path.basename(name) for name in filelist]
        sclocks = np.array([base[4:13] for base in basenames])
        versions = np.array([int(base[-5:-4]) for base in basenames])

        newest_files = []
        for sclock in np.unique(sclocks):
            match = sclocks == sclock  # all files sharing this sclock
            is_newest = versions[match] == versions[match].max()
            newest_files.extend(filelist[match][is_newest])
        filelist = np.array(newest_files, dtype="str")

    frames = []
    reference_wvl = None
    n_files = len(filelist)
    for filecount, file in enumerate(filelist, start=1):
        print("File #" + str(filecount) + " of " + str(n_files))
        print(file)
        tmp = reader(file, ave=ave)

        # Comparing wavelength sets ensures that rounding errors are not
        # causing mismatches in columns. The first file sets the reference.
        wvl = set(tmp["wvl"].columns)
        if reference_wvl is None:
            reference_wvl = wvl
        if wvl == reference_wvl:
            frames.append(tmp)
        else:
            print("Wavelengths don't match!")

    # A single concat at the end avoids re-copying the growing frame per file.
    combined = pd.concat(frames)

    # Best effort: store the sclock as a number when every value converts.
    try:
        combined.loc[:, ("meta", "sclock")] = pd.to_numeric(
            combined.loc[:, ("meta", "sclock")]
        )
    except (KeyError, ValueError, TypeError):
        pass

    if outfile is None:
        # Label the output file with today's date.
        todays_date = datetime.date.today().strftime("%Y_%m_%d")
        outfile = data_name + "_" + todays_date
        if ave is not True:
            outfile += "_shots"

    # os.path.join keeps an empty outpath relative to the current directory;
    # string concatenation with "//" pointed at the filesystem root instead.
    if to_pickle is True:
        pickle_path = os.path.join(outpath, outfile + ".pkl")
        print("Saving to " + pickle_path)
        with open(pickle_path, "wb") as pickle_file:
            pickle.dump(combined, pickle_file)

    if to_csv is True:
        combined.to_csv(os.path.join(outpath, outfile + ".csv"))

    return SpectralData(
        combined, name=data_name, spect_label="wvl", meta_label="meta", comp_label=None
    )