Source code for libpyhat.IO.io_ccam_pds

# This code is used to read individual ChemCam files
# Header data is stored as attributes of the data frame
# White space is stripped from the column names
import datetime
import gc
import os
import pickle

import numpy as np
import pandas as pd
import scipy.io as io

from libpyhat.spectral_data import SpectralData
from libpyhat.utils.file_search import file_search


def CCAM_CSV(input_data, ave=True):
    """Read one ChemCam CCS ``.csv`` file into a spectrum-per-row data frame.

    Parameters
    ----------
    input_data : str
        Path to the ``.csv`` file. The base name is also sliced to obtain the
        ``sclock`` (characters 4-12), ``seqid`` (characters 25-33, upper-cased)
        and ``Pversion`` (characters 34-35) metadata values.
    ave : bool, optional
        If true (default) only the ``mean`` spectrum is returned. Otherwise
        the ``mean`` and ``median`` columns are discarded and every remaining
        column (one per shot) becomes a row.

    Returns
    -------
    pandas.DataFrame
        One row per spectrum. Columns form a two-level index: ``("meta", key)``
        for header/file-name metadata and ``("wvl", wavelength)`` for the
        spectral channels, with wavelengths rounded to 4 decimals.
    """
    # Peek at the start of the file to measure the commented header. Only the
    # first 20 rows are inspected.
    header = pd.read_csv(input_data, nrows=20, engine="c", header=None)
    # The last commented row holds the column names, so it is not metadata.
    header_rows = header[0].str.contains("#").sum() - 1

    df = pd.read_csv(
        input_data, header=header_rows, engine="c", delimiter=",", index_col=False
    )
    # Strip whitespace and the leading "# " from the column names.
    df.columns = [name.strip().replace("# ", "") for name in df.columns.values]
    df = df.set_index(["wave"])  # use wavelengths as indices

    # Header rows look like "# key = value"; treating "," as the comment
    # character discards everything after the first comma on each row.
    metadata = pd.read_csv(
        input_data,
        sep="=",
        nrows=header_rows,
        comment=",",
        engine="c",
        index_col=0,
        header=None,
    )

    if ave:
        df = df[["mean"]]
    else:
        # Drop whichever summary columns exist; a missing "mean" must not
        # leave "median" behind among the single-shot spectra.
        df = df.drop(columns=["mean", "median"], errors="ignore")

    # Two-level index so all spectral channels can be selected with "wvl".
    df.index = [["wvl"] * len(df.index), df.index.values.round(4)]
    df = df.T  # transpose so that each spectrum is a row

    # Remove extraneous text from the metadata keys.
    metadata.index = [
        key.strip().strip("# ").replace(" FLOAT", "").lower()
        for key in metadata.index.values
    ]
    metadata = metadata.T

    # Extract information encoded in the file name.
    fname = os.path.basename(input_data)
    metadata["sclock"] = fname[4:13]
    metadata["seqid"] = fname[25:34].upper()
    metadata["Pversion"] = fname[34:36]

    if not ave:
        # Repeat the single metadata row once per spectrum.
        # (DataFrame.append, used previously, no longer exists in pandas 2.)
        metadata = metadata.iloc[[0] * len(df.index)]
    metadata.index = df.index  # make the indices match
    metadata.columns = [["meta"] * len(metadata.columns), metadata.columns.values]

    # Combine the metadata with the spectra.
    return pd.concat([metadata, df], axis=1)
def CCAM_SAV(input_data, ave=True):
    """Read one ChemCam IDL ``.SAV`` file into a spectrum-per-row data frame.

    Parameters
    ----------
    input_data : str
        Path to the ``.SAV`` file. The file name is also sliced to obtain the
        ``sclock``, ``seqid`` and ``Pversion`` metadata values.
    ave : bool, optional
        If true (default) only the ``average`` row is returned; otherwise all
        rows (``shot1`` ... ``shotN``, ``average``, ``median``) are returned.

    Returns
    -------
    pandas.DataFrame
        Rows are spectra. Columns form a two-level index: ``("meta", key)``
        for metadata and ``("wvl", wavelength)`` for spectral channels, with
        wavelengths rounded to 4 decimals.
    """
    # Read the IDL .SAV file.
    data = io.readsav(input_data, python_dict=True)

    def stack_bands(prefix, columns=None):
        # Stack the UV, VIS and VNIR arrays stored under prefix + band name,
        # indexed by the matching "def<band>" wavelength arrays.
        return pd.concat(
            [
                pd.DataFrame(
                    data[prefix + band], index=data["def" + band], columns=columns
                )
                for band in ("uv", "vis", "vnir")
            ]
        )

    df_spect = stack_bands("")
    # Add 1 to the column numbers so they correspond to the shot number.
    df_spect.columns = ["shot" + str(i + 1) for i in df_spect.columns]
    df_ave = stack_bands("a", columns=["average"])
    df_med = stack_bands("m", columns=["median"])
    df = pd.concat([df_spect, df_ave, df_med], axis=1)

    # Two-level index so all spectral channels can be selected with "wvl";
    # wavelengths are rounded to a more reasonable level of precision.
    df.index = [["wvl"] * len(df.index), df.index.values.round(4)]
    # Transpose so that spectra are rows rather than columns.
    df = df.T
    df[("meta", "Shot Number")] = df.index

    # Metadata taken from the file name; everything goes under "meta".
    pathname, fname = os.path.split(input_data)

    # Some ChemCam files provide 'darkname', others call it 'darkspec';
    # normalise to 'darkname' (empty string when neither is present).
    if "darkname" not in data:
        data["darkname"] = data.get("darkspec", "")

    meta_names = ["file", "filepath", "sclock", "seqid", "Pversion"]
    meta_values = [fname, pathname, fname[4:13], fname[25:34].upper(), fname[34:36]]
    optional_keys = [
        "continuumvismin",
        "continuumvnirmin",
        "continuumuvmin",
        "continuumvnirend",
        "distt",
        "darkname",
        "nshots",
        "dnoiseiter",
        "dnoisesig",
        "matchedfilter",
    ]
    # Keep only the optional keys that this particular file provides.
    for key in optional_keys:
        if key in data:
            meta_names.append(key)
            meta_values.append(data[key])

    # Repeat the metadata once per row of spectra.
    metadata = np.tile(meta_values, (len(df.index), 1))
    # One ("meta", name) label per metadata field. The label count must follow
    # the number of fields, not the number of spectra rows; otherwise files
    # with few shots silently lose all of their metadata.
    metadata_cols = pd.MultiIndex.from_tuples([("meta", name) for name in meta_names])
    try:
        metadata = pd.DataFrame(metadata, columns=metadata_cols, index=df.index)
    except ValueError as err:
        # Best effort: return the spectra even if the metadata cannot be
        # arranged as one value per field (e.g. an array-valued entry).
        print("Could not attach metadata for " + fname + ": " + str(err))
    else:
        df = pd.concat([metadata, df], axis=1)

    if ave:
        # Keep only the averaged spectrum, as a one-row frame.
        df = df.loc["average"].to_frame().T
    return df
def _ccam_latest_versions(filelist):
    """Keep, for every sclock, only the file(s) carrying the highest version."""
    files = np.asarray(filelist)
    basenames = [os.path.basename(name) for name in files]
    sclocks = np.array([base[4:13] for base in basenames])  # extract the sclock
    # The version is the single character just before the 4-character extension.
    versions = np.array([int(base[-5:-4]) for base in basenames])

    keep = []
    for sclock in np.unique(sclocks):
        match = sclocks == sclock  # all files sharing this sclock
        newest = versions[match] == versions[match].max()
        keep.extend(files[match][newest])
    return np.array(keep, dtype="str")


def ccam_batch(
    directory,
    searchstring="*ccs*.csv",
    to_csv=None,
    ave=True,
    versioncheck=True,
    data_name="ChemCam",
    to_pickle=False,
    outpath="",
    outfile=None,
):
    """Read every matching ChemCam file in a directory into one data set.

    Parameters
    ----------
    directory : str
        Directory handed to ``file_search``.
    searchstring : str, optional
        Pattern handed to ``file_search``. If it contains "sav" (in any case)
        the files are read with ``CCAM_SAV``, otherwise with ``CCAM_CSV``.
    to_csv : bool, optional
        When exactly ``True``, the combined table is also written as ``.csv``.
    ave : bool, optional
        Passed to the reader: averaged spectra if true, single shots if not.
    versioncheck : bool, optional
        When exactly ``True``, only the highest-version file(s) per sclock
        are read.
    data_name : str, optional
        Name given to the returned ``SpectralData`` and used as the prefix of
        the default output file name.
    to_pickle : bool, optional
        When exactly ``True``, the combined table is also pickled.
    outpath : str, optional
        Directory for the optional output files (default: current directory).
    outfile : str, optional
        Output file name without extension. Defaults to
        ``<data_name>_<YYYY_MM_DD>`` with ``_shots`` appended when ``ave`` is
        false.

    Returns
    -------
    SpectralData or None
        The combined data, or ``None`` when no file matches.
    """
    # Determine whether the files are .csv or .SAV.
    is_sav = "sav" in searchstring.lower()
    filelist = file_search(directory, searchstring)
    if len(filelist) == 0:
        print("No files found in " + directory + " using search string " + searchstring)
        return None

    if versioncheck is True:
        # Ensure only one processing version per sclock is read.
        filelist = _ccam_latest_versions(filelist)

    reader = CCAM_SAV if is_sav else CCAM_CSV
    n_files = len(filelist)
    frames = []
    reference_wvl = None
    for filecount, file in enumerate(filelist, start=1):
        print("File #" + str(filecount) + " of " + str(n_files))
        print(file)
        tmp = reader(file, ave=ave)

        # Compare wavelength sets so that rounding differences do not create
        # mismatched columns; the first file defines the reference.
        wvl = set(tmp["wvl"].columns)
        if reference_wvl is None:
            reference_wvl = wvl
        elif wvl != reference_wvl:
            print("Wavelengths don't match!")
            continue
        frames.append(tmp)

    # Concatenate once at the end. The earlier approach dumped the running
    # table to temporary CSV files every 50 files in single-shot mode but
    # never read them back, which lost those spectra.
    combined = pd.concat(frames)

    try:
        combined.loc[:, ("meta", "sclock")] = pd.to_numeric(
            combined.loc[:, ("meta", "sclock")]
        )
    except (KeyError, ValueError, TypeError):
        # Leave sclock untouched when it is missing or not numeric.
        pass

    if outfile is None:
        # Label the output file with today's date.
        todays_date = str(datetime.datetime.today()).split()[0].replace("-", "_")
        outfile = data_name + "_" + todays_date
        if not ave:
            outfile = outfile + "_shots"

    if to_pickle is True:
        # os.path.join keeps the default outpath="" relative to the current
        # directory instead of pointing at the filesystem root.
        pickle_path = os.path.join(outpath, outfile + ".pkl")
        print("Saving to " + pickle_path)
        with open(pickle_path, "wb") as pickle_file:
            pickle.dump(combined, pickle_file)
    if to_csv is True:
        combined.to_csv(os.path.join(outpath, outfile + ".csv"))

    return SpectralData(
        combined, name=data_name, spect_label="wvl", meta_label="meta", comp_label=None
    )