## script prepared by Patrick Kirby for Amir Behnamian for IGARSS 2019
## prepared:  June 2019
##
## Script contains a series of functions for analyzing Random Forest models and
## land cover classification outputs.
##
#
############################################
##############  General function definitions
def _reqSettMet(key,val):
    """
    # function for checking whether a required setting is met (returns True) or not (returns False)
    # - called by the print_sett function
    # - key: a dictionary key representing the setting name
    # - val: a value representing the setting
    """
    import os
    if key in ('inRefCSV','inRastPath'):
        if not isinstance(val,str): return False
        if not os.path.exists(val) : return False
    elif key == 'FN_classnum':
        if not ( isinstance(val,str) or not isinstance(val,unicode) ): return False
        if len(val)<1: return False
    elif key == 'FN_xy':
        if not ( isinstance(val,tuple) or isinstance(val,list) ): return False
        if len(val)!=2: return False
        for v in val:
            if not ( isinstance(val,str) or not isinstance(val,unicode) ): return False
            if len(v)<1: return False
    return True
#
def _getPixelColRow(x,y,geot):
    """
    # function for getting pixel col/row index, given x-coord (x), y-coord, (y) and geotransform (geot)
    # - returns tuple of integers: [0] is column index (xpix), [1] is row index (ypix)
    # - column indices (xpix) are from 0 to ncols-1, from left to right
    # - row indices (ypix) are from 0 to nrows-1, from top to bottom
    # - based on Affine GeoTransform equations at:  https://www.gdal.org/gdal_datamodel.html
    """
    ypix=(y*geot[1]-geot[3]*geot[1]-x*geot[4]-geot[4]*geot[0])/(geot[5]*geot[1]+geot[4]*geot[2])
    xpix=(x-geot[0]-ypix*geot[2])/geot[1]
    return(int(xpix),int(ypix))
#
def _extractFromRastAtPoint(rast,x,y):
    """
    # function for reading the value of every channel of a GDAL Dataset (rast) at one location (x-coord x, y-coord y)
    # - the location is converted to a pixel column/row with _getPixelColRow, using the geotransform of rast
    # - a 1 x 1 pixel window is read at that column/row and flattened, giving one value per channel
    #   (in channel order)
    # - x and y are expected to be in the same spatial reference system as the GDAL Dataset
    """
    col,row = _getPixelColRow(x,y,rast.GetGeoTransform())
    window = rast.ReadAsArray(xoff=col,yoff=row,xsize=1,ysize=1)
    return window.flatten()
#
def _getTrainSampleColumnNames(refdat,pattern='^ch[0-9]+\_',startAfter='y',endBefore=None):
    """
    # function for getting an array of field/column names in refdat that store the training sample (X) data that were extracted from the raster Dataset
    # - returns numpy array of column names representing columns/fields that contain the training sample (X) data
    # - default settings match those columns starting after the 'y' column, and whose name starts with 'ch#_', where # is any number of digits
    # - subsets columns to include only those that come after 'startAfter' (string), come before 'endBefore' (string), and match the reg-ex 'pattern' (string)
    # - any of the 'startAfter','endBefore', and/or 'pattern' options can be omitted by setting them to None
    """
    import numpy as np, re
    cols = np.array(refdat.columns)
    cols = cols[np.where(cols==startAfter)[0][0]+1:] if startAfter is not None else cols
    cols = cols[:np.where(cols==endBefore)[0][0]] if endBefore is not None else cols
    cols = cols[[re.match(re.compile(pattern),c) is not None for c in cols]] if pattern is not None else cols
    return cols
#
def _imputeData(refdat,data_sett):
    """
    # function for omitting or imputing NoData/NaN values in sample data, X, (training along with independent validation)
    # - if  data_sett['impute_strategy'] is None, data points (rows) with NoData/NaN in any sample data (X) column are omitted
    # - if  data_sett['impute_strategy'] is 'mean', 'median' or 'most_frequent', imputation is performed with
    #   sklearn.impute.SimpleImputer, either separately for each 'classNum' value (data_sett['impute_by_class'] is True)
    #   or across all rows at once
    # - if  data_sett['impute_strategy'] is an int or float, NoData/NaN values are replaced by that constant
    # - returns a copy of refdat, with omitted data (rows removed) or imputed data for the columns that store the sample data (X);
    #   the data frame passed in by the caller is never modified
    # - raises Exception for any other value of data_sett['impute_strategy']
    # - NOTE(review): SimpleImputer may drop a column that is entirely NaN within the fitted rows, in which case the
    #   assignment of the imputed values back to the data frame would fail -- confirm against the scikit-learn version in use
    """
    import numpy as np, pandas as pd, sklearn.impute
    # accept both str and unicode on Python 2 (e.g. values loaded from json); 'unicode' does not exist on Python 3
    try:
        string_types = (str, unicode)
    except NameError:
        string_types = (str,)
    strategy = data_sett['impute_strategy']
    # get array of field/column names in refdat that store the training sample data (X)
    xcols = _getTrainSampleColumnNames(refdat)
    if strategy is None:
        # omission: keep only the rows where every X column has a value
        return refdat[refdat[xcols].notna().all(axis=1)].copy()
    if isinstance(strategy,string_types):
        if strategy not in ('mean','median','most_frequent'):
            raise Exception("Expects data_sett['impute_strategy'] to be one of:  None, int, float, 'mean', 'median', 'most_frequent'")
        # impute by class or in general based on user specification for 'impute_strategy' and 'impute_by_class' (if user chose to impute)
        imputer = sklearn.impute.SimpleImputer(missing_values=np.nan,strategy=strategy)
        refdat = refdat.copy()  # work on a copy so the caller's data frame keeps its NaN values
        dat = refdat[xcols].copy()
        if data_sett['impute_by_class']:
            for cls in np.unique(refdat['classNum']):
                inds = dat[refdat['classNum']==cls].index
                dat.loc[inds] = imputer.fit_transform(X=dat.loc[inds])
        else:
            dat[xcols] = imputer.fit_transform(X=dat)
        refdat[xcols] = dat
        return refdat
    if isinstance(strategy,int) or isinstance(strategy,float):
        refdat = refdat.copy()  # work on a copy so the caller's data frame keeps its NaN values
        refdat[xcols]=refdat[xcols].where(pd.notna(refdat[xcols]),other=strategy)
        return refdat
    raise Exception("Expects data_sett['impute_strategy'] to be one of:  None, int, float, 'mean', 'median', 'most_frequent'")
#
############################################
##############  Functions for saving, printing, and initializing data settings and random forest model settings
#
def save_sett(sett):
    """
    # Can be called to save settings to json file, after updating them.
    # - sett: either the full settings dictionary holding a 'data' and/or 'rf' sub-dictionary (as returned by prep_sett),
    #         or a single settings dictionary on its own (e.g. sett['data'] or sett['rf'])
    # - each settings dictionary is written to the path stored under its own 'save_to_json' key;
    #   nothing is written for a dictionary whose 'save_to_json' is None
    # - a settings dictionary without a 'save_to_json' key raises KeyError
    # - returns None
    """
    import json
    def _dump(d):
        # write one settings dictionary to its own 'save_to_json' path (skipped when that path is None)
        if d['save_to_json'] is not None:
            with open(d['save_to_json'],"w") as outf:
                # indent=True is kept from the original call; True is an int equal to 1, i.e. a one-space indent
                json.dump(d,outf,indent=True)
    # plain membership tests work for both list-style (Python 2) and view-style (Python 3) dictionary keys
    groups = [k for k in ('data','rf') if k in sett]
    if len(groups) == 0:
        # no sub-dictionaries present: sett is itself a single settings dictionary
        _dump(sett)
    else:
        for k in groups:
            _dump(sett[k])
    return None
#
def print_sett(sett):
    """
    # Prints current state of the settings to the console.
    # - sett: either the full settings dictionary holding a 'data' and/or 'rf' sub-dictionary (as returned by prep_sett),
    #         or a single settings dictionary on its own (e.g. sett['data'] or sett['rf'])
    # - one line is printed per setting, with setting names right-aligned and string values shown in single quotes
    # - for data settings, a line starts with '**' when the setting is one of 'inRefCSV', 'FN_classnum', 'FN_xy',
    #   'inRastPath' and _reqSettMet reports that it is not validly specified
    # - when printing the 'data' sub-dictionary of a full settings dictionary, 'working_dir' shows the current working
    #   directory of the process rather than the stored value
    # - returns None
    """
    import os
    # accept both str and unicode on Python 2; 'unicode' does not exist on Python 3
    try:
        string_types = (str, unicode)
    except NameError:
        string_types = (str,)
    # os.getcwdu only exists on Python 2; os.getcwd already returns text on Python 3
    getcwd = getattr(os,'getcwdu',os.getcwd)
    required = ('inRefCSV','FN_classnum','FN_xy','inRastPath')
    def _print_group(group,check_required,live_working_dir):
        # print every key/value pair of one settings dictionary, one per line
        if len(group) == 0:
            return
        maxchars = max([len(x) for x in group.keys()])
        for key in group.keys():
            pad = ' '*(maxchars-len(key))
            if live_working_dir and key == 'working_dir':
                print('    '+pad+key+'  :  \''+str(getcwd())+'\'')
                continue
            val = group[key]
            quotes = '\'' if isinstance(val,string_types) else ''
            unmet = check_required and key in required and not _reqSettMet(key,val)
            print(('**  ' if unmet else '    ')+pad+key+'  :  '+quotes+str(val)+quotes)
    if 'data' not in sett and 'rf' not in sett:
        # sett is a single settings dictionary: its values are read directly from sett
        print('\n')
        if 'inRefCSV' in sett:
            print('\n** represents a required setting that still needs valid specification\n')
        _print_group(sett,True,False)
    else:
        print('\n** represents a required setting that still needs valid specification')
        if 'data' in sett:
            print('\n-------------\nData settings\n-------------\n')
            _print_group(sett['data'],True,True)
        if 'rf' in sett:
            print('\n----------------------------\nRandom Forest model settings\n----------------------------\n')
            _print_group(sett['rf'],False,False)
    return None
#
def prep_sett(DATA_load_from_json = None,       DATA_save_to_json  = None,          DATA_randSeedValue  = None,
              DATA_inRefCSV       = '',         DATA_FN_pointID    = '',            DATA_FN_classnum = '',
              DATA_FN_classlab    = '',         DATA_FN_xy         = ('',''),       DATA_inRastPath  = '',
              DATA_indValidSplit  = 0.3,        DATA_splitByClass  = True,          DATA_minPointsForModel = 10,
              DATA_saveDataCSV    = None,       DATA_impute_strategy  = 'mean',     DATA_impute_by_class = True,
              DATA_saveImputedDataCSV = None,   DATA_saveRFclassifier = None,       DATA_saveImportancesCSV = None,
              DATA_saveErrorMatrixCSV = None,   DATA_saveAccuracyCSV  = None,
              DATA_consol_outCSV_dir = None,    DATA_consol_outCSV_basename = 'consol_',
              RF_load_from_json   = None,       RF_save_to_json   = None,           RF_n_estimators      = 300,
              RF_criterion        = 'gini',     RF_max_depth      = None,           RF_min_samples_split = 2,
              RF_min_samples_leaf = 1,          RF_min_weight_fraction_leaf = 0.0,  RF_max_features      = 'auto',
              RF_max_leaf_nodes   = None,       RF_min_impurity_decrease    = 0.0,  RF_bootstrap         = True,
              RF_oob_score        = False,      RF_n_jobs     = None,               RF_random_state      = None,
              RF_verbose          = 0,          RF_warm_start = False,              RF_class_weight      = None,
              ):
    """
    # Returns an ordered dictionary of two ordered dictionaries.  The two sub-dictionaries are:
    #    - ['data'] : data settings.
    #    - ['rf']   : random forest model settings.
    # If this function is called without modifying any arguments, then all settings in the returned dictionary will be defaults.
    # Settings may also be saved to or loaded from json file for both 'data' and 'rf' settings, by providing the filename (in working dir)
    #    or path for the  ..._load_from_json and ..._save_to_json arguments.
    # - when a ..._load_from_json path is given, that sub-dictionary is read entirely from the file (the other DATA_... / RF_...
    #   arguments for that group are not used); its 'load_from_json' entry is set to the path and its 'working_dir' entry to
    #   the current working directory
    # - raises Exception if a ..._load_from_json path is given but does not exist
    # - before returning, save_sett is called so that each sub-dictionary with a 'save_to_json' path is written to file
    # - NOTE(review): the default RF_max_features='auto' is kept for backward compatibility; recent scikit-learn releases
    #   may no longer accept it -- confirm against the installed version
    """
    import os, json
    from collections import OrderedDict
    # os.getcwdu only exists on Python 2; os.getcwd already returns text on Python 3
    getcwd = getattr(os,'getcwdu',os.getcwd)
    def _load_json_sett(path,argname):
        # read one settings dictionary from a json file, preserving the key order found in the file
        if not os.path.exists(path):
            raise Exception('Could not find the specified path for '+argname)
        with open(path,'r') as settfile:
            loaded = json.loads(settfile.read(),object_pairs_hook=OrderedDict)
        loaded['load_from_json'] = path
        loaded['working_dir'] = getcwd()
        return loaded
    sett = OrderedDict([('data',OrderedDict([])),('rf',OrderedDict([]))])
    ## prep/load data settings (sett['data'])
    if DATA_load_from_json is not None:
        sett['data'] = _load_json_sett(DATA_load_from_json,'DATA_load_from_json')
    else:
        sett['data'] = OrderedDict([
            ('working_dir',getcwd()),
            ('load_from_json',None),
            ('save_to_json',DATA_save_to_json),
            ('randSeedValue',DATA_randSeedValue),
            ('inRefCSV',DATA_inRefCSV),
            ('FN_pointID',DATA_FN_pointID),
            ('FN_classnum',DATA_FN_classnum),
            ('FN_classlab',DATA_FN_classlab),
            ('FN_xy',DATA_FN_xy),
            ('inRastPath',DATA_inRastPath),
            ('indValidSplit',DATA_indValidSplit),
            ('splitByClass',DATA_splitByClass),
            ('minPointsForModel',DATA_minPointsForModel),
            ('saveDataCSV',DATA_saveDataCSV),
            ('impute_strategy',DATA_impute_strategy),
            ('impute_by_class',DATA_impute_by_class),
            ('saveImputedDataCSV',DATA_saveImputedDataCSV),
            ('saveRFclassifier',DATA_saveRFclassifier),
            ('saveImportancesCSV',DATA_saveImportancesCSV),
            ('saveErrorMatrixCSV',DATA_saveErrorMatrixCSV),
            ('saveAccuracyCSV',DATA_saveAccuracyCSV),
            ('consol_outCSV_dir',DATA_consol_outCSV_dir),
            ('consol_outCSV_basename',DATA_consol_outCSV_basename),
        ])
    ## prep/load Random Forest settings (sett['rf'])
    if RF_load_from_json is not None:
        sett['rf'] = _load_json_sett(RF_load_from_json,'RF_load_from_json')
    else:
        sett['rf'] = OrderedDict([
            ('working_dir',getcwd()),
            ('load_from_json',None),
            ('save_to_json',RF_save_to_json),
            ('n_estimators',RF_n_estimators),
            ('criterion',RF_criterion),
            ('max_depth',RF_max_depth),
            ('min_samples_split',RF_min_samples_split),
            ('min_samples_leaf',RF_min_samples_leaf),
            ('min_weight_fraction_leaf',RF_min_weight_fraction_leaf),
            ('max_features',RF_max_features),
            ('max_leaf_nodes',RF_max_leaf_nodes),
            ('min_impurity_decrease',RF_min_impurity_decrease),
            ('bootstrap',RF_bootstrap),
            ('oob_score',RF_oob_score),
            ('n_jobs',RF_n_jobs),
            ('random_state',RF_random_state),
            ('verbose',RF_verbose),
            ('warm_start',RF_warm_start),
            ('class_weight',RF_class_weight),
        ])
    ## save settings to json file if user chose to do so
    save_sett(sett)
    return sett
#
##########################################################################################
############## Point biserial correlation as a measure of association
def pointBiserialCorr(dat,n,saveToCSV=False,outCSV_dir='',outCSV_basename='pointBiserialCorr'):
    """
    # Takes a copy of reference data dataframe (refdat), calculates the absolute point biserial 
    # correlation  for all predictor variables across all class pairs, and extracts the top n variables.
    # -  input data (dat) can be impdat instead of refdat, only if the omission option was used, 
    #    as in the 'impute_strategy' data setting is None
    # - n should be an integer > 0 and <= the total number of predictor variables / image channels
    #   (a larger n is capped at the number of predictor variables)
    # - channel naming format in refdat/impdat expected to be 'ch#_...' where # is any number of digits
    #   representing the channel number
    # - if saveToCSV is True, the four result tables are written to outCSV_dir using outCSV_basename as the
    #   file name prefix; an Exception is raised (before anything is written) if any output path already exists
    # Returns a dictionary with the following keys and objects:
    #   ['top_vars']: pandas data frame with columns representing selected variables by name
    #                 ('channel_names' column) and by image channel number ('channel_numbers' column);
    #                 the channel number is the number parsed from the 'ch#_' prefix of the name, so it
    #                 stays correct when dat holds only a subset of the image channels
    #   ['rank_chnames']: pandas data frame with columns representing channels/variable names ranked
    #                     by point biserial correlation for a given class pair
    #   ['rank_chnums'] : pandas data frame with columns representing channels/variable numbers ranked
    #                     by point biserial correlation for a given class pair     
    #   ['corr'] : pandas data frame with columns representing the absolute point biserial correlation
    #              across all image channels/variables               
    # NOTE(review): sd below is the pandas default sample standard deviation (ddof=1) while the weighting term is
    #               n1*n2/(n1+n2)^2; the textbook formula pairs that weighting with the population standard
    #               deviation -- confirm which is intended.
    """
    import os, re, numpy as np, pandas as pd
    from itertools import combinations
    ## get array of field/column names in impdat that store the training sample data (X)
    xcols = _getTrainSampleColumnNames(dat)
    ## extract channel numbers from channel names (assumes 'ch#_' naming format, where # is any number of digits)
    ch_nums = np.array([int(x.split('_',1)[0][2:]) for x in xcols])
    ## get unique class numbers in ascending order, and a string for formatting these numbers
    classes = np.unique(dat['classNum'])
    classNumFmt = '%0'+str(len(str(classes.max())))+'d'
    ## get all combinations of class pairs
    classPairs  = list(combinations(classes,2))
    ## get an array representing labels for class pairs (used for column names in the returned dataframes)
    out_colnames = np.array(['c'+classNumFmt%c1+'_c'+classNumFmt%c2 for c1,c2 in classPairs])
    ## generate data frame representing absolute point biserial correlations for all class pairs
    corr = pd.DataFrame(index=xcols,columns=out_colnames)
    for cp,(c1,c2) in enumerate(classPairs):
        c1_dat = dat[dat['classNum']==c1][xcols]
        c2_dat = dat[dat['classNum']==c2][xcols]
        ## per-variable counts of non-missing values in each class
        c1_n = c1_dat.notna().sum()
        c2_n = c2_dat.notna().sum()
        mean1 = c1_dat.mean(axis=0,skipna=True)
        mean2 = c2_dat.mean(axis=0,skipna=True)
        sd = pd.concat([c1_dat,c2_dat]).std(axis=0,skipna=True)
        corr.iloc[:,cp] = abs(((mean1-mean2)/sd) * ( (c1_n*c2_n)/(c1_n+c2_n)**2 )**0.5)
    ## rank channels/variables based on point biserial correlation
    rankNumFmt = '%0'+str(len(str(len(xcols))))+'d'
    rankLabels = np.array(['rank_'+rankNumFmt%i for i in range(1,len(xcols)+1)])
    rank_chname = pd.DataFrame(index=rankLabels,columns=out_colnames)
    rank_chnum = pd.DataFrame(index=rankLabels,columns=out_colnames)
    for cp in range(0,len(classPairs)):
        ## negate so that argsort orders from largest to smallest correlation
        inds = (corr.iloc[:,cp].values*-1).argsort()
        rank_chnum.iloc[:,cp] = ch_nums[inds]
        rank_chname.iloc[:,cp] = xcols[inds]
    ## get array of top n variables, based on user specification for n
    top_n = n if n <= len(xcols) else len(xcols)
    row = 0 ## indicator for which row of the ranked channel name dataframe to pull top considered variables from
    var = np.array([]) ## will store selected variables
    while len(var) < top_n:
        ## get additional variables for consideration from row
        consid_var = np.unique(rank_chname.iloc[row,])
        consid_var = consid_var[np.logical_not(np.isin(consid_var,var))]
        ## if there are no new variables for consideration in the current row, increment row and continue
        if len(consid_var)==0:
            row+=1
            continue
        ## add considered variables to var in an order based on their absolute point biserial correlation sums 
        ## across all class pairs (largest sums added first). Total number of variables in var is capped at top_n.
        sortedCorrSums = corr.loc[consid_var].sum(axis=1).sort_values(ascending=False)
        var = np.r_[var,sortedCorrSums.index[0:(top_n-len(var))]]
        row+=1
    ## prep arrays of selected variables by channel name and channel number, and combine into a single dataframe
    var_chnames = np.array([str(v) for v in var])
    ## look the channel number up from the name-derived ch_nums (the position within xcols is only equal to the
    ## channel number when every image channel is present in dat)
    var_chnums = np.array([ch_nums[np.where(xcols==v)[0][0]] for v in var])
    top_vars = pd.DataFrame(index=range(1,len(var_chnames)+1),data=var_chnames,columns=['channel_names'])
    top_vars = top_vars.join(pd.DataFrame(index=range(1,len(var_chnums)+1),data=var_chnums,columns=['channel_numbers']))
    ## save results to file, if user chose to do so
    if saveToCSV:
        if len(outCSV_dir)>0:
            ## append a path separator unless outCSV_dir already ends with '/' or '\'
            out_dir = outCSV_dir+'/' if re.search(re.compile('(/|\\\\)$'),outCSV_dir) is None else outCSV_dir
        else:
            out_dir = outCSV_dir
        out_paths = {'top_vars':    out_dir+outCSV_basename+'_top_vars.csv',
                     'rank_chname': out_dir+outCSV_basename+'_rank_chname.csv',
                     'rank_chnum':  out_dir+outCSV_basename+'_rank_chnum.csv',
                     'corr':        out_dir+outCSV_basename+'_absCorrCoeff.csv'}
        ## refuse to overwrite: check every output path before writing any of them
        for k in out_paths.keys():
            if os.path.exists(out_paths[k]):
                raise Exception('Output CSV path already exists. Consider changing outCSV_dir and/or outCSV_basename.\n'+\
                                '       '+out_paths[k]+'\n')
        top_vars.to_csv(path_or_buf=out_paths['top_vars'],index_label='select_order')
        rank_chname.to_csv(path_or_buf=out_paths['rank_chname'],index_label='rank')
        rank_chnum.to_csv(path_or_buf=out_paths['rank_chnum'],index_label='rank')
        corr.to_csv(path_or_buf=out_paths['corr'],index_label='variable')
    ## return dictionary of selected variable arrays, ranked variable dataframes, and the point biserial correlation dataframe
    return {'top_vars':top_vars,'rank_chnames':rank_chname,'rank_chnums':rank_chnum,'corr':corr}
#
##########################################################################################
############## Subset data
def subsetVariables(dat,channels,drop=False):
    """
    # Takes a dataframe (refdat or impdat) and returns a subset (columns removed) based on specified channel/variable
    # names or channel numbers (channels).  If drop is True, the specified channel names or numbers will
    # be removed.  If drop is False, only the specified channel names or numbers will be retained (all other
    # image channel columns are removed).  Columns that do not represent image channels are always kept.
    # 'channels' must be a list, tuple, pandas series, or numpy array.  If specifying channel numbers instead of
    # names, it must contain integer values representing channel numbers, instead of strings.  Columns in dat
    # representing image channels are expected to have names starting with 'ch#_...' where # is any number of
    # digits representing the channel number (leading zeros in the name are ignored when matching numbers).
    # Raises Exception if channels is of an unsupported type, or if any specified channel name or number is
    # not found among the image channel columns of dat.
    """
    import numpy as np, pandas as pd
    if not isinstance(channels,(list,tuple,pd.Series,np.ndarray)):
        raise Exception('channels must be a list, tuple, pandas series, or numpy array.')
    xcols = _getTrainSampleColumnNames(dat)
    channels = np.array(channels)
    # decide between numbers and names from the array's dtype (an empty selection is treated as names)
    if channels.size > 0 and np.issubdtype(channels.dtype,np.integer):
        # channel number of every image channel column, parsed from its 'ch#_' prefix
        col_nums = np.array([int(c.split('_',1)[0][2:]) for c in xcols],dtype=int)
        # every requested number must exist; otherwise a mistyped number would be silently ignored
        if not np.all(np.isin(channels,col_nums)):
            raise Exception('Not all specified channel names or numbers were found within dat')
        channels = xcols[np.isin(col_nums,channels)]
    elif not np.all(np.isin(channels,xcols)):
        raise Exception('Not all specified channel names or numbers were found within dat')
    toDrop = channels if drop else xcols[np.logical_not(np.isin(xcols,channels))]
    return dat.drop(columns=toDrop)
#
##########################################################################################
############## Reset reference data frame (refdat) and/or imputed reference data frame (impdat), when performing iterations
def resetRefDataframe(dat,imputedAndSplit=False):
    """
    # Returns dat (reference dataframe or imputed reference dataframe) reduced to the columns of an earlier state.
    # Intended for use when performing iterations.  The earlier state is selected with imputedAndSplit:
    #     imputedAndSplit=False : the state just before imputation and before the independent validation set is split off
    #         - i.e. the state of the reference data returned by the a_initialDataPrep function
    #         - keeps 'classLab','classNum','x','y' followed by the training sample (X) columns
    #         - not meant for dat that represents the imputed and split reference data (e.g. impdat)
    #     imputedAndSplit=True  : the state just after imputation and after the independent validation set is split off
    #         - i.e. the state of refdat / impdat returned by the b_imputeDataAndSplitIndValidSet function
    #         - keeps 'indValidSet','classLab','classNum','x','y' followed by the training sample (X) columns
    #  *NOTE* Only columns are subset.  Values in the retained columns are left unchanged.
    """
    import numpy as np, pandas as pd
    leading = ['classLab','classNum','x','y']
    if imputedAndSplit:
        leading = ['indValidSet'] + leading
    keep = np.r_[leading,_getTrainSampleColumnNames(dat)]
    return dat[keep]
#
############################################################################################################
##############Initial preparation of reference dataset and extraction of raster data at reference locations
def a_initialDataPrep(sett):
    """
    # extract data at refdat locations, across all channels of the input raster Dataset
    # - sett: settings dictionary; only sett['data'] (data_sett) is used here
    # - the reference CSV (data_sett['inRefCSV']) is read and its columns are standardized to
    #   'classLab','classNum','x','y' (in that order); if there is no class label column, 'classLab' is
    #   filled with a copy of 'classNum'
    # - one column per raster channel is appended, named 'ch#_<channel description>' (# zero-padded, description
    #   truncated to 50 characters); values equal to a channel's NoDataValue are replaced by NaN
    # - if data_sett['randSeedValue'] is not None it is used to seed both Python's random module and numpy's
    #   global random state (the latter is the one used by pandas sampling when no random_state is passed);
    #   for numpy it must therefore be an integer that numpy.random.seed accepts
    # - raises Exception if the raster cannot be opened or a channel has an unexpected data type
    # - returns a list with 3 components:
    #   [0] refdat   : a data frame representing reference data with extracted raster data at ref point locations
    #   [1] inrast   : a gdal Dataset representing the opened raster dataset at data_sett['inRastPath']
    #   [2] chaninfo : a dictionary containing information about the channels within inrast, for use in other functions
    """
    import numpy as np, pandas as pd, random
    try:
        from osgeo import gdal
    except ImportError:
        import gdal
    # get data settings
    data_sett = sett['data']
    # set the random seed based on data_sett['randSeedValue']
    random.seed(data_sett['randSeedValue'])
    if data_sett['randSeedValue'] is not None:
        # pandas .sample() draws from numpy's global random state, so it has to be seeded as well for reproducible splits
        np.random.seed(data_sett['randSeedValue'])
    # read inRefCSV as pandas data frame and standardize columns in terms of name and order
    FN_pointID = None if data_sett['FN_pointID'] == "" else data_sett['FN_pointID']
    refdat = pd.read_csv(data_sett['inRefCSV'],sep=",",header=0,index_col=FN_pointID)
    refdat = refdat.rename(columns={data_sett['FN_classlab']:'classLab',data_sett['FN_classnum']:'classNum',
                                    data_sett['FN_xy'][0]:'x',data_sett['FN_xy'][1]:'y'})
    fns = np.array(['classLab','classNum','x','y'])
    if 'classLab' not in refdat.columns:
        refdat = refdat.join(pd.DataFrame(data=np.array(refdat[['classNum']]),index=refdat.index,columns=['classLab']))
    refdat = refdat[fns]
    # dictionary of GDAL data type codes (as keys) and corresponding numpy data types
    typeCodeDict = {1:np.uint8,2:np.uint16,3:np.int16,4:np.uint32,5:np.int32,6:np.float32,7:np.float64,
                    8:np.complex64,9:np.complex64,10:np.complex64,11:np.complex128}
    # open inRastPath as a gdal Dataset
    inrast = gdal.Open(data_sett['inRastPath'])
    if inrast is None:
        # gdal.Open returns None (rather than raising) when GDAL exceptions are not enabled
        raise Exception('Could not open the raster dataset at data_sett[\'inRastPath\']:  '+str(data_sett['inRastPath']))
    # cycle through the channels in the input raster dataset, storing information about each in a series of nested dictionaries
    # e.g. chaninfo[2]['name'] returns the name associated with channel 2 ; chaninfo[10]['nodatavalue'] returns the NoDataValue associated with channel 10
    chaninfo = {}
    for ch in range(1,inrast.RasterCount+1):
        band = inrast.GetRasterBand(ch)
        chaninfo[ch] = {'number':ch,'name':band.GetDescription(),'datatype_gdal':band.DataType,'datatype_np':typeCodeDict[band.DataType],
                        'nodatavalue':band.GetNoDataValue(),'xysize':(band.XSize,band.YSize),'metadata':band.GetMetadata_Dict()}
    #
    # extract the data at refdat locations
    rastdat = np.apply_along_axis(lambda x: _extractFromRastAtPoint(inrast,x[0],x[1]),1,refdat[['x','y']])
    # zero-padded channel number format, wide enough for the largest channel number
    chNumFmt = '%0'+str(len(str(inrast.RasterCount)))+'d'
    # cycle through channels: prep field/column names, prep data types, and join the extracted data to refdat as new fields/columns
    for ch in range(1,inrast.RasterCount+1):
        # field/column name 
        rastdat_fn = 'ch'+chNumFmt%ch+'_'+chaninfo[ch]['name'][0:50]
        # data type
        np_dtype = chaninfo[ch]['datatype_np']
        if np.issubdtype(np_dtype,np.signedinteger) or np.issubdtype(np_dtype,np.unsignedinteger):
            rastdat_dtype = int
        elif np.issubdtype(np_dtype,np.floating):
            rastdat_dtype =  float
        elif np.issubdtype(np_dtype,np.bool_):
            rastdat_dtype = bool
        elif np.issubdtype(np_dtype,np.complexfloating):
            rastdat_dtype = complex
        else:
            raise Exception("Unexpected raster data type encountered.")
        # join data from current channel to refdat
        refdat = refdat.join(pd.DataFrame(data=rastdat[:,(ch-1)],index=refdat.index,columns=[rastdat_fn],dtype=rastdat_dtype))
        # if there is a NoDataValue for the current channel, set those values to NaN in refdata
        if chaninfo[ch]['nodatavalue'] is not None:
            refdat[[rastdat_fn]] = refdat[[rastdat_fn]].replace(to_replace=chaninfo[ch]['nodatavalue'],value=np.nan)
    #
    return [refdat,inrast,chaninfo]
#
###############################################################
############## Impute data and split independent validation set
def b_imputeDataAndSplitIndValidSet(refdat,sett,verbose=True):
    """
    # Perform data imputation (or omission) and split dataset into an independent validation set and model fitting set
    # - refdat: reference data frame in the state returned by the a_initialDataPrep function (no 'predict' column);
    #   it is not modified, the returned data frames are new objects
    # - sett: settings dictionary; only sett['data'] (data_sett) is used here
    # - verbose: if True, a summary of the omission/split is printed to the console
    # - the split is drawn with pandas .sample() using data_sett['indValidSplit'] as the fraction, either within each
    #   class (data_sett['splitByClass'] is True) or across all data points
    # - if data_sett['minPointsForModel'] > 0 and data_sett['indValidSplit'] > 0, an Exception is raised when the number
    #   of data points left for model fitting (per class, or overall) is below data_sett['minPointsForModel']
    #   NOTE(review): the check is skipped when indValidSplit is 0, as in the original code -- confirm this is intended
    # - raises Exception if refdat already has a 'predict' column
    # - returns a list with two components:
    #   [0] a copy of refdat with 'indValidSet' as the first column, indicating whether a data point has been set 
    #       aside for independent validation (1) or set aside for model fitting (0)
    #       - if the data_sett['impute_strategy'] was None, meaning omission of NoData/NaN data points, then 'indValidSet' may contain
    #         values of -1 indicating that the row will be omitted from both model fitting and independent validation due to NoData/NaN values
    #   [1] a copy with the same format as above (including 'indValidSet') but with imputed values in the sample data (X) columns
    #       where the NoData/NaN values were in the original copy of the reference data (or with those rows omitted)
    """
    import numpy as np, pandas as pd
    data_sett = sett['data']
    # add field/column for representing whether (1) or not (0) a reference data point has been set aside for independent validation
    if 'indValidSet' in refdat:
        # copy first so that the caller's data frame is not modified
        refdat = refdat.copy()
        refdat['indValidSet']=0
    else:
        refdat = pd.DataFrame(data=0,index=refdat.index,columns=['indValidSet'],dtype=int).join(refdat)
    # ensure that refdat doesn't already have predictions attached to it
    if 'predict' in refdat.columns:
        raise Exception('\n\nrefdat should not have predictions attached to it at this point.\nIt should be in the state as '+
                        'returned by the a_initialDataPrep function.\n\nIf iteratively preparing random forests, '+
                        'consider using the resetRefDataframe function.')
    # impute data based on data_sett['impute_strategy'] or omit points with NoData/NaN if data_sett['impute_strategy'] is None
    # - from this point on, refdat represents the omitted/imputed data, and refdat_orig represents the non-omitted/non-imputed data
    refdat_orig = refdat.copy()
    refdat_orig['indValidSet'] = -1
    refdat = _imputeData(refdat,data_sett)
    # perform independent validation split
    if data_sett['splitByClass']:
        # perform class-level independent validation split
        for cls in np.unique(refdat[['classNum']].values):
            cls_dat = refdat[refdat['classNum']==cls]
            dat_sample = cls_dat.sample(frac=data_sett['indValidSplit'],replace=False)
            refdat.loc[dat_sample.index,'indValidSet'] = 1
            # perform the class-level check of minimum points for the RF model, if user chose to do so
            if data_sett['minPointsForModel'] > 0 and data_sett['indValidSplit'] > 0:
                # points available to the model are those of the class that were NOT sampled for validation
                n_model = cls_dat.shape[0]-dat_sample.shape[0]
                if n_model < data_sett['minPointsForModel']:
                    raise Exception('Class number '+str(cls)+' only has '+str(n_model)+' data points for model.'+\
                                    ' Minimum number is '+str(data_sett['minPointsForModel'])+'.')
    else:
        # perform general independent validation split
        dat_sample = refdat.sample(frac=data_sett['indValidSplit'],replace=False)
        refdat.loc[dat_sample.index,'indValidSet'] = 1
        # perform general check of minimum points for the RF model, if user chose to do so
        if data_sett['minPointsForModel'] > 0 and data_sett['indValidSplit'] > 0:
            # points available to the model are those that were NOT sampled for validation
            n_model = refdat.shape[0]-dat_sample.shape[0]
            if n_model < data_sett['minPointsForModel']:
                raise Exception('Only '+str(n_model)+' data points for model.'+\
                                ' Minimum number is '+str(data_sett['minPointsForModel'])+'.')
    # update refdat_orig with independent validation split indicators (omitted rows keep the value -1)
    refdat_orig.loc[refdat.index,'indValidSet'] = refdat['indValidSet']
    # notify user of split (if verbose)
    if verbose:
        if data_sett['impute_strategy'] is None:
            print("\n"+str(len(refdat_orig)-len(refdat))+" data points omitted due to NoData values and lack of imputation.\n")
        if data_sett['splitByClass']:
            print('\n'+str(int(round(data_sett['indValidSplit']*100,0)))+'% of reference data points ('+str(refdat.shape[0])+\
                  ' total) of each class set aside for independent validation:\n')
            for cls in np.unique(refdat[['classNum']].values):
                print('    Class '+str(cls)+' : '+str(((refdat['classNum']==cls) & (refdat['indValidSet']==1)).sum())+' of '+\
                      str((refdat['classNum']==cls).sum())+' reference points set aside')
            print('\n')
        else:
            print('\n'+str(int(round(data_sett['indValidSplit']*100,0)))+'% of reference data points ('+\
                  str((refdat['indValidSet']==1).sum())+' of '+str(refdat.shape[0])+') set aside for independent validation.\n')
    # at this point refdat_orig is the input reference dataset, and refdat is the imputed reference dataset
    return [refdat_orig,refdat]
#
##########################################
############## Prepare Random Forest model
def c_prepRandomForest(refdat,impdat,sett):
    """
    # Define Random Forest Classifier based on rf_sett settings, fit the model on imputed/subset data (NoData/NaN values handled),
    # make predictions (with probabilities) at reference data locations, and save reference datasets (incl. predictions/probabilities).
    # - refdat: reference data frame (must contain a 'classNum' column and must not yet contain a 'predict' column)
    # - impdat: imputed/subset reference data frame (same restrictions). If it has no 'indValidSet' column, one is
    #           added with all values 0, i.e. every data point is used to fit the model.
    # - sett:   settings dictionary; sett['rf'] supplies the RandomForestClassifier arguments and sett['data'] supplies the
    #           optional output paths ('saveDataCSV','saveImputedDataCSV','saveImportancesCSV','saveRFclassifier') and 'FN_pointID'
    # - returns a list of three components:
    #   [0] a copy of the reference data with predictions/probabilities added
    #   [1] a copy of the imputed/subset reference data with predictions/probabilities added
    #   [2] the trained/fit Random Forest Classifier
    # - rows of refdat that are not in impdat keep a prediction of -1 and NaN probabilities
    # - probability columns of classes that were not present in the training subset are left as NaN
    # - raises Exception if refdat or impdat already has a 'predict' column
    # - see: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
    """
    import numpy as np, pandas as pd, pickle
    from sklearn.ensemble import RandomForestClassifier
    data_sett = sett['data']
    rf_sett = sett['rf']
    # ensure that neither data frame already has predictions attached to it
    for dfName,df in (('refdat',refdat),('impdat',impdat)):
        if 'predict' in df.columns:
            raise Exception('\n\n'+dfName+' should not have predictions attached to it at this point.\nIt should be in the state as '+
                            'returned by the a_initialDataPrep function.\n\nIf iteratively preparing random forests, '+
                            'consider using the resetRefDataframe function.')
    # if the supplied impdat dataframe does not have an 'indValidSet' column, then use all of the data points provided
    if not 'indValidSet' in impdat:
        impdat = pd.DataFrame(data=0,index=impdat.index,columns=['indValidSet'],dtype=int).join(impdat)
    # define the Random Forest classifier based on user specifications in rf_sett
    rfArgNames = ('n_estimators','criterion','max_depth','min_samples_split','min_samples_leaf','min_weight_fraction_leaf',
                  'max_features','max_leaf_nodes','min_impurity_decrease','bootstrap','oob_score','n_jobs','random_state',
                  'verbose','warm_start','class_weight')
    rf = RandomForestClassifier(**dict((argName,rf_sett[argName]) for argName in rfArgNames))
    # get array of field/column names in impdat that store the training sample data (X)
    xcols = _getTrainSampleColumnNames(impdat)
    # subset impdat to only include those data points that are to be included in the model
    mdat = impdat[impdat['indValidSet']==0]
    # fit/train the Random Forest
    rf = rf.fit(X=mdat[xcols],y=mdat['classNum'])
    #### predictions at reference data locations
    classes = np.unique(refdat['classNum'])
    # prep field/column names for storing predictions and class membership probabilities at reference data locations
    classNumFmt = '%0'+str(len(str(classes.max())))+'d' # zero-padded class values, so that column names sort by class
    classNumPos = int(np.where(refdat.columns=='classNum')[0][0])
    startFNs = list(refdat.columns[:classNumPos+1]) # 'classNum' field name and all preceding field names
    endFNs   = list(refdat.columns[classNumPos+1:]) # all field names after 'classNum'
    probFNs  = ['prob_c'+classNumFmt%c for c in classes]
    predFNs  = ['predict']+probFNs # fields to be added
    # predict_proba returns one column per class seen while fitting (rf.classes_), which can be fewer
    # than the classes present in refdat; only those columns are populated
    modelProbFNs = ['prob_c'+classNumFmt%c for c in rf.classes_]
    # add and populate fields/columns to refdat, storing the predictions and class membership probabilities
    refdat = refdat.join(pd.DataFrame(data=-1,index=refdat.index,columns=['predict'],dtype=int))
    refdat = refdat.join(pd.DataFrame(data=np.nan,index=refdat.index,columns=probFNs,dtype=float))
    refdat.loc[impdat.index,'predict'] = rf.predict(impdat[xcols])
    refdat.loc[impdat.index,modelProbFNs] = rf.predict_proba(impdat[xcols])
    # add the prediction/probability fields to impdat as well
    impdat = impdat.join(refdat.loc[impdat.index,predFNs])
    # re-order the columns in both refdat and impdat so that the prediction/probability columns directly follow 'classNum'
    orderedFNs = startFNs+predFNs+endFNs
    refdat = refdat[orderedFNs]
    impdat = impdat[orderedFNs]
    # save refdat, impdat, and/or feature importances to CSV file, if user specified to do so
    if data_sett['saveDataCSV'] is not None:
        refdat.to_csv(data_sett['saveDataCSV'],index=True,index_label=data_sett['FN_pointID'])
    if data_sett['saveImputedDataCSV'] is not None:
        impdat.to_csv(data_sett['saveImputedDataCSV'],index=True,index_label=data_sett['FN_pointID'])
    if data_sett['saveImportancesCSV'] is not None:
        # the index must be list-like: a bare string is rejected by the DataFrame constructor
        pd.DataFrame(data=[rf.feature_importances_],index=['importance'],columns=xcols).to_csv(data_sett['saveImportancesCSV'],index=True)
    # pickle/save Random Forest model to file if user specified to do so
    if data_sett['saveRFclassifier'] is not None:
        # pickles must be written in binary mode; the with-block guarantees the file is flushed and closed
        with open(data_sett['saveRFclassifier'],'wb') as rfFile:
            pickle.dump(rf,rfFile)
    #
    return [refdat,impdat,rf]
#
##########################################################################################
############## Perform validation on reference points set aside for independent validation
def d_independentValidation(refdat,sett):
    """
    # Takes reference data and generates accuracy assessment statistics (error matrix, user's & producer's accuracy, overall accuracy, kappa)
    # - refdat: reference data frame with 'classNum' (actual class), 'predict' (predicted class) and 'indValidSet' columns;
    #           only rows with indValidSet == 1 are assessed
    # - sett:   settings dictionary; sett['data']['saveErrorMatrixCSV'] and sett['data']['saveAccuracyCSV'] are optional
    #           output paths (None to skip saving)
    # - returns None if no rows were set aside for independent validation, otherwise a dictionary with the
    #   following items, listed below, by key:
    #   ['errorMat']   : the error/confusion matrix as a pandas data frame (rows: predicted class, columns: actual class)
    #   ['classAcc']   : class-specific user's and producer's accuracies as a pandas data frame
    #   ['overallAcc'] : a float representing the overall accuracy
    #   ['kappa']      : a float representing Cohen's kappa coefficient
    # - the set of classes is taken from the actual classes ('classNum') of the validation rows
    # - a user's accuracy is NaN for a class that was never predicted (division by zero)
    """
    import numpy as np, pandas as pd
    data_sett = sett['data']
    # prepare subset dat, including only those reference data points that were set aside for independent validation
    dat = refdat.loc[refdat['indValidSet']==1]
    if len(dat)==0: return None
    # get array of unique classes (class values)
    classes = np.unique(dat['classNum'])
    classNumFmt = '%0'+str(len(str(classes.max())))+'d' # format string for class values as strings (zero padding)
    classLabels = [classNumFmt%c for c in classes]
    # prepare error matrix: counts[r,c] = number of points predicted as classes[r] whose actual class is classes[c]
    actual = dat['classNum'].values
    predicted = dat['predict'].values
    counts = np.array([[((actual==ca)&(predicted==cp)).sum() for ca in classes] for cp in classes],dtype=np.int64)
    errmat = pd.DataFrame(data=counts,index=['Predicted_c'+lab for lab in classLabels],
                          columns=['Actual_c'+lab for lab in classLabels])
    # accuracy statistics
    nPoints = len(dat)
    correct = np.diag(counts).astype(float)
    predTotals = counts.sum(axis=1)   # row sums (points per predicted class)
    actualTotals = counts.sum(axis=0) # column sums (points per actual class)
    chanceTerm = float((predTotals*actualTotals).sum())
    # zero denominators yield NaN rather than raising or warning
    with np.errstate(divide='ignore',invalid='ignore'):
        users = correct/predTotals
        producers = correct/actualTotals
        kappa = float((nPoints*correct.sum()-chanceTerm)/np.float64(float(nPoints)**2-chanceTerm))
    classAcc = pd.DataFrame(data=[users,producers],index=['users','producers'],columns=['c'+lab for lab in classLabels])
    overallAcc = float(correct.sum())/nPoints
    # save error matrix and/or accuracy stats to CSV, if user chose to do so
    if data_sett['saveErrorMatrixCSV'] is not None:
        errmat.to_csv(data_sett['saveErrorMatrixCSV'])
    if data_sett['saveAccuracyCSV'] is not None:
        # single-row table: overall, kappa, then user's accuracies, then producer's accuracies
        accCols = ['overall','kappa']+[classAcc.index[0]+'_'+c for c in classAcc.columns]+\
                  [classAcc.index[1]+'_'+c for c in classAcc.columns]
        accRow = [overallAcc,kappa]+classAcc.values.flatten().tolist()
        pd.DataFrame(data=[accRow],columns=accCols).to_csv(data_sett['saveAccuracyCSV'],index=False)
    #
    return {'errorMat':errmat,'classAcc':classAcc,'overallAcc':overallAcc,'kappa':kappa}
#
##########################################################################################
############## Consolidate RF and/or independent validation results across iterations
def e_consolidateArossIterations(dat,sett,valid=None,rf=None):
    """
    # Takes:
    #     dat :   a copy of the reference data (e.g. refdat) or imputed reference data (e.g. impdat),
    #     sett :  a copy of the settings dictionary
    #     valid : a list or dictionary (with iteration numbers as keys) of independent validation results as returned by d_independentValidation, and/or
    #     rf :    a list or dictionary (with iteration numbers as keys) of fitted Random Forest models as returned by c_prepRandomForest
    #   and consolidates the results in valid and/or rf across iterations (input dictionary keys).
    #   When a list is supplied, its items are numbered as iterations 1, 2, ..., n in list order.
    #
    # Returns a dictionary with the following objects (listed by key):
    #    'errorMat' :  a pandas data frame of error matrices across iterations (requires input for valid). This object is None if valid is None.
    #    'accStats' :  a pandas data frame of independent validation accuracy statistics across iterations (requires input for valid). This object is None if valid is None.
    #    'featureImportances' : a pandas data frame of feature importances across iterations (requires input for rf).  This object is None if rf is None.
    #    'oobScore' :  a pandas data frame of out-of-bag Random Forest accuracies across iterations (requires input for rf, with all RF models having been set so that
    #                  oob_score is True). This object is None if rf is None or the Random Forest models were fit without calculating out-of-bag accuracy score (oob_score
    #                  is False).
    #
    # If sett['data']['consol_outCSV_dir'] is not None, each consolidated data frame is also saved to a CSV file in that
    # directory (file names start with sett['data']['consol_outCSV_basename']).  Existing files are never overwritten:
    # an Exception is raised instead, before any file of the same group (valid or rf) is written.
    #
    # Raises Exception if valid/rf is of an unexpected type, is empty, or holds unexpected items.
    """
    import pandas as pd, os
    from sklearn.ensemble import RandomForestClassifier
    #
    def _asIterDict(obj):
        # lists become dictionaries keyed by iteration number (1..n); anything else is returned unchanged
        if isinstance(obj,list):
            return dict((pos+1,item) for pos,item in enumerate(obj))
        return obj
    def _outPath(suffix,descr):
        # full output CSV path for the given file-name suffix, or None if the user did not ask for CSV output
        outdir = sett['data']['consol_outCSV_dir']
        if outdir is None: return None
        outpath = os.path.join(outdir,sett['data']['consol_outCSV_basename']+suffix)
        if os.path.exists(outpath):
            raise Exception('\n\nPath for output '+descr+' CSV already exists.'+
                            '\nConsider changing \'consol_outCSV_dir\' and/or \'consol_outCSV_basename\' data settings.')
        return outpath
    #
    outdict = dict.fromkeys(['errorMat','accStats','featureImportances','oobScore'])
    xcols = _getTrainSampleColumnNames(dat)
    if valid is not None:
        # checks
        vdict = _asIterDict(valid)
        if not isinstance(vdict,dict):
            raise Exception('valid is expected to be either a list or dictionary of independent validation dictionaries), or None.')
        if len(vdict)==0:
            raise Exception('valid does not contain any independent validation results.')
        expectedKeys = ['errorMat','overallAcc','kappa','classAcc']
        for item in vdict.values():
            if not isinstance(item,dict) or any(key not in item for key in expectedKeys):
                raise Exception('Items in valid are expected to be independent validation dictionaries, as returned by d_independentValidation().')
        # consolidation (iterations in sorted key order)
        vdict_keys = sorted(vdict)
        errFrames = []
        for i in vdict_keys:
            # prepend 'iter' and 'Predicted' columns to a copy of this iteration's error matrix
            em = vdict[i]['errorMat'].copy()
            em.insert(0,'Predicted',em.index.values)
            em.insert(0,'iter',i)
            errFrames.append(em)
        outdict['errorMat'] = pd.concat(errFrames)
        classCols = vdict[vdict_keys[0]]['classAcc'].columns
        overallAcc = pd.DataFrame(data=[vdict[i]['overallAcc'] for i in vdict_keys],index=vdict_keys,columns=['overallAcc'])
        kappa = pd.DataFrame(data=[vdict[i]['kappa'] for i in vdict_keys],index=vdict_keys,columns=['kappa'])
        users = pd.DataFrame(data=[vdict[i]['classAcc'].loc['users'].values for i in vdict_keys],index=vdict_keys,
                             columns=['users_'+c for c in classCols])
        producers = pd.DataFrame(data=[vdict[i]['classAcc'].loc['producers'].values for i in vdict_keys],index=vdict_keys,
                                 columns=['producers_'+c for c in classCols])
        outdict['accStats'] = overallAcc.join([kappa,users,producers])
        # save to file if user chose to do so (both paths are checked before anything is written)
        errPath = _outPath('indErrorMatrices.csv','independent validation error matrices')
        accPath = _outPath('accuracyStats.csv','independent validation accuracy statistics')
        if errPath is not None:
            outdict['errorMat'].to_csv(errPath,index=False)
        if accPath is not None:
            outdict['accStats'].to_csv(accPath,index=True,index_label='iter')
    if rf is not None:
        # checks
        rfdict = _asIterDict(rf)
        if not isinstance(rfdict,dict):
            raise Exception('rf is expected to be either a list or dictionary of fitted Random Forest models, or None.')
        if len(rfdict)==0:
            raise Exception('rf does not contain any Random Forest models.')
        for item in rfdict.values():
            if not isinstance(item,RandomForestClassifier):
                raise Exception('Items in rf are expected to be objects of class sklearn.ensemble.forest.RandomForestClassifier (fitted to training data),'+\
                                ' as returned by c_prepRandomForest().')
        # consolidation (iterations in sorted key order)
        rfdict_keys = sorted(rfdict)
        outdict['featureImportances'] = pd.DataFrame(data=[rfdict[i].feature_importances_ for i in rfdict_keys],index=rfdict_keys,columns=xcols)
        # out-of-bag scores only exist on models that were fit with oob_score=True
        if all(rfdict[i].oob_score for i in rfdict_keys):
            outdict['oobScore'] = pd.DataFrame(data=[rfdict[i].oob_score_ for i in rfdict_keys],index=rfdict_keys,columns=["oobScore"])
        # save to file if user chose to do so (both paths are checked before anything is written)
        impPath = _outPath('importances.csv','feature importances')
        oobPath = _outPath('OOBscore.csv','OOB scores') if outdict['oobScore'] is not None else None
        if impPath is not None:
            outdict['featureImportances'].to_csv(impPath,index=True,index_label='iter')
        if oobPath is not None:
            outdict['oobScore'].to_csv(oobPath,index=True,index_label='iter')
    #
    return outdict
#    
##############################################################

def documentation(outTextFile=None):
    import os
    from collections import OrderedDict
    doc = OrderedDict([])
    ######################################
    ## functions
    doc['prep_sett'] =   '\nprep_sett:    function\n\n'+\
                         '  - returns an ordered dictionary of ordered dictionaries, representing\n'+\
                         '    data settings and random forest model settings that are to be used in \n'+\
                         '    other functions\n\n'+\
                         '  - if called without modifying any arguments, the returned dictionary\n'+\
                         '    just contains default settings\n\n'+\
                         '  - the returned dictionary has 2 keys: \'data\' and \'rf\'\n\n'+\
                         '     - \'data\' is a sub-dictionary (ordered) containing settings such as\n'+\
                         '       paths to input and output datasets, data imputation options, \n'+\
                         '       options for setting-aside of reference data-points for independent\n'+\
                         '       validation, etc.\n\n'+\
                         '     - \'rf\' is a sub-dictionary (ordered) containing settings that are\n'+\
                         '       directly related to fitting of, and prediction using, the random\n'+\
                         '       forest model. Most of these settings are directly from the package\n'+\
                         '       responsible for model (i.e. scikit-learn).\n\n'+\
                         '  - pass the dictionary returned by this function to the print_sett()\n'+\
                         '    function to get a print out of the current state of all settings. e.g.:\n\n'+\
                         '         sett = prep_sett() # default settings as sett\n'+\
                         '         print_sett(sett)   # print out current state of settings\n\n'+\
                         '  - after modifying settings, pass the settings dictionary to the\n'+\
                         '    save_sett(sett) function to save the current state of these settings\n\n'+\
                         '      - data settings will only be saved if a valid specification has been\n'+\
                         '        made for sett[\'data\'][\'save_to_json\']\n'+\
                         '      - random forest settings will only be saved if a valid specification\n'+\
                         '        has been made for sett[\'rf\'][\'save_to_json\']\n'+\
                         '      - e.g.:\n\n'+\
                         '         sett[\'data\'][\'inRefCSV\'] = \'myRefData.csv\' # change data setting from default\n'+\
                         '         sett[\'rf\'][\'n_estimators\'] = 600  # change random forest setting from default\n\n'+\
                         '         sett[\'data\'][\'save_to_json\'] = \'myDataSettings.json\'\n'+\
                         '         sett[\'rf\'][\'save_to_json\'] = \'myRandomForestSettings.json\'\n\n'+\
                         '         save_sett(sett) # save the current state of the settings to file\n\n'+\
                         '  - previously saved json settings files can be loaded using the \n'+\
                         '    \'DATA_load_from_json\' and \'RF_load_from_json\' arguments of the \n'+\
                         '    prep_sett() function. e.g.:\n\n'+\
                         '         sett = prep_sett(DATA_load_from_json=\'myDataSettings.json\',\n'+\
                         '                          RF_load_from_json=\'myRandomForestSettings.json\')\n'+\
                         '         print_sett(sett)   # print out current state of settings\n\n'
    doc['save_sett'] =   '\nsave_sett:    function\n\n'+\
                         '  - can be used to save an updated copy of the data and random forest\n'+\
                         '    settings (sett[\'data\'] & sett[\'rf\']) to json file\n'+\
                         '  - arguments:\n\n'+\
                         '       sett :  a copy of the settings dictionary, with valid \n'+\
                         '               specifications for sett[\'data\'][\'save_to_json\']\n'+\
                         '               and/or sett[\'rf\'][\'save_to_json\']\n\n'+\
                         '  - returns: None\n\n'+\
                         '  - e.g.:\n\n'+\
                         '         sett = prep_sett() # default settings as sett\n\n'+\
                         '         sett[\'data\'][\'inRefCSV\'] = \'myRefData.csv\' \n'+\
                         '         sett[\'rf\'][\'n_estimators\'] = 600  \n\n'+\
                         '         sett[\'data\'][\'save_to_json\'] = \'myDataSettings.json\'\n'+\
                         '         sett[\'rf\'][\'save_to_json\'] = \'myRandomForestSettings.json\'\n\n'+\
                         '         save_sett(sett) # save the current state of the settings\n'
    doc['print_sett'] =  '\nprint_sett:    function\n\n'+\
                         '  - can be used to print the current state of the data and/or random\n'+\
                         '    forest settings to the console\n'+\
                         '  - arguments:\n\n'+\
                         '       sett :  a copy of the settings dictionary\n\n'+\
                         '  - returns: None\n\n'+\
                         '  - e.g.:\n\n'+\
                         '         sett = prep_sett() # default settings as sett\n\n'+\
                         '         sett[\'data\'][\'inRefCSV\'] = \'myRefData.csv\' \n'+\
                         '         sett[\'rf\'][\'n_estimators\'] = 600  \n\n'+\
                         '         print_sett(sett) # print data and rf settings\n\n'+\
                         '         print_sett(sett[\'data\']) # print just data settings\n\n'+\
                         '         print_sett(sett[\'rf\']) # print just rf settings\n'
    doc['pointBiserialCorr'] =  '\npointBiserialCorr:    function\n\n'+\
                         '  - calculates the absolute point biserial correlation for each predictor\n'+\
                         '    variable/channel across all class pairs, and returns a ordered set of\n'+\
                         '    top variables based on this correlation\n'+\
                         '  - arguments:\n\n'+\
                         '       dat :  a copy of the reference data dataframe (refdat)\n'+\
                         '              - can be the imputed reference dataframe (impdat) only if\n\n'+\
                         '                the \'impute_strategy\' data setting is None (omission\n'+\
                         '                instead of imputation)\n\n'+\
                         '       n :  an integer value (>0 and <= the number of predictor variables)\n'+\
                         '            representing the number of \'top\' variables/image channels  \n'+\
                         '            that should be included in the returned variable set\n\n'+\
                         '       saveToCSV : (default False) boolean representing whether information \n'+\
                         '            on point biserial correlation and selected variables should be\n'+\
                         '            saved to CSV files or not. Output files can not be overwritten \n'+\
                         '            (see outCSV_dir and outCSV_basename)\n\n'+\
                         '            if True: the following CSV files will be saved:\n\n'+\
                         '                 ..._top_vars.csv : saved copy of \'top_vars\' (see returns) \n'+\
                         '                 ..._rank_chname.csv : saved copy of \'rank_chname\' (see returns) \n'+\
                         '                 ..._rank_chnum.csv : saved copy of \'rank_chnum\' (see returns) \n'+\
                         '                 ..._absCorrCoeff.csv : saved copy of \'corr\' (see returns) \n\n'+\
                         '            if False: the information will not be saved to CSV file \n\n'+\
                         '       outCSV_dir : (default \'\', files will be saved in current working dir) \n'+\
                         '                    Ignored if saveToCSV is False.\n'+\
                         '                    String representing full path to directory where output\n'+\
                         '                    CSV files should be saved.  Can also be specified relative\n'+\
                         '                    to the current working directory.  If an empty string, the\n'+\
                         '                    files will be saved directly within the current working \n'+\
                         '                    directory \n\n'+\
                         '       outCSV_basename :  (default \'pointBiserialCorr\')\n'+\
                         '                    Ignored if saveToCSV is False.\n'+\
                         '                    String representing the basename for the output CSV files.\n\n'+\
                         '  - returns: a dictionary with the following keys:\n\n'+\
                         '       [\'top_vars\'] data frame with columns representing selected  \n'+\
                         '                   variable names (\'channel_names\' column) corresponding\n'+\
                         '                   image channel numbers (\'channel_numbers\' column)\n\n'+\
                         '       [\'top_chnums\']  an array containing integer values representing the \n'+\
                         '                   channel numbers in the order in which they were selected\n\n'+\
                         '       [\'rank_chnames\']  data frame with columns representing   \n'+\
                         '                   channels/variable names ranked by absolute point biserial \n'+\
                         '                   correlation for a given class pair\n\n'+\
                         '       [\'rank_chnums\']  data frame with columns representing  \n'+\
                         '                   channels/variable numbers ranked by absolute point  \n'+\
                         '                   biserial correlation for a given class pair\n\n'+\
                         '       [\'corr\']  data frame with columns representing the absolute point \n'+\
                         '                   biserial orrelation across all image channels/variables \n'+\
                         '                   for a given class pair \n\n'
    doc['subsetVariables'] =  '\nsubsetVariables:    function\n\n'+\
                         '  - returns a subset (columns removed) copy of input data frame (refdat or  \n'+\
                         '    impdat) based on specified channel/variable names or channel numbers \n'+\
                         '  - arguments:\n\n'+\
                         '       dat :  a copy of the reference dataframe (refdat) or imputed dataframe \n'+\
                         '              (impdat) that is to be subset\n\n'+\
                         '              - columns of this dataframe that represent variables/image \n'+\
                         '                channels are spected to have names starting with \'ch#_\', \n'+\
                         '                where # is any number of digits representing the image \n'+\
                         '                channel number\n\n'+\
                         '       channels :  a list, tuple, pandas series, or numpy array \n'+\
                         '                   representing variables that are to be kept or dropped:\n\n'+\
                         '              - if values are integer, they are assumed to represent image\n'+\
                         '                channel numbers\n\n'+\
                         '              - if values are string, they are assumed to represent image\n'+\
                         '                channel names, as they appear in the dat column names\n\n'+\
                         '       drop : (default False)  a boolean representing whether the specified\n'+\
                         '              channels/variables are to be dropped from dat (True) or the\n'+\
                         '              specified channels/variables are to be the only ones kept \n'+\
                         '              in dat\n\n'+\
                         '  - returns: a copy of dat, subset (columns removed) as specified by user\n'
    doc['resetRefDataframe'] =  '\nprint_sett:    function\n\n'+\
                         '  - can be used to reset the reference data or imputer reference data \n'+\
                         '    (refdat or impdat) data-frame to one of two previous states:\n'+\
                         '      - immediately before data imputation and splitting\n'+\
                         '      - immediately after data imputation and splitting\n\n'+\
                         '  - arguments:\n\n'+\
                         '       dat :  a copy of the data frame that is to be reset\n\n'+\
                         '       imputedAndSplit :  (default False) boolean representing what state\n'+\
                         '                           the data frame should be returned to\n\n'+\
                         '              if True: immediately after data imputation (i.e. as returned\n'+\
                         '                       by the b_imputeDataAndSplitIndValidSet function)\n\n'+\
                         '              if False: immediately before data imputation (i.e. as returned\n'+\
                         '                       by the a_initialDataPrep function)\n\n'+\
                         '  - returns: a copy of the reset data frame\n\n'+\
                         '  - e.g.:\n\n'+\
                         '         refdat,rast,chaninfo = a_initialDataPrep(sett) \n'+\
                         '         refdat,impdat = b_imputeDataAndSplitIndValidSet(refdat,sett)\n'+\
                         '         refdat,impdat,rf = c_prepRandomForest(refdat,impdat,sett)\n\n'+\
                         '         # at this point both refdat and impdat have prediction results\n'+\
                         '         # attached to them, from the c_prepRandomForest function\n\n'+\
                         '         # reset refdat and impdat to their state as returned\n'+\
                         '         # by b_imputeDataAndSplitIndValidSet\n'+\
                         '         refdat = resetRefDataframe(refdat,imputedAndSplit=True)\n'+\
                         '         impdat = resetRefDataframe(impdat,imputedAndSplit=True)\n'
    doc['a_initialDataPrep'] =  '\na_initialDataPrep:    function\n\n'+\
                         '  - reads input files, prepares a data frame representing reference data \n'+\
                         '    (refdat), and extracts raster data at reference point locations\n\n'+\
                         '  - arguments:\n\n'+\
                         '       sett :  a copy of the settings dictionary\n\n'+\
                         '  - returns: a list with three components\n\n'+\
                         '       [0] the data frame representing the reference data, with extracted\n'+\
                         '           raster data attached.\n\n'+\
                         '       [1] a copy of the raster dataset, opened as an osgeo.gdal.Dataset\n'+\
                         '           object\n\n'+\
                         '       [2] a dictionary representing channel information extracted from\n'+\
                         '           raster dataset\n'
    doc['b_imputeDataAndSplitIndValidSet'] =  '\nb_imputeDataAndSplitIndValidSet:    function\n\n'+\
                         '  - takes a copy of the reference data data-frame, as returned by the \n'+\
                         '    a_initialDataPrep function, imputes the data, and splits/sets-aside\n'+\
                         '    the data points into an independent validation set and a set that is\n'+\
                         '    to be used for building training the Random Forest model\n\n'+\
                         '  - see the following data settings: \n'+\
                         '          \'indValidSplit\',\'splitByClass\',\n'+\
                         '          \'impute_strategy\',\'impute_by_class\',\n\n'+\
                         '  - arguments:\n\n'+\
                         '       refdat  :  a copy of the reference data data-frame, as returned by\n'+\
                         '                  the a_initialDataPrep function\n\n'+\
                         '       sett    :  a copy of the settings dictionary\n\n'+\
                         '       verbose :  (default: True) boolean representing whether or not to\n'+\
                         '                   print details of the imputation and splitting to the \n'+\
                         '                   console\n\n'+\
                         '  - returns: a list with two components\n\n'+\
                         '       [0] a data frame representing the reference data (refdat), with a\n'+\
                         '           new field/column (\'indValidSet\') representing whether a data\n'+\
                         '           point has been kept for the model (value of 0), set aside \n'+\
                         '           for independent validation (value of 1), or, in some cases,\n'+\
                         '           omitted due to the presence of NoData in combination with\n'+\
                         '           the lack of an imputation strategy (value of -1)\n\n'+\
                         '       [1] a data frame representing the imputed (or subset) reference \n'+\
                         '           data (impdat), after having implemented the imputation \n'+\
                         '           strategy. It is similar to the refdat data frame that is \n'+\
                         '           returned by this function, but NoData values from the raster\n'+\
                         '           dataset have been imputed at reference data locations. If the\n'+\
                         '           imputation strategy was None, then records/rows containing  \n'+\
                         '           NoData values have been omitted from this data frame instead.\n\n'
    doc['c_prepRandomForest'] =  '\nc_prepRandomForest:    function\n\n'+\
                         '  - prepares and fits/trains a Random Forest model based on the \n'+\
                         '    imputed/omitted reference data (impdat) and the user-specified random \n'+\
                         '    forest settings\n'+\
                         '  - also updates both the reference data and imputed reference data \n'+\
                         '    data-frames with predictions and probabilities of class membership at\n'+\
                         '    reference data locations\n\n'+\
                         '  - arguments:\n\n'+\
                         '       refdat :  a copy of the reference data data-frame, as returned by\n'+\
                         '                 the b_imputeDataAndSplitIndValidSet function\n'+\
                         '       impdat :  a copy of the imputed and independent validation-split \n'+\
                         '                 reference data data-frame, as returned by the\n'+\
                         '                 b_imputeDataAndSplitIndValidSet function\n'+\
                         '       sett   :  a copy of the settings dictionary\n\n'+\
                         '  - returns: a list with three components\n\n'+\
                         '       [0] a copy of the reference data data-frame (refdat) with \n'+\
                         '           Random Forest predictions/probabilities added.\n\n'+\
                         '       [1] a copy of the imputed/omitted reference data data-frame \n'+\
                         '          (impdat) with Random Forest predictions/probabilities added.\n\n'+\
                         '       [2] the trained/fit Random Forest Classifier object\n'
    doc['d_independentValidation'] =  '\nd_independentValidation:    function\n\n'+\
                         '  - generates accuracy statistics from the Random Forest classification \n'+\
                         '    based on the reference data points that were set aside for independent\n'+\
                         '    validation\n'+\
                         '  - accuracy statistics include:  error matrix, user\'s & producer\'s\n'+\
                         '                                   accuracies, overall accuracy, kappa\n\n'+\
                         '  - arguments:\n\n'+\
                         '       refdat :  a copy of the reference data data-frame, as returned by\n'+\
                         '                 the c_prepRandomForest function\n'+\
                         '       sett   :  a copy of the settings dictionary\n\n'+\
                         '  - returns: a dictionary with the following keys\n\n'+\
                         '       [\'errorMat\']   a data frame representing the error matrix \n\n'+\
                         '       [\'classAcc\']   a data frame representing class-specific user\'s \n'+\
                         '                        and producer\'s accuracies \n\n'+\
                         '       [\'overallAcc\'] a float value representing the overall accuracy \n\n'+\
                         '       [\'kappa\']      a float value representing Cohen\'s kappa coefficient\n'
    doc['e_consolidateArossIterations'] =  '\ne_consolidateArossIterations:    function\n\n'+\
                         '  - Consolidates independent validation results and Random Forest model\n'+\
                         '    characteristics across iterations into individual data frames and \n'+\
                         '    (optionally) output CSV files\n\n'+\
                         '  - arguments:\n\n'+\
                         '       dat   :  a copy of the reference data (refdat) or imputed reference \n'+\
                         '                data (impdat),\n\n'+\
                         '       sett  :  a copy of the settings dictionary\n\n'+\
                         '       valid :  (optional) a list or dictionary (with iteration numbers as \n'+\
                         '                keys) of independent validation results as returned by the\n'+\
                         '                d_independentValidation function\n'+\
                         '                - this is required for consolidating independent \n'+\
                         '                  validation accuracy statistics and/or error matrices\n\n'+\
                         '       rf    :  (optional) a list or dictionary (with iteration numbers as \n'+\
                         '                keys) of random forest models as returned by the \n'+\
                         '                c_prepRandomForest function\n'+\
                         '                - this is required for consolidating feature importances \n'+\
                         '                  and out-of-bag scores\n\n'+\
                         '  - returns: a dictionary with the following keys:\n\n'+\
                         '       [\'errorMat\']  a data frame of error matrices across iterations \n'+\
                         '                    (requires input for valid). This object is None if \n'+\
                         '                     valid argument is None / not specified.\n\n'+\
                         '       [\'accStats\']  a data frame of independent validation accuracy \n'+\
                         '                     statistics across iterations (requires input for \n'+\
                         '                     valid argument). This object is None if valid is None.\n\n'+\
                         '       [\'featureImportances\'] a data frame of feature importances across\n'+\
                         '                     iterations (requires input for rf argument). This \n'+\
                         '                     object is None if rf argument is None / not \n'+\
                         '                     specified.\n\n'+\
                         '       [\'oobScore\']  a data frame of out-of-bag Random Forest accuracies \n'+\
                         '                     across iterations (requires input for rf argument, \n'+\
                         '                     with all RF models having been set so that oob_score \n'+\
                         '                     is True). This object is None if rf argument is None \n'+\
                         '                     or the Random Forest models were fit without  \n'+\
                         '                     calculating out-of-bag accuracy score. \n'+\
                         '                     - see the \'oob_score\' random forest setting\n'
    ######################################
    ## settings common to the 'data' settings and the random forest ('rf') settings
    doc['working_dir'] =       '\nworking_dir:    data setting and random forest setting\n\n'+\
                               '  - represents the current working directory\n'+\
                               '  - to change this setting you should use os.chdir(), as in: \n'+\
                               '          os.chdir(\'c:\\\\working\\\\myWorkingDir\')\n'
    doc['load_from_json'] =    '\nload_from_json:    data setting and random forest setting\n\n'+\
                               '  - default:  None  (do not load settings from json file)\n'+\
                               '  - a string representing the filename (if in working directory) or \n'+\
                               '    the full path to an input json file storing the settings\n'+\
                               '  - if a valid filename or path is specified, settings (data or rf \n'+\
                               '    settings) will be loaded from that file\n'+\
                               '  - filename should end in \'.json\'\n'
    doc['save_to_json'] =      '\nsave_to_json:    data setting and random forest setting\n\n'+\
                               '  - default:  None  (do not save settings to json file)\n'+\
                               '  - a string representing the filename (in working directory) or the \n'+\
                               '    full path to an output json file for \n'+\
                               '    storing the settings\n'+\
                               '  - if a valid filename or path is specified, settings (data or rf  \n'+\
                               '    settings) will be saved to that file\n'+\
                               '  - filename should end in \'.json\'\n'
    ######################################
    ## 'data' settings
    doc['randSeedValue'] =     '\nrandSeedValue:    data setting\n\n'+\
                               '  - default:  None  (do not set random seed when calling the  \n'+\
                               '                     a_initialDataPrep function)\n'+\
                               '  - a numeric value that will be used to set the random seed when   \n'+\
                               '    calling the a_initialDataPrep function\n'
    doc['inRefCSV'] =          '\ninRefCSV:    **REQUIRED** data setting\n\n'+\
                               '  - default:  \'\'    (empty string; user specification is required)\n'+\
                               '  - string representing the filename (if in working directory) or \n'+\
                               '    the full path to an input CSV file representing reference data \n'+\
                               '    points\n'+\
                               '  - at the very least, this file should contain:\n'+\
                               '     - a field/column containing integer values representing classes\n'+\
                               '     - two fields/columns containing values representing x and y \n'+\
                               '       coordinates\n'+\
                               '         - these coordinates must be in the same spatial reference \n'+\
                               '           system as the input raster dataset\n'
    doc['FN_pointID'] =        '\nFN_pointID:    data setting\n\n'+\
                               '  - default:  \'\'    (empty string; script will assign unique  \n'+\
                               '    identifier to each reference data point)\n'+\
                               '  - string representing the name of the field/column within the  \n'+\
                               '    input reference data CSV file that contains a \n'+\
                               '    unique identifier for each reference data point\n'+\
                               '  - not required, but if specified, this information may be included  \n'+\
                               '    in some output files\n'
    doc['FN_classnum'] =       '\nFN_classnum:    **REQUIRED** data setting\n\n'+\
                               '  - default:  \'\'    (empty string; user specification is required)\n'+\
                               '  - string representing the name of the field/column within the  \n'+\
                               '    input reference data CSV file that contains integer values\n'+\
                               '    representing reference classes\n'
    doc['FN_classlab'] =       '\nFN_classlab:    data setting\n\n'+\
                               '  - default:  \'\'    (empty string; no class labels will be \n'+\
                               '                     included)\n'+\
                               '  - string representing the name of the field/column within the \n'+\
                               '    input reference data CSV file that contains class labels (e.g. \n'+\
                               '    text representing the land cover that a given class number\n'+\
                               '    represents)\n'+\
                               '  - not required, but if specified, this information may be included \n'+\
                               '    in some output files\n'
    doc['FN_xy'] =             '\nFN_xy:    **REQUIRED**  data setting\n\n'+\
                               '  - default:  (\'\',\'\')   (tuple of two empty strings; user \n'+\
                               '                         specification is required)\n'+\
                               '  - tuple containing two strings representing names of the \n'+\
                               '    fields/columns within the input reference data CSV \n'+\
                               '    file that contain the x and y coordinates (respectively) of the \n'+\
                               '    reference data point locations \n'+\
                               '     - these coordinates must be in the same spatial reference  \n'+\
                               '       system as the input raster dataset\n'
    doc['inRastPath'] =        '\ninRastPath:    **REQUIRED**  data setting\n\n'+\
                               '  - default:  \'\'    (empty string; user specification is required)\n'+\
                               '  - string representing the filename (if in working directory) or  \n'+\
                               '    the full path to an input raster dataset\n'
    doc['indValidSplit'] =     '\nindValidSplit:    data setting\n\n'+\
                               '  - default:  0.3    (30% of the valid reference data points will be  \n'+\
                               '                      set aside for independent validation)\n'+\
                               '  - a floating point value ( >=0.0 and <1.0 ) representing the  \n'+\
                               '    proportion of the reference data to split off / set-aside for\n'+\
                               '    independent validation.\n'+\
                               '  - if 0, no independent validation will be performed\n'+\
                               '  - the splitting-off of the independent validation set will take \n'+\
                               '    place after data imputation\n'+\
                               '  - splitting of independent validation set will also depend on the \n'+\
                               '    specified \'splitByClass\' data setting\n'
    doc['splitByClass'] =      '\nsplitByClass:    data setting\n\n'+\
                               '  - default:  True    (splitting off of the independent validation \n'+\
                               '                       will occur at the class level)\n'+\
                               '  - boolean representing whether (True) or not (False) the \n'+\
                               '    independent validation split proportion should be applied at \n'+\
                               '    the class level\n'+\
                               '  - if False, the split proportion (see \'indValidSplit\' data \n'+\
                               '    setting) will be applied to the reference dataset in general, \n'+\
                               '    meaning the class proportions in the independent validation set\n'+\
                               '    may differ slightly from those in the model set\n'
    doc['minPointsForModel'] = '\nminPointsForModel:    data setting\n\n'+\
                               '  - default:  10    (if less than 10 reference points are left \n'+\
                               '                     for the model, an error will be raised)\n'+\
                               '  - integer value (>=0) representing the minimum number of reference\n'+\
                               '    data points that should be going into the model / Random Forest\n'+\
                               '    algorithm following the independent validation split \n'+\
                               '  - ignored if indValidSplit is zero (all data are being used for \n'+\
                               '    the model) \n'+\
                               '  - if less than this number of data-points are remaining for the \n'+\
                               '    model, an error will be raised \n'+\
                               '  - if \'splitByClass\' data setting is True, then this number will \n'+\
                               '    be checked against the number of reference data points of each \n'+\
                               '    class that are going into the model \n'+\
                               '  - if \'splitByClass\' data setting is False, then this number will \n'+\
                               '    be checked against the overall number of reference data points,\n'+\
                               '    in general, that are going into the model \n'
    doc['saveDataCSV'] =       '\nsaveDataCSV:    data setting\n\n'+\
                               '  - default:  None    (no reference data CSV file will be saved)\n'+\
                               '  - string representing the filename (if in working directory) or \n'+\
                               '    the full path to an output CSV file storing the following\n'+\
                               '    information:  \n\n'+\
                               '     - indicator of whether a data point was set aside for \n'+\
                               '       independent validation (indValidSet). \n'+\
                               '        - Value of 1 for set aside. 0 for not. If  \'impute_strategy\'  \n'+\
                               '          is set to None (omission), the omitted rows are still  \n'+\
                               '          included in this file, but will have a value of -1 for \n'+\
                               '          indValidSet. \n\n'+\
                               '     - reference data: ref. class labels (classLab),ref. class \n'+\
                               '       values (classNum), x/y coordinates (x/y) \n\n'+\
                               '     - extracted (non-imputed / no omissions) training samples with\n'+\
                               '       NaN for NoData \n'+\
                               '        - Predictions may not have been made based on these data. \n'+\
                               '          See \'impute_strategy\', below. \n'+\
                               '        - Field/Column names begin with \'ch#_\' where # is any \n'+\
                               '          number of digits representing the image channel from which \n'+\
                               '          the data were extracted \n\n'+\
                               '     - predictions (predict) and probabilities of class membership \n'+\
                               '       at each reference data point \n'+\
                               '        - Class membership probability fields are named \'prob_c#\',\n'+\
                               '          where # represents a class number/value. \n\n'+\
                               '  - if specified, file will be generated when the c_prepRandomForest()\n'+\
                               '    function is called\n'+\
                               '  - specified path must not already exist. No over-writing.\n' 
    doc['impute_strategy'] =   '\nimpute_strategy:    data setting\n\n'+\
                               '  - default:  \'mean\'    (NoData values will be substituted with the \n'+\
                               '                           mean)\n'+\
                               '  - either None, a string representing the data imputation strategy, \n'+\
                               '    or numeric value for filling missing values\n\n'+\
                               '  - If None, no imputation will be performed. Instead, any data \n'+\
                               '    points that contain NoData / NaN values in any channel, will be\n'+\
                               '    omitted.  This omission will be done prior to the independent \n'+\
                               '    validation split.\n'+\
                               '  - If \'mean\', missing values will be replaced with the mean of the \n'+\
                               '    column.\n'+\
                               '  - If \'median\', missing values will be replaced with the median of \n'+\
                               '    the column.\n'+\
                               '  - If \'most_frequent\', missing values will be replaced with the \n'+\
                               '    most frequent value of the column.\n'+\
                               '  - If a numeric value is specified, missing values will be replaced\n'+\
                               '    with this value.\n'
    doc['impute_by_class'] =   '\nimpute_by_class:    data setting\n\n'+\
                               '  - default:  True    (imputation will be performed at the class \n'+\
                               '                       level) \n'+\
                               '  - boolean representing whether (True) or not (False) the \n'+\
                               '    imputation should be performed at the class level \n'+\
                               '  - this setting is ignored if the impute_strategy is None or a \n'+\
                               '    constant value (int or float)\n'+\
                               '  - e.g.  if impute_strategy is \'mean\' and impute_by_class is True\n'+\
                               '    then, for a given channel, missing values for class 1 will be \n'+\
                               '    replaced by the mean of the reference points that are of\n'+\
                               '    reference class 1.  The same for class 2, and so on.\n'+\
                               '  - e.g.  if impute_strategy is \'mean\' and impute_by_class is False\n'+\
                               '    then, for a given channel, missing values for class 1 will be \n'+\
                               '    replaced by the mean of the channel (at ref. point locations) \n'+\
                               '    regardless of reference class.\n'                          
    doc['saveImputedDataCSV'] ='\nsaveImputedDataCSV:    data setting\n\n'+\
                               '  - default:  None    (no imputed reference data CSV file will be\n'+\
                               '                       saved)\n'+\
                               '  - string representing the filename (if in working directory) or \n'+\
                               '    the full path to an output CSV file storing a copy of the \n'+\
                               '    reference data with imputation (or omission in the case where\n'+\
                               '   \'impute_strategy\' is None) applied to missing values\n'+\
                               '  - field/column names are the same as for \'saveDataCSV\', above, but \n'+\
                               '    the extracted raster data (\'ch#_...\') have either been \n'+\
                               '    imputed or points have been omitted, based on the \n'+\
                               '   \'impute_strategy\' and \'impute_by_class\' options.\n'+\
                               '  - if specified, file will be generated when the c_prepRandomForest()\n'+\
                               '    function is called\n'+\
                               '  - specified path must not already exist. No over-writing.\n' 
    doc['saveRFclassifier'] =  '\nsaveRFclassifier:    data setting\n\n'+\
                               '  - default:  None    (Random Forest classifier will not be saved to\n'+\
                               '                       file)\n'+\
                               '  - string representing the filename (if in working directory) or \n'+\
                               '    the full path to an output Python pickle file (recommend using \n'+\
                               '    .pkl extension) storing a copy of the fitted Random Forest model\n'+\
                               '  - saving the model may be necessary if you later want to apply \n'+\
                               '    predictions over the full image set\n'+\
                               '  - if specified, file will be generated when the c_prepRandomForest()\n'+\
                               '    function is called\n'+\
                               '  - specified path must not already exist. No over-writing.\n'
    doc['saveImportancesCSV'] ='\nsaveImportancesCSV:    data setting\n\n'+\
                               '  - default:  None    (no CSV file representing feature importances\n'+\
                               '                       from the current model will be saved)\n'+\
                               '  - string representing the filename (if in working directory) or \n'+\
                               '    the full path to an output CSV file, storing a copy of the\n'+\
                               '    feature importances from the current Random Forest model\n'+\
                               '  - if specified, file will be generated when the c_prepRandomForest()\n'+\
                               '    function is called\n'+\
                               '  - specified path must not already exist. No over-writing.\n'+\
                               '  - if not specified, file representing feature importances across \n'+\
                               '    several models can still be generated when the \n'+\
                               '    e_consolidateArossIterations() function is called\n'
    doc['saveErrorMatrixCSV'] ='\nsaveErrorMatrixCSV:    data setting\n\n'+\
                               '  - default:  None    (no CSV file representing independent \n'+\
                               '                       validation error matrices will be saved)\n'+\
                               '  - string representing the filename (if in working directory) or \n'+\
                               '    the full path to an output CSV file, storing a copy of the\n'+\
                               '    error matrices from the independent validation\n'+\
                               '  - if specified, file will be generated when the\n'+\
                               '    d_independentValidation() function is called\n'+\
                               '  - specified path must not already exist. No over-writing.\n'+\
                               '  - if not specified, file representing independent validation error\n'+\
                               '    matrices across several models can still be generated when the\n'+\
                               '    e_consolidateArossIterations() function is called\n'
    doc['saveAccuracyCSV'] =   '\nsaveAccuracyCSV:    data setting\n\n'+\
                               '  - default:  None    (no CSV file representing independent \n'+\
                               '                       validation accuracy statistics will be saved)\n'+\
                               '  - string representing the filename (if in working directory) or \n'+\
                               '    the full path to an output CSV file, storing a copy of the\n'+\
                               '    accuracy statistics from the independent validation\n'+\
                               '  - if specified, file will be generated when the\n'+\
                               '    d_independentValidation() function is called\n'+\
                               '  - specified path must not already exist. No over-writing.\n'+\
                               '  - if not specified, file representing independent validation \n'+\
                               '    accuracy statistics across several models can still be generated\n'+\
                               '    when the e_consolidateArossIterations() function is called\n'
    doc['consol_outCSV_dir'] = '\nconsol_outCSV_dir:    data setting\n\n'+\
                               '  - default:  None    (no CSV files representing info consolidated \n'+\
                               '                       across Random Forest models will be saved)\n'+\
                               '  - string representing path (relative to working directory or full\n'+\
                               '    path) to a directory that CSV files output from the \n'+\
                               '    e_consolidateArossIterations() function will be stored within\n'+\
                               '  - if specified, several CSV files representing statistics that \n'+\
                               '    have been consolidated across model fitting & validation  \n'+\
                               '    iterations may be saved, including:\n\n'+\
                               '     - ...accuracyStats.csv: independent validation accuracy\n'+\
                               '              statistics.  Requires specification for the \'valid\'\n'+\
                               '              argument when calling e_consolidateArossIterations()\n\n'+\
                               '     - ...importances.csv: feature importances.  Requires \n'+\
                               '              specification for the \'rf\' argument when calling \n'+\
                               '              e_consolidateArossIterations()\n\n'+\
                               '     - ...indErrorMatrices.csv: independent validation error\n'+\
                               '              matrices.  Requires specification for the \'valid\'\n'+\
                               '              argument when calling e_consolidateArossIterations()\n\n'+\
                               '     - ...OOBscore.csv: Accuracy score from the model\'s out-of-bag\n'+\
                               '              data.  Requires specification for the \'rf\'\n'+\
                               '              argument when calling e_consolidateArossIterations()\n\n'+\
                               '     \'...\' represents the \'consol_outCSV_basename\' data setting\n\n'+\
                               '  - files will be generated when the e_consolidateArossIterations()\n'+\
                               '    function is called\n'+\
                               '  - over-writing of output CSV files not supported so a different \n'+\
                               '    \'consol_outCSV_dir\' and/or \'consol_outCSV_basename\' should be\n'+\
                               '    specified each time the e_consolidateArossIterations() function\n'+\
                               '    is called (unless previously-generated files have been moved or\n'+\
                               '    deleted)\n'
    doc['consol_outCSV_basename'] = '\nconsol_outCSV_basename:    data setting\n\n'+\
                               '  - default:  \'consol_\'    \n'+\
                               '  - string representing basename for CSV files output when calling\n'+\
                               '    the e_consolidateArossIterations() function\n'+\
                               '  - if specified as an empty string (\'\'), CSV files can still be \n'+\
                               '    generated but they won\'t have a basename\n'+\
                               '  - if specified, recommend ending with an underscore or other \n'+\
                               '    character to separate the basename from the rest of the filename\n'+\
                               '  - over-writing of output CSV files not supported so a different \n'+\
                               '    \'consol_outCSV_dir\' and/or \'consol_outCSV_basename\' should be\n'+\
                               '    specified each time the e_consolidateArossIterations() function\n'+\
                               '    is called (unless previously-generated files have been moved or\n'+\
                               '    deleted)\n'
    ######################################
    ## random forest ('rf') settings
    doc['n_estimators'] =  '\nn_estimators:    random forest (\'rf\') setting\n\n'+\
                           '  *** This documentation copies from the official documentation at:\n'+\
                           '      https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html\n\n'+\
                           '  - default:  100    (100 trees in the forest)\n'+\
                           '  - The number of trees in the forest.\n'
    doc['criterion'] =     '\ncriterion:    random forest (\'rf\') setting\n\n'+\
                           '  *** This documentation copies from the official documentation at:\n'+\
                           '      https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html\n\n'+\
                           '  - default:  \'gini\'    (split quality measured by Gini impurity)\n'+\
                           '  - The function to measure the quality of a split.\n'+\
                           '  - Supported criteria are \'gini\' for the Gini impurity and \'entropy\'\n'+\
                           '    for the information gain.\n'+\
                           '  - This parameter is tree-specific.\n'
    doc['max_depth'] =     '\nmax_depth:    random forest (\'rf\') setting\n\n'+\
                           ' *** This documentation copies from the official documentation at:\n'+\
                           '      https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html\n\n'+\
                           '  - default: None (nodes expanded until all leaves are pure or until all\n'+\
                           '                   leaves contain less than min_samples_split samples)\n'+\
                           '  - Integer representing the maximum depth of the tree.\n'
    doc['min_samples_split'] = '\nmin_samples_split:    random forest (\'rf\') setting\n\n'+\
                           ' *** This documentation copies from the official documentation at:\n'+\
                           '      https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html\n\n'+\
                           '  - default: 2   (at least two samples required to split internal node)\n'+\
                           '  - Integer or float value representing the minimum number of samples\n'+\
                           '    required to split an internal node.\n'+\
                           '  - If int, then consider min_samples_split as the minimum number.\n'+\
                           '  - If float, then min_samples_split is a fraction and\n'+\
                           '    ceil(min_samples_split * n_samples) are the minimum number of\n'+\
                           '    samples for each split.\n'
    doc['min_samples_leaf'] = '\nmin_samples_leaf:    random forest (\'rf\') setting\n\n'+\
                           ' *** This documentation copies from the official documentation at:\n'+\
                           '      https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html\n\n'+\
                           '  - default: 1   (at least one sample required to be a leaf node)\n'+\
                           '  - Integer or float value.\n'+\
                           '  - A split point at any depth will only be considered if it leaves at\n'+\
                           '    least min_samples_leaf training samples in each of the left and\n'+\
                           '    right branches.\n'+\
                           '  - This may have the effect of smoothing the model, especially in \n'+\
                           '    regression.\n'
    doc['min_weight_fraction_leaf'] = '\nmin_weight_fraction_leaf:  random forest (\'rf\') setting\n\n'+\
                           ' *** This documentation copies from the official documentation at:\n'+\
                           '      https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html\n\n'+\
                           '  - default: 0.0 \n'+\
                           '  - The minimum weighted fraction of the sum total of weights (of all \n'+\
                           '    the input samples) required to be at a leaf node.\n'+\
                           '  - Samples have equal weight when sample_weight is not provided.\n'
    doc['max_features'] = '\nmax_features:    random forest (\'rf\') setting\n\n'+\
                           ' *** This documentation copies from the official documentation at:\n'+\
                           '      https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html\n\n'+\
                           '  - default: \'auto\' (square root of the number of features)\n'+\
                           '  - Integer, float, string, or None representing the number of features\n'+\
                           '    to consider when looking for the best split.\n'+\
                           '  - If int, then consider max_features features at each split.\n'+\
                           '  - If float, then max_features is a fraction and \n'+\
                           '    int(max_features * n_features) features are considered at each \n'+\
                           '    split.\n'+\
                           '  - If \'auto\', then max_features=sqrt(n_features).\n'+\
                           '  - If \'sqrt\', then max_features=sqrt(n_features) (same as \'auto\').\n'+\
                           '  - If \'log2\', then max_features=log2(n_features).\n'+\
                           '  - If None, then max_features=n_features.\n'
    doc['max_leaf_nodes'] = '\nmax_leaf_nodes:    random forest (\'rf\') setting\n\n'+\
                           ' *** This documentation copies from the official documentation at:\n'+\
                           '      https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html\n\n'+\
                           '  - default: None (unlimited number of leaf nodes)\n'+\
                           '  - Integer or None\n'+\
                           '  - Grow trees with max_leaf_nodes in best-first fashion.\n'+\
                           '  - Best nodes are defined as relative reduction in impurity.\n'+\
                           '  - If None then unlimited number of leaf nodes.\n'+\
                           '  - The search for a split does not stop until at least one valid \n'+\
                           '    partition of the node samples is found, even if it requires to \n'+\
                           '    effectively inspect more than max_features features.\n'
    doc['min_impurity_decrease'] = '\nmin_impurity_decrease:    random forest (\'rf\') setting\n\n'+\
                           ' *** This documentation copies from the official documentation at:\n'+\
                           '      https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html\n\n'+\
                           '  - default: 0.0 (node will be split if causes any decrease in impurity)\n'+\
                           '  - Float value\n'+\
                           '  - A node will be split if this split induces a decrease of the \n'+\
                           '    impurity greater than or equal to this value.\n'+\
                           '  - More detailed information is provided at the URL listed above.\n'
    doc['bootstrap'] =     '\nbootstrap:    random forest (\'rf\') setting\n\n'+\
                           ' *** This documentation copies from the official documentation at:\n'+\
                           '      https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html\n\n'+\
                           '  - default: True  (bootstrap samples used when building trees)\n'+\
                           '  - boolean representing whether (True) or not (False) to use bootstrap\n'+\
                           '    samples when building trees\n'+\
                           '  - If False, the whole datset is used to build each tree.\n'
    doc['oob_score'] =     '\noob_score:    random forest (\'rf\') setting\n\n'+\
                           ' *** This documentation copies from the official documentation at:\n'+\
                           '      https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html\n\n'+\
                           '  - default: False  (out-of-bag samples not used to estimate \n'+\
                           '                     generatlization accuracy)\n'+\
                           '  - boolean representing whether (True) or not (False) to use out-of-bag\n'+\
                           '    samples to estimate the generalization accuracy.\n'
    doc['n_jobs'] =        '\nn_jobs:    random forest (\'rf\') setting\n\n'+\
                           ' *** This documentation copies from the official documentation at:\n'+\
                           '      https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html\n\n'+\
                           '  - default: None  (1 processor)\n'+\
                           '  - Integer representing the number of jobs to run in parallel for both\n'+\
                           '    fit and predict.\n'+\
                           '  - None means 1 unless in a joblib.parallel_backend context. \n'+\
                           '  - Value of -1 means using all processors. \n'+\
                           '  - More detailed information is provided at the URL listed above.\n'
    doc['random_state'] =  '\nrandom_state:    random forest (\'rf\') setting\n\n'+\
                           ' *** This documentation copies from the official documentation at:\n'+\
                           '      https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html\n\n'+\
                           '  - default: None  (random number generator is the RandomState instance\n'+\
                           '                    used by np.random)\n'+\
                           '  - Integer, RandomState instance, or None.\n'+\
                           '  - If int, random_state is the seed used by the random number\n'+\
                           '    generator. \n'+\
                           '  - If RandomState instance, random_state is the random number \n'+\
                           '    generator. \n'+\
                           '  - If None, the random number generator is the RandomState instance\n'+\
                           '    used by np.random. \n'
    doc['verbose'] =       '\nverbose:    random forest (\'rf\') setting\n\n'+\
                           ' *** This documentation copies from the official documentation at:\n'+\
                           '      https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html\n\n'+\
                           '  - default: 0\n'+\
                           '  - Integer value controlling the verbosity when fitting and predicting.\n'
    doc['warm_start'] =    '\nwarm_start:    random forest (\'rf\') setting\n\n'+\
                           ' *** This documentation copies from the official documentation at:\n'+\
                           '      https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html\n\n'+\
                           '  - default: False  (new forest fit each time)\n'+\
                           '  - Boolean\n'+\
                           '  - When set to True, reuse the solution of the previous call to fit and\n'+\
                           '    add more estimators to the ensemble\n'+\
                           '  - When set to False, a whole new forest is fit\n'
    doc['class_weight'] =  '\nclass_weight:    random forest (\'rf\') setting\n\n'+\
                           ' *** This documentation copies from the official documentation at:\n'+\
                           '      https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html\n\n'+\
                           '  - default: None  (all classes have weight one)\n'+\
                           '  - dict, list of dicts, \'balanced\', \'balanced_subsample\', or None\n'+\
                           '  - Weights associated with classes in the form {class_label: weight}.\n'+\
                           '  - If not given, all classes are supposed to have weight one. For\n'+\
                           '    multi-output problems, a list of dicts can be provided in the same\n'+\
                           '    order as the columns of y.\n'+\
                           '  - The \'balanced\' mode uses the values of y to automatically adjust\n'+\
                           '    weights inversely proportional to class frequencies in the input\n'+\
                           '    data as  n_samples / (n_classes * np.bincount(y))\n'+\
                           '  - The \'balanced_subsample\' mode is the same as \'balanced\' except \n'+\
                           '    that weights are computed based on the bootstrap sample for every \n'+\
                           '    tree grown.\n'+\
                           '  - More detailed information is provided at the URL listed above.\n'
    if outTextFile is not None:
        with open(outTextFile,"w") as outf:
            for k in doc.keys():
                outf.write(doc[k])
    return doc
                              
