## script prepared by Patrick Kirby for Amir Behnamian for IGARSS 2019
## prepared:  June 2019
##
## Script contains a series of functions for analyzing Random Forest models and
## land cover classification outputs.
##
##
#install.packages('randomForest')
#install.packages('rjson','sp','raster','foreign','xml2')
library(randomForest)
library(rjson)
library(sp)
library(raster)
library(foreign)
library(xml2)

############################################
##############  General function definitions
##
#' Check whether a required setting currently holds a usable value.
#'
#' Called by print_sett to decide whether a setting gets the '**' marker.
#'
#' @param name The setting name. 'inRefCSV' and 'inRastPath' must be a single
#'   path to an existing file, 'FN_classnum' a single non-empty string and
#'   'FN_xy' two non-empty strings. Any other name only needs a first value
#'   that is not NA.
#' @param val The value currently stored for the setting.
#' @return TRUE if the requirement is met, FALSE otherwise.
.reqSettMet <- function (name,val){
  ## NULL / zero-length values can never satisfy a requirement; testing them
  ## with is.na() directly would give a zero-length `if` condition (an error)
  if (length(val)==0 || is.na(val[1])) return(FALSE)
  if (name%in%c('inRefCSV','inRastPath')){
    if (!is.character(val) || length(val)!=1) return(FALSE)
    if (!file.exists(val)) return(FALSE)
  }else if (name=='FN_classnum'){
    if (!is.character(val) || length(val)!=1) return(FALSE)
    if (nchar(val)<1) return(FALSE)
  }else if (name=='FN_xy'){
    if (length(val)!=2) return(FALSE)
    if (!is.character(val)) return(FALSE)
    ## an NA in either position counts as "not specified"
    if (anyNA(val) || any(nchar(val)<1)) return(FALSE)
  }
  return(TRUE)
}

#' Fetch a component of a list by its name.
#'
#' @param list A list of settings (or any object that supports `[[` by name).
#' @param key  The component name, as a single character string.
#' @return The value stored under `key`; NULL when `list` is a list that has no
#'   component of that exact name.
.lget <- function(list,key){
  ## `[[` looks the name up directly, so keys that are not syntactically valid
  ## R names (spaces, quotes, ...) work and no text is ever parsed or evaluated
  list[[key]]
}
##
#' Names of the columns in refdat that hold the extracted training sample data.
#'
#' The candidate columns are narrowed in three optional steps, applied in this
#' order: keep only columns after `startAfter`, keep only columns before
#' `endBefore`, keep only names matching `pattern`. Passing NA skips a step.
#'
#' @param refdat     Object whose names() are the column names to search.
#' @param pattern    Regular expression the returned names must match.
#' @param startAfter Name of the column after which the sample columns begin.
#' @param endBefore  Name of the column before which the sample columns end.
#' @return Character vector of column names (possibly empty).
.getTrainSampleColumnNames <- function(refdat,pattern='^ch[0-9]+\\_',startAfter='y',endBefore=NA){
  cols <- names(refdat)
  if (!is.na(startAfter)){
    pos <- match(startAfter,cols)
    if (is.na(pos)) stop('Column \'',startAfter,'\' (startAfter) was not found.')
    ## negative indexing gives an empty result when the marker is the last
    ## column, where (pos+1):length(cols) would count backwards instead
    cols <- cols[-seq_len(pos)]
  }
  if (!is.na(endBefore)){
    pos <- match(endBefore,cols)
    if (is.na(pos)) stop('Column \'',endBefore,'\' (endBefore) was not found.')
    ## seq_len(0) is empty, so a marker in first position leaves no columns
    cols <- cols[seq_len(pos-1)]
  }
  if (!is.na(pattern)) cols <- cols[grepl(pattern,cols)]
  return(cols)
}
##
#' Replace the NA values in every column of dat.
#'
#' @param dat Table of sample values (data frame; a matrix or vector is
#'   converted to a data frame first).
#' @param impute_strategy 'mean' or 'median' to fill each column's NA values
#'   with that column's own mean/median (computed with na.rm=TRUE), or a
#'   numeric constant to fill with.
#' @return A data frame with the row and column names of dat, with NA values
#'   replaced. A column that is entirely NA has no mean/median to fill with and
#'   therefore stays missing.
.impute <- function(dat,impute_strategy){
  if (is.numeric(impute_strategy)){
    fill_value <- function(x) impute_strategy
  }else if (identical(as.character(impute_strategy),'mean')){
    fill_value <- function(x) base::mean(x,na.rm=TRUE)
  }else if (identical(as.character(impute_strategy),'median')){
    fill_value <- function(x) stats::median(x,na.rm=TRUE)
  }else{
    stop('Expects impute_strategy to be either  \'mean\',\'median\', or a numeric constant')
  }
  if (!is.data.frame(dat)) dat <- as.data.frame(dat)
  ## work column by column on the data frame itself: apply() would collapse a
  ## one-row table to a plain vector and hand back the wrong shape
  dat[] <- lapply(dat,function(x){
    if (anyNA(x)) x[is.na(x)] <- fill_value(x)
    x
  })
  return(dat)
}
##
#' Omit or impute NA values in the sample data columns of the reference data
#' (model fitting and independent validation points alike).
#'
#' @param refdat    Reference data with a @data slot holding a 'classNum'
#'   column and the sample data columns found by .getTrainSampleColumnNames.
#' @param data_sett Data settings list. `impute_strategy` NA means "omit
#'   points with NA sample values"; otherwise it is passed to .impute.
#'   `impute_by_class` chooses per-class versus whole-table imputation.
#' @return refdat with the affected rows removed or their NA values filled.
.imputeData <- function(refdat,data_sett){
  ## get vector of field/column names in refdat that store the training sample data (X)
  xcols <- .getTrainSampleColumnNames(refdat)
  if (is.na(data_sett$impute_strategy)){
    ## no imputation: keep only the points without any NA in the sample columns
    refdat <- base::subset(refdat,subset=apply(base::subset(refdat@data,select=xcols),MARGIN=1,
                                               FUN=function(x){!any(is.na(x))}))
  }else if (data_sett$impute_by_class){
    classes <- base::unique(refdat$classNum)
    ## points without a class number cannot be imputed per class; they are left
    ## untouched instead of producing an NA subscript in the assignment below
    classes <- classes[!is.na(classes)]
    for (cls in classes){
      rows <- which(refdat$classNum==cls)
      ## drop=FALSE keeps a table even when there is only one sample column
      refdat@data[rows,xcols] <- .impute(refdat@data[rows,xcols,drop=FALSE],
                                         data_sett$impute_strategy)
    }
  }else{
    refdat@data[,xcols] <- .impute(refdat@data[,xcols,drop=FALSE],data_sett$impute_strategy)
  }
  return(refdat)
}
#######################################################
############## Settings

#' Save settings to json file(s); can be called after updating them.
#'
#' @param sett Either the list returned by prep_sett (sub-lists `data` and/or
#'   `rf`), or a single flat settings list. Each settings list is written to
#'   the path in its own `save_to_json` entry; a list whose `save_to_json` is
#'   missing, NA or '' is skipped.
#' @return NULL, invisibly. Called for its side effect of writing files.
save_sett <- function(sett){
  write_json <- function(settings){
    target <- settings[['save_to_json']]
    if (is.character(target) && length(target)==1 && !is.na(target) && nchar(target)>0){
      cat(rjson::toJSON(settings, indent=4),file=target)
    }
  }
  ## any() rather than all(): a list carrying only `data` or only `rf` is
  ## still a nested settings list, not a flat one
  if (!any(c('data','rf')%in%names(sett))){
    write_json(sett)
  }else{
    if ('data'%in%names(sett)) write_json(sett$data)
    if ('rf'%in%names(sett))   write_json(sett$rf)
  }
  invisible(NULL)
}

#' Print the current state of the settings to the console.
#'
#' Each setting is printed as `name  :  value`, with names right-aligned.
#' Character values are quoted and values with several elements are shown as
#' `c( a , b , ... )`. In data settings, a required setting ('inRefCSV',
#' 'FN_classnum', 'FN_xy', 'inRastPath') that fails .reqSettMet is flagged
#' with a leading '**'.
#'
#' @param sett Either the list returned by prep_sett (sub-lists `data` and/or
#'   `rf`), or a single flat settings list.
#' @return NULL, invisibly. Called for its printed output.
print_sett <- function(sett){
  required <- c('inRefCSV','FN_classnum','FN_xy','inRastPath')
  ## prints one settings list; vec_sep is the separator between the elements
  ## of a multi-element value
  print_block <- function(settings,flag_required,vec_sep){
    keys <- names(settings)
    if (length(keys)==0) return(invisible(NULL))
    maxchars <- max(nchar(keys))
    for (i in seq_along(settings)){
      s <- keys[i]
      s_val <- settings[[i]]
      quotes <- if (is.character(s_val) && length(s_val)>0 && !is.na(s_val[1])) '\'' else ''
      indent <- '   '
      if (flag_required && s%in%required){
        if (!.reqSettMet(s,s_val)) indent <- '** '
      }
      cat(indent,strrep(' ',maxchars-nchar(s)),s,'  :  ',sep='')
      ## values are handed to cat() directly so numbers keep cat's formatting
      if (length(s_val)==0){
        cat('NULL\n')
      }else if (length(s_val)==1){
        cat(quotes,s_val,quotes,'\n',sep='')
      }else{
        cat('c( ')
        for (j in seq_along(s_val)){
          cat(if (j>1) vec_sep,quotes,s_val[j],quotes,sep='')
        }
        cat(' )\n')
      }
    }
    invisible(NULL)
  }
  ## any() rather than all(): a list carrying only `data` or only `rf` is
  ## still a nested settings list, not a flat one
  if (!any(c('data','rf')%in%names(sett))){
    cat('\n')
    if ('inRefCSV'%in%names(sett)){
      cat('\n** represents a required setting that still needs valid specification\n\n',sep='')
    }
    print_block(sett,TRUE,' , ')
  }else{
    cat('\n** represents a required setting that still needs valid specification\n',sep='')
    if ('data'%in%names(sett)){
      cat('\n-------------\nData settings\n-------------\n',sep='')
      print_block(sett$data,TRUE,'  ,  ')
    }
    if ('rf'%in%names(sett)){
      cat('\n----------------------------\nRandom Forest model settings\n----------------------------\n',sep='')
      print_block(sett$rf,FALSE,' , ')
    }
  }
  invisible(NULL)
}

## Private helper: TRUE when x is a single, non-NA, non-empty character string.
.prepSettHasText <- function(x){
  is.character(x) && length(x)==1 && !is.na(x) && nchar(x)>0
}
## Private helper: TRUE when x is a file name without any directory part
## (contains neither a backslash nor a forward slash).
.prepSettIsBareFileName <- function(x){
  .prepSettHasText(x) && !grepl('\\\\|/',x)
}
## Private helper: read a settings list from a json file. A bare file name that
## is not found as given is also looked for in wd. 'NA' strings and nulls in
## the file become NA, except for the settings named in keep_as_is.
.prepSettLoadJson <- function(path,arg_name,wd,keep_as_is=character(0)){
  if (!is.character(path)){
    stop(arg_name,' is expected to be NA or a path to a json file')
  }
  if (!file.exists(path)){
    fallback <- paste0(wd,'/',path)
    if (.prepSettIsBareFileName(path) && file.exists(fallback)){
      path <- fallback
    }else{
      stop('Could not find json file at path specified for ',arg_name)
    }
  }
  loaded <- rjson::fromJSON(file=path)
  for (i in seq_along(loaded)){
    if (isTRUE(names(loaded)[i]%in%keep_as_is)) next
    if (is.null(loaded[[i]]) || identical(loaded[[i]],'NA')) loaded[[i]] <- NA
  }
  loaded
}

#' Prepare the settings used by the other functions in this script.
#'
#' Returns a list of two sub-lists:
#'     - $data : data settings.
#'     - $rf   : random forest model settings.
#' If this function is called without modifying any arguments, then all
#' settings in the returned list will be defaults. Each sub-list can instead be
#' loaded from a json file (..._load_from_json), in which case the other
#' arguments of that group are ignored, and each can be saved to a json file
#' (..._save_to_json). File names given without a directory are resolved
#' against the current working directory.
#'
#' @param DATA_load_from_json,RF_load_from_json NA, or path/file name of a json
#'   file to load the data / random forest settings from.
#' @param DATA_save_to_json,RF_save_to_json NA, or path/file name to save the
#'   data / random forest settings to.
#' @param DATA_randSeedValue,RF_randSeedValue NA, or a value for set.seed().
#' @param DATA_inRefCSV Reference data file (.csv or .dbf).
#' @param DATA_FN_pointID,DATA_FN_classnum,DATA_FN_classlab,DATA_FN_xy Field
#'   names in the reference data: point ID, class number, class label, and the
#'   two coordinate fields.
#' @param DATA_inRastPath Path of the raster dataset.
#' @param DATA_indValidSplit Fraction of points set aside for independent
#'   validation.
#' @param DATA_splitByClass Perform that split within each class?
#' @param DATA_minPointsForModel Minimum number of points required for the
#'   model (0 disables the check).
#' @param DATA_impute_strategy 'mean', 'median', a numeric constant, or NA to
#'   omit points with NA sample values.
#' @param DATA_impute_by_class Impute within each class?
#' @param DATA_saveDataCSV,DATA_saveImputedDataCSV,DATA_saveRFclassifier,DATA_saveImportancesCSV,DATA_saveErrorMatrixCSV,DATA_saveAccuracyCSV,DATA_saveOobErrorCSV,DATA_saveOobErrorMatrixCSV
#'   NA, or output path for the corresponding result.
#' @param DATA_consol_outCSV_dir,DATA_consol_outCSV_basename Stored as given;
#'   they are not used by this function.
#' @param RF_ntree,RF_mtry,RF_replace,RF_sampsize,RF_nodesize,RF_maxnodes,RF_importance,RF_localImp,RF_proximity,RF_oob.prox,RF_norm.votes,RF_do.trace,RF_keep.forest,RF_keep.inbag
#'   Stored under the same name without the RF_ prefix, for use as the
#'   corresponding randomForest arguments (see ?randomForest).
#' @return list(data=..., rf=...). Settings are stored under the argument name
#'   without its DATA_/RF_ prefix; both sub-lists built from arguments also
#'   carry `working_dir`.
prep_sett <- function(DATA_load_from_json = NA,
                      DATA_save_to_json   = NA,
                      DATA_randSeedValue  = NA,
                      DATA_inRefCSV    = '',
                      DATA_FN_pointID  = '',
                      DATA_FN_classnum = '',
                      DATA_FN_classlab = '',
                      DATA_FN_xy       = c('',''),
                      DATA_inRastPath  = '',
                      DATA_indValidSplit      = 0.3,
                      DATA_splitByClass       = TRUE,
                      DATA_minPointsForModel  = 10,
                      DATA_saveDataCSV        = NA,
                      DATA_impute_strategy    = 'mean',
                      DATA_impute_by_class    = TRUE,
                      DATA_saveImputedDataCSV = NA,
                      DATA_saveRFclassifier   = NA,
                      DATA_saveImportancesCSV = NA,
                      DATA_saveErrorMatrixCSV = NA,
                      DATA_saveAccuracyCSV    = NA,
                      DATA_saveOobErrorCSV    = NA,
                      DATA_saveOobErrorMatrixCSV = NA,
                      DATA_consol_outCSV_dir = NA,
                      DATA_consol_outCSV_basename = 'consol_',
                      RF_load_from_json = NA,
                      RF_save_to_json   = NA,
                      RF_randSeedValue  = NA,
                      RF_ntree    = 500,
                      RF_mtry     = NA,
                      RF_replace  = TRUE,
                      RF_sampsize = NA,
                      RF_nodesize = NA,
                      RF_maxnodes = NA,
                      RF_importance  = FALSE,
                      RF_localImp    = FALSE,
                      RF_proximity   = FALSE,
                      RF_oob.prox    = FALSE,
                      RF_norm.votes  = TRUE,
                      RF_do.trace    = FALSE,
                      RF_keep.forest = NA,
                      RF_keep.inbag  = FALSE
                      ){
  ## get working directory
  wd <- getwd()
  ##
  if (is.null(DATA_load_from_json)) DATA_load_from_json <- NA
  if (is.null(RF_load_from_json))   RF_load_from_json <- NA
  ####################
  ## prepare data_sett
  if (!is.na(DATA_load_from_json)){
    ## settings that are expected to be character strings keep a literal 'NA'
    data_sett <- .prepSettLoadJson(DATA_load_from_json,'DATA_load_from_json',wd,
                                   keep_as_is=c('inRefCSV','FN_pointID','FN_classnum',
                                                'FN_classlab','FN_xy','inRastPath'))
  }else{
    data_sett <- list(
      working_dir    = wd,
      load_from_json = DATA_load_from_json,
      save_to_json   = DATA_save_to_json,
      randSeedValue  = DATA_randSeedValue,
      inRefCSV       = DATA_inRefCSV,
      FN_pointID     = DATA_FN_pointID,
      FN_classnum    = DATA_FN_classnum,
      FN_classlab    = DATA_FN_classlab,
      FN_xy          = DATA_FN_xy,
      inRastPath     = DATA_inRastPath,
      indValidSplit  = DATA_indValidSplit,
      splitByClass   = DATA_splitByClass,
      minPointsForModel = DATA_minPointsForModel,
      saveDataCSV     = DATA_saveDataCSV,
      impute_strategy = DATA_impute_strategy,
      impute_by_class = DATA_impute_by_class,
      saveImputedDataCSV = DATA_saveImputedDataCSV,
      saveRFclassifier   = DATA_saveRFclassifier,
      saveImportancesCSV = DATA_saveImportancesCSV,
      saveErrorMatrixCSV = DATA_saveErrorMatrixCSV,
      saveAccuracyCSV    = DATA_saveAccuracyCSV,
      saveOobErrorCSV    = DATA_saveOobErrorCSV,
      saveOobErrorMatrixCSV = DATA_saveOobErrorMatrixCSV,
      consol_outCSV_dir  = DATA_consol_outCSV_dir,
      consol_outCSV_basename  = DATA_consol_outCSV_basename
    )
    ## ensure that any NULL specifications are set to NA
    for (i in seq_along(data_sett)) if (is.null(data_sett[[i]])) data_sett[[i]] <- NA
  }
  ## if user specified just a file name for inRefCSV and/or inRastPath,
  ## attempt to get the full path based on current working directory (wd)
  for (key in c('inRefCSV','inRastPath')){
    if (.prepSettIsBareFileName(data_sett[[key]])){
      fpath <- paste0(wd,'/',data_sett[[key]])
      if (file.exists(fpath)) data_sett[[key]] <- fpath
    }
  }
  ## if user specified just a file name for any of the following output
  ## settings, then prep the full path based on the current working directory
  ## NOTE(review): saveOobErrorCSV and saveOobErrorMatrixCSV are not in this
  ## list and so are left exactly as given - confirm whether that is intended.
  for (key in c('save_to_json','saveDataCSV','saveImputedDataCSV','saveRFclassifier',
                'saveImportancesCSV','saveErrorMatrixCSV','saveAccuracyCSV')){
    if (.prepSettIsBareFileName(data_sett[[key]])){
      data_sett[[key]] <- paste0(wd,'/',data_sett[[key]])
    }
  }
  ## save data_sett to json if user chose to do so
  if (.prepSettHasText(data_sett[['save_to_json']])){
    cat(rjson::toJSON(data_sett, indent=4),file=data_sett[['save_to_json']])
  }
  ####################
  ## prepare rf_sett
  if (!is.na(RF_load_from_json)){
    rf_sett <- .prepSettLoadJson(RF_load_from_json,'RF_load_from_json',wd)
  }else{
    rf_sett <- list(
      working_dir    = wd,
      load_from_json = RF_load_from_json,
      save_to_json   = RF_save_to_json,
      randSeedValue  = RF_randSeedValue,
      ntree      = RF_ntree,
      mtry       = RF_mtry,
      replace    = RF_replace,
      sampsize   = RF_sampsize,
      nodesize   = RF_nodesize,
      maxnodes   = RF_maxnodes,
      importance = RF_importance,
      localImp   = RF_localImp,
      proximity  = RF_proximity,
      oob.prox   = RF_oob.prox,
      norm.votes = RF_norm.votes,
      do.trace   = RF_do.trace,
      keep.forest= RF_keep.forest,
      keep.inbag = RF_keep.inbag
    )
    ## ensure that any NULL specifications are set to NA
    for (i in seq_along(rf_sett)) if (is.null(rf_sett[[i]])) rf_sett[[i]] <- NA
  }
  ## if user specified just a file name for save_to_json,
  ## then prep the full path based on the current working directory (wd)
  if (.prepSettHasText(rf_sett[['save_to_json']])){
    if (.prepSettIsBareFileName(rf_sett[['save_to_json']])){
      rf_sett[['save_to_json']] <- paste0(wd,'/',rf_sett[['save_to_json']])
    }
    ## save rf_sett to json
    cat(rjson::toJSON(rf_sett, indent=4),file=rf_sett[['save_to_json']])
  }
  return(list(data=data_sett,rf=rf_sett))
}
#
##########################################################################################
########## Reset reference data frame (refdat) and/or imputed reference data frame (impdat), when performing iterations
#' Reset a (possibly imputed) reference data table to an earlier state by
#' dropping the columns that were added afterwards. For use when iterating.
#'
#' Only columns are subset; no value in a retained column is changed.
#'
#' @param dat Reference data, or imputed reference data.
#' @param imputedAndSplit FALSE: keep 'classLab', 'classNum', 'x', 'y' and the
#'   sample data columns, i.e. the columns present just before imputation and
#'   the independent validation split. TRUE: additionally keep 'indValidSet'
#'   as the first column, i.e. the columns present just after that step.
#' @return dat restricted to the columns described above, in that order.
resetRefDataframe <- function(dat,imputedAndSplit=FALSE){
  keep_cols <- c('classLab','classNum','x','y')
  if (imputedAndSplit) keep_cols <- c('indValidSet',keep_cols)
  keep_cols <- c(keep_cols,.getTrainSampleColumnNames(dat))
  return(dat[,keep_cols])
}
#######################################################
############## Initial preparation of reference dataset
#' Read the reference points and the raster, and extract the raster values at
#' the reference point locations across all channels.
#'
#' Stops before doing any work if one of the output files named in the data
#' settings already exists. Sets the random seed when
#' sett$data$randSeedValue is not NA.
#'
#' @param sett Settings list with a `data` sub-list (see prep_sett).
#' @return A list with 2 components:
#'   refdat : SpatialPointsDataFrame of the reference points with columns
#'            'classLab', 'classNum', 'x', 'y', followed by one column of
#'            extracted values per raster channel, named 'ch<NN>_<name>'.
#'   inrast : the RasterStack opened from sett$data$inRastPath.
a_initialDataPrep <- function(sett){
  data_sett <- sett$data
  ## run a check to ensure that output files specified in data_sett don't already exist
  for (key in c('saveDataCSV','saveImputedDataCSV','saveRFclassifier','saveImportancesCSV',
                'saveErrorMatrixCSV','saveAccuracyCSV')){
    out_path <- data_sett[[key]]
    if (!is.null(out_path) && !is.na(out_path)){
      if (file.exists(out_path)){
        stop('Output filepath specified in data_sett$',key,' already exists.')
      }
    }
  }
  ## set the random seed based on data_sett$randSeedValue. If data_sett$randSeedValue is NA,
  ## the random state is left as it is.
  if (!is.na(data_sett$randSeedValue)) set.seed(data_sett$randSeedValue)
  ## read inRefCSV as data frame and standardize columns in terms of name and order
  ## - no point ID field means "number the rows"; read.csv expects row.names=NULL
  ##   for that (NA is not a valid row.names specification)
  FN_pointID <- data_sett$FN_pointID
  if (is.null(FN_pointID) || is.na(FN_pointID) || FN_pointID=='') FN_pointID <- NULL
  refCSV_ext <- tolower(substr(data_sett$inRefCSV,nchar(data_sett$inRefCSV)-3,nchar(data_sett$inRefCSV)))
  if (refCSV_ext=='.csv'){
    refdat <- utils::read.csv(data_sett$inRefCSV,row.names=FN_pointID,stringsAsFactors = FALSE)
  }else if (refCSV_ext=='.dbf'){
    ## NOTE(review): FN_pointID is not applied to .dbf input - confirm intended
    refdat <- foreign::read.dbf(data_sett$inRefCSV,as.is=FALSE)
  }else{
    stop('data_sett$inRefCSV is expected to be a .csv or .dbf file: ',data_sett$inRefCSV)
  }

  if (!data_sett$FN_classlab%in%names(refdat)){
    ## no class label field available: carry an all-NA label column instead
    refdat$classLab <- NA
    refdat <- base::subset(refdat,select=c('classLab',data_sett$FN_classnum,data_sett$FN_xy[1],
                                           data_sett$FN_xy[2]))
  }else{
    refdat <- base::subset(refdat,select=c(data_sett$FN_classlab,data_sett$FN_classnum,data_sett$FN_xy[1],
                                           data_sett$FN_xy[2]))
  }
  names(refdat) <- c('classLab','classNum','x','y')
  ## read inRastPath as a RasterStack inrast
  inrast <- raster::stack(data_sett$inRastPath)
  nlyr <- raster::nlayers(inrast) # number of layers/channels/bands in the stack
  ## if there is an .aux.xml file associated with the inRastPath, assume it's channel-specific metadata.
  ## Read the xml file and populate the RasterStack (inrast) with the metadata (e.g. NoDataValue)
  if(file.exists(paste0(data_sett$inRastPath,'.aux.xml'))){
    md <- xml2::as_list(xml2::read_xml(paste0(data_sett$inRastPath,'.aux.xml')))[[1]]
    if (length(md)==nlyr){
      for (i in seq_len(nlyr)){
        raster::metadata(inrast[[i]]) <- md[[i]]
      }
    }else if ((length(md)==nlyr+1)){
      ## one extra leading entry: the per-channel entries start at the second one
      for (i in seq_len(nlyr)){
        raster::metadata(inrast[[i]]) <- md[[i+1]]
      }
    }
  }
  ## convert refdat to SpatialPointsDataFrame with same spatial reference system as inrast
  refdat <- sp::SpatialPointsDataFrame(coords=base::subset(refdat,select=c('x','y')),data=refdat,
                                       proj4string=inrast@crs)
  ##
  ## extract the data at refdat locations
  rastdat <- as.data.frame(raster::extract(inrast,refdat),stringsAsFactors=FALSE,row.names=row.names(refdat))
  ## cycle through channels and prep the field/column names of the extracted data
  for (ch in seq_len(nlyr)){
    ## field/column name: zero-padded channel number ...
    rastdat_fn <- paste0('ch',sprintf(paste0('%0',nchar(nlyr),'d'),ch),'_') ## base name
    md <- raster::metadata(inrast[[ch]]) ## channel metadata
    if ('Description'%in%names(md)){
      ## if channel name is in the metadata, as Description, then append that to rastdat_fn
      rastdat_fn <- paste0(rastdat_fn,substr(md$Description,1,51))
    }else{
      ## otherwise, just append the default channel name to rastdat_fn
      rastdat_fn <- paste0(rastdat_fn,substr(inrast[[ch]]@data@names[1],1,51))
    }
    names(rastdat)[ch]<-rastdat_fn
  }
  ## join the extracted data to refdat as new fields/columns (ensuring row name match)
  refdat@data <- transform(base::merge(refdat@data,rastdat,by='row.names',all=TRUE),row.names=Row.names,Row.names=NULL)
  ## ensure that the spatial points and corresponding dataframe are in the same row order
  refdat@data <- refdat@data[match(row.names(refdat),row.names(refdat@data)),]
  ##
  return(list(refdat=refdat,inrast=inrast))
}
###############################################################
############## Impute data and split independent validation set
#' Perform data imputation (or omission) and split the reference data into an
#' independent validation set and a model fitting set.
#'
#' @param refdat  Reference data as returned by a_initialDataPrep.
#' @param sett    Settings list with a `data` sub-list (see prep_sett). Uses
#'   impute_strategy, impute_by_class, indValidSplit, splitByClass and
#'   minPointsForModel.
#' @param verbose Print a summary of the omission/split to the console?
#' @return A list with two components, both with 'indValidSet' as first column:
#'   impdat : the reference data after imputation (or after omission of points
#'            with NoData/NA, when impute_strategy is NA). 'indValidSet' is 1
#'            for points set aside for independent validation, 0 for points
#'            kept for model fitting.
#'   refdat : the reference data with its original (non-imputed) values and
#'            all of its rows. 'indValidSet' is 1 / 0 as above, and -1 for rows
#'            that were omitted from impdat.
#'   Stops with an error when fewer than minPointsForModel points remain for
#'   model fitting (per class when splitByClass is TRUE).
b_imputeDataAndSplitIndValidSet <- function(refdat,sett,verbose=TRUE){
  data_sett <- sett$data
  ## add field/column for representing whether (1) or not (0) a reference data point has been set aside for
  ## independent validation
  refdat$indValidSet <- 0
  refdat@data <- refdat@data[,c('indValidSet',setdiff(names(refdat),'indValidSet'))] ## re-order columns
  ## impute data based on data_sett$impute_strategy or omit points with NoData/NA if data_sett$impute_strategy is NA
  ## - from this point on, refdat represents the omitted/imputed data, and refdat_orig represents the
  ##   non-omitted/non-imputed data
  refdat_orig <- refdat
  refdat_orig$indValidSet <- -1
  refdat <- .imputeData(refdat,data_sett)
  ## the minimum-points check is done only if the user asked for it
  ## NOTE(review): as before, the check is skipped when indValidSplit is 0 - confirm intended
  check_min <- data_sett$minPointsForModel > 0 && data_sett$indValidSplit > 0
  ## perform independent validation split
  if (data_sett$splitByClass){
    ## perform class-level independent validation split
    for (cls in sort(base::unique(refdat$classNum))){
      cls_dat <- base::subset(refdat,subset=refdat$classNum==cls) # subset to the class
      n_cls <- nrow(cls_dat)
      dat_sample <- base::sample(row.names(cls_dat),size=data_sett$indValidSplit*n_cls,replace=FALSE)
      if (length(dat_sample)>0) refdat@data[dat_sample,'indValidSet'] <- 1
      ## points left for the model = class points minus those set aside for validation
      n_model <- n_cls-length(dat_sample)
      if (check_min && n_model < data_sett$minPointsForModel){
        stop('Class number ',cls,' only has ',n_model,' data points for model.',
             ' Minimum number is ',data_sett$minPointsForModel,'.')
      }
    }
  }else{
    ## perform general independent validation split
    dat_sample <- base::sample(row.names(refdat),size=data_sett$indValidSplit*nrow(refdat),replace=FALSE)
    if (length(dat_sample)>0) refdat@data[dat_sample,'indValidSet'] <- 1
    ## points left for the model = all points minus those set aside for validation
    n_model <- nrow(refdat)-length(dat_sample)
    if (check_min && n_model < data_sett$minPointsForModel){
      stop('Only ',n_model,' data points for model.',
           ' Minimum number is ',data_sett$minPointsForModel,'.')
    }
  }
  # update refdat_orig with independent validation split indicators
  refdat_orig@data[row.names(refdat@data),'indValidSet'] <- refdat@data$indValidSet
  # notify user of split (if verbose)
  if (verbose){
    if (is.na(data_sett$impute_strategy)){
      cat('\n',nrow(refdat_orig)-nrow(refdat),
          ' data points omitted due to NoData values and lack of imputation.\n',sep='')
    }
    if (data_sett$splitByClass){
      cat('\n',data_sett$indValidSplit*100,'% of reference data points (',nrow(refdat),
            ' total) of each class set aside for independent validation:\n',sep='')
      for (cls in sort(base::unique(refdat$classNum))){
        cat('    Class ',cls,' : ',sum((refdat$classNum==cls) & (refdat$indValidSet==1)),' of ',
            sum(refdat$classNum==cls),' reference points set aside\n',sep='')

      }
      cat('\n')
    }else{
      cat('\n',data_sett$indValidSplit*100,'% of reference data points (',
            sum(refdat$indValidSet==1),' of ',nrow(refdat),') set aside for independent validation.\n',sep='')
    }
  }
  ##
  return(list(impdat=refdat,refdat=refdat_orig))
}
##########################################
############## Prepare Random Forest model
#' Define a Random Forest classifier from the sett$rf settings, fit it on the
#' imputed/subset data (NoData/NA values handled), predict hard classes and
#' class probabilities at the reference data locations, and optionally save
#' the datasets and the model.
#'
#' The model is fitted on the rows of impdat with indValidSet == 0; if impdat
#' has no 'indValidSet' column, all of its rows are used. Predictions are made
#' for every row of impdat (independent validation rows included). Rows of
#' refdat that are not in impdat keep NA predictions/probabilities.
#'
#' @param refdat Reference data with original (non-imputed) sample values.
#' @param impdat Imputed/subset reference data.
#' @param sett   Settings list with `data` and `rf` sub-lists (see prep_sett).
#'   rf settings left at NA fall back to: mtry = floor(sqrt(number of sample
#'   columns)), nodesize = 1, maxnodes = NULL, keep.forest = TRUE, and
#'   sampsize = number of model rows (or ceiling(.632 * rows) when sampling
#'   without replacement).
#' @return A list of three components:
#'   refdat : refdat with 'predict' and 'prob_c<class>' columns added after 'classNum'
#'   impdat : impdat with the same columns added
#'   rf     : the fitted Random Forest classifier (see ?randomForest)
c_prepRandomForest <- function(refdat,impdat,sett){
  data_sett <- sett$data
  rf_sett <- sett$rf
  ## TRUE for a setting left at its NA placeholder (or missing altogether);
  ## a vector-valued setting, e.g. a per-class sampsize, counts as specified
  is_unset <- function(v) is.null(v) || (length(v)==1 && is.na(v))
  ## row names as an ID column: integers where they convert cleanly,
  ## otherwise kept as text so that non-numeric point IDs are not lost
  id_column <- function(dat){
    ids <- row.names(dat)
    int_ids <- suppressWarnings(as.integer(ids))
    if (anyNA(int_ids)) ids else int_ids
  }
  ## if the supplied impdat dataframe does not have an 'indValidSet' column, then use all of the data points provided
  if (!'indValidSet'%in%names(impdat)){
    impdat$indValidSet <- 0
    impdat@data <- impdat@data[,c('indValidSet',setdiff(names(impdat),'indValidSet'))] # re-order columns
  }
  ## refdat needs the column too (it is selected below): take the values from
  ## impdat, and mark rows that are not part of impdat with -1
  if (!'indValidSet'%in%names(refdat)){
    refdat$indValidSet <- -1
    refdat@data[row.names(impdat@data),'indValidSet'] <- impdat$indValidSet
  }
  ## get vector of field/column names in impdat that store the training sample data, x
  xcols <- .getTrainSampleColumnNames(impdat)
  ## omit any non-expected columns in impdat or refdat (e.g. prediction results from previous runs of this function)
  expcols <- c('indValidSet','classLab','classNum','x','y',xcols) # expected column names
  impdat <- base::subset(impdat,select=expcols)
  refdat <- base::subset(refdat,select=expcols)
  ## subset impdat to only include those rows and columns that are to be included in the model
  mdat <- base::subset(impdat@data,subset=(impdat$indValidSet==0),select=c('classNum',xcols))
  ## convert classNum to factor to ensure classification rather than regression
  mdat$classNum <- as.factor(mdat$classNum)
  ## prep sampsize RF parameter based on rf_sett$sampsize specification
  if (is_unset(rf_sett$sampsize)){
    sampsize <- if (rf_sett$replace) nrow(mdat) else ceiling(.632*nrow(mdat))
  }else{
    sampsize <- rf_sett$sampsize
  }
  ## set the random seed if user specified a randSeedValue in the rf settings
  if (!is.na(rf_sett$randSeedValue)) set.seed(rf_sett$randSeedValue)
  ## define and fit the Random Forest classifier based on user specifications in rf_sett
  rf <- randomForest::randomForest(classNum ~ ., data=mdat,
                                   na.action=stats::na.fail,
                                   ntree=rf_sett$ntree,
                                   mtry = if (is_unset(rf_sett$mtry)) floor(sqrt(length(xcols))) else rf_sett$mtry,
                                   replace=rf_sett$replace,
                                   sampsize = sampsize,
                                   nodesize = if (is_unset(rf_sett$nodesize)) 1 else rf_sett$nodesize,
                                   maxnodes= if (is_unset(rf_sett$maxnodes)) NULL else rf_sett$maxnodes,
                                   importance=rf_sett$importance,localImp=rf_sett$localImp,
                                   proximity=rf_sett$proximity,oob.prox=rf_sett$oob.prox,
                                   norm.votes=rf_sett$norm.votes,do.trace=rf_sett$do.trace,
                                   keep.forest = if (is_unset(rf_sett$keep.forest)) TRUE else rf_sett$keep.forest,
                                   keep.inbag=rf_sett$keep.inbag)
  #### predictions at all locations in impdat (includes independent validation set locations)
  classes <- sort(base::unique(refdat$classNum)) ## unique class values in ascending order
  n_class <- length(classes)
  imp_rows <- row.names(impdat@data)
  newx <- impdat@data[,xcols,drop=FALSE] ## drop=FALSE: stays a table with a single sample column
  ## hard classes
  refdat$predict <- as.integer(NA) ## new column in refdat for storing hard class predictions
  refdat@data[imp_rows,'predict'] <- as.integer(as.character(
    stats::predict(rf,newdata=newx,type='response'))) ## add hard class predictions to refdat
  impdat$predict <- refdat@data[imp_rows,'predict'] ## add hard class predictions to impdat
  ## class probabilities
  classNumFmt <- paste0('%0',nchar(max(classes)),'d') ## number format for class-specific probability columns
  classprob_FNs <- paste0('prob_c',sprintf(classNumFmt,classes)) ## field names for class probability columns
  ## data frame for class probabilities
  classprobs <- as.data.frame(matrix(as.numeric(NA),nrow=nrow(refdat),ncol=n_class,
                                     dimnames = list(row.names(refdat@data),classprob_FNs)))
  ## the model only returns a column for each class it was trained on, so the
  ## probabilities are placed by class name rather than by position; a class
  ## that was absent from the model data keeps NA in its column
  probs <- stats::predict(rf,newdata=newx,type='prob')
  prob_cols <- paste0('prob_c',sprintf(classNumFmt,as.integer(colnames(probs))))
  classprobs[imp_rows,prob_cols] <- probs
  refdat@data <- transform(base::merge(refdat@data,classprobs,by='row.names',all=TRUE),
                           row.names=Row.names,Row.names=NULL) ## add probabilities as new columns to refdat
  impdat@data <- transform(base::merge(impdat@data,classprobs[imp_rows,],by='row.names',all=TRUE),
                           row.names=Row.names,Row.names=NULL) ## add probabilities as new columns to impdat
  ## re-order rows and columns in refdat and impdat
  colorder <- c(head(expcols,which(expcols=='classNum')),'predict',classprob_FNs,
                tail(expcols,-which(expcols=='classNum')))
  refdat@data <- refdat@data[match(row.names(refdat),row.names(refdat@data)),colorder]
  impdat@data <- impdat@data[match(row.names(impdat),row.names(impdat@data)),colorder]
  ## save refdat and/or impdat, if user specified to do so
  id_name <- if (nchar(data_sett$FN_pointID)>0) data_sett$FN_pointID else 'ID'
  if (!is.na(data_sett$saveDataCSV)){
    outdf <- cbind(data.frame(z=id_column(refdat@data),stringsAsFactors=FALSE),refdat@data)
    names(outdf)[1] <- id_name
    utils::write.csv(outdf,file=data_sett$saveDataCSV,row.names=FALSE)
  }
  if (!is.na(data_sett$saveImputedDataCSV)){
    outdf <- cbind(data.frame(z=id_column(impdat@data),stringsAsFactors=FALSE),impdat@data)
    names(outdf)[1] <- id_name
    utils::write.csv(outdf,file=data_sett$saveImputedDataCSV,row.names=FALSE)
  }
  ## save Random Forest model to .RData file if user specified to do so
  if (!is.na(data_sett$saveRFclassifier)){
    save(rf,file=data_sett$saveRFclassifier)
  }
  ##
  return(list(refdat=refdat,impdat=impdat,rf=rf))
}
##########################################################################################
############## Perform validation on reference points set aside for independent validation
d_independentValidation <- function(refdat,sett){
  "
  # Takes reference data (with predictions attached) and generates accuracy assessment statistics
  # (error matrix, user's & producer's accuracy, overall accuracy, kappa) from the reference points
  # that were set aside for independent validation (indValidSet == 1).
  # - refdat : object with a @data slot (data frame) containing the columns classNum, predict and
  #            indValidSet
  # - sett   : settings list; sett$data$saveErrorMatrixCSV and sett$data$saveAccuracyCSV must each be
  #            either NA (nothing is written) or the path of an output CSV file
  # - returns NULL (with a warning) if no reference points were set aside for independent validation;
  #   otherwise returns a list with the following components:
  #   [1] errorMat   : the error/confusion matrix as an integer matrix
  #                    (rows = predicted class, columns = actual class)
  #   [2] classAcc   : class-specific accuracies as a numeric matrix with rows 'users' and
  #                    'producers' and one column per class
  #   [3] overallAcc : a numeric value representing the overall accuracy
  #   [4] kappa      : a numeric value representing Cohen's kappa coefficient
  "
  data_sett <- sett$data
  if (sum(refdat$indValidSet==1)==0){
    warning('No reference data set aside for independent validation. Independent validation was not performed.')
    return(NULL)
  }
  ## prepare subset dat, including only those reference data points that were set aside for independent validation
  dat <- base::subset(refdat@data,subset=refdat@data$indValidSet==1)
  if (nrow(dat)==0) return(NA) ## defensive guard; kept for backward compatibility
  ## number of validation points as a double: the kappa terms below involve products of counts, which
  ## overflow R's 32-bit integers (giving NA) once there are more than ~46000 validation points
  n_valid <- as.numeric(nrow(dat))
  ## prepare error matrix and accuracies
  ## (classes are taken from ALL reference data, so classes absent from the validation subset still
  ##  get a row and a column in the error matrix)
  classes <- sort(base::unique(refdat$classNum)) ## unique class values in ascending order
  n_class <- length(classes)
  classNumFmt <- paste0('%0',nchar(max(classes)),'d') ## number format for class number labels (zero-padding)
  errmat <- matrix(as.integer(NA),nrow=n_class,ncol=n_class,
                   dimnames=list(paste0('Predicted_c',sprintf(classNumFmt,classes)),
                                 paste0('Actual_c',sprintf(classNumFmt,classes)))) ## empty error matrix
  for (rind in seq_len(n_class)){
    for (cind in seq_len(n_class)){
      errmat[rind,cind] <- sum(dat$classNum==classes[cind] & dat$predict==classes[rind])
    }
  }
  ## user's accuracy: correct / row total (predicted); producer's accuracy: correct / column total (actual)
  ## (a class with a zero total yields NaN)
  classAcc <- matrix(-1,nrow=2,ncol=n_class,
                     dimnames=list(c('users','producers'),paste0('c',sprintf(classNumFmt,classes))))
  for (k in seq_len(n_class)){
    classAcc['users',k]     <- errmat[k,k]/sum(errmat[k, ])
    classAcc['producers',k] <- errmat[k,k]/sum(errmat[ ,k])
  }
  n_correct <- sum(diag(errmat))
  overallAcc <- n_correct/n_valid
  ## sum of products of elementwise multiplication of column sums and row sums (for calculating kappa)
  ## - colSums/rowSums return doubles, so the products cannot overflow
  rowColSumProdSum <- sum(colSums(errmat)*rowSums(errmat))
  ## calculate kappa
  kappa <- ( n_valid*n_correct-rowColSumProdSum ) / ( n_valid^2-rowColSumProdSum )
  ## save independent error matrix and/or accuracy stats to CSV, if user chose to do so
  if (!is.na(data_sett$saveErrorMatrixCSV)){
    outdf <- cbind(data.frame(Predicted=row.names(errmat),stringsAsFactors=FALSE),errmat)
    write.csv(outdf,data_sett$saveErrorMatrixCSV,row.names=FALSE)
  }
  if (!is.na(data_sett$saveAccuracyCSV)){
    ## single-row table: overall stats followed by users_<class> and producers_<class> columns
    outdf <- cbind(data.frame(overallAcc=overallAcc,kappa=kappa),
                   t(classAcc['users',]),t(classAcc['producers',]))
    names(outdf)[3:ncol(outdf)] <- paste0(c(rep('users_',n_class),rep('producers_',n_class)),
                                          names(outdf)[3:ncol(outdf)])
    write.csv(outdf,data_sett$saveAccuracyCSV,row.names=FALSE)
  }
  ##
  return(list(errorMat=errmat,classAcc=classAcc,overallAcc=overallAcc,kappa=kappa))
}
##########################################################################################
############## Consolidate RF and/or independent validation results across iterations
e_consolidateArossIterations <- function(dat,sett,valid=NA,rf=NA){
  "
  # Takes:
  #     dat :   a copy of the reference data (e.g. refdat) or imputed reference data (e.g. impdat);
  #             only its classNum column is used here (number of classes and zero-padding of class labels)
  #     sett :  a copy of the settings list; uses sett$data$consol_outCSV_dir (NA = do not write CSV
  #             files) and sett$data$consol_outCSV_basename (NA = no filename prefix)
  #     valid : a list of independent validation results as returned by d_independentValidation, and/or
  #     rf :    a list of fitted Random Forest models as returned by c_prepRandomForest
  #   and consolidates the results in valid and/or rf across iterations.
  #   The iteration number of a result is its position in the valid / rf list.
  #
  # Stops with an error if consol_outCSV_dir is specified but does not exist, or if any of the
  # output CSV files that could be written already exists (no over-writing).
  #
  # Returns a list with the following components (listed by names):
  #    [1] errorMat :  a data frame of error matrices across iterations (requires input for valid). This object is NA if valid is NA.
  #    [2] accStats :  a data frame of independent validation accuracy statistics across iterations (requires input for valid). This object is NA if valid is NA.
  #    [3] featureImportances : a data frame of feature importances across iterations (requires input for rf).  This object is NA if rf is NA.
  #    [4] oobErrorRate :  a data frame of out-of-bag error rates across iterations (requires input for rf). This object is NA if rf is NA.
  #    [5] oobErrorMat :  a data frame of out-of-bag error matrices across iterations (requires input for rf). This object is NA if rf is NA.
  "
  if (is.na(sett$data$consol_outCSV_basename)) sett$data$consol_outCSV_basename <- ''
  outdir <- sett$data$consol_outCSV_dir
  csvPrefix <- sett$data$consol_outCSV_basename
  saveCSV <- !is.na(outdir)
  csvPath <- function(csv) paste0(outdir,'/',csvPrefix,csv)
  ## general check of any specified output files (ensure that they don't already exist)
  if (saveCSV){
    if (!dir.exists(outdir)) stop('Could not find specified outCSV_dir (data setting)')
    for (csv in c('importances.csv','indErrorMatrices.csv','accuracyStats.csv',
                  'OOBerrorStats.csv','OOBerrorMatrices.csv')){
      path <- csvPath(csv)
      if (file.exists(path)) {
        stop('Path for output CSV file already exists:\n         ',
             path,'\n   Consider changing outCSV_dir or outCSV_basename data settings')
      }
    }
  }
  ##
  outlist <- list(errorMat=NA,accStats=NA,featureImportances=NA,oobErrorRate=NA,oobErrorMat=NA)
  classes <- sort(unique(dat$classNum))
  n_class <- length(classes)
  classNumFmt <- paste0('%0',nchar(max(classes)),'d') # character string for formatting numbers for class-specific labels
  ## consolidate independent validation accuracy stats if user specified a list for 'valid'
  if (!is.na(valid[1])){
    ## checks
    if (!is.list(valid)) stop('valid is expected to be either NA or a list of sub-lists with each sub-list ',
                              'representing independent validation results.')
    ## consolidation: error matrices are stacked row-wise, with the iteration number in the first column
    outlist$errorMat <- do.call(rbind,lapply(seq_along(valid),function(i){
      cbind(data.frame(iter=i,Predicted=row.names(valid[[i]]$errorMat),stringsAsFactors=FALSE),
            valid[[i]]$errorMat,row.names=NULL)
    }))
    ## NOTE: the names of these two locals become column names of accStats (via cbind)
    overallAcc <- vapply(valid,function(v) v$overallAcc,numeric(1))
    kappa <- vapply(valid,function(v) v$kappa,numeric(1))
    users <- data.frame(do.call(rbind,lapply(valid,function(v) v$classAcc['users',])))
    names(users) <- paste0('users_',names(users))
    producers <- data.frame(do.call(rbind,lapply(valid,function(v) v$classAcc['producers',])))
    names(producers) <- paste0('producers_',names(producers))
    outlist$accStats <- cbind(overallAcc,kappa,users,producers)
    ## save to file if user chose to do so
    if (saveCSV){
      write.csv(outlist$errorMat,file=csvPath('indErrorMatrices.csv'),row.names=FALSE)
      ## iteration number = position in 'valid'; not derived from row names, which are non-numeric
      ## when 'valid' is a named list
      outdf <- cbind(data.frame(iter=seq_len(nrow(outlist$accStats))),outlist$accStats)
      write.csv(outdf,file=csvPath('accuracyStats.csv'),row.names=FALSE)
    }
  }
  ## consolidate random forest model characteristics if user specified a list for 'rf'
  if (!is.na(rf[1])){
    ## checks
    if (!is.list(rf)) stop('rf is expected to be either NA or a list of sub-lists, with each sub-list ',
                           'representing a fitted/trained Random Forest model.')
    ## consolidation of feature importances
    if ('MeanDecreaseAccuracy'%in%colnames(rf[[1]]$importance)){
      ## if mean decrease in accuracy was calculated (in addition to mean decrease in gini)
      outlist$featureImportances <- do.call(rbind,lapply(seq_along(rf),function(i){
        ## transposed importance: one row per statistic, one column per feature; the last two rows
        ## are treated as the overall statistics, all rows before them as class-specific ones
        imp <- t(rf[[i]]$importance)
        classRows <- seq_len(nrow(imp)-2)
        row.names(imp)[classRows] <- paste0('MeanDecreaseAccuracy_c',
                                            sprintf(classNumFmt,as.integer(row.names(imp)[classRows])))
        ## transposed standard deviations: the last row is treated as the overall statistic
        impSD <- t(rf[[i]]$importanceSD)
        sdClassRows <- seq_len(nrow(impSD)-1)
        row.names(impSD)[sdClassRows] <- paste0('SD_MeanDecreaseAccuracy_c',
                                                sprintf(classNumFmt,as.integer(row.names(impSD)[sdClassRows])))
        row.names(impSD)[nrow(impSD)] <- 'SD_MeanDecreaseAccuracy'
        ## row order: overall gini, overall accuracy, class-specific accuracy, overall SD, class-specific SD
        impdf <- as.data.frame(rbind(imp[match(c('MeanDecreaseGini','MeanDecreaseAccuracy'),row.names(imp)),],
                                     imp[classRows,],impSD[c(nrow(impSD),sdClassRows),]))
        impdf <- cbind(data.frame(iter=rep(i,nrow(impdf)),stat=row.names(impdf)),impdf)
        return(impdf)
      }))
      row.names(outlist$featureImportances) <- NULL
    }else{
      ## otherwise assume only mean decrease in gini was calculated
      imp <- t(do.call(cbind,lapply(rf,'[[','importance')))
      row.names(imp) <- NULL
      outlist$featureImportances <- cbind(data.frame(iter=1:nrow(imp),stat='MeanDecreaseGini'),imp)
    }
    ## consolidation of OOB estimate of error rate (last row of err.rate for each model)
    oobErr <- as.data.frame(do.call(rbind,lapply(rf,function(x) tail(x$err.rate,1))))
    names(oobErr) <- c('OOBerror',paste0('OOBerror_c',sprintf(classNumFmt,as.integer(names(oobErr)[2:ncol(oobErr)]))))
    row.names(oobErr) <- 1:nrow(oobErr)
    outlist$oobErrorRate <- oobErr
    ## consolidation of error matrices based on out-of-bag data
    ## (the first n_class columns of $confusion are transposed so that rows are labelled as predicted)
    errmat <- do.call(rbind,lapply(seq_along(rf),function(i){
      em <- t(rf[[i]]$confusion[,1:n_class])
      return(cbind(data.frame(iter=rep(i,n_class),
                       Predicted=paste0('Predicted_c',sprintf(classNumFmt,as.integer(row.names(em)))),
                       stringsAsFactors=FALSE),em))
    }))
    names(errmat)[3:ncol(errmat)] <- paste0('Actual_c',sprintf(classNumFmt,as.integer(names(errmat)[3:ncol(errmat)])))
    row.names(errmat) <- NULL
    outlist$oobErrorMat <- errmat
    ## save to file if user chose to do so
    if (saveCSV){
      write.csv(outlist$featureImportances,file=csvPath('importances.csv'),row.names=FALSE)
      outdf <- cbind(data.frame(iter=as.integer(row.names(outlist$oobErrorRate))),outlist$oobErrorRate)
      write.csv(outdf,file=csvPath('OOBerrorStats.csv'),row.names=FALSE)
      write.csv(outlist$oobErrorMat,file=csvPath('OOBerrorMatrices.csv'),row.names=FALSE)
    }
  }
  return(outlist)
}
##########################################################################################

documentation <-function(outTextFile=NA){
  doc <- list()
  ######################################
  ## functions
  doc$prep_sett = paste0(
  '\nprep_sett:    function\n\n',
  '  - returns a list of lists, representing ata settings and random forest\n',
  '    model settings that are to be used in other functions\n',
  '  - if called without modifying any arguments, the returned list just\n',
  '    contains default settings\n\n',
  '  - the returned list has 2 components named: \'data\' and \'rf\'\n\n',
  '     - \'data\' is a sub-list containing settings such as paths to input \n',
  '       and output datasets, data imputation options, options for \n',
  '       setting-aside of reference data-points for independent validation, \n',
  '       etc.\n\n\n',
  '     - \'rf\' is a sub-list containing settings that are directly related\n',
  '       to fitting of, and prediction using, the random forest model.  Most\n',
  '       of these settings are directly from the package responsible the \n',
  '       for the model (i.e. randomForest).\n\n',
  '  - pass the list returned by this function to the print_sett() function\n',
  '    to get a print-out of the current state of all settings. e.g.:\n\n',
  '         sett = prep_sett() # default settings as sett\n',
  '         print_sett(sett)   # print out current state of settings\n\n',
  '  - after modifying settings, pass the settings list to the\n',
  '    save_sett(sett) function to save the current state of these settings\n\n',
  '      - data settings will only be saved if a valid specification has been\n',
  '        made for sett$data$save_to_json\n',
  '      - random forest settings will only be saved if a valid specification\n',
  '        has been made for sett$rf$save_to_json\n',
  '      - e.g.:\n\n',
  '         sett$data$inRefCSV = \'myRefData.csv\' # change data setting from default\n',
  '         sett$data$ntree = 600  # change random forest setting from default\n\n',
  '         sett$data$save_to_json = \'myDataSettings.json\'\n',
  '         sett$rf$save_to_json = \'myRandomForestSettings.json\'\n\n',
  '         save_sett(sett) # save the current state of the settings to file\n\n',
  '  - previously saved json settings files can be loaded using the \n',
  '    \'DATA_load_from_json\' and \'RF_load_from_json\' arguments of the \n',
  '    prep_sett() function. e.g.:\n\n',
  '         sett = prep_sett(DATA_load_from_json=\'myDataSettings.json\',\n',
  '                          RF_load_from_json=\'myRandomForestSettings.json\')\n',
  '         print_sett(sett)   # print out current state of settings\n\n')
  doc$save_sett = paste0(
  '\nsave_sett:    function\n\n',
  '  - can be used to save an updated copy of the data and random forest\n',
  '    settings (sett$data & sett$rf) to json file\n',
  '  - arguments:\n\n',
  '       sett :  a copy of the settings list, with valid specifications\n',
  '               for sett$data$save_to_json and/or\n',
  '               sett$rf$save_to_json\n\n',
  '  - e.g.:\n\n',
  '         sett = prep_sett() # default settings as sett\n\n',
  '         sett$data$inRefCSV = \'myRefData.csv\' \n',
  '         sett$rf$ntree = 600  \n\n',
  '         sett$data$save_to_json = \'myDataSettings.json\'\n',
  '         sett$rf$save_to_json   = \'myRandomForestSettings.json\'\n\n',
  '         save_sett(sett) # save the current state of the settings\n')
  doc$print_sett =  paste0(
  '\nprint_sett:    function\n\n',
  '  - can be used to print the current state of the data and/or random\n',
  '    forest settings to the console\n',
  '  - arguments:\n\n',
  '       sett :  a copy of the settings list\n\n',
  '  - e.g.:\n\n',
  '         sett = prep_sett() # default settings as sett\n\n',
  '         sett$data$inRefCSV\'] = \'myRefData.csv\' \n',
  '         sett$rf$inRefCSV\'] = 600  \n\n',
  '         print_sett(sett) # print data and rf settings\n\n',
  '         print_sett(sett$data) # print just data settings\n\n',
  '         print_sett(sett$rf) # print just rf settings\n')
  doc$resetRefDataframe =  paste0(
  '\nprint_sett:    function\n\n',
  '  - can be used to reset the reference data or imputer reference data \n',
  '    (refdat or impdat) data-frame to one of two previous states:\n',
  '      - immediately before data imputation and splitting\n',
  '      - immediately after data imputation and splitting\n\n',
  '  - arguments:\n\n',
  '       dat :  a copy of the data frame that is to be reset\n\n',
  '       imputedAndSplit :  (default FALSE) logical representing what state\n',
  '                           the data frame should be returned to\n\n',
  '              if TRUE: immediately after data imputation (i.e. as returned\n',
  '                       by the b_imputeDataAndSplitIndValidSet function)\n\n',
  '              if FALSE: immediately before data imputation (i.e. as returned\n',
  '                        by the a_initialDataPrep function)\n\n',
  '  - returns: a copy of the reset data frame\n\n',
  '  - e.g.:\n\n',
  '         a = a_initialDataPrep(sett) \n',
  '         b = b_imputeDataAndSplitIndValidSet(a$refdat,sett)\n',
  '         c = c_prepRandomForest(refdat,impdat,sett)\n\n',
  '         refdat=c$refdat; impdat=c$impdat; rf=c$rf',
  '         # at this point both refdat and impdat have prediction results\n',
  '         # attached to them, from the c_prepRandomForest function\n\n',
  '         # reset refdat and impdat to their state as returned\n',
  '         # by b_imputeDataAndSplitIndValidSet\n',
  '         refdat = resetRefDataframe(refdat,imputedAndSplit=TRUE)\n',
  '         impdat = resetRefDataframe(impdat,imputedAndSplit=TRUE)\n')
  doc$a_initialDataPrep =  paste0(
  '\na_initialDataPrep:    function\n\n',
  '  - reads input files, prepares a data frame representing reference data \n',
  '    (refdat), and extracts raster data at reference point locations\n\n',
  '  - arguments:\n\n',
  '       sett :  a copy of the settings list\n\n',
  '  - returns: a list with two components:\n\n',
  '       \'refdat\' the data frame representing the reference data, with \n',
  '                extracted raster data attached.\n\n',
  '       \'inrast\' a copy of the input raster dataset, opened as an \n',
  '                raster::RasterStack object\n\n')
  doc$b_imputeDataAndSplitIndValidSet =  paste0(
  '\nb_imputeDataAndSplitIndValidSet:    function\n\n',
  '  - takes a copy of the reference data data-frame, as returned by the \n',
  '    a_initialDataPrep function, imputes the data, and splits/sets-aside\n',
  '    the data points into an independent validation set and a set that is\n',
  '    to be used for building training the Random Forest model\n\n',
  '  - see the following data settings: \n',
  '          \'indValidSplit\',\'splitByClass\',\n',
  '          \'impute_strategy\',\'impute_by_class\',\n\n',
  '  - arguments:\n\n',
  '       refdat  :  a copy of the reference data data-frame, as returned by\n',
  '                  the a_initialDataPrep function\n',
  '       sett    :  a copy of the settings dictionary\n\n',
  '       verbose :  (default: TRUE) boolean representing whether or not to\n',
  '                   print details of the imputation and splitting to the \n',
  '                   console\n\n',
  '  - returns: a list with two componentns:\n\n',
  '       \'refdat\' a data frame representing the reference data (refdat), \n',
  '                with a new field/column (\'indValidSet\') representing \n',
  '                whether a data point has been kept for the model (value of \n',
  '                0), set aside for independent validation (value of 1), or,\n',
  '                in some cases, omitted due to the presence of NoData in\n',
  '                combination with the lack of an imputation strategy (value\n',
  '                of -1)\n\n',
  '       \'impdat\' a data frame representing the imputed (or subset)  \n',
  '                reference data (impdat), after having implemented the \n',
  '                imputation strategy. It is similar to the refdat data \n',
  '                frame that is returned by this function, but NoData \n',
  '                values from the raster dataset have been imputed at \n',
  '                reference data locations. If the imputataion strategy was \n',
  '                NA, then records/rows containing  NoData values have been\n',
  '                omitted from this data farme instead.\n\n')
  doc$c_prepRandomForest =  paste0(
  '\nc_prepRandomForest:    function\n\n',
  '  - prepares and fits/trains a Random Forest model based on the \n',
  '    imputed/omitted reference data (impdat) and the user-specified random \n',
  '    forest settings\n',
  '  - also updates both the reference data and imputed reference data \n',
  '    data-frames with predictions and probabilities of class membership at\n',
  '    reference data locations\n\n',
  '  - arguments:\n\n',
  '       refdat :  a copy of the reference data data-frame, as returned by\n',
  '                 the b_imputeDataAndSplitIndValidSet function\n',
  '       impdat :  a copy of the imputed and independent validation-split \n',
  '                 reference data data-frame, as returned by the\n',
  '                 b_imputeDataAndSplitIndValidSet function\n',
  '       sett   :  a copy of the settings list\n\n',
  '  - returns: a list with three components:\n\n',
  '       \'refdat\' a copy of the reference data data-frame (refdat) with \n',
  '                Random Forest predictions/probabilities added.\n\n',
  '       \'impdat\' a copy of the imputed/omitted reference data data-frame \n',
  '                (impdat) with Random Forest predictions/probabilities \n',
  '                added.\n\n',
  '       \'rf\'  the trained/fit Random Forest classifier object\n')
  doc$d_independentValidation =  paste0(
  '\nd_independentValidation:    function\n\n',
  '  - generates accuracy statistics from the Random Forest classification \n',
  '    based on the reference data points that were set aside for independent\n',
  '    validation\n',
  '  - accuracy statistics include:  error matrix, user\'s & producer\'s\n',
  '                                   accuracies, overall accuracy, kappa\n\n',
  '  - arguments:\n\n',
  '       refdat :  a copy of the reference data data-frame, as returned by\n',
  '                 the c_prepRandomForest function\n',
  '       sett   :  a copy of the settings list\n\n',
  '  - returns: a list with the following components:\n\n',
  '       \'errorMat\'   a data frame representing the error matrix \n\n',
  '       \'classAcc\'   a data frame representing class-specific user\'s \n',
  '                      and producer\'s accuracies \n\n',
  '       \'overallAcc\' a float value representing the overall accuracy \n\n',
  '       \'kappa\'      a float value representing Cohen\'s kappa coefficient\n')
  doc$e_consolidateArossIterations =  paste0(
  '\ne_consolidateArossIterations:    function\n\n',
  '  - Consolidates independent validation results and Random Forest model\n',
  '    characteristics across iterations into individual data frames and \n',
  '    (optionally) otuput CSV files\n\n',
  '  - arguments:\n\n',
  '       dat   :  a copy of the reference data (refdat) or imputed reference \n',
  '                data (impdat),\n\n',
  '       sett  :  a copy of the settings list\n\n',
  '       valid :  (optional) a list of independent validation results as \n',
  '                returned by the d_independentValidation function\n',
  '                - this is required for consolidating independent \n',
  '                  validation accuracy statistics and/or error matrices\n\n',
  '       rf    :  (optional) a list of random forest models as returned by \n',
  '                the c_prepRandomForest function\n',
  '                - this is required for consolidating feature importances, \n',
  '                  out-of-bag error rates, and out-of-bag error matrices\n\n',
  '  - returns: a list with the following components:\n\n',
  '       \'errorMat\'  a data frame of error matrices across iterations \n',
  '                  (requires input for valid). This object is NA if \n',
  '                  valid argument is NA / not specified.\n\n',
  '       \'accStats\'  a data frame of independent validation accuracy \n',
  '                  statistics across iterations (requires input for \n',
  '                  valid argument). This object is NA if valid is NA\n',
  '                  / not specified.\n\n',
  '       \'featureImportances\' a data frame of feature importances across\n',
  '                     iterations (requires input for rf argument). This \n',
  '                     object is NA if valid if rf argument is NA / not \n',
  '                     specified.\n\n',
  '       \'oobErrorRate\'  a data frame of out-of-bag Random Forest error \n',
  '                       rates across iterations (requires input for rf \n',
  '                       argument). This object is NA if rf argument is NA. \n\n',
  '       \'oobErrorMat\'  a data frame of out-of-bag Random Forest error \n',
  '                      matrices across iterations (requires input for rf \n',
  '                      argument). This object is NA if rf argument is NA. \n')
  ######################################
  ## settings common to the 'data' settings and the random forest ('rf') settings
  doc$working_dir = paste0(
  '\nworking_dir:    data setting and random forest setting\n\n',
  '  - represents the current working directory\n',
  '  - to change this setting you should use setwd(), as in: \n',
  '         setwd(\'c:\\\\working\\\\myWorkingDir\')\n')
  doc$load_from_json = paste0(
  '\nload_from_json:    data setting and random forest setting\n\n',
  '  - default:  NA  (do not load settings from json file)\n',
  '  - a string representing the filename (if in working directory) or \n',
  '    the full path to an input json file storing the settings\n',
  '  - if a valid filename or path is specfied, settings (data or rf \n',
  '    settings) will be loaded from that file\n',
  '  - filename should end in \'.json\'\n')
  doc$save_to_json = paste0(
  '\nsave_to_json:    data setting and random forest setting\n\n',
  '  - default:  NA  (do not save settings to json file)\n',
  '  - a string representing the filename (in working directory) or the \n',
  '    full path to an output json file for n',
  '    storing the settings\n',
  '  - if a valid filename or path is specfied, settings (data or rf  \n',
  '    settings) will be saved to that file\n',
  '  - filename should end in \'.json\'\n')
  doc$randSeedValue =  paste0(
    '\nrandSeedValue:    data setting and random forest setting\n\n',
    '  - default:  NA    (no setting of random seed)\n',
    '  - a numeric value that will be used to set the random seed\n',
    '  - the randSeedValue setting in the data settings list (sett$data) is\n',
    '    used to set the random seed when the a_initialDataPrep function \n',
    '    is called (i.e. seed for data prep such independent validation \n',
    '    splitting)\n',
    '  - the randSeedValue setting in the random forest settings list \n',
    '    (sett$rf) is used to set the random seed when the c_prepRandomForest\n',
    '    function is called (i.e. seed for the random forest algorithm)\n',
    '     - *CAUTION* if performing random forest fitting iterations, this \n',
    '       setting (sett$rf$randSeedValue) should be changed for each \n',
    '       iteration or the output results from each iteration will be the \n',
    '       same (assuming no change in the data).\n')
  ######################################
  ## 'data' settings
  doc$inRefCSV =   paste0(
  '\ninRefCSV:    **REQUIRED** data setting\n\n',
  '  - default:  \'\'    (empty string; user specification is required)\n',
  '  - string representing the filename (if in working directory) or \n',
  '    the full path to an input CSV file representing reference data \n',
  '    points\n',
  '  - at the very least, this file should contain:\n',
  '     - a field/column containing integer values representing classes\n',
  '     - two fields/columns containing values representing x and y \n',
  '       coordinates\n',
  '         - these coordinates must be in the same spatial reference \n',
  '           system as the input raster dataset\n')
  doc$FN_pointID =  paste0(
  '\nFN_pointID:    data setting\n\n',
  '  - default:  \'\'    (empty string; script will assign unique  \n',
  '    identifier to each reference data point)\n',
  '  - string representing the name of the field/column within the  \n',
  '    input refernece data CSV file that contains a \n',
  '    unique identifier for each reference data point\n',
  '  - not required, but if specified, this information may be included  \n',
  '    in some output files\n')
  doc$FN_classnum =  paste0(
  '\nFN_classnum:    **REQUIRED** data setting\n\n',
  '  - default:  \'\'    (empty string; user specification is required)\n',
  '  - string representing the name of the field/column within the  \n',
  '    input reference data CSV file that contains integer values\n',
  '    representing reference classes\n')
  doc$FN_classlab =  paste0(
  '\nFN_classlab:    data setting\n\n',
  '  - default:  \'\'    (empty string; no class labels will be \n',
  '                     included)\n',
  '  - string representing the name of the field/column within the \n',
  '    input refernece data CSV file that contains class labels (e.g. \n',
  '    text representing the land cover that a given class number\n',
  '    represents)\n',
  '  - not required, but if specified, this information may be included \n',
  '    in some output files\n')
  doc$FN_xy =  paste0(
  '\nFN_xy:    **REQUIRED**  data setting\n\n',
  '  - default:  c(\'\',\'\')   (vector of two empty strings; user \n',
  '    specification is required)\n',
  '  - vecotr containing two strings representing names of the \n',
  '    fields/columns of the field within the input refernece data CSV \n',
  '    file that contain the x and y coordinates (respecitively) of the \n',
  '    reference data point locations \n',
  '     - these coordinates must be in the same spatial reference  \n',
  '       system as the input raster dataset\n')
  doc$inRastPath =  paste0(
  '\ninRastPath:    **REQUIRED**  data setting\n\n',
  '  - default:  \'\'    (empty string; user specification is required)\n',
  '  - string representing the filename (if in working directory) or  \n',
  '    the full path to an input raster dataset\n')
  doc$indValidSplit =  paste0(
  '\nindValidSplit:    data setting\n\n',
  '  - default:  0.3    (30% of the valid reference data points will be set \n',
  '                      aside for independent validation)\n',
  '  - a numeric value ( >=0.0 and <1.0 ) representing the proportion of the \n',
  '    reference data to split off / set-aside for independent validation.\n',
  '  - if 0, no independent validation will be performed\n',
  '  - the splitting-off of the independent validation set will take \n',
  '    place after data imputation\n',
  '  - splitting of independent validation set will also depend on the \n',
  '    specified \'splitByClass\' data setting\n')
  doc$splitByClass =  paste0(
  '\nsplitByClass:    data setting\n\n',
  '  - default:  TRUE    (splitting off of the independent validation \n',
  '                       will occur at the class level)\n',
  '  - boolean representing whether (TRUE) or not (FALSE) the \n',
  '    independent validation split proportion should be applied at \n',
  '    the class level\n',
  '  - if FALSE, the split proportion (see \'indValidSplit\' data \n',
  '    setting) will be applied to the reference dataset in general, \n',
  '    meaning the class proportions in the idenpendent validation set\n',
  '    may differ slightly from those in the model set\n')
  doc$minPointsForModel = paste0(
  '\nminPointsForModel:    data setting\n\n',
  '  - default:  10    (if less than 10 reference points points are left \n',
  '                     for the model, an error will be raised)\n',
  '  - integer value (>=0) representing the minimum number of reference\n',
  '    data points that should be going into the model / Random Forest\n',
  '    algorithm following the independent validation split \n',
  '  - ignored if indValidSplit is zero (all data are being used for \n',
  '    the model) \n',
  '  - if less than this number of data-points are remaining for the \n',
  '    model, an error will be raised \n',
  '  - if \'splitByClass\' data setting is TRUE, then this number will \n',
  '    be checked against the number of reference data points of each \n',
  '    class that are going into the model \n',
  '  - if \'splitByClass\' data setting is FALSE, then this number will \n',
  '    be checked against the overall number of reference data points,\n',
  '    in general, that are going into the model \n')
  doc$saveDataCSV =  paste0(
  '\nsaveDataCSV:    data setting\n\n',
  '  - default:  NA    (no reference data CSV file will be saved)\n',
  '  - string representing the filename (if in working directory) or \n',
  '    the full path to an output CSV file storing the following\n',
  '    information:  \n\n',
  '     - indicator of whether a data point was set aside for \n',
  '       independent validation (indValidSet). \n',
  '        - Value of 1 for set aside. 0 for not. If  \'impute_strategy\'  \n',
  '          is set to None (omission), the omitted rows are still  \n',
  '          included in this file, but will have a value of -1 for \n',
  '          indValidSet. \n\n',
  '     - reference data: ref. class labels (classLab),ref. class \n',
  '       values (classNum), x/y coordinates (x/y) \n\n',
  '     - extracted (non-imputed / no omissions) training samples with\n',
  '       NA for NoData \n',
  '        - Predictions may not have been made based on these data. \n',
  '          See \'impute_strategy\', below. \n',
  '        - Field/Column names begin with \'ch#_\' where # is any \n',
  '          number of digits representing the image channel from which \n',
  '          the data were extracted \n\n',
  '     - predictions (predict) and probabilities of class memebership \n',
  '       at each reference data point \n',
  '        - Class membership probability fields are named \'prob_c#\',\n',
  '          where # represents a class number/value. \n\n',
  '  - if specified, file will be generated when the c_prepRandomForest()\n',
  '    function is called\n',
  '  - specified path must not already exist. No over-writing.\n')
  doc$impute_strategy =  paste0(
  '\nimpute_strategy:    data setting\n\n',
  '  - default:  \'mean\'    (NoData values will be substituted with the \n',
  '                         mean)\n',
  '  - either NA, a string representing the data imputation strategy, \n',
  '    or numeric value for filling missing values\n\n',
  '  - If NA, no imputation will be performed. Instead, any data \n',
  '    points that contain NoData / NA values in any channel, will be\n',
  '    omitted.  This omission will be done prior to the independent \n',
  '    validation split.\n',
  '  - If \'mean\', missing values will be replaced with the mean of the \n',
  '    column.\n',
  '  - If \'median\', missing values will be replaced with the median of \n',
  '    the column.\n',
  '  - If a numeric value is specified, missing values will be replaced\n',
  '    with this value.\n')
  doc$impute_by_class =  paste0(
  '\nimpute_by_class:    data setting\n\n',
  '  - default:  TRUE    (imputation will be performed at the class \n',
  '                       level) \n',
  '  - logical representing whether (TRUE) or not (FALSE) the \n',
  '    imputation should be performed at the class level \n',
  '  - this setting is ignored if the impute_strategy is NA or a \n',
  '    constant value (numeric)\n',
  '  - e.g.  if impute_strategy is \'mean\' and impute_by_class is TRUE\n',
  '    then, for a given channel, missing values for class 1 will be \n',
  '    replaced by the mean of the reference points that are of\n',
  '    reference class 1.  The same for class 2, and so on.\n',
  '  - e.g.  if impute_strategy is \'mean\' and impute_by_class is FALSE\n',
  '    then, for a given channel, missing values for class 1 will be \n',
  '    replaced by the mean of the channel (at ref. point locations) \n',
  '    regardless of reference class.\n')
  doc$saveImputedDataCSV = paste0(
  '\nsaveImputedDataCSV:    data setting\n\n',
  '  - default:  NA    (no imputed reference data CSV file will be saved)\n',
  '  - string representing the filename (if in working directory) or \n',
  '    the full path to an output CSV file storing a copy of the reference\n',
  '    data with imputation (or omission in the case where \'impute_strategy\'\n',
  '    is NA) applied to missing values\n',
  '  - field/column names are the same as for \'saveDataCSV\', above, but \n',
  '    the extracted raster data (\'ch#_...\') have either been imputed\n',
  '    or points have been omitted, based on the \'impute_strategy\' and \n',
  '   \'impute_by_class\' options.\n',
  '  - if specified, file will be generated when the c_prepRandomForest()\n',
  '    function is called\n',
  '  - specified path must not already exist. No over-writing.\n')
  doc$saveRFclassifier = paste0(
  '\nsaveRFclassifier:    data setting\n\n',
  '  - default:  NA   (Random Forest classifier will not be saved to file)\n',
  '  - string representing the filename (if in working directory) or \n',
  '    the full path to an output RData file (should end with .RData\n',
  '    extension) storing a copy of the fitted Random Forest model\n',
  '  - saving the model may be necessary if you later want to apply \n',
  '    predictions over the full image set\n',
  '  - if specified, file will be generated when the c_prepRandomForest()\n',
  '    function is called\n',
  '  - specified path must not already exist. No over-writing.\n')
  doc$saveImportancesCSV = paste0(
  '\nsaveImportancesCSV:    data setting\n\n',
  '  - default:  NA    (no CSV file representing feature importances\n',
  '                     from the current model will be saved)\n',
  '  - string representing the filename (if in working directory) or \n',
  '    the full path to an output CSV file, storing a copy of the\n',
  '    feature importances from the current Random Forest model\n',
  '  - if specified, file will be generated when the c_prepRandomForest()\n',
  '    function is called\n',
  '  - specified path must not already exist. No over-writing.\n',
  '  - if not specified, file representing feature importances across \n',
  '    several models can still be generated when the \n',
  '    e_consolidateArossIterations() function is called\n')
  doc$saveErrorMatrixCSV = paste0(
  '\nsaveErrorMatrixCSV:    data setting\n\n',
  '  - default:  NA    (no CSV file representing independent \n',
  '                     validation error matrices will be saved)\n',
  '  - string representing the filename (if in working directory) or \n',
  '    the full path to an output CSV file, storing a copy of the\n',
  '    error matrices from the independent validation\n',
  '  - if specified, file will be generated when the\n',
  '    d_independentValidation() function is called\n',
  '  - specified path must not already exist. No over-writing.\n',
  '  - if not specified, file representing independent validation error\n',
  '    matrices across several models can still be generated when the\n',
  '    e_consolidateArossIterations() function is called\n')
  doc$saveAccuracyCSV =  paste0(
  '\nsaveAccuracyCSV:    data setting\n\n',
  '  - default:  NA    (no CSV file representing independent \n',
  '                     validation accuracy statistics will be saved)\n',
  '  - string representing the filename (if in working directory) or \n',
  '    the full path to an output CSV file, storing a copy of the\n',
  '    accuracy statistics from the independent validation\n',
  '  - if specified, file will be generated when the\n',
  '    d_independentValidation() function is called\n',
  '  - specified path must not already exist. No over-writing.\n',
  '  - if not specified, file representing independent validation \n',
  '    accuracy statistics across several models can still be generated\n',
  '    when the e_consolidateArossIterations() function is called\n')
  doc$consol_outCSV_dir = paste0(
  '\nconsol_outCSV_dir:    data setting\n\n',
  '  - default:  None    (no CSV files representing info consolidated \n',
  '                       across Random Forest models will be saved)\n',
  '  - string representing path (relative to working directory or full\n',
  '    path) to a directory that CSV files output from the \n',
  '    e_consolidateArossIterations() function will be stored within\n',
  '  - if specified, several CSV files representing statistics that \n',
  '    have been consolidated across model fitting & validation  \n',
  '    iterations may be saved, including:\n\n',
  '     - ...accuracyStats.csv: independent validation accuracy\n',
  '              statistics.  Requires specification for the \'valid\'\n',
  '              argument when calling e_consolidateArossIterations()\n\n',
  '     - ...importances.csv: feature importances.  Requires \n',
  '              specification for the \'rf\' argument when calling \n',
  '              e_consolidateArossIterations()\n\n',
  '     - ...indErrorMatrices.csv: independent validation error\n',
  '              matrices.  Requires specification for the \'valid\'\n',
  '              argument when calling e_consolidateArossIterations()\n\n',
  '     - ...OOBerrorStats.csv: Error rates from the model\'s out-of-bag\n',
  '              data.  Includes overall error rate and class specific \n',
  '              error rates.  Requires specification for the \'rf\'\n',
  '              argument when calling e_consolidateArossIterations()\n\n',
  '     - ...OOBerrorMatrices.csv: Error matrices from the model\'s \n',
  '              out-of-bag data. Requires specification for the \'rf\'\n',
  '              argument when calling e_consolidateArossIterations()\n\n',
  '     \'...\' represents the \'consol_outCSV_basename\' data setting\n\n',
  '  - files will be generated when the e_consolidateArossIterations()\n',
  '    function is called\n',
  '  - over-writing of output CSV files not supported so a different \n',
  '    \'consol_outCSV_dir\' and/or \'consol_outCSV_basename\' should be\n',
  '    specified each time the e_consolidateArossIterations() function\n',
  '    is called (unless previously-generated files have been moved or\n',
  '    deleted)\n')
  doc$consol_outCSV_basename = paste0(
  '\nconsol_outCSV_basename:    data setting\n\n',
  '  - default:  \'consol_\'    \n',
  '  - string representing basename for CSV files output when calling\n',
  '    the e_consolidateArossIterations() function\n',
  '  - if specified as an empty string (\'\'), CSV files can still be \n',
  '    generated but they won\'t have a basename\n',
  '  - if specified, recommend ending with an underscore or other \n',
  '    character to separate the basename from the rest of the filename\n',
  '  - over-writing of output CSV files not supported so a different \n',
  '    \'consol_outCSV_dir\' and/or \'consol_outCSV_basename\' should be\n',
  '    specified each time the e_consolidateArossIterations() function\n',
  '    is called (unless previously-generated files have been moved or\n',
  '    deleted)\n')

  ######################################
  ## random forest ('rf') settings
  for (setting in c('ntree','mtry','replace','sampsize','nodesize','maxnodes','importance',
                    'localImp','proximity','oob.prox','norm.votes','do.trace','keep.forest',
                    'keep.inbag')){
    doc[[length(doc)+1]] <-   paste0('\nntree:    random forest (\'rf\') setting\n\n',
                                     '  *** see the official documentation for randomForest: ?randomForest \n')
    names(doc)[length(doc)] <- setting
  }
  if (!is.na(outTextFile)){
    cat('\n',file=outTextFile,append=FALSE)
    for (d in doc) cat(d,file=outTextFile,append=TRUE)
    }
  return (doc)
}
