---
title: REddyProc typical workflow
output: 
  rmarkdown::html_vignette:
    keep_md: true
vignette: >
  %\VignetteEngine{knitr::rmarkdown_notangle}
  %\VignetteIndexEntry{REddyProc typical workflow}
  %\usepackage[UTF-8]{inputenc}
---

```{r, include = FALSE}
# do not execute on CRAN:
# https://stackoverflow.com/questions/28961431/computationally-heavy-r-vignettes
is_check <- ("CheckExEnv" %in% search()) ||
  any(c("_R_CHECK_TIMINGS_", "_R_CHECK_LICENSE_") %in% names(Sys.getenv()))
knitr::opts_chunk$set(eval = !is_check)
```

```{r setup, include = FALSE}
#rmarkdown::render("vignettes/useCase.Rmd")
knitr::opts_knit$set(root.dir = '..')
knitr::opts_chunk$set(
    #, fig.align = "center"
    #, fig.width = 3.27, fig.height = 2.5, dev.args = list(pointsize = 10)
    #,cache = TRUE
    #, fig.width = 4.3, fig.height = 3.2, dev.args = list(pointsize = 10)
    #, fig.width = 6.3, fig.height = 6.2, dev.args = list(pointsize = 10)
    # works with html but causes problems with latex
    #,out.extra = 'style = "display:block; margin: auto"'
    )
knitr::knit_hooks$set(spar = function(before, options, envir) {
  if (before) {
    par(las = 1 )                    # also y axis labels horizontal
    par(mar = c(2.0,3.3,0,0) + 0.3 ) # margins
    par(tck = 0.02 )                 # axis-tick length inside plots
    par(mgp = c(1.1,0.2,0) )         # positioning of axis title, axis labels, axis
  }
})
```

```{r, include = FALSE, warning = FALSE}
#themeTw <- theme_bw(base_size = 10) + theme(axis.title = element_text(size = 9))
#bgiDir <- "~/bgi"
```

# REddyProc typical workflow

## Importing the half-hourly data

The workflow starts with importing the half-hourly data. The example reads a
text file with data of the year 1998 from the Tharandt site and converts the
separate decimal columns year, day, and hour to a POSIX timestamp column.
Next, it initializes the `sEddyProc` class.
```{r inputData, spar = TRUE, message = FALSE}
#+++ load libraries used in this vignette
library(REddyProc)
library(dplyr)
#+++ Load data with 1 header and 1 unit row from (tab-delimited) text file
fileName <- getExamplePath('Example_DETha98.txt', isTryDownload = TRUE)
EddyData <- if (length(fileName)) fLoadTXTIntoDataframe(fileName) else
  # or use example dataset in RData format provided with REddyProc
  Example_DETha98
#+++ Replace long runs of equal NEE values by NA
EddyData <- filterLongRuns(EddyData, "NEE")
#+++ Add time stamp in POSIX time format
EddyDataWithPosix <- fConvertTimeToPosix(
  EddyData, 'YDH', Year = 'Year', Day = 'DoY', Hour = 'Hour') %>%
  filterLongRuns("NEE")
#+++ Initialize R5 reference class sEddyProc for post-processing of eddy data
#+++ with the variables needed for post-processing later
EProc <- sEddyProc$new(
  'DE-Tha', EddyDataWithPosix, c('NEE','Rg','Tair','VPD', 'Ustar'))
```

```{r, include = FALSE}
.tmp.f <- function(){
  # suspected that example dataset was already gapfilled and tried to load
  # from LaThuille - not successful
  library(ncdf4)
  ipath <- file.path(bgiDir, "data/DataStructureMDI/DATA/site/Fluxnet"
                     , "halfhourly/level5_new_v2_newRpot_UncNew/Data")
  fname <- "DE-Tha.1996.2006.hourly.nc"
  nc <- nc_open(file.path(ipath, fname))
  get.var.ncdf <- function(...){ as.vector(ncvar_get(...)) }
  ds <- subset(data.frame(
    Year = get.var.ncdf(nc, "year")
    , DoY = get.var.ncdf(nc, "julday")
    , Hour = get.var.ncdf(nc, "hour")
    , NEE = get.var.ncdf(nc, "NEE")
    , Rg = get.var.ncdf(nc, "Rg")
    , Tair = get.var.ncdf(nc, "Tair")
    , VPD = get.var.ncdf(nc, "VPD")
    , Ustar = get.var.ncdf(nc, "u")
    , rH = get.var.ncdf(nc, "rH")
  ), (Year == 1998 & !(DoY == 366 & Hour == 0)) |
    (Year == 1999 & (DoY == 366 & Hour == 0)))
  #), Year == 1998)
  ds$DoY[ds$DoY == 366] <- 1
  dss <- EddyDataWithPosix <- fConvertTimeToPosix(
    ds, 'YDH', Year = 'Year', Day = 'DoY', Hour = 'Hour')
  #EddyDataWithPosix$VPD <- fCalcVPDfromRHandTair(
  #  EddyDataWithPosix$rH, EddyDataWithPosix$Tair)
  EProc <- sEddyProc$new('DE-Tha', EddyDataWithPosix,
                         c('NEE','Rg','Tair','VPD', 'Ustar'))
}
```

A fingerprint-plot of the source half-hourly data already shows several gaps.
A fingerprint-plot is a color-coded image of the half-hourly fluxes with time
of day on the x-axis and day of the year on the y-axis.

```{r fpNEEOrig}
EProc$sPlotFingerprintY('NEE', Year = 1998)
```

```{r eval=FALSE, include=FALSE}
# plot with custom color palette
EProc$sPlotFingerprintY('NEE', Year = 1998,
  colors = rainbow(50), valueLimits = c(-300,300))
```

For writing plots of data of several years to pdf see also

* [`sEddyProc_sPlotFingerprint`](../html/sEddyProc_sPlotFingerprint.html),
* [`sEddyProc_sPlotHHFluxes`](../html/sEddyProc_sPlotHHFluxes.html), and
* [`sEddyProc_sPlotDiurnalCycle`](../html/sEddyProc_sPlotDiurnalCycle.html).

## Estimating the uStar threshold distribution

The second step is the estimation of the distribution of uStar thresholds, in
order to identify periods of low friction velocity (uStar) where NEE is biased
low. Discarding periods with low uStar is one of the largest sources of
uncertainty in aggregated fluxes. Hence, several quantiles of the distribution
of the uncertain uStar threshold are estimated by a bootstrap.

The friction velocity, uStar, needs to be provided in a column named "Ustar"
of the input dataset.
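If the friction velocity was imported under a different column name, it can be
renamed to "Ustar" before constructing the `sEddyProc` class. A minimal sketch,
assuming a hypothetical input column `u_star` (the example data already provide
`Ustar`, so this is not run here):

```{r, eval = FALSE}
# hypothetical: rename a differently named friction velocity column to 'Ustar'
# before calling sEddyProc$new (dplyr is loaded above)
EddyDataWithPosix <- dplyr::rename(EddyDataWithPosix, Ustar = u_star)
```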
```{r, message = FALSE}
EProc$sEstimateUstarScenarios(
  nSample = 100L, probs = c(0.05, 0.5, 0.95))
EProc$sGetEstimatedUstarThresholdDistribution()
```

```{r fpNEEUStar, include = FALSE, eval = FALSE}
# EProc$sPlotNEEVersusUStarForSeason()
uStarTh <- EProc$sGetEstimatedUstarThresholdDistribution()
str(uStarTh)
signif(unlist(uStarTh[1,5:7]),2)
EProc$sDATA$NEE_low <- EProc$sDATA$NEE_median <- EProc$sDATA$NEE_orig <-
  EProc$sDATA$NEE_upper <- EProc$sDATA$NEE
EProc$sDATA$NEE_orig[ EProc$sDATA$Ustar < unlist(uStarTh[1,4])] <- NA
EProc$sDATA$NEE_low[ EProc$sDATA$Ustar < unlist(uStarTh[1,5])] <- NA
EProc$sDATA$NEE_median[ EProc$sDATA$Ustar < unlist(uStarTh[1,6])] <- NA
EProc$sDATA$NEE_upper[ EProc$sDATA$Ustar < unlist(uStarTh[1,7])] <- NA
# need to produce fingerprints by hand in console - if executed from chunk
# it does not save a pdf
if (!dir.exists("tmp")) dir.create("tmp")
EProc$sPlotFingerprint('NEE_orig', Dir = "tmp/plots_fingerprint", Format = "png")
EProc$sPlotFingerprint('NEE_low', Dir = "tmp/plots_fingerprint", Format = "png")
EProc$sPlotFingerprint('NEE_median', Dir = "tmp/plots_fingerprint", Format = "png")
EProc$sPlotFingerprint('NEE_upper', Dir = "tmp/plots_fingerprint", Format = "png")
EProc$sDATA$NEE_median <- EProc$sDATA$NEE_orig <- EProc$sDATA$NEE_low <-
  EProc$sDATA$NEE_upper <- NULL
```

The output reports annually aggregated uStar estimates of
`r if (!is_check) signif(unlist(EProc$sGetUstarScenarios()[1,2]),2)` for the
original data and
`r if (!is_check) signif(unlist(EProc$sGetUstarScenarios()[1,3:5]),2)` for the
lower, median, and upper quantile of the estimated distribution. The threshold
can vary between periods of different surface roughness, e.g. before and after
harvest. Therefore, there are estimates for different time periods, called
seasons. These season estimates are by default aggregated to entire years.

The subsequent post-processing steps will be repeated using the four $u_*$
threshold scenarios (the non-resampled estimate and three quantiles of the
bootstrapped distribution). They require specifying a $u_*$ threshold for each
season and a suffix to distinguish the outputs related to the different
thresholds. By default, the annually aggregated estimates are used for each
season within the year.

```{r, message = FALSE}
EProc$sGetUstarScenarios()
```

## Gap-filling

The second post-processing step is filling the gaps in NEE using information
of the valid data. Here, we decide to use the same annual $u_*$ threshold
estimate in each season, as obtained above, and to compute the uncertainty
also for valid records (`FillAll`).

```{r gapfill, message = FALSE}
EProc$sMDSGapFillUStarScens('NEE')
```

The screen output (not shown here) already shows that the $u_*$-filtering and
gap-filling were repeated for each given estimate of the $u_*$ threshold,
i.e. for each column in `uStarThAnnual`, marking 22% to 38% of the data as
gaps. For gap-filling without prior $u_*$-filtering using
`sEddyProc_sMDSGapFill` or for applying single or user-specified $u_*$
thresholds using `sEddyProc_sMDSGapFillAfterUstar` see `vignette("uStarCases")`.
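The share of records marked as gaps in each scenario can also be recomputed
from the quality-flag columns of the filled NEE (`_fqc`). A minimal sketch,
assuming the usual REddyProc convention that a quality flag of 0 marks an
original measured value; the scenario suffixes are taken from the scenario
table as done later in this vignette:

```{r, eval = FALSE}
# fraction of half-hours where NEE was u*-filtered or gap-filled, per scenario
uStarSuffixes <- colnames(EProc$sGetUstarScenarios())[-1]
sapply(uStarSuffixes, function(suffix) {
  fqc <- EProc$sExportResults()[[paste0("NEE_", suffix, "_fqc")]]
  mean(fqc > 0, na.rm = TRUE)
})
```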
For each of the different $u_*$ threshold estimates, a separate set of output
columns of filled NEE and its uncertainty is generated, distinguished by the
suffixes given with `uStarSuffixes`. "_f" denotes the filled value and "_fsd"
the estimated standard deviation of its uncertainty.

```{r, results = 'hold'}
grep("NEE_.*_f$", names(EProc$sExportResults()), value = TRUE)
grep("NEE_.*_fsd$", names(EProc$sExportResults()), value = TRUE)
```

A fingerprint-plot of one of the new variables shows that the gaps have been
filled.

```{r fpNEEFilled}
EProc$sPlotFingerprintY('NEE_U50_f', Year = 1998)
```

```{r fpNEEFilled2, include = FALSE, eval = FALSE}
#```{r fpNEEFilled, results = 'hide', echo = FALSE, warn = FALSE, message = FALSE, fig.show = 'hide'}
# plot fingerprint of filled NEE for the median scenario
EProc$sPlotFingerprint('NEE_U50_f', Dir.s = "plots_fingerprint", Format.s = "png")
#EProc$sPlotFingerprintY('NEE_U50_f', Year.i = 1998)
#
# Check that although using FillAll.b = TRUE the original NEE is still in NEE_f
# for valid records
dss <- cbind(EddyData, EProc$sExportResults())
head(dss$Ustar_U50_fqc)
plot( NEE_U50_f ~ NEE, subset(dss, Ustar_U50_fqc == 0) )
```

## Partitioning net flux into GPP and Reco

The third post-processing step is partitioning the net flux (NEE) into its
gross components, GPP and Reco. The partitioning needs to distinguish
carefully between night-time and day-time. Therefore, it needs a specification
of the geographical coordinates and the time zone to allow computing sunrise
and sunset. Furthermore, the missing values in the used meteorological data
need to be filled. For VPD, which is important for the daytime flux
partitioning, an additional gap-filling of longer gaps based on minimum daily
temperature (assumed dewpoint) is available.

```{r, message = FALSE}
EProc$sSetLocationInfo(LatDeg = 51.0, LongDeg = 13.6, TimeZoneHour = 1)
EProc$sMDSGapFill('Tair', FillAll = FALSE, minNWarnRunLength = NA)
EProc$sMDSGapFill('VPD', FillAll = FALSE, minNWarnRunLength = NA)
EProc$sFillVPDFromDew() # fill longer gaps still present in VPD_f
```

Now we are ready to invoke the partitioning, here by the night-time approach,
for each of the several filled NEE columns.

```{r partNight, message = FALSE}
EProc$sMRFluxPartitionUStarScens()
```

```{r fpGPPReco, include = FALSE, eval = FALSE}
# plot fingerprints of GPP and Reco for the median scenario
EProc$sPlotFingerprint('GPP_U50_f', Dir = "plots_fingerprint", Format = "png")
EProc$sPlotFingerprint('Reco_U50', Dir = "plots_fingerprint", Format = "png")
#EProc$sPlotFingerprintY('GPP_U50_f', Year.i = 1998)
dss <- EProc$sExportResults()
grep("GPP", names(dss), value = TRUE)
summary(dss$GPP_U05_fqc)
plot( dss$GPP_U05_fqc ~ dss$NEE_U50_fqc )
```

The results are stored in columns `Reco` and `GPP_f` modified by the
respective $u_*$ threshold suffix.

```{r}
grep("GPP.*_f$|Reco", names(EProc$sExportResults()), value = TRUE)
```

A visualization of the results by a fingerprint plot gives a compact overview.

```{r fingerPrintGPP}
EProc$sPlotFingerprintY('GPP_U50_f', Year = 1998)
```

For using daytime-based flux partitioning see
[`sEddyProc_sGLFluxPartition`](../html/sEddyProc_sGLFluxPartition.html), which
computes columns `GPP_DT` and `Reco_DT`.

## Estimating the uncertainty of aggregated results

The results of the different $u_*$ threshold scenarios can be used for
estimating the uncertainty due to not knowing the threshold.

First, the mean of GPP across the entire year is computed for each $u_*$
scenario and converted from ${\mu mol\, CO_2\, m^{-2} s^{-1}}$ to
${gC\,m^{-2} yr^{-1}}$.
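The conversion applied in the next chunk corresponds to multiplying the mean
flux by $10^{-6}$ mol per $\mu mol$, by the molar mass of carbon
(12.011 g mol$^{-1}$), and by the number of seconds per year:

$$
GPP \left[ gC\, m^{-2} yr^{-1} \right] =
\overline{GPP} \left[ \mu mol\, CO_2\, m^{-2} s^{-1} \right]
\cdot 10^{-6} \cdot 12.011 \cdot 3600 \cdot 24 \cdot 365.25
$$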
```{r aggregateGPP}
FilledEddyData <- EProc$sExportResults()
uStarSuffixes <- colnames(EProc$sGetUstarScenarios())[-1]
#suffix <- uStarSuffixes[2]
GPPAggCO2 <- sapply( uStarSuffixes, function(suffix) {
  GPPHalfHour <- FilledEddyData[[paste0("GPP_",suffix,"_f")]]
  mean(GPPHalfHour, na.rm = TRUE)
})
molarMass <- 12.011
GPPAgg <- GPPAggCO2 * 1e-6 * molarMass * 3600*24*365.25
print(GPPAgg)
```

The difference between these aggregated values is a first estimate of the
uncertainty range in GPP due to the uncertainty of the $u_*$ threshold.

```{r, results = 'hide'}
(max(GPPAgg) - min(GPPAgg)) / median(GPPAgg)
```

In this run of the example, a relative error of about
`r if (!is_check) signif( (max(GPPAgg) - min(GPPAgg))/ median(GPPAgg)*100,2)`%
is inferred.

For a better but more time-consuming uncertainty estimate, specify a larger
sample of $u_*$ threshold values, repeat the post-processing for each, and
compute statistics from the larger sample of resulting GPP columns. This can
be achieved by specifying a larger sequence of quantiles when calling
`sEstimateUstarScenarios` in place of the command shown above.

```{r, eval = FALSE}
EProc$sEstimateUstarScenarios(
  nSample = 200, probs = seq(0.025, 0.975, length.out = 39) )
```

## Storing the results in a csv-file

The results still reside inside the `sEddyProc` class. We first export them to
an R data.frame, append the columns to the original input data, and write this
data.frame to a text file in a temporary directory.

```{r, results = 'hide', message = FALSE}
FilledEddyData <- EProc$sExportResults()
CombinedData <- cbind(EddyData, FilledEddyData)
fWriteDataframeToFile(CombinedData, 'DE-Tha-Results.txt', Dir = tempdir())
# or without relying on data.frame EddyData
# with replacing column DateTime by Year, DoY, and Hour:
fWriteDataframeToFile(
  cbind(EProc$sExportData(), EProc$sExportResults()),
  'DE-Tha-Results_ydh.txt', isSplitDatetime = TRUE, Dir = tempdir())
# tmp <- fLoadTXTIntoDataframe(file.path(tempdir(),'DE-Tha-Results_ydh.txt'))
```
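If the results are only reused from within R, a binary format avoids
re-parsing a large text file. A minimal sketch using base R
(`saveRDS`/`readRDS`; the file name is illustrative):

```{r, eval = FALSE}
# store the combined results as a compressed RDS file in the temporary directory
resultsFile <- file.path(tempdir(), "DE-Tha-Results.rds")
saveRDS(CombinedData, resultsFile)
# later sessions can restore the data.frame without re-running the processing
CombinedData2 <- readRDS(resultsFile)
```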