Loop through year, month, day for netCDF file name

Question

I am using a for loop in R to read netCDF file from a folder and extract values for given list of longitude, latitude. It looks like working, except ONE PROBLEM. When the loop returns values against date, it creates January 29 to 31 after February 28. I want, as usual, March 01 after February 28 or 29 (for leap year). Here is my R code:

# given latitude, longitude list
sb1 <- data.frame(longitude=1:10,latitude =1:10)

# Extracting zonal or sub-basin average rainfall from netCDF file

sb1_r <- c()
date <- c()
rain_month <- c()
rain_year <- c()


for (year in 1998:1998){
  for (month in 1:3){
     for (day in seq_along(1:31)){
        FileName <- paste('3B42_daily',year,sprintf("%02d",month),sprintf("%02d", day),'7.SUB.nc', sep='.')
     if (!file.exists(FileName)){
     next
     } else {

      File <- nc_open(FileName)
      rain <- ncvar_get(File, 'r')

      sb1_r[day] <- mean(apply(sb1,1,function(x)rain[x[1],x[2]]),na.rm = TRUE)
      date[day] <-  paste(year,sprintf("%02d", month),sprintf("%02d", day),sep='-')
      rain_month <- data.frame(date,sb1_r)
      nc_close(File)
      }
    }

    rain_year <- rbind(rain_year,rain_month) 
  }

}

You can find daily netCDF data for three months to this link: https://drive.google.com/open?id=0B8rqKaYt0VEaMWVGc1gzdXI1U28

You have `for (day in seq_along(1:31))` for the months of January, February, and March. But, February has only 28 days. Could this be the problem? If so, you need to customize the loop. — dataanalyst, Aug 01 '16 at 05:50
@Gandalf But I don't have NetCDF files with name 3B42_daily.1998.02.29.7.SUB and so on. To avoid this, I put "if (!file.exists(FileName)){" in my code. — Wahid Palash, Aug 01 '16 at 06:09
Just to point out that using the mean function will not give you the correct answer when using a e.g. regular lat/lon grid, since the grid cells vary in size. Thus the value in each cell needs to be weighted by the cell area. Far better to simply use CDO which accounts for this automatically - see below. — ClimateUnboxed, Jan 28 '17 at 15:32

score 2 · Answer 1 · answered Jan 18 '17 at 11:08

Note that the above codes in R are NOT correct unless the rainfall netcdf file is using a equi-area grid, which is rarely the case! (And this is not the case for the TRMM files used in this example). This is a common error when handling gridded data.

For example, if you have a regular lat/lon grid, you need to account for the cosine reduction of the longitude dimension as you move towards the poles. The error is small if your sub-basin is small, but can get significant if the area is large. For some kinds of grids, e.g. reduced Gaussian grids, the error can be very significant if your sub-domain just so happens to coincide with the discontinuous change in longitude point numbers, especially for "non-smooth" fields such as precipitation.

I would always perform zonal and sub-basin processing using CDO to ensure that the calculation is performed correctly. If you use CDO, the area mean and zonal averaging functions account for the native grid.

Thus my code would look something like this:

#!/bin/bash

# define the lat-lon bounds of your sub area
lat1=20
lat2=30
lon1=0
lon2=20

# merge all the daily files into one file
# do this one month at a time as some system limit number of open files

year=1998 # can make this a loop if you want multiple years
for month in `seq -f "%02g" 1 3`  ; do 
  files=`ls 3B42_daily1998${month}*.nc`
  cdo mergetime $files TRMM_${month}.nc
done
cdo mergetime $TRMM_*.nc TRMM_timeseries.nc

# now extract the subdomain
cdo sellonlatbox,$lon1,$lon2,$lat1,$lat2 TRMM_timeseries.nc TRMM_box.nc

# CORRECT area average 
cdo fldmean TRMM_box.nc TRMM_box_av.nc

# zonal average
cdo zonmean TRMM_box.nc TRMM_box_zon.nc

score 1 · Accepted Answer · answered Aug 01 '16 at 09:12

Instead of trying to create the filenames do the opposite. Extract the file names and for each file get the date from the filename and sb1_r from the file. You can do that faster using rbindlist from data.table package (but is not needed).

# given latitude, longitude list sb1 <- data.frame(longitude=1:10,latitude =1:10)

# Extracting zonal or sub-basin average rainfall from netCDF file
filenames = list.files(path = ".", pattern = ".nc")
rain_year = data.frame()

require(ncdf4)
for(FileName in filenames){
  File <- nc_open(FileName)
  # Create Date
  year <- strsplit(FileName, split = '[.]')[[1]][2]
  month <- strsplit(FileName, split = '[.]')[[1]][3]
  day <- strsplit(FileName, split = '[.]')[[1]][4]
  date = paste(year, month, day, sep = "-")
  # get value
  rain <- ncvar_get(File, 'r')
  sb1_r <- mean(apply(sb1,1,function(x)rain[x[1],x[2]]),na.rm = TRUE)
  # update data.frame
  rain_year = rbind(rain_year, data.frame(date = date, sb1_r = sb1_r))
  nc_close(File)
}

# Faster using data.table package
require(data.table)
temp = rbindlist(
  lapply(X = filenames, function(FileName){
    year <- as.integer( strsplit(FileName, split = '[.]')[[1]][2] )
    month <- as.integer( strsplit(FileName, split = '[.]')[[1]][3] )
    day <- as.integer( strsplit(FileName, split = '[.]')[[1]][4] )
    date = paste(year, month, day, sep = "-")
    File <- nc_open(FileName)
    rain <- ncvar_get(File, 'r')
    sb1_r <- mean(apply(sb1,1,function(x)rain[x[1],x[2]]),na.rm = TRUE)
    return(data.frame(date = date, sb1_r = sb1_r))
  })
)

score -3 · Answer 3 · answered Jan 04 '18 at 14:01

#!/usr/bin/env Rscript
argv<-commandArgs(trailingOnly=TRUE)
if(length(argv)==2 & argv[1] <= argv[2]) {
  if (is.na(strptime(sprintf("%s",argv[1]),"%Y%m%d"))) {
    cat("arguments valid check error: ", argv[1], "\n")
    stop()
  }
  if (argv[2]!=format(strptime(sprintf("%s",argv[2]),"%Y%m%d"),"%Y%m%d")) {
    cat("arguments valid check error: ", argv[2], "\n")
    stop()
  }
} else if (length(argv)==2 & argv[1] > argv[2]) {
   print(sprintf("error: %s is greater than %s",argv[1],argv[2]))
   stop()
} else if (length(argv)!=2) {
   script.name<-basename(strsplit(commandArgs(trailingOnly=FALSE)[4],"=")[[1]][2])
   print(sprintf("Usage: %s startDate endDate",script.name))
   stop()
}

filelist<-c()
for (Ymd in format(seq(
   as.POSIXct(sprintf("%s",argv[1]),format="%Y%m%d"),
   as.POSIXct(sprintf("%s",argv[2]),format="%Y%m%d"),
   by="24 hour"),"%Y%m%d")) {
   filelist<-append(filelist, sprintf("%s.%s.%s.%s.%s","3B42_daily",substr(Ymd,1,4),substr(Ymd,5,6),substr(Ymd,7,8),"7.SUB.nc"))
}

sb1_r <- c()
date <- c()
rain_month <- c()
rain_year <- c()

for (i in 1:length(filelist)) {
if (!file.exists(filelist[i])){
  next
 } else {
  year <- as.numeric(substr(filelist[i],12,15))
  month <- as.numeric(substr(filelist[i],17,18))
  day <- as.numeric(substr(filelist[i],20,21))
  File <- nc_open(filelist[i])
  rain <- ncvar_get(File, 'r')

  sb1_r[day] <- mean(apply(sb1,1,function(x)rain[x[1],x[2]]),na.rm = TRUE)
  date[day] <-  paste(year,sprintf("%02d", month),sprintf("%02d", day),sep='-')
  rain_month <- data.frame(date,sb1_r)
  nc_close(File)
  }
}

Could you add some explanation about what you changed compared to the other questions? Right now it is hard to understand the advantages of your solution. — Malte Hartwig, Jan 04 '18 at 14:10

Loop through year, month, day for netCDF file name

3 Answers3