
I have two timestamps as input, and I want to calculate the time difference in hours between them, excluding Sundays.

I can get the number of days using the `datediff` function in Hive.

I can get the day of the week for a particular date using `from_unixtime(unix_timestamp(startdate), 'EEEE')`.
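
For example, a minimal query showing those two pieces (my_table, startdate, and enddate are placeholder names):

select datediff(enddate, startdate) as total_days,
       from_unixtime(unix_timestamp(startdate), 'EEEE') as start_day_name
from my_table;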

But I don't know how to combine those functions to achieve my requirement. Is there another easy way to do this?

Thanks in Advance.

Vanaja Jayaraman

2 Answers


You can write a custom UDF that takes the two date columns as inputs and counts the difference between the dates, excluding Sundays.

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;

public class IsoYearWeek extends UDF {

  // Takes the two date columns as inputs, e.g. "20/07/2016" and "28/07/2016" (dd/MM/yyyy).
  public LongWritable evaluate(Text dateString, Text dateString1) throws ParseException {
    SimpleDateFormat date = new SimpleDateFormat("dd/MM/yyyy");
    int count = 0;

    Date startDate = date.parse(dateString.toString());
    Date endDate = date.parse(dateString1.toString());

    // Build the list of days between the two dates, stepping one day at a time.
    long interval = 24 * 60 * 60 * 1000L; // one day in milliseconds
    long endTime = endDate.getTime();
    long curTime = startDate.getTime();
    List<Date> dates = new ArrayList<Date>();
    while (curTime <= endTime) {
      dates.add(new Date(curTime));
      curTime += interval;
    }

    // Count the Sundays in between; Date.getDay() returns 0 for Sunday.
    for (int i = 0; i < dates.size(); i++) {
      Date lDate = dates.get(i);
      if (lDate.getDay() == 0) {
        count += 1;
      }
    }

    // Days difference between the two dates, excluding Sundays.
    long days_diff = (endDate.getTime() - startDate.getTime()) / (24 * 60 * 60 * 1000) - count;
    return new LongWritable(days_diff);
  }

}
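
Once the class is compiled and packaged into a jar, a UDF like this is typically registered and called from Hive roughly as follows (the jar path, function name, and table/column names are placeholders, and the two columns must be strings in the dd/MM/yyyy format the UDF expects):

add jar /path/to/your-udf.jar;
create temporary function diff_excluding_sundays as 'IsoYearWeek';

select diff_excluding_sundays(start_col, end_col) as days_diff
from my_table;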
  • I vote for this solution. Maybe the code could be improved, but I don't see how your task can be accomplished efficiently without a UDF. It is also possible to use a shell script or Python for the same, or to join your data with a `date_dim` table to exclude Sundays and then sum the days (see the sketch after these comments), but I'd prefer a UDF in this case. – leftjoin Jul 22 '16 at 19:08
  • How can I use this in a Hive query? I have the same type of requirement. – Ranjeet Paswan Jul 19 '18 at 12:54
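
A rough sketch of the `date_dim` idea mentioned in the comment above, assuming you already have a calendar table with one row per date; the names `date_dim`, `cal_date`, and `day_name` are placeholder assumptions:

-- count the non-Sunday days between startdate and enddate for each row
select t.startdate,
       t.enddate,
       count(*) as days_excl_sundays
from my_table t
cross join date_dim d
where d.cal_date > t.startdate
  and d.cal_date <= t.enddate
  and d.day_name <> 'Sunday'
group by t.startdate, t.enddate;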

Use Spark, so that it will be easier to implement and maintain:


import org.joda.time.format.DateTimeFormat
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.StringType
import spark.implicits._ // for toDF and .as[...]; assumes an active SparkSession named spark


def dayDiffWithExcludeWeekendAndHoliday(startDate: String, endDate: String, holidayExclusion: Seq[String]) = {
  @transient val datePattern = "yyyy-MM-dd"
  @transient val dateformatter = DateTimeFormat.forPattern(datePattern)
  var numWeekDaysValid = 0
  var numWeekends = 0
  var numWeekDaysInValid = 0
  val holidayExclusionJoda = holidayExclusion.map(dateformatter.parseDateTime(_))
  val startDateJoda = dateformatter.parseDateTime(startDate)
  var startDateJodaLatest = dateformatter.parseDateTime(startDate)
  val endDateJoda = dateformatter.parseDateTime(endDate)
  // Walk day by day from startDate up to (but not including) endDate; assumes startDate <= endDate.
  while (startDateJodaLatest.compareTo(endDateJoda) != 0) {
    startDateJodaLatest.getDayOfWeek match {
      case value if value > 5  => numWeekends = numWeekends + 1 // Saturday (6) or Sunday (7)
      case value if value <= 5 => // weekday: valid unless it is an excluded holiday
        if (holidayExclusionJoda.contains(startDateJodaLatest)) numWeekDaysInValid = numWeekDaysInValid + 1
        else numWeekDaysValid = numWeekDaysValid + 1
    }
    startDateJodaLatest = startDateJodaLatest.plusDays(1)
  }
  Array(numWeekDaysValid, numWeekends, numWeekDaysInValid)
}

spark.udf.register("dayDiffWithExcludeWeekendAndHoliday",dayDiffWithExcludeWeekendAndHoliday(_:String,_:String,_:Seq[String]))

case class tmpDateInfo(startDate:String,endDate:String,holidayExclusion:Array[String])
case class tmpDateInfoFull(startDate:String,endDate:String,holidayExclusion:Array[String],numWeekDaysValid:Int,numWeekends:Int,numWeekDaysInValid:Int)


def dayDiffWithExcludeWeekendAndHolidayCase(tmpInfo: tmpDateInfo) = {
  @transient val datePattern = "yyyy-MM-dd"
  @transient val dateformatter = DateTimeFormat.forPattern(datePattern)
  var numWeekDaysValid = 0
  var numWeekends = 0
  var numWeekDaysInValid = 0
  val holidayExclusionJoda = tmpInfo.holidayExclusion.map(dateformatter.parseDateTime(_))
  val startDateJoda = dateformatter.parseDateTime(tmpInfo.startDate)
  var startDateJodaLatest = dateformatter.parseDateTime(tmpInfo.startDate)
  val endDateJoda = dateformatter.parseDateTime(tmpInfo.endDate)
  while (startDateJodaLatest.compareTo(endDateJoda) != 0) {
    startDateJodaLatest.getDayOfWeek match {
      case value if value > 5  => numWeekends = numWeekends + 1
      case value if value <= 5 =>
        if (holidayExclusionJoda.contains(startDateJodaLatest)) numWeekDaysInValid = numWeekDaysInValid + 1
        else numWeekDaysValid = numWeekDaysValid + 1
    }
    startDateJodaLatest = startDateJodaLatest.plusDays(1)
  }
  tmpDateInfoFull(tmpInfo.startDate, tmpInfo.endDate, tmpInfo.holidayExclusion, numWeekDaysValid, numWeekends, numWeekDaysInValid)
}

//df way 1
val tmpDF = Seq(("2020-05-03", "2020-06-08", List("2020-05-08", "2020-06-05")))
  .toDF("startDate", "endDate", "holidayExclusion")
  .select(col("startDate").cast(StringType), col("endDate").cast(StringType), col("holidayExclusion"))

tmpDF.as[tmpDateInfo].map(dayDiffWithExcludeWeekendAndHolidayCase).show(false)

//df way 2

tmpDF.selectExpr("*", "dayDiffWithExcludeWeekendAndHoliday(cast(startDate as string), cast(endDate as string), cast(holidayExclusion as array<string>)) as resultDays")
  .selectExpr("startDate", "endDate", "holidayExclusion", "resultDays[0] as numWeekDaysValid", "resultDays[1] as numWeekends", "resultDays[2] as numWeekDaysInValid")
  .show(false)



// Spark SQL way; this also works against Hive tables when Spark is configured with the Hive metastore

tmpDF.createOrReplaceTempView("tmpTable")
spark.sql("select startDate,endDate,holidayExclusion,dayDiffWithExcludeWeekendAndHoliday(startDate,endDate,holidayExclusion) from tmpTable").show(false)


Raptor0009