0

I wrote a loop to find out which customer is a new customer. The code is this one:

#Format dates

emision <- as.POSIXct(modelerData$Fecha_Emision_Poliza, format= "%Y-%m-%d", tz="UTC")

anulacion <- as.POSIXct(modelerData$Fecha_Anulacion, format= "%Y-%m-%d", tz="UTC")


#Todo a TRUE

modelerData$Nuevos_Clientes[1] <- "T"


#La primera póliza a TRUE las demás a False

for (t in 2:nrow(modelerData)) {

  modelerData$Nuevos_Clientes[t] <- "T"

  if (modelerData$Cod_Titular_Poliza[t]==modelerData$Cod_Titular_Poliza[t-1]){

    modelerData$Nuevos_Clientes[t] <- "F"
  }
}

#Define vector for indexes
vec <- vector(length = nrow(modelerData)) 
vec <- which(modelerData$Nuevos_Clientes == "T")

vec[length(vec)+1] <- nrow(modelerData)+1

#Create function for max annulations `anteriores`

max_anula <- function( q, w) {

  voc <- vector(length = q-w)

  for (m in w:(q-1)) {
    voc[m-w+1]<- anulacion[m]

  }

  maxi <- max(voc, na.rm = TRUE)

  return(maxi)
}

#Recorre las pólizas
s <- 1
for (h in vec){
  s <- s+1
if(h==max(vec) | s == length(vec)+1){
  break
}  
  else if (vec[s]-vec[s-1]==1){

  next}

  else {

for (i in (h+1):(vec[s]-1)){

  print(cat("Esta es i: ",i,"\n"))
  print(cat("Esta es s: ",s,"\n"))
  print(cat("Esta es h: ",h,"\n"))

  if(emision[i]==emision[i-1]){

    modelerData$Nuevos_Clientes[i]<-modelerData$Nuevos_Clientes[i-1]
  } 
  else if (is.na(anulacion[i-1]==TRUE)){

    for (n in i:(vec[s]-1)) {
      modelerData$Nuevos_Clientes[i]<- "F" #Como hay una arriba "Activa" todos los de abajo "F"
       #rompe el bucle ya no tengo que mirar más pólizas
    }
    break
  } else if (emision[i]<anulacion[i-1]){
    modelerData$Nuevos_Clientes[i]<- "F"
  }
    else if (as.numeric(emision[i])< max_anula(i-1, h)){ #(emision[i]< max(anulacion_anteriores, na.rm = TRUE))){
      modelerData$Nuevos_Clientes[i]<- "F" 
    }
    else 

      modelerData$Nuevos_Clientes[i]<- "T" 

                           }  
     }
}

#Creamos la nueva columna

modelerData$Nuevos_Clientes <- Nuevos_Clientes

newVar <- c(fieldName="Nuevos_Clientes", fieldLabel="", fieldStorage="string", fieldMeasure="", fieldFormat="", fieldRole="")

modelerDataModel <- cbind(modelerDataModel, newVar)

A sample of the dataset (Nuevos_Clientes is the field I am calculating):

Cod_Titular_Poliza  Cod_Poliza  Fecha_Emision_Poliza    Fecha_Anulacion Nuevos_Clientes
99999   41000011063800000   01/03/2003 0:00 31/01/2006 0:00 T
99998   41000011063800000   01/03/2003 0:00 31/10/2006 0:00 T
99997   41000011063800000   01/03/2003 0:00 31/01/2006 0:00 T
99996   41000011063800000   01/03/2003 0:00 31/01/2006 0:00 T
99996   41000011063300000   01/01/2006 0:00 31/12/2006 0:00 F
99996   41000011088800000   01/01/2014 0:00     T

If I just export a 100.000 rows sample in a .csv to R Studio and execute the code it just takes a couple of minutes. But if I run the same code in an R transformation node in Modeler it takes hours. Is there something I'm missing? If a take a sample of 10.000 rows in Modeler it will take 53 seconds. Why does it take hours to run 100.000 rows? Shouldn't it take around 530 seconds?

Thanks for your help.

eli-k
  • 10,898
  • 11
  • 40
  • 44
Domians
  • 1
  • 2
  • Don't post pics; rather copy/paste the output of `dput(head(datos))` to give us a reproducible example. In any case, it seems that your code fail to correctly vectorize some operations which could give a boost in performance. – nicola Oct 22 '19 at 08:17
  • thanks @nicola. Is it ok like that? – Domians Oct 22 '19 at 09:23

0 Answers0