0

I want to create the standard deviation of the interquartile range when the rows exceeds a certain threshold. For example, I got 7 columns named AI_1, ..., AI_7. In total this dataset has 60480 observations split out over 42 IDs (1440 each). I can use the following code that obtains the standard deviation of the remaining values that exceeds this treshold grouped by ID:

AI_SD_1 <- 
  Threshold %>% 
  group_by(ID) %>% summarise(across(starts_with("AI"),~sd(.x[.x>0])))

But what I really want is that it only takes the values that exceeds the threshold, , of lets say 0.0 and then takes the IQR and make the standard deviation of the remaining values. I tried this code but it did not work...

AI_SD_1 <- 
  Threshold %>% 
  group_by(ID) %>% summarise(across(starts_with("AI"),~IQR(sd(.x[.x>0]))))

I tried this on the following dataset:

structure(list(X = 1:40, x1.time = c("00:00:00", "00:01:00", 
"00:02:00", "00:03:00", "00:04:00", "00:05:00", "00:06:00", "00:07:00", 
"00:08:00", "00:09:00", "00:10:00", "00:11:00", "00:12:00", "00:13:00", 
"00:14:00", "00:15:00", "00:16:00", "00:17:00", "00:18:00", "00:19:00", 
"00:20:00", "00:21:00", "00:22:00", "00:23:00", "00:24:00", "00:25:00", 
"00:26:00", "00:27:00", "00:28:00", "00:29:00", "00:30:00", "00:31:00", 
"00:32:00", "00:33:00", "00:34:00", "00:35:00", "00:36:00", "00:37:00", 
"00:38:00", "00:39:00"), AI_1 = c(0.17532896077581, 0.174249939439765, 
0.174170544792533, 0.172877357886967, 0.173679017353614, 0.174216799443538, 
0.174514454250882, 0.174656389074666, 0.173377175454716, 0.173044040397703, 
0.172476572884875, 0.174738790856458, 0.173833445732856, 0.174229265722835, 
0.174392878820111, 0.174715890976243, 0.174241614289181, 0.173229751013599, 
0.173579164085914, 0.173829069216696, 0.173499039975341, 0.174387946222767, 
0.173802854581089, 0.174107580137568, 0.174113709936873, 0.173172609295233, 
0.174509255493075, 0.173383120975257, 0.173398927511582, 0.173466516952908, 
0.172948133735396, 0.173862963181179, 0.173958069991397, 0.173548890469988, 
0.172576514315674, 0.173669934234216, 0.175270434955699, 0.173970520524663, 
0.173486447161383, 0.173524261869031), AI_2 = c(0.173549588758752, 
0, 0.85729795236214, 0.513925586220723, 0.140789239632585, 0.0989981552300843, 
0.321625480480368, 0.62540390366724, 0.00714855410741877, 0, 
0, 0, 0.212943798631015, 0, 0, 0.023650258664654, 0.00159158576982517, 
0.0172670511608436, 0, 0, 0, 0.25653572767355, 0.41158598021939, 
0.433889173147664, 0.442200975044019, 0.471931171507954, 0.415009919603445, 
0.43364443321512, 0.449930874231746, 0.48397633182816, 0.561283610701842, 
0.5967776859022, 0.573177245964571, 0.472930329569871, 0.439095752009315, 
0.462621903831972, 0.559256399155301, 0.53339728012121, 0.441235429453436, 
0.465421052015491), AI_3 = c(0.026069149474549, 0.0417747330978121, 
0.276687600798659, 0.258591321128928, 0.208790296683244, 0.0300099278967508, 
0.15234594700642, 0.26519848659315, 0.34220566727692, 0.352310255219813, 
0.297621781376737, 0.292800000618149, 0.481566536382664, 0.337770306519177, 
0.743182296874282, 0.256202127993172, 0.201340506649845, 0.200155318345632, 
0.237126429055375, 0.234974163009848, 0.235808994849961, 0.302168675921402, 
0.377936665388589, 0.416123299239618, 0.389279883023212, 0.357972848973051, 
0.305268847437493, 0.290040891577408, 0.197384083463156, 0.258282654013295, 
0.270482346892427, 0.390076826023885, 0.383994401786836, 0.334410119444762, 
0.311890059374708, 0.318967430715954, 0.334253221209486, 0.335548932537048, 
0.338351312724649, 0.185946982796148), AI_4 = c(0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0.00841646877382803, 0, 0, 0, 0, 0, 0.0427220323055223, 
0.0536388836974687, 0, 0, 0.105351450064905, 0.0683236999744629
), AI_5 = c(0, 0, 0.0015062890214412, 0.00154798776365785, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0.0935957286321642, 0.124805132264803, 0.15822579347775, 
0.136886010411225, 0.152936125790329, 0.0805137146699505, 0.0921793066286638, 
0.12585372625817), AI_6 = c(0.190018331633492, 0.241159552783285, 
0.231916111803065, 0.193196835220518, 0.240381778378367, 0.266125762332231, 
0.339227319507121, 0.354841547583334, 0.277011867279295, 0.474462632995715, 
0.516356521276347, 0.559477604383845, 0.374857636694405, 0.376675155204282, 
0.516347133869462, 0.627633542885353, 0.565732682034457, 0.544148310829377, 
0.545022418887296, 0.602327138107482, 0.529578366594453, 0.571672817412653, 
0.51963881197827, 0.493590581088222, 0.487545798153711, 0.525272191616523, 
0.586906227102549, 0.555446579214151, 0.578788883825157, 0.617822898150646, 
0.631649356464785, 0.727438443667015, 0.587476645379318, 0.631823122324518, 
0.491478597415953, 0.504716900833816, 0.505227023005322, 0.62100679441503, 
0.485167462464327, 0.533883465680236), AI_7 = c(0.139608768263461, 
0.165583663096789, 0.326959508587122, 0.221739297198209, 0.160657663051105, 
0.107439748199699, 0.117594125364214, 0.133528520361788, 0.117950354159875, 
0.131428192187155, 0.125355403562937, 0.119185646272255, 0.196285453922129, 
0.167061057207379, 0.169855099745761, 0.141077126343563, 0.078433720675593, 
0.0999303057993443, 0.0798045801131668, 0.0331137028671696, 0.0920945831761988, 
0.0233052285173748, 0, 0, 0, 0.00876293044107867, 0, 0.109134564970416, 
0.110323312017635, 0.117772975747077, 0.10653351319029, 0.154092473556077, 
0.180835044334998, 0.149089904995742, 0.132776250971465, 0.198981440238251, 
0.319389350543801, 0.143210043963242, 0.121118396897186, 0.126442822159217
), ID = c("ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", 
"ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", 
"ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", 
"ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", 
"ID1", "ID1", "ID1", "ID1", "ID1")), row.names = c(NA, 40L), class = "data.frame")
  • Please clarify what you mean by "takes the IQR and make the standard deviation of the remaining values" Perhaps you want to estimate the standard deviation of the values that fall between the 75th and 25th quantiles? – langtang Oct 18 '22 at 01:00

1 Answers1

1

It is possible that you want to limit each of the AI_ columns to those that are above the threshold, and then return the standard deviation of those that fall within the 25th and 75th quantile.

You can use this helper function, which first limits a vector x to those that are above the threshold, then gets the 25th/75th, and returns the standard deviation of the values that fall within this range.

f <- function(x, thresh=0) {
  x <- x[x>thresh]
  lims = quantile(x, probs=c(0.25,0.75))
  sd(x[between(x,lims[1],lims[2])])
}

Now, just apply the function to each of the AI_ columns, grouped by ID

df %>% group_by(ID) %>% summarize(across(starts_with("AI"), f))
langtang
  • 22,248
  • 1
  • 12
  • 27