Quantiles and data interval for comparison

Question

I have a question regarding the use of quantiles for determining the envelope of a curve. This is what I am doing: I have a continuous variable ("var") that I turned into a discrete one using cut ("var_cut"), and a related variable derived from the continuous variable ("modvar"). I am plotting modvar~var_cut, and I want to get a sense of the variability of modvar. Here is the method I chose:

# Example data: `var` is the continuous variable and `modvar` is the related
# variable derived from it (100 observations each). The dput() output is
# assigned to `df` so that the code below can refer to it; without the
# assignment, `df` would resolve to the F-density function stats::df.
df <- structure(list(var = c(0.1968, 0.2263667, 0.1769, 0.2318, 0.2001333, 
    0.2382667, 0.2005, 0.2022667, 0.1699333, 0.2115667, 0.212, 0.2218667, 
    0.2327333, 0.2224333, 0.1690333, 0.1961333, 0.1756667, 0.2268333, 
    0.1938667, 0.1983, 0.1914333, 0.1745333, 0.2382, 0.2068333, 0.2509333, 
    0.221, 0.2075667, 0.2475333, 0.2463333, 0.2354, 0.2335, 0.2382, 
    0.2636667, 0.1829667, 0.2180333, 0.1703333, 0.2177333, 0.1932667, 
    0.2281, 0.1960667, 0.1975333, 0.1640333, 0.2021667, 0.2044333, 
    0.2124, 0.2267, 0.2202333, 0.1648667, 0.1898, 0.168, 0.2225, 
    0.1899667, 0.1966667, 0.183, 0.1678667, 0.2288333, 0.2006, 0.2389333, 
    0.2105, 0.2018667, 0.2457667, 0.2393333, 0.2286, 0.2280333, 0.2319, 
    0.2565333, 0.1838, 0.2189667, 0.1710667, 0.2184, 0.194, 0.2289333, 
    0.1968, 0.1984, 0.1646667, 0.2029667, 0.2053667, 0.2132333, 0.2274667, 
    0.2211, 0.1655333, 0.1907333, 0.1688333, 0.2234, 0.1908, 0.1975333, 
    0.1838333, 0.1686, 0.2297333, 0.2013667, 0.2397667, 0.2113333, 
    0.2027333, 0.2467333, 0.2402, 0.2295333, 0.2289333, 0.2328333, 
    0.2574333, 0.1795667), modvar = c(1.01575728698598, 0.978902741156023, 
    1.04056240429755, 0.972130196236979, 1.01160236751187, 0.964069530301364, 
    1.01114528024965, 1.00894310935747, 1.04924631438672, 0.997350768101313, 
    0.99681066471784, 0.984511938538037, 0.97096684869995, 0.983805678263226, 
    1.05036815386312, 1.01658832074033, 1.04209969832671, 0.978321129711924, 
    1.01941361113723, 1.0138875545253, 1.02244681578377, 1.04351246817399, 
    0.964152671071449, 1.00325089585421, 0.948280761510472, 0.985592269953813, 
    1.00233672132977, 0.95251882175466, 0.954014607723197, 0.967642838331369, 
    0.970011166114885, 0.964152671071449, 0.932408727300664, 1.03300033368478, 
    0.989290226814528, 1.04874771906387, 0.989664173306662, 1.0201615041215, 
    0.976742202973302, 1.01667133686158, 1.01484323711037, 1.05660059539869, 
    1.00906775818819, 1.00624246779128, 0.996312069394995, 0.978487286603262, 
    0.986547952538877, 1.05556177204354, 1.02448270513577, 1.05165615023086, 
    0.983722537493141, 1.02427491553498, 1.01592344387731, 1.03295882562415, 
    1.0518223071222, 0.975828153097695, 1.01102063141894, 0.963238621195842, 
    0.998680397178511, 1.00944170468032, 0.954720867998008, 0.962740025872996, 
    0.976118958819745, 0.976825343743386, 0.972005547406268, 0.941300426990633, 
    1.03196163497846, 0.988126754628668, 1.04783354453944, 0.988833139552309, 
    1.0192474542459, 0.975703504266984, 1.01575728698598, 1.01376290569459, 
    1.05581106970497, 1.00807056754249, 1.00507899560542, 0.995273370688676, 
    0.977531604018197, 0.985467621123101, 1.05473086293802, 1.02331935759875, 
    1.05061745152455, 0.982600698016739, 1.02323621682866, 1.01484323711037, 
    1.03192012691783, 1.0509082572466, 0.974706313621292, 1.01006494883388, 
    0.962199797840693, 0.997641698472193, 1.00836149791338, 0.953516012400351, 
    0.96165969445722, 0.974955611282715, 0.975703504266984, 0.970842199869238, 
    0.94017858751423, 1.03723839392897)), .Names = c("var", "modvar"
    ), row.names = c(NA, 100L), class = "data.frame")



# Calculation of the discrete variable: bin `var` at its deciles and label the
# bins "1".."10" (lowest decile first). The breaks are computed once and the
# labels are derived from the number of intervals, so cut() no longer has to be
# run a first time just to count its levels.
decile_breaks <- quantile(df$var, probs = (0:10) / 10)
df$var_cut <- cut(
  df$var,
  breaks = decile_breaks,
  include.lowest = TRUE,  # keep the minimum of `var` inside the first bin
  labels = seq_len(length(decile_breaks) - 1L)
)

# Lower boundary of modvar: for every row, the 5% quantile of `modvar` taken
# over all rows that share that row's `var_cut` bin. ave() evaluates the
# summary once per bin and spreads it back over the rows in their original
# order, so this works for any number of bins rather than exactly ten.
df$lowervar <- ave(
  df$modvar,
  df$var_cut,
  FUN = function(x) quantile(x, probs = 0.05, na.rm = TRUE)
)
# ave() leaves rows with a missing bin untouched; report NA for them instead.
df$lowervar[is.na(df$var_cut)] <- NA

# Upper boundary of modvar: for every row, the 95% quantile of `modvar` taken
# over all rows that share that row's `var_cut` bin. ave() evaluates the
# summary once per bin and spreads it back over the rows in their original
# order, so this works for any number of bins rather than exactly ten.
df$uppervar <- ave(
  df$modvar,
  df$var_cut,
  FUN = function(x) quantile(x, probs = 0.95, na.rm = TRUE)
)
# ave() leaves rows with a missing bin untouched; report NA for them instead.
df$uppervar[is.na(df$var_cut)] <- NA

I have been working under the assumption that by using the "probs" argument of the quantile function, I can obtain an envelope for my modvar curve that is similar to a confidence interval, specifying the lower and upper bounds to correspond respectively to the 0.05 and 0.95 probabilities in the quantile function.

Would you say this is an acceptable method? What about comparing two different curves this way? I would like to check for overlaps between different variables, for example. What I would be doing is plotting the same modvar~var_cut but for different databases, and checking whether the lower and upper bounds of the respective curves overlap.

I hope you can help me, thank you in advance!

I've used a procedure such as you describe to track temporal trends in the distribution of variables, and to analyse interactions in the distributions of two variables. Seems perfectly reasonable to me. That said, I don't really see a coding question here, since it sounds as though you are not getting any errors. It's not really "comparing two different curves" but is rather comparing the conditional distribution of one variable within ordered categorical subsets of a second variable. Voting to migrate to CV.com. — IRTFM, Apr 01 '15 at 20:41
I typically look at the results with `matplot`. Try: `with( df, matplot(var_cut, df[4:5]))` — IRTFM, Apr 01 '15 at 20:46
@BondedDust thanks for the answer, I posted it in CV.com as well! And I'll try matplot for good measure! — Chris. Z, Apr 03 '15 at 17:19

Quantiles and data interval for comparison

0 Answers0