0

I am analysing microbial communities by constrained ordination (RDA, CCA and CAP) using the tables with environmental variables (soil properties).

First block was 29 samples and 43 environmental variables. I used that code:

#Prokaryotes
#area

setwd("~/Cadmium/Cd_2022/Cd_R_2022")
Area.prok.spe <- read.delim ('Cadmium_Prok_otutab_area.txt', row.names = 1)
Area.prok.spe <- t(Area.prok.spe)
Area.prok.env <- read.delim ('Area_chem_prok.txt', row.names = 1)

# DCA
DCA <- decorana (log1p (Area.prok.spe))
DCA
# DCA1<3 => linear

#RDA
rda.area.prok <- rda (Area.prok.spe ~ ., data = Area.prok.env)
rda.area.prok
anova (rda.area.prok)
plot(rda.area.prok, type="text", xlim = c(- 5, 5), ylim = c(-10,10))
#No residual component(
ordistep(rda(Area.prok.spe ~ 1, data = Area.prok.env), scope=formula(rda.area.prok), direction="forward", pstep=1000)
ordistep.prok.A <- ordistep(rda(Area.prok.spe ~ 1, data = Area.prok.env), scope=formula(rda.area.prok), direction = "both", Pin = 0.05, Pout = 0.1, permutations = how(nperm = 999), steps = 50, trace = TRUE)
# look at the significant variables
ordistep.prok.A$anova
plot(ordistep.prok.A, type="text")
#Now we can calculate variations explained by individual fractions (using varpart function): 
varp <- varpart (Area.prok.spe,  ~ Feox, ~ Cat, ~ Cd, ~ Cdt, data = Area.prok.env)
varp
plot (varp, digits = 2, Xnames = c('Feox', 'Cat', 'Cd(CaCl2)', 'Cdt'), bg = c('navy', 'tomato', 'yellow', 'green'), cutoff = -1)

#CCA
Cd_cca_area <- cca(Area.prok.spe ~ ., Area.prok.env)
Cd_cca_area
anova.cca(Cd_cca_area, step=1000)
plot(Cd_cca_area, type="text")
ordistep.prok.A2 <- ordistep(cca(Area.prok.spe ~ 1, data=Area.prok.env), scope=formula(Area.prok.env), direction="forward", pstep=1000)
plot(ordistep.prok.A2, type="text")
ordistep.prok.A2$anova
varp.cca <- varpart (Area.prok.spe,  ~ Cdt, ~ K, ~ Be, ~ Cox, data = Area.prok.env)
varp.cca
plot (varp.cca, digits = 2, Xnames = c('Cdt', 'K (CaCl2)', 'Be(CaCl2)', 'Cox'), bg = c('navy', 'tomato', 'yellow', 'green'), cutoff = -1)


#CAP
cap_area <- capscale(Area.prok.spe ~ ., Area.prok.env, dist="bray")
Cd_cap_area
anova(Cd_cap_area)
plot(cap_area, type="text")
ordistep.prok.A3 <- ordistep(capscale(Area.prok.spe ~ 1, data=Area.prok.env), scope=formula(Cd_cap_area), direction="forward", pstep=1000)
ordistep.prok.A3$anova
plot(ordistep.prok.A3, type="text")
#anova(Cd_cap_area, by="axis", step=1000)
#anova(Cd_cap_area, by="terms", step=1000)
plot(capscale(Area.prok.spe ~ ., Area.prok.env, dist="bray"), type="text")

It worked fine, except that I always had 28 constrained axes and 0 unconstrained, so the ANOVA was not possible due to no residuals.

Then I used the very same code to analyse the second block of the data with 31 samples and 55 environmental variables.

#Prokaryotes
#Profile A

setwd("~/Cadmium/Cd_2022/Cd_R_2022")
A.prok.spe <- read.delim ('Cadmium_Prok_otutab_A.txt', row.names = 1)
A.prok.spe <- t(A.prok.spe)
A.prok.env <- read.delim ('Cd_chem_A3.txt', row.names = 1)

# DCA
DCA <- decorana (log1p (A.prok.spe))
DCA
# DCA1<3 => linear

#RDA
rda.all <- rda (A.prok.spe ~ ., data = A.prok.env)
rda.all
anova (rda.all, step=1000)


ordistep.prok.A <- ordistep(rda(A.prok.spe ~ 1, data = A.prok.env), scope=formula(rda.all), direction="forward", pstep=1000)
# look at the significant variables
ordistep.prok.A$anova
plot(ordistep.prok.A, type="text")
#Now we can calculate variations explained by individual fractions (using varpart function): 
varp <- varpart (A.prok.spe,  ~ Feox, ~ Cat, ~ Cd, ~ Cdt, data = A.prok.env)
varp
plot (varp, digits = 2, Xnames = c('Feox', 'Cat', 'Cd(CaCl2)', 'Cdt'), bg = c('navy', 'tomato', 'yellow', 'green'), cutoff = -1)
plot(rda.all, type="text")

#CCA
Cd_cca_prokA <- cca(A.prok.spe ~ ., A.prok.env)
Cd_cca_prokA
anova.cca(Cd_cca_prokA, step=1000)
ordistep.prok.A2 <- ordistep(cca(A.prok.spe ~ 1, data=A.prok.env), scope=formula(A.prok.env), direction="forward", pstep=1000)
plot(ordistep.prok.A2, type="text")
plot(Cd_cca_prokA, type="text")
ordistep.prok.A2$anova
varp.cca <- varpart (A.prok.spe,  ~ Nit, ~ Crt, ~ VWC, ~ Cu + Cut, data = A.prok.env)
varp.cca
plot (varp.cca, digits = 2, Xnames = c('Nit', 'Crt', 'VWC', 'Cut + Cu(CaCl2)'), bg = c('navy', 'tomato', 'yellow', 'green'), cutoff = -1)

#CAP
Cd_cap_A <- capscale(A.prok.spe ~ ., A.prok.env, dist="bray")
Cd_cap_A
anova(Cd_cap_A)
ordistep.prok.A3 <- ordistep(capscale(A.prok.spe ~ 1, data=A.prok.env, dist="bray"), scope=formula(Cd_cap_A), direction="forward", pstep=1000)
ordistep.prok.A3$anova
plot(ordistep.prok.A3, type="text")
plot(Cd_cap_A, type="text")
anova(Cd_cap_A, by="axis", step=1000)
anova_cap_A_terms <- anova(Cd_cap_A, by="terms", step=1000)
  1. I noticed that when plotting results, e.g. plot(Cd_cca_prokA), it showed vectors for only first 10 variables. In the first block, when I run plot(Cd_cca_area) it showed all the variables as the vectors on the biplot.
  2. When I run anova(Cd_cap_A, by="terms", step=1000) or anova(Cd_cca_prokA, by="terms", step=1000) it returned results only for the first 10 variables.
> anova(Cd_cca_prokA, by="terms", step=1000)
Permutation test for cca under reduced model
Terms added sequentially (first to last)
Permutation: free
Number of permutations: 999

Model: cca(formula = A.prok.spe ~ Alt + Ast + Bat + Cat + Cdt + Cot + Crt + Cut + Fet + Kt + Mgt + Mnt + Nat + Nit + Pbt + St + Sit + Tit + Znt + pH..H2O. + m.Cox. + Consumption.Cox + Cox + pH..BaCl2. + CEC + BS + Al.ox + Fe.ox + Mn.ox + Cd.ox + Al + Ba + Be + Cd + Cu + Fe + K + Mg + Mn + Na + Ni + Pb + Zn + DOC + PD + BD + TP + CP + SP + NP + VWC + GWC + RWC + VWHC + RWHC, data = A.prok.env)
         Df ChiSquare      F Pr(>F)  
Alt       1   0.11697 0.9187  0.659  
Ast       1   0.11982 0.9411  0.557  
Bat       1   0.14516 1.1401  0.170  
Cat       1   0.22292 1.7508  0.023 *
Cdt       1   0.12798 1.0052  0.398  
Cot       1   0.18957 1.4889  0.019 *
Crt       1   0.10791 0.8476  0.848  
Cut       1   0.10150 0.7972  0.889  
Fet       1   0.13003 1.0213  0.390  
Kt        1   0.10646 0.8361  0.887  
Residual 20   2.54640                
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
  1. Finally, the results of all 3 analyses were: 10 constrained and 20 unconstrained axes.

I started to suspect that only the first 10 environmental variables were used and the other 45 were just discarded!

What happened? I did all the same that with the first block. How to force R to use all the environmental variables?

Please help me.

Thank you.

1 Answers1

0

You do not give a reproducible case, but if you really have 31 sampling units (observations) and 55 predictors, you are overfitting your data. You cannot have more predictors than observations – or you can, but 31 random variables will completely explain 31 observations. Probably the problem is the same with your "only 10 first variables": these were enough to predict exactly your observations and the later ones were dumped (we call that "aliasing"). As a summary: the number of predictor variables cannot be higher than the number of observations. You need either more data or you need to reduce the number of your predictors.

Jari Oksanen
  • 3,287
  • 1
  • 11
  • 15
  • That is the answer on why I'm getting 0 unconstrained axes in the first block. But it is not the answer, why R is discarding all variables except first 10 in the second block, which is my question. – Екатерина Самойлова Sep 03 '22 at 21:12
  • It is the answer to both: if ten is all you need, ten is all you get. All except the needed ones are just ignored. I guess you got an informative message of "aliased variables" in the short output. Function `alias(..., names.only=TRUE)` will give you the names of the aliased variables. Aliasing means that these variables could only explain things that were already explained and so explain nothing. You need more data or fewer predictors. – Jari Oksanen Sep 03 '22 at 21:19
  • No. There is some mistake. In the first block were 45 variables, and all 45 were used. > I guess you got an informative message of "aliased variables" in the short output. No. I've got such a message in the first block. But not in the second. In the second block R just took first two variables "alphabeticaly". – Екатерина Самойлова Sep 03 '22 at 21:22
  • Maybe I should've mention, that ten variables taken were first 10 "alphabeticaly" first. Not the 10 most important. – Екатерина Самойлова Sep 03 '22 at 21:25
  • Yes, this is expected. If you have a small data set that can be explained by *any* 10 random variables, R will pick those any ten in alphabetical order. – Jari Oksanen Sep 04 '22 at 22:12