The solution became obvious upon further reflection.
Instead of using all observations (200k) in the long format, I made longitude and depth of sampling into one variable, used like sampling units along a transect. Thus, ending up with 3800 columns of longitude - depth combinations, and 61 rows for the taxa, with the value variable being the abundance of the taxa (If you want to cluster sampling units then you have to transpose the df). This is then feasible for hclust or SIMPROF since now the quadratic complexity only applies to 61 rows (as opposed to ~200k as I tried at the beginning).
Cheers
Here is some code:
library(reshape2)
library(dplyr)
d4<-d4 %>% na.omit() %>% arrange(desc(LONGITUDE_DEC))
# make 1 variable of longitude and depth that can be used for all taxa measured, like
#community ecology sampling units
d4$sampling_units<-paste(d4$LONGITUDE_DEC,d4$BIN_MIDDEPTH_M)
d5<-d4 %>% select(PREDICTED_GROUP,CONCENTRATION_IND_M3,sampling_units)
d5<-d5%>%na.omit()
# dcast data frame so that you get the taxa as rows, sampling units as columns w
# concentration/abundance as values.
d6<-dcast(d5,PREDICTED_GROUP ~ sampling_units, value.var = "CONCENTRATION_IND_M3")
d7<-d6 %>% na.omit()
d7$PREDICTED_GROUP<-as.factor(d7$PREDICTED_GROUP)
# give the rownames the taxa names
rownames(d7)<-paste(d7$PREDICTED_GROUP)
#delete that variable that is no longer needed
d7$PREDICTED_GROUP<-NULL
library(vegan)
# calculate the dissimilarity matrix with vegdist so you can use the sorenson/bray
#method
distBray <- vegdist(d7, method = "bray")
# calculate the clusters with ward.D2
clust1 <- hclust(distBray, method = "ward.D2")
clust1
#plot the cluster dendrogram with dendextend
library(dendextend)
library(ggdendro)
library(ggplot2)
dend <- clust1 %>% as.dendrogram %>%
set("branches_k_color", k = 5) %>% set("branches_lwd", 0.5) %>% set("clear_leaves") %>% set("labels_colors", k = 5) %>% set("leaves_cex", 0.5) %>%
set("labels_cex", 0.5)
ggd1 <- as.ggdend(dend)
ggplot(ggd1, horiz = TRUE)