It is a bit hard to answer your question without a reproducible example or some information about your computer. Below is my suggestion.
Some dummy data resembling your dataset:
# Create a dummy example
set.seed(1)
dummy_mat <- matrix(rnorm(n=9*9), ncol = 9, nrow = 9)
diag(dummy_mat) <- 0
dummy_mat[lower.tri(dummy_mat)] <- dummy_mat[upper.tri(dummy_mat)]
# Create col and row names
x <- paste0("Genus", LETTERS[1:3])
y <- paste0(" species", 1:3)
set.seed(1)
ids <- paste0(sample(LETTERS, 9, replace = T), sample(1000:9999, 9))
nms <- paste0(ids, "|", do.call(paste0, expand.grid(x, y)))
colnames(dummy_mat) <- nms
rownames(dummy_mat) <- nms
This is the output:
> dummy_mat
Y9788|GenusA species1 D2300|GenusB species1 G9521|GenusC species1 A2798|GenusA species2 B9228|GenusB species2 W2128|GenusC species2 K9920|GenusA species3 N1877|GenusB species3 R9676|GenusC species3
Y9788|GenusA species1 0.0000000 -0.3053884 0.8212212 -1.47075238 -0.3942900 -0.7074952 1.433023702 0.02800216 0.610726353
D2300|GenusB species1 -0.3053884 0.0000000 0.5939013 -0.47815006 -0.0593134 0.3645820 1.980399899 -0.74327321 -0.934097632
G9521|GenusC species1 0.8212212 1.1000254 0.0000000 0.41794156 1.1000254 0.7685329 -0.367221476 0.18879230 -1.253633400
A2798|GenusA species2 0.5939013 0.7631757 1.4330237 0.00000000 0.7631757 -0.1123462 -1.044134626 -1.80495863 0.291446236
B9228|GenusB species2 -1.4707524 -0.7074952 1.9803999 0.02800216 0.0000000 0.8811077 0.569719627 1.46555486 -0.443291873
W2128|GenusC species2 -0.4781501 0.3645820 -0.3672215 -0.74327321 0.1532533 0.0000000 -0.135054604 0.15325334 0.001105352
K9920|GenusA species3 0.4179416 0.7685329 -1.0441346 0.18879230 2.1726117 -1.2536334 0.000000000 2.17261167 0.074341324
N1877|GenusB species3 -0.3942900 -0.1123462 0.5697196 -1.80495863 0.6107264 0.2914462 0.001105352 0.00000000 -0.589520946
R9676|GenusC species3 -0.0593134 0.8811077 -0.1350546 1.46555486 -0.9340976 -0.4432919 0.074341324 -0.58952095 0.000000000
Now we can try to answer your question.
1] First I would suggest to list all unique genera you have in your dataset. I will use the library stringr
(but it can also be done in base R) and the fact that all the genera are delimited by "|"
and " "
.
# Identify all genera you have
nms <- colnames(dummy_mat)
library(stringr)
nms <- stringr::str_extract(nms, "(?<=\\|).*(?= )") # delimiters of the Genus "|" and " " (whitespace)
nms <- unique(nms)
2] Now we can create a data.frame that will store all your average distances (preallocation is always a good thing when dealing with large datasets)
# Create a data frame to pre allocate memory
df <- data.frame(genus = nms, meanDist = rep(as.numeric(NA), length(nms)))
> df
genus meanDist
1 GenusA NA
2 GenusB NA
3 GenusC NA
3] Compute the average distance. Within each iteration we only need the upper triangular matrix (diagonal is always 0, lower triangular is equal to upper triangular matrix). Instead of recreating new data.frames at each iteration, we will try to subset the existing matrix.
for(n in seq_along(df$genus)){
pattern <- df$genus[n]
ind <- grep(pattern, colnames(dummy_mat))
m <- dummy_mat[ind, ind]
df$meanDist[n] <- mean(m[upper.tri(m)])
}
4] Our results
> df
genus meanDist
1 GenusA -0.3606211
2 GenusB 0.2209894
3 GenusC -0.1613317
Let me know if this solve your problem.
UPDATE
Using the profiling tool provided by the library profvis
we can see that te loop itself doesn't use up much memory even with a distance matrix 10.000x10.000
library(profvis)
profvis({
# Create a dummy example
set.seed(1)
dummy_mat <- matrix(rnorm(n=10000*10000), ncol = 10000, nrow = 10000)
diag(dummy_mat) <- 0
dummy_mat[lower.tri(dummy_mat)] <- dummy_mat[upper.tri(dummy_mat)]
# Create col and row names
x <- paste0("Genus", LETTERS[1:10])
y <- paste0(" species", 1:1000)
set.seed(1)
ids <- paste0(sample(LETTERS, 9, replace = T), sample(1000:9999, 9))
nms <- paste0(ids, "|", do.call(paste0, expand.grid(x, y)))
colnames(dummy_mat) <- nms
rownames(dummy_mat) <- nms
# Identify all genera you have
nms <- colnames(dummy_mat)
library(stringr)
nms <- stringr::str_extract(nms, "(?<=\\|).*(?= )") # delimiters of the Genus "|" and " " (whitespace)
nms <- unique(nms)
# Create a data frame to pre allocate memory
df <- data.frame(genus = nms, meanDist = rep(as.numeric(NA), length(nms)))
# Compute mean distances
for(n in seq_along(df$genus)){
pattern <- df$genus[n]
ind <- grep(pattern, colnames(dummy_mat))
m <- dummy_mat[ind, ind]
df$meanDist[n] <- mean(m[upper.tri(m)])
}
}) #profvis end
