1

I am using the following code to obtain the means of all possible combinations (m = 2) of the variables whose names start with "form".

# Locate the columns whose names begin with "form".
k <- grep("^form", colnames(data))
# Row means for every pair of "form" columns, one output column per pair.
combined <- combn(data[, k], m = 2, FUN = rowMeans)
# Label each new column by concatenating the names of the pair it averages.
colnames(combined) <- combn(
  names(data[, k]),
  m = 2,
  FUN = paste0,
  collapse = ""
)
# Append the pairwise means to the original data.
data <- cbind(data, combined)

The dataset "data" is the following:

structure(list(id = c(5309039, 5284969, 5300279, 5270289, 5259957, 
5267086, 5173196, 5057536, 5246135, 5255558, 5241070, 5280194, 
5112387, 444459, 5054590, 5048412, 5296390, 5093742, 5293520), 
    form13 = c(1300.81321145176, 1130.23869905075, 1292.03253463863, 
    1358.23586808642, 1250.66417156907, 1388.37813595599, 1277.89625553694, 
    1242.17552321015, 1275.95068420011, 1449.97932094858, 1494.93158409261, 
    1183.72005024492, 1319.72081010904, 1153.43556746197, 1451.47500658524, 
    1502.05308533551, 1641.66472289938, 1407.07852441646, 1444.3815517771
    ), form12 = c(1329.6, 1104.4, 1272, 1322.8, 1195.5, 1487.4, 
    1195.6, 1258, 1256.4, 1455, 1524, 1170, 1291.4, 1224.6, 1414, 
    1606, 1765.2, 1441, 1406.8), form11 = c(1325.578, 1201.752, 
    1346.42, 1424.884, 1328.03, 1367.262, 1294.928, 1278.99, 
    1330.482, 1493.54, 1524.19, 1242.21, 1379.522, 1178.458, 
    1438.37, 1475.15, 1611.236, 1426.11, 1431.014), form10 = c(1056.7264, 
    940.4956, 1076.29, 1149.9412, 1059.028, 1095.8536, 1027.9564, 
    1012.996, 1061.3296, 1214.386, 1243.156, 978.472, 1107.3616, 
    918.6304, 1162.6, 1197.124, 1324.8628, 1151.092, 1155.6952
    ), form9 = c(1265.95883621535, 1104.13796282321, 1292.61038190038, 
    1391.60226122629, 1269.10247448997, 1319.10781736395, 1226.47462059388, 
    1205.80097696249, 1272.24391797013, 1476.61400008329, 1514.11964245256, 
    1157.70450530205, 1334.62450699242, 1072.96302932, 1408.41424685422, 
    1453.98138963552, 1619.24856353662, 1393.1329826012, 1399.25113387699
    ), form8 = c(1482.14960970768, 1302.96011430734, 1455.11530997823, 
    1507.60187999797, 1403.62372119021, 1590.3115445541, 1392.70107590683, 
    1422.72772811208, 1440.68241714823, 1606.14610155669, 1656.53381495283, 
    1357.47229571355, 1476.63693689195, 1356.28387443873, 1567.80354390345, 
    1697.01564123702, 1829.93948069795, 1581.30521692185, 1561.45650301116
    ), form7 = c(1444.56088362196, 1256.09569669502, 1416.12716131828, 
    1471.33068319787, 1361.97012558123, 1558.32178921338, 1350.4820727773, 
    1382.06304580259, 1400.94715403591, 1574.97601740197, 1627.97203596215, 
    1313.42968513872, 1438.7628489193, 1312.17974558614, 1534.64866852904, 
    1670.54939207752, 1810.35399499291, 1548.84925168016, 1527.97307493173
    ), form6 = c(1199.39256844313, 1030.51525282711, 1173.91406615889, 
    1223.38008553142, 1125.38576782367, 1301.32988998026, 1115.09171006788, 
    1143.39035787661, 1160.31177216137, 1316.25318375141, 1363.74113364133, 
    1081.8903116367, 1194.19714454337, 1080.77028284113, 1280.11720270038, 
    1401.89327051093, 1527.16747332837, 1292.84186767351, 1274.13542778885
    ), form5 = c(1297.78687926793, 1159.12885718351, 1290.6491699916, 
    1344.46508388198, 1257.02131246849, 1368.96738018114, 1239.89545043121, 
    1250.12098970015, 1277.57642224122, 1419.04226152712, 1455.58342941928, 
    1202.60322079507, 1313.15664462902, 1177.98531965952, 1380.99558290387, 
    1461.37241431927, 1574.8610783177, 1384.16870680163, 1375.22939662201
    ), form4 = c(1335.97776730397, 1108.36308048125, 1324.2608292059, 
    1412.60257966574, 1269.05887158687, 1452.82443206729, 1240.94583733479, 
    1257.73161635649, 1302.80120256198, 1535.02507407783, 1595.00938916382, 
    1179.7286135352, 1361.20807332313, 1139.31698950533, 1472.56938122075, 
    1604.51232282192, 1790.81013902909, 1477.77823673001, 1463.10387273464
    ), form3 = c(1354.228, 1167.277, 1385.695, 1504.159, 1357.93, 
    1417.162, 1307.953, 1283.89, 1361.632, 1607.815, 1654.09, 
    1228.36, 1435.672, 1132.108, 1524.52, 1580.05, 1785.511, 
    1506.01, 1513.414), form2 = c(2275.7324829005, 1960.23260237236, 
    2259.163108513, 2384.94888103794, 2181.57337654262, 2442.86896126772, 
    2142.36120747078, 2165.7494001933, 2228.9072421228, 2562.48497832825, 
    2650.8148703194, 2057.68931533889, 2311.5302827576, 2002.33637794664, 
    2471.44922673607, 2664.88828208925, 2945.12448823488, 2479.00498842122, 
    2457.73611045874), form1 = c(1180.88828860349, 1056.82591443514, 
    1162.17101167316, 1198.5102427986, 1126.52065872992, 1255.77452231775, 
    1118.95833314255, 1139.74737411054, 1152.17835587263, 1266.73762443072, 
    1301.62370599969, 1094.56758356167, 1177.07157336578, 1093.7447765967, 
    1240.19104186727, 1329.65141749175, 1421.68162869499, 1249.53896489237, 
    1235.79664943772)), row.names = c(NA, -19L), class = c("tbl_df", 
"tbl", "data.frame"))
> 

The code works well, and I am now trying to extend it to take all possible combinations with m from 2 to 8. I've tried the following code, but it doesn't work.

x<-2:8
k=which(grepl("^form",colnames(data)))
# NOTE: combn() requires a single value for m. seq_along(x) is the vector
# 1:7 (the positions of x, not its values 2:8), so both combn() calls below
# stop with "length(m) == 1L is not TRUE".
combined <- combn(data[,k], seq_along(x), FUN = rowMeans)
colnames(combined) <- combn(names(data[,k]), seq_along(x), paste0, collapse="")
data <- cbind(data, combined)

as I get the following error:

> x<-2:8
> k=which(grepl("^form",colnames(data)))
> combined <- combn(data[,k], seq_along(x), FUN = rowMeans)
**Error in combn(data[, k], seq_along(x), FUN = rowMeans) : 
  length(m) == 1L is not TRUE**
> colnames(combined) <- combn(names(data[,k]), seq_along(x), paste0, collapse="")
**Error in combn(names(data[, k]), seq_along(x), paste0, collapse = "") : 
  length(m) == 1L is not TRUE**
> data <- cbind(data, combined)

Where am I wrong?

Also, I would like to add the prefix "comb_" to the names of all generated variables. How should I modify the above code?

Thank you!

  • 1
    When you get errors with your code, it is really helpful to include the verbatim text of the error. If you instead get results that are not consistent with the data or your expectations, it is helpful to include the output, and either/both of how it is wrong and what the expected output should be. Since we don't have your `data` (bad practice to use that as a variable name, btw), it's hard to reproduce anything from this locally. – r2evans Dec 21 '19 at 20:12
  • 1
    Thank you r2evans! I have edited the post following your suggestions! – Mariano C Giglio Dec 21 '19 at 20:45

3 Answers

2

The reason is simply that combn only takes one m at a time. Just use sapply to iterate over the ms. To set the column names in the same step we can use `colnames<-`(): `colnames<-`(x, names) is equivalent to colnames(x) <- names, but it is an ordinary function call with everything on the right-hand side, so it can be nested inside a larger expression. The "form" prefixes can be removed with gsub.

# Positions of the "form" columns (columns 2:14 in the example data); looking
# them up by name keeps the code working if the column order changes.
k <- grep("^form", colnames(data))
# One matrix of row means per combination size m. simplify = FALSE makes
# sapply() always return a list, even when only a single m is requested, so
# the result can safely be passed to do.call(cbind, ...).
combined.2.lst <- sapply(2:8, function(m) {
  # Column labels: "comb." followed by the form numbers joined with dots.
  nms <- combn(names(data[, k]), m, function(x) {
    paste0("comb.", paste0(gsub("form", "", x), collapse = "."))
  })
  `colnames<-`(combn(data[, k], m, rowMeans), nms)
}, simplify = FALSE)

This gives you a list which then can be cbinded.

# Bind the per-size matrices side by side into one wide matrix.
combined.2 <- do.call(cbind, combined.2.lst)
# 19 rows; one column per combination of sizes 2 through 8.
dim(combined.2)
# [1]   19 7085

Result

# Print the first five rows of six of the generated columns; the commented
# lines below are the printed output.
combined.2[1:5, c(1, 50, 100, 500, 1000, 5000)]  # example columns
#      comb.13.12 comb.9.1 comb.13.10.9 comb.13.10.2.1 comb.9.5.4.3 comb.13.7.6.5.4.3.2
# [1,]   1315.207 1223.424     1207.833       1453.540     1313.488            1458.356
# [2,]   1117.319 1080.482     1058.291       1271.948     1134.727            1258.836
# [3,]   1282.016 1227.391     1220.311       1447.414     1323.304            1448.835
# [4,]   1340.518 1295.056     1299.926       1522.909     1413.207            1528.446
# [5,]   1223.082 1197.812     1192.932       1404.447     1288.278            1400.515

Finally just use cbind(data, combined.2).

jay.sf
  • 60,139
  • 8
  • 53
  • 110
1

The function combn can only take a single value for the number of elements per combination, so you need to use lapply and finally combine the results with do.call(cbind, ...):

First we define the function for combination x:

# Row-wise means for every size-`x` combination of the columns of `DATA`.
#
# x      : a single number, how many columns go into each combination.
# DATA   : a data frame (or tibble) of numeric columns.
# prefix : optional string prepended to every generated column name; the
#          default "" keeps the plain concatenated names.
#
# Returns a numeric matrix with one row per row of `DATA` and one column per
# combination, named by pasting together the names of the combined columns.
func <- function(x, DATA, prefix = "") {
  mat <- combn(DATA, x, FUN = rowMeans)
  # combn() collapses its result to a 1-d array when DATA has a single row;
  # restore the matrix shape so that column names can be assigned.
  mat <- matrix(mat, nrow = nrow(DATA))
  colnames(mat) <- paste0(
    prefix,
    combn(names(DATA), x, paste0, collapse = "")
  )
  mat
}

Then we iterate:

# Indices of the columns whose names start with "form".
k <- grep("^form", colnames(data))
# Build one matrix of row means per combination size (2 through 8) and bind
# them side by side into a single matrix.
combined <- do.call(cbind, lapply(2:8, func, DATA = data[, k]))

If you are familiar with purrr, you can also do:

library(purrr)
library(dplyr)

# Same computation with purrr: one tibble of row means per combination size,
# bound column-wise. as_tibble() replaces the deprecated as.tibble().
combined <- 2:8 %>%
  map(~ as_tibble(func(.x, DATA = data[, k]))) %>%
  bind_cols()
StupidWolf
  • 45,075
  • 17
  • 40
  • 72
1

You need to iterate over m <- 2:8, using lapply() or sapply(). I tried to keep the main structure of your code and make minimal changes to make it work:

m <- 2:8
# Indices of the columns whose names start with "form".
k <- grep("^form", colnames(data))
# combn() accepts one combination size at a time, so compute the row means
# separately for each size and bind the resulting matrices column-wise in a
# single cbind() call.
combined <- do.call(
  cbind,
  lapply(m, function(size) combn(data[, k], size, FUN = rowMeans))
)
# Build the matching labels in the same order. lapply() always returns a
# list, so unlist() yields a plain character vector for any choice of m.
colnames(combined) <- unlist(
  lapply(m, function(size) {
    combn(names(data[, k]), size, paste0, collapse = "")
  })
)
data <- cbind(data, combined)
ThomasIsCoding
  • 96,636
  • 9
  • 24
  • 81