1

I want to convert a pairwise distance table (observations in 2 columns) into a table with individuals listed instead (observations in 1 column). Essentially, the information on pairwise relationships will be lost (which is irrelevant to my analysis anyway), and each distance value will need to be duplicated so that both individuals of a pair get their own row carrying that pair's distance.

I can separate the strings with this code:

# Break every "ind grp ind grp" string of the first column into its four
# space-separated fields, then append the pair's distance as a fifth column.
pairwise_readout <- data.frame(
  str_split_fixed(pairwise[, 1], " ", 4),
  pairwise$dist
)

But I have no idea how to continue rearranging the table into fewer columns. All search results only bring up solutions related to pairwise tables.

Here is an example dataset:

Important to note is that I'm also interested in the 'gr#' contained within the string for each observation.

# Example input: one row per pair. `ind_comp` holds
# "<individual 1> <group 1> <individual 2> <group 2>" and `dist` holds the
# distance between the two individuals.
pairwise <- data.frame(
  ind_comp = c(
    "OP2645ii_d gr3 OP5048___g gr2",
    "OP5046___e gr5 OP5048___g gr2",
    "OP2413iiia gr1 OP5048___g gr2",
    "OP5043___b gr1 OP5048___g gr2",
    "OP3088i___a gr1 OP5048___g gr2",
    "OP5046___a gr5 OP5048___g gr2",
    "OP5048___b gr5 OP5048___g gr2",
    "OP5043___a gr3 OP5048___g gr2",
    "OP2645ii_d gr3 OP5048___g gr2",
    "OP2645ii_d gr3 OP5044___c gr2",
    "OP2413iiib gr4 OP5048___g gr2",
    "OP5046___c gr1 OP5048___g gr2"
  ),
  dist = c(
    7.590363, 6.449676, 6.419955, 6.349918,
    6.182623, 6.162655, 6.154232, 6.140147,
    6.058633, 5.962923, 5.943956, 5.863753
  )
)

Essentially I want a table that follows this form:

# Desired output: one row per individual. Each pair of the input contributes
# two consecutive rows (first individual, then second individual), each with
# its own group label and the distance of the pair it came from.
pairwise_table_less_columns <- data.frame(
  ind_comp = c(
    "OP2645ii_d", "OP5048___g",
    "OP5046___e", "OP5048___g",
    "OP2413iiia", "OP5048___g",
    "OP5043___b", "OP5048___g",
    "OP3088i___a", "OP5048___g",
    "OP5046___a", "OP5048___g",
    "OP5048___b", "OP5048___g",
    "OP5043___a", "OP5048___g",
    "OP2645ii_d", "OP5048___g",
    "OP2645ii_d", "OP5044___c",
    "OP2413iiib", "OP5048___g",
    "OP5046___c", "OP5048___g"
  ),
  gr = c(
    "gr3", "gr2",
    "gr5", "gr2",
    "gr1", "gr2",
    "gr1", "gr2",
    "gr1", "gr2",
    "gr5", "gr2",
    "gr5", "gr2",
    "gr3", "gr2",
    "gr3", "gr2",
    "gr3", "gr2",
    "gr4", "gr2",
    "gr1", "gr2"
  ),
  # Every pair distance appears twice in a row, once per individual.
  dist = rep(
    c(
      7.590363, 6.449676, 6.419955, 6.349918,
      6.182623, 6.162655, 6.154232, 6.140147,
      6.058633, 5.962923, 5.943956, 5.863753
    ),
    each = 2
  )
)
Sotos
  • 51,121
  • 6
  • 32
  • 66

4 Answers

2

We can use dplyr and tidyr. First separate ind_comp into 4 different columns based on whitespace, gather it into long format, remove digits from key column so that they have same name, creating a common identifier using row_number() and then spread to wide format.

library(dplyr)
library(tidyr)

# Reshape the pairwise table so that every individual gets its own row.
#
# 1. separate() splits the "ind grp ind grp" string into four columns whose
#    names end in the within-pair position (_1 or _2).
# 2. pivot_longer() with the ".value" sentinel stacks the two positions while
#    keeping ind_comp and gr as separate columns, so each individual stays
#    attached to its own group and to the distance of its pair.
#
# This does in one step what gather() + spread() (both superseded in tidyr)
# needed a helper row id for. It also keeps the input row order instead of
# re-sorting by dist, and returns the columns as ind_comp, gr, dist.
pairwise %>%
  separate(
    ind_comp,
    into = c("ind_comp_1", "gr_1", "ind_comp_2", "gr_2"),
    sep = "\\s+"
  ) %>%
  pivot_longer(
    cols = -dist,
    names_to = c(".value", "position"),
    names_pattern = "^(.*)_(\\d+)$"
  ) %>%
  dplyr::select(ind_comp, gr, dist)


# Result: a 24 x 3 tibble with columns ind_comp, gr, dist; two consecutive
# rows per input pair. First rows (dist shown rounded, as a tibble prints it):
#   ind_comp    gr     dist
#   OP2645ii_d  gr3    7.59
#   OP5048___g  gr2    7.59
#   OP5046___e  gr5    6.45
#   OP5048___g  gr2    6.45
#   OP2413iiia  gr1    6.42
#   OP5048___g  gr2    6.42
# ... with 18 more rows
Ronak Shah
  • 377,200
  • 20
  • 156
  • 213
1

Here is a base R solution.
Split the dataframe pairwise_readout into two by columns then rbind them. There are intermediate steps to make sure the column names are equal and to order the result.

# Columns 1-2 describe the first individual of each pair, columns 3-4 the
# second one; column 5 (the distance) is shared by both halves. Giving both
# halves the same column names lets them be stacked with rbind().
tmp1 <- setNames(pairwise_readout[c(1, 2, 5)], c("ind_comp", "gr", "dist"))
tmp2 <- setNames(pairwise_readout[c(3, 4, 5)], c("ind_comp", "gr", "dist"))

# Tag each row with the index of the pair it came from, so that the two
# individuals of a pair end up next to each other after sorting.
tmp1$id <- seq_len(nrow(tmp1))
tmp2$id <- seq_len(nrow(tmp2))

tmp <- rbind(tmp1, tmp2)

# Sort by pair index and drop the helper id column.
result <- tmp[order(tmp$id), names(tmp) != "id"]

Final clean up.

# Drop the three intermediate data frames now that `result` has been built.
rm(list = c("tmp", "tmp1", "tmp2"))
Rui Barradas
  • 70,273
  • 8
  • 34
  • 66
1

Another idea is to replace the second space with another delimiter, and split on that, i.e.

library(dplyr)
library(tidyr)

# Replace the space that follows the first "individual group" half with a
# pipe, then let separate_rows() split on that pipe so each half lands on its
# own row.
pairwise %>%
  mutate(
    ind_comp = gsub(
      pattern = '([^ ]+ [^ ]+) ',
      replacement = '\\1|',
      x = ind_comp
    )
  ) %>%
  separate_rows(ind_comp, sep = '[|]')

which gives,

          ind_comp     dist
1   OP2645ii_d gr3 7.590363
2   OP5048___g gr2 7.590363
3   OP5046___e gr5 6.449676
4   OP5048___g gr2 6.449676
5   OP2413iiia gr1 6.419955
6   OP5048___g gr2 6.419955
7   OP5043___b gr1 6.349918
8   OP5048___g gr2 6.349918
9  OP3088i___a gr1 6.182623
10  OP5048___g gr2 6.182623
11  OP5046___a gr5 6.162655
12  OP5048___g gr2 6.162655
13  OP5048___b gr5 6.154232
14  OP5048___g gr2 6.154232
15  OP5043___a gr3 6.140147
16  OP5048___g gr2 6.140147
17  OP2645ii_d gr3 6.058633
18  OP5048___g gr2 6.058633
19  OP2645ii_d gr3 5.962923
20  OP5044___c gr2 5.962923
21  OP2413iiib gr4 5.943956
22  OP5048___g gr2 5.943956
23  OP5046___c gr1 5.863753
24  OP5048___g gr2 5.863753
Sotos
  • 51,121
  • 6
  • 32
  • 66
0

I'm late, but this would be my solution:

library("stringr") #For str_split

# Example data: each row describes one pair as "ind grp ind grp" plus the
# distance between the two individuals.
pairwise <- data.frame(
  ind_comp = c(
    "OP2645ii_d gr3 OP5048___g gr2",
    "OP5046___e gr5 OP5048___g gr2",
    "OP2413iiia gr1 OP5048___g gr2",
    "OP5043___b gr1 OP5048___g gr2",
    "OP3088i___a gr1 OP5048___g gr2",
    "OP5046___a gr5 OP5048___g gr2",
    "OP5048___b gr5 OP5048___g gr2",
    "OP5043___a gr3 OP5048___g gr2",
    "OP2645ii_d gr3 OP5048___g gr2",
    "OP2645ii_d gr3 OP5044___c gr2",
    "OP2413iiib gr4 OP5048___g gr2",
    "OP5046___c gr1 OP5048___g gr2"
  ),
  dist = c(
    7.590363, 6.449676, 6.419955, 6.349918,
    6.182623, 6.162655, 6.154232, 6.140147,
    6.058633, 5.962923, 5.943956, 5.863753
  )
)

# Split every pair string once into exactly four whitespace-separated fields:
# individual 1, its group, individual 2, its group. Splitting on the
# separator itself (rather than on a zero-width look-behind) keeps trailing
# spaces out of the ind_comp values and does not assume that a group label is
# two letters followed by a single digit.
fields <- str_split_fixed(as.character(pairwise$ind_comp), "\\s+", 4)

# Stack the first individuals on top of the second ones. Every pair
# contributes two rows that carry the same distance; the column order
# (ind_comp, dist, gr) is kept as before.
pairwise <- data.frame(
  ind_comp = c(fields[, 1], fields[, 3]),
  dist = rep(as.numeric(pairwise$dist), times = 2),
  gr = c(fields[, 2], fields[, 4]),
  stringsAsFactors = FALSE
)

rm(fields) # Removing the temporary field matrix

head(pairwise)
#      ind_comp     dist  gr
# 1  OP2645ii_d 7.590363 gr3
# 2  OP5046___e 6.449676 gr5
# 3  OP2413iiia 6.419955 gr1
# 4  OP5043___b 6.349918 gr1
# 5 OP3088i___a 6.182623 gr1
# 6  OP5046___a 6.162655 gr5
Dunois
  • 1,813
  • 9
  • 22