0

To plot a Sankey diagram, nodes and links are required. To get the nodes and links from a data frame, one could, for example, apply a count function from the plyr package to each pair of neighbouring columns to count the links between them, but is there a more elegant way?

Example input and target output; the aim is to get the nodes and links:

param1 | param2 | param3 |
a      | b      | d      |
w      | c      | d      |
a      | b      | d      |
z      | c      | e      |

# nodes: one row per distinct value; the row order defines the zero-based
# node index noted beside each entry
nodes <- data.frame(
  name = c(
    "a", # node 0
    "w", # node 1
    "z", # node 2
    "b", # node 3
    "c", # node 4
    "d", # node 5
    "e"  # node 6
  ),
  stringsAsFactors = FALSE
)

# links: one row per edge, given as (source node index, target node index,
# frequency); node indices are zero-based
links <- as.data.frame(matrix(c(
  0, 3, 2, # from node 0,  to node 3, freq
  1, 4, 1,
  2, 4, 1,
  3, 5, 2,
  4, 5, 1,
  4, 6, 1  # no trailing comma: an empty last argument makes c() fail
),
byrow = TRUE, ncol = 3))
zx8754
  • 52,746
  • 12
  • 114
  • 209
hkn
  • 123
  • 11

1 Answer

2

Using igraph package:

library(dplyr)
library(igraph)

# example data: the same three-column table as in the question,
# built column by column instead of parsed from text
df1 <- data.frame(
  param1 = c("a", "w", "a", "z"),
  param2 = c("b", "c", "b", "c"),
  param3 = c("d", "d", "d", "e"),
  stringsAsFactors = FALSE
)

# make graph: stack the (column 1, column 2) and (column 2, column 3) pairs
# into one from/to edge list, then hand it to igraph
g <- rbind(
  setNames(df1[1:2], c("from", "to")),
  setNames(df1[2:3], c("from", "to"))
) %>%
  graph_from_data_frame()


# node table: numeric vertex id next to the vertex label
nodes <- data.frame(
  id = as.numeric(V(g)),
  name = vertex_attr(g, "name")
)
nodes
#   id name
# 1  1    a
# 2  2    w
# 3  3    z
# 4  4    b
# 5  5    c
# 6  6    d
# 7  7    e

# link table: how often each (from, to) vertex pair occurs.
# ends(..., names = FALSE) yields one row per edge holding the numeric vertex
# ids (it supersedes the deprecated get.edges()); count() tallies duplicate
# pairs and returns an ungrouped result, so no group_by()/summarise() message.
# NOTE: the ids are 1-based; subtract 1 if a zero-based index is required,
# as in the question's target output.
links <- as.data.frame(ends(g, E(g), names = FALSE)) %>%
  count(V1, V2, name = "freq") %>%
  data.frame()

links
#   V1 V2 freq
# 1  1  4    2
# 2  2  5    1
# 3  3  5    1
# 4  4  6    2
# 5  5  6    1
# 6  5  7    1
zx8754
  • 52,746
  • 12
  • 114
  • 209
  • I have a further question: how can I avoid linking empty cells? It would be great if you could help, please. – hkn Jun 14 '18 at 08:29
  • @hkn please avoid asking new questions in the comments, [ask a new question](https://stackoverflow.com/questions/ask). – zx8754 Jun 14 '18 at 08:38