I have some data that was given to me in a format that I'm trying to get into a flat file for a customer. It is hierarchical data but it doesn't have all the data filled in and you can't just do an easy fill because of so many different sub levels involved. The numbers are always 4 digits and signify something specific.
This is a report that can go out to dozens of subgroups with thousands of rows of data.
Here is a sample in R:
L1 <- c("Main1", rep(NA, 21), "Main2", "Main3")
L2 <- c(NA, "Sub2_1", rep(NA, 22))
L3 <- c(NA, NA, "Sub3_1", rep(NA, 17), "Sub3_2", rep(NA, 3))
L4 <- c(rep(NA, 3), "Sub4_1", rep(NA, 9), "Sub4_2", rep(NA, 7), "0015", rep(NA, 2))
L5 <- c(rep(NA, 4), "Sub5_1", NA, NA, "Sub5_2", NA, "Sub5_3", rep(NA, 4), "Sub5_5", rep(NA, 9))
L6 <- c(rep(NA, 5), "1111", "2885", NA, "0001", NA, "Sub6_1", rep(NA, 4), "Sub6_2", rep(NA, 8))
L7 <- c(rep(NA, 11), "Sub7_1", rep(NA, 4), "Sub7_2", rep(NA, 7))
L8 <- c(rep(NA, 12), "0011", rep(NA, 4), "9494", "Sub8_1", rep(NA, 5))
L9 <- c(rep(NA, 19), "8479", rep(NA, 4))
df <- data.frame(L1, L2, L3, L4, L5, L6, L7, L8, L9)
And I want an output like this since the four digit "code" is what we really need to look up:
code_f <- c("1111", "2885", "0001", "0011", "9494", "8479", "0015", NA, NA)
L1_f <- c(rep("Main1", 7), "Main2", "Main3")
L2_f <- c(rep("Sub2_1", 7), NA, NA)
L3_f <- c(rep("Sub3_1", 6), "Sub3_2", NA, NA)
L4_f <- c(rep("Sub4_1", 4), rep("Sub4_2", 2), rep(NA, 3))
L5_f <- c(rep("Sub5_1", 2), "Sub5_2", "Sub5_3", rep("Sub5_5", 2), rep(NA, 3))
L6_f <- c(rep(NA, 3), "Sub6_1", rep("Sub6_3", 2), rep(NA, 3))
L7_f <- c(rep(NA, 3), "Sub7_1", rep("Sub7_2", 2), rep(NA, 3))
L8_f <- c(rep(NA, 5), "Sub8_1", rep(NA, 3))
df_f <- data.frame(code_f, L1_f, L2_f, L3_f, L4_f, L5_f, L6_f, L7_f, L8_f)