1

I imported a json file using rjson and converted it to a data.frame but all of the data is spread widthwise and the column names contain the key information.

stations <- fromJSON(file = "station_information.json")
test <- as.data.frame(stations[3])

What this looks like is:

> dim(test)
[1]    2 5985

> test[1:27]
  data.stations.station_id data.stations.name data.stations.short_name
1                       72   W 52 St & 11 Ave                  6926.01
2                       72   W 52 St & 11 Ave                  6926.01
  data.stations.lat data.stations.lon data.stations.region_id
1          40.76727         -73.99393                      71
2          40.76727         -73.99393                      71
  data.stations.rental_methods data.stations.capacity
1                          KEY                     39
2                   CREDITCARD                     39
  data.stations.eightd_has_key_dispenser data.stations.station_id.1
1                                  FALSE                         79
2                                  FALSE                         79
      data.stations.name.1 data.stations.short_name.1 data.stations.lat.1
1 Franklin St & W Broadway                    5430.08            40.71912
2 Franklin St & W Broadway                    5430.08            40.71912
  data.stations.lon.1 data.stations.region_id.1 data.stations.rental_methods.1
1           -74.00667                        71                            KEY
2           -74.00667                        71                     CREDITCARD
  data.stations.capacity.1 data.stations.eightd_has_key_dispenser.1
1                       33                                    FALSE
2                       33                                    FALSE
  data.stations.station_id.2   data.stations.name.2 data.stations.short_name.2
1                         82 St James Pl & Pearl St                    5167.06
2                         82 St James Pl & Pearl St                    5167.06
  data.stations.lat.2 data.stations.lon.2 data.stations.region_id.2
1            40.71117           -74.00017                        71
2            40.71117           -74.00017                        71
  data.stations.rental_methods.2 data.stations.capacity.2
1                            KEY                       27
2                     CREDITCARD                       27
  data.stations.eightd_has_key_dispenser.2
1                                    FALSE
2                                    FALSE

As you can see, this cannot be fixed with a simple transpose t() or melt() solution. I am wondering what I am doing wrong on import or conversion to data.frame which leaves me with a dataframe stretched with the index of what should be rows appended to the column names.

I have tried both of these methods but I am left with the same stretched data:

plyr::ldply(stations, data.frame)

do.call(rbind, lapply(stations, data.frame, stringsAsFactors=FALSE))

In the end I would like my output to look like every 9 columns were "cut" and stacked on to the first 9 - so that I am left with 655 rows and 9 columns Any suggestions would be appreciated.

NOTE: I am taking the JSON directly from this link (it is not a large file)

Here is a reproducible example of the first 27 columns, which should be reshaped to a 9 x 3 dataframe:

> dput(df)
structure(list(data.stations.station_id = structure(c(1L, 1L), class = "factor", .Label = "72"), 
    data.stations.name = structure(c(1L, 1L), class = "factor", .Label = "W 52 St & 11 Ave"), 
    data.stations.short_name = structure(c(1L, 1L), class = "factor", .Label = "6926.01"), 
    data.stations.lat = c(40.76727216, 40.76727216), data.stations.lon = c(-73.99392888, 
    -73.99392888), data.stations.region_id = c(71, 71), data.stations.rental_methods = structure(c(2L, 
    1L), .Label = c("CREDITCARD", "KEY"), class = "factor"), 
    data.stations.capacity = c(39, 39), data.stations.eightd_has_key_dispenser = c(FALSE, 
    FALSE), data.stations.station_id.1 = structure(c(1L, 1L), class = "factor", .Label = "79"), 
    data.stations.name.1 = structure(c(1L, 1L), class = "factor", .Label = "Franklin St & W Broadway"), 
    data.stations.short_name.1 = structure(c(1L, 1L), class = "factor", .Label = "5430.08"), 
    data.stations.lat.1 = c(40.71911552, 40.71911552), data.stations.lon.1 = c(-74.00666661, 
    -74.00666661), data.stations.region_id.1 = c(71, 71), data.stations.rental_methods.1 = structure(c(2L, 
    1L), .Label = c("CREDITCARD", "KEY"), class = "factor"), 
    data.stations.capacity.1 = c(33, 33), data.stations.eightd_has_key_dispenser.1 = c(FALSE, 
    FALSE), data.stations.station_id.2 = structure(c(1L, 1L), class = "factor", .Label = "82"), 
    data.stations.name.2 = structure(c(1L, 1L), class = "factor", .Label = "St James Pl & Pearl St"), 
    data.stations.short_name.2 = structure(c(1L, 1L), class = "factor", .Label = "5167.06"), 
    data.stations.lat.2 = c(40.71117416, 40.71117416), data.stations.lon.2 = c(-74.00016545, 
    -74.00016545), data.stations.region_id.2 = c(71, 71), data.stations.rental_methods.2 = structure(c(2L, 
    1L), .Label = c("CREDITCARD", "KEY"), class = "factor"), 
    data.stations.capacity.2 = c(27, 27), data.stations.eightd_has_key_dispenser.2 = c(FALSE, 
    FALSE)), .Names = c("data.stations.station_id", "data.stations.name", 
"data.stations.short_name", "data.stations.lat", "data.stations.lon", 
"data.stations.region_id", "data.stations.rental_methods", "data.stations.capacity", 
"data.stations.eightd_has_key_dispenser", "data.stations.station_id.1", 
"data.stations.name.1", "data.stations.short_name.1", "data.stations.lat.1", 
"data.stations.lon.1", "data.stations.region_id.1", "data.stations.rental_methods.1", 
"data.stations.capacity.1", "data.stations.eightd_has_key_dispenser.1", 
"data.stations.station_id.2", "data.stations.name.2", "data.stations.short_name.2", 
"data.stations.lat.2", "data.stations.lon.2", "data.stations.region_id.2", 
"data.stations.rental_methods.2", "data.stations.capacity.2", 
"data.stations.eightd_has_key_dispenser.2"), row.names = c(NA, 
-2L), class = "data.frame")

So the output structure should look like this (obviously with the values not NA). Each row represents the appended index number of the original dataframe's column names

> output
  data.stations.station_id data.stations.name data.stations.short_name
1                       NA                 NA                       NA
2                       NA                 NA                       NA
3                       NA                 NA                       NA
  data.stations.lat data.stations.lon data.stations.region_id
1                NA                NA                      NA
2                NA                NA                      NA
3                NA                NA                      NA
  data.stations.rental_methods data.stations.capacity
1                           NA                     NA
2                           NA                     NA
3                           NA                     NA
  data.stations.eightd_has_key_dispenser
1                                     NA
2                                     NA
3                                     NA
iskandarblue
  • 7,208
  • 15
  • 60
  • 130

2 Answers2

1

I would try:

library(data.table)
rbindlist(lapply(split(seq_along(df), c(0, (seq_along(df)%/%9)[-length(df)])), 
    function(x) df[, x]), use.names = FALSE)
##    data.stations.station_id       data.stations.name data.stations.short_name data.stations.lat
## 1:                       72         W 52 St & 11 Ave                  6926.01          40.76727
## 2:                       72         W 52 St & 11 Ave                  6926.01          40.76727
## 3:                       79 Franklin St & W Broadway                  5430.08          40.71912
## 4:                       79 Franklin St & W Broadway                  5430.08          40.71912
## 5:                       82   St James Pl & Pearl St                  5167.06          40.71117
## 6:                       82   St James Pl & Pearl St                  5167.06          40.71117
##    data.stations.lon data.stations.region_id data.stations.rental_methods
## 1:         -73.99393                      71                          KEY
## 2:         -73.99393                      71                   CREDITCARD
## 3:         -74.00667                      71                          KEY
## 4:         -74.00667                      71                   CREDITCARD
## 5:         -74.00017                      71                          KEY
## 6:         -74.00017                      71                   CREDITCARD
##    data.stations.capacity data.stations.eightd_has_key_dispenser
## 1:                     39                                  FALSE
## 2:                     39                                  FALSE
## 3:                     33                                  FALSE
## 4:                     33                                  FALSE
## 5:                     27                                  FALSE
## 6:                     27                                  FALSE

That is, create a list of data.frames with 9 columns each, and rbind them. This way, you won't have problems of data coercion when converting to a matrix.

This results in a 6 row x 9 column data.table. Not sure what rule you want to use to drop rows to end up with just 3 rows....


But I think you're trying to solve a problem that does not exist. Try reading your data like this:

library(jsonlite)
x <- fromJSON("https://gbfs.citibikenyc.com/gbfs/en/station_information.json")
head(x[[3]]$stations)
##   station_id                          name short_name      lat       lon region_id
## 1         72              W 52 St & 11 Ave    6926.01 40.76727 -73.99393        71
## 2         79      Franklin St & W Broadway    5430.08 40.71912 -74.00667        71
## 3         82        St James Pl & Pearl St    5167.06 40.71117 -74.00017        71
## 4         83 Atlantic Ave & Fort Greene Pl    4354.07 40.68383 -73.97632        71
## 5        116               W 17 St & 8 Ave    6148.02 40.74178 -74.00150        71
## 6        119      Park Ave & St Edwards St    4700.06 40.69609 -73.97803        71
##    rental_methods capacity eightd_has_key_dispenser
## 1 KEY, CREDITCARD       39                    FALSE
## 2 KEY, CREDITCARD       33                    FALSE
## 3 KEY, CREDITCARD       27                    FALSE
## 4 KEY, CREDITCARD       62                    FALSE
## 5 KEY, CREDITCARD       39                    FALSE
## 6 KEY, CREDITCARD       19                    FALSE


dim(x[[3]]$stations)
# [1] 665   9
A5C1D2H2I1M1N2O1R2T1
  • 190,393
  • 28
  • 405
  • 485
1

You can use matrix but make sure all your factor columns are characters, i.e.

ind <- sapply(df, is.factor)
df[ind] <- lapply(df[ind], as.character)

final_df <- as.data.frame(matrix(unlist(df), ncol = 9, byrow = TRUE))

final_df[c(TRUE, FALSE),]
#  V1 V2                       V3                       V4      V5      V6          V7          V8           V9
#1 72 72         W 52 St & 11 Ave         W 52 St & 11 Ave 6926.01 6926.01 40.76727216 40.76727216 -73.99392888
#3 79 79 Franklin St & W Broadway Franklin St & W Broadway 5430.08 5430.08 40.71911552 40.71911552 -74.00666661
#5 82 82   St James Pl & Pearl St   St James Pl & Pearl St 5167.06 5167.06 40.71117416 40.71117416 -74.00016545

On the other hand, as @A5C1D2H2I1M1N2O1R2T1 notes, you might be looking for this instead:

as.data.frame(matrix(c(t(df)), ncol = 9, byrow = TRUE))
#  V1                       V2      V3       V4        V5 V6         V7 V8    V9
#1 72         W 52 St & 11 Ave 6926.01 40.76727 -73.99393 71        KEY 39 FALSE
#2 79 Franklin St & W Broadway 5430.08 40.71912 -74.00667 71        KEY 33 FALSE
#3 82   St James Pl & Pearl St 5167.06 40.71117 -74.00017 71        KEY 27 FALSE
#4 72         W 52 St & 11 Ave 6926.01 40.76727 -73.99393 71 CREDITCARD 39 FALSE
#5 79 Franklin St & W Broadway 5430.08 40.71912 -74.00667 71 CREDITCARD 33 FALSE
#6 82   St James Pl & Pearl St 5167.06 40.71117 -74.00017 71 CREDITCARD 27 FALSE
Sotos
  • 51,121
  • 6
  • 32
  • 66