0

I am having some difficulties unpacking what looks like a list within a list in Google's Page Speed API response.

Ideally, I want only audit results exported as a CSV file. So I can compare the website load times and performance of my client's website.

```r

library(httr)
library(tidyverse)
library(tidyr)

#URL to submit GET request to
# NOTE(review): this URL is missing the "?" that separates the endpoint
# path from the query string — it reads "runPagespeedurl=" where the API
# expects "runPagespeed?url=", so the request cannot be parsed correctly.
url <- "https://www.googleapis.com/pagespeedonline/v5/runPagespeedurl=https://www.google.com/"


# GET request returned as list
# httr::content() parses the response body into a nested R list.
raw_list <- url %>% 
httr::GET() %>% 
httr::content()


#turning the list into a dataframe 
# rbind over the top-level list elements produces a matrix of list cells;
# NOTE(review): elements of unequal length are silently recycled here.
df_pagespeed <- as.data.frame(do.call(rbind, raw_list))

#attempted unpack list in audit results with no luck
# NOTE(review): `cols = audits` only works if a column literally named
# "audits" exists in df_pagespeed — verify against the actual structure.
df_pagespeed <- tidyr::unnest(df_pagespeed, cols = audits)

# select only the audit results. 
# NOTE(review): `audits` is an undefined object at this point — the column
# name must be quoted, e.g. df_pagespeed["audits"].
df_pagespeed_final <- df_pagespeed[c(audits)]

#export to csv file
write.csv(df_pagespeed_final,"test-pagespeed.csv", row.names = FALSE)

```

Ideally, I want the second dataframe (df_pagespeed_final) to contain information related to the pagespeed audit results — meaningful metrics such as the first-contentful-paint time.

Hopefully that is clear enough for someone to understand. If not, please let me know and I will revise the question.

Thanks for your help.

  • There are two urls in your code, which one do you send request to? – Chamkrai Jun 29 '22 at 11:34
  • Hello @TomHoel, thanks for the reply; I will revise my question now and see if I can make it clearer. I believe Stack Overflow is wrapping the URL onto the next line, when it should be a single query string: `url <- "https://www.googleapis.com/pagespeedonline/v5/runPagespeedurl=https://www.google.com/"` –  Jun 29 '22 at 12:34
  • There is a lot of content. Are there any specific results you want? – QHarr Jun 29 '22 at 22:17
  • I also see http 429 too many requests. Not surprising given nature of the info being sought. This likely needs to be modified to use polite package. Or more likely, use browser automation and hand-off results to a json parser. – QHarr Jun 29 '22 at 22:27
  • Hello @QHarr thanks for taking the time to look into my question. You can add a free API key from Google on the end of the `url` variable. Which will prevent you getting server 429 errors. I find it is okay for 1 or 2 responses, before you get the error. Link to API docs. https://developers.google.com/speed/docs/insights/v5/get-started When I have some more free time I will revise the question and look into polite package. Thanks for pointing me in the right direction. As I am relatively new to programming and R, I really appreciate it. –  Jun 30 '22 at 14:30
  • The API key info is good to know. Do you know what info you want? It looks like it would be fairly straightforward to have a dataframe of loadingExperience and originLoadingExperience, particularly if just extracting the metrics results and adding a column for which area of performance was being measured. – QHarr Jun 30 '22 at 15:13

1 Answer

1

I managed to figure it out. Probably not the best solution or cleanest, but it works.

Hopefully this helps someone else out, who is working with the Google Pagespeed API and R.


library(httr)
library(tidyverse)
library(tidyr)
library(purrr)
library(magrittr)
library(ggplot2)
library(reshape)

#URL to submit GET request to
# PageSpeed Insights v5 endpoint; note the "?" correctly separating the
# path from the query string here (this was the bug in the question).
url <- "https://www.googleapis.com/pagespeedonline/v5/runPagespeed?url=https://www.google.com/"


# GET request returned as list
# httr::content() parses the JSON response body into a deeply nested R list.
raw_list <- url %>% 
  httr::GET() %>% 
  httr::content()



#turning the list into a dataframe 
# rbind over the top-level list elements yields a matrix of list cells with
# one row per top-level response field (e.g. "lighthouseResult").
# NOTE(review): rbind recycles elements of unequal length — verify the
# resulting rows line up with the intended response fields.
df_pagespeed <- as.data.frame(do.call(rbind, raw_list))

# Pull the "audits" cell from the "lighthouseResult" row...
df_all_audit <- df_pagespeed["lighthouseResult", "audits"]

# ...then flatten the per-audit sub-lists into a frame (one entry per audit).
df_all_audit <- as.data.frame(do.call(rbind, df_all_audit))


# Helper: pull one audit's numericValue out of the flattened audits frame.
#
# `audits_df` is the frame built above from the response's "audits" cell
# (list cells keyed by audit id); `metric` is the audit id string, e.g.
# "speed-index". Performs exactly the extract / row-bind / flatten steps
# that were previously copy-pasted once per metric (seven times).
# Returns the audit's numericValue (milliseconds for the timing audits,
# a unitless score for cumulative-layout-shift).
extract_numeric_value <- function(audits_df, metric) {
  metric_cell <- audits_df["lighthouseResult", metric]
  metric_df <- as.data.frame(do.call(rbind, metric_cell))
  metric_df$numericValue[1]
}

#value for first meaningful paint
df_first_meaningful_paint <- extract_numeric_value(df_all_audit, "first-meaningful-paint")

#value for largest contentful paint
df_largest_content_paint <- extract_numeric_value(df_all_audit, "largest-contentful-paint")

#value for total blocking time
df_total_blocking_time <- extract_numeric_value(df_all_audit, "total-blocking-time")

#value for speed index
df_speed_index <- extract_numeric_value(df_all_audit, "speed-index")

#value for first contentful paint
df_first_content_paint <- extract_numeric_value(df_all_audit, "first-contentful-paint")

#value for cumulative layout shift
df_cumulative_shift <- extract_numeric_value(df_all_audit, "cumulative-layout-shift")

#value for server response time
df_server_response_time <- extract_numeric_value(df_all_audit, "server-response-time")



# Timestamp for this measurement run.
now <- Sys.time()
time <- data.frame(now)


# Combine the timestamp and all metric values into a single one-row frame.
df_list <- bind_cols(time, df_first_content_paint, df_first_meaningful_paint, df_largest_content_paint, df_total_blocking_time, df_speed_index, df_cumulative_shift, df_server_response_time)

# Rename all columns in one assignment instead of eight separate ones.
names(df_list) <- c("time", "first_content_paint", "first_meaningful_paint",
                    "largest_content_paint", "total_blocking_time",
                    "speed_index", "cumulative_shift", "server_response_time")

#assigning data to df_pagespeed_new
df_pagespeed_new <- df_list

# Append to the history file if it exists; otherwise start a new one.
# (Unconditionally calling read_csv("pagespeed.csv"), as before, crashes
# on the very first run before the file has been created.)
if (file.exists("pagespeed.csv")) {
  df_pagespeed_old <- read_csv("pagespeed.csv")
  total <- rbind(df_pagespeed_old, df_pagespeed_new)
} else {
  total <- df_pagespeed_new
}

#writing the combined history (one row per run) back out
write.csv(total, "pagespeed.csv", row.names = FALSE)


#plotting the graph
# Reshape to long format so a single geom_line draws every metric, instead
# of six near-identical geom_line calls. (first_meaningful_paint was not
# plotted in the original either, so it is deliberately excluded here.)
plot_metrics <- c("first_content_paint", "largest_content_paint",
                  "total_blocking_time", "speed_index",
                  "cumulative_shift", "server_response_time")

plot_data <- total %>%
  pivot_longer(cols = all_of(plot_metrics),
               names_to = "metric", values_to = "value")

p <- ggplot(plot_data, aes(x = time, y = value, colour = metric)) +
  geom_line(size = 1) +
  scale_color_manual(name = "Speed Metrics", values = c("first_content_paint" = "#008080", "largest_content_paint" = "#58508d", "total_blocking_time" = "#bc5090", "speed_index" = "#ff6361", "cumulative_shift" = "#ffa600", "server_response_time" = "#003f5c")) +
  xlab("Time & Date") +
  scale_y_continuous("Loadtime (milliseconds)") +
  labs(title = "www.google.com Page Speed Metrics")

# Apply the complete theme FIRST, then tweak it: the original called
# theme_classic() after theme(plot.title = ...), and complete themes
# replace all existing theme settings — wiping out the centred title.
p + theme_classic() +
  theme(plot.title = element_text(hjust = 0.5))