1

I have been trying to extract a table from a .jpg image and save it as an Excel file. I know how to do this when the source is a .pdf or HTML file, but not when it is an image. Please find my script below. I would be grateful if someone could help me figure this out. Thanks,

library(httr)
library(magick)
library(tidyverse)
# Address of the JPEG image that contains the pollution table.
url_template <- "https://www.environment.co.za/wp-content/uploads/2016/05/worst-air-pollution-in-south-africa-table-graph-statistics-1024x864.jpg"
# Console progress ticker: one tick per URL to download.
pb <- progress_estimated(n = length(url_template))

# Download every URL and keep the httr response objects, one list element per
# image. The URLs are used verbatim: routing them through sprintf() would
# interpret them as format strings and fail on any percent-encoded character
# (for example "%20").
store_list_pages <- map(url_template, ~ {
  pb$tick()$print()
  response <- GET(
    url = .x,
    add_headers(
      accept = "image/webp,image/apng,image/*,*/*;q=0.8",
      referer = "https://www.environment.co.za/pollution/worst-air-pollution-south-africa.html/attachment/worst-air-pollution-in-south-africa-table-graph-statistics",
      authority = "environment.co.za"
    )
  )
  # Fail here on an HTTP error instead of handing an error page to the image
  # decoder further down the script.
  stop_for_status(response)
  response
})

# Turn each downloaded response into a magick image, join the images into a
# single multi-frame object, and write that object out as a PDF file.
store_list_pages %>%
  map(~ image_read(content(.x))) %>%
  reduce(image_join) %>%
  image_write(path = "SApollution.pdf", format = "pdf")

library(tabulizer)
library(tabulizerjars)
library(XML)
# Open the target workbook, creating the file if it does not exist yet.
# NOTE(review): loadWorkbook()/createSheet()/writeWorksheet()/saveWorkbook()
# look like the XLConnect API, but no library(XLConnect) call is visible in
# this script -- confirm the package is attached before running.
wbk <- loadWorkbook("~/crap_exercise/img2pdf/randomdata.xlsx", create = TRUE)

# Extract the tables from the document.
# TODO (from the original author): check whether a specific table needs to be
# selected instead of taking everything that is found.
out <- extract_tables("SApollution.pdf")

# Stop with a clear message when nothing was found, rather than failing later
# with an obscure subsetting error.
# NOTE(review): a PDF that merely wraps a JPEG presumably has no text layer
# for tabulizer to read, in which case an OCR step would be needed first.
if (length(out) == 0L) {
  stop(
    "No tables were extracted from 'SApollution.pdf'. A PDF that only wraps ",
    "an image may contain no text for tabulizer to read.",
    call. = FALSE
  )
}

# Combine every extracted table into a single data frame. All tables are kept:
# dropping the last one would discard all data when only one table is found.
# No rows are removed here; if header rows show up as badly formatted data
# rows, drop them explicitly once their position has been confirmed.
final <- as.data.frame(do.call(rbind, out))

# Column names
headers <- c("#", "Urban area", "Province", "PM2.5 (mg/m3)")

# Apply the custom column names, refusing to mislabel a table whose shape is
# not the expected one.
if (ncol(final) != length(headers)) {
  stop(
    "Expected ", length(headers), " columns but extracted ", ncol(final), ".",
    call. = FALSE
  )
}
names(final) <- headers

# Write the cleaned table to its own sheet and save the workbook to disk.
createSheet(wbk, "pollution")
writeWorksheet(wbk, final, sheet = "pollution", header = TRUE)
saveWorkbook(wbk)
Justin
  • 11
  • 2

0 Answers0