I have been trying to extract a table from a .jpg image and save it in Excel format. I know how to do this when the source is a .pdf or HTML file. Please find my script below. I would be grateful if someone could help me figure this out. Thanks.
library(httr)
library(magick)
library(tidyverse)
# Download the image(s) listed in url_template and write them out as a single
# PDF file, "SApollution.pdf".
url_template <- "https://www.environment.co.za/wp-content/uploads/2016/05/worst-air-pollution-in-south-africa-table-graph-statistics-1024x864.jpg"

# One progress tick per URL.
pb <- progress_estimated(n = length(url_template))

# The URLs are used verbatim: routing them through sprintf() would interpret
# any "%" (as found in percent-encoded URLs) as a format directive.
store_list_pages <- map(url_template, function(page_url) {
  pb$tick()$print()
  response <- GET(
    url = page_url,
    add_headers(
      accept = "image/webp,image/apng,image/*,*/*;q=0.8",
      referer = "https://www.environment.co.za/pollution/worst-air-pollution-south-africa.html/attachment/worst-air-pollution-in-south-africa-table-graph-statistics",
      authority = "environment.co.za"
    )
  )
  # Fail here, with the HTTP status, rather than handing an error page to
  # image_read() further down.
  stop_for_status(response)
  response
})

# Take the response bodies as raw bytes so magick decodes the original JPEG
# data itself instead of relying on httr's automatic content parsing.
store_list_pages %>%
  map(function(response) content(response, as = "raw")) %>%
  map(image_read) %>%
  reduce(image_join) %>%
  image_write("SApollution.pdf", format = "pdf")
library(tabulizer)
library(tabulizerjars)
library(XML)
# Extract the table(s) from "SApollution.pdf" and write them to the
# "pollution" sheet of an Excel workbook.

# loadWorkbook(create = ), createSheet(), writeWorksheet() and saveWorkbook()
# as used below are XLConnect functions, so the package must be attached
# before they are called.
library(XLConnect)

# Open the target workbook; create = TRUE makes a new one if the file is absent.
wbk <- loadWorkbook("~/crap_exercise/img2pdf/randomdata.xlsx", create = TRUE)

# Extract the table(s) from the document.
# NOTE(review): SApollution.pdf is built from a JPEG, so it presumably has no
# text layer; if extract_tables() finds nothing, the image most likely needs
# OCR before any table can be extracted - confirm.
out <- extract_tables("SApollution.pdf")
if (length(out) == 0L) {
  stop("No tables were found in SApollution.pdf", call. = FALSE)
}

# Combine the extracted tables into a single matrix. As before, the last table
# is left out when several were found. A lone table is kept: dropping it left
# nothing to combine, and the following step then failed.
tables <- if (length(out) > 1L) out[-length(out)] else out
final <- do.call(rbind, tables)

# Every row is kept by default. Raise first_data_row if the table headers come
# through as badly formatted leading rows that should be dropped.
first_data_row <- 1L
keep_rows <- seq_len(nrow(final)) >= first_data_row
# drop = FALSE keeps a one-row result as a matrix instead of a plain vector.
final <- as.data.frame(final[keep_rows, , drop = FALSE])

# Column names
headers <- c("#", "Urban area", "Province", "PM2.5 (mg/m3)")
if (ncol(final) != length(headers)) {
  stop(
    "Expected ", length(headers), " columns but extracted ", ncol(final),
    call. = FALSE
  )
}

# Apply custom column names
names(final) <- headers

# Write the extracted table (with its header row) to the "pollution" sheet and
# save the workbook back to its file.
createSheet(wbk, "pollution")
writeWorksheet(wbk, final, sheet = "pollution", header = TRUE)
saveWorkbook(wbk)