Table extract from multiple pages in pdf

Question

I am trying to get the table extract from multiple pages in pdf but i am getting only 2 pages and page header currently. (Source PDF(test.pdf),output.csv file, codetext.txt are added as attachment. I have stored output in csv files

Expectation: it should read whole the data from PDF, currently it is reading partial data.I have also added Link for pdf source,output file,code

https://drive.google.com/drive/folders/1pscNnTkuWN0JLYVYruZf9iknuDQ2oP6U?usp=sharing

Here is my code

import tabula
import requests
import csv
import pandas as pd

import re
import parse
import pdfplumber
from collections import namedtuple
import datetime
from datetime import date
import os
import glob
import shutil
from os import path

# using pdminer i am extracting all the post name , grade name and month repporting to add to this cleaned data frame.


# ------------------------------------File name
file = "C:\\Users\\xxx\\Downloads\\test.pdf"

lines = []

pnames = []
gnames = []
mreports = []
with pdfplumber.open(file) as pdf:
for page in pdf.pages:
try:
text = page.extract_text()
except:
text = ''
if text is not None:
liness = text.split('\n')
lines += liness

for li in lines:
if "Port:" in li:
li = li.replace("Port:", "").strip()
li_new = li.split("Month Reporting:")[-0].strip()
m_repor = li.split("Month Reporting:")[-1].strip()

if "Grade Name:" in li_new:
g_name = li_new.split("Grade Name:")[-1].strip()
p_name = li_new.split("Grade Name:")[0].strip()
print(li_new)
else:
g_name = li_new.split()[1:]
g_name = ' '.join(g_name).strip()
p_name = li_new.split()[0].strip()
pnames.append(p_name)
gnames.append(g_name)
mreports.append(m_repor)
print("PortName: ", len(pnames))
print("GradeName: ", len(gnames))
print("MonthReporting: ", len(mreports))

# i am using tabula to extract all the tables from pdf and this table is cleaned for final joining.
df = tabula.read_pdf(file, pages='all')
final_list = [
["PORT NAME", "GRADE NAME", "MONTH REPORTING", "BL DATE", "VESSEL", "DESTINATION", "CHARTERERS", "API"]]
# final_list=[]
print(final_list)
last_df = len(df)
print("Length of tables: ", last_df)

for i in range(0, len(pnames)):
op_df = df[i]
op_df = op_df.dropna(how='all')
op_df_list = op_df.values.tolist()

for li in op_df_list:
if str(li[0]) == "nan":
li = li[1:]
else:
print("check this case")
print(li)
li.insert(0, pnames[i])
li.insert(1, gnames[i])
li.insert(2, mreports[i])
print(li)
if "BL Date" in li:
pass
else:
final_list.append(li)
df_2 = pd.DataFrame(final_list)
df_2.columns = df_2.iloc[0]
df_2 = df_2[1:]
max_row=len(df_2)
curr_date = datetime.datetime.now()
created_date = curr_date.strftime('%d-%b-%y')
for row in range(max_row):
df_2['created_by'] = 'created by'
df_2['created_date'] = created_date

print(df_2)
df_2.rename(
columns={'PORT NAME': 'port_name', 'GRADE NAME': 'crude', 'MONTH REPORTING': 'reporting_month', 'BL DATE': 'bl_date',
'VESSEL': 'vessel', 'DESTINATION': 'destination',
'CHARTERERS': 'charterer', 'API': 'api'}, inplace=True)

df_2 = df_2.reindex(
columns=["port_name", "crude", "reporting_month", "bl_date", "vessel", "destination", "Charterer",
"api"])

# return df_2


df_2.to_csv('Outputfile.csv', index=False)
print("Sucessfully generated output CSV")

also tried the query below but could not map with above query

import pdfplumber
import pandas as pd
 

    pdf_file = "test.pdf"
    with pdfplumber.open(pdf_file) as pdf:
        lst = [p.extract_table() for p in pdf.pages]
     
    flat_list = [item for sublist in lst[:3] for item in sublist]
    df = pd.DataFrame(flat_list)
    df.columns = df.iloc[0]
    df = df[1:]

score 0 · Answer 1 · edited Oct 21 '22 at 23:00

0

You can try to iterate through the pdf pages and append the tables to your df, something like:

from PyPDF2 import PdfFileReader
import tabula
import pandas as pd

pdf = "foo.pdf"

inputpdf = PdfFileReader(open(pdf, "rb"))
main_df = pd.DataFrame()

for i in range(inputpdf.numPages):
    df = pd.DataFrame(tabula.read_pdf(pdf,lattice=True,pages=i + 1,multiple_tables=True,pandas_options={"header": None},)[0],)
    df = df.replace({"\r": " "}, regex=True)
    main_df = main_df.append(df)

print(main_df)
main_df.to_excel(excel_writer="tables_extracted.xlsx", sheet_name="foo")

edited Oct 21 '22 at 23:00

Ran A

746
3
7
19

answered Oct 17 '22 at 17:16

Alejandro Molina

1

Hello, please see https://meta.stackoverflow.com/editing-help Thanks! – Eric Aya Oct 17 '22 at 17:58

Table extract from multiple pages in pdf

1 Answers1