I have a folder with 100 PDF files. Page 1 of all PDF files contains a table that I am extracting. Then I am concatenating all the tables into a dataframe and writing as a CSV file. However, I am getting error while concatenating.
import os
import camelot
import pandas as pd
import PyPDF2
import tabula
# Set the directory path where the PDF files are located
dir_path = "my/path/"
# Create an empty list to store the tables
tables = []
# Loop through each file in the directory
for filename in os.listdir(dir_path):
# Check if the file is a PDF file
if filename.endswith(".pdf"):
# Open the PDF file
with open(os.path.join(dir_path, filename), "rb") as pdf_file:
# Create a PDF reader object
pdf_reader = PyPDF2.PdfFileReader(pdf_file)
# Get the first page of the PDF file
page = pdf_reader.getPage(0)
# Extract the table from the first page using tabula-py
table = tabula.read_pdf(pdf_file, pages=1, pandas_options={"header": True})
print(table)
# Append the table to the tables list
tables.append(table)
# Concatenate all tables into a single DataFrame
df = pd.concat(tables)
# Write the DataFrame to a CSV file
df.to_csv("Output.csv", index=False)
TypeError: cannot concatenate object of type '<class 'list'>'; only Series and DataFrame objs are valid