I am storing PDF text (extracted with pypdf) in a CSV file. The problem is that a few of the PDF files are very long, and for those files the text spreads across multiple rows instead of staying in a single row. How can I keep the text of each PDF in a single row? Here is what my output looks like:
column1 column2
long pdf hello my
name is jhone
short pdf hello my name is jhone
I don't have any problem with short PDF files.
my code:
# Download one PDF, extract its text with pypdf, and append it to the CSV
# as a single record whose text field contains no line breaks.
# NOTE(review): `filename`, `first_author`, `new_date_str` and `speech_title`
# are expected to be defined by the surrounding code — not visible here.
pdf_url = 'https://www.snb.ch/en/mmr/speeches/id/ref_20230330_amrtmo/source/ref_20230330_amrtmo.en.pdf'
print("pdf_url: ", pdf_url)

# Download the PDF file from the URL; fail loudly on an HTTP error instead
# of handing an error page to the PDF parser.
response = requests.get(pdf_url)
response.raise_for_status()

# Read the PDF from an in-memory buffer (no temporary file needed).
pdf_buffer = io.BytesIO(response.content)
pdf = PdfReader(pdf_buffer)

# One text string per page; `or ""` guards against a page that yields no text.
pdf_content = [page.extract_text() or "" for page in pdf.pages]

# Join all pages and collapse every run of whitespace (including the newlines
# produced by text extraction) into a single space, so the CSV field is one
# physical line. Previously the list object itself was written, which stored
# its Python repr (brackets, quotes, escaped "\n") in the cell.
pdf_text = " ".join(" ".join(pdf_content).split())

with open(filename, "a", newline="", encoding='utf8') as f:
    writer = csv.writer(f)
    writer.writerow([first_author, new_date_str, speech_title, pdf_url, pdf_text])

# NOTE(review): if the file is viewed in a spreadsheet application, a very
# long field may still be displayed across several rows because of the
# application's per-cell character limit, even though the CSV record itself
# is a single row — confirm by reading the file back with csv.reader.
pdf_content.clear()