0

this is the error I am getting from the logs:

[ERROR] KeyError: 'Text'
Traceback (most recent call last):
  File "/var/task/lambda_function.py", line 51, in lambda_handler
    pdfText += item["Text"] + '\n'

I am trying to run a form analysis via textract to extract data from the form and save it into S3 as a .csv file. My code is below:

import boto3
import os

def getJobResults(jobId):

    pages = []

    textract = boto3.client('textract')
    response = textract.get_document_analysis(JobId=jobId)
    
    pages.append(response)

    nextToken = None
    if('NextToken' in response):
        nextToken = response['NextToken']

    while(nextToken):

        response = textract.get_document_analysis(JobId=jobId, NextToken=nextToken)

        pages.append(response)
        nextToken = None
        if('NextToken' in response):
            nextToken = response['NextToken']

    return pages

def lambda_handler(event, context):
    notificationMessage = json.loads(json.dumps(event))['Records'][0]['Sns']['Message']
    
    pdfTextExtractionStatus = json.loads(notificationMessage)['Status']
    pdfTextExtractionJobTag = json.loads(notificationMessage)['JobTag']
    pdfTextExtractionJobId = json.loads(notificationMessage)['JobId']
    pdfTextExtractionDocumentLocation = json.loads(notificationMessage)['DocumentLocation']
    
    pdfTextExtractionS3ObjectName = json.loads(json.dumps(pdfTextExtractionDocumentLocation))['S3ObjectName']
    pdfTextExtractionS3Bucket = json.loads(json.dumps(pdfTextExtractionDocumentLocation))['S3Bucket']
    
    print(pdfTextExtractionJobTag + ' : ' + pdfTextExtractionStatus)
    
    pdfText = ''
    
    if(pdfTextExtractionStatus == 'SUCCEEDED'):
        response = getJobResults(pdfTextExtractionJobId)
        
        for resultPage in response:
            for item in resultPage["Blocks"]:
                if item["BlockType"] == "KEY_VALUE_SET" :
                    EntityTypes : ['KEY'|'VALUE']
                    pdfText += item["Text"] + '\n'
                    
                    
        s3 = boto3.client('s3')
        
        outputTextFileName = os.path.splitext(pdfTextExtractionS3ObjectName)[0] + '.csv'
        s3.put_object(Body=pdfText, Bucket=pdfTextExtractionS3Bucket, Key=outputTextFileName)

The documentation I am following is : https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/textract.html#Textract.Client.get_document_analysis

Any advice would be greatly appreciated! Thank you

John Rotenstein
  • 241,921
  • 22
  • 380
  • 470

0 Answers0