UPDATE 2: Document AI has been updated to support up to 1,000 documents per batch request. (Note: individual processors may have different page limits per request.)
https://cloud.google.com/document-ai/quotas#content_limits
UPDATE: To make this process easier, I added a feature to the Document AI Toolbox Python SDK that creates batches of documents for batch processing.
Refer to this guide for the code sample: https://cloud.google.com/document-ai/docs/send-request#batch-documents
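For reference, a minimal sketch of that Toolbox utility based on the linked guide (the bucket, prefix, and processor name below are placeholders):

from google.cloud import documentai
from google.cloud.documentai_toolbox import gcs_utilities

# Split gs://my-bucket/invoices/ into batches of up to 50 documents (placeholder path)
batches = gcs_utilities.create_batches(
    gcs_bucket_name="my-bucket", gcs_prefix="invoices/", batch_size=50
)

for batch in batches:
    print(f"{len(batch.gcs_documents.documents)} files in batch")
    # Each batch is a BatchDocumentsInputConfig that can be passed as input_documents
    request = documentai.BatchProcessRequest(
        name="projects/PROJECT_ID/locations/us/processors/PROCESSOR_ID",  # placeholder
        input_documents=batch,
    )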
Batch processing currently allows 50 documents per request, with a maximum file size of 1GB and page limits depending on which processor is being used.
https://cloud.google.com/document-ai/quotas#content_limits
You can move your files in Cloud Storage into separate directories of 50 documents each to process the whole directory at once.
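If you go that route, the input configuration for a whole directory looks roughly like this (bucket and folder names are placeholders):

from google.cloud import documentai

# Process every supported file under gs://my-bucket/batch-01/ (placeholder path)
input_config = documentai.BatchDocumentsInputConfig(
    gcs_prefix=documentai.GcsPrefix(gcs_uri_prefix="gs://my-bucket/batch-01/")
)
# Pass input_config as input_documents in the BatchProcessRequest shown further down.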
You can also divide the requests up by providing specific documents for each request, using the gcsDocuments parameter instead of gcsPrefix.
https://cloud.google.com/document-ai/docs/send-request#batch-process
You could try something similar to this:
import logging
from typing import Dict, List

from google.api_core.client_options import ClientOptions
from google.cloud import documentai, storage

# Maximum documents per batch request (see the quota link above)
BATCH_MAX_FILES = 50
# File types accepted by most processors; adjust for the processor you use
ACCEPTED_MIME_TYPES = {
    "application/pdf",
    "image/jpeg",
    "image/png",
    "image/tiff",
    "image/gif",
}

storage_client = storage.Client()


def create_batches(
    input_bucket: str,
    input_prefix: str,
    batch_size: int = BATCH_MAX_FILES,
) -> List[List[documentai.GcsDocument]]:
    """
    Create batches of documents to process
    """
    if batch_size > BATCH_MAX_FILES:
        raise ValueError(
            f"Batch size must be no more than {BATCH_MAX_FILES}. "
            f"You provided {batch_size}"
        )

    blob_list = storage_client.list_blobs(input_bucket, prefix=input_prefix)

    batches: List[List[documentai.GcsDocument]] = []
    batch: List[documentai.GcsDocument] = []

    for blob in blob_list:
        # Skip files the processor cannot handle
        if blob.content_type not in ACCEPTED_MIME_TYPES:
            logging.error(
                "Invalid Mime Type %s - Skipping file %s", blob.content_type, blob.name
            )
            continue

        # Start a new batch once the current one is full
        if len(batch) == batch_size:
            batches.append(batch)
            batch = []

        batch.append(
            documentai.GcsDocument(
                gcs_uri=f"gs://{input_bucket}/{blob.name}",
                mime_type=blob.content_type,
            )
        )

    # Add the final, partially filled batch
    batches.append(batch)
    return batches
def batch_process_documents(
    processor: Dict,
    document_batch: List[documentai.GcsDocument],
    gcs_output_uri: str,
    skip_human_review: bool = True,
) -> documentai.BatchProcessMetadata:
    """
    Constructs a request to process documents using the Document AI
    batch method. Returns the batch process metadata.
    """
    docai_client = documentai.DocumentProcessorServiceClient(
        client_options=ClientOptions(
            api_endpoint=f"{processor['location']}-documentai.googleapis.com"
        )
    )
    resource_name = docai_client.processor_path(
        processor["project_id"], processor["location"], processor["processor_id"]
    )

    # Where the processed output JSON will be written
    output_config = documentai.DocumentOutputConfig(
        gcs_output_config=documentai.DocumentOutputConfig.GcsOutputConfig(
            gcs_uri=gcs_output_uri
        )
    )
    # Load the list of GCS documents into the batch input config
    input_config = documentai.BatchDocumentsInputConfig(
        gcs_documents=documentai.GcsDocuments(documents=document_batch)
    )

    request = documentai.BatchProcessRequest(
        name=resource_name,
        input_documents=input_config,
        document_output_config=output_config,
        skip_human_review=skip_human_review,
    )

    operation = docai_client.batch_process_documents(request)

    # The API supports a limited number of concurrent batch requests
    logging.info("Waiting for operation %s to complete...", operation.operation.name)
    # Block until the long-running operation finishes (no timeout set)
    operation.result()

    return documentai.BatchProcessMetadata(operation.metadata)
def main():
    # TODO: Replace these placeholder values with your own project details
    processor = {
        "project_id": "YOUR_PROJECT_ID",
        "location": "us",
        "processor_id": "YOUR_PROCESSOR_ID",
    }
    gcs_input_bucket = "YOUR_INPUT_BUCKET"
    gcs_input_prefix = "path/to/input/"
    gcs_output_uri = "gs://YOUR_OUTPUT_BUCKET/path/to/output/"

    batches = create_batches(gcs_input_bucket, gcs_input_prefix)
    batch_process_results = []

    for i, batch in enumerate(batches):
        if not batch:
            continue

        logging.info("Processing batch %s: %s documents", i, len(batch))
        batch_process_metadata = batch_process_documents(
            processor=processor,
            document_batch=batch,
            gcs_output_uri=gcs_output_uri,
        )
        logging.info(batch_process_metadata.state_message)
        batch_process_results.append(batch_process_metadata)

    print(batch_process_results)