I am trying to create a `BatchRequest` whose `data_connector_query` filters on the `group_names` that I previously defined in the datasource, under the `default_regex` of an `InferredAssetS3DataConnector`.
Here's the datasource config:
config = {
"name": self.config_id,
"class_name": "Datasource",
"execution_engine": {"class_name": "PandasExecutionEngine"},
"data_connectors": {
"sirene_update_s3_connector": {
"class_name": "InferredAssetS3DataConnector",
"bucket": self.bucket,
"prefix": self.prefix,
"default_regex": {
"pattern": r"(.*)\/(.*)\/technical_date=(\d{4})-(\d{2})-(\d{2})\/(.*)_(\d{8})\.json",
"group_names": [
"prefix",
"data_asset_name",
"year",
"month",
"day",
"file_name",
"date_suffix",
],
},
},
},
}
Here are two batch requests that I constructed differently:
# Construct all the batches
batch_request_parameters = {
"datasource_name": datasource.name,
"data_connector_name": list(datasource.data_connectors.keys())[0],
"data_asset_name": "asset_alpha",
}
batch_request = BatchRequest(**batch_request_parameters)
datasource.get_batch_definition_list_from_batch_request(batch_request)
returns:
[{'datasource_name': 'my_bucket.data_pipelines.raw_data.asset_alpha', 'data_connector_name': 'asset_alpha_s3_connector', 'data_asset_name': 'asset_alpha', 'batch_identifiers': {'prefix': 'data_pipelines/raw_data', 'year': '2022', 'month': '11', 'day': '23', 'file_name': 'companies', 'date_suffix': '20221123'}},
{'datasource_name': 'my_bucket.data_pipelines.raw_data.asset_alpha', 'data_connector_name': 'asset_alpha_s3_connector', 'data_asset_name': 'asset_alpha', 'batch_identifiers': {'prefix': 'data_pipelines/raw_data', 'year': '2022', 'month': '12', 'day': '06', 'file_name': 'companies', 'date_suffix': '20221206'}},
{'datasource_name': 'my_bucket.data_pipelines.raw_data.asset_alpha', 'data_connector_name': 'asset_alpha_s3_connector', 'data_asset_name': 'asset_alpha', 'batch_identifiers': {'prefix': 'data_pipelines/raw_data', 'year': '2022', 'month': '12', 'day': '12', 'file_name': 'companies', 'date_suffix': '20221212'}},
#...
]
# Construct filtered batch
batch_request_parameters_2 = {
"datasource_name": datasource.name,
"data_connector_name": list(datasource.data_connectors.keys())[0],
"data_asset_name": "asset_alpha",
"data_connector_query": {
"prefix": "data_pipelines/raw_data",
"year": "2022",
"month": "11",
"day": "23",
"file_name": "companies",
"date_suffix": "20221123",
},
}
batch_request_2 = BatchRequest(**batch_request_parameters_2)
datasource.get_batch_definition_list_from_batch_request(batch_request_2)
This raises an error...
BatchFilterError: Unrecognized data_connector_query key(s):
"{'file_name', 'prefix', 'year', 'month', 'day', 'date_suffix'}" detected.
I tested the same identifiers with `BatchDefinition`, and it seems to work well.
from great_expectations.core.id_dict import IDDict
batch_request_parameters_3 = {
"datasource_name": datasource.name,
"data_connector_name": list(datasource.data_connectors.keys())[0],
"data_asset_name": "asset_alpha",
"batch_identifiers": IDDict({
"prefix": "data_pipelines/raw_data",
"year": "2022",
"month": "11",
"day": "23",
"file_name": "companies",
"date_suffix": "20221123",
}),
}
batch_def = BatchDefinition(**batch_request_parameters_3)
datasource.get_batch_from_batch_definition(batch_def)
What's up with `BatchRequest`, and why can't it detect my `group_names`?