I am trying to set up a code-based (in-memory) Data Context, with all stores and Data Docs hosted in Azure on a storage account's static website. Here are the config and test code:
import yaml

from great_expectations.core.batch import RuntimeBatchRequest
from great_expectations.data_context import BaseDataContext
from great_expectations.data_context.types.base import (
    DataContextConfig,
    DatasourceConfig,
)

connection_str = "<con string>"
data_context_config = DataContextConfig(
    config_version=2,
    plugins_directory=None,
    config_variables_file_path=None,
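    # Spark datasource with a RuntimeDataConnector, so batches can be passed in-memory at runtime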
    datasources={
        "spark_datasource": DatasourceConfig(
            class_name="Datasource",
            execution_engine={"class_name": "SparkDFExecutionEngine"},
            data_connectors={
                "spark_connector": {
                    "module_name": "great_expectations.datasource.data_connector",
                    "class_name": "RuntimeDataConnector",
                    "batch_identifiers": [
                        "some_key_maybe_pipeline_stage",
                        "some_other_key_maybe_run_id",
                    ],
                }
            },
        )
    },
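    # Every metadata store is backed by blob storage in the static website ($web) container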
    stores={
        "expectations_AZ_store": {
            "class_name": "ExpectationsStore",
            "store_backend": {
                "class_name": "TupleAzureBlobStoreBackend",
                "container": "\\$web",
                "prefix": "",
                "connection_string": connection_str,
            },
        },
        "validations_AZ_store": {
            "class_name": "ValidationsStore",
            "store_backend": {
                "class_name": "TupleAzureBlobStoreBackend",
                "container": "\\$web",
                "prefix": "",
                "connection_string": connection_str,
            },
        },
        "evaluation_parameter_AZ_store": {
            "class_name": "EvaluationParameterStore",
            "store_backend": {
                "class_name": "TupleAzureBlobStoreBackend",
                "container": "\\$web",
                "prefix": "",
                "connection_string": connection_str,
            },
        },
        "checkpoint_AZ_store": {
            "class_name": "CheckpointStore",
            "store_backend": {
                "class_name": "TupleAzureBlobStoreBackend",
                "container": "\\$web",
                "prefix": "",
                "connection_string": connection_str,
            },
        },
    },
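    # Wire the context to the Azure-backed stores defined above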
    expectations_store_name="expectations_AZ_store",
    validations_store_name="validations_AZ_store",
    evaluation_parameter_store_name="evaluation_parameter_AZ_store",
    checkpoint_store_name="checkpoint_AZ_store",
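    # Data Docs are written to the same $web container so the static site serves them directly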
    data_docs_sites={
        "az_site": {
            "class_name": "SiteBuilder",
            "store_backend": {
                "class_name": "TupleAzureBlobStoreBackend",
                "container": "\\$web",
                "prefix": "data_docs",
            },
            "site_index_builder": {
                "class_name": "DefaultSiteIndexBuilder",
                "show_cta_footer": True,
            },
        }
    },
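    # Standard action list: store validation results and evaluation params, then rebuild Data Docs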
    validation_operators={
        "action_list_operator": {
            "class_name": "ActionListValidationOperator",
            "action_list": [
                {
                    "name": "store_validation_result",
                    "action": {"class_name": "StoreValidationResultAction"},
                },
                {
                    "name": "store_evaluation_params",
                    "action": {"class_name": "StoreEvaluationParametersAction"},
                },
                {
                    "name": "update_data_docs",
                    "action": {"class_name": "UpdateDataDocsAction"},
                },
            ],
        }
    },
)
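# Load a sample dataset (running on Databricks, where a spark session already exists)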
df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/databricks-datasets/nyctaxi/tripdata/yellow/yellow_tripdata_2019-01.csv.gz")
)
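# Instantiate the in-code data context from the config above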
context = BaseDataContext(project_config=data_context_config)
expectation_suite_name = "test_expectation_suite"
context.create_expectation_suite(
    expectation_suite_name=expectation_suite_name, overwrite_existing=True
)
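# Runtime batch request handing the in-memory DataFrame to the runtime connector
# (the names and batch identifiers match the connector and checkpoint config below)
batch_request = RuntimeBatchRequest(
    datasource_name="spark_datasource",
    data_connector_name="spark_connector",
    data_asset_name="test_data_asset",
    runtime_parameters={"batch_data": df},
    batch_identifiers={
        "some_key_maybe_pipeline_stage": "prod",
        "some_other_key_maybe_run_id": "my_run_name_test",
    },
)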
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=expectation_suite_name,
)
# print(validator.head())
validator.expect_column_values_to_not_be_null(column="passenger_count")
validator.expect_column_values_to_be_between(
    column="congestion_surcharge", min_value=0, max_value=1000
)
validator.save_expectation_suite(discard_failed_expectations=False)
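# SimpleCheckpoint that validates the runtime batch against the suite and runs the standard actions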
my_checkpoint_name = "test_checkpoint"
checkpoint_config = {
    "name": my_checkpoint_name,
    "config_version": 1.0,
    "class_name": "SimpleCheckpoint",
    "run_name_template": "%Y%m%d-%H%M%S-test_run",
    "validations": [
        {
            "batch_request": {
                "datasource_name": "spark_datasource",
                "data_connector_name": "spark_connector",
                "data_asset_name": "test_data_asset",
            },
            "expectation_suite_name": expectation_suite_name,
            "action_list": [
                {
                    "name": "store_validation_result",
                    "action": {"class_name": "StoreValidationResultAction"},
                },
                {
                    "name": "store_evaluation_params",
                    "action": {"class_name": "StoreEvaluationParametersAction"},
                },
                {
                    "name": "update_data_docs",
                    "action": {"class_name": "UpdateDataDocsAction"},
                },
            ],
        }
    ],
}
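# Sanity-check the config, register the checkpoint, then run it with the DataFrame as the runtime batch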
my_checkpoint = context.test_yaml_config(yaml.dump(checkpoint_config))
context.add_checkpoint(**checkpoint_config)
checkpoint_result = context.run_checkpoint(
    checkpoint_name=my_checkpoint_name,
    batch_request={
        "runtime_parameters": {"batch_data": df},
        "batch_identifiers": {
            "some_key_maybe_pipeline_stage": "prod",
            "some_other_key_maybe_run_id": "my_run_name_test",
        },
    },
)
However, when I run it, it fails at context.add_checkpoint(**checkpoint_config) with the error message: ValidationMetricIdentifier tuple must have at least six components.
I would appreciate any help, as I am struggling to get this working.