This dataframe:
# Same three-row frame, built column-wise: "dob" is a datetime column whose
# last entry is missing (pandas stores it as NaT).
df = pd.DataFrame(
    {
        "name": ["Ross", "Rachel", "Phoebe"],
        "dob": [pd.Timestamp("1967-10-18"), pd.Timestamp("1968-05-05"), None],
    }
)
would cause the UserConfigurableProfiler to fail when using great_expectations <= 0.15.18. For example, for the dataframe above, the error is the following:
File ~/.local/lib/python3.9/site-packages/great_expectations/execution_engine/execution_engine.py:397, in ExecutionEngine.resolve_metrics(self, metrics_to_resolve, metrics, runtime_configuration)
393 resolved_metrics[metric_to_resolve.id] = metric_fn(
394 **metric_provider_kwargs
395 )
396 except Exception as e:
--> 397 raise ge_exceptions.MetricResolutionError(
398 message=str(e), failed_metrics=(metric_to_resolve,)
399 )
401 if len(metric_fn_bundle) > 0:
402 try:
403 # an engine-specific way of computing metrics together
404 # NOTE: DH 20220328: This is where we can introduce the Batch Metrics Store (BMS)
MetricResolutionError: Column values, min_value, and max_value must either be None or of the same type.
But when using great_expectations >= 0.15.19, the profiler no longer throws an error. I don't know whether it is silently swallowing the exception somehow. I went through the source code, but I couldn't find any relevant difference between 0.15.18 and 0.15.19, for example.
Steps to reproduce:
pip install great_expectations==0.15.18
import great_expectations
from great_expectations.core.batch import RuntimeBatchRequest
from great_expectations.checkpoint.checkpoint import SimpleCheckpoint
from great_expectations.data_context import BaseDataContext
from great_expectations.data_context.types.base import (
DataContextConfig,
FilesystemStoreBackendDefaults,
)
from great_expectations.profile.user_configurable_profiler import UserConfigurableProfiler, ExpectationConfiguration
import os
import pandas as pd
# Configure a file-backed data context rooted in the current working directory.
data_context_config = DataContextConfig(
    store_backend_defaults=FilesystemStoreBackendDefaults(
        root_directory=f"{os.getcwd()}/ge_gen_dir_2/"
    )
)
context = BaseDataContext(project_config=data_context_config)

# Create (or overwrite) the expectation suite the profiler will populate.
expectation_suite_name = "expectation_suite_yellow_tripdata_02"
suite = context.create_expectation_suite(
    expectation_suite_name=expectation_suite_name,
    overwrite_existing=True,
)

# Minimal dataframe that triggers the failure: "dob" is a datetime column
# containing one missing value (stored as NaT after construction).
df = pd.DataFrame(
    [
        {"name": "Ross", "dob": pd.Timestamp("1967-10-18")},
        {"name": "Rachel", "dob": pd.Timestamp("1968-05-05")},
        {"name": "Phoebe", "dob": None},
    ]
)

# Feed the in-memory dataframe to the context via a runtime batch request.
batch_request = RuntimeBatchRequest(
    datasource_name="dummy_datasource",
    data_connector_name="runtime_connector",
    data_asset_name="yellow_tripdata_01",
    batch_identifiers={
        "run_id": "yellow_tripdata_01",
    },
    runtime_parameters={"batch_data": df},
)

# In-memory pandas datasource whose runtime connector matches the request above.
datasource_config = {
    # was f'dummy_datasource': the f-string had no placeholders, so a plain
    # literal produces the identical value
    "name": "dummy_datasource",
    "class_name": "Datasource",
    "module_name": "great_expectations.datasource",
    "execution_engine": {
        "module_name": "great_expectations.execution_engine",
        "class_name": "PandasExecutionEngine",
    },
    "data_connectors": {
        "runtime_connector": {
            "class_name": "RuntimeDataConnector",
            "batch_identifiers": ["run_id"],
        },
    },
}
context.add_datasource(**datasource_config)

validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=expectation_suite_name,
)

# Profile every column. On great_expectations <= 0.15.18 this raises
# MetricResolutionError ("Column values, min_value, and max_value must either
# be None or of the same type.") while profiling the partially-null datetime
# column "dob"; on >= 0.15.19 it completes without error.
profiler = UserConfigurableProfiler(
    profile_dataset=validator,
    not_null_only=False,
    # when set to True, no expectations at column level will be generated
    table_expectations_only=False,
    ignored_columns=[],
    # Here you can provide a list of expectations to exclude from the profiler
    excluded_expectations=[],
)
profiler.build_suite()
If you want to run the same test with a library version that does not throw the error, simply install a later version, for example:
pip install great_expectations==0.15.21
Please help!