When using custom checks, I couldn't find a way to define a custom name for the check. I want something other than null or None to be displayed in the error log.
@register_check_method(statistics=["str_length"], check_type="element_wise")
def test_pyspark_check(pyspark_obj, *, str_length: int) -> bool:
    """Validate that every value in the target column has length ``str_length``.

    Args:
        pyspark_obj: pandera PySpark column accessor exposing ``dataframe``
            and ``column_name`` attributes (project type — assumed from the
            attribute access below; confirm against the pandera-pyspark docs).
        str_length: expected exact string length for the column's values.

    Returns:
        True when no row violates the expected length, False otherwise.
    """
    # Bug fix: the original collected up to 100 rows but compared only the
    # first one (`.collect()[0][0]`), silently ignoring every other row and
    # making the `limit(100)` dead weight. Instead, look for a single
    # counter-example and fail if one exists — `limit(1)` lets Spark stop
    # scanning as soon as a violation is found.
    violations = (
        pyspark_obj.dataframe
        .filter(F.length(F.col(pyspark_obj.column_name)) != str_length)
        .limit(1)
        .count()
    )
    return violations == 0
class PartitionSchema(DataFrameModel):
    """Pandera-on-PySpark schema describing one partition metadata row."""

    # Must be non-null; all other columns are optional.
    full_partition_name: T.StringType() = pa.Field(nullable=False)
    # Custom check: value must be a string of length exactly 1.
    probe: T.StringType() = pa.Field(nullable=True, test_pyspark_check={
        "str_length": 1
    })
    # Custom check: value must be a string of length exactly 9
    # (presumably a date-like token — TODO confirm the intended format).
    event_date: T.StringType() = pa.Field(nullable=True, test_pyspark_check={
        "str_length": 9
    })
    # No constraints beyond nullability.
    processing_time: T.StringType() = pa.Field(nullable=True)
Here you can see that I am not getting any useful information about the name of the check or the error:
{
"DATA": {
"DATAFRAME_CHECK": [
{
"schema": "PartitionSchema",
"column": "probe",
"check": null,
"error": "column 'probe' with type StringType failed validation None"
},
{
"schema": "PartitionSchema",
"column": "event_date",
"check": null,
"error": "column 'event_date' with type StringType failed validation None"
}
]
}
}
But if you use the standard check API in the Field class — here is an example from the documentation:
class PanderaSchema(DataFrameModel):
    """Example schema from the pandera documentation using built-in checks."""

    # Built-in check: value must be greater than 5.
    id: T.IntegerType() = pa.Field(gt=5)
    # Built-in check: value must start with "B".
    product_name: T.StringType() = pa.Field(str_startswith="B")
    # Type-only validation, no value checks.
    price: T.DecimalType(20, 5) = pa.Field()
    description: T.ArrayType(T.StringType()) = pa.Field()
    meta: T.MapType(T.StringType(), T.StringType()) = pa.Field()
Then you get a populated check key and a fully informative error:
{
"SCHEMA": {
"COLUMN_NOT_IN_DATAFRAME": [
{
"schema": "PanderaSchema",
"column": "PanderaSchema",
"check": "column_in_dataframe",
"error": "column 'product_name' not in dataframe\nRow(id=5, product='Bread', price=Decimal('44.40000'), description=['description of product'], meta={'product_category': 'dairy'})"
}
],
"WRONG_DATATYPE": [
{
"schema": "PanderaSchema",
"column": "description",
"check": "dtype('ArrayType(StringType(), True)')",
"error": "expected column 'description' to have type ArrayType(StringType(), True), got ArrayType(StringType(), False)"
},
{
"schema": "PanderaSchema",
"column": "meta",
"check": "dtype('MapType(StringType(), StringType(), True)')",
"error": "expected column 'meta' to have type MapType(StringType(), StringType(), True), got MapType(StringType(), StringType(), False)"
}
]
},
"DATA": {
"DATAFRAME_CHECK": [
{
"schema": "PanderaSchema",
"column": "id",
"check": "greater_than(5)",
"error": "column 'id' with type IntegerType() failed validation greater_than(5)"
}
]
}
}
Please note that in my version the check key is null, and the error key contains None instead of any useful information.
I tried all the available keyword arguments of register_check_method(), but none of them produced any improvement.