0

Using custom checks, I couldn't find out how to define a custom name for a check. I want something other than null or None to be displayed in the error log.


@register_check_method(statistics=["str_length"], check_type="element_wise")
def test_pyspark_check(pyspark_obj, *, str_length: int) -> bool:
    """Custom pandera check comparing a column's string length to *str_length*.

    Args:
        pyspark_obj: pandera's pyspark wrapper exposing ``.dataframe`` and
            ``.column_name`` — presumably ``PysparkDataframeColumnObject``;
            TODO confirm against the pandera pyspark backend.
        str_length: the expected string length, injected by pandera from the
            ``{"str_length": ...}`` dict passed to ``pa.Field``.

    Returns:
        True when the comparison passes, False otherwise.

    NOTE(review): despite ``check_type="element_wise"``, the return expression
    inspects only ``collect()[0][0]`` — the computed length of the *first* of
    up to 100 collected rows — so rows 2..100 are never actually validated.
    Confirm whether per-row validation was intended.
    """
    # Derive a "length" column from the column under validation.
    check_df = pyspark_obj.dataframe.withColumn("length", F.length(F.col(pyspark_obj.column_name)))
    return check_df.select("length").limit(100).collect()[0][0] == str_length

class PartitionSchema(DataFrameModel):
    """Pandera schema for partition metadata; every column is a pyspark StringType.

    ``probe`` and ``event_date`` additionally run the custom
    ``test_pyspark_check`` (referenced in ``pa.Field`` by the registered
    check's keyword name, with its ``str_length`` statistic supplied as a
    dict). This is the schema whose validation output shows ``"check": null``.
    """
    # Partition identifier; the only non-nullable column here.
    full_partition_name: T.StringType() = pa.Field(nullable=False)
    # Custom check: expected string length of 1 — presumably a single-char code.
    probe: T.StringType() = pa.Field(nullable=True, test_pyspark_check={
                "str_length": 1
            })
    # Custom check: expected string length of 9 — TODO confirm the date format
    # this corresponds to (9 chars does not match YYYY-MM-DD or YYYYMMDD).
    event_date: T.StringType() = pa.Field(nullable=True, test_pyspark_check={
                "str_length": 9
            })
    # No custom check on this column.
    processing_time: T.StringType() = pa.Field(nullable=True)

Here you can see that I am not getting any useful information in either the check name or the error message:

{
    "DATA": {
        "DATAFRAME_CHECK": [
            {
                "schema": "PartitionSchema",
                "column": "probe",
                "check": null,
                "error": "column 'probe' with type StringType failed validation None"
            },
            {
                "schema": "PartitionSchema",
                "column": "event_date",
                "check": null,
                "error": "column 'event_date' with type StringType failed validation None"
            }
        ]
    }
}

But if you use the standard check API in the Field class — here is an example from the documentation:

class PanderaSchema(DataFrameModel):
    """Documentation-example schema using only pandera's built-in Field checks.

    Unlike the custom-check schema, validating against this one yields fully
    populated ``check`` names (e.g. ``greater_than(5)``) in the error report.
    """
    # Built-in check: values must be strictly greater than 5.
    id: T.IntegerType() = pa.Field(gt=5)
    # Built-in check: values must start with "B".
    product_name: T.StringType() = pa.Field(str_startswith="B")
    # Remaining columns carry no value checks, only dtype validation.
    price: T.DecimalType(20, 5) = pa.Field()
    description: T.ArrayType(T.StringType()) = pa.Field()
    meta: T.MapType(T.StringType(), T.StringType()) = pa.Field()

Then you get a populated check key and a fully informative error message:

{
    "SCHEMA": {
        "COLUMN_NOT_IN_DATAFRAME": [
            {
                "schema": "PanderaSchema",
                "column": "PanderaSchema",
                "check": "column_in_dataframe",
                "error": "column 'product_name' not in dataframe\nRow(id=5, product='Bread', price=Decimal('44.40000'), description=['description of product'], meta={'product_category': 'dairy'})"
            }
        ],
        "WRONG_DATATYPE": [
            {
                "schema": "PanderaSchema",
                "column": "description",
                "check": "dtype('ArrayType(StringType(), True)')",
                "error": "expected column 'description' to have type ArrayType(StringType(), True), got ArrayType(StringType(), False)"
            },
            {
                "schema": "PanderaSchema",
                "column": "meta",
                "check": "dtype('MapType(StringType(), StringType(), True)')",
                "error": "expected column 'meta' to have type MapType(StringType(), StringType(), True), got MapType(StringType(), StringType(), False)"
            }
        ]
    },
    "DATA": {
        "DATAFRAME_CHECK": [
            {
                "schema": "PanderaSchema",
                "column": "id",
                "check": "greater_than(5)",
                "error": "column 'id' with type IntegerType() failed validation greater_than(5)"
            }
        ]
    }
}

Please note that in my version the check key is null, and the error message contains None instead of any useful information.

I tried all the available parameters of register_check_method(), but none of them produced any change in the output.

0 Answers