How to validate a list of custom dictionaries - schemas with Cerberus in Python

Question

I have a basic data_schema and I produce a list of many data points. Each data point follows my data_schema. How can I validate them all at once, as part of the list? The reason I wish to do that is speed: iterating through the list and validating each data point individually takes too much time.

I am looking for a way to validate the whole list with Cerberus and hopefully to do it faster than validating each individual data point.

My code at the moment is this:

from cerberus import Validator
from time import time
from faker import Faker
import numpy as np

# Seed both random sources used by create_data so the generated data is
# reproducible: Faker provides the names, cities, booleans and floats, while
# the closeness value is drawn from NumPy's global random generator.
Faker.seed(0)
np.random.seed(0)

fake = Faker()


def create_data(x: int) -> list:
    """Create a list of ``x`` fake data points.

    Each data point is a dictionary with the keys ``name``, ``city``,
    ``closeness (1-5)``, ``extrovert`` and ``favorite_temperature``.

    Args:
        x: Number of data points to generate. Zero or a negative number
            yields an empty list.

    Returns:
        A list containing ``x`` dictionaries.
    """
    return [
        {
            "name": fake.name(),
            "city": fake.city(),
            # np.random.randint excludes its upper bound, so 6 is required
            # for the value to cover the full 1-5 range named by the key.
            "closeness (1-5)": np.random.randint(1, 6),
            "extrovert": fake.pybool(),
            "favorite_temperature": fake.pyfloat(left_digits=2, right_digits=2),
        }
        for _ in range(x)
    ]


data = create_data(10000)

# Rules for one data point: every field name paired with its Cerberus type.
data_schema = {
    "type": "dict",
    "schema": {
        field: {"type": kind}
        for field, kind in (
            ("name", "string"),
            ("city", "string"),
            ("closeness (1-5)", "number"),
            ("extrovert", "boolean"),
            ("favorite_temperature", "number"),
        )
    },
}

# Attempt to describe the whole list with a single top-level schema.
list_schema = {"type": "list", "schema": data_schema}

validator = Validator(list_schema)

# Per-item alternative that the single call below is meant to replace:
# for item in data:
#    validator.validate(item)
t0 = time()
validator.validate(data)
t1 = time()
duration = t1 - t0
print(f"Cerberus validation lasted: {duration} sec")

which does not do what I was hoping it to do. I get this error:

cerberus.schema.SchemaError: {'type': ['must be of dict type']}

score 0 · Answer 1 · answered Jun 18 '22 at 07:58

OK, I think I found the problem. We need to pass the data to Cerberus in a dictionary format, so the final list of data should be a value inside a dictionary.

To understand this better, look at the working code below, where I added data_dict = {"data": data}. Instead of passing the list data to the validator directly, I wrap it inside a very simple dictionary with one key-value pair whose value is the list. The key can be anything, as long as list_schema uses the same key:

from cerberus import Validator
from time import time
from faker import Faker
import numpy as np

# Seed both random sources used by create_data so the generated data is
# reproducible: Faker provides the names, cities, booleans and floats, while
# the closeness value is drawn from NumPy's global random generator.
Faker.seed(0)
np.random.seed(0)

fake = Faker()


def create_data(x: int) -> list:
    """Create a list of ``x`` fake data points.

    Each data point is a dictionary with the keys ``name``, ``city``,
    ``closeness (1-5)``, ``extrovert`` and ``favorite_temperature``.

    Args:
        x: Number of data points to generate. Zero or a negative number
            yields an empty list.

    Returns:
        A list containing ``x`` dictionaries.
    """
    return [
        {
            "name": fake.name(),
            "city": fake.city(),
            # np.random.randint excludes its upper bound, so 6 is required
            # for the value to cover the full 1-5 range named by the key.
            "closeness (1-5)": np.random.randint(1, 6),
            "extrovert": fake.pybool(),
            "favorite_temperature": fake.pyfloat(left_digits=2, right_digits=2),
        }
        for _ in range(x)
    ]


data = create_data(10000)

# Rules that every individual data point (a dictionary) must satisfy.
data_schema = {
    "type": "dict",
    "schema": {
        "name": {"type": "string"},
        "city": {"type": "string"},
        "closeness (1-5)": {"type": "number"},
        "extrovert": {"type": "boolean"},
        "favorite_temperature": {"type": "number"},
    },
}

# The top-level schema maps a field name to its rules, so the list of data
# points is validated as the value of the "data" field.
list_schema = {
    "data": {
        "type": "list",
        "schema": data_schema,
    }
}

# Wrap the list under the same key that list_schema declares.
data_dict = {"data": data}
validator = Validator(list_schema)

t0 = time()
is_valid = validator.validate(data_dict)
t1 = time()
duration = t1 - t0
print(f"Cerberus validation lasted: {duration} sec")
# Report failures explicitly; otherwise an invalid document would go
# unnoticed because only the timing is printed.
if not is_valid:
    print(f"Cerberus validation errors: {validator.errors}")

How to validate a list of custom dictionaries - schemas with Cerberus in Python

1 Answer