To add parameters to a data pipeline, start by defining your parameters in a dataset; you can use Fusion for this. A straightforward schema has one column per parameter and a single row.
You can learn how to create Tables in Fusion spreadsheets here (public documentation), and how to synchronize them to a dataset here (public documentation).
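For illustration, a synced config dataset for the Config class defined below might look like the following. This is a minimal sketch that builds the equivalent one-row DataFrame directly with PySpark; in practice the dataset comes from the Fusion sync, and the column names and values here are just examples.

from pyspark.sql import SparkSession, types as T

spark = SparkSession.builder.getOrCreate()

# One column per parameter, exactly one row
config_schema = T.StructType([
    T.StructField("string_param", T.StringType()),
    T.StructField("string_param_with_default", T.StringType()),
    T.StructField("list_param", T.ArrayType(T.StringType())),
    T.StructField("int_param", T.IntegerType()),
])
example_config = spark.createDataFrame(
    [("alice", None, ["a", "b"], 42)],
    config_schema,
)
example_config.show()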
Once you have your dataset, create config.py in your code repository like so:
from typing import Any as _Any

from pyspark.sql import DataFrame as _DataFrame


class Config:
    def __init__(
        self,
        string_param: str,
        string_param_with_default: str,
        list_param: list,
        int_param: int,
        **kwargs: _Any,
    ) -> None:
        self.string_param = string_param
        # Fall back to sensible defaults when a config value is null
        self.string_param_with_default = string_param_with_default or "foo"
        self.list_param = list_param or []
        self.int_param = int_param or 1


def config_from_dataset(config_df: _DataFrame) -> Config:
    # The config dataset holds exactly one row; unpack it into a Config
    return Config(**config_df.collect()[0].asDict())
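As a quick, purely illustrative sanity check of the fallback values, you can build a Config from a plain dict the same way config_from_dataset does from a Row. The dict below is made up; it mirrors what config_df.collect()[0].asDict() would return, including an extra column that **kwargs silently absorbs.

row = {
    "string_param": "alice",
    "string_param_with_default": None,  # falls back to "foo"
    "list_param": None,                 # falls back to []
    "int_param": None,                  # falls back to 1
    "extra_column": "ignored",          # swallowed by **kwargs
}
config = Config(**row)
assert config.string_param_with_default == "foo"
assert config.list_param == []
assert config.int_param == 1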
After that, you can load and use your configuration like so:
from pyspark.sql import functions as F, types as T
from transforms.api import transform_df, Input, Output, Check
from transforms import expectations as E

from myproject.config import config_from_dataset


@transform_df(
    Output("path/to/output"),
    source_df=Input("path/to/input"),
    config=Input(
        "path/to/config",
        checks=[
            Check(E.count().equals(1), "Config should have exactly 1 row"),
            Check(
                E.schema().equals({
                    'string_param': T.StringType(),
                    'string_param_with_default': T.StringType(),
                    'list_param': T.ArrayType(T.StringType()),
                    'int_param': T.IntegerType(),
                }),
                "Config should have fixed schema",
            ),
        ],
    ),
)
def compute(source_df, config):
    # Turn the one-row config dataset into a Config object
    config = config_from_dataset(config)
    # Grab a parameter
    string_param = config.string_param
    # Use the parameter to drive the transform
    return source_df.filter(F.col("name").eqNullSafe(string_param))
In the code snippet above, I've included a Check on the config input that validates the shape of the configuration table before it is used.
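If you also want to validate the parameter values rather than just the table's shape, the same pattern extends to column-level expectations. The sketch below is a hypothetical extension of the config Input above; it assumes your version of the expectations library exposes E.col(...).non_null() and E.col(...).gt(...), so verify those helpers before relying on them.

config=Input(
    "path/to/config",
    checks=[
        Check(E.count().equals(1), "Config should have exactly 1 row"),
        # Hypothetical value-level checks; confirm these expectation
        # helpers exist in your transforms version before using them.
        Check(E.col("string_param").non_null(), "string_param must be set"),
        Check(E.col("int_param").gt(0), "int_param must be positive"),
    ],
)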