I'm trying to setup a simple KubeFlow pipeline, and I'm having trouble packaging up dependencies in a way that works for KubeFlow.
The code simply downloads a config file and parses it, then passes back the parsed configuration.
However, in order to parse the config file, it needs to have access to another internal python package.
I have a .tar.gz
archive of the package hosted on a bucket in the same project, and added the URL of the package as a dependency, but I get an error message saying tarfile.ReadError: not a gzip file
.
I know the file is good, so it's some intermediate issue with hosting on a bucket or the way kubeflow installs dependencies.
Here is a minimal example:
from kfp import compiler
from kfp import dsl
from kfp.components import func_to_container_op
from google.protobuf import text_format
from google.cloud import storage
import training_reader
def get_training_config(working_bucket: str,
working_directoy: str,
config_file: str) -> training_reader.TrainEvalPipelineConfig:
download_file(working_bucket, os.path.join(working_directoy, config_file), "ssd.config")
pipeline_config = training_reader.TrainEvalPipelineConfig()
with open("ssd.config", 'r') as f:
text_format.Merge(f.read(), pipeline_config)
return pipeline_config
config_op_packages = ["https://storage.cloud.google.com/my_bucket/packages/training-reader-0.1.tar.gz",
"google-cloud-storage",
"protobuf"
]
training_config_op = func_to_container_op(get_training_config,
base_image="tensorflow/tensorflow:1.15.2-py3",
packages_to_install=config_op_packages)
def output_config(config: training_reader.TrainEvalPipelineConfig) -> None:
print(config)
output_config_op = func_to_container_op(output_config)
@dsl.pipeline(
name='Post Training Processing',
description='Building the post-processing pipeline'
)
def ssd_postprocessing_pipeline(
working_bucket: str,
working_directory: str,
config_file:str):
config = training_config_op(working_bucket, working_directory, config_file)
output_config_op(config.output)
pipeline_name = ssd_postprocessing_pipeline.__name__ + '.zip'
compiler.Compiler().compile(ssd_postprocessing_pipeline, pipeline_name)