I'm trying to run an Azure ML pipeline using the Azure ML Python SDK v2. The input to the pipeline is a data asset whose data source is the default blob datastore; its path is azureml:raw_data_v2:1 and it is of type URI_FOLDER.
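For reference, this is roughly how the asset was registered (a sketch from memory; "my_raw_data_folder" below is a placeholder for the real folder on the default blob datastore, not the exact path I used):

from azure.identity import DefaultAzureCredential
from azure.ai.ml import MLClient
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes

ml_client = MLClient.from_config(credential=DefaultAzureCredential())

# Register a folder on the default blob datastore as a uri_folder data asset.
raw_data = Data(
    name="raw_data_v2",
    version="1",
    type=AssetTypes.URI_FOLDER,
    path="azureml://datastores/workspaceblobstore/paths/my_raw_data_folder/",
)
ml_client.data.create_or_update(raw_data)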
I'm getting the following error when running the pipeline:
[2022-11-03 16:10:29Z] Job failed, job RunId is fca7d858-2b46-43bb-89e8-0481631eafbe. Error:
{"Error":{"Code":"UserError","Severity":null,"Message":"{"NonCompliant":"ArgumentError(InvalidArgument { argument: \"arguments.path\", expected: \"Glob patterns inside the path are not supported by the volume mount.Path must be a direct path to the file or folder, or end with '/**' or '/*/**' to match the entire content of the volume. \", actual: \"REDACTED\" })"}
{
  "code": "data-capability.UriMountSession.PyFuseError",
  "target": "",
  "category": "UserError",
  "error_details": [
    {
      "key": "NonCompliantReason",
      "value": "ArgumentError(InvalidArgument { argument: \"arguments.path\", expected: \"Glob patterns inside the path are not supported by the volume mount.Path must be a direct path to the file or folder, or end with '/**' or '/*/**' to match the entire content of the volume.\", actual: \"REDACTED\" })"
    },
    {
      "key": "StackTrace",
      "value": "  File \"/opt/miniconda/envs/data-capability/lib/python3.7/site-packages/data_capability/capability_session.py\", line 70, in start
    (data_path, sub_data_path) = session.start()

  File \"/opt/miniconda/envs/data-capability/lib/python3.7/site-packages/data_capability/data_sessions.py\", line 364, in start
    options=mnt_options

  File \"/opt/miniconda/envs/data-capability/lib/python3.7/site-packages/azureml/dataprep/fuse/dprepfuse.py\", line 696, in rslex_uri_volume_mount
    raise e

  File \"/opt/miniconda/envs/data-capability/lib/python3.7/site-packages/azureml/dataprep/fuse/dprepfuse.py\", line 690, in rslex_uri_volume_mount
    mount_context = RslexDirectURIMountContext(mount_point, uri, options)"
    }
  ]
}",
"MessageFormat":null,"MessageParameters":{},"ReferenceCode":null,"DetailsUri":null,"Target":null,"Details":[],"InnerError":null,"DebugInfo":null,"AdditionalInfo":null},
"Correlation":null,"Environment":null,"Location":null,"Time":"0001-01-01T00:00:00+00:00","ComponentName":null}
The most important part is:
ArgumentError(InvalidArgument { argument: "arguments.path", expected: "Glob patterns inside the path are not supported by the volume mount.Path must be a direct path to the file or folder, or end with '/**' or '/*/**' to match the entire content of the volume.
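If I'm reading that right, the mount layer only accepts either a direct file/folder path or a path ending in one of those glob suffixes. Something like this is what I understand it to mean (illustrative URIs with placeholder datastore/folder names, not my actual asset):

# Illustrative path forms only; "workspaceblobstore" and "raw_data" are placeholders.
direct_folder = "azureml://datastores/workspaceblobstore/paths/raw_data/"              # allowed: direct folder path
whole_volume = "azureml://datastores/workspaceblobstore/paths/raw_data/**"             # allowed: matches entire content
inner_glob = "azureml://datastores/workspaceblobstore/paths/raw_data/*/thumb*.jpg"     # rejected: glob inside the path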
I'm assuming this happens when Azure ML tries to mount my input data into the Docker container that will run the pipeline, but I'm not sure. Here is my complete code for the pipeline:
from azure.identity import DefaultAzureCredential, ManagedIdentityCredential
from azure.ai.ml import MLClient, Input, Output, command
from azure.ai.ml.dsl import pipeline
from azure.ai.ml.constants import AssetTypes
from mldesigner import command_component
import os
import pandas as pd
import numpy as np
import glob
from PIL import Image
import json
import pickle
os.environ['AZURE_TENANT_ID'] = 'xxx-xxx-xxx-xxx'
credential = DefaultAzureCredential()
# Check if given credential can get token successfully.
credential.get_token("https://management.azure.com/.default")
ml_client = MLClient.from_config(credential=credential)
def get_val_test_filenames(input_ml_path, dataset):
    df = pd.read_csv(f'{input_ml_path}/evaluation/{dataset}_reference_slides.csv', encoding='utf-8')
    slides = df['Slide_id'].to_list()
    return slides
def create_id(path):
    parts = path.split('/')
    file_name = parts[-1][:-4]
    hash_str = parts[-2]
    doc = parts[-3]
    id = f'{doc}__{hash_str}__{file_name}'
    return id
def create_y_val(input_ml_path, val_files):
    y_val = []
    with open(f'{input_ml_path}/evaluation/golden_dev.json') as y_val_file:
        val_dict = json.load(y_val_file)
    for vf in val_files:
        sim_list = val_dict[vf]
        y_val.append(sim_list)
    return y_val  # this should be a list of lists
# x_train, x_val, x_test, y_val
def create_no_hier_datasets(input_ml_path, output_ml_path):
    print(f'************* inside create no hier datasets *********************')
    train_dir = f'{input_ml_path}/raw/images/final_slides/'
    val_slides = get_val_test_filenames(input_ml_path, 'val')
    test_slides = get_val_test_filenames(input_ml_path, 'test')
    x_train, x_val, x_test, y_val = [], [], [], []
    cnt = 0
    for filename in glob.iglob(train_dir + '**/thumb*.jpg', recursive=True):
        if 'small' in filename:
            continue
        img_np = np.asarray(Image.open(filename))
        if img_np.shape != (768, 1024, 3):
            print(f'{img_np.shape} does not equal (768, 1024, 3)')
            continue
        # route each slide image into the val/test/train split based on its id
        id = create_id(filename)
        if id in val_slides:
            x_val.append(img_np)
            y_val.append(filename)
        elif id in test_slides:
            x_test.append(img_np)
        else:
            x_train.append(img_np)
    # convert to arrays and write everything to the output folder
    x_train_np = np.asarray(x_train)
    x_val_np = np.asarray(x_val)
    x_test_np = np.asarray(x_test)
    y_val_list = create_y_val(input_ml_path, y_val)
    np.save(f"{output_ml_path}/x_train.npy", x_train_np)
    np.save(f"{output_ml_path}/x_val.npy", x_val_np)
    np.save(f"{output_ml_path}/x_test.npy", x_test_np)
    with open(f"{output_ml_path}/y_val.npy", 'wb') as fp:
        pickle.dump(y_val_list, fp)
output_version = '1'
input_version = '1'
@command_component(
    name="create_train_test_data",
    version="1",
    display_name="Create train and test data",
    description="creates train and test data",
    environment=dict(
        conda_file="conda.yml",
        image="mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04",
    ),
)
def create_train_test_data_component(
    input_data: Input(type=AssetTypes.URI_FOLDER),
    output_data: Output(type=AssetTypes.URI_FOLDER),
):
    create_no_hier_datasets(input_data, output_data)
@pipeline(compute='cpu-cluster', description="pipeline to create train and test data")
def data_prep_pipeline(pipeline_input_data):
    create_data_node = create_train_test_data_component(input_data=pipeline_input_data)
raw_data_ds = Input(type=AssetTypes.URI_FOLDER, path="azureml:raw_data_v2:1")
output_data_ds = Output(type=AssetTypes.URI_FOLDER, path="azureml:train_test_data:1")
pipeline_job = data_prep_pipeline(pipeline_input_data=raw_data_ds)
pipeline_job = ml_client.jobs.create_or_update(pipeline_job, experiment_name="no_hierarchy")
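For what it's worth, dumping the registered asset shows what the job will actually try to mount; this is just a diagnostic sketch I can run with the same ml_client as above:

# Diagnostic sketch: inspect the registered data asset to see exactly what
# path/type the pipeline input resolves to before it gets mounted.
raw_asset = ml_client.data.get(name="raw_data_v2", version="1")
print(raw_asset.type)  # expecting uri_folder
print(raw_asset.path)  # the underlying datastore/blob URI that will be mounted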
As the error seems to happen before my code starts running, I really don't understand what's going wrong. Does anyone have experience with this?