
I'm trying to run an Azure ML pipeline using the Azure ML Python SDK v2. The input to the pipeline is a Data asset whose data source is the default blob store; its path is azureml:raw_data_v2:1 and it is of type URI_FOLDER. I'm getting the following error when running the pipeline:

[2022-11-03 16:10:29Z] Job failed, job RunId is fca7d858-2b46-43bb-89e8-0481631eafbe. Error: {"Error":{"Code":"UserError","Severity":null,"Message":"{"NonCompliant":"ArgumentError(InvalidArgument { argument: \"arguments.path\", expected: \"Glob patterns inside the path are not supported by the volume mount.Path must be a direct path to the file or folder, or end with '/[a082bb6b7b039486a52e2427040accec] or '/[514faf5a71b4f0f67374c388f37aa0d7]/[4a2030f3ff4c5c8696e675e920aac45a] to match the entire content of the volume. \", actual: \"REDACTED\" })"}\n{\n "code": "data-capability.UriMountSession.PyFuseError", \n "target": "",\n "category": "UserError",\n "error_details": [\n {\n "key": "NonCompliantReason", \n "value": "ArgumentError(InvalidArgument { argument: \"arguments.path\", expected: \"Glob patterns inside the path are not supported by the volume mount.Path must be a direct path to the file or folder, or end with '/' or '//*' to match the entire content of the volume.\", actual: \"REDACTED\" })"\n }, \n {\n "key": "StackTrace",\n "value": " File \"/opt/miniconda/envs/data-capability/lib/python3.7/site-packages/data_capability/capability_session.py\", line 70, in start\n (data_path, sub_data_path) = session.start()\n\n File \"/opt/miniconda/envs/data-capability/lib/python3.7/site-packages/data_capability/data_sessions.py\", line 364, in start\n options=mnt_options\n\n File \"/opt/miniconda/envs/data-capability/lib/python3.7/site-packages/azureml/dataprep/fuse/dprepfuse.py\", line 696, in rslex_uri_volume_mount\n raise e\n\n File \"/opt/miniconda/envs/data-capability/lib/python3.7/site-packages/azureml/dataprep/fuse/dprepfuse.py\", line 690, in rslex_uri_volume_mount\n mount_context = RslexDirectURIMountContext(mount_point, uri, options)\n"\n }\n ]\n}", "MessageFormat":null,"MessageParameters":{},"ReferenceCode":null,"DetailsUri":null,"Target":null,"Details":[], "InnerError":null,"DebugInfo":null,"AdditionalInfo":null},"Correlation":null,"Environment":null,"Location":null, "Time":"0001-01-01T00:00:00+00:00","ComponentName":null}

The most important part is:

ArgumentError(InvalidArgument { argument: "arguments.path", expected: "Glob patterns inside the path are not supported by the volume mount. Path must be a direct path to the file or folder, or end with '/' or '//*' to match the entire content of the volume.", actual: "REDACTED" })

I'm assuming this is happening when it's trying to mount my input data to the Docker container that will run my pipeline, but I'm not sure. Here is my complete code for the pipeline:

from azure.identity import DefaultAzureCredential, ManagedIdentityCredential
from azure.ai.ml import MLClient, Input, Output, command
from azure.ai.ml.dsl import pipeline
from azure.ai.ml.constants import AssetTypes
from mldesigner import command_component
import os
import pandas as pd
import numpy as np
import glob
from PIL import Image
import json
import pickle

os.environ['AZURE_TENANT_ID'] = 'xxx-xxx-xxx-xxx'
credential = DefaultAzureCredential()
# Check if given credential can get token successfully.
credential.get_token("https://management.azure.com/.default")
ml_client = MLClient.from_config(credential=credential)

def get_val_test_filenames(input_ml_path, dataset):
    df = pd.read_csv(f'{input_ml_path}/evaluation/{dataset}_reference_slides.csv', encoding='utf-8')
    slides = df['Slide_id'].to_list()
    return slides

def create_id(path):
    parts = path.split('/')
    file_name = parts[-1][:-4]
    hash_str = parts[-2]
    doc = parts[-3]
    id = f'{doc}__{hash_str}__{file_name}'
    return id

def create_y_val(input_ml_path, val_files):
    y_val = []
    with open(f'{input_ml_path}/evaluation/golden_dev.json') as y_val_file:
        val_dict = json.load(y_val_file)
        for vf in val_files:
            sim_list = val_dict[vf]
            y_val.append(sim_list)
    return y_val # this should be list of lists

# x_train, x_val, x_test, y_val
def create_no_hier_datasets(input_ml_path, output_ml_path):
    print(f'************* inside create no hier datasets *********************')
    train_dir = f'{input_ml_path}/raw/images/final_slides/'

    val_slides = get_val_test_filenames(input_ml_path, 'val')
    test_slides = get_val_test_filenames(input_ml_path, 'test')
    x_train, x_val, x_test, y_val = [], [], [], []
    cnt = 0
    for filename in glob.iglob(train_dir + '**/thumb*.jpg', recursive=True):
        if 'small' in filename:
            continue
        
        img_np = np.asarray(Image.open(filename))
        if img_np.shape != (768, 1024, 3):
            print(f'{img_np.shape} does not equal (768, 1024, 3)')
            continue

        id = create_id(filename)
        if id in val_slides:
            x_val.append(img_np)
            y_val.append(filename)
        elif id in test_slides:
            x_test.append(img_np)
        else:
            x_train.append(img_np)
    
    x_train_np = np.asarray(x_train)
    x_val_np = np.asarray(x_val)
    x_test_np = np.asarray(x_test)
    y_val_list = create_y_val(input_ml_path, y_val)
    
    np.save(f"{output_ml_path}/x_train.npy", x_train_np)
    np.save(f"{output_ml_path}/x_val.npy", x_val_np)
    np.save(f"{output_ml_path}/x_test.npy", x_test_np)
    with open(f"{output_ml_path}/y_val.npy", 'wb') as fp:
        pickle.dump(y_val_list, fp)


output_version = '1'
input_version = '1'
@command_component(
    name="create_train_test_data",
    version="1",
    display_name="Create train and test data",
    description="creates train and test data",
    environment=dict(
        conda_file="conda.yml",
        image="mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04",
    )
)
def create_train_test_data_component(
    input_data: Input(type=AssetTypes.URI_FOLDER),
    output_data: Output(type=AssetTypes.URI_FOLDER),
):
    create_no_hier_datasets(input_data, output_data)


@pipeline(compute='cpu-cluster', description="pipeline to create train and test data")
def data_prep_pipeline(pipeline_input_data):
    create_data_node = create_train_test_data_component(input_data=pipeline_input_data)


raw_data_ds = Input(type=AssetTypes.URI_FOLDER, path="azureml:raw_data_v2:1")
output_data_ds = Output(type=AssetTypes.URI_FOLDER, path="azureml:train_test_data:1")
pipeline_job = data_prep_pipeline(pipeline_input_data=raw_data_ds)
pipeline_job = ml_client.jobs.create_or_update(pipeline_job, experiment_name="no_hierarchy")

As the error seems to be happening before my code begins to run, I really don't understand what's going wrong. Does anyone have experience with this?

gary69

1 Answer


It's by design that uri_folder doesn't support globbing in the path.

What's your scenario where you need globbing in the URI path?
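
For example, here is a minimal sketch of the distinction (the datastore path below is illustrative, not taken from your workspace): the path that gets mounted must point directly at a folder or file, and any glob filtering has to happen inside the job after the mount.

from azure.ai.ml import Input
from azure.ai.ml.constants import AssetTypes

# Not supported: a glob pattern embedded in the path that gets mounted
# (illustrative path; this is the kind of thing that triggers the error above)
# bad_input = Input(
#     type=AssetTypes.URI_FOLDER,
#     path="azureml://datastores/workspaceblobstore/paths/raw/images/**/thumb*.jpg",
# )

# Supported: a direct path to the folder; iterate/glob over it inside the job
good_input = Input(
    type=AssetTypes.URI_FOLDER,
    path="azureml://datastores/workspaceblobstore/paths/raw/images/",
)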

  • I'm not using a glob for the path, I'm using this `azureml:raw_data_v2:1`. I'm using a glob to iterate over the directory after it should be mounted, in this line `for filename in glob.iglob(train_dir + '**/thumb*.jpg', recursive=True)`, but I don't think it's reaching that part of the code – gary69 Nov 08 '22 at 22:34
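
Following up on that comment: since the Input only references azureml:raw_data_v2:1, it is worth ruling out a wildcard in the path that was stored when the raw_data_v2 asset was registered, because that stored path is what the volume mount resolves. A minimal sketch of checking it with the v2 SDK, reusing the ml_client from the question:

# Inspect the path recorded on the registered data asset; for a uri_folder
# it should be a direct folder path (typically ending in '/'), with no
# '*' or '**' anywhere in it.
asset = ml_client.data.get(name="raw_data_v2", version="1")
print(asset.type)  # expected: uri_folder
print(asset.path)  # should not contain glob characters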