0

I've just spent the whole day trying to go through the documentation example for AWS SageMaker setting up a processing job. I created a 'part_rel_processing.ipynb' file which triggers the job and takes care of the input/output file paths and then I created a 'processing_participant_relationships.py' file which has the code to do the data preprocessing. It reads the data csv file from s3 bucket and then writes to the output path specified in the .ipynb file. But when I run the job in SageMaker processing jobs tab, it keeps failing and the error occurs at the call to ProcessingOutput(output_name="test_data", source="/opt/ml/processing/test") in .ipynb file. I am not sure WHY it is not happy with my output_name or path?

I've created a part_rel_processing.ipynb file with the following:

Block 1: import boto3 import sagemaker from sagemaker import get_execution_role from sagemaker.sklearn.processing import SKLearnProcessor

region = boto3.session.Session().region_name

role = get_execution_role()
sklearn_processor = SKLearnProcessor(
    framework_version="0.20.0", role=role, instance_type="ml.m5.xlarge", instance_count=1
)

Block 2: from sagemaker.processing import ProcessingInput, ProcessingOutput

sklearn_processor.run(
    code="processing_participant_relationships.py",
    inputs=[
        ProcessingInput(source="s3://sagemaker-us-east-1-898900188658/gun_violence_data", destination="/opt/ml/processing/input"),
    ],
    outputs=[
        ProcessingOutput(output_name="test_data", source="/opt/ml/processing/test"),<<----ERROR
    ],
)

Block 3: preprocessing_job_description = sklearn_processor.jobs[-1].describe()

output_config = preprocessing_job_description["ProcessingOutputConfig"]
for output in output_config["Outputs"]:
    if output["OutputName"] == "test_data":
        preprocessed_training_data = output["S3Output"]["S3Uri"]

And a processing script processing_participant_relationships.py:

import argparse
import os
import warnings

import pandas as pd
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.compose import make_column_transformer
from sklearn.exceptions import DataConversionWarning

warnings.filterwarnings(action="ignore", category=DataConversionWarning)


if __name__ == "__main__":
   
    input_data_path = os.path.join("'/opt/ml/processing/input", "gun_violence.csv")
    input_data_path = 's3://sagemaker-us-east-1-8989001XXXXX/gun_violence_data/gun-violence.csv'<-- wiped out bucket id for security issues

    print("Reading input data from {}".format(input_data_path))
    df = pd.read_csv(input_data_path)
   
    df['suspect_rel'] = ''

    # parse each column into readable form
    for i, row in df.iterrows():

        temp = row['participant_type']
        #print("participant_type row %s" % temp)

        if isinstance(temp, float):
            continue
        #match = re.findall('\d*::\d*Subject-Suspect', temp)

        # get index number of suspect
        match = re.findall('\d*::Subject-Suspect', temp)

        if len(match) == 0:
            continue
        elif 'Subject-Suspect' not in match[0]:
            continue


        for keyval in match:
            if '::' in str(keyval):
                #print("keyval: %s" % keyval)
                part_value = str(keyval).split('::')
                part_index = part_value[0]

                temp_age_group = row['participant_relationship']
                if isinstance(row['participant_relationship'], float):
                    pass
                else:
                    regex = part_index + '::(.*)'
                    #print("regex: %s" % regex)
                    #print("temp_age_group: %s" % temp_age_group)
                    if not isinstance(temp_age_group, float):
                        match_age = re.findall(regex, temp_age_group)
                        #print("match_age: %s" % match_age)
                        if len(match_age) != 0:
                            if '||' in match_age[0]:
                                element = match_age[0].split('||')
                                if element[0] == '':
                                    pass
                                    #print("empty element: --%s--" % element[0])
                                else:
                                    df.at[i, 'suspect_rel'] = element
                            else:
                                if match_age[0] == '':
                                    #print("do nothing")
                                    pass
                                else:
                                    df.at[i, 'suspect_rel'] = match_age
                                #print("i = %d" % i)
                        else:
                            continue

        print("Preprocessing Suspect Relationship CSV: {}".format(df.shape))

        parsed_suspect_relationship_output_path = os.path.join("/opt/ml/processing/participant_relationship", "parsed_suspect_relationship.csv")

Error Message in the Notebook Jobs window: enter image description here

0 Answers0