I've just spent the whole day trying to go through the documentation example for AWS SageMaker setting up a processing job. I created a 'part_rel_processing.ipynb' file which triggers the job and takes care of the input/output file paths and then I created a 'processing_participant_relationships.py' file which has the code to do the data preprocessing. It reads the data csv file from s3 bucket and then writes to the output path specified in the .ipynb file. But when I run the job in SageMaker processing jobs tab, it keeps failing and the error occurs at the call to ProcessingOutput(output_name="test_data", source="/opt/ml/processing/test") in .ipynb file. I am not sure WHY it is not happy with my output_name or path?
I've created a part_rel_processing.ipynb file with the following:
Block 1: import boto3 import sagemaker from sagemaker import get_execution_role from sagemaker.sklearn.processing import SKLearnProcessor
region = boto3.session.Session().region_name
role = get_execution_role()
sklearn_processor = SKLearnProcessor(
framework_version="0.20.0", role=role, instance_type="ml.m5.xlarge", instance_count=1
)
Block 2: from sagemaker.processing import ProcessingInput, ProcessingOutput
sklearn_processor.run(
code="processing_participant_relationships.py",
inputs=[
ProcessingInput(source="s3://sagemaker-us-east-1-898900188658/gun_violence_data", destination="/opt/ml/processing/input"),
],
outputs=[
ProcessingOutput(output_name="test_data", source="/opt/ml/processing/test"),<<----ERROR
],
)
Block 3: preprocessing_job_description = sklearn_processor.jobs[-1].describe()
output_config = preprocessing_job_description["ProcessingOutputConfig"]
for output in output_config["Outputs"]:
if output["OutputName"] == "test_data":
preprocessed_training_data = output["S3Output"]["S3Uri"]
And a processing script processing_participant_relationships.py:
import argparse
import os
import warnings
import pandas as pd
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.compose import make_column_transformer
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action="ignore", category=DataConversionWarning)
if __name__ == "__main__":
input_data_path = os.path.join("'/opt/ml/processing/input", "gun_violence.csv")
input_data_path = 's3://sagemaker-us-east-1-8989001XXXXX/gun_violence_data/gun-violence.csv'<-- wiped out bucket id for security issues
print("Reading input data from {}".format(input_data_path))
df = pd.read_csv(input_data_path)
df['suspect_rel'] = ''
# parse each column into readable form
for i, row in df.iterrows():
temp = row['participant_type']
#print("participant_type row %s" % temp)
if isinstance(temp, float):
continue
#match = re.findall('\d*::\d*Subject-Suspect', temp)
# get index number of suspect
match = re.findall('\d*::Subject-Suspect', temp)
if len(match) == 0:
continue
elif 'Subject-Suspect' not in match[0]:
continue
for keyval in match:
if '::' in str(keyval):
#print("keyval: %s" % keyval)
part_value = str(keyval).split('::')
part_index = part_value[0]
temp_age_group = row['participant_relationship']
if isinstance(row['participant_relationship'], float):
pass
else:
regex = part_index + '::(.*)'
#print("regex: %s" % regex)
#print("temp_age_group: %s" % temp_age_group)
if not isinstance(temp_age_group, float):
match_age = re.findall(regex, temp_age_group)
#print("match_age: %s" % match_age)
if len(match_age) != 0:
if '||' in match_age[0]:
element = match_age[0].split('||')
if element[0] == '':
pass
#print("empty element: --%s--" % element[0])
else:
df.at[i, 'suspect_rel'] = element
else:
if match_age[0] == '':
#print("do nothing")
pass
else:
df.at[i, 'suspect_rel'] = match_age
#print("i = %d" % i)
else:
continue
print("Preprocessing Suspect Relationship CSV: {}".format(df.shape))
parsed_suspect_relationship_output_path = os.path.join("/opt/ml/processing/participant_relationship", "parsed_suspect_relationship.csv")
Error Message in the Notebook Jobs window: enter image description here