So, I deployed 2 SageMaker endpoints (of Large Language Models) following the official AWS tutorial notebook here: https://github.com/aws/amazon-sagemaker-examples/blob/main/introduction_to_amazon_algorithms/jumpstart-foundation-models/question_answering_retrieval_augmented_generation/question_answering_langchain_jumpstart.ipynb
However, I am unable to invoke the endpoints for inference due to an error that keeps asking for an offload folder. My notebook instance is an ml.g5.xlarge.
Below is the code where I set the model config, deploy the endpoints, and try to invoke them for inference. I even edited the query_endpoint_with_json_payload() function to include an offload_folder parameter:
def query_endpoint_with_json_payload(encoded_json, endpoint_name, content_type="application/json", offload_folder=None):
    client = boto3.client("runtime.sagemaker")
    if offload_folder is not None:
        response = client.invoke_endpoint(
            EndpointName=endpoint_name, ContentType=content_type, Body=encoded_json, CustomAttributes=f'{{"offload_folder":"{offload_folder}"}}'
        )
    else:
        response = client.invoke_endpoint(
            EndpointName=endpoint_name,
            ContentType=content_type,
            Body=encoded_json
        )
    return response
def parse_response_model_flan_t5(query_response):
    model_predictions = json.loads(query_response["Body"].read())
    generated_text = model_predictions["generated_texts"]
    return generated_text

def parse_response_multiple_texts_bloomz(query_response):
    generated_text = []
    model_predictions = json.loads(query_response["Body"].read())
    for x in model_predictions[0]:
        generated_text.append(x["generated_text"])
    return generated_text
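(For clarity, these parsers assume response bodies shaped roughly as follows; this is inferred from the indexing above, with placeholder text.)

# Example response shapes the two parsers expect (placeholder text only)
flan_t5_body = {"generated_texts": ["<answer text>"]}
bloomz_body = [[{"generated_text": "<answer text>"}]]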
Model Configuration:
_MODEL_CONFIG_ = {
    "huggingface-text2text-flan-t5-xxl": {
        "instance type": "ml.g5.xlarge",
        "env": {"SAGEMAKER_MODEL_SERVER_WORKERS": "1", "TS_DEFAULT_WORKERS_PER_MODEL": "1"},
        "parse_function": parse_response_model_flan_t5,
        "prompt": """Answer based on context:\n\n{context}\n\n{question}""",
    },
    "huggingface-textembedding-gpt-j-6b": {
        "instance type": "ml.g5.xlarge",
        "env": {"SAGEMAKER_MODEL_SERVER_WORKERS": "1", "TS_DEFAULT_WORKERS_PER_MODEL": "1"},
    },
}
Deployment:
newline, bold, unbold = "\n", "\033[1m", "\033[0m"

for model_id in _MODEL_CONFIG_:
    endpoint_name = name_from_base(f"jumpstart-example-raglc-{model_id}")
    inference_instance_type = _MODEL_CONFIG_[model_id]["instance type"]
    # offload_folder = _MODEL_CONFIG_[model_id].get("offload_folder")  # Get the offload folder for the model

    # Retrieve the inference container uri. This is the base HuggingFace container image for the default model above.
    deploy_image_uri = image_uris.retrieve(
        region=None,
        framework=None,  # automatically inferred from model_id
        image_scope="inference",
        model_id=model_id,
        model_version=model_version,
        instance_type=inference_instance_type,
    )
    # Retrieve the model uri.
    model_uri = model_uris.retrieve(
        model_id=model_id, model_version=model_version, model_scope="inference"
    )
    model_inference = Model(
        image_uri=deploy_image_uri,
        model_data=model_uri,
        role=aws_role,
        predictor_cls=Predictor,
        name=endpoint_name,
        env=_MODEL_CONFIG_[model_id]["env"],
    )
    model_predictor_inference = model_inference.deploy(
        initial_instance_count=1,
        instance_type=inference_instance_type,
        predictor_cls=Predictor,
        endpoint_name=endpoint_name,
        # offload_folder=offload_folder,
    )
    print(f"{bold}Model {model_id} has been deployed successfully.{unbold}{newline}")
    _MODEL_CONFIG_[model_id]["endpoint_name"] = endpoint_name
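(For completeness: the deployment loop above relies on imports and variables defined in earlier cells of the notebook. My setup for those is roughly the sketch below; model_version = "*" just means "latest", and aws_role is the notebook's execution role.)

# Setup assumed by the cells above (sketch of the earlier notebook cells)
import json
import boto3
import sagemaker
from sagemaker import image_uris, model_uris
from sagemaker.model import Model
from sagemaker.predictor import Predictor
from sagemaker.utils import name_from_base

aws_role = sagemaker.get_execution_role()  # IAM role the endpoints run under
model_version = "*"  # latest available version of each JumpStart model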
The models were deployed successfully; however, invoking the endpoints to run inference fails. Here is the code that invokes the endpoints, together with the payload:
question = "Which instances can I use with Managed Spot Training in SageMaker?"

payload = {
    "text_inputs": question,
    "max_length": 100,
    "num_return_sequences": 1,
    "top_k": 50,
    "top_p": 0.95,
    "do_sample": True,
}

list_of_LLMs = list(_MODEL_CONFIG_.keys())
list_of_LLMs.remove("huggingface-textembedding-gpt-j-6b")  # remove the embedding model

offload_folder = "s3://ragbucketfo/offload/"

for model_id in list_of_LLMs:
    endpoint_name = _MODEL_CONFIG_[model_id]["endpoint_name"]
    query_response = query_endpoint_with_json_payload(
        json.dumps(payload).encode("utf-8"), endpoint_name=endpoint_name, offload_folder=offload_folder
    )
    generated_texts = _MODEL_CONFIG_[model_id]["parse_function"](query_response)
    print(f"For model: {model_id}, the generated output is: {generated_texts[0]}\n")
The error I get back is:
---------------------------------------------------------------------------
ModelError Traceback (most recent call last)
Cell In[55], line 16
14 for model_id in list_of_LLMs:
15 endpoint_name = _MODEL_CONFIG_[model_id]["endpoint_name"]
---> 16 query_response = query_endpoint_with_json_payload(
17 json.dumps(payload).encode("utf-8"), endpoint_name=endpoint_name, offload_folder=offload_folder
18 )
19 generated_texts = _MODEL_CONFIG_[model_id]["parse_function"](query_response)
20 print(f"For model: {model_id}, the generated output is: {generated_texts[0]}\n")
Cell In[51], line 4, in query_endpoint_with_json_payload(encoded_json, endpoint_name, content_type, offload_folder)
2 client = boto3.client("runtime.sagemaker")
3 if offload_folder is not None:
----> 4 response = client.invoke_endpoint(
5 EndpointName=endpoint_name, ContentType=content_type, Body=encoded_json, CustomAttributes=f'{{"offload_folder":"{offload_folder}"}}'
6 )
7 else:
8 response = client.invoke_endpoint(
9 EndpointName=endpoint_name,
10 ContentType=content_type,
11 Body=encoded_json
12 )
File ~/anaconda3/envs/python3/lib/python3.10/site-packages/botocore/client.py:530, in ClientCreator._create_api_method.<locals>._api_call(self, *args, **kwargs)
526 raise TypeError(
527 f"{py_operation_name}() only accepts keyword arguments."
528 )
529 # The "self" in this scope is referring to the BaseClient.
--> 530 return self._make_api_call(operation_name, kwargs)
File ~/anaconda3/envs/python3/lib/python3.10/site-packages/botocore/client.py:964, in BaseClient._make_api_call(self, operation_name, api_params)
962 error_code = parsed_response.get("Error", {}).get("Code")
963 error_class = self.exceptions.from_code(error_code)
--> 964 raise error_class(parsed_response, operation_name)
965 else:
966 return parsed_response
ModelError: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (400) from primary with message "{
"code": 400,
"type": "InternalServerException",
"message": "At least one of the model submodule will be offloaded to disk, please pass along an `offload_folder`."
}
". See https://us-west-2.console.aws.amazon.com/cloudwatch/home?region=us-west-2#logEventViewer:group=/aws/sagemaker/Endpoints/jumpstart-example-raglc-huggingface-tex-2023-07-23-04-36-50-245 in account 367332732143 for more information.
I have tried different instance types, including ml.g5.4xlarge, but on that instance type I could only deploy one endpoint (so I couldn't deploy both models). Even when I deploy only the huggingface-text2text-flan-t5-xxl model on ml.g5.4xlarge, I run into the CUDA issue below:
---------------------------------------------------------------------------
ModelError Traceback (most recent call last)
/tmp/ipykernel_12905/510372874.py in <cell line: 14>()
14 for model_id in list_of_LLMs:
15 endpoint_name = _MODEL_CONFIG_[model_id]["endpoint_name"]
---> 16 query_response = query_endpoint_with_json_payload(
17 json.dumps(payload).encode("utf-8"), endpoint_name=endpoint_name
18 )
/tmp/ipykernel_12905/111908580.py in query_endpoint_with_json_payload(encoded_json, endpoint_name, content_type)
1 def query_endpoint_with_json_payload(encoded_json, endpoint_name, content_type="application/json"):
2 client = boto3.client("runtime.sagemaker")
----> 3 response = client.invoke_endpoint(
4 EndpointName=endpoint_name, ContentType=content_type, Body=encoded_json
5 )
~/anaconda3/envs/mxnet_p38/lib/python3.8/site-packages/botocore/client.py in _api_call(self, *args, **kwargs)
528 )
529 # The "self" in this scope is referring to the BaseClient.
--> 530 return self._make_api_call(operation_name, kwargs)
531
532 _api_call.__name__ = str(py_operation_name)
~/anaconda3/envs/mxnet_p38/lib/python3.8/site-packages/botocore/client.py in _make_api_call(self, operation_name, api_params)
962 error_code = parsed_response.get("Error", {}).get("Code")
963 error_class = self.exceptions.from_code(error_code)
--> 964 raise error_class(parsed_response, operation_name)
965 else:
966 return parsed_response
ModelError: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (400) from primary with message "{
"code": 400,
"type": "InternalServerException",
"message": "Attempting to deserialize object on CUDA device 1 but torch.cuda.device_count() is 1. Please use torch.load with map_location to map your storages to an existing device."
}
". See https://us-west-2.console.aws.amazon.com/cloudwatch/home?region=us-west-2#logEventViewer:group=/aws/sagemaker/Endpoints/jumpstart-example-raglc-huggingface-tex-2023-07-23-02-44-12-269 in account 367332732143 for more information.```
I would really appreciate your guidance. Once again, here is the link to the AWS tutorial notebook: https://github.com/aws/amazon-sagemaker-examples/blob/main/introduction_to_amazon_algorithms/jumpstart-foundation-models/question_answering_retrieval_augmented_generation/question_answering_langchain_jumpstart.ipynb