diff --git a/llama_stack/providers/remote/inference/nvidia/NVIDIA.md b/llama_stack/providers/remote/inference/nvidia/NVIDIA.md
index 096ff28ac..692b9125b 100644
--- a/llama_stack/providers/remote/inference/nvidia/NVIDIA.md
+++ b/llama_stack/providers/remote/inference/nvidia/NVIDIA.md
@@ -45,7 +45,7 @@ The following example shows how to create a chat completion for an NVIDIA NIM.

 ```python
 response = client.chat.completions.create(
-    model="meta-llama/Llama-3.1-8B-Instruct",
+    model="nvidia/meta/llama-3.1-8b-instruct",
     messages=[
         {
             "role": "system",
@@ -67,37 +67,40 @@ print(f"Response: {response.choices[0].message.content}")

 The following example shows how to do tool calling for an NVIDIA NIM.

 ```python
-from llama_stack.models.llama.datatypes import ToolDefinition, ToolParamDefinition
-
-tool_definition = ToolDefinition(
-    tool_name="get_weather",
-    description="Get current weather information for a location",
-    parameters={
-        "location": ToolParamDefinition(
-            param_type="string",
-            description="The city and state, e.g. San Francisco, CA",
-            required=True,
-        ),
-        "unit": ToolParamDefinition(
-            param_type="string",
-            description="Temperature unit (celsius or fahrenheit)",
-            required=False,
-            default="celsius",
-        ),
+tool_definition = {
+    "type": "function",
+    "function": {
+        "name": "get_weather",
+        "description": "Get current weather information for a location",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "location": {
+                    "type": "string",
+                    "description": "The city and state, e.g. San Francisco, CA",
+                },
+                "unit": {
+                    "type": "string",
+                    "description": "Temperature unit (celsius or fahrenheit)",
+                    "default": "celsius",
+                },
+            },
+            "required": ["location"],
+        },
     },
-)
+}

 tool_response = client.chat.completions.create(
-    model="meta-llama/Llama-3.1-8B-Instruct",
+    model="nvidia/meta/llama-3.1-8b-instruct",
     messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
     tools=[tool_definition],
 )

-print(f"Tool Response: {tool_response.choices[0].message.content}")
+print(f"Response content: {tool_response.choices[0].message.content}")

 if tool_response.choices[0].message.tool_calls:
     for tool_call in tool_response.choices[0].message.tool_calls:
-        print(f"Tool Called: {tool_call.tool_name}")
-        print(f"Arguments: {tool_call.arguments}")
+        print(f"Tool Called: {tool_call.function.name}")
+        print(f"Arguments: {tool_call.function.arguments}")
 ```

 ### Structured Output Example

@@ -105,33 +108,26 @@ if tool_response.choices[0].message.tool_calls:
 The following example shows how to do structured output for an NVIDIA NIM.

 ```python
-from llama_stack.apis.inference import JsonSchemaResponseFormat, ResponseFormatType
-
 person_schema = {
     "type": "object",
     "properties": {
         "name": {"type": "string"},
-        "age": {"type": "integer"},
+        "age": {"type": "number"},
         "occupation": {"type": "string"},
     },
     "required": ["name", "age", "occupation"],
 }

-response_format = JsonSchemaResponseFormat(
-    type=ResponseFormatType.json_schema, json_schema=person_schema
-)
-
 structured_response = client.chat.completions.create(
-    model="meta-llama/Llama-3.1-8B-Instruct",
+    model="nvidia/meta/llama-3.1-8b-instruct",
     messages=[
         {
             "role": "user",
             "content": "Create a profile for a fictional person named Alice who is 30 years old and is a software engineer. ",
         }
     ],
-    response_format=response_format,
+    extra_body={"nvext": {"guided_json": person_schema}},
 )
-
 print(f"Structured Response: {structured_response.choices[0].message.content}")
 ```
@@ -141,7 +137,7 @@ The following example shows how to create embeddings for an NVIDIA NIM.

 ```python
 response = client.embeddings.create(
-    model="nvidia/llama-3.2-nv-embedqa-1b-v2",
+    model="nvidia/nvidia/llama-3.2-nv-embedqa-1b-v2",
     input=["What is the capital of France?"],
     extra_body={"input_type": "query"},
 )
@@ -163,15 +159,15 @@ image_path = {path_to_the_image}
 demo_image_b64 = load_image_as_base64(image_path)

 vlm_response = client.chat.completions.create(
-    model="nvidia/vila",
+    model="nvidia/meta/llama-3.2-11b-vision-instruct",
     messages=[
         {
             "role": "user",
             "content": [
                 {
-                    "type": "image",
-                    "image": {
-                        "data": demo_image_b64,
+                    "type": "image_url",
+                    "image_url": {
+                        "url": f"data:image/png;base64,{demo_image_b64}",
                     },
                 },
                 {
diff --git a/llama_stack/providers/remote/inference/nvidia/nvidia.py b/llama_stack/providers/remote/inference/nvidia/nvidia.py
index eab665d63..5aba6bddc 100644
--- a/llama_stack/providers/remote/inference/nvidia/nvidia.py
+++ b/llama_stack/providers/remote/inference/nvidia/nvidia.py
@@ -19,15 +19,6 @@ class NVIDIAInferenceAdapter(OpenAIMixin):
     """
     NVIDIA Inference Adapter for Llama Stack.
-
-    Note: The inheritance order is important here. OpenAIMixin must come before
-    ModelRegistryHelper to ensure that OpenAIMixin.check_model_availability()
-    is used instead of ModelRegistryHelper.check_model_availability(). It also
-    must come before Inference to ensure that OpenAIMixin methods are available
-    in the Inference interface.
-
-    - OpenAIMixin.check_model_availability() queries the NVIDIA API to check if a model exists
-    - ModelRegistryHelper.check_model_availability() just returns False and shows a warning
     """

     # source: https://docs.nvidia.com/nim/nemo-retriever/text-embedding/latest/support-matrix.html