Mirror of https://github.com/meta-llama/llama-stack.git, synced 2025-12-06 10:37:22 +00:00
docs: Documentation update for NVIDIA Inference Provider (#3840)
# What does this PR do?

- Fix examples in the NVIDIA inference documentation to align with current API requirements.

## Test Plan

N/A
This commit is contained in:
parent
f675fdda0f
commit
165b8b07f4
2 changed files with 34 additions and 47 deletions
The first changed file is the NVIDIA inference provider documentation; its examples move from Llama Stack-specific types to OpenAI-compatible request shapes.

````diff
@@ -45,7 +45,7 @@ The following example shows how to create a chat completion for an NVIDIA NIM.
 
 ```python
 response = client.chat.completions.create(
-    model="meta-llama/Llama-3.1-8B-Instruct",
+    model="nvidia/meta/llama-3.1-8b-instruct",
     messages=[
         {
             "role": "system",
````
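For anyone re-running these snippets: the hunks never show how `client` is constructed. A minimal sketch, assuming an OpenAI-compatible client pointed at a Llama Stack server; the base URL and key below are placeholder assumptions, not part of this diff:

```python
# Sketch only: base_url and api_key are assumed values for a local stack.
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8321/v1",  # hypothetical local Llama Stack endpoint
    api_key="not-needed",  # placeholder; local deployments often ignore the key
)
```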
````diff
@@ -67,37 +67,40 @@ print(f"Response: {response.choices[0].message.content}")
 
 The following example shows how to do tool calling for an NVIDIA NIM.
 
 ```python
-from llama_stack.models.llama.datatypes import ToolDefinition, ToolParamDefinition
-
-tool_definition = ToolDefinition(
-    tool_name="get_weather",
-    description="Get current weather information for a location",
-    parameters={
-        "location": ToolParamDefinition(
-            param_type="string",
-            description="The city and state, e.g. San Francisco, CA",
-            required=True,
-        ),
-        "unit": ToolParamDefinition(
-            param_type="string",
-            description="Temperature unit (celsius or fahrenheit)",
-            required=False,
-            default="celsius",
-        ),
-    },
-)
+tool_definition = {
+    "type": "function",
+    "function": {
+        "name": "get_weather",
+        "description": "Get current weather information for a location",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "location": {
+                    "type": "string",
+                    "description": "The city and state, e.g. San Francisco, CA",
+                },
+                "unit": {
+                    "type": "string",
+                    "description": "Temperature unit (celsius or fahrenheit)",
+                    "default": "celsius",
+                },
+            },
+            "required": ["location"],
+        },
+    },
+}
 
 tool_response = client.chat.completions.create(
-    model="meta-llama/Llama-3.1-8B-Instruct",
+    model="nvidia/meta/llama-3.1-8b-instruct",
     messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
     tools=[tool_definition],
 )
 
-print(f"Tool Response: {tool_response.choices[0].message.content}")
+print(f"Response content: {tool_response.choices[0].message.content}")
 if tool_response.choices[0].message.tool_calls:
     for tool_call in tool_response.choices[0].message.tool_calls:
-        print(f"Tool Called: {tool_call.tool_name}")
-        print(f"Arguments: {tool_call.arguments}")
+        print(f"Tool Called: {tool_call.function.name}")
+        print(f"Arguments: {tool_call.function.arguments}")
 ```
 
 ### Structured Output Example
 
````
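Because the example now uses the OpenAI-style tool shape, `tool_call.function.arguments` is a JSON-encoded string rather than a dict. A short sketch of dispatching the returned call; the local `get_weather` implementation is hypothetical:

```python
import json


def get_weather(location: str, unit: str = "celsius") -> str:
    # Hypothetical stand-in for a real weather lookup.
    return f"20 degrees {unit} in {location}"


for tool_call in tool_response.choices[0].message.tool_calls or []:
    args = json.loads(tool_call.function.arguments)  # arguments arrive as a JSON string
    if tool_call.function.name == "get_weather":
        print(get_weather(**args))
```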
````diff
@@ -105,33 +108,26 @@ if tool_response.choices[0].message.tool_calls:
 
 The following example shows how to do structured output for an NVIDIA NIM.
 
 ```python
-from llama_stack.apis.inference import JsonSchemaResponseFormat, ResponseFormatType
-
 person_schema = {
     "type": "object",
     "properties": {
         "name": {"type": "string"},
-        "age": {"type": "integer"},
+        "age": {"type": "number"},
         "occupation": {"type": "string"},
     },
     "required": ["name", "age", "occupation"],
 }
 
-response_format = JsonSchemaResponseFormat(
-    type=ResponseFormatType.json_schema, json_schema=person_schema
-)
-
 structured_response = client.chat.completions.create(
-    model="meta-llama/Llama-3.1-8B-Instruct",
+    model="nvidia/meta/llama-3.1-8b-instruct",
     messages=[
         {
             "role": "user",
             "content": "Create a profile for a fictional person named Alice who is 30 years old and is a software engineer. ",
         }
     ],
-    response_format=response_format,
+    extra_body={"nvext": {"guided_json": person_schema}},
 )
 
 print(f"Structured Response: {structured_response.choices[0].message.content}")
 ```
 
````
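With `guided_json`, the response content should be a JSON document conforming to `person_schema`, so it can be parsed directly. A minimal sketch:

```python
import json

profile = json.loads(structured_response.choices[0].message.content)
# guided_json should guarantee the required keys are present.
assert all(key in profile for key in person_schema["required"])
print(profile["name"], profile["age"], profile["occupation"])
```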
````diff
@@ -141,7 +137,7 @@ The following example shows how to create embeddings for an NVIDIA NIM.
 
 ```python
 response = client.embeddings.create(
-    model="nvidia/llama-3.2-nv-embedqa-1b-v2",
+    model="nvidia/nvidia/llama-3.2-nv-embedqa-1b-v2",
     input=["What is the capital of France?"],
     extra_body={"input_type": "query"},
 )
````
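The vector itself is in `response.data[0].embedding` (OpenAI embeddings shape). A usage sketch: embedding a document with `input_type="passage"` mirrors the retrieval convention implied by the `"query"` value above, and the cosine helper is illustrative only.

```python
import math

query_vec = response.data[0].embedding

# Retrieval convention: documents are embedded as "passage", queries as "query".
passage = client.embeddings.create(
    model="nvidia/nvidia/llama-3.2-nv-embedqa-1b-v2",
    input=["Paris is the capital of France."],
    extra_body={"input_type": "passage"},
)
passage_vec = passage.data[0].embedding


def cosine(a, b):
    # Plain-Python cosine similarity, for illustration only.
    dot = sum(x * y for x, y in zip(a, b))
    return dot / (math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(y * y for y in b)))


print(f"similarity: {cosine(query_vec, passage_vec):.3f}")
```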
````diff
@@ -163,15 +159,15 @@ image_path = {path_to_the_image}
 demo_image_b64 = load_image_as_base64(image_path)
 
 vlm_response = client.chat.completions.create(
-    model="nvidia/vila",
+    model="nvidia/meta/llama-3.2-11b-vision-instruct",
     messages=[
         {
             "role": "user",
             "content": [
                 {
-                    "type": "image",
-                    "image": {
-                        "data": demo_image_b64,
+                    "type": "image_url",
+                    "image_url": {
+                        "url": f"data:image/png;base64,{demo_image_b64}",
                     },
                 },
                 {
````
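The hunk calls `load_image_as_base64` without showing its body in the excerpt. A minimal sketch of what such a helper looks like; the actual doc version may differ:

```python
import base64


def load_image_as_base64(path: str) -> str:
    # Read the image bytes and base64-encode them for the data: URL above.
    with open(path, "rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")
```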
The second changed file is the `NVIDIAInferenceAdapter` source, where the docstring note about base-class ordering is removed:

````diff
@@ -19,15 +19,6 @@ class NVIDIAInferenceAdapter(OpenAIMixin):
 
     """
     NVIDIA Inference Adapter for Llama Stack.
-
-    Note: The inheritance order is important here. OpenAIMixin must come before
-    ModelRegistryHelper to ensure that OpenAIMixin.check_model_availability()
-    is used instead of ModelRegistryHelper.check_model_availability(). It also
-    must come before Inference to ensure that OpenAIMixin methods are available
-    in the Inference interface.
-
-    - OpenAIMixin.check_model_availability() queries the NVIDIA API to check if a model exists
-    - ModelRegistryHelper.check_model_availability() just returns False and shows a warning
     """
 
     # source: https://docs.nvidia.com/nim/nemo-retriever/text-embedding/latest/support-matrix.html
````
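The deleted note concerned Python's method resolution order. For reviewers who want the mechanics, a self-contained sketch; the class names are stand-ins, not the adapter's real bases:

```python
class OpenAIMixinLike:
    def check_model_availability(self) -> bool:
        return True  # stands in for "query the NVIDIA API for the model"


class ModelRegistryHelperLike:
    def check_model_availability(self) -> bool:
        return False  # stands in for "warn and return False"


class Adapter(OpenAIMixinLike, ModelRegistryHelperLike):
    pass


# Under Python's MRO, the first listed base wins:
assert Adapter().check_model_availability() is True
```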