mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-10-04 12:07:34 +00:00
docs: add VLM NIM example (#3277)
Some checks failed
Integration Auth Tests / test-matrix (oauth2_token) (push) Failing after 1s
Integration Tests (Replay) / Integration Tests (, , , client=, vision=) (push) Failing after 1s
Vector IO Integration Tests / test-matrix (push) Failing after 1s
SqlStore Integration Tests / test-postgres (3.13) (push) Failing after 2s
Pre-commit / pre-commit (push) Failing after 0s
Test Llama Stack Build / build-single-provider (push) Failing after 1s
Test Llama Stack Build / build-custom-container-distribution (push) Failing after 0s
Test Llama Stack Build / generate-matrix (push) Failing after 1s
Test Llama Stack Build / build-ubi9-container-distribution (push) Failing after 1s
Test Llama Stack Build / build (push) Has been skipped
Test External Providers Installed via Module / test-external-providers-from-module (venv) (push) Has been skipped
Python Package Build Test / build (3.12) (push) Failing after 1s
Python Package Build Test / build (3.13) (push) Failing after 1s
SqlStore Integration Tests / test-postgres (3.12) (push) Failing after 5s
Test External API and Providers / test-external (venv) (push) Failing after 1s
UI Tests / ui-tests (22) (push) Failing after 0s
Unit Tests / unit-tests (3.12) (push) Failing after 1s
Unit Tests / unit-tests (3.13) (push) Failing after 0s
Update ReadTheDocs / update-readthedocs (push) Failing after 1s
This commit is contained in:
parent 3370d8e557
commit b12cd528ef

4 changed files with 64 additions and 6 deletions
@@ -50,6 +50,7 @@ The following models are available by default:
 - `meta/llama-3.2-11b-vision-instruct`
 - `meta/llama-3.2-90b-vision-instruct`
 - `meta/llama-3.3-70b-instruct`
+- `nvidia/vila`
 - `nvidia/llama-3.2-nv-embedqa-1b-v2`
 - `nvidia/nv-embedqa-e5-v5`
 - `nvidia/nv-embedqa-mistral-7b-v2`
@@ -134,6 +134,11 @@ models:
   provider_id: nvidia
   provider_model_id: meta/llama-3.3-70b-instruct
   model_type: llm
+- metadata: {}
+  model_id: nvidia/vila
+  provider_id: nvidia
+  provider_model_id: nvidia/vila
+  model_type: llm
 - metadata:
     embedding_dimension: 2048
     context_length: 8192
@@ -41,10 +41,10 @@ client.initialize()

 ### Create Completion

-> Note on Completion API
->
-> The hosted NVIDIA Llama NIMs (e.g., `meta-llama/Llama-3.1-8B-Instruct`) with `NVIDIA_BASE_URL="https://integrate.api.nvidia.com"` does not support the `completion` method, while the locally deployed NIM does.
+The following example shows how to create a completion for an NVIDIA NIM.
+
+> [!NOTE]
+> The hosted NVIDIA Llama NIMs (for example `meta-llama/Llama-3.1-8B-Instruct`) that have `NVIDIA_BASE_URL="https://integrate.api.nvidia.com"` do not support the `completion` method, while locally deployed NIMs do.

 ```python
 response = client.inference.completion(
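The hunk above cuts the completion example off at the opening of the call. For reference, a minimal sketch of the full call is shown below, assuming `client` is the initialized `LlamaStackClient` from earlier in the doc and a locally deployed NIM; the prompt and `sampling_params` values are illustrative, not taken from the commit:

```python
# Sketch of a full completion call against a locally deployed NIM.
# The prompt and sampling_params values are illustrative.
response = client.inference.completion(
    model_id="meta-llama/Llama-3.1-8B-Instruct",
    content="Complete the sentence using one word: Roses are red, violets are ",
    stream=False,
    sampling_params={
        "max_tokens": 50,
    },
)
print(f"Response: {response.content}")
```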
@@ -60,6 +60,8 @@ print(f"Response: {response.content}")

 ### Create Chat Completion

+The following example shows how to create a chat completion for an NVIDIA NIM.
+
 ```python
 response = client.inference.chat_completion(
     model_id="meta-llama/Llama-3.1-8B-Instruct",
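Likewise, the chat completion example is truncated after `model_id`. A plausible completion of it, again assuming the initialized `client`; the messages are illustrative:

```python
# Sketch of the full chat_completion call; the messages are illustrative.
response = client.inference.chat_completion(
    model_id="meta-llama/Llama-3.1-8B-Instruct",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Write a one-sentence description of an NVIDIA NIM."},
    ],
    stream=False,
)
print(f"Response: {response.completion_message.content}")
```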
@@ -82,6 +84,9 @@ print(f"Response: {response.completion_message.content}")
 ```

 ### Tool Calling Example ###
+
+The following example shows how to do tool calling for an NVIDIA NIM.
+
 ```python
 from llama_stack.models.llama.datatypes import ToolDefinition, ToolParamDefinition

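The tool calling example only shows its import line in this hunk. A sketch of how it plausibly continues, using a hypothetical `get_weather` tool; the tool name and parameters are illustrative, and the closing `tool_calls` check is taken from the next hunk's header:

```python
from llama_stack.models.llama.datatypes import ToolDefinition, ToolParamDefinition

# Hypothetical weather tool; the name and parameters are illustrative.
tool_definition = ToolDefinition(
    tool_name="get_weather",
    description="Get current weather information for a location",
    parameters={
        "location": ToolParamDefinition(
            param_type="string",
            description="The city and state, e.g. San Francisco, CA",
            required=True,
        ),
    },
)

tool_response = client.inference.chat_completion(
    model_id="meta-llama/Llama-3.1-8B-Instruct",
    messages=[{"role": "user", "content": "What is the weather in San Francisco?"}],
    tools=[tool_definition],
)

# Matches the check visible in the next hunk's context line.
if tool_response.completion_message.tool_calls:
    for tool_call in tool_response.completion_message.tool_calls:
        print(f"Tool called: {tool_call.tool_name}")
        print(f"Arguments: {tool_call.arguments}")
```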
@@ -117,6 +122,9 @@ if tool_response.completion_message.tool_calls:
 ```

 ### Structured Output Example
+
+The following example shows how to do structured output for an NVIDIA NIM.
+
 ```python
 from llama_stack.apis.inference import JsonSchemaResponseFormat, ResponseFormatType

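For structured output, only the import is visible in this hunk, and the closing `print` appears in the next hunk's header. A sketch of the middle, with an illustrative JSON schema:

```python
from llama_stack.apis.inference import JsonSchemaResponseFormat, ResponseFormatType

# Illustrative schema; only the import above appears in the diff.
person_schema = {
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "age": {"type": "integer"},
    },
    "required": ["name", "age"],
}

structured_response = client.inference.chat_completion(
    model_id="meta-llama/Llama-3.1-8B-Instruct",
    messages=[
        {"role": "user", "content": "Create a profile for a fictional person named Alice, age 30."}
    ],
    response_format=JsonSchemaResponseFormat(
        type=ResponseFormatType.json_schema,
        json_schema=person_schema,
    ),
)
print(f"Structured Response: {structured_response.completion_message.content}")
```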
@@ -149,8 +157,10 @@ print(f"Structured Response: {structured_response.completion_message.content}")
 ```

 ### Create Embeddings

-> Note on OpenAI embeddings compatibility
->
+The following example shows how to create embeddings for an NVIDIA NIM.
+
+> [!NOTE]
 > NVIDIA asymmetric embedding models (e.g., `nvidia/llama-3.2-nv-embedqa-1b-v2`) require an `input_type` parameter not present in the standard OpenAI embeddings API. The NVIDIA Inference Adapter automatically sets `input_type="query"` when using the OpenAI-compatible embeddings endpoint for NVIDIA. For passage embeddings, use the `embeddings` API with `task_type="document"`.

 ```python
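The embeddings call itself opens in the next hunk's header (`response = client.inference.embeddings(`) and closes with the `)` and `print` shown there. Filling in the middle as a sketch, with an illustrative input and the `task_type` behavior described in the note above:

```python
# Sketch of the embeddings call; the contents value is illustrative.
# task_type="query" is for queries; use task_type="document" for
# passage embeddings, per the note above.
response = client.inference.embeddings(
    model_id="nvidia/llama-3.2-nv-embedqa-1b-v2",
    contents=["What is the capital of France?"],
    task_type="query",
)
print(f"Embeddings: {response.embeddings}")
```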
@@ -161,3 +171,41 @@ response = client.inference.embeddings(
 )
 print(f"Embeddings: {response.embeddings}")
 ```
+
+### Vision Language Models Example
+
+The following example shows how to run vision inference by using an NVIDIA NIM.
+
+```python
+import base64  # needed by load_image_as_base64 below
+
+
+def load_image_as_base64(image_path):
+    with open(image_path, "rb") as image_file:
+        img_bytes = image_file.read()
+    return base64.b64encode(img_bytes).decode("utf-8")
+
+
+image_path = "{path_to_the_image}"  # replace with the path to your image
+demo_image_b64 = load_image_as_base64(image_path)
+
+vlm_response = client.inference.chat_completion(
+    model_id="nvidia/vila",
+    messages=[
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image",
+                    "image": {
+                        "data": demo_image_b64,
+                    },
+                },
+                {
+                    "type": "text",
+                    "text": "Please describe what you see in this image in detail.",
+                },
+            ],
+        }
+    ],
+)
+
+print(f"VLM Response: {vlm_response.completion_message.content}")
+```
@@ -55,6 +55,10 @@ MODEL_ENTRIES = [
         "meta/llama-3.3-70b-instruct",
         CoreModelId.llama3_3_70b_instruct.value,
     ),
+    ProviderModelEntry(
+        provider_model_id="nvidia/vila",
+        model_type=ModelType.llm,
+    ),
     # NeMo Retriever Text Embedding models -
     #
     # https://docs.nvidia.com/nim/nemo-retriever/text-embedding/latest/support-matrix.html