Merge origin/main into add-missing-provider-data-impls

Resolved conflicts in:
- benchmarking/k8s-benchmark/stack_run_config.yaml (accepted new storage schema)
- llama_stack/providers/remote/inference/cerebras/cerebras.py (kept provider data support)
- llama_stack/providers/remote/inference/cerebras/config.py (kept provider data support)
- llama_stack/providers/remote/inference/nvidia/config.py (kept provider data support)
- llama_stack/providers/remote/inference/runpod/config.py (merged imports)
- pyproject.toml (kept databricks-sdk dependency)
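
The provider data support kept in the Cerebras and NVIDIA changes above lets callers pass provider credentials per request instead of baking them into the stack config. A minimal sketch of what that looks like from a client, assuming the `X-LlamaStack-Provider-Data` header mechanism, a stack server on the default port 8321, and a provider-data field named `nvidia_api_key` (all assumptions, not taken from this diff):

```python
# Illustrative only, not part of this commit. Check the provider's
# ProviderDataValidator for the exact field name; "nvidia_api_key" is assumed.
import json

from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8321/v1/openai/v1",  # assumed OpenAI-compatible route
    api_key="not-used",
    default_headers={
        # Per-request credential passed as provider data instead of static config
        "X-LlamaStack-Provider-Data": json.dumps({"nvidia_api_key": "nvapi-..."}),
    },
)

response = client.chat.completions.create(
    model="nvidia/meta/llama-3.1-8b-instruct",
    messages=[{"role": "user", "content": "Hello!"}],
)
print(response.choices[0].message.content)
```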
Ashwin Bharambe 2025-10-27 11:39:00 -07:00
commit 9eb9a37ee4
1880 changed files with 804868 additions and 70533 deletions

View file

@@ -18,7 +18,7 @@ This provider enables running inference using NVIDIA NIM.
Build the NVIDIA environment:

```bash
-llama stack build --distro nvidia --image-type venv
+uv run llama stack list-deps nvidia | xargs -L1 uv pip install
```
### Basic Usage using the LlamaStack Python Client
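
The examples below assume a `client` has already been created; the setup lines fall outside the hunks shown here. A minimal sketch, assuming the `llama-stack-client` package and a stack server on the default local port 8321 (both assumptions):

```python
# Hypothetical setup for the examples below; the actual docs page creates
# `client` in lines not shown in this diff.
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")
```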
@@ -45,7 +45,7 @@ The following example shows how to create a chat completion for an NVIDIA NIM.
```python
response = client.chat.completions.create(
-    model="meta-llama/Llama-3.1-8B-Instruct",
+    model="nvidia/meta/llama-3.1-8b-instruct",
    messages=[
        {
            "role": "system",
@@ -67,37 +67,40 @@ print(f"Response: {response.choices[0].message.content}")
The following example shows how to do tool calling for an NVIDIA NIM.

```python
-from llama_stack.models.llama.datatypes import ToolDefinition, ToolParamDefinition
-
-tool_definition = ToolDefinition(
-    tool_name="get_weather",
-    description="Get current weather information for a location",
-    parameters={
-        "location": ToolParamDefinition(
-            param_type="string",
-            description="The city and state, e.g. San Francisco, CA",
-            required=True,
-        ),
-        "unit": ToolParamDefinition(
-            param_type="string",
-            description="Temperature unit (celsius or fahrenheit)",
-            required=False,
-            default="celsius",
-        ),
-    },
-)
+tool_definition = {
+    "type": "function",
+    "function": {
+        "name": "get_weather",
+        "description": "Get current weather information for a location",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "location": {
+                    "type": "string",
+                    "description": "The city and state, e.g. San Francisco, CA",
+                },
+                "unit": {
+                    "type": "string",
+                    "description": "Temperature unit (celsius or fahrenheit)",
+                    "default": "celsius",
+                },
+            },
+            "required": ["location"],
+        },
+    },
+}

tool_response = client.chat.completions.create(
-    model="meta-llama/Llama-3.1-8B-Instruct",
+    model="nvidia/meta/llama-3.1-8b-instruct",
    messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
    tools=[tool_definition],
)

-print(f"Tool Response: {tool_response.choices[0].message.content}")
+print(f"Response content: {tool_response.choices[0].message.content}")
if tool_response.choices[0].message.tool_calls:
    for tool_call in tool_response.choices[0].message.tool_calls:
-        print(f"Tool Called: {tool_call.tool_name}")
-        print(f"Arguments: {tool_call.arguments}")
+        print(f"Tool Called: {tool_call.function.name}")
+        print(f"Arguments: {tool_call.function.arguments}")
```
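
When the model does return a tool call, a common follow-up (not shown on this page) is to run the tool and send its result back as a `tool` message. A sketch, with a hypothetical `get_weather` implementation standing in for a real lookup:

```python
import json


def get_weather(location: str, unit: str = "celsius") -> dict:
    # Hypothetical stand-in for a real weather lookup
    return {"location": location, "temperature": 22, "unit": unit}


tool_call = tool_response.choices[0].message.tool_calls[0]
args = json.loads(tool_call.function.arguments)

followup = client.chat.completions.create(
    model="nvidia/meta/llama-3.1-8b-instruct",
    messages=[
        {"role": "user", "content": "What's the weather like in San Francisco?"},
        tool_response.choices[0].message,  # assistant turn containing the tool call
        {
            "role": "tool",
            "tool_call_id": tool_call.id,
            "content": json.dumps(get_weather(**args)),
        },
    ],
    tools=[tool_definition],
)
print(followup.choices[0].message.content)
```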
### Structured Output Example
@@ -105,33 +108,26 @@ if tool_response.choices[0].message.tool_calls:
The following example shows how to do structured output for an NVIDIA NIM.

```python
-from llama_stack.apis.inference import JsonSchemaResponseFormat, ResponseFormatType
-
person_schema = {
    "type": "object",
    "properties": {
        "name": {"type": "string"},
-        "age": {"type": "integer"},
+        "age": {"type": "number"},
        "occupation": {"type": "string"},
    },
    "required": ["name", "age", "occupation"],
}

-response_format = JsonSchemaResponseFormat(
-    type=ResponseFormatType.json_schema, json_schema=person_schema
-)
-
structured_response = client.chat.completions.create(
-    model="meta-llama/Llama-3.1-8B-Instruct",
+    model="nvidia/meta/llama-3.1-8b-instruct",
    messages=[
        {
            "role": "user",
            "content": "Create a profile for a fictional person named Alice who is 30 years old and is a software engineer.",
        }
    ],
-    response_format=response_format,
+    extra_body={"nvext": {"guided_json": person_schema}},
)

print(f"Structured Response: {structured_response.choices[0].message.content}")
```
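
Because `guided_json` constrains generation to the schema, the response content should parse directly as JSON. A short follow-up, assuming the model honored the schema:

```python
import json

profile = json.loads(structured_response.choices[0].message.content)
print(profile["name"], profile["age"], profile["occupation"])
```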
@@ -139,16 +135,13 @@ print(f"Structured Response: {structured_response.choices[0].message.content}")
The following example shows how to create embeddings for an NVIDIA NIM.

> [!NOTE]
> NVIDIA asymmetric embedding models (e.g., `nvidia/llama-3.2-nv-embedqa-1b-v2`) require an `input_type` parameter not present in the standard OpenAI embeddings API. The NVIDIA Inference Adapter automatically sets `input_type="query"` when using the OpenAI-compatible embeddings endpoint for NVIDIA. For passage embeddings, use the `embeddings` API with `task_type="document"`.

```python
-response = client.inference.embeddings(
-    model_id="nvidia/llama-3.2-nv-embedqa-1b-v2",
-    contents=["What is the capital of France?"],
-    task_type="query",
+response = client.embeddings.create(
+    model="nvidia/nvidia/llama-3.2-nv-embedqa-1b-v2",
+    input=["What is the capital of France?"],
+    extra_body={"input_type": "query"},
)
-print(f"Embeddings: {response.embeddings}")
+print(f"Embeddings: {response.data}")
```
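
For passage/document embeddings, the same call can be made with a different input type. A sketch; the exact accepted value `"passage"` is an assumption based on NVIDIA's asymmetric-embedding convention, not taken from this diff:

```python
# Assumed: NVIDIA asymmetric embedding models accept input_type="passage" for documents
doc_response = client.embeddings.create(
    model="nvidia/nvidia/llama-3.2-nv-embedqa-1b-v2",
    input=["Paris is the capital of France."],
    extra_body={"input_type": "passage"},
)
print(doc_response.data[0].embedding[:8])
```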
### Vision Language Models Example
@@ -166,15 +159,15 @@ image_path = {path_to_the_image}
demo_image_b64 = load_image_as_base64(image_path)

vlm_response = client.chat.completions.create(
-    model="nvidia/vila",
+    model="nvidia/meta/llama-3.2-11b-vision-instruct",
    messages=[
        {
            "role": "user",
            "content": [
                {
-                    "type": "image",
-                    "image": {
-                        "data": demo_image_b64,
+                    "type": "image_url",
+                    "image_url": {
+                        "url": f"data:image/png;base64,{demo_image_b64}",
                    },
                },
                {

View file

@@ -10,7 +10,7 @@ from .config import NVIDIAConfig

async def get_adapter_impl(config: NVIDIAConfig, _deps) -> Inference:
-    # import dynamically so `llama stack build` does not fail due to missing dependencies
+    # import dynamically so `llama stack list-deps` does not fail due to missing dependencies
    from .nvidia import NVIDIAInferenceAdapter

    if not isinstance(config, NVIDIAConfig):

View file

@@ -5,13 +5,6 @@
# the root directory of this source tree.

-from openai import NOT_GIVEN
-
-from llama_stack.apis.inference import (
-    OpenAIEmbeddingData,
-    OpenAIEmbeddingsResponse,
-    OpenAIEmbeddingUsage,
-)
from llama_stack.log import get_logger
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
@@ -28,15 +21,6 @@ class NVIDIAInferenceAdapter(OpenAIMixin):
    """
    NVIDIA Inference Adapter for Llama Stack.
-
-    Note: The inheritance order is important here. OpenAIMixin must come before
-    ModelRegistryHelper to ensure that OpenAIMixin.check_model_availability()
-    is used instead of ModelRegistryHelper.check_model_availability(). It also
-    must come before Inference to ensure that OpenAIMixin methods are available
-    in the Inference interface.
-
-    - OpenAIMixin.check_model_availability() queries the NVIDIA API to check if a model exists
-    - ModelRegistryHelper.check_model_availability() just returns False and shows a warning
    """

    # source: https://docs.nvidia.com/nim/nemo-retriever/text-embedding/latest/support-matrix.html
@@ -51,7 +35,7 @@ class NVIDIAInferenceAdapter(OpenAIMixin):
        logger.info(f"Initializing NVIDIAInferenceAdapter({self.config.url})...")

        if _is_nvidia_hosted(self.config):
-            if not self.config.api_key:
+            if not self.config.auth_credential:
                raise RuntimeError(
                    "API key is required for hosted NVIDIA NIM. Either provide an API key or use a self-hosted NIM."
                )
@@ -62,7 +46,13 @@ class NVIDIAInferenceAdapter(OpenAIMixin):

        :return: The NVIDIA API key
        """
-        return self.config.api_key.get_secret_value() if self.config.api_key else "NO KEY"
+        if self.config.auth_credential:
+            return self.config.auth_credential.get_secret_value()
+
+        if not _is_nvidia_hosted(self.config):
+            return "NO KEY REQUIRED"
+
+        return None

    def get_base_url(self) -> str:
        """
@@ -71,54 +61,3 @@ class NVIDIAInferenceAdapter(OpenAIMixin):

        :return: The NVIDIA API base URL
        """
        return f"{self.config.url}/v1" if self.config.append_api_version else self.config.url
-
-    async def openai_embeddings(
-        self,
-        model: str,
-        input: str | list[str],
-        encoding_format: str | None = "float",
-        dimensions: int | None = None,
-        user: str | None = None,
-    ) -> OpenAIEmbeddingsResponse:
-        """
-        OpenAI-compatible embeddings for NVIDIA NIM.
-
-        Note: NVIDIA NIM asymmetric embedding models require an "input_type" field not present in the standard OpenAI embeddings API.
-        We default this to "query" to ensure requests succeed when using the
-        OpenAI-compatible endpoint. For passage embeddings, use the embeddings API with
-        `task_type='document'`.
-        """
-        extra_body: dict[str, object] = {"input_type": "query"}
-        logger.warning(
-            "NVIDIA OpenAI-compatible embeddings: defaulting to input_type='query'. "
-            "For passage embeddings, use the embeddings API with task_type='document'."
-        )
-        response = await self.client.embeddings.create(
-            model=await self._get_provider_model_id(model),
-            input=input,
-            encoding_format=encoding_format if encoding_format is not None else NOT_GIVEN,
-            dimensions=dimensions if dimensions is not None else NOT_GIVEN,
-            user=user if user is not None else NOT_GIVEN,
-            extra_body=extra_body,
-        )
-
-        data = []
-        for i, embedding_data in enumerate(response.data):
-            data.append(
-                OpenAIEmbeddingData(
-                    embedding=embedding_data.embedding,
-                    index=i,
-                )
-            )
-
-        usage = OpenAIEmbeddingUsage(
-            prompt_tokens=response.usage.prompt_tokens,
-            total_tokens=response.usage.total_tokens,
-        )
-
-        return OpenAIEmbeddingsResponse(
-            data=data,
-            model=response.model,
-            usage=usage,
-        )