feat: add dynamic model registration support to TGI inference (#3417)

# What does this PR do? Adds dynamic model support to TGI. Adds a new overwrite_completion_id feature to OpenAIMixin to deal with TGI always returning id="". ## Test Plan tgi: `docker run --gpus all --shm-size 1g -p 8080:80 -v /data:/data ghcr.io/huggingface/text-generation-inference --model-id Qwen/Qwen3-0.6B` stack: `TGI_URL=http://localhost:8080 uv run llama stack build --image-type venv --distro ci-tests --run` test: `./scripts/integration-tests.sh --stack-config http://localhost:8321 --setup tgi --subdirs inference --pattern openai`
2025-12-03 09:53:45 +00:00 · 2025-09-15 15:52:40 -04:00 · 2025-09-15 15:52:40 -04:00 · f4ab154ade
commit f4ab154ade
parent ab321739f2
14 changed files with 12218 additions and 20 deletions
--- a/llama_stack/providers/utils/inference/openai_mixin.py
+++ b/llama_stack/providers/utils/inference/openai_mixin.py
@ -4,6 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+import uuid
 from abc import ABC, abstractmethod
 from collections.abc import AsyncIterator
 from typing import Any
@ -43,6 +44,12 @@ class OpenAIMixin(ABC):
      The model_store is set in routing_tables/common.py during provider initialization.
    """

+    # Allow subclasses to control whether the 'id' field in OpenAI responses
+    # is overwritten with a client-side generated id.
+    #
+    # This is useful for providers that do not return a unique id in the response.
+    overwrite_completion_id: bool = False
+
    @abstractmethod
    def get_api_key(self) -> str:
        """
@ -110,6 +117,23 @@ class OpenAIMixin(ABC):
            raise ValueError(f"Model {model} has no provider_resource_id")
        return model_obj.provider_resource_id

+    async def _maybe_overwrite_id(self, resp: Any, stream: bool | None) -> Any:
+        """
+        Optionally replace the ``id`` of a response with a client-side generated one.
+
+        When ``overwrite_completion_id`` is falsy, ``resp`` is returned untouched.
+        Otherwise a single ``cltsd-<uuid4>`` id is generated for this call and:
+
+        - if ``stream`` is falsy, it is assigned to ``resp.id`` and ``resp`` is returned;
+        - if ``stream`` is truthy, an async generator is returned that iterates
+          ``resp`` and assigns the same id to every chunk before yielding it.
+
+        :param resp: the response object, or an async iterable of chunks when streaming
+        :param stream: whether ``resp`` should be treated as a stream of chunks
+        :returns: ``resp`` itself, or an async generator wrapping it when streaming
+        """
+        if not self.overwrite_completion_id:
+            return resp
+
+        # Generated once per call so every chunk of one stream carries the same id.
+        replacement_id = f"cltsd-{uuid.uuid4()}"
+
+        if not stream:
+            resp.id = replacement_id
+            return resp
+
+        async def _relabel():
+            async for piece in resp:
+                piece.id = replacement_id
+                yield piece
+
+        return _relabel()
+
    async def openai_completion(
        self,
        model: str,
@ -147,7 +171,7 @@ class OpenAIMixin(ABC):
            extra_body["guided_choice"] = guided_choice

        # TODO: fix openai_completion to return type compatible with OpenAI's API response
-        return await self.client.completions.create(  # type: ignore[no-any-return]
+        resp = await self.client.completions.create(
            **await prepare_openai_completion_params(
                model=await self._get_provider_model_id(model),
                prompt=prompt,
@ -171,6 +195,8 @@ class OpenAIMixin(ABC):
            extra_body=extra_body,
        )

+        return await self._maybe_overwrite_id(resp, stream)  # type: ignore[no-any-return]
+
    async def openai_chat_completion(
        self,
        model: str,
@ -200,8 +226,7 @@ class OpenAIMixin(ABC):
        """
        Direct OpenAI chat completion API call.
        """
-        # Type ignore because return types are compatible
-        return await self.client.chat.completions.create(  # type: ignore[no-any-return]
+        resp = await self.client.chat.completions.create(
            **await prepare_openai_completion_params(
                model=await self._get_provider_model_id(model),
                messages=messages,
@ -229,6 +254,8 @@ class OpenAIMixin(ABC):
            )
        )

+        return await self._maybe_overwrite_id(resp, stream)  # type: ignore[no-any-return]
+
    async def openai_embeddings(
        self,
        model: str,