fixes and linting

2025-12-31 06:33:54 +00:00 · 2025-03-28 18:33:36 -07:00 · 2025-03-28 18:33:36 -07:00 · 5251d2422d
commit 5251d2422d
parent 021dd0d35d
8 changed files with 149 additions and 345 deletions
--- a/llama_stack/providers/remote/inference/tgi/tgi.py
+++ b/llama_stack/providers/remote/inference/tgi/tgi.py
@ -5,10 +5,9 @@
 # the root directory of this source tree.


-import logging
 from typing import AsyncGenerator, List, Optional

-from huggingface_hub import AsyncInferenceClient, HfApi, InferenceClient
+from huggingface_hub import AsyncInferenceClient, HfApi

 from llama_stack.apis.common.content_types import (
    InterleavedContent,
@ -16,12 +15,10 @@ from llama_stack.apis.common.content_types import (
 )
 from llama_stack.apis.inference import (
    ChatCompletionRequest,
-    ChatCompletionResponse,
    CompletionRequest,
    EmbeddingsResponse,
    EmbeddingTaskType,
    Inference,
-    JsonSchemaResponseFormat,
    LogProbConfig,
    Message,
    ResponseFormat,
@ -38,26 +35,20 @@ from llama_stack.log import get_logger
 from llama_stack.models.llama.sku_list import all_registered_models
 from llama_stack.providers.datatypes import ModelsProtocolPrivate
 from llama_stack.providers.utils.inference.model_registry import (
-    build_hf_repo_model_entry,
    ModelRegistryHelper,
+    build_hf_repo_model_entry,
 )
 from llama_stack.providers.utils.inference.openai_compat import (
-    convert_chat_completion_request_to_openai_params,
-    convert_completion_request_to_openai_params,
-    convert_message_to_openai_dict_new,
-    convert_openai_chat_completion_choice,
-    convert_openai_chat_completion_stream,
-    convert_tooldef_to_openai_tool,
-    get_sampling_options,
    OpenAICompatCompletionChoice,
    OpenAICompatCompletionResponse,
-    process_chat_completion_response,
-    process_chat_completion_stream_response,
+    convert_chat_completion_request_to_openai_params,
+    convert_openai_chat_completion_choice,
+    convert_openai_chat_completion_stream,
+    get_sampling_options,
    process_completion_response,
    process_completion_stream_response,
 )
 from llama_stack.providers.utils.inference.prompt_adapter import (
-    chat_completion_request_to_model_input_info,
    completion_request_to_prompt_model_input_info,
 )

@ -85,9 +76,7 @@ class _HfAdapter(Inference, ModelsProtocolPrivate):
    def __init__(self) -> None:
        self.register_helper = ModelRegistryHelper(build_hf_repo_model_entries())
        self.huggingface_repo_to_llama_model_id = {
-            model.huggingface_repo: model.descriptor()
-            for model in all_registered_models()
-            if model.huggingface_repo
+            model.huggingface_repo: model.descriptor() for model in all_registered_models() if model.huggingface_repo
        }

    async def shutdown(self) -> None:
@ -114,7 +103,7 @@ class _HfAdapter(Inference, ModelsProtocolPrivate):
        logprobs: Optional[LogProbConfig] = None,
    ) -> AsyncGenerator:
        if response_format:
-            raise ValueError(f"TGI does not support Response Format for completions.")
+            raise ValueError("TGI does not support Response Format for completions.")

        if sampling_params is None:
            sampling_params = SamplingParams()
@ -166,17 +155,13 @@ class _HfAdapter(Inference, ModelsProtocolPrivate):
        return options

    async def _get_params_for_completion(self, request: CompletionRequest) -> dict:
-        prompt, input_tokens = await completion_request_to_prompt_model_input_info(
-            request
-        )
+        prompt, input_tokens = await completion_request_to_prompt_model_input_info(request)

        return dict(
            prompt=prompt,
            stream=request.stream,
            details=True,
-            max_new_tokens=self._get_max_new_tokens(
-                request.sampling_params, input_tokens
-            ),
+            max_new_tokens=self._get_max_new_tokens(request.sampling_params, input_tokens),
            stop_sequences=["<|eom_id|>", "<|eot_id|>"],
            **self._build_options(request.sampling_params, request.response_format),
        )
@ -185,16 +170,14 @@ class _HfAdapter(Inference, ModelsProtocolPrivate):
        params = await self._get_params_for_completion(request)

        async def _generate_and_convert_to_openai_compat():
-            s = self.client.text_generation(**params)
-            for chunk in s:
+            s = await self.client.text_generation(**params)
+            async for chunk in s:
                token_result = chunk.token
                finish_reason = None
                if chunk.details:
                    finish_reason = chunk.details.finish_reason

-                choice = OpenAICompatCompletionChoice(
-                    text=token_result.text, finish_reason=finish_reason
-                )
+                choice = OpenAICompatCompletionChoice(text=token_result.text, finish_reason=finish_reason)
                yield OpenAICompatCompletionResponse(
                    choices=[choice],
                )
@ -205,7 +188,7 @@ class _HfAdapter(Inference, ModelsProtocolPrivate):

    async def _nonstream_completion(self, request: CompletionRequest) -> AsyncGenerator:
        params = await self._get_params_for_completion(request)
-        r = self.client.text_generation(**params)
+        r = await self.client.text_generation(**params)

        choice = OpenAICompatCompletionChoice(
            finish_reason=r.details.finish_reason,
@ -234,9 +217,6 @@ class _HfAdapter(Inference, ModelsProtocolPrivate):
        if sampling_params is None:
            sampling_params = SamplingParams()
        model = await self.model_store.get_model(model_id)
-        from rich.pretty import pprint
-
-        pprint(messages)
        request = ChatCompletionRequest(
            model=model.provider_resource_id,
            messages=messages,
@ -250,18 +230,9 @@ class _HfAdapter(Inference, ModelsProtocolPrivate):

        params = await convert_chat_completion_request_to_openai_params(request)

-        import json
-
-        # print(json.dumps(params, indent=2))
-
-        pprint(params)
-
-        response = self.client.chat.completions.create(**params)
-
+        response = await self.client.chat.completions.create(**params)
        if stream:
-            return convert_openai_chat_completion_stream(
-                response, enable_incremental_tool_calls=True
-            )
+            return convert_openai_chat_completion_stream(response, enable_incremental_tool_calls=True)
        else:
            return convert_openai_chat_completion_choice(response.choices[0])

@ -281,18 +252,16 @@ class TGIAdapter(_HfAdapter):
        logger.info(f"Initializing TGI client with url={config.url}")
        # unfortunately, the TGI async client does not work well with proxies
        # so using sync client for now instead
-        self.client = InferenceClient(model=f"{config.url}")
+        self.client = AsyncInferenceClient(model=f"{config.url}")

-        endpoint_info = self.client.get_endpoint_info()
+        endpoint_info = await self.client.get_endpoint_info()
        self.max_tokens = endpoint_info["max_total_tokens"]
        self.model_id = endpoint_info["model_id"]


 class InferenceAPIAdapter(_HfAdapter):
    async def initialize(self, config: InferenceAPIImplConfig) -> None:
-        self.client = AsyncInferenceClient(
-            model=config.huggingface_repo, token=config.api_token.get_secret_value()
-        )
+        self.client = AsyncInferenceClient(model=config.huggingface_repo, token=config.api_token.get_secret_value())
        endpoint_info = await self.client.get_endpoint_info()
        self.max_tokens = endpoint_info["max_total_tokens"]
        self.model_id = endpoint_info["model_id"]
@ -310,6 +279,4 @@ class InferenceEndpointAdapter(_HfAdapter):
        # Initialize the adapter
        self.client = endpoint.async_client
        self.model_id = endpoint.repository
-        self.max_tokens = int(
-            endpoint.raw["model"]["image"]["custom"]["env"]["MAX_TOTAL_TOKENS"]
-        )
+        self.max_tokens = int(endpoint.raw["model"]["image"]["custom"]["env"]["MAX_TOTAL_TOKENS"])