chore: remove llama_models.llama3.api imports from providers (#1107)

There should be a choke-point for llama3.api imports -- this is the
prompt adapter. Creating a ChatFormat() object on demand is inexpensive.
The underlying Tokenizer is a singleton anyway.
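
For context, the choke-point itself stays tiny. A minimal sketch of what such a prompt-adapter helper can look like, assuming `Tokenizer.get_instance()` and `decode_assistant_message_from_content()` from `llama_models.llama3.api` (the exact module paths and body are assumptions; only the helper's name and its call sites appear in the diff below):

```python
# Sketch of the choke-point in prompt_adapter (assumed body; the diff below only
# shows that providers now import decode_assistant_message from this module).
from llama_models.llama3.api.chat_format import ChatFormat
from llama_models.llama3.api.datatypes import RawMessage, StopReason  # assumed path
from llama_models.llama3.api.tokenizer import Tokenizer  # assumed path


def decode_assistant_message(content: str, stop_reason: StopReason) -> RawMessage:
    # ChatFormat() is cheap to construct on demand, and Tokenizer.get_instance()
    # returns the shared singleton, so this adds no meaningful overhead per call.
    formatter = ChatFormat(Tokenizer.get_instance())
    return formatter.decode_assistant_message_from_content(content, stop_reason)
```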
Ashwin Bharambe 2025-02-19 19:01:29 -08:00 committed by GitHub
parent e9b8259cf9
commit cdcbeb005b
13 changed files with 77 additions and 113 deletions


@@ -7,7 +7,6 @@ import json
 import logging
 from typing import AsyncGenerator, Dict, List, Optional, Union
-from llama_models.llama3.api.chat_format import ChatFormat
 from openai.types.chat import ChatCompletionMessageToolCall
 from pydantic import BaseModel
@@ -40,6 +39,7 @@ from llama_stack.models.llama.datatypes import (
 )
 from llama_stack.providers.utils.inference.prompt_adapter import (
     convert_image_content_to_url,
+    decode_assistant_message,
 )
 logger = logging.getLogger(__name__)
@@ -149,7 +149,7 @@ def convert_openai_completion_logprobs_stream(text: str, logprobs: Optional[Unio
     return None
-def process_completion_response(response: OpenAICompatCompletionResponse, formatter: ChatFormat) -> CompletionResponse:
+def process_completion_response(response: OpenAICompatCompletionResponse) -> CompletionResponse:
     choice = response.choices[0]
     # drop suffix <eot_id> if present and return stop reason as end of turn
     if choice.text.endswith("<|eot_id|>"):
@@ -174,16 +174,13 @@ def process_completion_response(response: OpenAICompatCompletionResponse, format
 def process_chat_completion_response(
     response: OpenAICompatCompletionResponse,
-    formatter: ChatFormat,
     request: ChatCompletionRequest,
 ) -> ChatCompletionResponse:
     choice = response.choices[0]
     # TODO: This does not work well with tool calls for vLLM remote provider
     # Ref: https://github.com/meta-llama/llama-stack/issues/1058
-    raw_message = formatter.decode_assistant_message_from_content(
-        text_from_choice(choice), get_stop_reason(choice.finish_reason)
-    )
+    raw_message = decode_assistant_message(text_from_choice(choice), get_stop_reason(choice.finish_reason))
     # NOTE: If we do not set tools in chat-completion request, we should not
     # expect the ToolCall in the response. Instead, we should return the raw
@@ -217,7 +214,7 @@ def process_chat_completion_response(
 async def process_completion_stream_response(
-    stream: AsyncGenerator[OpenAICompatCompletionResponse, None], formatter: ChatFormat
+    stream: AsyncGenerator[OpenAICompatCompletionResponse, None],
 ) -> AsyncGenerator:
     stop_reason = None
@@ -254,7 +251,6 @@ async def process_completion_stream_response(
 async def process_chat_completion_stream_response(
     stream: AsyncGenerator[OpenAICompatCompletionResponse, None],
-    formatter: ChatFormat,
     request: ChatCompletionRequest,
 ) -> AsyncGenerator:
     yield ChatCompletionResponseStreamChunk(
@@ -333,7 +329,7 @@ async def process_chat_completion_stream_response(
     )
     # parse tool calls and report errors
-    message = formatter.decode_assistant_message_from_content(buffer, stop_reason)
+    message = decode_assistant_message(buffer, stop_reason)
     parsed_tool_calls = len(message.tool_calls) > 0
     if ipython and not parsed_tool_calls:
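
At provider call sites the change is mechanical: the `ChatFormat`/`formatter` plumbing disappears and only the response (plus the request, for the chat variants) is passed. A hypothetical before/after, illustrative rather than taken from this diff:

```python
# Before: each provider constructed a ChatFormat and threaded it through.
# formatter = ChatFormat(Tokenizer.get_instance())
# result = process_chat_completion_response(response, formatter, request)

# After: the llama3.api dependency lives behind the prompt adapter.
result = process_chat_completion_response(response, request)
```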