Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-12-09 11:20:58 +00:00)
Convert TGI to work with openai_compat
This commit is contained in:
parent 05e73d12b3
commit ed899a5dec

6 changed files with 133 additions and 338 deletions
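
The hunks preserved below come from the shared helpers rather than the TGI adapter itself: the prompt utility gains a variant that also reports the prompt's token count, and the openai_compat layer learns the finish reasons and stop tokens a TGI backend produces. As a rough mental model (a sketch with stand-in dataclasses, not actual llama-stack or TGI client types), the OpenAI-style response shape those helpers consume looks like this:

from dataclasses import dataclass
from typing import List, Optional


@dataclass
class CompatChoice:
    # generated text, or a streamed delta
    text: str
    # e.g. "stop", "length", or TGI's "eos_token"
    finish_reason: Optional[str]


@dataclass
class CompatResponse:
    choices: List[CompatChoice]


# A TGI generation adapted into this shape can be handed to the shared
# process_chat_completion_response / process_chat_completion_stream_response
# helpers that appear in the hunks below.
example = CompatResponse(choices=[CompatChoice(text="Hello!", finish_reason="eos_token")])
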
@@ -3,8 +3,11 @@
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Tuple

from llama_models.llama3.api.chat_format import ChatFormat
from termcolor import cprint

from llama_models.llama3.api.datatypes import *  # noqa: F403
from llama_stack.apis.inference import *  # noqa: F403
from llama_models.datatypes import ModelFamily

@@ -28,6 +31,17 @@ def chat_completion_request_to_prompt(
     return formatter.tokenizer.decode(model_input.tokens)
 
 
+def chat_completion_request_to_model_input_info(
+    request: ChatCompletionRequest, formatter: ChatFormat
+) -> Tuple[str, int]:
+    messages = augment_messages_for_tools(request)
+    model_input = formatter.encode_dialog_prompt(messages)
+    return (
+        formatter.tokenizer.decode(model_input.tokens),
+        len(model_input.tokens),
+    )
+
+
 def augment_messages_for_tools(request: ChatCompletionRequest) -> List[Message]:
     """Reads chat completion request and augments the messages to handle tools.
     For eg. for llama_3_1, add system message with the appropriate tools or

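A plausible reason for returning the token count alongside the rendered prompt (an assumption about intent, not something stated in the diff) is that a TGI-style backend needs max_new_tokens budgeted against the model's context window. A hypothetical caller of chat_completion_request_to_model_input_info might use the pair like this:

# Hypothetical budgeting helper; the 4096 context window and 512 default
# are illustrative numbers, not values taken from llama-stack.
def budget_max_new_tokens(prompt_tokens: int, context_window: int = 4096,
                          requested_max: int = 512) -> int:
    # Never request more new tokens than the window can still hold.
    remaining = max(context_window - prompt_tokens, 0)
    return min(requested_max, remaining)


# e.g. prompt, prompt_tokens = chat_completion_request_to_model_input_info(request, formatter)
assert budget_max_new_tokens(900) == 512     # plenty of room left
assert budget_max_new_tokens(3900) == 196    # clipped to the remaining window
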
@@ -60,6 +60,8 @@ def process_chat_completion_response(
    if reason := choice.finish_reason:
        if reason in ["stop", "eos"]:
            stop_reason = StopReason.end_of_turn
        elif reason == "eom":
            stop_reason = StopReason.end_of_message
        elif reason == "length":
            stop_reason = StopReason.out_of_tokens

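Taken on its own, the finish_reason handling above is just a small mapping from OpenAI-style strings to llama-stack's StopReason. A self-contained sketch of that mapping (with a stand-in enum, since the real StopReason comes from llama_models):

from enum import Enum
from typing import Optional


class StopReason(Enum):  # stand-in for llama_models' StopReason
    end_of_turn = "end_of_turn"
    end_of_message = "end_of_message"
    out_of_tokens = "out_of_tokens"


def map_finish_reason(reason: Optional[str]) -> Optional[StopReason]:
    # Mirrors the branches in process_chat_completion_response above.
    if reason in ("stop", "eos"):
        return StopReason.end_of_turn
    if reason == "eom":
        return StopReason.end_of_message
    if reason == "length":
        return StopReason.out_of_tokens
    return None


assert map_finish_reason("eos") is StopReason.end_of_turn
assert map_finish_reason("length") is StopReason.out_of_tokens
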
@@ -96,7 +98,7 @@ async def process_chat_completion_stream_response(
         finish_reason = choice.finish_reason
 
         if finish_reason:
-            if stop_reason is None and finish_reason in ["stop", "eos"]:
+            if stop_reason is None and finish_reason in ["stop", "eos", "eos_token"]:
                 stop_reason = StopReason.end_of_turn
             elif stop_reason is None and finish_reason == "length":
                 stop_reason = StopReason.out_of_tokens

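The one-line change above widens the streaming branch to also accept "eos_token", the finish reason TGI reports when generation hits the end-of-sequence token; "stop" and "eos" are the spellings used by other OpenAI-compatible backends. Continuing the StopReason sketch above, the streaming-side mapping becomes:

# Streaming-side variant, reusing StopReason and Optional from the sketch above.
def map_stream_finish_reason(reason: Optional[str]) -> Optional[StopReason]:
    if reason in ("stop", "eos", "eos_token"):
        return StopReason.end_of_turn
    if reason == "length":
        return StopReason.out_of_tokens
    return None


assert map_stream_finish_reason("eos_token") is StopReason.end_of_turn
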
@@ -118,16 +120,16 @@ async def process_chat_completion_stream_response(
             buffer += text
             continue
 
-        if ipython:
-            if text == "<|eot_id|>":
-                stop_reason = StopReason.end_of_turn
-                text = ""
-                continue
-            elif text == "<|eom_id|>":
-                stop_reason = StopReason.end_of_message
-                text = ""
-                continue
+        if text == "<|eot_id|>":
+            stop_reason = StopReason.end_of_turn
+            text = ""
+            continue
+        elif text == "<|eom_id|>":
+            stop_reason = StopReason.end_of_message
+            text = ""
+            continue
 
+        if ipython:
             buffer += text
             delta = ToolCallDelta(
                 content=text,

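The restructuring above moves the <|eot_id|> / <|eom_id|> checks out of the if ipython: branch, so the stop tokens end the stream whether or not a tool call is being buffered, and the ipython branch is left to handle only tool-call accumulation. A compact, stand-alone sketch of that per-token decision (stand-in names, not the actual generator code):

from enum import Enum
from typing import Optional, Tuple


class StopReason(Enum):  # stand-in, as in the earlier sketches
    end_of_turn = "end_of_turn"
    end_of_message = "end_of_message"


def classify_stream_text(text: str, ipython: bool) -> Tuple[str, Optional[StopReason], bool]:
    """Return (text_to_emit, stop_reason, buffer_as_tool_call) for one streamed chunk."""
    # Stop tokens are handled first, for every chunk, tool call or not.
    if text == "<|eot_id|>":
        return "", StopReason.end_of_turn, False
    if text == "<|eom_id|>":
        return "", StopReason.end_of_message, False
    # Inside a <|python_tag|> tool call, text is buffered instead of emitted verbatim.
    return text, None, ipython


assert classify_stream_text("<|eom_id|>", ipython=True) == ("", StopReason.end_of_message, False)
assert classify_stream_text("print(1)", ipython=True) == ("print(1)", None, True)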