mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-12-15 18:33:09 +00:00)
completion() for tgi

parent 21f2e9adf5
commit 5570a63248

4 changed files with 100 additions and 8 deletions
@@ -95,13 +95,6 @@ async def process_completion_stream_response(
         choice = chunk.choices[0]
         finish_reason = choice.finish_reason
-
-        if finish_reason:
-            if finish_reason in ["stop", "eos", "eos_token"]:
-                stop_reason = StopReason.end_of_turn
-            elif finish_reason == "length":
-                stop_reason = StopReason.out_of_tokens
-            break
 
         text = text_from_choice(choice)
         if text == "<|eot_id|>":
             stop_reason = StopReason.end_of_turn
@@ -115,6 +108,12 @@ async def process_completion_stream_response(
             delta=text,
             stop_reason=stop_reason,
         )
+        if finish_reason:
+            if finish_reason in ["stop", "eos", "eos_token"]:
+                stop_reason = StopReason.end_of_turn
+            elif finish_reason == "length":
+                stop_reason = StopReason.out_of_tokens
+            break
 
     yield CompletionResponseStreamChunk(
         delta="",
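Taken together, the two hunks above reorder the streaming loop in process_completion_stream_response: previously the loop mapped finish_reason to a StopReason and broke out before the final chunk's text was yielded; after this change the text is emitted first and the finish_reason handling (and break) runs afterwards, so the last piece of a completion stream is no longer dropped. A minimal sketch of the resulting loop body, assuming the usual stop_reason = None initialization and eliding the lines that fall between the two hunks:

# Sketch only: the initialization and anything not visible in the hunks
# above are assumptions, not part of this commit.
stop_reason = None
async for chunk in stream:
    choice = chunk.choices[0]
    finish_reason = choice.finish_reason

    text = text_from_choice(choice)
    if text == "<|eot_id|>":
        stop_reason = StopReason.end_of_turn
    # ... lines between the two hunks are not shown in this diff ...

    yield CompletionResponseStreamChunk(
        delta=text,
        stop_reason=stop_reason,
    )
    # The stop handling now runs only after the chunk's text has been yielded.
    if finish_reason:
        if finish_reason in ["stop", "eos", "eos_token"]:
            stop_reason = StopReason.end_of_turn
        elif finish_reason == "length":
            stop_reason = StopReason.out_of_tokens
        break

# Final sentinel chunk carrying only the stop reason.
yield CompletionResponseStreamChunk(
    delta="",
    stop_reason=stop_reason,
)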
@@ -31,6 +31,13 @@ def completion_request_to_prompt(
     return formatter.tokenizer.decode(model_input.tokens)
 
 
+def completion_request_to_prompt_model_input_info(
+    request: CompletionRequest, formatter: ChatFormat
+) -> Tuple[str, int]:
+    model_input = formatter.encode_content(request.content)
+    return (formatter.tokenizer.decode(model_input.tokens), len(model_input.tokens))
+
+
 def chat_completion_request_to_prompt(
     request: ChatCompletionRequest, formatter: ChatFormat
 ) -> str:
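The hunk above adds completion_request_to_prompt_model_input_info, a sibling of completion_request_to_prompt that also reports how many tokens the encoded prompt occupies; a provider can use that count to budget how many new tokens to request. A hedged usage sketch, assuming a context-window size ctx_len and the sampling_params.max_tokens field; only the helper itself comes from this commit:

def build_completion_params(request, formatter, ctx_len):
    # From this commit: decode the prompt and count its tokens in one call.
    prompt, n_input = completion_request_to_prompt_model_input_info(request, formatter)
    # Illustrative bookkeeping: keep prompt + generated tokens inside the window.
    requested = request.sampling_params.max_tokens or (ctx_len - n_input)
    max_new_tokens = min(requested, ctx_len - n_input - 1)
    return prompt, max_new_tokens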