mirror of https://github.com/BerriAI/litellm.git
synced 2025-04-25 18:54:30 +00:00

fix(fixes-for-text-completion-streaming): fixes for text completion streaming

This commit is contained in:
parent 39fb3f2a74
commit ff12e023ae

2 changed files with 38 additions and 18 deletions
@@ -469,6 +469,7 @@ def completion(
         "caching_groups",
         "ttl",
         "cache",
+        "parent_call"
     ]
     default_params = openai_params + litellm_params
     non_default_params = {
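The new "parent_call" entry extends the list of litellm-internal keys that get folded into default_params, so the flag is stripped out of kwargs before the provider-specific (non-default) params are collected. A standalone sketch of that filtering pattern, with illustrative key lists rather than litellm's full ones:

    # illustrative key lists, not litellm's actual ones
    openai_params = ["model", "messages", "temperature", "stream"]
    litellm_params = ["caching_groups", "ttl", "cache", "parent_call"]
    default_params = openai_params + litellm_params

    def non_default_params(kwargs: dict) -> dict:
        # anything not recognised as a standard/internal key is treated
        # as a provider-specific parameter
        return {k: v for k, v in kwargs.items() if k not in default_params}

    print(non_default_params(
        {"model": "gpt-3.5-turbo", "parent_call": "text_completion", "top_k": 5}
    ))
    # -> {'top_k': 5}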
@@ -2619,7 +2620,7 @@ def text_completion(
             # only use engine when model not passed
             model = kwargs["engine"]
         kwargs.pop("engine")
-
+    kwargs["parent_call"] = kwargs.get("parent_call", "text_completion")
     text_completion_response = TextCompletionResponse()

     optional_params: Dict[str, Any] = {}
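Because the tag is set with kwargs.get, a parent_call supplied by an outer caller is preserved and only untagged calls get marked as originating from text_completion. A tiny standalone illustration of that pattern:

    def tag_parent_call(kwargs: dict) -> dict:
        # keep a tag that is already present, otherwise mark this call
        # as coming from text_completion
        kwargs["parent_call"] = kwargs.get("parent_call", "text_completion")
        return kwargs

    print(tag_parent_call({}))                        # {'parent_call': 'text_completion'}
    print(tag_parent_call({"parent_call": "proxy"}))  # {'parent_call': 'proxy'}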
@@ -2726,6 +2727,7 @@ def text_completion(
     if kwargs.get("acompletion", False) == True:
         return response
     if stream == True or kwargs.get("stream", False) == True:
+        print(f"original model response: {response}")
         response = TextCompletionStreamWrapper(completion_stream=response, model=model)
         return response
     transformed_logprobs = None
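On the streaming path, text_completion hands the raw provider stream to TextCompletionStreamWrapper and returns the wrapper directly, so callers iterate text-completion shaped chunks. A minimal caller-side sketch, assuming a configured provider key; the model name is only an example:

    # sketch only: needs a configured provider (e.g. OPENAI_API_KEY) to run
    import litellm

    response = litellm.text_completion(
        model="gpt-3.5-turbo-instruct",
        prompt="Say hello",
        stream=True,
    )
    # with stream=True the return value is the TextCompletionStreamWrapper
    for chunk in response:
        print(chunk["choices"][0]["text"], end="")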
@@ -3162,22 +3164,23 @@ def stream_chunk_builder_text_completion(chunks: list, messages: Optional[List]=
     else:
         completion_output = ""
     # # Update usage information if needed
     try:
-        response["usage"]["prompt_tokens"] = token_counter(
-            model=model, messages=messages
+        print(f"INSIDE TEXT COMPLETION STREAM CHUNK BUILDER")
+        _usage = litellm.Usage
+        print(f"messages: {messages}")
+        _usage.prompt_tokens = token_counter(
+            model=model, messages=messages, count_response_tokens=True
         )
     except:  # don't allow this failing to block a complete streaming response from being returned
         print_verbose(f"token_counter failed, assuming prompt tokens is 0")
-        response["usage"]["prompt_tokens"] = 0
-    response["usage"]["completion_tokens"] = token_counter(
+        print(f"received prompt tokens: {_usage.prompt_tokens}")
+    _usage.completion_tokens = token_counter(
         model=model,
         text=combined_content,
         count_response_tokens=True,  # count_response_tokens is a Flag to tell token counter this is a response, No need to add extra tokens we do for input messages
     )
-    response["usage"]["total_tokens"] = (
-        response["usage"]["prompt_tokens"] + response["usage"]["completion_tokens"]
+    _usage.total_tokens = (
+        _usage.prompt_tokens + _usage.completion_tokens
     )
-    return response
+    response["usage"] = _usage
+    return litellm.TextCompletionResponse(**response)

 def stream_chunk_builder(chunks: list, messages: Optional[list] = None):
     id = chunks[0]["id"]
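The builder now accumulates token counts on a litellm Usage object instead of writing into the response dict field by field, then re-wraps the final dict as a TextCompletionResponse. A standalone sketch of the same accounting, assuming litellm is installed; token_counter and Usage are the helpers referenced above, while the model name and text are placeholders:

    import litellm
    from litellm import token_counter

    messages = [{"role": "user", "content": "Say hello"}]
    combined_content = "Hello there!"  # concatenation of the streamed chunk text

    usage = litellm.Usage()
    usage.prompt_tokens = token_counter(model="gpt-3.5-turbo", messages=messages)
    usage.completion_tokens = token_counter(
        model="gpt-3.5-turbo",
        text=combined_content,
        count_response_tokens=True,  # responses skip the per-message prompt overhead
    )
    usage.total_tokens = usage.prompt_tokens + usage.completion_tokens
    print(usage)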
@@ -2038,12 +2038,19 @@ def client(original_function):
             if (
                 "complete_response" in kwargs
                 and kwargs["complete_response"] == True
+                and kwargs.get("parent_call", None) is None
             ):
                 chunks = []
                 for idx, chunk in enumerate(result):
                     chunks.append(chunk)
+                call_type = original_function.__name__
+                if call_type == CallTypes.completion.value:
                     return litellm.stream_chunk_builder(
-                        chunks, messages=kwargs.get("messages", None)
+                        chunks, messages=kwargs.get("messages")
                     )
+                elif call_type == CallTypes.text_completion.value:
+                    return litellm.stream_chunk_builder(
+                        chunks, messages=[{"role": "user", "content": kwargs.get("prompt")}]
+                    )
             else:
                 return result
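The extra kwargs.get("parent_call", None) check makes the client wrapper assemble a complete response from the collected chunks only for the outermost call: once text_completion has tagged its inner completion call, the wrapper steps aside and lets text_completion handle its own stream. It also dispatches on the original function's name so a text-completion prompt is wrapped as a single user message before stream_chunk_builder runs. A toy sketch of the guard alone:

    def should_build_complete_response(kwargs: dict) -> bool:
        # only untagged (outermost) calls rebuild the full response from chunks
        return (
            "complete_response" in kwargs
            and kwargs["complete_response"] == True
            and kwargs.get("parent_call", None) is None
        )

    print(should_build_complete_response({"complete_response": True}))
    # True
    print(should_build_complete_response(
        {"complete_response": True, "parent_call": "text_completion"}
    ))
    # False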
@@ -2695,9 +2702,10 @@ def token_counter(
         raise ValueError("text and messages cannot both be None")
     elif isinstance(text, List):
         text = "".join(t for t in text if isinstance(t, str))
-
+    print_verbose(f"text: {text}")
     if model is not None:
         tokenizer_json = _select_tokenizer(model=model)
+        print(f"tokenizer_json['type']: {tokenizer_json['type']}")
         if tokenizer_json["type"] == "huggingface_tokenizer":
             print_verbose(
                 f"Token Counter - using hugging face token counter, for model={model}"
@@ -2731,6 +2739,7 @@ def token_counter(
             num_tokens = len(enc)
     else:
         num_tokens = len(encoding.encode(text))  # type: ignore
+    print_verbose(f"final num tokens returned: {num_tokens}")
     return num_tokens

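token_counter is a public litellm helper, so the added diagnostics are easiest to see by calling it directly. A minimal usage sketch; the model name is just an example, and message counting includes per-message formatting tokens, so the two values usually differ:

    from litellm import token_counter

    n_messages = token_counter(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "Hey, how's it going"}],
    )
    n_text = token_counter(model="gpt-3.5-turbo", text="Hey, how's it going")
    print(n_messages, n_text)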
@@ -7760,6 +7769,8 @@ class TextCompletionStreamWrapper:

     def convert_to_text_completion_object(self, chunk: ModelResponse):
         try:
+            if not isinstance(chunk, ModelResponse):
+                return
             response = TextCompletionResponse()
             response["id"] = chunk.get("id", None)
             response["object"] = "text_completion"
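The isinstance guard means anything that is not a ModelResponse chunk is skipped rather than coerced. A condensed sketch of that conversion path, mirroring only the fields shown above; ModelResponse and TextCompletionResponse are litellm types, while the helper name here is illustrative:

    from litellm.utils import ModelResponse, TextCompletionResponse

    def to_text_completion(chunk):
        if not isinstance(chunk, ModelResponse):
            return None
        response = TextCompletionResponse()
        response["id"] = chunk.get("id", None)
        response["object"] = "text_completion"
        return response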
@@ -7784,12 +7795,18 @@ class TextCompletionStreamWrapper:
         # model_response = ModelResponse(stream=True, model=self.model)
         response = TextCompletionResponse()
         try:
-            for chunk in self.completion_stream:
+            while True:
+                if isinstance(self.completion_stream, str) or isinstance(
+                    self.completion_stream, bytes
+                ) or isinstance(self.completion_stream, ModelResponse):
+                    chunk = self.completion_stream
+                else:
+                    chunk = next(self.completion_stream)
+
                 if chunk == "None" or chunk is None:
                     raise Exception
                 processed_chunk = self.convert_to_text_completion_object(chunk=chunk)
                 return processed_chunk
-            raise StopIteration
         except StopIteration:
             raise StopIteration
         except Exception as e:
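__next__ now drains the underlying stream with an explicit loop and can also treat a plain string, bytes, or a single ModelResponse as a one-shot "stream". A generic sketch of that pattern, detached from litellm:

    def next_chunk(stream):
        # a pre-materialised value is returned as-is; anything else is
        # treated as an iterator and advanced
        if isinstance(stream, (str, bytes)):
            return stream
        return next(stream)

    gen = iter(["hello", " world"])
    print(next_chunk(gen) + next_chunk(gen))  # hello world
    print(next_chunk("all-at-once"))          # all-at-once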