From edb10198efd70b9161fbf2981aad957d185af43e Mon Sep 17 00:00:00 2001
From: Ishaan Jaff
Date: Wed, 8 May 2024 21:25:40 -0700
Subject: [PATCH 1/6] feat - add stream_options support litellm

---
 litellm/main.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/litellm/main.py b/litellm/main.py
index bff9886ac..d6d276653 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -187,6 +187,7 @@ async def acompletion(
     top_p: Optional[float] = None,
     n: Optional[int] = None,
     stream: Optional[bool] = None,
+    stream_options: Optional[dict] = None,
     stop=None,
     max_tokens: Optional[int] = None,
     presence_penalty: Optional[float] = None,
@@ -223,6 +224,7 @@ async def acompletion(
         top_p (float, optional): The top-p parameter for nucleus sampling (default is 1.0).
         n (int, optional): The number of completions to generate (default is 1).
         stream (bool, optional): If True, return a streaming response (default is False).
+        stream_options (dict, optional): A dictionary containing options for the streaming response. Only use this if stream is True.
         stop(string/list, optional): - Up to 4 sequences where the LLM API will stop generating further tokens.
         max_tokens (integer, optional): The maximum number of tokens in the generated completion (default is infinity).
         presence_penalty (float, optional): It is used to penalize new tokens based on their existence in the text so far.
@@ -260,6 +262,7 @@ async def acompletion(
         "top_p": top_p,
         "n": n,
         "stream": stream,
+        "stream_options": stream_options,
         "stop": stop,
         "max_tokens": max_tokens,
         "presence_penalty": presence_penalty,
@@ -457,6 +460,7 @@ def completion(
     top_p: Optional[float] = None,
     n: Optional[int] = None,
     stream: Optional[bool] = None,
+    stream_options: Optional[dict] = None,
     stop=None,
     max_tokens: Optional[int] = None,
     presence_penalty: Optional[float] = None,
@@ -496,6 +500,7 @@ def completion(
         top_p (float, optional): The top-p parameter for nucleus sampling (default is 1.0).
         n (int, optional): The number of completions to generate (default is 1).
         stream (bool, optional): If True, return a streaming response (default is False).
+        stream_options (dict, optional): A dictionary containing options for the streaming response. Only set this when you set stream: true.
         stop(string/list, optional): - Up to 4 sequences where the LLM API will stop generating further tokens.
         max_tokens (integer, optional): The maximum number of tokens in the generated completion (default is infinity).
         presence_penalty (float, optional): It is used to penalize new tokens based on their existence in the text so far.
@@ -573,6 +578,7 @@ def completion(
         "top_p",
         "n",
         "stream",
+        "stream_options",
         "stop",
         "max_tokens",
         "presence_penalty",
@@ -783,6 +789,7 @@ def completion(
             top_p=top_p,
             n=n,
             stream=stream,
+            stream_options=stream_options,
             stop=stop,
             max_tokens=max_tokens,
             presence_penalty=presence_penalty,

From 10420516020f5da2e5369035da7898f218d2449e Mon Sep 17 00:00:00 2001
From: Ishaan Jaff
Date: Wed, 8 May 2024 21:52:25 -0700
Subject: [PATCH 2/6] support stream_options for chat completion models

---
 litellm/llms/openai.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/litellm/llms/openai.py b/litellm/llms/openai.py
index d516334ac..d542cbe07 100644
--- a/litellm/llms/openai.py
+++ b/litellm/llms/openai.py
@@ -530,6 +530,7 @@ class OpenAIChatCompletion(BaseLLM):
                 model=model,
                 custom_llm_provider="openai",
                 logging_obj=logging_obj,
+                stream_options=data.get("stream_options", None),
             )
             return streamwrapper
 
@@ -579,6 +580,7 @@ class OpenAIChatCompletion(BaseLLM):
                 model=model,
                 custom_llm_provider="openai",
                 logging_obj=logging_obj,
+                stream_options=data.get("stream_options", None),
             )
             return streamwrapper
         except (

From f2965660dd2222ee74f7cd447f9b4ace70ba8364 Mon Sep 17 00:00:00 2001
From: Ishaan Jaff
Date: Wed, 8 May 2024 21:52:39 -0700
Subject: [PATCH 3/6] test openai stream_options

---
 litellm/tests/test_streaming.py | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/litellm/tests/test_streaming.py b/litellm/tests/test_streaming.py
index 271a53dd4..7d639d7a3 100644
--- a/litellm/tests/test_streaming.py
+++ b/litellm/tests/test_streaming.py
@@ -1501,6 +1501,37 @@ def test_openai_chat_completion_complete_response_call():
 # test_openai_chat_completion_complete_response_call()
 
 
+def test_openai_stream_options_call():
+    litellm.set_verbose = False
+    response = litellm.completion(
+        model="gpt-3.5-turbo",
+        messages=[{"role": "system", "content": "say GM - we're going to make it "}],
+        stream=True,
+        stream_options={"include_usage": True},
+        max_tokens=10,
+    )
+    usage = None
+    chunks = []
+    for chunk in response:
+        print("chunk: ", chunk)
+        chunks.append(chunk)
+
+    last_chunk = chunks[-1]
+    print("last chunk: ", last_chunk)
+
+    """
+    Assert that:
+    - Last Chunk includes Usage
+    - All chunks prior to last chunk have usage=None
+    """
+
+    assert last_chunk.usage is not None
+    assert last_chunk.usage.total_tokens > 0
+    assert last_chunk.usage.prompt_tokens > 0
+    assert last_chunk.usage.completion_tokens > 0
+
+    # assert all non last chunks have usage=None
+    assert all(chunk.usage is None for chunk in chunks[:-1])
 
 
 def test_openai_text_completion_call():

From 80ca011a642ae6206c293eef15c951c1f74a8e3c Mon Sep 17 00:00:00 2001
From: Ishaan Jaff
Date: Wed, 8 May 2024 21:53:33 -0700
Subject: [PATCH 4/6] support stream_options

---
 litellm/utils.py | 39 +++++++++++++++++++++++++++++++++++++--
 1 file changed, 37 insertions(+), 2 deletions(-)

diff --git a/litellm/utils.py b/litellm/utils.py
index df58db29c..64a644f15 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -612,6 +612,7 @@ class ModelResponse(OpenAIObject):
         system_fingerprint=None,
         usage=None,
         stream=None,
+        stream_options=None,
         response_ms=None,
         hidden_params=None,
         **params,
@@ -658,6 +659,12 @@ class ModelResponse(OpenAIObject):
             usage = usage
         elif stream is None or stream == False:
             usage = Usage()
+        elif (
+            stream == True
+            and stream_options is not None
+            and stream_options.get("include_usage") == True
+        ):
+            usage = Usage()
 
         if hidden_params:
             self._hidden_params = hidden_params
@@ -4839,6 +4846,7 @@ def get_optional_params(
     top_p=None,
     n=None,
     stream=False,
+    stream_options=None,
     stop=None,
     max_tokens=None,
     presence_penalty=None,
@@ -4908,6 +4916,7 @@ def get_optional_params(
         "top_p": None,
         "n": None,
         "stream": None,
+        "stream_options": None,
         "stop": None,
         "max_tokens": None,
         "presence_penalty": None,
@@ -5779,6 +5788,8 @@ def get_optional_params(
        if n is not None:
            optional_params["n"] = n
        if stream is not None:
            optional_params["stream"] = stream
+       if stream_options is not None:
+           optional_params["stream_options"] = stream_options
        if stop is not None:
            optional_params["stop"] = stop
        if max_tokens is not None:
@@ -6049,6 +6060,7 @@ def get_supported_openai_params(model: str, custom_llm_provider: str):
             "top_p",
             "n",
             "stream",
+            "stream_options",
             "stop",
             "max_tokens",
             "presence_penalty",
@@ -9466,7 +9478,12 @@ def get_secret(
 # replicate/anthropic/cohere
 class CustomStreamWrapper:
     def __init__(
-        self, completion_stream, model, custom_llm_provider=None, logging_obj=None
+        self,
+        completion_stream,
+        model,
+        custom_llm_provider=None,
+        logging_obj=None,
+        stream_options=None,
     ):
         self.model = model
         self.custom_llm_provider = custom_llm_provider
@@ -9492,6 +9509,7 @@ class CustomStreamWrapper:
         self.response_id = None
         self.logging_loop = None
         self.rules = Rules()
+        self.stream_options = stream_options
 
     def __iter__(self):
         return self
@@ -9932,6 +9950,7 @@ class CustomStreamWrapper:
             is_finished = False
             finish_reason = None
             logprobs = None
+            usage = None
             original_chunk = None  # this is used for function/tool calling
             if len(str_line.choices) > 0:
                 if (
@@ -9966,12 +9985,15 @@ class CustomStreamWrapper:
                 else:
                     logprobs = None
 
+            usage = getattr(str_line, "usage", None)
+
             return {
                 "text": text,
                 "is_finished": is_finished,
                 "finish_reason": finish_reason,
                 "logprobs": logprobs,
                 "original_chunk": str_line,
+                "usage": usage,
             }
         except Exception as e:
             traceback.print_exc()
@@ -10274,7 +10296,9 @@ class CustomStreamWrapper:
             raise e
 
     def model_response_creator(self):
-        model_response = ModelResponse(stream=True, model=self.model)
+        model_response = ModelResponse(
+            stream=True, model=self.model, stream_options=self.stream_options
+        )
         if self.response_id is not None:
             model_response.id = self.response_id
         else:
@@ -10594,6 +10618,12 @@ class CustomStreamWrapper:
                 if response_obj["logprobs"] is not None:
                     model_response.choices[0].logprobs = response_obj["logprobs"]
 
+            if (
+                self.stream_options is not None
+                and self.stream_options["include_usage"] == True
+            ):
+                model_response.usage = response_obj["usage"]
+
             model_response.model = self.model
             print_verbose(
                 f"model_response finish reason 3: {self.received_finish_reason}; response_obj={response_obj}"
@@ -10681,6 +10711,11 @@ class CustomStreamWrapper:
                 except Exception as e:
                     model_response.choices[0].delta = Delta()
             else:
+                if (
+                    self.stream_options is not None
+                    and self.stream_options["include_usage"] == True
+                ):
+                    return model_response
                 return
             print_verbose(
                 f"model_response.choices[0].delta: {model_response.choices[0].delta}; completion_obj: {completion_obj}"

From e7e54772ae518258bfb52b5d9b7f612b60aa4750 Mon Sep 17 00:00:00 2001
From: Ishaan Jaff
Date: Wed, 8 May 2024 21:57:25 -0700
Subject: [PATCH 5/6] docs include `stream_options` param

---
 docs/my-website/docs/completion/input.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/docs/my-website/docs/completion/input.md b/docs/my-website/docs/completion/input.md
index 11ca13121..451deaac4 100644
--- a/docs/my-website/docs/completion/input.md
+++ b/docs/my-website/docs/completion/input.md
@@ -83,6 +83,7 @@ def completion(
     top_p: Optional[float] = None,
     n: Optional[int] = None,
     stream: Optional[bool] = None,
+    stream_options: Optional[dict] = None,
     stop=None,
     max_tokens: Optional[int] = None,
     presence_penalty: Optional[float] = None,
@@ -139,6 +140,10 @@ def completion(
 
 - `stream`: *boolean or null (optional)* - If set to true, it sends partial message deltas. Tokens will be sent as they become available, with the stream terminated by a [DONE] message.
 
+- `stream_options` *dict or null (optional)* - Options for streaming response. Only set this when you set `stream: true`
+
+    - `include_usage` *boolean (optional)* - If set, an additional chunk will be streamed before the data: [DONE] message. The usage field on this chunk shows the token usage statistics for the entire request, and the choices field will always be an empty array. All other chunks will also include a usage field, but with a null value.
+
 - `stop`: *string/ array/ null (optional)* - Up to 4 sequences where the API will stop generating further tokens.
 
 - `max_tokens`: *integer (optional)* - The maximum number of tokens to generate in the chat completion.

From dfd6361310bfc30b370a6e7a13699ae481e04403 Mon Sep 17 00:00:00 2001
From: Ishaan Jaff
Date: Thu, 9 May 2024 07:59:37 -0700
Subject: [PATCH 6/6] fix completion vs acompletion params

---
 litellm/main.py                   |  1 +
 litellm/tests/test_acompletion.py | 23 ++++++++++++++++++-----
 2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/litellm/main.py b/litellm/main.py
index d6d276653..186b87060 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -207,6 +207,7 @@ async def acompletion(
     api_version: Optional[str] = None,
     api_key: Optional[str] = None,
     model_list: Optional[list] = None,  # pass in a list of api_base,keys, etc.
+    extra_headers: Optional[dict] = None,
     # Optional liteLLM function params
     **kwargs,
 ):
diff --git a/litellm/tests/test_acompletion.py b/litellm/tests/test_acompletion.py
index e5c09b9b7..b83e34653 100644
--- a/litellm/tests/test_acompletion.py
+++ b/litellm/tests/test_acompletion.py
@@ -1,5 +1,6 @@
 import pytest
 from litellm import acompletion
+from litellm import completion
 
 
 def test_acompletion_params():
@@ -7,17 +8,29 @@ def test_acompletion_params():
     from litellm.types.completion import CompletionRequest
 
     acompletion_params_odict = inspect.signature(acompletion).parameters
-    acompletion_params = {name: param.annotation for name, param in acompletion_params_odict.items()}
-    completion_params = {field_name: field_type for field_name, field_type in CompletionRequest.__annotations__.items()}
+    completion_params_dict = inspect.signature(completion).parameters
 
-    # remove kwargs
-    acompletion_params.pop("kwargs", None)
+    acompletion_params = {
+        name: param.annotation for name, param in acompletion_params_odict.items()
+    }
+    completion_params = {
+        name: param.annotation for name, param in completion_params_dict.items()
+    }
 
     keys_acompletion = set(acompletion_params.keys())
     keys_completion = set(completion_params.keys())
 
+    print(keys_acompletion)
+    print("\n\n\n")
+    print(keys_completion)
+
+    print("diff=", keys_completion - keys_acompletion)
+
     # Assert that the parameters are the same
     if keys_acompletion != keys_completion:
-        pytest.fail("The parameters of the acompletion function and the CompletionRequest class are not the same.")
+        pytest.fail(
+            "The parameters of the litellm.acompletion function and litellm.completion are not the same."
+        )
+
 
 # test_acompletion_params()
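
Usage sketch (reviewer aid): a minimal example of how the `stream_options` parameter added by this series can be exercised, mirroring test_openai_stream_options_call from PATCH 3/6. The model name, prompt, and max_tokens value are illustrative, and an OpenAI API key is assumed to be configured in the environment.

    import litellm

    # Stream a completion and ask the provider to append a final usage chunk
    # via the OpenAI-style `stream_options` parameter added in this series.
    response = litellm.completion(
        model="gpt-3.5-turbo",  # illustrative model choice
        messages=[{"role": "user", "content": "Say hello"}],
        stream=True,
        stream_options={"include_usage": True},
        max_tokens=10,
    )

    chunks = list(response)

    # Per the docs added in PATCH 5/6: every chunk before the last carries
    # usage=None, and the final chunk reports token usage for the whole
    # request while its choices list is empty.
    assert all(chunk.usage is None for chunk in chunks[:-1])
    print(chunks[-1].usage)  # prompt_tokens, completion_tokens, total_tokens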