Merge branch 'main' into litellm_remove_litellm_telemetry

This commit is contained in:
Ishaan Jaff 2024-03-26 11:35:02 -07:00 committed by GitHub
commit da503eab18
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
21 changed files with 740 additions and 2272 deletions

View file

@ -30,11 +30,11 @@ LiteLLM manages:
- Retry/fallback logic across multiple deployments (e.g. Azure/OpenAI) - [Router](https://docs.litellm.ai/docs/routing)
- Set Budgets & Rate limits per project, api key, model [OpenAI Proxy Server](https://docs.litellm.ai/docs/simple_proxy)
**Stable Release**: v`1.30.2` 👈 Recommended stable version of proxy.
[**Jump to OpenAI Proxy Docs**](https://github.com/BerriAI/litellm?tab=readme-ov-file#openai-proxy---docs) <br>
[**Jump to Supported LLM Providers**](https://github.com/BerriAI/litellm?tab=readme-ov-file#supported-provider-docs)
🚨 **Stable Release:** v1.34.1
Support for more providers. Missing a provider or LLM Platform, raise a [feature request](https://github.com/BerriAI/litellm/issues/new?assignees=&labels=enhancement&projects=&template=feature_request.yml&title=%5BFeature%5D%3A+).
# Usage ([**Docs**](https://docs.litellm.ai/docs/))

View file

@ -15,15 +15,16 @@ spec:
containers:
- name: litellm-container
image: ghcr.io/berriai/litellm:main-latest
imagePullPolicy: Always
env:
- name: AZURE_API_KEY
value: "d6f****"
- name: AZURE_API_BASE
value: "https://openai
value: "https://openai"
- name: LITELLM_MASTER_KEY
value: "sk-1234"
- name: DATABASE_URL
value: "postgresql://ishaan:*********
value: "postgresql://ishaan:*********"
args:
- "--config"
- "/app/proxy_config.yaml" # Update the path to mount the config file

View file

@ -88,6 +88,7 @@ spec:
containers:
- name: litellm-container
image: ghcr.io/berriai/litellm:main-latest
imagePullPolicy: Always
env:
- name: AZURE_API_KEY
value: "d6******"

View file

@ -318,6 +318,9 @@ class RedisCache(BaseCache):
def flush_cache(self):
    """Remove every entry from the backing Redis instance (sync FLUSHALL)."""
    client = self.redis_client
    client.flushall()
def flushall(self):
    """Alias matching the redis-py client API: delete all keys via FLUSHALL."""
    self.redis_client.flushall()
async def disconnect(self):
    """Tear down the async Redis connection pool, closing in-use connections too."""
    pool = self.async_redis_conn_pool
    await pool.disconnect(inuse_connections=True)

View file

@ -7742,6 +7742,44 @@ async def cache_ping():
)
@router.post(
    "/cache/flushall",
    tags=["caching"],
    dependencies=[Depends(user_api_key_auth)],
)
async def cache_flushall():
    """
    A function to flush all items from the cache. (All items will be deleted from the cache with this)

    Raises:
        HTTPException: 503 if the cache is not initialized or an unexpected
            error occurs; 500 if the cache type does not support flushing.

    Returns:
        dict: ``{"status": "success"}`` when the flush completes.

    Usage:
    ```
    curl -X POST http://0.0.0.0:4000/cache/flushall -H "Authorization: Bearer sk-1234"
    ```
    """
    try:
        if litellm.cache is None:
            raise HTTPException(
                status_code=503, detail="Cache not initialized. litellm.cache is None"
            )
        if litellm.cache.type == "redis":
            # RedisCache exposes flushall(), which maps to the Redis FLUSHALL command.
            litellm.cache.cache.flushall()
            return {
                "status": "success",
            }
        else:
            raise HTTPException(
                status_code=500,
                detail=f"Cache type {litellm.cache.type} does not support flushing",
            )
    except HTTPException:
        # BUGFIX: re-raise intentional HTTP errors unchanged. Previously the
        # blanket handler below swallowed them and re-wrapped everything as a
        # generic 503, losing the 503-uninitialized / 500-unsupported codes.
        raise
    except Exception as e:
        raise HTTPException(
            status_code=503,
            detail=f"Service Unhealthy ({str(e)})",
        )
@router.get("/", dependencies=[Depends(user_api_key_auth)])
async def home(request: Request):
    """Root endpoint; returns a static banner confirming the proxy is up."""
    return "LiteLLM: RUNNING"

File diff suppressed because it is too large Load diff

View file

@ -32,41 +32,6 @@ import logging
class Router:
"""
Example usage:
```python
from litellm import Router
model_list = [
{
"model_name": "azure-gpt-3.5-turbo", # model alias
"litellm_params": { # params for litellm completion/embedding call
"model": "azure/<your-deployment-name-1>",
"api_key": <your-api-key>,
"api_version": <your-api-version>,
"api_base": <your-api-base>
},
},
{
"model_name": "azure-gpt-3.5-turbo", # model alias
"litellm_params": { # params for litellm completion/embedding call
"model": "azure/<your-deployment-name-2>",
"api_key": <your-api-key>,
"api_version": <your-api-version>,
"api_base": <your-api-base>
},
},
{
"model_name": "openai-gpt-3.5-turbo", # model alias
"litellm_params": { # params for litellm completion/embedding call
"model": "gpt-3.5-turbo",
"api_key": <your-api-key>,
},
]
router = Router(model_list=model_list, fallbacks=[{"azure-gpt-3.5-turbo": "openai-gpt-3.5-turbo"}])
```
"""
model_names: List = []
cache_responses: Optional[bool] = False
default_cache_time_seconds: int = 1 * 60 * 60 # 1 hour
@ -142,6 +107,39 @@ class Router:
Returns:
Router: An instance of the litellm.Router class.
Example Usage:
```python
from litellm import Router
model_list = [
{
"model_name": "azure-gpt-3.5-turbo", # model alias
"litellm_params": { # params for litellm completion/embedding call
"model": "azure/<your-deployment-name-1>",
"api_key": <your-api-key>,
"api_version": <your-api-version>,
"api_base": <your-api-base>
},
},
{
"model_name": "azure-gpt-3.5-turbo", # model alias
"litellm_params": { # params for litellm completion/embedding call
"model": "azure/<your-deployment-name-2>",
"api_key": <your-api-key>,
"api_version": <your-api-version>,
"api_base": <your-api-base>
},
},
{
"model_name": "openai-gpt-3.5-turbo", # model alias
"litellm_params": { # params for litellm completion/embedding call
"model": "gpt-3.5-turbo",
"api_key": <your-api-key>,
},
]
router = Router(model_list=model_list, fallbacks=[{"azure-gpt-3.5-turbo": "openai-gpt-3.5-turbo"}])
```
"""
self.set_verbose = set_verbose
self.debug_level = debug_level
@ -286,8 +284,8 @@ class Router:
litellm.failure_callback.append(self.deployment_callback_on_failure)
else:
litellm.failure_callback = [self.deployment_callback_on_failure]
verbose_router_logger.debug(
f"Intialized router with Routing strategy: {self.routing_strategy}\n"
verbose_router_logger.info(
f"Intialized router with Routing strategy: {self.routing_strategy}\n\nRouting fallbacks: {self.fallbacks}\n\nRouting context window fallbacks: {self.context_window_fallbacks}"
)
def print_deployment(self, deployment: dict):
@ -1148,11 +1146,13 @@ class Router:
original_exception = e
fallback_model_group = None
try:
verbose_router_logger.debug(f"Trying to fallback b/w models")
if (
hasattr(e, "status_code") and e.status_code == 400
hasattr(e, "status_code")
and e.status_code == 400
and not isinstance(e, litellm.ContextWindowExceededError)
): # don't retry a malformed request
raise e
verbose_router_logger.debug(f"Trying to fallback b/w models")
if (
isinstance(e, litellm.ContextWindowExceededError)
and context_window_fallbacks is not None
@ -1346,6 +1346,13 @@ class Router:
original_exception = e
verbose_router_logger.debug(f"An exception occurs {original_exception}")
try:
if (
hasattr(e, "status_code")
and e.status_code == 400
and not isinstance(e, litellm.ContextWindowExceededError)
): # don't retry a malformed request
raise e
verbose_router_logger.debug(
f"Trying to fallback b/w models. Initial model group: {model_group}"
)

View file

@ -1,119 +1,6 @@
============================= test session starts ==============================
platform darwin -- Python 3.11.6, pytest-7.3.1, pluggy-1.3.0
rootdir: /Users/krrishdholakia/Documents/litellm/litellm/tests
plugins: timeout-2.2.0, asyncio-0.23.2, anyio-3.7.1, xdist-3.3.1
asyncio: mode=Mode.STRICT
collected 1 item
test_completion.py . [100%]
=============================== warnings summary ===============================
../../../../../../opt/homebrew/lib/python3.11/site-packages/pydantic/_internal/_config.py:271
../../../../../../opt/homebrew/lib/python3.11/site-packages/pydantic/_internal/_config.py:271
../../../../../../opt/homebrew/lib/python3.11/site-packages/pydantic/_internal/_config.py:271
../../../../../../opt/homebrew/lib/python3.11/site-packages/pydantic/_internal/_config.py:271
../../../../../../opt/homebrew/lib/python3.11/site-packages/pydantic/_internal/_config.py:271
../../../../../../opt/homebrew/lib/python3.11/site-packages/pydantic/_internal/_config.py:271
../../../../../../opt/homebrew/lib/python3.11/site-packages/pydantic/_internal/_config.py:271
../../../../../../opt/homebrew/lib/python3.11/site-packages/pydantic/_internal/_config.py:271
../../../../../../opt/homebrew/lib/python3.11/site-packages/pydantic/_internal/_config.py:271
/opt/homebrew/lib/python3.11/site-packages/pydantic/_internal/_config.py:271: PydanticDeprecatedSince20: Support for class-based `config` is deprecated, use ConfigDict instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
warnings.warn(DEPRECATION_MESSAGE, DeprecationWarning)
../proxy/_types.py:102
/Users/krrishdholakia/Documents/litellm/litellm/proxy/_types.py:102: PydanticDeprecatedSince20: `pydantic.config.Extra` is deprecated, use literal values instead (e.g. `extra='allow'`). Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
extra = Extra.allow # Allow extra fields
../proxy/_types.py:105
/Users/krrishdholakia/Documents/litellm/litellm/proxy/_types.py:105: PydanticDeprecatedSince20: Pydantic V1 style `@root_validator` validators are deprecated. You should migrate to Pydantic V2 style `@model_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
@root_validator(pre=True)
../proxy/_types.py:134
/Users/krrishdholakia/Documents/litellm/litellm/proxy/_types.py:134: PydanticDeprecatedSince20: Pydantic V1 style `@root_validator` validators are deprecated. You should migrate to Pydantic V2 style `@model_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
@root_validator(pre=True)
../proxy/_types.py:180
/Users/krrishdholakia/Documents/litellm/litellm/proxy/_types.py:180: PydanticDeprecatedSince20: Pydantic V1 style `@root_validator` validators are deprecated. You should migrate to Pydantic V2 style `@model_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
@root_validator(pre=True)
../proxy/_types.py:241
/Users/krrishdholakia/Documents/litellm/litellm/proxy/_types.py:241: PydanticDeprecatedSince20: Pydantic V1 style `@root_validator` validators are deprecated. You should migrate to Pydantic V2 style `@model_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
@root_validator(pre=True)
../proxy/_types.py:253
/Users/krrishdholakia/Documents/litellm/litellm/proxy/_types.py:253: PydanticDeprecatedSince20: Pydantic V1 style `@root_validator` validators are deprecated. You should migrate to Pydantic V2 style `@model_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
@root_validator(pre=True)
../proxy/_types.py:292
/Users/krrishdholakia/Documents/litellm/litellm/proxy/_types.py:292: PydanticDeprecatedSince20: Pydantic V1 style `@root_validator` validators are deprecated. You should migrate to Pydantic V2 style `@model_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
@root_validator(pre=True)
../proxy/_types.py:319
/Users/krrishdholakia/Documents/litellm/litellm/proxy/_types.py:319: PydanticDeprecatedSince20: Pydantic V1 style `@root_validator` validators are deprecated. You should migrate to Pydantic V2 style `@model_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
@root_validator(pre=True)
../proxy/_types.py:570
/Users/krrishdholakia/Documents/litellm/litellm/proxy/_types.py:570: PydanticDeprecatedSince20: Pydantic V1 style `@root_validator` validators are deprecated. You should migrate to Pydantic V2 style `@model_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
@root_validator(pre=True)
../proxy/_types.py:591
/Users/krrishdholakia/Documents/litellm/litellm/proxy/_types.py:591: PydanticDeprecatedSince20: Pydantic V1 style `@root_validator` validators are deprecated. You should migrate to Pydantic V2 style `@model_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
@root_validator(pre=True)
../utils.py:35
/Users/krrishdholakia/Documents/litellm/litellm/utils.py:35: DeprecationWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html
import pkg_resources
../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2871: 10 warnings
/opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2871: DeprecationWarning: Deprecated call to `pkg_resources.declare_namespace('google')`.
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
declare_namespace(pkg)
../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2871
../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2871
../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2871
../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2871
../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2871
/opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2871: DeprecationWarning: Deprecated call to `pkg_resources.declare_namespace('google.cloud')`.
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
declare_namespace(pkg)
../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2350
../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2350
../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2350
/opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2350: DeprecationWarning: Deprecated call to `pkg_resources.declare_namespace('google')`.
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
declare_namespace(parent)
../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2871
/opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2871: DeprecationWarning: Deprecated call to `pkg_resources.declare_namespace('google.logging')`.
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
declare_namespace(pkg)
../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2871
/opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2871: DeprecationWarning: Deprecated call to `pkg_resources.declare_namespace('google.iam')`.
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
declare_namespace(pkg)
../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2871
/opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2871: DeprecationWarning: Deprecated call to `pkg_resources.declare_namespace('mpl_toolkits')`.
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
declare_namespace(pkg)
../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2871
/opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2871: DeprecationWarning: Deprecated call to `pkg_resources.declare_namespace('sphinxcontrib')`.
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
declare_namespace(pkg)
../llms/prompt_templates/factory.py:6
/Users/krrishdholakia/Documents/litellm/litellm/llms/prompt_templates/factory.py:6: DeprecationWarning: 'imghdr' is deprecated and slated for removal in Python 3.13
import imghdr, base64
test_completion.py::test_completion_claude_3_stream
../utils.py:3249
../utils.py:3249
/Users/krrishdholakia/Documents/litellm/litellm/utils.py:3249: DeprecationWarning: open_text is deprecated. Use files() instead. Refer to https://importlib-resources.readthedocs.io/en/latest/using.html#migrating-from-legacy for migration advice.
with resources.open_text(
-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html
======================== 1 passed, 46 warnings in 3.14s ========================
<litellm.utils.CustomStreamWrapper object at 0x118bd82d0>
chunk: ModelResponse(id='chatcmpl-95b7d389-ff5a-4e09-a084-02584ba2cf1e', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content='In the United States of America, the Supreme Court has ultimate judicial authority, and it is the one that rules on legal disputes between the states, or on the interpretation of the', role='assistant', function_call=None, tool_calls=None), logprobs=None)], created=1711406570, model='ai21.j2-mid-v1', object='chat.completion.chunk', system_fingerprint=None, usage=Usage())
extracted chunk: In the United States of America, the Supreme Court has ultimate judicial authority, and it is the one that rules on legal disputes between the states, or on the interpretation of the
chunk: ModelResponse(id='chatcmpl-95b7d389-ff5a-4e09-a084-02584ba2cf1e', choices=[StreamingChoices(finish_reason='stop', index=0, delta=Delta(content=None, role=None, function_call=None, tool_calls=None), logprobs=None)], created=1711406570, model='ai21.j2-mid-v1', object='chat.completion.chunk', system_fingerprint=None, usage=Usage())
extracted chunk:
completion_response: In the United States of America, the Supreme Court has ultimate judicial authority, and it is the one that rules on legal disputes between the states, or on the interpretation of the

View file

@ -281,7 +281,8 @@ async def test_async_vertexai_streaming_response():
complete_response = ""
async for chunk in response:
print(f"chunk: {chunk}")
complete_response += chunk.choices[0].delta.content
if chunk.choices[0].delta.content is not None:
complete_response += chunk.choices[0].delta.content
print(f"complete_response: {complete_response}")
assert len(complete_response) > 0
except litellm.RateLimitError as e:

View file

@ -547,7 +547,7 @@ def test_redis_cache_completion_stream():
response_2_id = ""
for chunk in response2:
print(chunk)
response_2_id += chunk.id
response_2_id = chunk.id
assert (
response_1_id == response_2_id
), f"Response 1 != Response 2. Same params, Response 1{response_1_id} != Response 2{response_2_id}"

View file

@ -651,6 +651,7 @@ async def test_async_chat_vertex_ai_stream():
try:
load_vertex_ai_credentials()
customHandler = CompletionCustomHandler()
litellm.set_verbose = True
litellm.callbacks = [customHandler]
# test streaming
response = await litellm.acompletion(
@ -667,6 +668,7 @@ async def test_async_chat_vertex_ai_stream():
async for chunk in response:
print(f"chunk: {chunk}")
continue
await asyncio.sleep(10)
print(f"customHandler.states: {customHandler.states}")
assert (
customHandler.states.count("async_success") == 1

View file

@ -490,7 +490,7 @@ def test_redis_cache_completion_stream():
response_1_content += chunk.choices[0].delta.content or ""
print(response_1_content)
time.sleep(0.1) # sleep for 0.1 seconds allow set cache to occur
time.sleep(1) # sleep for 0.1 seconds allow set cache to occur
response2 = completion(
model="gpt-3.5-turbo",
messages=messages,
@ -505,8 +505,10 @@ def test_redis_cache_completion_stream():
response_2_id = chunk.id
print(chunk)
response_2_content += chunk.choices[0].delta.content or ""
print("\nresponse 1", response_1_content)
print("\nresponse 2", response_2_content)
print(
f"\nresponse 1: {response_1_content}",
)
print(f"\nresponse 2: {response_2_content}")
assert (
response_1_id == response_2_id
), f"Response 1 != Response 2. Same params, Response 1{response_1_content} != Response 2{response_2_content}"

View file

@ -298,6 +298,105 @@ def test_router_azure_acompletion():
# test_router_azure_acompletion()
def test_router_context_window_fallback():
    """
    - Give a gpt-3.5-turbo model group with different context windows (4k vs. 16k)
    - Send a 5k prompt
    - Assert it works
    """
    from large_text import text
    import os

    litellm.set_verbose = False
    print(f"len(text): {len(text)}")
    try:
        # Two deployments: a small-context Azure one and a large-context OpenAI one.
        deployments = [
            {
                "model_name": "gpt-3.5-turbo",  # openai model name
                "litellm_params": {  # params for litellm completion/embedding call
                    "model": "azure/chatgpt-v-2",
                    "api_key": os.getenv("AZURE_API_KEY"),
                    "api_version": os.getenv("AZURE_API_VERSION"),
                    "api_base": os.getenv("AZURE_API_BASE"),
                    "base_model": "azure/gpt-35-turbo",
                },
            },
            {
                "model_name": "gpt-3.5-turbo-large",  # openai model name
                "litellm_params": {  # params for litellm completion/embedding call
                    "model": "gpt-3.5-turbo-1106",
                    "api_key": os.getenv("OPENAI_API_KEY"),
                },
            },
        ]
        router = Router(model_list=deployments, set_verbose=True, context_window_fallbacks=[{"gpt-3.5-turbo": ["gpt-3.5-turbo-large"]}], num_retries=0)  # type: ignore
        prompt = [
            {"role": "system", "content": text},
            {"role": "user", "content": "Who was Alexander?"},
        ]
        response = router.completion(model="gpt-3.5-turbo", messages=prompt)
        print(f"response: {response}")
        # The oversize prompt must have been routed to the large-context deployment.
        assert response.model == "gpt-3.5-turbo-1106"
    except Exception as e:
        pytest.fail(f"Got unexpected exception on router! - {str(e)}")
@pytest.mark.asyncio
async def test_async_router_context_window_fallback():
    """
    - Give a gpt-3.5-turbo model group with different context windows (4k vs. 16k)
    - Send a 5k prompt
    - Assert it works
    """
    from large_text import text
    import os

    litellm.set_verbose = False
    print(f"len(text): {len(text)}")
    try:
        # Same two-deployment setup as the sync variant, exercised via acompletion.
        deployments = [
            {
                "model_name": "gpt-3.5-turbo",  # openai model name
                "litellm_params": {  # params for litellm completion/embedding call
                    "model": "azure/chatgpt-v-2",
                    "api_key": os.getenv("AZURE_API_KEY"),
                    "api_version": os.getenv("AZURE_API_VERSION"),
                    "api_base": os.getenv("AZURE_API_BASE"),
                    "base_model": "azure/gpt-35-turbo",
                },
            },
            {
                "model_name": "gpt-3.5-turbo-large",  # openai model name
                "litellm_params": {  # params for litellm completion/embedding call
                    "model": "gpt-3.5-turbo-1106",
                    "api_key": os.getenv("OPENAI_API_KEY"),
                },
            },
        ]
        router = Router(model_list=deployments, set_verbose=True, context_window_fallbacks=[{"gpt-3.5-turbo": ["gpt-3.5-turbo-large"]}], num_retries=0)  # type: ignore
        prompt = [
            {"role": "system", "content": text},
            {"role": "user", "content": "Who was Alexander?"},
        ]
        response = await router.acompletion(model="gpt-3.5-turbo", messages=prompt)
        print(f"response: {response}")
        # The oversize prompt must have been routed to the large-context deployment.
        assert response.model == "gpt-3.5-turbo-1106"
    except Exception as e:
        pytest.fail(f"Got unexpected exception on router! - {str(e)}")
def test_router_context_window_check():
"""
- Give a gpt-3.5-turbo model group with different context windows (4k vs. 16k)

View file

@ -81,6 +81,7 @@ def test_async_fallbacks(caplog):
# Define the expected log messages
# - error request, falling back notice, success notice
expected_logs = [
"Intialized router with Routing strategy: simple-shuffle\n\nRouting fallbacks: [{'gpt-3.5-turbo': ['azure/gpt-3.5-turbo']}]\n\nRouting context window fallbacks: None",
"litellm.acompletion(model=gpt-3.5-turbo)\x1b[31m Exception OpenAIException - Error code: 401 - {'error': {'message': 'Incorrect API key provided: bad-key. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}\x1b[0m",
"Falling back to model_group = azure/gpt-3.5-turbo",
"litellm.acompletion(model=azure/chatgpt-v-2)\x1b[32m 200 OK\x1b[0m",

View file

@ -108,8 +108,19 @@ last_openai_chunk_example = {
"choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}],
}
"""
Final chunk (sdk):
chunk: ChatCompletionChunk(id='chatcmpl-96mM3oNBlxh2FDWVLKsgaFBBcULmI',
choices=[Choice(delta=ChoiceDelta(content=None, function_call=None, role=None,
tool_calls=None), finish_reason='stop', index=0, logprobs=None)],
created=1711402871, model='gpt-3.5-turbo-0125', object='chat.completion.chunk', system_fingerprint='fp_3bc1b5746c')
"""
def validate_last_format(chunk):
"""
Ensure last chunk has no remaining content or tools
"""
assert isinstance(chunk, ModelResponse), "Chunk should be a dictionary."
assert isinstance(chunk["id"], str), "'id' should be a string."
assert isinstance(chunk["object"], str), "'object' should be a string."
@ -119,6 +130,10 @@ def validate_last_format(chunk):
for choice in chunk["choices"]:
assert isinstance(choice["index"], int), "'index' should be an integer."
assert choice["delta"]["content"] is None
assert choice["delta"]["function_call"] is None
assert choice["delta"]["role"] is None
assert choice["delta"]["tool_calls"] is None
assert isinstance(
choice["finish_reason"], str
), "'finish_reason' should be a string."
@ -493,13 +508,15 @@ def test_completion_mistral_api_stream():
stream=True,
)
complete_response = ""
has_finish_reason = False
for idx, chunk in enumerate(response):
print(chunk)
# print(chunk.choices[0].delta)
chunk, finished = streaming_format_tests(idx, chunk)
if finished:
has_finish_reason = True
break
complete_response += chunk
if has_finish_reason == False:
raise Exception("finish reason not set")
if complete_response.strip() == "":
raise Exception("Empty response received")
print(f"completion_response: {complete_response}")
@ -534,11 +551,15 @@ def test_completion_deep_infra_stream():
complete_response = ""
# Add any assertions here to check the response
has_finish_reason = False
for idx, chunk in enumerate(response):
chunk, finished = streaming_format_tests(idx, chunk)
if finished:
has_finish_reason = True
break
complete_response += chunk
if has_finish_reason == False:
raise Exception("finish reason not set")
if complete_response.strip() == "":
raise Exception("Empty response received")
print(f"completion_response: {complete_response}")
@ -608,11 +629,15 @@ def test_completion_claude_stream_bad_key():
)
complete_response = ""
# Add any assertions here to check the response
has_finish_reason = False
for idx, chunk in enumerate(response):
chunk, finished = streaming_format_tests(idx, chunk)
if finished:
has_finish_reason = True
break
complete_response += chunk
if has_finish_reason == False:
raise Exception("finish reason not set")
if complete_response.strip() == "":
raise Exception("Empty response received")
print(f"1234completion_response: {complete_response}")
@ -626,6 +651,45 @@ def test_completion_claude_stream_bad_key():
# test_completion_claude_stream_bad_key()
# test_completion_replicate_stream()
def test_vertex_ai_stream():
    """Stream a completion from each Vertex AI test model and verify that a
    non-empty response is produced and a finish reason is eventually seen."""
    from litellm.tests.test_amazing_vertex_completion import load_vertex_ai_credentials

    load_vertex_ai_credentials()
    litellm.set_verbose = True
    litellm.vertex_project = "reliablekeys"
    import random

    test_models = ["gemini-1.0-pro"]
    for model in test_models:
        try:
            print("making request", model)
            response = completion(
                model=model,
                messages=[
                    {"role": "user", "content": "write 10 line code code for saying hi"}
                ],
                stream=True,
            )
            accumulated = ""
            saw_finish = False
            for idx, chunk in enumerate(response):
                print(f"chunk in response: {chunk}")
                chunk, finished = streaming_format_tests(idx, chunk)
                if finished:
                    saw_finish = True
                    break
                accumulated += chunk
            if accumulated.strip() == "":
                raise Exception("Empty response received")
            print(f"completion_response: {accumulated}")
            assert saw_finish == True
        except litellm.RateLimitError as e:
            # Rate limits are environmental, not a test failure.
            pass
        except Exception as e:
            pytest.fail(f"Error occurred: {e}")
# def test_completion_vertexai_stream():
# try:
# import os
@ -742,11 +806,15 @@ def test_bedrock_claude_3_streaming():
)
complete_response = ""
# Add any assertions here to check the response
has_finish_reason = False
for idx, chunk in enumerate(response):
chunk, finished = streaming_format_tests(idx, chunk)
if finished:
has_finish_reason = True
break
complete_response += chunk
if has_finish_reason == False:
raise Exception("finish reason not set")
if complete_response.strip() == "":
raise Exception("Empty response received")
print(f"completion_response: {complete_response}")
@ -1705,7 +1773,7 @@ def test_success_callback_streaming():
messages = [{"role": "user", "content": "hello"}]
print("TESTING LITELLM COMPLETION CALL")
response = litellm.completion(
model="j2-light",
model="gpt-3.5-turbo",
messages=messages,
stream=True,
max_tokens=5,
@ -2028,7 +2096,7 @@ async def test_azure_astreaming_and_function_calling():
chunk.choices[0].delta.tool_calls[0].function.arguments, str
)
validate_first_streaming_function_calling_chunk(chunk=chunk)
elif idx == 1:
elif idx == 1 and chunk.choices[0].finish_reason is None:
validate_second_streaming_function_calling_chunk(chunk=chunk)
elif chunk.choices[0].finish_reason is not None: # last chunk
validate_final_streaming_function_calling_chunk(chunk=chunk)
@ -2072,7 +2140,7 @@ def test_completion_claude_3_function_call_with_streaming():
)
idx = 0
for chunk in response:
# print(f"chunk: {chunk}")
print(f"chunk in response: {chunk}")
if idx == 0:
assert (
chunk.choices[0].delta.tool_calls[0].function.arguments is not None
@ -2081,7 +2149,7 @@ def test_completion_claude_3_function_call_with_streaming():
chunk.choices[0].delta.tool_calls[0].function.arguments, str
)
validate_first_streaming_function_calling_chunk(chunk=chunk)
elif idx == 1:
elif idx == 1 and chunk.choices[0].finish_reason is None:
validate_second_streaming_function_calling_chunk(chunk=chunk)
elif chunk.choices[0].finish_reason is not None: # last chunk
validate_final_streaming_function_calling_chunk(chunk=chunk)
@ -2136,7 +2204,7 @@ async def test_acompletion_claude_3_function_call_with_streaming():
chunk.choices[0].delta.tool_calls[0].function.arguments, str
)
validate_first_streaming_function_calling_chunk(chunk=chunk)
elif idx == 1:
elif idx == 1 and chunk.choices[0].finish_reason is None:
validate_second_streaming_function_calling_chunk(chunk=chunk)
elif chunk.choices[0].finish_reason is not None: # last chunk
validate_final_streaming_function_calling_chunk(chunk=chunk)
@ -2144,3 +2212,71 @@ async def test_acompletion_claude_3_function_call_with_streaming():
# raise Exception("it worked!")
except Exception as e:
pytest.fail(f"Error occurred: {e}")
class ModelResponseIterator:
    """Adapt a single ModelResponse into a one-shot stream.

    Supports both the sync and async iterator protocols, yielding the wrapped
    response exactly once and then signalling exhaustion.
    """

    def __init__(self, model_response):
        self.model_response = model_response
        self.is_done = False

    # --- sync iteration -------------------------------------------------
    def __iter__(self):
        return self

    def __next__(self):
        if not self.is_done:
            self.is_done = True
            return self.model_response
        raise StopIteration

    # --- async iteration ------------------------------------------------
    def __aiter__(self):
        return self

    async def __anext__(self):
        if not self.is_done:
            self.is_done = True
            return self.model_response
        raise StopAsyncIteration
def test_unit_test_custom_stream_wrapper():
    """
    Feed a single cached chunk (content + finish_reason together) through
    CustomStreamWrapper and assert the content is emitted exactly once,
    i.e. the final-chunk handling does not repeat the message.
    """
    litellm.set_verbose = False
    raw_chunk = {
        "id": "chatcmpl-123",
        "object": "chat.completion.chunk",
        "created": 1694268190,
        "model": "gpt-3.5-turbo-0125",
        "system_fingerprint": "fp_44709d6fcb",
        "choices": [
            {"index": 0, "delta": {"content": "How are you?"}, "finish_reason": "stop"}
        ],
    }
    single_chunk = litellm.ModelResponse(**raw_chunk, stream=True)
    completion_stream = ModelResponseIterator(model_response=single_chunk)

    response = litellm.CustomStreamWrapper(
        completion_stream=completion_stream,
        model="gpt-3.5-turbo",
        custom_llm_provider="cached_response",
        logging_obj=litellm.Logging(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": "Hey"}],
            stream=True,
            call_type="completion",
            start_time=time.time(),
            litellm_call_id="12345",
            function_id="1245",
        ),
    )

    # Count how many streamed deltas contain the message; must be exactly one.
    occurrences = 0
    for streamed_chunk in response:
        delta_content = streamed_chunk.choices[0].delta.content
        if delta_content is not None and "How are you?" in delta_content:
            occurrences += 1
    assert occurrences == 1

View file

@ -2907,6 +2907,7 @@ def test_async_text_completion_stream():
async def test_get_response():
try:
litellm.set_verbose = True
response = await litellm.atext_completion(
model="gpt-3.5-turbo-instruct",
prompt="good morning",
@ -2930,7 +2931,7 @@ def test_async_text_completion_stream():
asyncio.run(test_get_response())
test_async_text_completion_stream()
# test_async_text_completion_stream()
@pytest.mark.asyncio

View file

@ -354,7 +354,10 @@ class Choices(OpenAIObject):
if message is None:
self.message = Message(content=None)
else:
self.message = message
if isinstance(message, Message):
self.message = message
elif isinstance(message, dict):
self.message = Message(**message)
if logprobs is not None:
self.logprobs = logprobs
if enhancements is not None:
@ -422,8 +425,11 @@ class StreamingChoices(OpenAIObject):
else:
self.finish_reason = None
self.index = index
if delta:
self.delta = delta
if delta is not None:
if isinstance(delta, Delta):
self.delta = delta
if isinstance(delta, dict):
self.delta = Delta(**delta)
else:
self.delta = Delta()
if enhancements is not None:
@ -491,13 +497,27 @@ class ModelResponse(OpenAIObject):
):
if stream is not None and stream == True:
object = "chat.completion.chunk"
choices = [StreamingChoices()]
if choices is not None and isinstance(choices, list):
new_choices = []
for choice in choices:
_new_choice = StreamingChoices(**choice)
new_choices.append(_new_choice)
choices = new_choices
else:
choices = [StreamingChoices()]
else:
if model in litellm.open_ai_embedding_models:
object = "embedding"
else:
object = "chat.completion"
choices = [Choices()]
if choices is not None and isinstance(choices, list):
new_choices = []
for choice in choices:
_new_choice = Choices(**choice)
new_choices.append(_new_choice)
choices = new_choices
else:
choices = [Choices()]
if id is None:
id = _generate_id()
else:
@ -1774,16 +1794,14 @@ class Logging:
end_time=end_time,
)
except Exception as e:
verbose_logger.debug(
print_verbose(
f"Error occurred building stream chunk: {traceback.format_exc()}"
)
complete_streaming_response = None
else:
self.streaming_chunks.append(result)
if complete_streaming_response is not None:
verbose_logger.debug(
"Async success callbacks: Got a complete streaming response"
)
print_verbose("Async success callbacks: Got a complete streaming response")
self.model_call_details["async_complete_streaming_response"] = (
complete_streaming_response
)
@ -1824,7 +1842,7 @@ class Logging:
callbacks.append(callback)
else:
callbacks = litellm._async_success_callback
verbose_logger.debug(f"Async success callbacks: {callbacks}")
print_verbose(f"Async success callbacks: {callbacks}")
for callback in callbacks:
# check if callback can run for this request
litellm_params = self.model_call_details.get("litellm_params", {})
@ -1894,10 +1912,6 @@ class Logging:
end_time=end_time,
)
if callable(callback): # custom logger functions
# print_verbose(
# f"Making async function logging call for {callback}, result={result} - {self.model_call_details}",
# logger_only=True,
# )
if self.stream:
if (
"async_complete_streaming_response"
@ -2814,21 +2828,19 @@ def client(original_function):
)
# if caching is false, don't run this
final_embedding_cached_response = None
cache_controls = kwargs.get("cache", None)
# Check if user has opted out of caching
_opted_out_with_cache_controls = (
cache_controls and cache_controls.get("no-cache", False) == True
)
_opted_out_with_caching_param = kwargs.get("caching", True) == False
# cache is not None and user has not opted out
if (
litellm.cache is not None
and (not _opted_out_with_cache_controls)
and (not _opted_out_with_caching_param)
):
# allow users to control returning cached responses from the completion function
(
kwargs.get("caching", None) is None
and kwargs.get("cache", None) is None
and litellm.cache is not None
)
or kwargs.get("caching", False) == True
or (
kwargs.get("cache", None) is not None
and kwargs.get("cache").get("no-cache", False) != True
)
): # allow users to control returning cached responses from the completion function
# checking cache
print_verbose(f"INSIDE CHECKING CACHE")
if (
@ -8432,6 +8444,8 @@ class CustomStreamWrapper:
self.completion_stream = completion_stream
self.sent_first_chunk = False
self.sent_last_chunk = False
self.system_fingerprint: Optional[str] = None
self.received_finish_reason: Optional[str] = None
self.special_tokens = ["<|assistant|>", "<|system|>", "<|user|>", "<s>", "</s>"]
self.holding_chunk = ""
self.complete_response = ""
@ -8872,7 +8886,6 @@ class CustomStreamWrapper:
if data_json["choices"][0].get("finish_reason", None):
is_finished = True
finish_reason = data_json["choices"][0]["finish_reason"]
self.sent_last_chunk = True
print_verbose(
f"text: {text}; is_finished: {is_finished}; finish_reason: {finish_reason}"
)
@ -9105,16 +9118,32 @@ class CustomStreamWrapper:
"finish_reason": finish_reason,
}
def chunk_creator(self, chunk):
def model_response_creator(self):
model_response = ModelResponse(stream=True, model=self.model)
if self.response_id is not None:
model_response.id = self.response_id
else:
self.response_id = model_response.id
if self.system_fingerprint is not None:
model_response.system_fingerprint = self.system_fingerprint
model_response._hidden_params["custom_llm_provider"] = self.custom_llm_provider
model_response._hidden_params["created_at"] = time.time()
model_response.choices = [StreamingChoices()]
model_response.choices[0].finish_reason = None
return model_response
def is_delta_empty(self, delta: Delta) -> bool:
is_empty = True
if delta.content is not None:
is_empty = False
elif delta.tool_calls is not None:
is_empty = False
elif delta.function_call is not None:
is_empty = False
return is_empty
def chunk_creator(self, chunk):
model_response = self.model_response_creator()
response_obj = {}
try:
# return this for all models
@ -9123,30 +9152,22 @@ class CustomStreamWrapper:
response_obj = self.handle_anthropic_chunk(chunk)
completion_obj["content"] = response_obj["text"]
if response_obj["is_finished"]:
model_response.choices[0].finish_reason = response_obj[
"finish_reason"
]
self.received_finish_reason = response_obj["finish_reason"]
elif self.model == "replicate" or self.custom_llm_provider == "replicate":
response_obj = self.handle_replicate_chunk(chunk)
completion_obj["content"] = response_obj["text"]
if response_obj["is_finished"]:
model_response.choices[0].finish_reason = response_obj[
"finish_reason"
]
self.received_finish_reason = response_obj["finish_reason"]
elif self.custom_llm_provider and self.custom_llm_provider == "together_ai":
response_obj = self.handle_together_ai_chunk(chunk)
completion_obj["content"] = response_obj["text"]
if response_obj["is_finished"]:
model_response.choices[0].finish_reason = response_obj[
"finish_reason"
]
self.received_finish_reason = response_obj["finish_reason"]
elif self.custom_llm_provider and self.custom_llm_provider == "huggingface":
response_obj = self.handle_huggingface_chunk(chunk)
completion_obj["content"] = response_obj["text"]
if response_obj["is_finished"]:
model_response.choices[0].finish_reason = response_obj[
"finish_reason"
]
self.received_finish_reason = response_obj["finish_reason"]
elif (
self.custom_llm_provider and self.custom_llm_provider == "baseten"
): # baseten doesn't provide streaming
@ -9157,16 +9178,12 @@ class CustomStreamWrapper:
response_obj = self.handle_ai21_chunk(chunk)
completion_obj["content"] = response_obj["text"]
if response_obj["is_finished"]:
model_response.choices[0].finish_reason = response_obj[
"finish_reason"
]
self.received_finish_reason = response_obj["finish_reason"]
elif self.custom_llm_provider and self.custom_llm_provider == "maritalk":
response_obj = self.handle_maritalk_chunk(chunk)
completion_obj["content"] = response_obj["text"]
if response_obj["is_finished"]:
model_response.choices[0].finish_reason = response_obj[
"finish_reason"
]
self.received_finish_reason = response_obj["finish_reason"]
elif self.custom_llm_provider and self.custom_llm_provider == "vllm":
completion_obj["content"] = chunk[0].outputs[0].text
elif (
@ -9175,152 +9192,116 @@ class CustomStreamWrapper:
response_obj = self.handle_aleph_alpha_chunk(chunk)
completion_obj["content"] = response_obj["text"]
if response_obj["is_finished"]:
model_response.choices[0].finish_reason = response_obj[
"finish_reason"
]
self.received_finish_reason = response_obj["finish_reason"]
elif self.custom_llm_provider == "nlp_cloud":
try:
response_obj = self.handle_nlp_cloud_chunk(chunk)
completion_obj["content"] = response_obj["text"]
if response_obj["is_finished"]:
model_response.choices[0].finish_reason = response_obj[
"finish_reason"
]
self.received_finish_reason = response_obj["finish_reason"]
except Exception as e:
if self.sent_last_chunk:
if self.received_finish_reason:
raise e
else:
if self.sent_first_chunk is False:
raise Exception("An unknown error occurred with the stream")
model_response.choices[0].finish_reason = "stop"
self.sent_last_chunk = True
self.received_finish_reason = "stop"
elif self.custom_llm_provider == "gemini":
try:
if hasattr(chunk, "parts") == True:
try:
if len(chunk.parts) > 0:
completion_obj["content"] = chunk.parts[0].text
if hasattr(chunk.parts[0], "finish_reason"):
model_response.choices[0].finish_reason = (
map_finish_reason(chunk.parts[0].finish_reason.name)
)
except:
if chunk.parts[0].finish_reason.name == "SAFETY":
raise Exception(
f"The response was blocked by VertexAI. {str(chunk)}"
)
else:
completion_obj["content"] = str(chunk)
except StopIteration as e:
if self.sent_last_chunk:
raise e
else:
model_response.choices[0].finish_reason = "stop"
self.sent_last_chunk = True
if hasattr(chunk, "parts") == True:
try:
if len(chunk.parts) > 0:
completion_obj["content"] = chunk.parts[0].text
if hasattr(chunk.parts[0], "finish_reason"):
self.received_finish_reason = chunk.parts[
0
].finish_reason.name
except:
if chunk.parts[0].finish_reason.name == "SAFETY":
raise Exception(
f"The response was blocked by VertexAI. {str(chunk)}"
)
else:
completion_obj["content"] = str(chunk)
elif self.custom_llm_provider and (self.custom_llm_provider == "vertex_ai"):
try:
if hasattr(chunk, "candidates") == True:
if hasattr(chunk, "candidates") == True:
try:
try:
try:
completion_obj["content"] = chunk.text
except Exception as e:
if "Part has no text." in str(e):
## check for function calling
function_call = (
chunk.candidates[0]
.content.parts[0]
.function_call
)
args_dict = {}
for k, v in function_call.args.items():
args_dict[k] = v
args_str = json.dumps(args_dict)
_delta_obj = litellm.utils.Delta(
content=None,
tool_calls=[
{
"id": f"call_{str(uuid.uuid4())}",
"function": {
"arguments": args_str,
"name": function_call.name,
},
"type": "function",
}
],
)
_streaming_response = StreamingChoices(
delta=_delta_obj
)
_model_response = ModelResponse(stream=True)
_model_response.choices = [_streaming_response]
response_obj = {"original_chunk": _model_response}
else:
raise e
if (
hasattr(chunk.candidates[0], "finish_reason")
and chunk.candidates[0].finish_reason.name
!= "FINISH_REASON_UNSPECIFIED"
): # every non-final chunk in vertex ai has this
model_response.choices[0].finish_reason = (
map_finish_reason(
chunk.candidates[0].finish_reason.name
)
)
completion_obj["content"] = chunk.text
except Exception as e:
if chunk.candidates[0].finish_reason.name == "SAFETY":
raise Exception(
f"The response was blocked by VertexAI. {str(chunk)}"
if "Part has no text." in str(e):
## check for function calling
function_call = (
chunk.candidates[0].content.parts[0].function_call
)
else:
completion_obj["content"] = str(chunk)
except StopIteration as e:
if self.sent_last_chunk:
raise e
else:
model_response.choices[0].finish_reason = "stop"
self.sent_last_chunk = True
args_dict = {}
for k, v in function_call.args.items():
args_dict[k] = v
args_str = json.dumps(args_dict)
_delta_obj = litellm.utils.Delta(
content=None,
tool_calls=[
{
"id": f"call_{str(uuid.uuid4())}",
"function": {
"arguments": args_str,
"name": function_call.name,
},
"type": "function",
}
],
)
_streaming_response = StreamingChoices(delta=_delta_obj)
_model_response = ModelResponse(stream=True)
_model_response.choices = [_streaming_response]
response_obj = {"original_chunk": _model_response}
else:
raise e
if (
hasattr(chunk.candidates[0], "finish_reason")
and chunk.candidates[0].finish_reason.name
!= "FINISH_REASON_UNSPECIFIED"
): # every non-final chunk in vertex ai has this
self.received_finish_reason = chunk.candidates[
0
].finish_reason.name
except Exception as e:
if chunk.candidates[0].finish_reason.name == "SAFETY":
raise Exception(
f"The response was blocked by VertexAI. {str(chunk)}"
)
else:
completion_obj["content"] = str(chunk)
elif self.custom_llm_provider == "cohere":
response_obj = self.handle_cohere_chunk(chunk)
completion_obj["content"] = response_obj["text"]
if response_obj["is_finished"]:
model_response.choices[0].finish_reason = response_obj[
"finish_reason"
]
self.received_finish_reason = response_obj["finish_reason"]
elif self.custom_llm_provider == "cohere_chat":
response_obj = self.handle_cohere_chat_chunk(chunk)
if response_obj is None:
return
completion_obj["content"] = response_obj["text"]
if response_obj["is_finished"]:
model_response.choices[0].finish_reason = response_obj[
"finish_reason"
]
self.received_finish_reason = response_obj["finish_reason"]
elif self.custom_llm_provider == "bedrock":
if self.sent_last_chunk:
if self.received_finish_reason is not None:
raise StopIteration
response_obj = self.handle_bedrock_stream(chunk)
completion_obj["content"] = response_obj["text"]
if response_obj["is_finished"]:
model_response.choices[0].finish_reason = response_obj[
"finish_reason"
]
self.sent_last_chunk = True
self.received_finish_reason = response_obj["finish_reason"]
elif self.custom_llm_provider == "sagemaker":
verbose_logger.debug(f"ENTERS SAGEMAKER STREAMING for chunk {chunk}")
print_verbose(f"ENTERS SAGEMAKER STREAMING for chunk {chunk}")
response_obj = self.handle_sagemaker_stream(chunk)
completion_obj["content"] = response_obj["text"]
if response_obj["is_finished"]:
model_response.choices[0].finish_reason = response_obj[
"finish_reason"
]
self.sent_last_chunk = True
self.received_finish_reason = response_obj["finish_reason"]
elif self.custom_llm_provider == "petals":
if len(self.completion_stream) == 0:
if self.sent_last_chunk:
if self.received_finish_reason is not None:
raise StopIteration
else:
model_response.choices[0].finish_reason = "stop"
self.sent_last_chunk = True
self.received_finish_reason = "stop"
chunk_size = 30
new_chunk = self.completion_stream[:chunk_size]
completion_obj["content"] = new_chunk
@ -9330,11 +9311,10 @@ class CustomStreamWrapper:
# fake streaming
response_obj = {}
if len(self.completion_stream) == 0:
if self.sent_last_chunk:
if self.received_finish_reason is not None:
raise StopIteration
else:
model_response.choices[0].finish_reason = "stop"
self.sent_last_chunk = True
self.received_finish_reason = "stop"
chunk_size = 30
new_chunk = self.completion_stream[:chunk_size]
completion_obj["content"] = new_chunk
@ -9345,41 +9325,31 @@ class CustomStreamWrapper:
completion_obj["content"] = response_obj["text"]
print_verbose(f"completion obj content: {completion_obj['content']}")
if response_obj["is_finished"]:
model_response.choices[0].finish_reason = response_obj[
"finish_reason"
]
self.received_finish_reason = response_obj["finish_reason"]
elif self.custom_llm_provider == "ollama_chat":
response_obj = self.handle_ollama_chat_stream(chunk)
completion_obj["content"] = response_obj["text"]
print_verbose(f"completion obj content: {completion_obj['content']}")
if response_obj["is_finished"]:
model_response.choices[0].finish_reason = response_obj[
"finish_reason"
]
self.received_finish_reason = response_obj["finish_reason"]
elif self.custom_llm_provider == "cloudflare":
response_obj = self.handle_cloudlfare_stream(chunk)
completion_obj["content"] = response_obj["text"]
print_verbose(f"completion obj content: {completion_obj['content']}")
if response_obj["is_finished"]:
model_response.choices[0].finish_reason = response_obj[
"finish_reason"
]
self.received_finish_reason = response_obj["finish_reason"]
elif self.custom_llm_provider == "text-completion-openai":
response_obj = self.handle_openai_text_completion_chunk(chunk)
completion_obj["content"] = response_obj["text"]
print_verbose(f"completion obj content: {completion_obj['content']}")
if response_obj["is_finished"]:
model_response.choices[0].finish_reason = response_obj[
"finish_reason"
]
self.received_finish_reason = response_obj["finish_reason"]
elif self.custom_llm_provider == "azure_text":
response_obj = self.handle_azure_text_completion_chunk(chunk)
completion_obj["content"] = response_obj["text"]
print_verbose(f"completion obj content: {completion_obj['content']}")
if response_obj["is_finished"]:
model_response.choices[0].finish_reason = response_obj[
"finish_reason"
]
self.received_finish_reason = response_obj["finish_reason"]
elif self.custom_llm_provider == "cached_response":
response_obj = {
"text": chunk.choices[0].delta.content,
@ -9392,10 +9362,11 @@ class CustomStreamWrapper:
print_verbose(f"completion obj content: {completion_obj['content']}")
if hasattr(chunk, "id"):
model_response.id = chunk.id
self.response_id = chunk.id
if hasattr(chunk, "system_fingerprint"):
self.system_fingerprint = chunk.system_fingerprint
if response_obj["is_finished"]:
model_response.choices[0].finish_reason = response_obj[
"finish_reason"
]
self.received_finish_reason = response_obj["finish_reason"]
else: # openai / azure chat model
if self.custom_llm_provider == "azure":
if hasattr(chunk, "model"):
@ -9411,21 +9382,24 @@ class CustomStreamWrapper:
raise Exception(
"Mistral API raised a streaming error - finish_reason: error, no content string given."
)
model_response.choices[0].finish_reason = response_obj[
"finish_reason"
]
self.received_finish_reason = response_obj["finish_reason"]
if response_obj.get("original_chunk", None) is not None:
model_response.system_fingerprint = getattr(
response_obj["original_chunk"], "system_fingerprint", None
)
if hasattr(response_obj["original_chunk"], "id"):
model_response.id = response_obj["original_chunk"].id
self.response_id = model_response.id
if hasattr(response_obj["original_chunk"], "system_fingerprint"):
model_response.system_fingerprint = response_obj[
"original_chunk"
].system_fingerprint
self.system_fingerprint = response_obj[
"original_chunk"
].system_fingerprint
if response_obj["logprobs"] is not None:
model_response.choices[0].logprobs = response_obj["logprobs"]
model_response.model = self.model
print_verbose(
f"model_response finish reason 3: {model_response.choices[0].finish_reason}; response_obj={response_obj}"
f"model_response finish reason 3: {self.received_finish_reason}; response_obj={response_obj}"
)
## FUNCTION CALL PARSING
if (
@ -9435,6 +9409,7 @@ class CustomStreamWrapper:
# enter this branch when no content has been passed in response
original_chunk = response_obj.get("original_chunk", None)
model_response.id = original_chunk.id
self.response_id = original_chunk.id
if len(original_chunk.choices) > 0:
if (
original_chunk.choices[0].delta.function_call is not None
@ -9516,6 +9491,7 @@ class CustomStreamWrapper:
original_chunk = response_obj.get("original_chunk", None)
if original_chunk:
model_response.id = original_chunk.id
self.response_id = original_chunk.id
if len(original_chunk.choices) > 0:
try:
delta = dict(original_chunk.choices[0].delta)
@ -9554,7 +9530,7 @@ class CustomStreamWrapper:
return model_response
else:
return
elif model_response.choices[0].finish_reason is not None:
elif self.received_finish_reason is not None:
# flush any remaining holding chunk
if len(self.holding_chunk) > 0:
if model_response.choices[0].delta.content is None:
@ -9564,10 +9540,17 @@ class CustomStreamWrapper:
self.holding_chunk + model_response.choices[0].delta.content
)
self.holding_chunk = ""
# get any function call arguments
model_response.choices[0].finish_reason = map_finish_reason(
model_response.choices[0].finish_reason
) # ensure consistent output to openai
# if delta is None
is_delta_empty = self.is_delta_empty(
delta=model_response.choices[0].delta
)
if is_delta_empty:
# get any function call arguments
model_response.choices[0].finish_reason = map_finish_reason(
finish_reason=self.received_finish_reason
) # ensure consistent output to openai
self.sent_last_chunk = True
return model_response
elif (
model_response.choices[0].delta.tool_calls is not None
@ -9627,6 +9610,16 @@ class CustomStreamWrapper:
## SYNC LOGGING
self.logging_obj.success_handler(processed_chunk)
def finish_reason_handler(self):
model_response = self.model_response_creator()
if self.received_finish_reason is not None:
model_response.choices[0].finish_reason = map_finish_reason(
finish_reason=self.received_finish_reason
)
else:
model_response.choices[0].finish_reason = "stop"
return model_response
## needs to handle the empty string case (even starting chunk can be an empty string)
def __next__(self):
try:
@ -9661,7 +9654,16 @@ class CustomStreamWrapper:
# RETURN RESULT
return response
except StopIteration:
raise # Re-raise StopIteration
if self.sent_last_chunk == True:
raise # Re-raise StopIteration
else:
self.sent_last_chunk = True
processed_chunk = self.finish_reason_handler()
## LOGGING
threading.Thread(
target=self.logging_obj.success_handler, args=(processed_chunk,)
).start() # log response
return processed_chunk
except Exception as e:
traceback_exception = traceback.format_exc()
# LOG FAILURE - handle streaming failure logging in the _next_ object, remove `handle_failure` once it's deprecated
@ -9766,9 +9768,37 @@ class CustomStreamWrapper:
# RETURN RESULT
return processed_chunk
except StopAsyncIteration:
raise
if self.sent_last_chunk == True:
raise # Re-raise StopIteration
else:
self.sent_last_chunk = True
processed_chunk = self.finish_reason_handler()
## LOGGING
threading.Thread(
target=self.logging_obj.success_handler, args=(processed_chunk,)
).start() # log response
asyncio.create_task(
self.logging_obj.async_success_handler(
processed_chunk,
)
)
return processed_chunk
except StopIteration:
raise StopAsyncIteration # Re-raise StopIteration
if self.sent_last_chunk == True:
raise StopAsyncIteration
else:
self.sent_last_chunk = True
processed_chunk = self.finish_reason_handler()
## LOGGING
threading.Thread(
target=self.logging_obj.success_handler, args=(processed_chunk,)
).start() # log response
asyncio.create_task(
self.logging_obj.async_success_handler(
processed_chunk,
)
)
return processed_chunk
except Exception as e:
traceback_exception = traceback.format_exc()
# Handle any exceptions that might occur during streaming

View file

@ -5,6 +5,10 @@ model_list:
api_base: https://openai-gpt-4-test-v-1.openai.azure.com/
api_version: "2023-05-15"
api_key: os.environ/AZURE_API_KEY # The `os.environ/` prefix tells litellm to read this from the env. See https://docs.litellm.ai/docs/simple_proxy#load-api-keys-from-vault
- model_name: gpt-3.5-turbo-large
litellm_params:
"model": "gpt-3.5-turbo-1106"
"api_key": os.environ/OPENAI_API_KEY
- model_name: gpt-4
litellm_params:
model: azure/chatgpt-v-2
@ -46,9 +50,10 @@ litellm_settings:
num_retries: 5
request_timeout: 600
telemetry: False
context_window_fallbacks: [{"gpt-3.5-turbo": ["gpt-3.5-turbo-large"]}]
general_settings:
master_key: sk-1234 # [OPTIONAL] Only use this if you to require all calls to contain this key (Authorization: Bearer sk-1234)
master_key: sk-1234 # [OPTIONAL] Use to enforce auth on proxy. See - https://docs.litellm.ai/docs/proxy/virtual_keys
proxy_budget_rescheduler_min_time: 60
proxy_budget_rescheduler_max_time: 64
proxy_batch_write_at: 1

View file

@ -1,6 +1,6 @@
[tool.poetry]
name = "litellm"
version = "1.34.3"
version = "1.34.4"
description = "Library to easily interface with LLM API providers"
authors = ["BerriAI"]
license = "MIT"
@ -80,7 +80,7 @@ requires = ["poetry-core", "wheel"]
build-backend = "poetry.core.masonry.api"
[tool.commitizen]
version = "1.34.3"
version = "1.34.4"
version_files = [
"pyproject.toml:^version"
]

112
tests/large_text.py Normal file
View file

@ -0,0 +1,112 @@
text = """
Alexander the Great
This article is about the ancient king of Macedonia. For other uses, see Alexander the Great (disambiguation).
Alexander III of Macedon (Ancient Greek: Ἀλέξανδρος, romanized: Alexandros; 20/21 July 356 BC 10/11 June 323 BC), most commonly known as Alexander the Great,[c] was a king of the ancient Greek kingdom of Macedon.[d] He succeeded his father Philip II to the throne in 336 BC at the age of 20 and spent most of his ruling years conducting a lengthy military campaign throughout Western Asia, Central Asia, parts of South Asia, and Egypt. By the age of 30, he had created one of the largest empires in history, stretching from Greece to northwestern India.[1] He was undefeated in battle and is widely considered to be one of history's greatest and most successful military commanders.[2][3]
Until the age of 16, Alexander was tutored by Aristotle. In 335 BC, shortly after his assumption of kingship over Macedon, he campaigned in the Balkans and reasserted control over Thrace and parts of Illyria before marching on the city of Thebes, which was subsequently destroyed in battle. Alexander then led the League of Corinth, and used his authority to launch the pan-Hellenic project envisaged by his father, assuming leadership over all Greeks in their conquest of Persia.[4][5]
In 334 BC, he invaded the Achaemenid Persian Empire and began a series of campaigns that lasted for 10 years. Following his conquest of Asia Minor, Alexander broke the power of Achaemenid Persia in a series of decisive battles, including those at Issus and Gaugamela; he subsequently overthrew Darius III and conquered the Achaemenid Empire in its entirety.[e] After the fall of Persia, the Macedonian Empire held a vast swath of territory between the Adriatic Sea and the Indus River. Alexander endeavored to reach the "ends of the world and the Great Outer Sea" and invaded India in 326 BC, achieving an important victory over Porus, an ancient Indian king of present-day Punjab, at the Battle of the Hydaspes. Due to the demand of his homesick troops, he eventually turned back at the Beas River and later died in 323 BC in Babylon, the city of Mesopotamia that he had planned to establish as his empire's capital. Alexander's death left unexecuted an additional series of planned military and mercantile campaigns that would have begun with a Greek invasion of Arabia. In the years following his death, a series of civil wars broke out across the Macedonian Empire, eventually leading to its disintegration at the hands of the Diadochi.
With his death marking the start of the Hellenistic period, Alexander's legacy includes the cultural diffusion and syncretism that his conquests engendered, such as Greco-Buddhism and Hellenistic Judaism. He founded more than twenty cities, with the most prominent being the city of Alexandria in Egypt. Alexander's settlement of Greek colonists and the resulting spread of Greek culture led to the overwhelming dominance of Hellenistic civilization and influence as far east as the Indian subcontinent. The Hellenistic period developed through the Roman Empire into modern Western culture; the Greek language became the lingua franca of the region and was the predominant language of the Byzantine Empire up until its collapse in the mid-15th century AD. Alexander became legendary as a classical hero in the mould of Achilles, featuring prominently in the historical and mythical traditions of both Greek and non-Greek cultures. His military achievements and unprecedented enduring successes in battle made him the measure against which many later military leaders would compare themselves,[f] and his tactics remain a significant subject of study in military academies worldwide.[6] Legends of Alexander's exploits coalesced into the third-century Alexander Romance which, in the premodern period, went through over one hundred recensions, translations, and derivations and was translated into almost every European vernacular and every language of the Islamic world.[7] After the Bible, it was the most popular form of European literature.[8]
Early life
Lineage and childhood
Alexander III was born in Pella, the capital of the Kingdom of Macedon,[9] on the sixth day of the ancient Greek month of Hekatombaion, which probably corresponds to 20 July 356 BC (although the exact date is uncertain).[10][11] He was the son of the erstwhile king of Macedon, Philip II, and his fourth wife, Olympias (daughter of Neoptolemus I, king of Epirus).[12][g] Although Philip had seven or eight wives, Olympias was his principal wife for some time, likely because she gave birth to Alexander.[13]
Several legends surround Alexander's birth and childhood.[14] According to the ancient Greek biographer Plutarch, on the eve of the consummation of her marriage to Philip, Olympias dreamed that her womb was struck by a thunderbolt that caused a flame to spread "far and wide" before dying away. Sometime after the wedding, Philip is said to have seen himself, in a dream, securing his wife's womb with a seal engraved with a lion's image.[15] Plutarch offered a variety of interpretations for these dreams: that Olympias was pregnant before her marriage, indicated by the sealing of her womb; or that Alexander's father was Zeus. Ancient commentators were divided about whether the ambitious Olympias promulgated the story of Alexander's divine parentage, variously claiming that she had told Alexander, or that she dismissed the suggestion as impious.[15]
On the day Alexander was born, Philip was preparing a siege on the city of Potidea on the peninsula of Chalcidice. That same day, Philip received news that his general Parmenion had defeated the combined Illyrian and Paeonian armies and that his horses had won at the Olympic Games. It was also said that on this day, the Temple of Artemis in Ephesus, one of the Seven Wonders of the World, burnt down. This led Hegesias of Magnesia to say that it had burnt down because Artemis was away, attending the birth of Alexander.[16] Such legends may have emerged when Alexander was king, and possibly at his instigation, to show that he was superhuman and destined for greatness from conception.[14]
In his early years, Alexander was raised by a nurse, Lanike, sister of Alexander's future general Cleitus the Black. Later in his childhood, Alexander was tutored by the strict Leonidas, a relative of his mother, and by Lysimachus of Acarnania.[17] Alexander was raised in the manner of noble Macedonian youths, learning to read, play the lyre, ride, fight, and hunt.[18] When Alexander was ten years old, a trader from Thessaly brought Philip a horse, which he offered to sell for thirteen talents. The horse refused to be mounted, and Philip ordered it away. Alexander, however, detecting the horse's fear of its own shadow, asked to tame the horse, which he eventually managed.[14] Plutarch stated that Philip, overjoyed at this display of courage and ambition, kissed his son tearfully, declaring: "My boy, you must find a kingdom big enough for your ambitions. Macedon is too small for you", and bought the horse for him.[19] Alexander named it Bucephalas, meaning "ox-head". Bucephalas carried Alexander as far as India. When the animal died (because of old age, according to Plutarch, at age 30), Alexander named a city after him, Bucephala.[20]
Education
When Alexander was 13, Philip began to search for a tutor, and considered such academics as Isocrates and Speusippus, the latter offering to resign from his stewardship of the Academy to take up the post. In the end, Philip chose Aristotle and provided the Temple of the Nymphs at Mieza as a classroom. In return for teaching Alexander, Philip agreed to rebuild Aristotle's hometown of Stageira, which Philip had razed, and to repopulate it by buying and freeing the ex-citizens who were slaves, or pardoning those who were in exile.[21]
Mieza was like a boarding school for Alexander and the children of Macedonian nobles, such as Ptolemy, Hephaistion, and Cassander. Many of these students would become his friends and future generals, and are often known as the "Companions". Aristotle taught Alexander and his companions about medicine, philosophy, morals, religion, logic, and art. Under Aristotle's tutelage, Alexander developed a passion for the works of Homer, and in particular the Iliad; Aristotle gave him an annotated copy, which Alexander later carried on his campaigns.[22] Alexander was able to quote Euripides from memory.[23]
During his youth, Alexander was also acquainted with Persian exiles at the Macedonian court, who received the protection of Philip II for several years as they opposed Artaxerxes III.[24][25][26] Among them were Artabazos II and his daughter Barsine, possible future mistress of Alexander, who resided at the Macedonian court from 352 to 342 BC, as well as Amminapes, future satrap of Alexander, or a Persian nobleman named Sisines.[24][27][28][29] This gave the Macedonian court a good knowledge of Persian issues, and may even have influenced some of the innovations in the management of the Macedonian state.[27]
Suda writes that Anaximenes of Lampsacus was one of Alexander's teachers, and that Anaximenes also accompanied Alexander on his campaigns.[30]
Heir of Philip II
Regency and ascent of Macedon
Main articles: Philip II of Macedon and Rise of Macedon
Further information: History of Macedonia (ancient kingdom)
At the age of 16, Alexander's education under Aristotle ended. Philip II had waged war against the Thracians to the north, which left Alexander in charge as regent and heir apparent.[14] During Philip's absence, the Thracian tribe of Maedi revolted against Macedonia. Alexander responded quickly and drove them from their territory. The territory was colonized, and a city, named Alexandropolis, was founded.[31]
Upon Philip's return, Alexander was dispatched with a small force to subdue the revolts in southern Thrace. Campaigning against the Greek city of Perinthus, Alexander reportedly saved his father's life. Meanwhile, the city of Amphissa began to work lands that were sacred to Apollo near Delphi, a sacrilege that gave Philip the opportunity to further intervene in Greek affairs. While Philip was occupied in Thrace, Alexander was ordered to muster an army for a campaign in southern Greece. Concerned that other Greek states might intervene, Alexander made it look as though he was preparing to attack Illyria instead. During this turmoil, the Illyrians invaded Macedonia, only to be repelled by Alexander.[32]
Philip and his army joined his son in 338 BC, and they marched south through Thermopylae, taking it after stubborn resistance from its Theban garrison. They went on to occupy the city of Elatea, only a few days' march from both Athens and Thebes. The Athenians, led by Demosthenes, voted to seek alliance with Thebes against Macedonia. Both Athens and Philip sent embassies to win Thebes's favour, but Athens won the contest.[33] Philip marched on Amphissa (ostensibly acting on the request of the Amphictyonic League), capturing the mercenaries sent there by Demosthenes and accepting the city's surrender. Philip then returned to Elatea, sending a final offer of peace to Athens and Thebes, who both rejected it.[34]
As Philip marched south, his opponents blocked him near Chaeronea, Boeotia. During the ensuing Battle of Chaeronea, Philip commanded the right wing and Alexander the left, accompanied by a group of Philip's trusted generals. According to the ancient sources, the two sides fought bitterly for some time. Philip deliberately commanded his troops to retreat, counting on the untested Athenian hoplites to follow, thus breaking their line. Alexander was the first to break the Theban lines, followed by Philip's generals. Having damaged the enemy's cohesion, Philip ordered his troops to press forward and quickly routed them. With the Athenians lost, the Thebans were surrounded. Left to fight alone, they were defeated.[35]
After the victory at Chaeronea, Philip and Alexander marched unopposed into the Peloponnese, welcomed by all cities; however, when they reached Sparta, they were refused, but did not resort to war.[36] At Corinth, Philip established a "Hellenic Alliance" (modelled on the old anti-Persian alliance of the Greco-Persian Wars), which included most Greek city-states except Sparta. Philip was then named Hegemon (often translated as "Supreme Commander") of this league (known by modern scholars as the League of Corinth), and announced his plans to attack the Persian Empire.[37][38]
Exile and return
When Philip returned to Pella, he fell in love with and married Cleopatra Eurydice in 338 BC,[39] the niece of his general Attalus.[40] The marriage made Alexander's position as heir less secure, since any son of Cleopatra Eurydice would be a fully Macedonian heir, while Alexander was only half-Macedonian.[41] During the wedding banquet, a drunken Attalus publicly prayed to the gods that the union would produce a legitimate heir.[40]
At the wedding of Cleopatra, whom Philip fell in love with and married, she being much too young for him, her uncle Attalus in his drink desired the Macedonians would implore the gods to give them a lawful successor to the kingdom by his niece. This so irritated Alexander, that throwing one of the cups at his head, "You villain," said he, "what, am I then a bastard?" Then Philip, taking Attalus's part, rose up and would have run his son through; but by good fortune for them both, either his over-hasty rage, or the wine he had drunk, made his foot slip, so that he fell down on the floor. At which Alexander reproachfully insulted over him: "See there," said he, "the man who makes preparations to pass out of Europe into Asia, overturned in passing from one seat to another."
Plutarch, describing the feud at Philip's wedding.[42]none
In 337 BC, Alexander fled Macedon with his mother, dropping her off with her brother, King Alexander I of Epirus in Dodona, capital of the Molossians.[43] He continued to Illyria,[43] where he sought refuge with one or more Illyrian kings, perhaps with Glaucias, and was treated as a guest, despite having defeated them in battle a few years before.[44] However, it appears Philip never intended to disown his politically and militarily trained son.[43] Accordingly, Alexander returned to Macedon after six months due to the efforts of a family friend, Demaratus, who mediated between the two parties.[45]
In the following year, the Persian satrap (governor) of Caria, Pixodarus, offered his eldest daughter to Alexander's half-brother, Philip Arrhidaeus.[43] Olympias and several of Alexander's friends suggested this showed Philip intended to make Arrhidaeus his heir.[43] Alexander reacted by sending an actor, Thessalus of Corinth, to tell Pixodarus that he should not offer his daughter's hand to an illegitimate son, but instead to Alexander. When Philip heard of this, he stopped the negotiations and scolded Alexander for wishing to marry the daughter of a Carian, explaining that he wanted a better bride for him.[43] Philip exiled four of Alexander's friends, Harpalus, Nearchus, Ptolemy and Erigyius, and had the Corinthians bring Thessalus to him in chains.[46]
King of Macedon
Accession
Further information: Government of Macedonia (ancient kingdom)
In summer 336 BC, while at Aegae attending the wedding of his daughter Cleopatra to Olympias's brother, Alexander I of Epirus, Philip was assassinated by the captain of his bodyguards, Pausanias.[h] As Pausanias tried to escape, he tripped over a vine and was killed by his pursuers, including two of Alexander's companions, Perdiccas and Leonnatus. Alexander was proclaimed king on the spot by the nobles and army at the age of 20.[47][48][49]
Consolidation of power
Alexander began his reign by eliminating potential rivals to the throne. He had his cousin, the former Amyntas IV, executed.[51] He also had two Macedonian princes from the region of Lyncestis killed for having been involved in his father's assassination, but spared a third, Alexander Lyncestes. Olympias had Cleopatra Eurydice, and Europa, her daughter by Philip, burned alive. When Alexander learned about this, he was furious. Alexander also ordered the murder of Attalus,[51] who was in command of the advance guard of the army in Asia Minor and Cleopatra's uncle.[52]
Attalus was at that time corresponding with Demosthenes, regarding the possibility of defecting to Athens. Attalus also had severely insulted Alexander, and following Cleopatra's murder, Alexander may have considered him too dangerous to be left alive.[52] Alexander spared Arrhidaeus, who was by all accounts mentally disabled, possibly as a result of poisoning by Olympias.[47][49][53]
News of Philip's death roused many states into revolt, including Thebes, Athens, Thessaly, and the Thracian tribes north of Macedon. When news of the revolts reached Alexander, he responded quickly. Though advised to use diplomacy, Alexander mustered 3,000 Macedonian cavalry and rode south towards Thessaly. He found the Thessalian army occupying the pass between Mount Olympus and Mount Ossa, and ordered his men to ride over Mount Ossa. When the Thessalians awoke the next day, they found Alexander in their rear and promptly surrendered, adding their cavalry to Alexander's force. He then continued south towards the Peloponnese.[54]
Alexander stopped at Thermopylae, where he was recognized as the leader of the Amphictyonic League before heading south to Corinth. Athens sued for peace and Alexander pardoned the rebels. The famous encounter between Alexander and Diogenes the Cynic occurred during Alexander's stay in Corinth. When Alexander asked Diogenes what he could do for him, the philosopher disdainfully asked Alexander to stand a little to the side, as he was blocking the sunlight.[55] This reply apparently delighted Alexander, who is reported to have said "But verily, if I were not Alexander, I would like to be Diogenes."[56] At Corinth, Alexander took the title of Hegemon ("leader") and, like Philip, was appointed commander for the coming war against Persia. He also received news of a Thracian uprising.[57]
Balkan campaign
Main article: Alexander's Balkan campaign
Before crossing to Asia, Alexander wanted to safeguard his northern borders. In the spring of 335 BC, he advanced to suppress several revolts. Starting from Amphipolis, he travelled east into the country of the "Independent Thracians"; and at Mount Haemus, the Macedonian army attacked and defeated the Thracian forces manning the heights.[58] The Macedonians marched into the country of the Triballi, and defeated their army near the Lyginus river[59] (a tributary of the Danube). Alexander then marched for three days to the Danube, encountering the Getae tribe on the opposite shore. Crossing the river at night, he surprised them and forced their army to retreat after the first cavalry skirmish.[60]
News then reached Alexander that the Illyrian chieftain Cleitus and King Glaukias of the Taulantii were in open revolt against his authority. Marching west into Illyria, Alexander defeated each in turn, forcing the two rulers to flee with their troops. With these victories, he secured his northern frontier.[61]
Destruction of Thebes
While Alexander campaigned north, the Thebans and Athenians rebelled once again. Alexander immediately headed south.[62] While the other cities again hesitated, Thebes decided to fight. The Theban resistance was ineffective, and Alexander razed the city and divided its territory between the other Boeotian cities. The end of Thebes cowed Athens, leaving all of Greece temporarily at peace.[62] Alexander then set out on his Asian campaign, leaving Antipater as regent.[63]
Conquest of the Achaemenid Persian Empire
Main articles: Wars of Alexander the Great and Chronology of the expedition of Alexander the Great into Asia
Asia Minor
Further information: Battle of the Granicus, Siege of Halicarnassus, and Siege of Miletus
After his victory at the Battle of Chaeronea (338 BC), Philip II began the work of establishing himself as hēgemṓn (Greek: ἡγεμών) of a league which according to Diodorus was to wage a campaign against the Persians for the sundry grievances Greece suffered in 480 and free the Greek cities of the western coast and islands from Achaemenid rule. In 336 he sent Parmenion, Amyntas, Andromenes, Attalus, and an army of 10,000 men into Anatolia to make preparations for an invasion.[64][65] At first, all went well. The Greek cities on the western coast of Anatolia revolted until the news arrived that Philip had been murdered and had been succeeded by his young son Alexander. The Macedonians were demoralized by Philip's death and were subsequently defeated near Magnesia by the Achaemenids under the command of the mercenary Memnon of Rhodes.[64][65]
Taking over the invasion project of Philip II, Alexander's army crossed the Hellespont in 334 BC with approximately 48,100 soldiers, 6,100 cavalry and a fleet of 120 ships with crews numbering 38,000,[62] drawn from Macedon and various Greek city-states, mercenaries, and feudally raised soldiers from Thrace, Paionia, and Illyria.[66][i] He showed his intent to conquer the entirety of the Persian Empire by throwing a spear into Asian soil and saying he accepted Asia as a gift from the gods. This also showed Alexander's eagerness to fight, in contrast to his father's preference for diplomacy.[62]
After an initial victory against Persian forces at the Battle of the Granicus, Alexander accepted the surrender of the Persian provincial capital and treasury of Sardis; he then proceeded along the Ionian coast, granting autonomy and democracy to the cities. Miletus, held by Achaemenid forces, required a delicate siege operation, with Persian naval forces nearby. Further south, at Halicarnassus, in Caria, Alexander successfully waged his first large-scale siege, eventually forcing his opponents, the mercenary captain Memnon of Rhodes and the Persian satrap of Caria, Orontobates, to withdraw by sea.[67] Alexander left the government of Caria to a member of the Hecatomnid dynasty, Ada, who adopted Alexander.[68]
From Halicarnassus, Alexander proceeded into mountainous Lycia and the Pamphylian plain, asserting control over all coastal cities to deny the Persians naval bases. From Pamphylia onwards the coast held no major ports and Alexander moved inland. At Termessos, Alexander humbled but did not storm the Pisidian city.[69] At the ancient Phrygian capital of Gordium, Alexander "undid" the hitherto unsolvable Gordian Knot, a feat said to await the future "king of Asia".[70] According to the story, Alexander proclaimed that it did not matter how the knot was undone and hacked it apart with his sword.[71]
The Levant and Syria
Further information: Battle of Issus and Siege of Tyre (332 BC)
In spring 333 BC, Alexander crossed the Taurus into Cilicia. After a long pause due to an illness, he marched on towards Syria. Though outmanoeuvered by Darius's significantly larger army, he marched back to Cilicia, where he defeated Darius at Issus. Darius fled the battle, causing his army to collapse, and left behind his wife, his two daughters, his mother Sisygambis, and a fabulous treasure.[72] He offered a peace treaty that included the lands he had already lost, and a ransom of 10,000 talents for his family. Alexander replied that since he was now king of Asia, it was he alone who decided territorial divisions.[73] Alexander proceeded to take possession of Syria, and most of the coast of the Levant.[68] In the following year, 332 BC, he was forced to attack Tyre, which he captured after a long and difficult siege.[74][75] The men of military age were massacred and the women and children sold into slavery.[76]
Egypt
Further information: Siege of Gaza (332 BCE)
When Alexander destroyed Tyre, most of the towns on the route to Egypt quickly capitulated. However, Alexander was met with resistance at Gaza. The stronghold was heavily fortified and built on a hill, requiring a siege. When "his engineers pointed out to him that because of the height of the mound it would be impossible... this encouraged Alexander all the more to make the attempt".[77] After three unsuccessful assaults, the stronghold fell, but not before Alexander had received a serious shoulder wound. As in Tyre, men of military age were put to the sword and the women and children were sold into slavery.[78]
"""

45
tests/test_fallbacks.py Normal file
View file

@ -0,0 +1,45 @@
# What is this?
## This tests if the proxy fallbacks work as expected
import pytest
import asyncio
import aiohttp
from large_text import text
async def chat_completion(session, key: str, model: str, messages: list):
    """POST a chat-completion request to the local LiteLLM proxy.

    Args:
        session: an open ``aiohttp.ClientSession`` used to make the request.
        key: virtual/master key, sent as an ``Authorization: Bearer`` token.
        model: model (or model-group) name for the proxy to route.
        messages: OpenAI-style chat messages list.

    Returns:
        The parsed JSON response body.

    Raises:
        Exception: on a non-200 status. The response body is included in the
            message so a failing test shows the proxy's actual error, not
            just the status code.
    """
    url = "http://0.0.0.0:4000/chat/completions"
    headers = {
        "Authorization": f"Bearer {key}",
        "Content-Type": "application/json",
    }
    data = {
        "model": model,
        "messages": messages,
    }

    async with session.post(url, headers=headers, json=data) as response:
        status = response.status
        response_text = await response.text()

        print(response_text)
        print()

        if status != 200:
            # Surface the proxy's error body — status alone hides the cause
            # (e.g. ContextWindowExceededError vs auth failure).
            raise Exception(
                f"Request did not return a 200 status code: {status}, response: {response_text}"
            )
        # aiohttp caches the body after .text(), so .json() re-parses the
        # same payload rather than re-reading the socket.
        return await response.json()
@pytest.mark.asyncio
async def test_chat_completion():
    """Send a prompt larger than gpt-3.5-turbo's context window and expect
    the proxy's context-window fallback to complete the call successfully.
    """
    oversized_messages = [
        {"role": "system", "content": text},
        {"role": "user", "content": "Who was Alexander?"},
    ]
    async with aiohttp.ClientSession() as session:
        await chat_completion(
            session=session,
            key="sk-1234",
            model="gpt-3.5-turbo",
            messages=oversized_messages,
        )