LiteLLM Minor Fixes & Improvements (09/27/2024) (#5938)

* fix(langfuse.py): prevent double logging requester metadata

Fixes https://github.com/BerriAI/litellm/issues/5935

* build(model_prices_and_context_window.json): add mistral pixtral cost tracking

Closes https://github.com/BerriAI/litellm/issues/5837
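With the new pricing entry, Pixtral token costs can be looked up through litellm's cost helpers. A minimal sketch, assuming the entry is keyed as `mistral/pixtral-12b-2409` in model_prices_and_context_window.json (the exact key comes from that file):

```python
import litellm

# Assumption: the new pricing entry is registered under this key.
prompt_cost, completion_cost = litellm.cost_per_token(
    model="mistral/pixtral-12b-2409",
    prompt_tokens=1_000,
    completion_tokens=200,
)
print(f"prompt: ${prompt_cost:.6f}, completion: ${completion_cost:.6f}")
```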

* handle streaming for azure ai studio error

* [Perf Proxy] parallel request limiter - use one cache update call (#5932)

* fix parallel request limiter - use one cache update call
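The idea is to coalesce the limiter's per-request counter writes (request count, TPM, RPM) into one batched cache call instead of several round trips. A rough, self-contained sketch of the pattern — the class, key names, and `async_set_cache_pipeline` signature below are illustrative, not litellm's internal API:

```python
import asyncio
from typing import Any, List, Tuple


class PipelinedCache:
    """Illustrative cache that accepts a batch of writes in one call."""

    def __init__(self) -> None:
        self._store: dict = {}

    async def async_set_cache_pipeline(self, cache_list: List[Tuple[str, Any]]) -> None:
        # One call for all keys instead of one cache update per key.
        for key, value in cache_list:
            self._store[key] = value


async def update_limits(cache: PipelinedCache, user_id: str) -> None:
    # Request count, TPM, and RPM counters are written together.
    await cache.async_set_cache_pipeline(
        [
            (f"{user_id}::request_count", 1),
            (f"{user_id}::tpm", 42),
            (f"{user_id}::rpm", 1),
        ]
    )


asyncio.run(update_limits(PipelinedCache(), "user-123"))
```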

* ci/cd run again

* run ci/cd again

* use docker username password

* fix config.yml

* fix config

* fix config

* fix config.yml

* ci/cd run again

* use correct typing for batch set cache

* fix async_set_cache_pipeline

* fix: only check user id tpm/rpm limits when limits are set

* fix test_openai_azure_embedding_with_oidc_and_cf

* fix(groq/chat/transformation.py): Fixes https://github.com/BerriAI/litellm/issues/5839

* feat(anthropic/chat.py): return 'retry-after' headers from anthropic

Fixes https://github.com/BerriAI/litellm/issues/4387
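Callers can now read the provider's `retry-after` hint off the mapped exception. A usage sketch (model and handling are illustrative; the `litellm_response_headers` attribute is the one exercised in the new test below):

```python
import litellm

try:
    resp = litellm.completion(
        model="anthropic/claude-3-haiku-20240307",
        messages=[{"role": "user", "content": "Hello"}],
    )
except litellm.RateLimitError as e:
    # Provider response headers are surfaced on the mapped exception
    retry_after = e.litellm_response_headers.get("retry-after")
    print(f"rate limited; provider suggests retrying after {retry_after}s")
```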

* feat: raise validation error if message has tool calls without passing `tools` param for anthropic/bedrock

Closes https://github.com/BerriAI/litellm/issues/5747
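A sketch of the call shape that now fails fast: an assistant turn carrying `tool_calls` while the request omits `tools`. The exact exception class is an assumption (likely a `BadRequestError`-style validation error):

```python
import litellm

messages = [
    {"role": "user", "content": "What's the weather in SF?"},
    {
        "role": "assistant",
        "tool_calls": [
            {
                "id": "call_123",
                "type": "function",
                "function": {"name": "get_weather", "arguments": '{"city": "SF"}'},
            }
        ],
    },
    {"role": "tool", "tool_call_id": "call_123", "content": "65F and sunny"},
]

# Omitting `tools` while the conversation contains tool calls should now
# raise a validation error instead of failing provider-side.
try:
    litellm.completion(model="anthropic/claude-3-haiku-20240307", messages=messages)
except Exception as e:  # assumption: exact exception class may differ
    print(type(e).__name__, e)
```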

* [Feature]#5940, add max_workers parameter for the batch_completion (#5947)
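Usage sketch of the new parameter (prompts and worker count are illustrative):

```python
import litellm

responses = litellm.batch_completion(
    model="gpt-3.5-turbo",
    messages=[
        [{"role": "user", "content": "Summarize the GIL in one sentence."}],
        [{"role": "user", "content": "What is a coroutine?"}],
    ],
    max_workers=4,  # new: caps the thread pool used to fan out the requests
)
print(len(responses))
```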

* handle streaming for azure ai studio error

* bump: version 1.48.2 → 1.48.3

* docs(data_security.md): add legal/compliance FAQs

Make it easier for companies to use litellm

* docs: resolve imports

* [Feature]#5940, add max_workers parameter for the batch_completion method

---------

Co-authored-by: Ishaan Jaff <ishaanjaffer0324@gmail.com>
Co-authored-by: Krrish Dholakia <krrishdholakia@gmail.com>
Co-authored-by: josearangos <josearangos@Joses-MacBook-Pro.local>

* fix(converse_transformation.py): fix default message value

* fix(utils.py): fix get_model_info to handle finetuned models

Fixes an issue with standard logging payloads, where model_map_value was null for fine-tuned OpenAI models
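Sketch of the intended behavior — the fine-tuned model id below is hypothetical; the point is that it should now resolve to its base model's map entry instead of leaving `model_map_value` null:

```python
import litellm

# Resolves via the base model ("gpt-3.5-turbo-0125") after this fix.
info = litellm.get_model_info("ft:gpt-3.5-turbo-0125:my-org::abc123")
print(info["input_cost_per_token"], info["output_cost_per_token"])
```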

* fix(litellm_pre_call_utils.py): add debug statement for data sent after updating with team/key callbacks

* fix: fix linting errors

* fix(anthropic/chat/handler.py): fix cache creation input tokens

* fix(exception_mapping_utils.py): fix missing imports

* fix(anthropic/chat/handler.py): fix usage block translation

* test: fix test

* test: fix tests

* style(types/utils.py): trigger new build

* test: fix test

---------

Co-authored-by: Ishaan Jaff <ishaanjaffer0324@gmail.com>
Co-authored-by: Jose Alberto Arango Sanchez <jose.arangos@udea.edu.co>
Co-authored-by: josearangos <josearangos@Joses-MacBook-Pro.local>
Commit 0b30e212da (parent 754981a78f) by Krish Dholakia, committed via GitHub on 2024-09-27 22:52:57 -07:00.
35 changed files with 3657 additions and 2820 deletions.


@@ -7,6 +7,8 @@ from typing import Any
from openai import AuthenticationError, BadRequestError, OpenAIError, RateLimitError
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler

sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path
@@ -884,6 +886,42 @@ def _pre_call_utils(
    return data, original_function, mapped_target


def _pre_call_utils_httpx(
    call_type: str,
    data: dict,
    client: Union[HTTPHandler, AsyncHTTPHandler],
    sync_mode: bool,
    streaming: Optional[bool],
):
    mapped_target: Any = client.client
    if call_type == "embedding":
        data["input"] = "Hello world!"
        if sync_mode:
            original_function = litellm.embedding
        else:
            original_function = litellm.aembedding
    elif call_type == "chat_completion":
        data["messages"] = [{"role": "user", "content": "Hello world"}]
        if streaming is True:
            data["stream"] = True
        if sync_mode:
            original_function = litellm.completion
        else:
            original_function = litellm.acompletion
    elif call_type == "completion":
        data["prompt"] = "Hello world"
        if streaming is True:
            data["stream"] = True
        if sync_mode:
            original_function = litellm.text_completion
        else:
            original_function = litellm.atext_completion

    return data, original_function, mapped_target


@pytest.mark.parametrize(
    "sync_mode",
    [True, False],
@@ -1006,3 +1044,111 @@ async def test_exception_with_headers(sync_mode, provider, model, call_type, str
    if exception_raised is False:
        print(resp)
    assert exception_raised


@pytest.mark.parametrize(
    "sync_mode",
    [True, False],
)
@pytest.mark.parametrize("streaming", [True, False])
@pytest.mark.parametrize(
    "provider, model, call_type",
    [
        ("anthropic", "claude-3-haiku-20240307", "chat_completion"),
    ],
)
@pytest.mark.asyncio
async def test_exception_with_headers_httpx(
    sync_mode, provider, model, call_type, streaming
):
    """
    User feedback: litellm says "No deployments available for selected model, Try again in 60 seconds"
    but Azure says to retry in at most 9s

    ```
    {"message": "litellm.proxy.proxy_server.embeddings(): Exception occured - No deployments available for selected model, Try again in 60 seconds. Passed model=text-embedding-ada-002. pre-call-checks=False, allowed_model_region=n/a, cooldown_list=[('b49cbc9314273db7181fe69b1b19993f04efb88f2c1819947c538bac08097e4c', {'Exception Received': 'litellm.RateLimitError: AzureException RateLimitError - Requests to the Embeddings_Create Operation under Azure OpenAI API version 2023-09-01-preview have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 9 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit.', 'Status Code': '429'})]", "level": "ERROR", "timestamp": "2024-08-22T03:25:36.900476"}
    ```
    """
    print(f"Received args: {locals()}")
    import openai

    if sync_mode:
        client = HTTPHandler()
    else:
        client = AsyncHTTPHandler()

    data = {"model": model}
    data, original_function, mapped_target = _pre_call_utils_httpx(
        call_type=call_type,
        data=data,
        client=client,
        sync_mode=sync_mode,
        streaming=streaming,
    )

    cooldown_time = 30.0

    def _return_exception(*args, **kwargs):
        import datetime

        from httpx import Headers, HTTPStatusError, Request, Response

        # Create the Request object
        request = Request("POST", "http://0.0.0.0:9000/chat/completions")

        # Create the Response object with the necessary headers and status code
        response = Response(
            status_code=429,
            headers=Headers(
                {
                    "date": "Sat, 21 Sep 2024 22:56:53 GMT",
                    "server": "uvicorn",
                    "retry-after": "30",
                    "content-length": "30",
                    "content-type": "application/json",
                }
            ),
            request=request,
        )

        # Create and raise the HTTPStatusError exception
        raise HTTPStatusError(
            message="Error code: 429 - Rate Limit Error!",
            request=request,
            response=response,
        )

    with patch.object(
        mapped_target,
        "send",
        side_effect=_return_exception,
    ):
        new_retry_after_mock_client = MagicMock(return_value=-1)
        litellm.utils._get_retry_after_from_exception_header = (
            new_retry_after_mock_client
        )

        exception_raised = False
        try:
            if sync_mode:
                resp = original_function(**data, client=client)

                if streaming:
                    for chunk in resp:
                        continue
            else:
                resp = await original_function(**data, client=client)

                if streaming:
                    async for chunk in resp:
                        continue

        except litellm.RateLimitError as e:
            exception_raised = True
            assert e.litellm_response_headers is not None
            print("e.litellm_response_headers", e.litellm_response_headers)
            assert int(e.litellm_response_headers["retry-after"]) == cooldown_time

        if exception_raised is False:
            print(resp)
        assert exception_raised