Litellm dev 12 25 2024 p1 (#7411)

* test(test_watsonx.py): e2e unit test for watsonx custom header covers https://github.com/BerriAI/litellm/issues/7408
* fix(common_utils.py): handle auth token already present in headers (watsonx + openai-like base handler) Fixes https://github.com/BerriAI/litellm/issues/7408
* fix(watsonx/chat): fix chat route Fixes https://github.com/BerriAI/litellm/issues/7408
* fix(huggingface/chat/handler.py): fix huggingface async completion calls
* Correct handling of max_retries=0 to disable AzureOpenAI retries (#7379)
* test: fix test

---------

Co-authored-by: Minh Duc <phamminhduc0711@gmail.com>
2025-04-25 18:54:30 +00:00 · 2024-12-25 17:36:30 -08:00 · 2024-12-25 17:36:30 -08:00 · 9237357bcc
commit 9237357bcc
parent 157810fcbf
9 changed files with 299 additions and 14 deletions
--- a/litellm/llms/huggingface/chat/handler.py
+++ b/litellm/llms/huggingface/chat/handler.py
@@ -203,7 +203,26 @@ class Huggingface(BaseLLM):
                    return self.async_streaming(logging_obj=logging_obj, api_base=completion_url, data=data, headers=headers, model_response=model_response, model=model, timeout=timeout, messages=messages)  # type: ignore
                else:
                    ### ASYNC COMPLETION
-                    return self.acompletion(api_base=completion_url, data=data, headers=headers, model_response=model_response, task=task, encoding=encoding, model=model, optional_params=optional_params, timeout=timeout, litellm_params=litellm_params)  # type: ignore
+                    return self.acompletion(
+                        api_base=completion_url,
+                        data=data,
+                        headers=headers,
+                        model_response=model_response,
+                        encoding=encoding,
+                        model=model,
+                        optional_params=optional_params,
+                        timeout=timeout,
+                        litellm_params=litellm_params,
+                        logging_obj=logging_obj,
+                        api_key=api_key,
+                        messages=messages,
+                        client=(
+                            client
+                            if client is not None
+                            and isinstance(client, AsyncHTTPHandler)
+                            else None
+                        ),
+                    )
            if client is None or not isinstance(client, HTTPHandler):
                client = _get_httpx_client()
            ### SYNC STREAMING
@@ -267,14 +286,16 @@ class Huggingface(BaseLLM):
        logging_obj: LiteLLMLoggingObj,
        api_key: str,
        messages: List[AllMessageValues],
+        client: Optional[AsyncHTTPHandler] = None,
    ):
        response: Optional[httpx.Response] = None
        try:
-            http_client = get_async_httpx_client(
-                llm_provider=litellm.LlmProviders.HUGGINGFACE
-            )
+            if client is None:
+                client = get_async_httpx_client(
+                    llm_provider=litellm.LlmProviders.HUGGINGFACE
+                )
            ### ASYNC COMPLETION
-            http_response = await http_client.post(
+            http_response = await client.post(
                url=api_base, headers=headers, data=json.dumps(data), timeout=timeout
            )