fix(anthropic.py): fix parallel streaming on anthropic.py

prevent parallel requests from cancelling each other

Fixes https://github.com/BerriAI/litellm/issues/3881
Author: Krrish Dholakia, 2024-05-28 16:29:09 -07:00
Parent: 073bca78d4
Commit: 324bf027f5
3 changed files with 152 additions and 180 deletions
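
For context on why a shared handler makes concurrent streams step on each other, here is a minimal, hypothetical sketch (SharedHandlerClient, LocalHandlerClient and the handler strings are illustrative stand-ins, not litellm code) contrasting a per-request object stored on self with one kept in a local variable:

import asyncio


class SharedHandlerClient:
    """Buggy pattern: every call rebinds one attribute that all calls share."""

    async def fetch(self, request_id: int) -> str:
        # Each concurrent request overwrites the same attribute on self, so a
        # request that is still in flight ends up using whichever handler the
        # most recent request installed.
        self.handler = f"handler-{request_id}"
        await asyncio.sleep(0.01)  # simulated network I/O; other tasks run here
        return f"request {request_id} used {self.handler}"


class LocalHandlerClient:
    """Fixed pattern: each call builds its own handler in a local variable."""

    async def fetch(self, request_id: int) -> str:
        handler = f"handler-{request_id}"
        await asyncio.sleep(0.01)
        return f"request {request_id} used {handler}"


async def main() -> None:
    shared = SharedHandlerClient()
    local = LocalHandlerClient()
    print(await asyncio.gather(*(shared.fetch(i) for i in range(3))))
    # -> every request reports the handler of the *last* request that started
    print(await asyncio.gather(*(local.fetch(i) for i in range(3))))
    # -> each request keeps the handler it created


if __name__ == "__main__":
    asyncio.run(main())

With the shared attribute, whichever request starts last replaces the handler that every other in-flight request reads afterwards, which is the same failure mode as one streaming request swapping out (and later closing) the HTTP client another request is still consuming.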


@@ -379,13 +379,12 @@ class AnthropicChatCompletion(BaseLLM):
         logger_fn=None,
         headers={},
     ):
-        self.async_handler = AsyncHTTPHandler(
-            timeout=httpx.Timeout(timeout=600.0, connect=5.0)
+        async_handler = AsyncHTTPHandler(
+            timeout=httpx.Timeout(timeout=600.0, connect=20.0)
         )
         data["stream"] = True
-        response = await self.async_handler.post(
-            api_base, headers=headers, data=json.dumps(data), stream=True
-        )
+        response = await async_handler.post(api_base, headers=headers, json=data)

         if response.status_code != 200:
             raise AnthropicError(
@@ -421,12 +420,10 @@ class AnthropicChatCompletion(BaseLLM):
         logger_fn=None,
         headers={},
     ) -> Union[ModelResponse, CustomStreamWrapper]:
-        self.async_handler = AsyncHTTPHandler(
+        async_handler = AsyncHTTPHandler(
             timeout=httpx.Timeout(timeout=600.0, connect=5.0)
         )
-        response = await self.async_handler.post(
-            api_base, headers=headers, data=json.dumps(data)
-        )
+        response = await async_handler.post(api_base, headers=headers, json=data)
         if stream and _is_function_call:
             return self.process_streaming_response(
                 model=model,
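
A quick note on the timeouts in the two hunks above (connect=20.0 in the streaming path, connect=5.0 in the non-streaming one): a small sketch, assuming stock httpx semantics, of how a default timeout combined with a connect override is interpreted:

import httpx

# The positional/default value covers read, write, and pool; connect gets its
# own, separate limit.
t = httpx.Timeout(timeout=600.0, connect=20.0)
print(t.connect, t.read, t.write, t.pool)  # 20.0 600.0 600.0 600.0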


@@ -43,12 +43,13 @@ class AsyncHTTPHandler:
         self,
         url: str,
         data: Optional[Union[dict, str]] = None,  # type: ignore
+        json: Optional[dict] = None,
         params: Optional[dict] = None,
         headers: Optional[dict] = None,
         stream: bool = False,
     ):
         req = self.client.build_request(
-            "POST", url, data=data, params=params, headers=headers  # type: ignore
+            "POST", url, data=data, json=json, params=params, headers=headers  # type: ignore
         )
         response = await self.client.send(req, stream=stream)
         return response
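
To see what the new json parameter changes when the request is built, a short sketch; the URL and payload are placeholders and nothing is actually sent:

import httpx

client = httpx.Client()
payload = {"model": "claude-3-opus-20240229", "stream": True}

# json= serializes the dict and sets an application/json Content-Type for us,
# which the Anthropic handler previously approximated with data=json.dumps(...).
req_json = client.build_request("POST", "https://example.invalid/v1/messages", json=payload)
# data= with a plain dict is treated as HTML form data instead.
req_form = client.build_request("POST", "https://example.invalid/v1/messages", data=payload)

print(req_json.headers.get("content-type"))  # application/json
print(req_form.headers.get("content-type"))  # application/x-www-form-urlencoded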


@@ -3,7 +3,7 @@
 import sys, os, asyncio
 import traceback
-import time, pytest
+import time, pytest, uuid
 from pydantic import BaseModel
 from typing import Tuple
@@ -241,203 +241,138 @@ def test_completion_azure_stream_content_filter_no_delta():
    """
    try:
        chunks = [
            {
                "id": "chatcmpl-9SQxdH5hODqkWyJopWlaVOOUnFwlj",
                "choices": [
                    {
                        "delta": {"content": "", "role": "assistant"},
                        "finish_reason": None,
                        "index": 0,
                    }
                ],
                "created": 1716563849,
                "model": "gpt-4o-2024-05-13",
                "object": "chat.completion.chunk",
                "system_fingerprint": "fp_5f4bad809a",
            },
            {
                "id": "chatcmpl-9SQxdH5hODqkWyJopWlaVOOUnFwlj",
                "choices": [
                    {"delta": {"content": "This"}, "finish_reason": None, "index": 0}
                ],
                "created": 1716563849,
                "model": "gpt-4o-2024-05-13",
                "object": "chat.completion.chunk",
                "system_fingerprint": "fp_5f4bad809a",
            },
            {
                "id": "chatcmpl-9SQxdH5hODqkWyJopWlaVOOUnFwlj",
                "choices": [
                    {"delta": {"content": " is"}, "finish_reason": None, "index": 0}
                ],
                "created": 1716563849,
                "model": "gpt-4o-2024-05-13",
                "object": "chat.completion.chunk",
                "system_fingerprint": "fp_5f4bad809a",
            },
            {
                "id": "chatcmpl-9SQxdH5hODqkWyJopWlaVOOUnFwlj",
                "choices": [
                    {"delta": {"content": " a"}, "finish_reason": None, "index": 0}
                ],
                "created": 1716563849,
                "model": "gpt-4o-2024-05-13",
                "object": "chat.completion.chunk",
                "system_fingerprint": "fp_5f4bad809a",
            },
            {
                "id": "chatcmpl-9SQxdH5hODqkWyJopWlaVOOUnFwlj",
                "choices": [
                    {"delta": {"content": " dummy"}, "finish_reason": None, "index": 0}
                ],
                "created": 1716563849,
                "model": "gpt-4o-2024-05-13",
                "object": "chat.completion.chunk",
                "system_fingerprint": "fp_5f4bad809a",
            },
            {
                "id": "chatcmpl-9SQxdH5hODqkWyJopWlaVOOUnFwlj",
                "choices": [
                    {
                        "delta": {"content": " response"},
                        "finish_reason": None,
                        "index": 0,
                    }
                ],
                "created": 1716563849,
                "model": "gpt-4o-2024-05-13",
                "object": "chat.completion.chunk",
                "system_fingerprint": "fp_5f4bad809a",
            },
            {
                "id": "",
                "choices": [
                    {
                        "finish_reason": None,
                        "index": 0,
                        "content_filter_offsets": {
                            "check_offset": 35159,
                            "start_offset": 35159,
                            "end_offset": 36150,
                        },
                        "content_filter_results": {
                            "hate": {"filtered": False, "severity": "safe"},
                            "self_harm": {"filtered": False, "severity": "safe"},
                            "sexual": {"filtered": False, "severity": "safe"},
                            "violence": {"filtered": False, "severity": "safe"},
                        },
                    }
                ],
                "created": 0,
                "model": "",
                "object": "",
            },
            {
                "id": "chatcmpl-9SQxdH5hODqkWyJopWlaVOOUnFwlj",
                "choices": [
                    {"delta": {"content": "."}, "finish_reason": None, "index": 0}
                ],
                "created": 1716563849,
                "model": "gpt-4o-2024-05-13",
                "object": "chat.completion.chunk",
                "system_fingerprint": "fp_5f4bad809a",
            },
            {
                "id": "chatcmpl-9SQxdH5hODqkWyJopWlaVOOUnFwlj",
                "choices": [{"delta": {}, "finish_reason": "stop", "index": 0}],
                "created": 1716563849,
                "model": "gpt-4o-2024-05-13",
                "object": "chat.completion.chunk",
                "system_fingerprint": "fp_5f4bad809a",
            },
            {
                "id": "",
                "choices": [
                    {
                        "finish_reason": None,
                        "index": 0,
                        "content_filter_offsets": {
                            "check_offset": 36150,
                            "start_offset": 36060,
                            "end_offset": 37029,
                        },
                        "content_filter_results": {
                            "hate": {"filtered": False, "severity": "safe"},
                            "self_harm": {"filtered": False, "severity": "safe"},
                            "sexual": {"filtered": False, "severity": "safe"},
                            "violence": {"filtered": False, "severity": "safe"},
                        },
                    }
                ],
                "created": 0,
                "model": "",
                "object": "",
            },
        ]

        chunk_list = []
@@ -1449,29 +1384,68 @@ def test_bedrock_claude_3_streaming():
         pytest.fail(f"Error occurred: {e}")


+@pytest.mark.parametrize("sync_mode", [True, False])
 @pytest.mark.asyncio
-async def test_claude_3_streaming_finish_reason():
+async def test_claude_3_streaming_finish_reason(sync_mode):
     try:
+        import threading
+
         litellm.set_verbose = True
         messages = [
             {"role": "system", "content": "Be helpful"},
             {"role": "user", "content": "What do you know?"},
         ]
-        response: ModelResponse = await litellm.acompletion(  # type: ignore
-            model="claude-3-opus-20240229",
-            messages=messages,
-            stream=True,
-            max_tokens=10,
-        )
-        complete_response = ""
-        # Add any assertions here to-check the response
-        num_finish_reason = 0
-        async for chunk in response:
-            print(f"chunk: {chunk}")
-            if isinstance(chunk, ModelResponse):
-                if chunk.choices[0].finish_reason is not None:
-                    num_finish_reason += 1
-        assert num_finish_reason == 1
+
+        def sync_test_streaming():
+            response: litellm.CustomStreamWrapper = litellm.acompletion(  # type: ignore
+                model="claude-3-opus-20240229",
+                messages=messages,
+                stream=True,
+                max_tokens=10,
+            )
+            complete_response = ""
+            # Add any assertions here to-check the response
+            num_finish_reason = 0
+            for chunk in response:
+                print(f"chunk: {chunk}")
+                if isinstance(chunk, ModelResponse):
+                    if chunk.choices[0].finish_reason is not None:
+                        num_finish_reason += 1
+            assert num_finish_reason == 1
+
+        async def test_streaming():
+            response: litellm.CustomStreamWrapper = await litellm.acompletion(  # type: ignore
+                model="claude-3-opus-20240229",
+                messages=messages,
+                stream=True,
+                max_tokens=10,
+            )
+            complete_response = ""
+            # Add any assertions here to-check the response
+            num_finish_reason = 0
+            async for chunk in response:
+                print(f"chunk: {chunk}")
+                if isinstance(chunk, ModelResponse):
+                    if chunk.choices[0].finish_reason is not None:
+                        num_finish_reason += 1
+            assert num_finish_reason == 1
+
+        tasks = []
+        for _ in range(2):
+            if sync_mode == False:
+                tasks.append(test_streaming())
+            else:
+                thread = threading.Thread(target=sync_test_streaming)
+                thread.start()
+                tasks.append(thread)
+
+        if sync_mode == False:
+            await asyncio.gather(*tasks)
+        else:
+            # Wait for all threads to complete
+            for thread in tasks:
+                thread.join()
+
     except RateLimitError:
         pass
     except Exception as e:
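
For anyone who wants to exercise the fixed code path outside pytest, a rough reproduction sketch modeled on the async branch of the test above; it assumes litellm is installed, ANTHROPIC_API_KEY is set in the environment, and the model below is available to your account:

import asyncio

import litellm


async def consume_stream(model: str) -> int:
    num_finish_reason = 0
    response = await litellm.acompletion(
        model=model,
        messages=[{"role": "user", "content": "What do you know?"}],
        stream=True,
        max_tokens=10,
    )
    async for chunk in response:
        if chunk.choices[0].finish_reason is not None:
            num_finish_reason += 1
    return num_finish_reason


async def main() -> None:
    # Before this fix, two concurrent streams could interfere because they
    # shared one handler; with a per-request handler each stream should
    # complete and report exactly one finish_reason.
    results = await asyncio.gather(
        *(consume_stream("claude-3-opus-20240229") for _ in range(2))
    )
    print(results)  # expected: [1, 1]


if __name__ == "__main__":
    asyncio.run(main())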