feat(utils.py): support async streaming for custom llm provider

Author: Krrish Dholakia, 2024-07-25 17:11:57 -07:00
Parent: b4e3a77ad0
Commit: 060249c7e0
3 changed files with 38 additions and 2 deletions

File 1 of 3:

@@ -17,8 +17,10 @@ from enum import Enum
 from functools import partial
 from typing import (
     Any,
+    AsyncGenerator,
     AsyncIterator,
     Callable,
+    Coroutine,
     Iterator,
     List,
     Literal,

File 2 of 3:

@@ -17,7 +17,7 @@ sys.path.insert(
 import os
 from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor
-from typing import Any, AsyncIterator, Iterator, Union
+from typing import Any, AsyncGenerator, AsyncIterator, Coroutine, Iterator, Union
 from unittest.mock import AsyncMock, MagicMock, patch

 import httpx
@@ -75,7 +75,7 @@ class CustomModelResponseIterator:
     # Async iterator
     def __aiter__(self):
         self.async_response_iterator = self.streaming_response.__aiter__()  # type: ignore
-        return self
+        return self.streaming_response

     async def __anext__(self) -> GenericStreamingChunk:
         try:
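
Side note on the __aiter__ change above: returning the wrapped async generator directly works because "async for" only needs the object returned by __aiter__ to expose __anext__, which an async generator already does. A minimal standalone sketch of that delegation, using illustrative names (fake_stream, ResponseIterator) rather than litellm's real classes:

import asyncio
from typing import AsyncIterator


async def fake_stream() -> AsyncIterator[dict]:
    # Stands in for the provider's streaming_response async generator.
    yield {"text": "Hello", "is_finished": False}
    yield {"text": " world", "is_finished": True}


class ResponseIterator:
    # Illustrative stand-in for CustomModelResponseIterator.
    def __init__(self, streaming_response):
        self.streaming_response = streaming_response

    def __aiter__(self):
        # Hand iteration off to the underlying async generator, mirroring the
        # "return self.streaming_response" change in the diff.
        return self.streaming_response


async def main():
    async for chunk in ResponseIterator(fake_stream()):
        print(chunk)


asyncio.run(main())
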
@ -126,6 +126,18 @@ class MyCustomLLM(CustomLLM):
) )
return custom_iterator return custom_iterator
async def astreaming(self, *args, **kwargs) -> AsyncIterator[GenericStreamingChunk]: # type: ignore
generic_streaming_chunk: GenericStreamingChunk = {
"finish_reason": "stop",
"index": 0,
"is_finished": True,
"text": "Hello world",
"tool_use": None,
"usage": {"completion_tokens": 10, "prompt_tokens": 20, "total_tokens": 30},
}
yield generic_streaming_chunk # type: ignore
def test_get_llm_provider(): def test_get_llm_provider():
"""""" """"""
@ -187,3 +199,23 @@ def test_simple_completion_streaming():
assert isinstance(chunk.choices[0].delta.content, str) assert isinstance(chunk.choices[0].delta.content, str)
else: else:
assert chunk.choices[0].finish_reason == "stop" assert chunk.choices[0].finish_reason == "stop"
@pytest.mark.asyncio
async def test_simple_completion_async_streaming():
my_custom_llm = MyCustomLLM()
litellm.custom_provider_map = [
{"provider": "custom_llm", "custom_handler": my_custom_llm}
]
resp = await litellm.acompletion(
model="custom_llm/my-fake-model",
messages=[{"role": "user", "content": "Hello world!"}],
stream=True,
)
async for chunk in resp:
print(chunk)
if chunk.choices[0].finish_reason is None:
assert isinstance(chunk.choices[0].delta.content, str)
else:
assert chunk.choices[0].finish_reason == "stop"

File 3 of 3 (utils.py):

@@ -10132,6 +10132,7 @@ class CustomStreamWrapper:
         try:
             if self.completion_stream is None:
                 await self.fetch_stream()
+
             if (
                 self.custom_llm_provider == "openai"
                 or self.custom_llm_provider == "azure"
@ -10156,6 +10157,7 @@ class CustomStreamWrapper:
or self.custom_llm_provider == "triton" or self.custom_llm_provider == "triton"
or self.custom_llm_provider == "watsonx" or self.custom_llm_provider == "watsonx"
or self.custom_llm_provider in litellm.openai_compatible_endpoints or self.custom_llm_provider in litellm.openai_compatible_endpoints
or self.custom_llm_provider in litellm._custom_providers
): ):
async for chunk in self.completion_stream: async for chunk in self.completion_stream:
print_verbose(f"value of async chunk: {chunk}") print_verbose(f"value of async chunk: {chunk}")