Merge branch 'main' into multi-class-krrish

2023-08-12 17:47:37 -07:00 · 2023-08-12 17:47:37 -07:00 · 887350d082
commit 887350d082
parent d09aac5d4f 96fe2d7757
9 changed files with 696 additions and 3 deletions
--- a/README.md
+++ b/README.md
@ -13,7 +13,7 @@ a light package to simplify calling OpenAI, Azure, Cohere, Anthropic, Huggingfac
 - exception mapping - common exceptions across providers are mapped to the [OpenAI exception types](https://help.openai.com/en/articles/6897213-openai-library-error-types-guidance)
 # usage
 Demo - https://litellm.ai/ \
-Read the docs - https://litellm.readthedocs.io/en/latest/
+Read the docs - https://docs.litellm.ai/docs/

 ## quick start
 ```
--- a/cookbook/liteLLM_Ollama.ipynb
+++ b/cookbook/liteLLM_Ollama.ipynb
@ -0,0 +1,179 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install litellm==0.1.385"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from litellm import completion\n",
+    "import asyncio"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Setup Messages"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "user_message = \"respond in 20 words. who are you?\"\n",
+    "messages = [{ \"content\": user_message,\"role\": \"user\"}]"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Call Ollama - llama2 with chatGPT Input/Output using litellm.completion() "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<async_generator object get_ollama_response_stream at 0x1069198b0>\n"
+     ]
+    }
+   ],
+   "source": [
+    "response = completion(model=\"llama2\", messages=messages, custom_api_base=\"http://localhost:11434\", custom_llm_provider=\"ollama\", stream=True)\n",
+    "print(response)"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Iterate through the generator - Streaming"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      " I\n",
+      "{'role': 'assistant', 'content': ' I'}\n",
+      " am\n",
+      "{'role': 'assistant', 'content': ' am'}\n",
+      " L\n",
+      "{'role': 'assistant', 'content': ' L'}\n",
+      "La\n",
+      "{'role': 'assistant', 'content': 'La'}\n",
+      "MA\n",
+      "{'role': 'assistant', 'content': 'MA'}\n",
+      ",\n",
+      "{'role': 'assistant', 'content': ','}\n",
+      " an\n",
+      "{'role': 'assistant', 'content': ' an'}\n",
+      " A\n",
+      "{'role': 'assistant', 'content': ' A'}\n",
+      "I\n",
+      "{'role': 'assistant', 'content': 'I'}\n",
+      " assistant\n",
+      "{'role': 'assistant', 'content': ' assistant'}\n",
+      " developed\n",
+      "{'role': 'assistant', 'content': ' developed'}\n",
+      " by\n",
+      "{'role': 'assistant', 'content': ' by'}\n",
+      " Meta\n",
+      "{'role': 'assistant', 'content': ' Meta'}\n",
+      " A\n",
+      "{'role': 'assistant', 'content': ' A'}\n",
+      "I\n",
+      "{'role': 'assistant', 'content': 'I'}\n",
+      " that\n",
+      "{'role': 'assistant', 'content': ' that'}\n",
+      " can\n",
+      "{'role': 'assistant', 'content': ' can'}\n",
+      " understand\n",
+      "{'role': 'assistant', 'content': ' understand'}\n",
+      " and\n",
+      "{'role': 'assistant', 'content': ' and'}\n",
+      " respond\n",
+      "{'role': 'assistant', 'content': ' respond'}\n",
+      " to\n",
+      "{'role': 'assistant', 'content': ' to'}\n",
+      " human\n",
+      "{'role': 'assistant', 'content': ' human'}\n",
+      " input\n",
+      "{'role': 'assistant', 'content': ' input'}\n",
+      " in\n",
+      "{'role': 'assistant', 'content': ' in'}\n",
+      " a\n",
+      "{'role': 'assistant', 'content': ' a'}\n",
+      " convers\n",
+      "{'role': 'assistant', 'content': ' convers'}\n",
+      "ational\n",
+      "{'role': 'assistant', 'content': 'ational'}\n",
+      " manner\n",
+      "{'role': 'assistant', 'content': ' manner'}\n",
+      ".\n",
+      "{'role': 'assistant', 'content': '.'}\n"
+     ]
+    }
+   ],
+   "source": [
+    "\n",
+    "async def get_response(generator):\n",
+    "    response = \"\"\n",
+    "    async for elem in generator:\n",
+    "        print(elem)\n",
+    "        response += elem['choices'][0]['delta'][\"content\"]\n",
+    "    return response\n",
+    "\n",
+    "string_response = await get_response(response)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.6"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/cookbook/liteLLM_OpenAI.ipynb
+++ b/cookbook/liteLLM_OpenAI.ipynb
@ -0,0 +1,349 @@
+{
+  "cells": [
+    {
+      "attachments": {},
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "MZ01up0p7wOJ"
+      },
+      "source": [
+        "## 🚅 liteLLM Demo\n",
+        "### TLDR: Call 50+ LLM APIs using chatGPT Input/Output format\n",
+        "https://github.com/BerriAI/litellm\n",
+        "\n",
+        "liteLLM is a package to simplify calling **OpenAI, Azure, Llama2, Cohere, Anthropic, Huggingface API Endpoints**. LiteLLM manages:\n",
+        "\n",
+        "* Translating inputs to the provider's `completion()` and `embedding()` endpoints\n",
+        "* Guaranteeing consistent output - text responses will always be available at `['choices'][0]['message']['content']`\n",
+        "* Exception mapping - common exceptions across providers are mapped to the OpenAI exception types\n",
+        "\n"
+      ]
+    },
+    {
+      "attachments": {},
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "RZtzCnQS7rW-"
+      },
+      "source": [
+        "## Installation and setting Params"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "rsrN5W-N7L8d"
+      },
+      "outputs": [],
+      "source": [
+        "!pip install litellm"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 2,
+      "metadata": {
+        "id": "ArrWyG5b7QAG"
+      },
+      "outputs": [],
+      "source": [
+        "from litellm import completion\n",
+        "import os"
+      ]
+    },
+    {
+      "attachments": {},
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "bbhJRt34_NJ1"
+      },
+      "source": [
+        "## Set your API keys\n",
+        "- liteLLM reads your .env, env variables or key manager for Auth"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 23,
+      "metadata": {
+        "id": "-h8Ga5cR7SvV"
+      },
+      "outputs": [],
+      "source": [
+        "os.environ['OPENAI_API_KEY'] = \"\" #@param\n",
+        "os.environ[\"ANTHROPIC_API_KEY\"] = \"\" #@param\n",
+        "os.environ[\"AZURE_API_BASE\"] = \"\" #@param\n",
+        "os.environ[\"AZURE_API_VERSION\"] = \"\" #@param\n",
+        "os.environ[\"AZURE_API_KEY\"] = \"\" #@param\n",
+        "os.environ[\"REPLICATE_API_TOKEN\"] = \"\" #@param\n",
+        "os.environ[\"COHERE_API_KEY\"] = \"\" #@param\n",
+        "os.environ[\"HF_TOKEN\"] = \"\" #@param"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 3,
+      "metadata": {
+        "id": "MBujGiby8YBu"
+      },
+      "outputs": [],
+      "source": [
+        "messages = [{ \"content\": \"what's the weather in SF\",\"role\": \"user\"}]"
+      ]
+    },
+    {
+      "attachments": {},
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "fhqpKv6L8fBj"
+      },
+      "source": [
+        "## Call chatGPT"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 9,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "speIkoX_8db4",
+        "outputId": "bc804d62-1d33-4198-b6d7-b721961694a3"
+      },
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "<OpenAIObject chat.completion id=chatcmpl-7mrklZEq2zK3Z5pSkOR3Jn54gpN5A at 0x7f76df70e930> JSON: {\n",
+              "  \"id\": \"chatcmpl-7mrklZEq2zK3Z5pSkOR3Jn54gpN5A\",\n",
+              "  \"object\": \"chat.completion\",\n",
+              "  \"created\": 1691880727,\n",
+              "  \"model\": \"gpt-3.5-turbo-0613\",\n",
+              "  \"choices\": [\n",
+              "    {\n",
+              "      \"index\": 0,\n",
+              "      \"message\": {\n",
+              "        \"role\": \"assistant\",\n",
+              "        \"content\": \"I'm sorry, but as an AI language model, I don't have real-time data. However, you can check the current weather in San Francisco by using a weather website or app, or by searching \\\"weather in San Francisco\\\" on a search engine.\"\n",
+              "      },\n",
+              "      \"finish_reason\": \"stop\"\n",
+              "    }\n",
+              "  ],\n",
+              "  \"usage\": {\n",
+              "    \"prompt_tokens\": 13,\n",
+              "    \"completion_tokens\": 52,\n",
+              "    \"total_tokens\": 65\n",
+              "  }\n",
+              "}"
+            ]
+          },
+          "execution_count": 9,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "completion(model=\"gpt-3.5-turbo\", messages=messages)"
+      ]
+    },
+    {
+      "attachments": {},
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "Q3jV1Uxv8zNo"
+      },
+      "source": [
+        "## Call Claude-2"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 11,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "V8yTWYzY8m9S",
+        "outputId": "8b6dd32d-f9bf-4e89-886d-47cb8020f025"
+      },
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "{'choices': [{'finish_reason': 'stop',\n",
+              "   'index': 0,\n",
+              "   'message': {'role': 'assistant',\n",
+              "    'content': \" Unfortunately I do not have enough context to provide the current weather in San Francisco. To get the most accurate weather report, it's helpful if I know details like:\\n\\n- Exact location (city name, zip code, etc)\\n- Time frame (current conditions, forecast for a certain day/week, etc)\\n\\nIf you can provide some more specifics about what weather information you need for San Francisco, I'd be happy to look that up for you!\"}}],\n",
+              " 'created': 1691880836.974166,\n",
+              " 'model': 'claude-2',\n",
+              " 'usage': {'prompt_tokens': 18, 'completion_tokens': 95, 'total_tokens': 113}}"
+            ]
+          },
+          "execution_count": 11,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "completion(model=\"claude-2\", messages=messages)"
+      ]
+    },
+    {
+      "attachments": {},
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "yu0LPDmW9PJa"
+      },
+      "source": [
+        "## Call llama2 on replicate"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 13,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "0GWV5mtO9Jbu",
+        "outputId": "38538825-b271-406d-a437-f5cf0eb7e548"
+      },
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "{'choices': [{'finish_reason': 'stop',\n",
+              "   'index': 0,\n",
+              "   'message': {'role': 'assistant',\n",
+              "    'content': ' I\\'m happy to help! However, I must point out that the question \"what\\'s the weather in SF\" doesn\\'t make sense as \"SF\" could refer to multiple locations (San Francisco, South Florida, San Fernando, etc.). Could you please provide more context or specify which location you\\'re referring to? That way, I can give you an accurate answer.'}}],\n",
+              " 'created': 1691880930.9003325,\n",
+              " 'model': 'replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1',\n",
+              " 'usage': {'prompt_tokens': 6, 'completion_tokens': 74, 'total_tokens': 80}}"
+            ]
+          },
+          "execution_count": 13,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "model = \"replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1\"\n",
+        "completion(model=model, messages=messages)"
+      ]
+    },
+    {
+      "attachments": {},
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "HXdj5SEe9iLK"
+      },
+      "source": [
+        "## Call Command-Nightly"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 15,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "EaUq2xIx9fhr",
+        "outputId": "55fe6f52-b58b-4729-948a-74dac4b431b2"
+      },
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "{'choices': [{'finish_reason': 'stop',\n",
+              "   'index': 0,\n",
+              "   'message': {'role': 'assistant',\n",
+              "    'content': ' The weather in San Francisco can be quite unpredictable. The city is known for its fog, which can'}}],\n",
+              " 'created': 1691880972.5565543,\n",
+              " 'model': 'command-nightly',\n",
+              " 'usage': {'prompt_tokens': 6, 'completion_tokens': 20, 'total_tokens': 26}}"
+            ]
+          },
+          "execution_count": 15,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "completion(model=\"command-nightly\", messages=messages)"
+      ]
+    },
+    {
+      "attachments": {},
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "1g9hSgsL9soJ"
+      },
+      "source": [
+        "## Call Azure OpenAI"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 24,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "AvLjR-PF-lt0",
+        "outputId": "deff2db3-b003-48cd-ea62-c03a68a4464a"
+      },
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "<OpenAIObject chat.completion id=chatcmpl-7mrtwvpx3okijXmbt9PEYdPMeE7lH at 0x7f76cfb356c0> JSON: {\n",
+              "  \"id\": \"chatcmpl-7mrtwvpx3okijXmbt9PEYdPMeE7lH\",\n",
+              "  \"object\": \"chat.completion\",\n",
+              "  \"created\": 1691881296,\n",
+              "  \"model\": \"gpt-35-turbo\",\n",
+              "  \"choices\": [\n",
+              "    {\n",
+              "      \"index\": 0,\n",
+              "      \"finish_reason\": \"stop\",\n",
+              "      \"message\": {\n",
+              "        \"role\": \"assistant\",\n",
+              "        \"content\": \"I'm sorry, as an AI language model, I don't have real-time data. However, you can check the weather forecast for San Francisco on websites such as AccuWeather or Weather Channel.\"\n",
+              "      }\n",
+              "    }\n",
+              "  ],\n",
+              "  \"usage\": {\n",
+              "    \"completion_tokens\": 40,\n",
+              "    \"prompt_tokens\": 14,\n",
+              "    \"total_tokens\": 54\n",
+              "  }\n",
+              "}"
+            ]
+          },
+          "execution_count": 24,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "completion(deployment_id=\"chatgpt-test\", messages=messages, azure=True)"
+      ]
+    }
+  ],
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
--- a/cookbook/proxy-server/.DS_Store
+++ b/cookbook/proxy-server/.DS_Store
--- a/litellm/main.py
+++ b/litellm/main.py
@ -10,6 +10,8 @@ from .llms.anthropic import AnthropicLLM
 import tiktoken
 from concurrent.futures import ThreadPoolExecutor
 encoding = tiktoken.get_encoding("cl100k_base")
+from litellm.utils import get_secret, install_and_import, CustomStreamWrapper, read_config_args
+from litellm.utils import get_ollama_response_stream, stream_to_string
 ####### ENVIRONMENT VARIABLES ###################
 dotenv.load_dotenv() # Loading env variables using dotenv
 new_response = {
@ -388,6 +390,15 @@ def completion(
      model_response["created"] = time.time()
      model_response["model"] = model
      response = model_response
+    elif custom_llm_provider == "ollama":
+      endpoint = litellm.api_base if litellm.api_base is not None else custom_api_base
+      prompt = " ".join([message["content"] for message in messages])
+
+      ## LOGGING
+      logging(model=model, input=prompt, azure=azure, logger_fn=logger_fn)
+      generator = get_ollama_response_stream(endpoint, model, prompt)
+      # assume all responses are streamed
+      return generator
    else: 
      ## LOGGING
      logging(model=model, input=messages, custom_llm_provider=custom_llm_provider, logger_fn=logger_fn)
--- a/litellm/tests/test_ollama.py
+++ b/litellm/tests/test_ollama.py
@ -0,0 +1,62 @@
+###### THESE TESTS CAN ONLY RUN LOCALLY WITH THE OLLAMA SERVER RUNNING ######
+# import aiohttp
+# import json
+# import asyncio
+# import requests
+
+# async def get_ollama_response_stream(api_base="http://localhost:11434", model="llama2", prompt="Why is the sky blue?"):
+#     session = aiohttp.ClientSession()
+#     url = f'{api_base}/api/generate'
+#     data = {
+#         "model": model,
+#         "prompt": prompt,
+#     }
+
+#     response = ""
+
+#     try:
+#         async with session.post(url, json=data) as resp:
+#             async for line in resp.content.iter_any():
+#                 if line:
+#                     try:
+#                         json_chunk = line.decode("utf-8")
+#                         chunks = json_chunk.split("\n")
+#                         for chunk in chunks:
+#                             if chunk.strip() != "":
+#                                 j = json.loads(chunk)
+#                                 if "response" in j:
+#                                     print(j["response"])
+#                                     yield {
+#                                         "role": "assistant",
+#                                         "content": j["response"]
+#                                     }
+#                                     # self.responses.append(j["response"])
+#                                     # yield "blank"
+#                     except Exception as e:
+#                         print(f"Error decoding JSON: {e}")
+#     finally:
+#         await session.close()
+
+# # async def get_ollama_response_no_stream(api_base="http://localhost:11434", model="llama2", prompt="Why is the sky blue?"):
+# #     generator =  get_ollama_response_stream(api_base="http://localhost:11434", model="llama2", prompt="Why is the sky blue?")
+# #     response = ""
+# #     async for elem in generator:
+# #         print(elem)
+# #         response += elem["content"]
+# #     return response
+
+# # #generator = get_ollama_response_stream()
+
+# # result = asyncio.run(get_ollama_response_no_stream())
+# # print(result)
+
+# # # return this generator to the client for streaming requests
+
+
+
+# # async def get_response():
+# #     global generator
+# #     async for elem in generator:
+# #         print(elem)
+
+# # asyncio.run(get_response())
--- a/litellm/tests/test_ollama_local.py
+++ b/litellm/tests/test_ollama_local.py
@ -0,0 +1,52 @@
+###### THESE TESTS CAN ONLY RUN LOCALLY WITH THE OLLAMA SERVER RUNNING ######
+
+# import sys, os
+# import traceback
+# from dotenv import load_dotenv
+# load_dotenv()
+# import os
+# sys.path.insert(0, os.path.abspath('../..'))  # Adds the parent directory to the system path
+# import pytest
+# import litellm
+# from litellm import embedding, completion
+# import asyncio
+
+
+
+# user_message = "respond in 20 words. who are you?"
+# messages = [{ "content": user_message,"role": "user"}]
+
+# async def get_response(generator):
+#     response = ""
+#     async for elem in generator:
+#         print(elem)
+#         response += elem["content"]
+#     return response
+
+# def test_completion_ollama():
+#     try:
+#         response = completion(model="llama2", messages=messages, custom_api_base="http://localhost:11434", custom_llm_provider="ollama")
+#         print(response)
+#         string_response = asyncio.run(get_response(response))
+#         print(string_response)
+#     except Exception as e:
+#         pytest.fail(f"Error occurred: {e}")
+
+
+# # test_completion_ollama()
+
+# def test_completion_ollama_stream():
+#     try:
+#         response = completion(model="llama2", messages=messages, custom_api_base="http://localhost:11434", custom_llm_provider="ollama", stream=True)
+#         print(response)
+#         string_response = asyncio.run(get_response(response))
+#         print(string_response)
+#     except Exception as e:
+#         pytest.fail(f"Error occurred: {e}")
+
+# test_completion_ollama_stream()
+
+
+
+
+
--- a/litellm/utils.py
+++ b/litellm/utils.py
@ -772,4 +772,44 @@ def read_config_args(config_path):
        return config
    except Exception as e:
        print("An error occurred while reading config:", str(e))
-        raise e
+        raise e
+
+
+########## ollama implementation ############################
+import aiohttp
+def _parse_ollama_line(raw_line):
+    """Turn one raw NDJSON line from Ollama into an OpenAI-style delta chunk.
+
+    Returns ``{"choices": [{"delta": {"role": "assistant", "content": <text>}}]}``
+    when the decoded JSON object has a ``"response"`` key, otherwise ``None``
+    (blank line, object without ``"response"``, or a line that cannot be parsed).
+    Parse failures are printed and skipped so one bad line does not end the stream.
+    """
+    try:
+        text = raw_line.decode("utf-8")
+        if text.strip() == "":
+            return None
+        j = json.loads(text)
+        if "response" in j:
+            return {"choices": [{"delta": {"role": "assistant", "content": j["response"]}}]}
+    except Exception as e:
+        print(f"Error decoding JSON: {e}")
+    return None
+
+
+async def get_ollama_response_stream(api_base="http://localhost:11434", model="llama2", prompt="Why is the sky blue?"):
+    """Stream a completion from an Ollama server as OpenAI-style delta chunks.
+
+    POSTs ``{"model": model, "prompt": prompt}`` to ``{api_base}/api/generate``
+    and reads the response body as newline-delimited JSON.
+
+    Args:
+        api_base: base URL of the Ollama server.
+        model: model name sent in the request payload.
+        prompt: prompt text sent in the request payload.
+
+    Yields:
+        ``{"choices": [{"delta": {"role": "assistant", "content": <text>}}]}``
+        for every JSON object in the body that has a ``"response"`` key.
+    """
+    url = f'{api_base}/api/generate'
+    data = {
+        "model": model,
+        "prompt": prompt,
+    }
+    # The context manager closes the session when the generator finishes or is closed.
+    async with aiohttp.ClientSession() as session:
+        async with session.post(url, json=data) as resp:
+            # iter_any() hands back whatever bytes have arrived, so a JSON line
+            # (or a multi-byte character) may be split across two reads.
+            # Buffer the bytes and only parse lines that are complete.
+            pending = b""
+            async for received in resp.content.iter_any():
+                pending += received
+                *complete_lines, pending = pending.split(b"\n")
+                for raw_line in complete_lines:
+                    parsed = _parse_ollama_line(raw_line)
+                    if parsed is not None:
+                        yield parsed
+            # The last object is not necessarily terminated by a newline.
+            parsed = _parse_ollama_line(pending)
+            if parsed is not None:
+                yield parsed
+
+
+async def stream_to_string(generator):
+    """Drain an async stream of completion chunks into a single string.
+
+    Two chunk shapes are accepted:
+
+    * OpenAI-style deltas, as yielded by ``get_ollama_response_stream``:
+      ``{"choices": [{"delta": {"content": "..."}}]}``
+    * the flat shape this function originally read:
+      ``{"content": "..."}``
+
+    Args:
+        generator: async iterable of chunk dicts.
+
+    Returns:
+        The ``content`` of every chunk concatenated in arrival order
+        (an empty string when the stream yields nothing).
+
+    Raises:
+        KeyError: if a chunk matches neither shape.
+    """
+    pieces = []
+    async for chunk in generator:
+        if "choices" in chunk:
+            # Streaming chunks carry the text under choices[0].delta.content.
+            pieces.append(chunk["choices"][0]["delta"]["content"])
+        else:
+            pieces.append(chunk["content"])
+    return "".join(pieces)
+
+   
--- a/pyproject.toml
+++ b/pyproject.toml
@ -1,6 +1,6 @@
 [tool.poetry]
 name = "litellm"
-version = "0.1.383"
+version = "0.1.385"
 description = "Library to easily interface with LLM API providers"
 authors = ["BerriAI"]
 license = "MIT License"