diff --git a/README.md b/README.md index 955d6a62d..164cbb927 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ a light package to simplify calling OpenAI, Azure, Cohere, Anthropic, Huggingfac - exception mapping - common exceptions across providers are mapped to the [OpenAI exception types](https://help.openai.com/en/articles/6897213-openai-library-error-types-guidance) # usage Demo - https://litellm.ai/ \ -Read the docs - https://litellm.readthedocs.io/en/latest/ +Read the docs - https://docs.litellm.ai/docs/ ## quick start ``` diff --git a/cookbook/liteLLM_Ollama.ipynb b/cookbook/liteLLM_Ollama.ipynb new file mode 100644 index 000000000..3aab935f9 --- /dev/null +++ b/cookbook/liteLLM_Ollama.ipynb @@ -0,0 +1,179 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install litellm==0.1.385" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "from litellm import completion\n", + "import asyncio" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Setup Messages" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "user_message = \"respond in 20 words. who are you?\"\n", + "messages = [{ \"content\": user_message,\"role\": \"user\"}]" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Call Ollama - llama2 with chatGPT Input/Output using litellm.completion() " + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "response = completion(model=\"llama2\", messages=messages, custom_api_base=\"http://localhost:11434\", custom_llm_provider=\"ollama\", stream=True)\n", + "print(response)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Iterate through the generator - Streaming" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " I\n", + "{'role': 'assistant', 'content': ' I'}\n", + " am\n", + "{'role': 'assistant', 'content': ' am'}\n", + " L\n", + "{'role': 'assistant', 'content': ' L'}\n", + "La\n", + "{'role': 'assistant', 'content': 'La'}\n", + "MA\n", + "{'role': 'assistant', 'content': 'MA'}\n", + ",\n", + "{'role': 'assistant', 'content': ','}\n", + " an\n", + "{'role': 'assistant', 'content': ' an'}\n", + " A\n", + "{'role': 'assistant', 'content': ' A'}\n", + "I\n", + "{'role': 'assistant', 'content': 'I'}\n", + " assistant\n", + "{'role': 'assistant', 'content': ' assistant'}\n", + " developed\n", + "{'role': 'assistant', 'content': ' developed'}\n", + " by\n", + "{'role': 'assistant', 'content': ' by'}\n", + " Meta\n", + "{'role': 'assistant', 'content': ' Meta'}\n", + " A\n", + "{'role': 'assistant', 'content': ' A'}\n", + "I\n", + "{'role': 'assistant', 'content': 'I'}\n", + " that\n", + "{'role': 'assistant', 'content': ' that'}\n", + " can\n", + "{'role': 'assistant', 'content': ' can'}\n", + " understand\n", + "{'role': 'assistant', 'content': ' understand'}\n", + " and\n", + "{'role': 'assistant', 'content': ' and'}\n", + " respond\n", + "{'role': 'assistant', 'content': ' respond'}\n", + " to\n", + "{'role': 'assistant', 'content': ' to'}\n", + " human\n", + "{'role': 'assistant', 
'content': ' human'}\n", + " input\n", + "{'role': 'assistant', 'content': ' input'}\n", + " in\n", + "{'role': 'assistant', 'content': ' in'}\n", + " a\n", + "{'role': 'assistant', 'content': ' a'}\n", + " convers\n", + "{'role': 'assistant', 'content': ' convers'}\n", + "ational\n", + "{'role': 'assistant', 'content': 'ational'}\n", + " manner\n", + "{'role': 'assistant', 'content': ' manner'}\n", + ".\n", + "{'role': 'assistant', 'content': '.'}\n" + ] + } + ], + "source": [ + "\n", + "async def get_response(generator):\n", + " response = \"\"\n", + " async for elem in generator:\n", + " print(elem)\n", + " response += elem['choices'][0]['delta'][\"content\"]\n", + " return response\n", + "\n", + "string_response = await get_response(response)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/cookbook/liteLLM_OpenAI.ipynb b/cookbook/liteLLM_OpenAI.ipynb new file mode 100644 index 000000000..2842d6e7a --- /dev/null +++ b/cookbook/liteLLM_OpenAI.ipynb @@ -0,0 +1,349 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "MZ01up0p7wOJ" + }, + "source": [ + "## 🚅 liteLLM Demo\n", + "### TLDR: Call 50+ LLM APIs using chatGPT Input/Output format\n", + "https://github.com/BerriAI/litellm\n", + "\n", + "liteLLM is package to simplify calling **OpenAI, Azure, Llama2, Cohere, Anthropic, Huggingface API Endpoints**. 
LiteLLM manages\n", + "\n", + "* Translating inputs to the provider's `completion()` and `embedding()` endpoints\n", + "* Guarantees consistent output, text responses will always be available at `['choices'][0]['message']['content']`\n", + "* Exception mapping - common exceptions across providers are mapped to the OpenAI exception types\n", + "\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "RZtzCnQS7rW-" + }, + "source": [ + "## Installation and setting Params" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "rsrN5W-N7L8d" + }, + "outputs": [], + "source": [ + "!pip install litellm" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "ArrWyG5b7QAG" + }, + "outputs": [], + "source": [ + "from litellm import completion\n", + "import os" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "bbhJRt34_NJ1" + }, + "source": [ + "## Set your API keys\n", + "- liteLLM reads your .env, env variables or key manager for Auth" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "id": "-h8Ga5cR7SvV" + }, + "outputs": [], + "source": [ + "os.environ['OPENAI_API_KEY'] = \"\" #@param\n", + "os.environ[\"ANTHROPIC_API_KEY\"] = \"\" #@param\n", + "os.environ[\"AZURE_API_BASE\"] = \"\" #@param\n", + "os.environ[\"AZURE_API_VERSION\"] = \"\" #@param\n", + "os.environ[\"AZURE_API_KEY\"] = \"\" #@param\n", + "os.environ[\"REPLICATE_API_TOKEN\"] = \"\" #@param\n", + "os.environ[\"COHERE_API_KEY\"] = \"\" #@param\n", + "os.environ[\"HF_TOKEN\"] = \"\" #@param" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "MBujGiby8YBu" + }, + "outputs": [], + "source": [ + "messages = [{ \"content\": \"what's the weather in SF\",\"role\": \"user\"}]" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "fhqpKv6L8fBj" + }, + "source": [ + "## Call chatGPT" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "speIkoX_8db4", + "outputId": "bc804d62-1d33-4198-b6d7-b721961694a3" + }, + "outputs": [ + { + "data": { + "text/plain": [ + " JSON: {\n", + " \"id\": \"chatcmpl-7mrklZEq2zK3Z5pSkOR3Jn54gpN5A\",\n", + " \"object\": \"chat.completion\",\n", + " \"created\": 1691880727,\n", + " \"model\": \"gpt-3.5-turbo-0613\",\n", + " \"choices\": [\n", + " {\n", + " \"index\": 0,\n", + " \"message\": {\n", + " \"role\": \"assistant\",\n", + " \"content\": \"I'm sorry, but as an AI language model, I don't have real-time data. 
However, you can check the current weather in San Francisco by using a weather website or app, or by searching \\\"weather in San Francisco\\\" on a search engine.\"\n", + " },\n", + " \"finish_reason\": \"stop\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 13,\n", + " \"completion_tokens\": 52,\n", + " \"total_tokens\": 65\n", + " }\n", + "}" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "completion(model=\"gpt-3.5-turbo\", messages=messages)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "Q3jV1Uxv8zNo" + }, + "source": [ + "## Call Claude-2" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "V8yTWYzY8m9S", + "outputId": "8b6dd32d-f9bf-4e89-886d-47cb8020f025" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'choices': [{'finish_reason': 'stop',\n", + " 'index': 0,\n", + " 'message': {'role': 'assistant',\n", + " 'content': \" Unfortunately I do not have enough context to provide the current weather in San Francisco. To get the most accurate weather report, it's helpful if I know details like:\\n\\n- Exact location (city name, zip code, etc)\\n- Time frame (current conditions, forecast for a certain day/week, etc)\\n\\nIf you can provide some more specifics about what weather information you need for San Francisco, I'd be happy to look that up for you!\"}}],\n", + " 'created': 1691880836.974166,\n", + " 'model': 'claude-2',\n", + " 'usage': {'prompt_tokens': 18, 'completion_tokens': 95, 'total_tokens': 113}}" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "completion(model=\"claude-2\", messages=messages)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "yu0LPDmW9PJa" + }, + "source": [ + "## Call llama2 on replicate" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "0GWV5mtO9Jbu", + "outputId": "38538825-b271-406d-a437-f5cf0eb7e548" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'choices': [{'finish_reason': 'stop',\n", + " 'index': 0,\n", + " 'message': {'role': 'assistant',\n", + " 'content': ' I\\'m happy to help! However, I must point out that the question \"what\\'s the weather in SF\" doesn\\'t make sense as \"SF\" could refer to multiple locations (San Francisco, South Florida, San Fernando, etc.). Could you please provide more context or specify which location you\\'re referring to? 
That way, I can give you an accurate answer.'}}],\n", + " 'created': 1691880930.9003325,\n", + " 'model': 'replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1',\n", + " 'usage': {'prompt_tokens': 6, 'completion_tokens': 74, 'total_tokens': 80}}" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model = \"replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1\"\n", + "completion(model=model, messages=messages)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "HXdj5SEe9iLK" + }, + "source": [ + "## Call Command-Nightly" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "EaUq2xIx9fhr", + "outputId": "55fe6f52-b58b-4729-948a-74dac4b431b2" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'choices': [{'finish_reason': 'stop',\n", + " 'index': 0,\n", + " 'message': {'role': 'assistant',\n", + " 'content': ' The weather in San Francisco can be quite unpredictable. The city is known for its fog, which can'}}],\n", + " 'created': 1691880972.5565543,\n", + " 'model': 'command-nightly',\n", + " 'usage': {'prompt_tokens': 6, 'completion_tokens': 20, 'total_tokens': 26}}" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "completion(model=\"command-nightly\", messages=messages)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "1g9hSgsL9soJ" + }, + "source": [ + "## Call Azure OpenAI" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "AvLjR-PF-lt0", + "outputId": "deff2db3-b003-48cd-ea62-c03a68a4464a" + }, + "outputs": [ + { + "data": { + "text/plain": [ + " JSON: {\n", + " \"id\": \"chatcmpl-7mrtwvpx3okijXmbt9PEYdPMeE7lH\",\n", + " \"object\": \"chat.completion\",\n", + " \"created\": 1691881296,\n", + " \"model\": \"gpt-35-turbo\",\n", + " \"choices\": [\n", + " {\n", + " \"index\": 0,\n", + " \"finish_reason\": \"stop\",\n", + " \"message\": {\n", + " \"role\": \"assistant\",\n", + " \"content\": \"I'm sorry, as an AI language model, I don't have real-time data. 
However, you can check the weather forecast for San Francisco on websites such as AccuWeather or Weather Channel.\"\n", + " }\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"completion_tokens\": 40,\n", + " \"prompt_tokens\": 14,\n", + " \"total_tokens\": 54\n", + " }\n", + "}" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "completion(deployment_id=\"chatgpt-test\", messages=messages, azure=True)" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/cookbook/proxy-server/.DS_Store b/cookbook/proxy-server/.DS_Store new file mode 100644 index 000000000..739982f14 Binary files /dev/null and b/cookbook/proxy-server/.DS_Store differ diff --git a/litellm/main.py b/litellm/main.py index b788c717c..dfd1d12c2 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -10,6 +10,8 @@ from .llms.anthropic import AnthropicLLM import tiktoken from concurrent.futures import ThreadPoolExecutor encoding = tiktoken.get_encoding("cl100k_base") +from litellm.utils import get_secret, install_and_import, CustomStreamWrapper, read_config_args +from litellm.utils import get_ollama_response_stream, stream_to_string ####### ENVIRONMENT VARIABLES ################### dotenv.load_dotenv() # Loading env variables using dotenv new_response = { @@ -388,6 +390,15 @@ def completion( model_response["created"] = time.time() model_response["model"] = model response = model_response + elif custom_llm_provider == "ollama": + endpoint = litellm.api_base if litellm.api_base is not None else custom_api_base + prompt = " ".join([message["content"] for message in messages]) + + ## LOGGING + logging(model=model, input=prompt, azure=azure, logger_fn=logger_fn) + generator = get_ollama_response_stream(endpoint, model, prompt) + # assume all responses are streamed + return generator else: ## LOGGING logging(model=model, input=messages, custom_llm_provider=custom_llm_provider, logger_fn=logger_fn) diff --git a/litellm/tests/test_ollama.py b/litellm/tests/test_ollama.py new file mode 100644 index 000000000..d95414560 --- /dev/null +++ b/litellm/tests/test_ollama.py @@ -0,0 +1,62 @@ +###### THESE TESTS CAN ONLY RUN LOCALLY WITH THE OLLAMA SERVER RUNNING ###### +# import aiohttp +# import json +# import asyncio +# import requests + +# async def get_ollama_response_stream(api_base="http://localhost:11434", model="llama2", prompt="Why is the sky blue?"): +# session = aiohttp.ClientSession() +# url = f'{api_base}/api/generate' +# data = { +# "model": model, +# "prompt": prompt, +# } + +# response = "" + +# try: +# async with session.post(url, json=data) as resp: +# async for line in resp.content.iter_any(): +# if line: +# try: +# json_chunk = line.decode("utf-8") +# chunks = json_chunk.split("\n") +# for chunk in chunks: +# if chunk.strip() != "": +# j = json.loads(chunk) +# if "response" in j: +# print(j["response"]) +# yield { +# "role": "assistant", +# "content": j["response"] +# } +# # self.responses.append(j["response"]) +# # yield "blank" +# except Exception as e: +# print(f"Error decoding JSON: {e}") +# finally: +# await session.close() + +# # async def get_ollama_response_no_stream(api_base="http://localhost:11434", model="llama2", prompt="Why is the sky blue?"): +# # generator = get_ollama_response_stream(api_base="http://localhost:11434", model="llama2", prompt="Why is the sky blue?") 
+# # response = "" +# # async for elem in generator: +# # print(elem) +# # response += elem["content"] +# # return response + +# # #generator = get_ollama_response_stream() + +# # result = asyncio.run(get_ollama_response_no_stream()) +# # print(result) + +# # # return this generator to the client for streaming requests + + + +# # async def get_response(): +# # global generator +# # async for elem in generator: +# # print(elem) + +# # asyncio.run(get_response()) diff --git a/litellm/tests/test_ollama_local.py b/litellm/tests/test_ollama_local.py new file mode 100644 index 000000000..22544f4cf --- /dev/null +++ b/litellm/tests/test_ollama_local.py @@ -0,0 +1,52 @@ +###### THESE TESTS CAN ONLY RUN LOCALLY WITH THE OLLAMA SERVER RUNNING ###### + +# import sys, os +# import traceback +# from dotenv import load_dotenv +# load_dotenv() +# import os +# sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path +# import pytest +# import litellm +# from litellm import embedding, completion +# import asyncio + + + +# user_message = "respond in 20 words. who are you?" +# messages = [{ "content": user_message,"role": "user"}] + +# async def get_response(generator): +# response = "" +# async for elem in generator: +# print(elem) +# response += elem["content"] +# return response + +# def test_completion_ollama(): +# try: +# response = completion(model="llama2", messages=messages, custom_api_base="http://localhost:11434", custom_llm_provider="ollama") +# print(response) +# string_response = asyncio.run(get_response(response)) +# print(string_response) +# except Exception as e: +# pytest.fail(f"Error occurred: {e}") + + +# # test_completion_ollama() + +# def test_completion_ollama_stream(): +# try: +# response = completion(model="llama2", messages=messages, custom_api_base="http://localhost:11434", custom_llm_provider="ollama", stream=True) +# print(response) +# string_response = asyncio.run(get_response(response)) +# print(string_response) +# except Exception as e: +# pytest.fail(f"Error occurred: {e}") + +# test_completion_ollama_stream() + + + + + diff --git a/litellm/utils.py b/litellm/utils.py index 5dec0f220..1e2420170 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -772,4 +772,44 @@ def read_config_args(config_path): return config except Exception as e: print("An error occurred while reading config:", str(e)) - raise e \ No newline at end of file + raise e + + +########## ollama implementation ############################ +import aiohttp +async def get_ollama_response_stream(api_base="http://localhost:11434", model="llama2", prompt="Why is the sky blue?"): + session = aiohttp.ClientSession() + url = f'{api_base}/api/generate' + data = { + "model": model, + "prompt": prompt, + } + try: + async with session.post(url, json=data) as resp: + async for line in resp.content.iter_any(): + if line: + try: + json_chunk = line.decode("utf-8") + chunks = json_chunk.split("\n") + for chunk in chunks: + if chunk.strip() != "": + j = json.loads(chunk) + if "response" in j: + completion_obj ={ "role": "assistant", "content": ""} + completion_obj["content"] = j["response"] + yield {"choices": [{"delta": completion_obj}]} + # self.responses.append(j["response"]) + # yield "blank" + except Exception as e: + print(f"Error decoding JSON: {e}") + finally: + await session.close() + + +async def stream_to_string(generator): + response = "" + async for chunk in generator: + response += chunk["content"] + return response + + diff --git a/pyproject.toml b/pyproject.toml index 
d0094c107..735003dc7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "litellm" -version = "0.1.383" +version = "0.1.385" description = "Library to easily interface with LLM API providers" authors = ["BerriAI"] license = "MIT License"
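
A minimal usage sketch for the Ollama path added above in `litellm/main.py` and `litellm/utils.py`, mirroring `cookbook/liteLLM_Ollama.ipynb` and `litellm/tests/test_ollama_local.py`. It assumes a local Ollama server on the default `http://localhost:11434` with the `llama2` model pulled, and the `litellm==0.1.385` pin from the notebook; since this provider branch always streams, `completion()` returns an async generator of OpenAI-style deltas:

```python
import asyncio
from litellm import completion

messages = [{"role": "user", "content": "respond in 20 words. who are you?"}]

async def collect(generator):
    # Each chunk has the OpenAI streaming shape:
    # {"choices": [{"delta": {"role": "assistant", "content": "..."}}]}
    text = ""
    async for chunk in generator:
        text += chunk["choices"][0]["delta"]["content"]
    return text

# The ollama provider always streams, so this returns an async generator
response = completion(
    model="llama2",
    messages=messages,
    custom_api_base="http://localhost:11434",  # assumes Ollama's default local endpoint
    custom_llm_provider="ollama",
    stream=True,
)
print(asyncio.run(collect(response)))
```

In a notebook (as in the cookbook example), replace `asyncio.run(collect(response))` with `await collect(response)`, since Jupyter already runs an event loop.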