From 6ab2527fdcdf9cf0c94d129bc0bc2853a6f1f0d3 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Wed, 24 Jul 2024 18:14:49 -0700 Subject: [PATCH 01/28] feat(auth_check.py): support using redis cache for team objects Allows team update / check logic to work across instances instantly --- litellm/proxy/_new_secret_config.yaml | 5 +- litellm/proxy/auth/auth_checks.py | 24 ++++++- .../management_endpoints/team_endpoints.py | 2 + litellm/proxy/utils.py | 2 +- litellm/tests/test_proxy_server.py | 64 +++++++++++++++++++ 5 files changed, 92 insertions(+), 5 deletions(-) diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml index bec92c1e9..13babaac6 100644 --- a/litellm/proxy/_new_secret_config.yaml +++ b/litellm/proxy/_new_secret_config.yaml @@ -4,5 +4,6 @@ model_list: model: "openai/*" # passes our validation check that a real provider is given api_key: "" -general_settings: - completion_model: "gpt-3.5-turbo" \ No newline at end of file +litellm_settings: + cache: True + \ No newline at end of file diff --git a/litellm/proxy/auth/auth_checks.py b/litellm/proxy/auth/auth_checks.py index 91d4b1938..7c5356a37 100644 --- a/litellm/proxy/auth/auth_checks.py +++ b/litellm/proxy/auth/auth_checks.py @@ -370,10 +370,17 @@ async def _cache_team_object( team_id: str, team_table: LiteLLM_TeamTable, user_api_key_cache: DualCache, + proxy_logging_obj: Optional[ProxyLogging], ): key = "team_id:{}".format(team_id) await user_api_key_cache.async_set_cache(key=key, value=team_table) + ## UPDATE REDIS CACHE ## + if proxy_logging_obj is not None: + await proxy_logging_obj.internal_usage_cache.async_set_cache( + key=key, value=team_table + ) + @log_to_opentelemetry async def get_team_object( @@ -395,7 +402,17 @@ async def get_team_object( # check if in cache key = "team_id:{}".format(team_id) - cached_team_obj = await user_api_key_cache.async_get_cache(key=key) + + cached_team_obj: Optional[LiteLLM_TeamTable] = None + ## CHECK REDIS CACHE ## + if proxy_logging_obj is not None: + cached_team_obj = await proxy_logging_obj.internal_usage_cache.async_get_cache( + key=key + ) + + if cached_team_obj is None: + cached_team_obj = await user_api_key_cache.async_get_cache(key=key) + if cached_team_obj is not None: if isinstance(cached_team_obj, dict): return LiteLLM_TeamTable(**cached_team_obj) @@ -413,7 +430,10 @@ async def get_team_object( _response = LiteLLM_TeamTable(**response.dict()) # save the team object to cache await _cache_team_object( - team_id=team_id, team_table=_response, user_api_key_cache=user_api_key_cache + team_id=team_id, + team_table=_response, + user_api_key_cache=user_api_key_cache, + proxy_logging_obj=proxy_logging_obj, ) return _response diff --git a/litellm/proxy/management_endpoints/team_endpoints.py b/litellm/proxy/management_endpoints/team_endpoints.py index 9ba76a203..9c20836d2 100644 --- a/litellm/proxy/management_endpoints/team_endpoints.py +++ b/litellm/proxy/management_endpoints/team_endpoints.py @@ -334,6 +334,7 @@ async def update_team( create_audit_log_for_update, litellm_proxy_admin_name, prisma_client, + proxy_logging_obj, user_api_key_cache, ) @@ -380,6 +381,7 @@ async def update_team( team_id=team_row.team_id, team_table=team_row, user_api_key_cache=user_api_key_cache, + proxy_logging_obj=proxy_logging_obj, ) # Enterprise Feature - Audit Logging. 
Enable with litellm.store_audit_logs = True diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py index b08d7a30f..fc47abf9c 100644 --- a/litellm/proxy/utils.py +++ b/litellm/proxy/utils.py @@ -862,7 +862,7 @@ class PrismaClient: ) """ ) - if ret[0]['sum'] == 6: + if ret[0]["sum"] == 6: print("All necessary views exist!") # noqa return except Exception: diff --git a/litellm/tests/test_proxy_server.py b/litellm/tests/test_proxy_server.py index f3cb69a08..e088f2055 100644 --- a/litellm/tests/test_proxy_server.py +++ b/litellm/tests/test_proxy_server.py @@ -731,3 +731,67 @@ def test_load_router_config(mock_cache, fake_env_vars): # test_load_router_config() + + +@pytest.mark.asyncio +async def test_team_update_redis(): + """ + Tests if team update, updates the redis cache if set + """ + from litellm.caching import DualCache, RedisCache + from litellm.proxy._types import LiteLLM_TeamTable + from litellm.proxy.auth.auth_checks import _cache_team_object + + proxy_logging_obj: ProxyLogging = getattr( + litellm.proxy.proxy_server, "proxy_logging_obj" + ) + + proxy_logging_obj.internal_usage_cache.redis_cache = RedisCache() + + with patch.object( + proxy_logging_obj.internal_usage_cache.redis_cache, + "async_set_cache", + new=MagicMock(), + ) as mock_client: + await _cache_team_object( + team_id="1234", + team_table=LiteLLM_TeamTable(), + user_api_key_cache=DualCache(), + proxy_logging_obj=proxy_logging_obj, + ) + + mock_client.assert_called_once() + + +@pytest.mark.asyncio +async def test_get_team_redis(client_no_auth): + """ + Tests if get_team_object gets value from redis cache, if set + """ + from litellm.caching import DualCache, RedisCache + from litellm.proxy._types import LiteLLM_TeamTable + from litellm.proxy.auth.auth_checks import _cache_team_object, get_team_object + + proxy_logging_obj: ProxyLogging = getattr( + litellm.proxy.proxy_server, "proxy_logging_obj" + ) + + proxy_logging_obj.internal_usage_cache.redis_cache = RedisCache() + + with patch.object( + proxy_logging_obj.internal_usage_cache.redis_cache, + "async_get_cache", + new=AsyncMock(), + ) as mock_client: + try: + await get_team_object( + team_id="1234", + user_api_key_cache=DualCache(), + parent_otel_span=None, + proxy_logging_obj=proxy_logging_obj, + prisma_client=MagicMock(), + ) + except Exception as e: + pass + + mock_client.assert_called_once() From 5553f84d511fc352dc95cbf49ad752eefbfeefa5 Mon Sep 17 00:00:00 2001 From: fracapuano Date: Thu, 25 Jul 2024 19:06:07 +0200 Subject: [PATCH 02/28] fix: now supports single tokens prediction --- litellm/llms/replicate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/llms/replicate.py b/litellm/llms/replicate.py index 1dd29fd7d..0d129ce02 100644 --- a/litellm/llms/replicate.py +++ b/litellm/llms/replicate.py @@ -387,7 +387,7 @@ def process_response( result = " " ## Building RESPONSE OBJECT - if len(result) > 1: + if len(result) >= 1: model_response.choices[0].message.content = result # type: ignore # Calculate usage From 3293ad745805b65b10d42f26477888d27f462f5c Mon Sep 17 00:00:00 2001 From: David Manouchehri Date: Thu, 25 Jul 2024 19:29:55 +0000 Subject: [PATCH 03/28] Add Llama 3.1 405b for Bedrock --- litellm/llms/bedrock_httpx.py | 1 + litellm/model_prices_and_context_window_backup.json | 9 +++++++++ model_prices_and_context_window.json | 9 +++++++++ 3 files changed, 19 insertions(+) diff --git a/litellm/llms/bedrock_httpx.py b/litellm/llms/bedrock_httpx.py index 16c3f60b7..3f06a50b8 100644 --- a/litellm/llms/bedrock_httpx.py +++ 
b/litellm/llms/bedrock_httpx.py @@ -78,6 +78,7 @@ BEDROCK_CONVERSE_MODELS = [ "ai21.jamba-instruct-v1:0", "meta.llama3-1-8b-instruct-v1:0", "meta.llama3-1-70b-instruct-v1:0", + "meta.llama3-1-405b-instruct-v1:0", ] diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json index 667745c30..c05256d34 100644 --- a/litellm/model_prices_and_context_window_backup.json +++ b/litellm/model_prices_and_context_window_backup.json @@ -3731,6 +3731,15 @@ "litellm_provider": "bedrock", "mode": "chat" }, + "meta.llama3-1-405b-instruct-v1:0": { + "max_tokens": 128000, + "max_input_tokens": 128000, + "max_output_tokens": 4096, + "input_cost_per_token": 0.00000532, + "output_cost_per_token": 0.000016, + "litellm_provider": "bedrock", + "mode": "chat" + }, "512-x-512/50-steps/stability.stable-diffusion-xl-v0": { "max_tokens": 77, "max_input_tokens": 77, diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json index 667745c30..c05256d34 100644 --- a/model_prices_and_context_window.json +++ b/model_prices_and_context_window.json @@ -3731,6 +3731,15 @@ "litellm_provider": "bedrock", "mode": "chat" }, + "meta.llama3-1-405b-instruct-v1:0": { + "max_tokens": 128000, + "max_input_tokens": 128000, + "max_output_tokens": 4096, + "input_cost_per_token": 0.00000532, + "output_cost_per_token": 0.000016, + "litellm_provider": "bedrock", + "mode": "chat" + }, "512-x-512/50-steps/stability.stable-diffusion-xl-v0": { "max_tokens": 77, "max_input_tokens": 77, From 5c4ee3ef3c042b40b438e87b22b563cc716afa6a Mon Sep 17 00:00:00 2001 From: David Manouchehri Date: Thu, 25 Jul 2024 20:00:29 +0000 Subject: [PATCH 04/28] Add mistral.mistral-large-2407-v1:0 on Amazon Bedrock. --- litellm/llms/bedrock_httpx.py | 1 + litellm/model_prices_and_context_window_backup.json | 9 +++++++++ model_prices_and_context_window.json | 9 +++++++++ 3 files changed, 19 insertions(+) diff --git a/litellm/llms/bedrock_httpx.py b/litellm/llms/bedrock_httpx.py index 16c3f60b7..59b8acad0 100644 --- a/litellm/llms/bedrock_httpx.py +++ b/litellm/llms/bedrock_httpx.py @@ -78,6 +78,7 @@ BEDROCK_CONVERSE_MODELS = [ "ai21.jamba-instruct-v1:0", "meta.llama3-1-8b-instruct-v1:0", "meta.llama3-1-70b-instruct-v1:0", + "mistral.mistral-large-2407-v1:0", ] diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json index 667745c30..66a5565f3 100644 --- a/litellm/model_prices_and_context_window_backup.json +++ b/litellm/model_prices_and_context_window_backup.json @@ -2996,6 +2996,15 @@ "litellm_provider": "bedrock", "mode": "chat" }, + "mistral.mistral-large-2407-v1:0": { + "max_tokens": 8191, + "max_input_tokens": 128000, + "max_output_tokens": 8191, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.000009, + "litellm_provider": "bedrock", + "mode": "chat" + }, "bedrock/us-west-2/mistral.mixtral-8x7b-instruct-v0:1": { "max_tokens": 8191, "max_input_tokens": 32000, diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json index 667745c30..66a5565f3 100644 --- a/model_prices_and_context_window.json +++ b/model_prices_and_context_window.json @@ -2996,6 +2996,15 @@ "litellm_provider": "bedrock", "mode": "chat" }, + "mistral.mistral-large-2407-v1:0": { + "max_tokens": 8191, + "max_input_tokens": 128000, + "max_output_tokens": 8191, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.000009, + "litellm_provider": "bedrock", + "mode": "chat" + }, 
"bedrock/us-west-2/mistral.mixtral-8x7b-instruct-v0:1": { "max_tokens": 8191, "max_input_tokens": 32000, From 22c66991ed671a544bbf2df6aa6bd0bef1122b34 Mon Sep 17 00:00:00 2001 From: David Manouchehri Date: Thu, 25 Jul 2024 20:36:03 +0000 Subject: [PATCH 05/28] Support tool calling for Llama 3.1 on Amazon bedrock. --- litellm/llms/bedrock_httpx.py | 1 + 1 file changed, 1 insertion(+) diff --git a/litellm/llms/bedrock_httpx.py b/litellm/llms/bedrock_httpx.py index 3f06a50b8..cb3832845 100644 --- a/litellm/llms/bedrock_httpx.py +++ b/litellm/llms/bedrock_httpx.py @@ -1316,6 +1316,7 @@ class AmazonConverseConfig: model.startswith("anthropic") or model.startswith("mistral") or model.startswith("cohere") + or model.startswith("meta.llama3-1") ): supported_params.append("tools") From 64adae6e7fd57a89e7c4693d833c705e169ac579 Mon Sep 17 00:00:00 2001 From: David Manouchehri Date: Thu, 25 Jul 2024 21:06:58 +0000 Subject: [PATCH 06/28] Check for converse support first. --- litellm/utils.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/litellm/utils.py b/litellm/utils.py index e104de958..a597643a6 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -3121,7 +3121,19 @@ def get_optional_params( supported_params = get_supported_openai_params( model=model, custom_llm_provider=custom_llm_provider ) - if "ai21" in model: + if model in litellm.BEDROCK_CONVERSE_MODELS: + _check_valid_arg(supported_params=supported_params) + optional_params = litellm.AmazonConverseConfig().map_openai_params( + model=model, + non_default_params=non_default_params, + optional_params=optional_params, + drop_params=( + drop_params + if drop_params is not None and isinstance(drop_params, bool) + else False + ), + ) + elif "ai21" in model: _check_valid_arg(supported_params=supported_params) # params "maxTokens":200,"temperature":0,"topP":250,"stop_sequences":[], # https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=j2-ultra @@ -3143,17 +3155,6 @@ def get_optional_params( optional_params=optional_params, ) ) - elif model in litellm.BEDROCK_CONVERSE_MODELS: - optional_params = litellm.AmazonConverseConfig().map_openai_params( - model=model, - non_default_params=non_default_params, - optional_params=optional_params, - drop_params=( - drop_params - if drop_params is not None and isinstance(drop_params, bool) - else False - ), - ) else: optional_params = litellm.AmazonAnthropicConfig().map_openai_params( non_default_params=non_default_params, From 6bf1b9353bbc675390cac2a5821eaa76a4788c28 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 25 Jul 2024 15:33:05 -0700 Subject: [PATCH 07/28] feat(custom_llm.py): initial working commit for writing your own custom LLM handler Fixes https://github.com/BerriAI/litellm/issues/4675 Also Addresses https://github.com/BerriAI/litellm/discussions/4677 --- litellm/__init__.py | 9 ++++ litellm/llms/custom_llm.py | 70 ++++++++++++++++++++++++++++++++ litellm/main.py | 15 +++++++ litellm/tests/test_custom_llm.py | 63 ++++++++++++++++++++++++++++ litellm/types/llms/custom_llm.py | 10 +++++ litellm/utils.py | 16 ++++++++ 6 files changed, 183 insertions(+) create mode 100644 litellm/llms/custom_llm.py create mode 100644 litellm/tests/test_custom_llm.py create mode 100644 litellm/types/llms/custom_llm.py diff --git a/litellm/__init__.py b/litellm/__init__.py index 956834afc..0527ef199 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -813,6 +813,7 @@ from .utils import ( ) from .types.utils import 
ImageObject +from .llms.custom_llm import CustomLLM from .llms.huggingface_restapi import HuggingfaceConfig from .llms.anthropic import AnthropicConfig from .llms.databricks import DatabricksConfig, DatabricksEmbeddingConfig @@ -909,3 +910,11 @@ from .cost_calculator import response_cost_calculator, cost_per_token from .types.adapter import AdapterItem adapters: List[AdapterItem] = [] + +### CUSTOM LLMs ### +from .types.llms.custom_llm import CustomLLMItem + +custom_provider_map: List[CustomLLMItem] = [] +_custom_providers: List[str] = ( + [] +) # internal helper util, used to track names of custom providers diff --git a/litellm/llms/custom_llm.py b/litellm/llms/custom_llm.py new file mode 100644 index 000000000..fac1eb293 --- /dev/null +++ b/litellm/llms/custom_llm.py @@ -0,0 +1,70 @@ +# What is this? +## Handler file for a Custom Chat LLM + +""" +- completion +- acompletion +- streaming +- async_streaming +""" + +import copy +import json +import os +import time +import types +from enum import Enum +from functools import partial +from typing import Callable, List, Literal, Optional, Tuple, Union + +import httpx # type: ignore +import requests # type: ignore + +import litellm +from litellm.litellm_core_utils.core_helpers import map_finish_reason +from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler +from litellm.types.llms.databricks import GenericStreamingChunk +from litellm.types.utils import ProviderField +from litellm.utils import CustomStreamWrapper, EmbeddingResponse, ModelResponse, Usage + +from .base import BaseLLM +from .prompt_templates.factory import custom_prompt, prompt_factory + + +class CustomLLMError(Exception): # use this for all your exceptions + def __init__( + self, + status_code, + message, + ): + self.status_code = status_code + self.message = message + super().__init__( + self.message + ) # Call the base class constructor with the parameters it needs + + +def custom_chat_llm_router(): + """ + Routes call to CustomLLM completion/acompletion/streaming/astreaming functions, based on call type + + Validates if response is in expected format + """ + pass + + +class CustomLLM(BaseLLM): + def __init__(self) -> None: + super().__init__() + + def completion(self, *args, **kwargs) -> ModelResponse: + raise CustomLLMError(status_code=500, message="Not implemented yet!") + + def streaming(self, *args, **kwargs): + raise CustomLLMError(status_code=500, message="Not implemented yet!") + + async def acompletion(self, *args, **kwargs) -> ModelResponse: + raise CustomLLMError(status_code=500, message="Not implemented yet!") + + async def astreaming(self, *args, **kwargs): + raise CustomLLMError(status_code=500, message="Not implemented yet!") diff --git a/litellm/main.py b/litellm/main.py index f724a68bd..539c3d3e1 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -107,6 +107,7 @@ from .llms.anthropic_text import AnthropicTextCompletion from .llms.azure import AzureChatCompletion from .llms.azure_text import AzureTextCompletion from .llms.bedrock_httpx import BedrockConverseLLM, BedrockLLM +from .llms.custom_llm import CustomLLM, custom_chat_llm_router from .llms.databricks import DatabricksChatCompletion from .llms.huggingface_restapi import Huggingface from .llms.openai import OpenAIChatCompletion, OpenAITextCompletion @@ -2690,6 +2691,20 @@ def completion( model_response.created = int(time.time()) model_response.model = model response = model_response + elif ( + custom_llm_provider in litellm._custom_providers + ): # Assume custom LLM provider + # 
Get the Custom Handler + custom_handler: Optional[CustomLLM] = None + for item in litellm.custom_provider_map: + if item["provider"] == custom_llm_provider: + custom_handler = item["custom_handler"] + + if custom_handler is None: + raise ValueError( + f"Unable to map your input to a model. Check your input - {args}" + ) + response = custom_handler.completion() else: raise ValueError( f"Unable to map your input to a model. Check your input - {args}" diff --git a/litellm/tests/test_custom_llm.py b/litellm/tests/test_custom_llm.py new file mode 100644 index 000000000..0506986eb --- /dev/null +++ b/litellm/tests/test_custom_llm.py @@ -0,0 +1,63 @@ +# What is this? +## Unit tests for the CustomLLM class + + +import asyncio +import os +import sys +import time +import traceback + +import openai +import pytest + +sys.path.insert( + 0, os.path.abspath("../..") +) # Adds the parent directory to the system path +import os +from collections import defaultdict +from concurrent.futures import ThreadPoolExecutor +from unittest.mock import AsyncMock, MagicMock, patch + +import httpx +from dotenv import load_dotenv + +import litellm +from litellm import CustomLLM, completion, get_llm_provider + + +class MyCustomLLM(CustomLLM): + def completion(self, *args, **kwargs) -> litellm.ModelResponse: + return litellm.completion( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": "Hello world"}], + mock_response="Hi!", + ) # type: ignore + + +def test_get_llm_provider(): + from litellm.utils import custom_llm_setup + + my_custom_llm = MyCustomLLM() + litellm.custom_provider_map = [ + {"provider": "custom_llm", "custom_handler": my_custom_llm} + ] + + custom_llm_setup() + + model, provider, _, _ = get_llm_provider(model="custom_llm/my-fake-model") + + assert provider == "custom_llm" + + +def test_simple_completion(): + my_custom_llm = MyCustomLLM() + litellm.custom_provider_map = [ + {"provider": "custom_llm", "custom_handler": my_custom_llm} + ] + resp = completion( + model="custom_llm/my-fake-model", + messages=[{"role": "user", "content": "Hello world!"}], + ) + + assert resp.choices[0].message.content == "Hi!" diff --git a/litellm/types/llms/custom_llm.py b/litellm/types/llms/custom_llm.py new file mode 100644 index 000000000..d5499a419 --- /dev/null +++ b/litellm/types/llms/custom_llm.py @@ -0,0 +1,10 @@ +from typing import List + +from typing_extensions import Dict, Required, TypedDict, override + +from litellm.llms.custom_llm import CustomLLM + + +class CustomLLMItem(TypedDict): + provider: str + custom_handler: CustomLLM diff --git a/litellm/utils.py b/litellm/utils.py index e104de958..0f1b0315d 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -330,6 +330,18 @@ class Rules: ####### CLIENT ################### # make it easy to log if completion/embedding runs succeeded or failed + see what happened | Non-Blocking +def custom_llm_setup(): + """ + Add custom_llm provider to provider list + """ + for custom_llm in litellm.custom_provider_map: + if custom_llm["provider"] not in litellm.provider_list: + litellm.provider_list.append(custom_llm["provider"]) + + if custom_llm["provider"] not in litellm._custom_providers: + litellm._custom_providers.append(custom_llm["provider"]) + + def function_setup( original_function: str, rules_obj, start_time, *args, **kwargs ): # just run once to check if user wants to send their data anywhere - PostHog/Sentry/Slack/etc. 
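For context, the `custom_llm_setup()` helper added in the hunk above is what makes a registered provider name resolvable at call time. A minimal usage sketch, mirroring the unit test added in this same patch (`EchoLLM` and `echo-llm` are illustrative names, not part of the patch):

```python
# Illustrative sketch only: EchoLLM / "echo-llm" are made-up names for this example.
import litellm
from litellm import CustomLLM
from litellm.utils import custom_llm_setup


class EchoLLM(CustomLLM):
    def completion(self, *args, **kwargs) -> litellm.ModelResponse:
        # Delegate to a mocked OpenAI call so the sketch runs without a real backend.
        return litellm.completion(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": "Hello world"}],
            mock_response="Hi!",
        )  # type: ignore


# Register the handler, then let custom_llm_setup() expose the provider name.
litellm.custom_provider_map = [{"provider": "echo-llm", "custom_handler": EchoLLM()}]
custom_llm_setup()

assert "echo-llm" in litellm.provider_list  # appended by custom_llm_setup()

resp = litellm.completion(
    model="echo-llm/any-model-name",
    messages=[{"role": "user", "content": "Hello!"}],
)
assert resp.choices[0].message.content == "Hi!"
```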
@@ -341,6 +353,10 @@ def function_setup( try: global callback_list, add_breadcrumb, user_logger_fn, Logging + ## CUSTOM LLM SETUP ## + custom_llm_setup() + + ## LOGGING SETUP function_id = kwargs["id"] if "id" in kwargs else None if len(litellm.callbacks) > 0: From 9f97436308de5c1ddc1acf14567b0caf0c23ab2d Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 25 Jul 2024 15:51:39 -0700 Subject: [PATCH 08/28] fix(custom_llm.py): support async completion calls --- litellm/llms/custom_llm.py | 26 +++++++++++++++++--------- litellm/main.py | 10 +++++++++- litellm/tests/test_custom_llm.py | 25 ++++++++++++++++++++++++- 3 files changed, 50 insertions(+), 11 deletions(-) diff --git a/litellm/llms/custom_llm.py b/litellm/llms/custom_llm.py index fac1eb293..5e9933194 100644 --- a/litellm/llms/custom_llm.py +++ b/litellm/llms/custom_llm.py @@ -44,15 +44,6 @@ class CustomLLMError(Exception): # use this for all your exceptions ) # Call the base class constructor with the parameters it needs -def custom_chat_llm_router(): - """ - Routes call to CustomLLM completion/acompletion/streaming/astreaming functions, based on call type - - Validates if response is in expected format - """ - pass - - class CustomLLM(BaseLLM): def __init__(self) -> None: super().__init__() @@ -68,3 +59,20 @@ class CustomLLM(BaseLLM): async def astreaming(self, *args, **kwargs): raise CustomLLMError(status_code=500, message="Not implemented yet!") + + +def custom_chat_llm_router( + async_fn: bool, stream: Optional[bool], custom_llm: CustomLLM +): + """ + Routes call to CustomLLM completion/acompletion/streaming/astreaming functions, based on call type + + Validates if response is in expected format + """ + if async_fn: + if stream: + return custom_llm.astreaming + return custom_llm.acompletion + if stream: + return custom_llm.streaming + return custom_llm.completion diff --git a/litellm/main.py b/litellm/main.py index 539c3d3e1..51e7c611c 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -382,6 +382,7 @@ async def acompletion( or custom_llm_provider == "clarifai" or custom_llm_provider == "watsonx" or custom_llm_provider in litellm.openai_compatible_providers + or custom_llm_provider in litellm._custom_providers ): # currently implemented aiohttp calls for just azure, openai, hf, ollama, vertex ai soon all. init_response = await loop.run_in_executor(None, func_with_context) if isinstance(init_response, dict) or isinstance( @@ -2704,7 +2705,14 @@ def completion( raise ValueError( f"Unable to map your input to a model. Check your input - {args}" ) - response = custom_handler.completion() + + ## ROUTE LLM CALL ## + handler_fn = custom_chat_llm_router( + async_fn=acompletion, stream=stream, custom_llm=custom_handler + ) + + ## CALL FUNCTION + response = handler_fn() else: raise ValueError( f"Unable to map your input to a model. 
Check your input - {args}" diff --git a/litellm/tests/test_custom_llm.py b/litellm/tests/test_custom_llm.py index 0506986eb..fd46c892e 100644 --- a/litellm/tests/test_custom_llm.py +++ b/litellm/tests/test_custom_llm.py @@ -23,7 +23,7 @@ import httpx from dotenv import load_dotenv import litellm -from litellm import CustomLLM, completion, get_llm_provider +from litellm import CustomLLM, acompletion, completion, get_llm_provider class MyCustomLLM(CustomLLM): @@ -35,6 +35,15 @@ class MyCustomLLM(CustomLLM): ) # type: ignore +class MyCustomAsyncLLM(CustomLLM): + async def acompletion(self, *args, **kwargs) -> litellm.ModelResponse: + return litellm.completion( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": "Hello world"}], + mock_response="Hi!", + ) # type: ignore + + def test_get_llm_provider(): from litellm.utils import custom_llm_setup @@ -61,3 +70,17 @@ def test_simple_completion(): ) assert resp.choices[0].message.content == "Hi!" + + +@pytest.mark.asyncio +async def test_simple_acompletion(): + my_custom_llm = MyCustomAsyncLLM() + litellm.custom_provider_map = [ + {"provider": "custom_llm", "custom_handler": my_custom_llm} + ] + resp = await acompletion( + model="custom_llm/my-fake-model", + messages=[{"role": "user", "content": "Hello world!"}], + ) + + assert resp.choices[0].message.content == "Hi!" From b4e3a77ad0b823fb5ab44f6ee92a48e2b929993d Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 25 Jul 2024 16:47:32 -0700 Subject: [PATCH 09/28] feat(utils.py): support sync streaming for custom llm provider --- litellm/__init__.py | 1 + litellm/llms/custom_llm.py | 19 ++++-- litellm/main.py | 8 +++ litellm/tests/test_custom_llm.py | 111 +++++++++++++++++++++++++++++-- litellm/utils.py | 10 ++- 5 files changed, 139 insertions(+), 10 deletions(-) diff --git a/litellm/__init__.py b/litellm/__init__.py index 0527ef199..b6aacad1a 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -913,6 +913,7 @@ adapters: List[AdapterItem] = [] ### CUSTOM LLMs ### from .types.llms.custom_llm import CustomLLMItem +from .types.utils import GenericStreamingChunk custom_provider_map: List[CustomLLMItem] = [] _custom_providers: List[str] = ( diff --git a/litellm/llms/custom_llm.py b/litellm/llms/custom_llm.py index 5e9933194..f00d02ab7 100644 --- a/litellm/llms/custom_llm.py +++ b/litellm/llms/custom_llm.py @@ -15,7 +15,17 @@ import time import types from enum import Enum from functools import partial -from typing import Callable, List, Literal, Optional, Tuple, Union +from typing import ( + Any, + AsyncIterator, + Callable, + Iterator, + List, + Literal, + Optional, + Tuple, + Union, +) import httpx # type: ignore import requests # type: ignore @@ -23,8 +33,7 @@ import requests # type: ignore import litellm from litellm.litellm_core_utils.core_helpers import map_finish_reason from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler -from litellm.types.llms.databricks import GenericStreamingChunk -from litellm.types.utils import ProviderField +from litellm.types.utils import GenericStreamingChunk, ProviderField from litellm.utils import CustomStreamWrapper, EmbeddingResponse, ModelResponse, Usage from .base import BaseLLM @@ -51,13 +60,13 @@ class CustomLLM(BaseLLM): def completion(self, *args, **kwargs) -> ModelResponse: raise CustomLLMError(status_code=500, message="Not implemented yet!") - def streaming(self, *args, **kwargs): + def streaming(self, *args, **kwargs) -> Iterator[GenericStreamingChunk]: raise CustomLLMError(status_code=500, message="Not 
implemented yet!") async def acompletion(self, *args, **kwargs) -> ModelResponse: raise CustomLLMError(status_code=500, message="Not implemented yet!") - async def astreaming(self, *args, **kwargs): + async def astreaming(self, *args, **kwargs) -> AsyncIterator[GenericStreamingChunk]: raise CustomLLMError(status_code=500, message="Not implemented yet!") diff --git a/litellm/main.py b/litellm/main.py index 51e7c611c..c3be01373 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -2713,6 +2713,14 @@ def completion( ## CALL FUNCTION response = handler_fn() + if stream is True: + return CustomStreamWrapper( + completion_stream=response, + model=model, + custom_llm_provider=custom_llm_provider, + logging_obj=logging, + ) + else: raise ValueError( f"Unable to map your input to a model. Check your input - {args}" diff --git a/litellm/tests/test_custom_llm.py b/litellm/tests/test_custom_llm.py index fd46c892e..4cc355e4b 100644 --- a/litellm/tests/test_custom_llm.py +++ b/litellm/tests/test_custom_llm.py @@ -17,13 +17,80 @@ sys.path.insert( import os from collections import defaultdict from concurrent.futures import ThreadPoolExecutor +from typing import Any, AsyncIterator, Iterator, Union from unittest.mock import AsyncMock, MagicMock, patch import httpx from dotenv import load_dotenv import litellm -from litellm import CustomLLM, acompletion, completion, get_llm_provider +from litellm import ( + ChatCompletionDeltaChunk, + ChatCompletionUsageBlock, + CustomLLM, + GenericStreamingChunk, + ModelResponse, + acompletion, + completion, + get_llm_provider, +) +from litellm.utils import ModelResponseIterator + + +class CustomModelResponseIterator: + def __init__(self, streaming_response: Union[Iterator, AsyncIterator]): + self.streaming_response = streaming_response + + def chunk_parser(self, chunk: Any) -> GenericStreamingChunk: + return GenericStreamingChunk( + text="hello world", + tool_use=None, + is_finished=True, + finish_reason="stop", + usage=ChatCompletionUsageBlock( + prompt_tokens=10, completion_tokens=20, total_tokens=30 + ), + index=0, + ) + + # Sync iterator + def __iter__(self): + return self + + def __next__(self) -> GenericStreamingChunk: + try: + chunk: Any = self.streaming_response.__next__() # type: ignore + except StopIteration: + raise StopIteration + except ValueError as e: + raise RuntimeError(f"Error receiving chunk from stream: {e}") + + try: + return self.chunk_parser(chunk=chunk) + except StopIteration: + raise StopIteration + except ValueError as e: + raise RuntimeError(f"Error parsing chunk: {e},\nReceived chunk: {chunk}") + + # Async iterator + def __aiter__(self): + self.async_response_iterator = self.streaming_response.__aiter__() # type: ignore + return self + + async def __anext__(self) -> GenericStreamingChunk: + try: + chunk = await self.async_response_iterator.__anext__() + except StopAsyncIteration: + raise StopAsyncIteration + except ValueError as e: + raise RuntimeError(f"Error receiving chunk from stream: {e}") + + try: + return self.chunk_parser(chunk=chunk) + except StopIteration: + raise StopIteration + except ValueError as e: + raise RuntimeError(f"Error parsing chunk: {e},\nReceived chunk: {chunk}") class MyCustomLLM(CustomLLM): @@ -34,8 +101,6 @@ class MyCustomLLM(CustomLLM): mock_response="Hi!", ) # type: ignore - -class MyCustomAsyncLLM(CustomLLM): async def acompletion(self, *args, **kwargs) -> litellm.ModelResponse: return litellm.completion( model="gpt-3.5-turbo", @@ -43,8 +108,27 @@ class MyCustomAsyncLLM(CustomLLM): mock_response="Hi!", ) # type: 
ignore + def streaming(self, *args, **kwargs) -> Iterator[GenericStreamingChunk]: + generic_streaming_chunk: GenericStreamingChunk = { + "finish_reason": "stop", + "index": 0, + "is_finished": True, + "text": "Hello world", + "tool_use": None, + "usage": {"completion_tokens": 10, "prompt_tokens": 20, "total_tokens": 30}, + } + + completion_stream = ModelResponseIterator( + model_response=generic_streaming_chunk # type: ignore + ) + custom_iterator = CustomModelResponseIterator( + streaming_response=completion_stream + ) + return custom_iterator + def test_get_llm_provider(): + """""" from litellm.utils import custom_llm_setup my_custom_llm = MyCustomLLM() @@ -74,7 +158,7 @@ def test_simple_completion(): @pytest.mark.asyncio async def test_simple_acompletion(): - my_custom_llm = MyCustomAsyncLLM() + my_custom_llm = MyCustomLLM() litellm.custom_provider_map = [ {"provider": "custom_llm", "custom_handler": my_custom_llm} ] @@ -84,3 +168,22 @@ async def test_simple_acompletion(): ) assert resp.choices[0].message.content == "Hi!" + + +def test_simple_completion_streaming(): + my_custom_llm = MyCustomLLM() + litellm.custom_provider_map = [ + {"provider": "custom_llm", "custom_handler": my_custom_llm} + ] + resp = completion( + model="custom_llm/my-fake-model", + messages=[{"role": "user", "content": "Hello world!"}], + stream=True, + ) + + for chunk in resp: + print(chunk) + if chunk.choices[0].finish_reason is None: + assert isinstance(chunk.choices[0].delta.content, str) + else: + assert chunk.choices[0].finish_reason == "stop" diff --git a/litellm/utils.py b/litellm/utils.py index 0f1b0315d..c14ab36dd 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -9262,7 +9262,10 @@ class CustomStreamWrapper: try: # return this for all models completion_obj = {"content": ""} - if self.custom_llm_provider and self.custom_llm_provider == "anthropic": + if self.custom_llm_provider and ( + self.custom_llm_provider == "anthropic" + or self.custom_llm_provider in litellm._custom_providers + ): from litellm.types.utils import GenericStreamingChunk as GChunk if self.received_finish_reason is not None: @@ -10981,3 +10984,8 @@ class ModelResponseIterator: raise StopAsyncIteration self.is_done = True return self.model_response + + +class CustomModelResponseIterator(Iterable): + def __init__(self) -> None: + super().__init__() From 060249c7e0477fee7740a856b4bb7d58ba3c8079 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 25 Jul 2024 17:11:57 -0700 Subject: [PATCH 10/28] feat(utils.py): support async streaming for custom llm provider --- litellm/llms/custom_llm.py | 2 ++ litellm/tests/test_custom_llm.py | 36 ++++++++++++++++++++++++++++++-- litellm/utils.py | 2 ++ 3 files changed, 38 insertions(+), 2 deletions(-) diff --git a/litellm/llms/custom_llm.py b/litellm/llms/custom_llm.py index f00d02ab7..f1b2b28b4 100644 --- a/litellm/llms/custom_llm.py +++ b/litellm/llms/custom_llm.py @@ -17,8 +17,10 @@ from enum import Enum from functools import partial from typing import ( Any, + AsyncGenerator, AsyncIterator, Callable, + Coroutine, Iterator, List, Literal, diff --git a/litellm/tests/test_custom_llm.py b/litellm/tests/test_custom_llm.py index 4cc355e4b..af88b1f3a 100644 --- a/litellm/tests/test_custom_llm.py +++ b/litellm/tests/test_custom_llm.py @@ -17,7 +17,7 @@ sys.path.insert( import os from collections import defaultdict from concurrent.futures import ThreadPoolExecutor -from typing import Any, AsyncIterator, Iterator, Union +from typing import Any, AsyncGenerator, AsyncIterator, Coroutine, Iterator, 
Union from unittest.mock import AsyncMock, MagicMock, patch import httpx @@ -75,7 +75,7 @@ class CustomModelResponseIterator: # Async iterator def __aiter__(self): self.async_response_iterator = self.streaming_response.__aiter__() # type: ignore - return self + return self.streaming_response async def __anext__(self) -> GenericStreamingChunk: try: @@ -126,6 +126,18 @@ class MyCustomLLM(CustomLLM): ) return custom_iterator + async def astreaming(self, *args, **kwargs) -> AsyncIterator[GenericStreamingChunk]: # type: ignore + generic_streaming_chunk: GenericStreamingChunk = { + "finish_reason": "stop", + "index": 0, + "is_finished": True, + "text": "Hello world", + "tool_use": None, + "usage": {"completion_tokens": 10, "prompt_tokens": 20, "total_tokens": 30}, + } + + yield generic_streaming_chunk # type: ignore + def test_get_llm_provider(): """""" @@ -187,3 +199,23 @@ def test_simple_completion_streaming(): assert isinstance(chunk.choices[0].delta.content, str) else: assert chunk.choices[0].finish_reason == "stop" + + +@pytest.mark.asyncio +async def test_simple_completion_async_streaming(): + my_custom_llm = MyCustomLLM() + litellm.custom_provider_map = [ + {"provider": "custom_llm", "custom_handler": my_custom_llm} + ] + resp = await litellm.acompletion( + model="custom_llm/my-fake-model", + messages=[{"role": "user", "content": "Hello world!"}], + stream=True, + ) + + async for chunk in resp: + print(chunk) + if chunk.choices[0].finish_reason is None: + assert isinstance(chunk.choices[0].delta.content, str) + else: + assert chunk.choices[0].finish_reason == "stop" diff --git a/litellm/utils.py b/litellm/utils.py index c14ab36dd..9158afb74 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -10132,6 +10132,7 @@ class CustomStreamWrapper: try: if self.completion_stream is None: await self.fetch_stream() + if ( self.custom_llm_provider == "openai" or self.custom_llm_provider == "azure" @@ -10156,6 +10157,7 @@ class CustomStreamWrapper: or self.custom_llm_provider == "triton" or self.custom_llm_provider == "watsonx" or self.custom_llm_provider in litellm.openai_compatible_endpoints + or self.custom_llm_provider in litellm._custom_providers ): async for chunk in self.completion_stream: print_verbose(f"value of async chunk: {chunk}") From e3142b4294cfd5b0b5219607f99d1b554a2a11ff Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 25 Jul 2024 17:22:57 -0700 Subject: [PATCH 11/28] fix whisper health check with litellm --- litellm/llms/openai.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/litellm/llms/openai.py b/litellm/llms/openai.py index 25e2e518c..2c7a7a4df 100644 --- a/litellm/llms/openai.py +++ b/litellm/llms/openai.py @@ -1,5 +1,6 @@ import hashlib import json +import os import time import traceback import types @@ -1870,6 +1871,16 @@ class OpenAIChatCompletion(BaseLLM): model=model, # type: ignore prompt=prompt, # type: ignore ) + elif mode == "audio_transcription": + # Get the current directory of the file being run + pwd = os.path.dirname(os.path.realpath(__file__)) + file_path = os.path.join(pwd, "../tests/gettysburg.wav") + audio_file = open(file_path, "rb") + completion = await client.audio.transcriptions.with_raw_response.create( + file=audio_file, + model=model, # type: ignore + prompt=prompt, # type: ignore + ) else: raise Exception("mode not set") response = {} From 2432c90515229da4d80d9ec298c315e7c9040a57 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 25 Jul 2024 17:26:14 -0700 Subject: [PATCH 12/28] feat - support health check audio_speech --- 
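Both audio health-check changes (the Whisper transcription fix above and the speech support added below) key off the `mode` field in a model's `model_info`. A rough `config.yaml` sketch, with placeholder model names and env-var reference, that exercises both modes:

```yaml
# Illustrative proxy config sketch; model names and the env-var key are placeholders.
model_list:
  - model_name: whisper
    litellm_params:
      model: openai/whisper-1
      api_key: "os.environ/OPENAI_API_KEY"
    model_info:
      mode: audio_transcription   # routed to the transcription health check
  - model_name: tts
    litellm_params:
      model: openai/tts-1
      api_key: "os.environ/OPENAI_API_KEY"
    model_info:
      mode: audio_speech          # routed to the speech health check
```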
litellm/llms/openai.py | 9 ++++++++- litellm/proxy/proxy_config.yaml | 6 ++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/litellm/llms/openai.py b/litellm/llms/openai.py index 2c7a7a4df..fae8a448a 100644 --- a/litellm/llms/openai.py +++ b/litellm/llms/openai.py @@ -1881,8 +1881,15 @@ class OpenAIChatCompletion(BaseLLM): model=model, # type: ignore prompt=prompt, # type: ignore ) + elif mode == "audio_speech": + # Get the current directory of the file being run + completion = await client.audio.speech.with_raw_response.create( + model=model, # type: ignore + input=prompt, # type: ignore + voice="alloy", + ) else: - raise Exception("mode not set") + raise ValueError("mode not set, passed in mode: " + mode) response = {} if completion is None or not hasattr(completion, "headers"): diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index 0e3f0826e..bd8f5bfd0 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -8,6 +8,12 @@ model_list: litellm_params: model: fireworks_ai/accounts/fireworks/models/llama-v3-70b-instruct api_key: "os.environ/FIREWORKS" + - model_name: tts + litellm_params: + model: openai/tts-1 + api_key: "os.environ/OPENAI_API_KEY" + model_info: + mode: audio_speech general_settings: master_key: sk-1234 alerting: ["slack"] From 3573b47098c52b1dc506e8918b46f5ee471bca28 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 25 Jul 2024 17:29:28 -0700 Subject: [PATCH 13/28] docs add example on using text to speech models --- docs/my-website/docs/proxy/health.md | 57 +++++++++++++++++----------- 1 file changed, 35 insertions(+), 22 deletions(-) diff --git a/docs/my-website/docs/proxy/health.md b/docs/my-website/docs/proxy/health.md index 6d383fc41..632702b91 100644 --- a/docs/my-website/docs/proxy/health.md +++ b/docs/my-website/docs/proxy/health.md @@ -41,28 +41,6 @@ litellm --health } ``` -### Background Health Checks - -You can enable model health checks being run in the background, to prevent each model from being queried too frequently via `/health`. - -Here's how to use it: -1. in the config.yaml add: -``` -general_settings: - background_health_checks: True # enable background health checks - health_check_interval: 300 # frequency of background health checks -``` - -2. Start server -``` -$ litellm /path/to/config.yaml -``` - -3. Query health endpoint: -``` -curl --location 'http://0.0.0.0:4000/health' -``` - ### Embedding Models We need some way to know if the model is an embedding model when running checks, if you have this in your config, specifying mode it makes an embedding health check @@ -124,6 +102,41 @@ model_list: mode: audio_transcription ``` + +### Text to Speech Models + +```yaml +# OpenAI Text to Speech Models + - model_name: tts + litellm_params: + model: openai/tts-1 + api_key: "os.environ/OPENAI_API_KEY" + model_info: + mode: audio_speech +``` + +## Background Health Checks + +You can enable model health checks being run in the background, to prevent each model from being queried too frequently via `/health`. + +Here's how to use it: +1. in the config.yaml add: +``` +general_settings: + background_health_checks: True # enable background health checks + health_check_interval: 300 # frequency of background health checks +``` + +2. Start server +``` +$ litellm /path/to/config.yaml +``` + +3. 
Query health endpoint: +``` +curl --location 'http://0.0.0.0:4000/health' +``` + ### Hide details The health check response contains details like endpoint URLs, error messages, From f2443996d82d50e88ecfbca4efb045fc0522aa84 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 25 Jul 2024 17:30:15 -0700 Subject: [PATCH 14/28] feat support audio health checks for azure --- litellm/llms/azure.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/litellm/llms/azure.py b/litellm/llms/azure.py index a2928cf20..ec143f3fe 100644 --- a/litellm/llms/azure.py +++ b/litellm/llms/azure.py @@ -1864,6 +1864,23 @@ class AzureChatCompletion(BaseLLM): model=model, # type: ignore prompt=prompt, # type: ignore ) + elif mode == "audio_transcription": + # Get the current directory of the file being run + pwd = os.path.dirname(os.path.realpath(__file__)) + file_path = os.path.join(pwd, "../tests/gettysburg.wav") + audio_file = open(file_path, "rb") + completion = await client.audio.transcriptions.with_raw_response.create( + file=audio_file, + model=model, # type: ignore + prompt=prompt, # type: ignore + ) + elif mode == "audio_speech": + # Get the current directory of the file being run + completion = await client.audio.speech.with_raw_response.create( + model=model, # type: ignore + input=prompt, # type: ignore + voice="alloy", + ) else: raise Exception("mode not set") response = {} From 3814170ae17d748110058a0c411ad7eccc786b6a Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 25 Jul 2024 17:41:16 -0700 Subject: [PATCH 15/28] docs - add info about routing strategy on load balancing docs --- docs/my-website/docs/proxy/reliability.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/docs/my-website/docs/proxy/reliability.md b/docs/my-website/docs/proxy/reliability.md index 2404c744c..a3f03b3d7 100644 --- a/docs/my-website/docs/proxy/reliability.md +++ b/docs/my-website/docs/proxy/reliability.md @@ -31,8 +31,19 @@ model_list: api_base: https://openai-france-1234.openai.azure.com/ api_key: rpm: 1440 +routing_strategy: simple-shuffle # Literal["simple-shuffle", "least-busy", "usage-based-routing","latency-based-routing"], default="simple-shuffle" + model_group_alias: {"gpt-4": "gpt-3.5-turbo"} # all requests with `gpt-4` will be routed to models with `gpt-3.5-turbo` + num_retries: 2 + timeout: 30 # 30 seconds + redis_host: # set this when using multiple litellm proxy deployments, load balancing state stored in redis + redis_password: + redis_port: 1992 ``` +:::info +Detailed information about [routing strategies can be found here](../routing) +::: + #### Step 2: Start Proxy with config ```shell From a2d07cfe64e24f2a42612213f46e49114a94ff8e Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 25 Jul 2024 17:41:19 -0700 Subject: [PATCH 16/28] docs(custom_llm_server.md): add calling custom llm server to docs --- .../docs/providers/custom_llm_server.md | 73 ++++++++++ .../docs/providers/custom_openai_proxy.md | 129 ------------------ docs/my-website/sidebars.js | 3 +- 3 files changed, 75 insertions(+), 130 deletions(-) create mode 100644 docs/my-website/docs/providers/custom_llm_server.md delete mode 100644 docs/my-website/docs/providers/custom_openai_proxy.md diff --git a/docs/my-website/docs/providers/custom_llm_server.md b/docs/my-website/docs/providers/custom_llm_server.md new file mode 100644 index 000000000..f8d5fb551 --- /dev/null +++ b/docs/my-website/docs/providers/custom_llm_server.md @@ -0,0 +1,73 @@ +# Custom API Server (Custom Format) + +LiteLLM allows you to call 
your custom endpoint in the OpenAI ChatCompletion format + + +:::info + +For calling an openai-compatible endpoint, [go here](./openai_compatible.md) +::: + +## Quick Start + +```python +import litellm +from litellm import CustomLLM, completion, get_llm_provider + + +class MyCustomLLM(CustomLLM): + def completion(self, *args, **kwargs) -> litellm.ModelResponse: + return litellm.completion( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": "Hello world"}], + mock_response="Hi!", + ) # type: ignore + +litellm.custom_provider_map = [ # 👈 KEY STEP - REGISTER HANDLER + {"provider": "my-custom-llm", "custom_handler": my_custom_llm} + ] + +resp = completion( + model="my-custom-llm/my-fake-model", + messages=[{"role": "user", "content": "Hello world!"}], + ) + +assert resp.choices[0].message.content == "Hi!" +``` + + +## Custom Handler Spec + +```python +from litellm.types.utils import GenericStreamingChunk, ModelResponse +from typing import Iterator, AsyncIterator +from litellm.llms.base import BaseLLM + +class CustomLLMError(Exception): # use this for all your exceptions + def __init__( + self, + status_code, + message, + ): + self.status_code = status_code + self.message = message + super().__init__( + self.message + ) # Call the base class constructor with the parameters it needs + +class CustomLLM(BaseLLM): + def __init__(self) -> None: + super().__init__() + + def completion(self, *args, **kwargs) -> ModelResponse: + raise CustomLLMError(status_code=500, message="Not implemented yet!") + + def streaming(self, *args, **kwargs) -> Iterator[GenericStreamingChunk]: + raise CustomLLMError(status_code=500, message="Not implemented yet!") + + async def acompletion(self, *args, **kwargs) -> ModelResponse: + raise CustomLLMError(status_code=500, message="Not implemented yet!") + + async def astreaming(self, *args, **kwargs) -> AsyncIterator[GenericStreamingChunk]: + raise CustomLLMError(status_code=500, message="Not implemented yet!") +``` \ No newline at end of file diff --git a/docs/my-website/docs/providers/custom_openai_proxy.md b/docs/my-website/docs/providers/custom_openai_proxy.md deleted file mode 100644 index b6f2eccac..000000000 --- a/docs/my-website/docs/providers/custom_openai_proxy.md +++ /dev/null @@ -1,129 +0,0 @@ -# Custom API Server (OpenAI Format) - -LiteLLM allows you to call your custom endpoint in the OpenAI ChatCompletion format - -## API KEYS -No api keys required - -## Set up your Custom API Server -Your server should have the following Endpoints: - -Here's an example OpenAI proxy server with routes: https://replit.com/@BerriAI/openai-proxy#main.py - -### Required Endpoints -- POST `/chat/completions` - chat completions endpoint - -### Optional Endpoints -- POST `/completions` - completions endpoint -- Get `/models` - available models on server -- POST `/embeddings` - creates an embedding vector representing the input text. 
- - -## Example Usage - -### Call `/chat/completions` -In order to use your custom OpenAI Chat Completion proxy with LiteLLM, ensure you set - -* `api_base` to your proxy url, example "https://openai-proxy.berriai.repl.co" -* `custom_llm_provider` to `openai` this ensures litellm uses the `openai.ChatCompletion` to your api_base - -```python -import os -from litellm import completion - -## set ENV variables -os.environ["OPENAI_API_KEY"] = "anything" #key is not used for proxy - -messages = [{ "content": "Hello, how are you?","role": "user"}] - -response = completion( - model="command-nightly", - messages=[{ "content": "Hello, how are you?","role": "user"}], - api_base="https://openai-proxy.berriai.repl.co", - custom_llm_provider="openai" # litellm will use the openai.ChatCompletion to make the request - -) -print(response) -``` - -#### Response -```json -{ - "object": - "chat.completion", - "choices": [{ - "finish_reason": "stop", - "index": 0, - "message": { - "content": - "The sky, a canvas of blue,\nA work of art, pure and true,\nA", - "role": "assistant" - } - }], - "id": - "chatcmpl-7fbd6077-de10-4cb4-a8a4-3ef11a98b7c8", - "created": - 1699290237.408061, - "model": - "togethercomputer/llama-2-70b-chat", - "usage": { - "completion_tokens": 18, - "prompt_tokens": 14, - "total_tokens": 32 - } - } -``` - - -### Call `/completions` -In order to use your custom OpenAI Completion proxy with LiteLLM, ensure you set - -* `api_base` to your proxy url, example "https://openai-proxy.berriai.repl.co" -* `custom_llm_provider` to `text-completion-openai` this ensures litellm uses the `openai.Completion` to your api_base - -```python -import os -from litellm import completion - -## set ENV variables -os.environ["OPENAI_API_KEY"] = "anything" #key is not used for proxy - -messages = [{ "content": "Hello, how are you?","role": "user"}] - -response = completion( - model="command-nightly", - messages=[{ "content": "Hello, how are you?","role": "user"}], - api_base="https://openai-proxy.berriai.repl.co", - custom_llm_provider="text-completion-openai" # litellm will use the openai.Completion to make the request - -) -print(response) -``` - -#### Response -```json -{ - "warning": - "This model version is deprecated. Migrate before January 4, 2024 to avoid disruption of service. 
Learn more https://platform.openai.com/docs/deprecations", - "id": - "cmpl-8HxHqF5dymQdALmLplS0dWKZVFe3r", - "object": - "text_completion", - "created": - 1699290166, - "model": - "text-davinci-003", - "choices": [{ - "text": - "\n\nThe weather in San Francisco varies depending on what time of year and time", - "index": 0, - "logprobs": None, - "finish_reason": "length" - }], - "usage": { - "prompt_tokens": 7, - "completion_tokens": 16, - "total_tokens": 23 - } - } -``` \ No newline at end of file diff --git a/docs/my-website/sidebars.js b/docs/my-website/sidebars.js index d228e09d2..c1ce83068 100644 --- a/docs/my-website/sidebars.js +++ b/docs/my-website/sidebars.js @@ -175,7 +175,8 @@ const sidebars = { "providers/aleph_alpha", "providers/baseten", "providers/openrouter", - "providers/custom_openai_proxy", + // "providers/custom_openai_proxy", + "providers/custom_llm_server", "providers/petals", ], From bd7af04a725e74290aeb0d87889538041aa0cc3a Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 25 Jul 2024 17:56:34 -0700 Subject: [PATCH 17/28] feat(proxy_server.py): support custom llm handler on proxy --- .../docs/providers/custom_llm_server.md | 97 ++++++++++++++++++- litellm/proxy/_new_secret_config.yaml | 9 +- litellm/proxy/custom_handler.py | 21 ++++ litellm/proxy/proxy_server.py | 15 +++ 4 files changed, 140 insertions(+), 2 deletions(-) create mode 100644 litellm/proxy/custom_handler.py diff --git a/docs/my-website/docs/providers/custom_llm_server.md b/docs/my-website/docs/providers/custom_llm_server.md index f8d5fb551..70fc4cea5 100644 --- a/docs/my-website/docs/providers/custom_llm_server.md +++ b/docs/my-website/docs/providers/custom_llm_server.md @@ -35,6 +35,101 @@ resp = completion( assert resp.choices[0].message.content == "Hi!" ``` +## OpenAI Proxy Usage + +1. Setup your `custom_handler.py` file + +```python +import litellm +from litellm import CustomLLM, completion, get_llm_provider + + +class MyCustomLLM(CustomLLM): + def completion(self, *args, **kwargs) -> litellm.ModelResponse: + return litellm.completion( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": "Hello world"}], + mock_response="Hi!", + ) # type: ignore + + async def acompletion(self, *args, **kwargs) -> litellm.ModelResponse: + return litellm.completion( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": "Hello world"}], + mock_response="Hi!", + ) # type: ignore + + +my_custom_llm = MyCustomLLM() +``` + +2. Add to `config.yaml` + +In the config below, we pass + +python_filename: `custom_handler.py` +custom_handler_instance_name: `my_custom_llm`. This is defined in Step 1 + +custom_handler: `custom_handler.my_custom_llm` + +```yaml +model_list: + - model_name: "test-model" + litellm_params: + model: "openai/text-embedding-ada-002" + - model_name: "my-custom-model" + litellm_params: + model: "my-custom-llm/my-model" + +litellm_settings: + custom_provider_map: + - {"provider": "my-custom-llm", "custom_handler": custom_handler.my_custom_llm} +``` + +```bash +litellm --config /path/to/config.yaml +``` + +3. Test it! 
+ +```bash +curl -X POST 'http://0.0.0.0:4000/chat/completions' \ +-H 'Content-Type: application/json' \ +-H 'Authorization: Bearer sk-1234' \ +-d '{ + "model": "my-custom-model", + "messages": [{"role": "user", "content": "Say \"this is a test\" in JSON!"}], +}' +``` + +Expected Response + +``` +{ + "id": "chatcmpl-06f1b9cd-08bc-43f7-9814-a69173921216", + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "message": { + "content": "Hi!", + "role": "assistant", + "tool_calls": null, + "function_call": null + } + } + ], + "created": 1721955063, + "model": "gpt-3.5-turbo", + "object": "chat.completion", + "system_fingerprint": null, + "usage": { + "prompt_tokens": 10, + "completion_tokens": 20, + "total_tokens": 30 + } +} +``` ## Custom Handler Spec @@ -70,4 +165,4 @@ class CustomLLM(BaseLLM): async def astreaming(self, *args, **kwargs) -> AsyncIterator[GenericStreamingChunk]: raise CustomLLMError(status_code=500, message="Not implemented yet!") -``` \ No newline at end of file +``` diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml index a81d133e5..0854f0901 100644 --- a/litellm/proxy/_new_secret_config.yaml +++ b/litellm/proxy/_new_secret_config.yaml @@ -1,4 +1,11 @@ model_list: - model_name: "test-model" litellm_params: - model: "openai/text-embedding-ada-002" \ No newline at end of file + model: "openai/text-embedding-ada-002" + - model_name: "my-custom-model" + litellm_params: + model: "my-custom-llm/my-model" + +litellm_settings: + custom_provider_map: + - {"provider": "my-custom-llm", "custom_handler": custom_handler.my_custom_llm} \ No newline at end of file diff --git a/litellm/proxy/custom_handler.py b/litellm/proxy/custom_handler.py new file mode 100644 index 000000000..56943c34d --- /dev/null +++ b/litellm/proxy/custom_handler.py @@ -0,0 +1,21 @@ +import litellm +from litellm import CustomLLM, completion, get_llm_provider + + +class MyCustomLLM(CustomLLM): + def completion(self, *args, **kwargs) -> litellm.ModelResponse: + return litellm.completion( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": "Hello world"}], + mock_response="Hi!", + ) # type: ignore + + async def acompletion(self, *args, **kwargs) -> litellm.ModelResponse: + return litellm.completion( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": "Hello world"}], + mock_response="Hi!", + ) # type: ignore + + +my_custom_llm = MyCustomLLM() diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index f22f25f73..bad1abae2 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -1507,6 +1507,21 @@ class ProxyConfig: verbose_proxy_logger.debug( f"litellm.post_call_rules: {litellm.post_call_rules}" ) + elif key == "custom_provider_map": + from litellm.utils import custom_llm_setup + + litellm.custom_provider_map = [ + { + "provider": item["provider"], + "custom_handler": get_instance_fn( + value=item["custom_handler"], + config_file_path=config_file_path, + ), + } + for item in value + ] + + custom_llm_setup() elif key == "success_callback": litellm.success_callback = [] From 41abd5124023c931aa7856271d6e5761804358e6 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 25 Jul 2024 19:03:52 -0700 Subject: [PATCH 18/28] fix(custom_llm.py): pass input params to custom llm --- litellm/llms/custom_llm.py | 80 ++++++++++++++++++++++++++-- litellm/main.py | 21 +++++++- litellm/tests/test_custom_llm.py | 91 ++++++++++++++++++++++++++++++-- 3 files changed, 182 insertions(+), 10 deletions(-) diff --git 
a/litellm/llms/custom_llm.py b/litellm/llms/custom_llm.py index f1b2b28b4..47c5a485c 100644 --- a/litellm/llms/custom_llm.py +++ b/litellm/llms/custom_llm.py @@ -59,16 +59,88 @@ class CustomLLM(BaseLLM): def __init__(self) -> None: super().__init__() - def completion(self, *args, **kwargs) -> ModelResponse: + def completion( + self, + model: str, + messages: list, + api_base: str, + custom_prompt_dict: dict, + model_response: ModelResponse, + print_verbose: Callable, + encoding, + api_key, + logging_obj, + optional_params: dict, + acompletion=None, + litellm_params=None, + logger_fn=None, + headers={}, + timeout: Optional[Union[float, httpx.Timeout]] = None, + client: Optional[HTTPHandler] = None, + ) -> ModelResponse: raise CustomLLMError(status_code=500, message="Not implemented yet!") - def streaming(self, *args, **kwargs) -> Iterator[GenericStreamingChunk]: + def streaming( + self, + model: str, + messages: list, + api_base: str, + custom_prompt_dict: dict, + model_response: ModelResponse, + print_verbose: Callable, + encoding, + api_key, + logging_obj, + optional_params: dict, + acompletion=None, + litellm_params=None, + logger_fn=None, + headers={}, + timeout: Optional[Union[float, httpx.Timeout]] = None, + client: Optional[HTTPHandler] = None, + ) -> Iterator[GenericStreamingChunk]: raise CustomLLMError(status_code=500, message="Not implemented yet!") - async def acompletion(self, *args, **kwargs) -> ModelResponse: + async def acompletion( + self, + model: str, + messages: list, + api_base: str, + custom_prompt_dict: dict, + model_response: ModelResponse, + print_verbose: Callable, + encoding, + api_key, + logging_obj, + optional_params: dict, + acompletion=None, + litellm_params=None, + logger_fn=None, + headers={}, + timeout: Optional[Union[float, httpx.Timeout]] = None, + client: Optional[AsyncHTTPHandler] = None, + ) -> ModelResponse: raise CustomLLMError(status_code=500, message="Not implemented yet!") - async def astreaming(self, *args, **kwargs) -> AsyncIterator[GenericStreamingChunk]: + async def astreaming( + self, + model: str, + messages: list, + api_base: str, + custom_prompt_dict: dict, + model_response: ModelResponse, + print_verbose: Callable, + encoding, + api_key, + logging_obj, + optional_params: dict, + acompletion=None, + litellm_params=None, + logger_fn=None, + headers={}, + timeout: Optional[Union[float, httpx.Timeout]] = None, + client: Optional[AsyncHTTPHandler] = None, + ) -> AsyncIterator[GenericStreamingChunk]: raise CustomLLMError(status_code=500, message="Not implemented yet!") diff --git a/litellm/main.py b/litellm/main.py index c3be01373..672029f69 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -2711,8 +2711,27 @@ def completion( async_fn=acompletion, stream=stream, custom_llm=custom_handler ) + headers = headers or litellm.headers + ## CALL FUNCTION - response = handler_fn() + response = handler_fn( + model=model, + messages=messages, + headers=headers, + model_response=model_response, + print_verbose=print_verbose, + api_key=api_key, + api_base=api_base, + acompletion=acompletion, + logging_obj=logging, + optional_params=optional_params, + litellm_params=litellm_params, + logger_fn=logger_fn, + timeout=timeout, # type: ignore + custom_prompt_dict=custom_prompt_dict, + client=client, # pass AsyncOpenAI, OpenAI client + encoding=encoding, + ) if stream is True: return CustomStreamWrapper( completion_stream=response, diff --git a/litellm/tests/test_custom_llm.py b/litellm/tests/test_custom_llm.py index af88b1f3a..a0f8b569e 100644 --- 
a/litellm/tests/test_custom_llm.py +++ b/litellm/tests/test_custom_llm.py @@ -17,7 +17,16 @@ sys.path.insert( import os from collections import defaultdict from concurrent.futures import ThreadPoolExecutor -from typing import Any, AsyncGenerator, AsyncIterator, Coroutine, Iterator, Union +from typing import ( + Any, + AsyncGenerator, + AsyncIterator, + Callable, + Coroutine, + Iterator, + Optional, + Union, +) from unittest.mock import AsyncMock, MagicMock, patch import httpx @@ -94,21 +103,75 @@ class CustomModelResponseIterator: class MyCustomLLM(CustomLLM): - def completion(self, *args, **kwargs) -> litellm.ModelResponse: + def completion( + self, + model: str, + messages: list, + api_base: str, + custom_prompt_dict: dict, + model_response: ModelResponse, + print_verbose: Callable[..., Any], + encoding, + api_key, + logging_obj, + optional_params: dict, + acompletion=None, + litellm_params=None, + logger_fn=None, + headers={}, + timeout: Optional[Union[float, openai.Timeout]] = None, + client: Optional[litellm.HTTPHandler] = None, + ) -> ModelResponse: return litellm.completion( model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hello world"}], mock_response="Hi!", ) # type: ignore - async def acompletion(self, *args, **kwargs) -> litellm.ModelResponse: + async def acompletion( + self, + model: str, + messages: list, + api_base: str, + custom_prompt_dict: dict, + model_response: ModelResponse, + print_verbose: Callable[..., Any], + encoding, + api_key, + logging_obj, + optional_params: dict, + acompletion=None, + litellm_params=None, + logger_fn=None, + headers={}, + timeout: Optional[Union[float, openai.Timeout]] = None, + client: Optional[litellm.AsyncHTTPHandler] = None, + ) -> litellm.ModelResponse: return litellm.completion( model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hello world"}], mock_response="Hi!", ) # type: ignore - def streaming(self, *args, **kwargs) -> Iterator[GenericStreamingChunk]: + def streaming( + self, + model: str, + messages: list, + api_base: str, + custom_prompt_dict: dict, + model_response: ModelResponse, + print_verbose: Callable[..., Any], + encoding, + api_key, + logging_obj, + optional_params: dict, + acompletion=None, + litellm_params=None, + logger_fn=None, + headers={}, + timeout: Optional[Union[float, openai.Timeout]] = None, + client: Optional[litellm.HTTPHandler] = None, + ) -> Iterator[GenericStreamingChunk]: generic_streaming_chunk: GenericStreamingChunk = { "finish_reason": "stop", "index": 0, @@ -126,7 +189,25 @@ class MyCustomLLM(CustomLLM): ) return custom_iterator - async def astreaming(self, *args, **kwargs) -> AsyncIterator[GenericStreamingChunk]: # type: ignore + async def astreaming( # type: ignore + self, + model: str, + messages: list, + api_base: str, + custom_prompt_dict: dict, + model_response: ModelResponse, + print_verbose: Callable[..., Any], + encoding, + api_key, + logging_obj, + optional_params: dict, + acompletion=None, + litellm_params=None, + logger_fn=None, + headers={}, + timeout: Optional[Union[float, openai.Timeout]] = None, + client: Optional[litellm.AsyncHTTPHandler] = None, + ) -> AsyncIterator[GenericStreamingChunk]: # type: ignore generic_streaming_chunk: GenericStreamingChunk = { "finish_reason": "stop", "index": 0, From fcd834b2775c3a8531a57c66b1e9e8847741cfc1 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 25 Jul 2024 19:22:26 -0700 Subject: [PATCH 19/28] fix logfire - don't load_dotenv --- litellm/integrations/logfire_logger.py | 4 ---- 1 file changed, 4 deletions(-) diff --git 
a/litellm/integrations/logfire_logger.py b/litellm/integrations/logfire_logger.py index fa4ab7bd5..5e9267dca 100644 --- a/litellm/integrations/logfire_logger.py +++ b/litellm/integrations/logfire_logger.py @@ -2,10 +2,6 @@ # On success + failure, log events to Logfire import os - -import dotenv - -dotenv.load_dotenv() # Loading env variables using dotenv import traceback import uuid from enum import Enum From bb6f72b315b2bb66140cec6e6b24eab61b35ab1a Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 25 Jul 2024 19:47:54 -0700 Subject: [PATCH 20/28] add mistral sdk usage --- docs/my-website/docs/proxy/quick_start.md | 159 ------------------- docs/my-website/docs/proxy/user_keys.md | 180 ++++++++++++++++++++++ 2 files changed, 180 insertions(+), 159 deletions(-) diff --git a/docs/my-website/docs/proxy/quick_start.md b/docs/my-website/docs/proxy/quick_start.md index 4ee4d8831..31eb52141 100644 --- a/docs/my-website/docs/proxy/quick_start.md +++ b/docs/my-website/docs/proxy/quick_start.md @@ -396,165 +396,6 @@ print(response) - POST `/key/generate` - generate a key to access the proxy -## Using with OpenAI compatible projects -Set `base_url` to the LiteLLM Proxy server - - - - -```python -import openai -client = openai.OpenAI( - api_key="anything", - base_url="http://0.0.0.0:4000" -) - -# request sent to model set on litellm proxy, `litellm --model` -response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [ - { - "role": "user", - "content": "this is a test request, write a short poem" - } -]) - -print(response) - -``` - - - -#### Start the LiteLLM proxy -```shell -litellm --model gpt-3.5-turbo - -#INFO: Proxy running on http://0.0.0.0:4000 -``` - -#### 1. Clone the repo - -```shell -git clone https://github.com/danny-avila/LibreChat.git -``` - - -#### 2. Modify Librechat's `docker-compose.yml` -LiteLLM Proxy is running on port `4000`, set `4000` as the proxy below -```yaml -OPENAI_REVERSE_PROXY=http://host.docker.internal:4000/v1/chat/completions -``` - -#### 3. Save fake OpenAI key in Librechat's `.env` - -Copy Librechat's `.env.example` to `.env` and overwrite the default OPENAI_API_KEY (by default it requires the user to pass a key). -```env -OPENAI_API_KEY=sk-1234 -``` - -#### 4. Run LibreChat: -```shell -docker compose up -``` - - - - -Continue-Dev brings ChatGPT to VSCode. See how to [install it here](https://continue.dev/docs/quickstart). - -In the [config.py](https://continue.dev/docs/reference/Models/openai) set this as your default model. -```python - default=OpenAI( - api_key="IGNORED", - model="fake-model-name", - context_length=2048, # customize if needed for your model - api_base="http://localhost:4000" # your proxy server url - ), -``` - -Credits [@vividfog](https://github.com/ollama/ollama/issues/305#issuecomment-1751848077) for this tutorial. 
- - - - -```shell -$ pip install aider - -$ aider --openai-api-base http://0.0.0.0:4000 --openai-api-key fake-key -``` - - - -```python -pip install pyautogen -``` - -```python -from autogen import AssistantAgent, UserProxyAgent, oai -config_list=[ - { - "model": "my-fake-model", - "api_base": "http://localhost:4000", #litellm compatible endpoint - "api_type": "open_ai", - "api_key": "NULL", # just a placeholder - } -] - -response = oai.Completion.create(config_list=config_list, prompt="Hi") -print(response) # works fine - -llm_config={ - "config_list": config_list, -} - -assistant = AssistantAgent("assistant", llm_config=llm_config) -user_proxy = UserProxyAgent("user_proxy") -user_proxy.initiate_chat(assistant, message="Plot a chart of META and TESLA stock price change YTD.", config_list=config_list) -``` - -Credits [@victordibia](https://github.com/microsoft/autogen/issues/45#issuecomment-1749921972) for this tutorial. - - - -A guidance language for controlling large language models. -https://github.com/guidance-ai/guidance - -**NOTE:** Guidance sends additional params like `stop_sequences` which can cause some models to fail if they don't support it. - -**Fix**: Start your proxy using the `--drop_params` flag - -```shell -litellm --model ollama/codellama --temperature 0.3 --max_tokens 2048 --drop_params -``` - -```python -import guidance - -# set api_base to your proxy -# set api_key to anything -gpt4 = guidance.llms.OpenAI("gpt-4", api_base="http://0.0.0.0:4000", api_key="anything") - -experts = guidance(''' -{{#system~}} -You are a helpful and terse assistant. -{{~/system}} - -{{#user~}} -I want a response to the following question: -{{query}} -Name 3 world-class experts (past or present) who would be great at answering this? -Don't answer the question yet. -{{~/user}} - -{{#assistant~}} -{{gen 'expert_names' temperature=0 max_tokens=300}} -{{~/assistant}} -''', llm=gpt4) - -result = experts(query='How can I be more productive?') -print(result) -``` - - - ## Debugging Proxy Events that occur during normal operation diff --git a/docs/my-website/docs/proxy/user_keys.md b/docs/my-website/docs/proxy/user_keys.md index 00fb3526b..5e57c18b1 100644 --- a/docs/my-website/docs/proxy/user_keys.md +++ b/docs/my-website/docs/proxy/user_keys.md @@ -234,6 +234,26 @@ main(); ``` + + +```python +import os +from mistralai.client import MistralClient +from mistralai.models.chat_completion import ChatMessage + + +client = MistralClient(api_key="sk-1234", endpoint="http://0.0.0.0:4000") +chat_response = client.chat( + model="mistral-small-latest", + messages=[ + {"role": "user", "content": "this is a test request, write a short poem"} + ], +) +print(chat_response.choices[0].message.content) +``` + + + ```python @@ -566,6 +586,166 @@ curl --location 'http://0.0.0.0:4000/moderations' \ ``` +## Using with OpenAI compatible projects +Set `base_url` to the LiteLLM Proxy server + + + + +```python +import openai +client = openai.OpenAI( + api_key="anything", + base_url="http://0.0.0.0:4000" +) + +# request sent to model set on litellm proxy, `litellm --model` +response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [ + { + "role": "user", + "content": "this is a test request, write a short poem" + } +]) + +print(response) + +``` + + + +#### Start the LiteLLM proxy +```shell +litellm --model gpt-3.5-turbo + +#INFO: Proxy running on http://0.0.0.0:4000 +``` + +#### 1. Clone the repo + +```shell +git clone https://github.com/danny-avila/LibreChat.git +``` + + +#### 2. 
Modify Librechat's `docker-compose.yml` +LiteLLM Proxy is running on port `4000`, set `4000` as the proxy below +```yaml +OPENAI_REVERSE_PROXY=http://host.docker.internal:4000/v1/chat/completions +``` + +#### 3. Save fake OpenAI key in Librechat's `.env` + +Copy Librechat's `.env.example` to `.env` and overwrite the default OPENAI_API_KEY (by default it requires the user to pass a key). +```env +OPENAI_API_KEY=sk-1234 +``` + +#### 4. Run LibreChat: +```shell +docker compose up +``` + + + + +Continue-Dev brings ChatGPT to VSCode. See how to [install it here](https://continue.dev/docs/quickstart). + +In the [config.py](https://continue.dev/docs/reference/Models/openai) set this as your default model. +```python + default=OpenAI( + api_key="IGNORED", + model="fake-model-name", + context_length=2048, # customize if needed for your model + api_base="http://localhost:4000" # your proxy server url + ), +``` + +Credits [@vividfog](https://github.com/ollama/ollama/issues/305#issuecomment-1751848077) for this tutorial. + + + + +```shell +$ pip install aider + +$ aider --openai-api-base http://0.0.0.0:4000 --openai-api-key fake-key +``` + + + +```python +pip install pyautogen +``` + +```python +from autogen import AssistantAgent, UserProxyAgent, oai +config_list=[ + { + "model": "my-fake-model", + "api_base": "http://localhost:4000", #litellm compatible endpoint + "api_type": "open_ai", + "api_key": "NULL", # just a placeholder + } +] + +response = oai.Completion.create(config_list=config_list, prompt="Hi") +print(response) # works fine + +llm_config={ + "config_list": config_list, +} + +assistant = AssistantAgent("assistant", llm_config=llm_config) +user_proxy = UserProxyAgent("user_proxy") +user_proxy.initiate_chat(assistant, message="Plot a chart of META and TESLA stock price change YTD.", config_list=config_list) +``` + +Credits [@victordibia](https://github.com/microsoft/autogen/issues/45#issuecomment-1749921972) for this tutorial. + + + +A guidance language for controlling large language models. +https://github.com/guidance-ai/guidance + +**NOTE:** Guidance sends additional params like `stop_sequences` which can cause some models to fail if they don't support it. + +**Fix**: Start your proxy using the `--drop_params` flag + +```shell +litellm --model ollama/codellama --temperature 0.3 --max_tokens 2048 --drop_params +``` + +```python +import guidance + +# set api_base to your proxy +# set api_key to anything +gpt4 = guidance.llms.OpenAI("gpt-4", api_base="http://0.0.0.0:4000", api_key="anything") + +experts = guidance(''' +{{#system~}} +You are a helpful and terse assistant. +{{~/system}} + +{{#user~}} +I want a response to the following question: +{{query}} +Name 3 world-class experts (past or present) who would be great at answering this? +Don't answer the question yet. 
+{{~/user}} + +{{#assistant~}} +{{gen 'expert_names' temperature=0 max_tokens=300}} +{{~/assistant}} +''', llm=gpt4) + +result = experts(query='How can I be more productive?') +print(result) +``` + + + + ## Advanced ### (BETA) Batch Completions - pass multiple models From 68e94f097678f3a32fcd875f8e34a23b2357ed24 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 25 Jul 2024 19:48:54 -0700 Subject: [PATCH 21/28] example mistral sdk --- litellm/proxy/proxy_config.yaml | 4 ++++ litellm/proxy/tests/test_mistral_sdk.py | 13 +++++++++++++ 2 files changed, 17 insertions(+) create mode 100644 litellm/proxy/tests/test_mistral_sdk.py diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index bd8f5bfd0..8dc03d6e0 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -8,6 +8,10 @@ model_list: litellm_params: model: fireworks_ai/accounts/fireworks/models/llama-v3-70b-instruct api_key: "os.environ/FIREWORKS" + - model_name: mistral-small-latest + litellm_params: + model: mistral/mistral-small-latest + api_key: "os.environ/MISTRAL_API_KEY" - model_name: tts litellm_params: model: openai/tts-1 diff --git a/litellm/proxy/tests/test_mistral_sdk.py b/litellm/proxy/tests/test_mistral_sdk.py new file mode 100644 index 000000000..0adc67b93 --- /dev/null +++ b/litellm/proxy/tests/test_mistral_sdk.py @@ -0,0 +1,13 @@ +import os + +from mistralai.client import MistralClient +from mistralai.models.chat_completion import ChatMessage + +client = MistralClient(api_key="sk-1234", endpoint="http://0.0.0.0:4000") +chat_response = client.chat( + model="mistral-small-latest", + messages=[ + {"role": "user", "content": "this is a test request, write a short poem"} + ], +) +print(chat_response.choices[0].message.content) From a2fd8459fc59a670d9c2302d2d3518934da4b7a8 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 25 Jul 2024 19:50:07 -0700 Subject: [PATCH 22/28] fix(utils.py): don't raise error on openai content filter during streaming - return as is Fixes issue where we would raise an error vs. openai who return the chunk with finish reason as 'content_filter' --- litellm/tests/test_streaming.py | 50 +++++++++++++++++++++++++++++++++ litellm/utils.py | 15 ---------- 2 files changed, 50 insertions(+), 15 deletions(-) diff --git a/litellm/tests/test_streaming.py b/litellm/tests/test_streaming.py index 768c8752c..9aebc0f24 100644 --- a/litellm/tests/test_streaming.py +++ b/litellm/tests/test_streaming.py @@ -3248,6 +3248,56 @@ def test_unit_test_custom_stream_wrapper(): assert freq == 1 +def test_unit_test_custom_stream_wrapper_openai(): + """ + Test if last streaming chunk ends with '?', if the message repeats itself. 
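+    Verifies that an OpenAI chunk with finish_reason="content_filter" is passed
+    through as-is by CustomStreamWrapper, instead of raising ContentPolicyViolationError.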
+ """ + litellm.set_verbose = False + chunk = { + "id": "chatcmpl-9mWtyDnikZZoB75DyfUzWUxiiE2Pi", + "choices": [ + litellm.utils.StreamingChoices( + delta=litellm.utils.Delta( + content=None, function_call=None, role=None, tool_calls=None + ), + finish_reason="content_filter", + index=0, + logprobs=None, + ) + ], + "created": 1721353246, + "model": "gpt-3.5-turbo-0613", + "object": "chat.completion.chunk", + "system_fingerprint": None, + "usage": None, + } + chunk = litellm.ModelResponse(**chunk, stream=True) + + completion_stream = ModelResponseIterator(model_response=chunk) + + response = litellm.CustomStreamWrapper( + completion_stream=completion_stream, + model="gpt-3.5-turbo", + custom_llm_provider="azure", + logging_obj=litellm.Logging( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": "Hey"}], + stream=True, + call_type="completion", + start_time=time.time(), + litellm_call_id="12345", + function_id="1245", + ), + ) + + stream_finish_reason: Optional[str] = None + for chunk in response: + assert chunk.choices[0].delta.content is None + if chunk.choices[0].finish_reason is not None: + stream_finish_reason = chunk.choices[0].finish_reason + assert stream_finish_reason == "content_filter" + + def test_aamazing_unit_test_custom_stream_wrapper_n(): """ Test if the translated output maps exactly to the received openai input diff --git a/litellm/utils.py b/litellm/utils.py index 5e4dc4479..87f50f5ed 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -8840,21 +8840,6 @@ class CustomStreamWrapper: if str_line.choices[0].finish_reason: is_finished = True finish_reason = str_line.choices[0].finish_reason - if finish_reason == "content_filter": - if hasattr(str_line.choices[0], "content_filter_result"): - error_message = json.dumps( - str_line.choices[0].content_filter_result - ) - else: - error_message = "{} Response={}".format( - self.custom_llm_provider, str(dict(str_line)) - ) - - raise litellm.ContentPolicyViolationError( - message=error_message, - llm_provider=self.custom_llm_provider, - model=self.model, - ) # checking for logprobs if ( From 646b2d50f9f32686f699d31a0397a95659564f81 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 25 Jul 2024 19:52:53 -0700 Subject: [PATCH 23/28] docs -quick start --- docs/my-website/docs/proxy/quick_start.md | 6 ++++++ docs/my-website/docs/proxy/user_keys.md | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/docs/my-website/docs/proxy/quick_start.md b/docs/my-website/docs/proxy/quick_start.md index 31eb52141..21698bd49 100644 --- a/docs/my-website/docs/proxy/quick_start.md +++ b/docs/my-website/docs/proxy/quick_start.md @@ -255,6 +255,12 @@ litellm --config your_config.yaml ## Using LiteLLM Proxy - Curl Request, OpenAI Package, Langchain +:::info +LiteLLM is compatible with several SDKs - including OpenAI SDK, Anthropic SDK, Mistral SDK, LLamaIndex, Langchain (Js, Python) + +[More examples here](user_keys) +::: + diff --git a/docs/my-website/docs/proxy/user_keys.md b/docs/my-website/docs/proxy/user_keys.md index 5e57c18b1..44e1c8842 100644 --- a/docs/my-website/docs/proxy/user_keys.md +++ b/docs/my-website/docs/proxy/user_keys.md @@ -1,7 +1,7 @@ import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; -# Use with Langchain, OpenAI SDK, LlamaIndex, Instructor, Curl +# 💡 Use with Langchain, OpenAI SDK, LlamaIndex, Instructor, Curl :::info From 826bb125e80d6e27678cc88a45ee0bde71125dd9 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 25 Jul 2024 19:54:40 -0700 Subject: [PATCH 24/28] 
test(test_router.py): handle azure api instability --- litellm/tests/test_router.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/litellm/tests/test_router.py b/litellm/tests/test_router.py index 86506a589..715ba10d5 100644 --- a/litellm/tests/test_router.py +++ b/litellm/tests/test_router.py @@ -1117,6 +1117,8 @@ async def test_aimg_gen_on_router(): assert len(response.data) > 0 router.reset() + except litellm.InternalServerError as e: + pass except Exception as e: if "Your task failed as a result of our safety system." in str(e): pass From c2e309baf36ebe6abcd4b747cade6d637edf7fe6 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 25 Jul 2024 20:05:28 -0700 Subject: [PATCH 25/28] docs using litellm proxy --- docs/my-website/docs/proxy/quick_start.md | 28 +++++++++++++++++++++++ docs/my-website/docs/proxy/user_keys.md | 28 +++++++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/docs/my-website/docs/proxy/quick_start.md b/docs/my-website/docs/proxy/quick_start.md index 21698bd49..9da860b0d 100644 --- a/docs/my-website/docs/proxy/quick_start.md +++ b/docs/my-website/docs/proxy/quick_start.md @@ -388,6 +388,34 @@ print(response) ``` + + + +```python +import os + +from anthropic import Anthropic + +client = Anthropic( + base_url="http://localhost:4000", # proxy endpoint + api_key="sk-s4xN1IiLTCytwtZFJaYQrA", # litellm proxy virtual key +) + +message = client.messages.create( + max_tokens=1024, + messages=[ + { + "role": "user", + "content": "Hello, Claude", + } + ], + model="claude-3-opus-20240229", +) +print(message.content) +``` + + + [**More Info**](./configs.md) diff --git a/docs/my-website/docs/proxy/user_keys.md b/docs/my-website/docs/proxy/user_keys.md index 44e1c8842..7417ef6bd 100644 --- a/docs/my-website/docs/proxy/user_keys.md +++ b/docs/my-website/docs/proxy/user_keys.md @@ -234,6 +234,34 @@ main(); ``` + + + +```python +import os + +from anthropic import Anthropic + +client = Anthropic( + base_url="http://localhost:4000", # proxy endpoint + api_key="sk-s4xN1IiLTCytwtZFJaYQrA", # litellm proxy virtual key +) + +message = client.messages.create( + max_tokens=1024, + messages=[ + { + "role": "user", + "content": "Hello, Claude", + } + ], + model="claude-3-opus-20240229", +) +print(message.content) +``` + + + ```python From 9247fc3c64335b920a55fabef1f72bbe5b4d7c2b Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 25 Jul 2024 20:09:49 -0700 Subject: [PATCH 26/28] deploy link to using litellm --- docs/my-website/docs/proxy/deploy.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/docs/my-website/docs/proxy/deploy.md b/docs/my-website/docs/proxy/deploy.md index ff575f0d4..e8bc432b8 100644 --- a/docs/my-website/docs/proxy/deploy.md +++ b/docs/my-website/docs/proxy/deploy.md @@ -254,6 +254,15 @@ Your OpenAI proxy server is now running on `http://127.0.0.1:4000`. **That's it ! 
That's the quick start to deploy litellm** +## Use with Langchain, OpenAI SDK, LlamaIndex, Instructor, Curl + +:::info +💡 Go here 👉 [to make your first LLM API Request](user_keys) + +LiteLLM is compatible with several SDKs - including OpenAI SDK, Anthropic SDK, Mistral SDK, LLamaIndex, Langchain (Js, Python) + +::: + ## Options to deploy LiteLLM | Docs | When to Use | From 50bf488b58f790d191adfd53963b603ebc216bf4 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 25 Jul 2024 20:10:02 -0700 Subject: [PATCH 27/28] read me link to using litellm --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index 3ac5f0285..f36f189f3 100644 --- a/README.md +++ b/README.md @@ -166,6 +166,10 @@ $ litellm --model huggingface/bigcode/starcoder ### Step 2: Make ChatCompletions Request to Proxy + +> [!IMPORTANT] +> [Use with Langchain (Python, JS), OpenAI SDK (Python, JS) Anthropic SDK, Mistral SDK, LlamaIndex, Instructor, Curl](https://docs.litellm.ai/docs/migration) + ```python import openai # openai v1.0.0+ client = openai.OpenAI(api_key="anything",base_url="http://0.0.0.0:4000") # set proxy to base_url From 4bf9681df4ef2cd1108ff41b56b97cd1b524d5b4 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 25 Jul 2024 20:12:32 -0700 Subject: [PATCH 28/28] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f36f189f3..306f07ec2 100644 --- a/README.md +++ b/README.md @@ -168,7 +168,7 @@ $ litellm --model huggingface/bigcode/starcoder > [!IMPORTANT] -> [Use with Langchain (Python, JS), OpenAI SDK (Python, JS) Anthropic SDK, Mistral SDK, LlamaIndex, Instructor, Curl](https://docs.litellm.ai/docs/migration) +> 💡 [Use LiteLLM Proxy with Langchain (Python, JS), OpenAI SDK (Python, JS) Anthropic SDK, Mistral SDK, LlamaIndex, Instructor, Curl](https://docs.litellm.ai/docs/proxy/user_keys) ```python import openai # openai v1.0.0+