From 5b4ca42de64a6ac628449d4078fc0ef9b52af887 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Fri, 15 Dec 2023 21:51:59 -0800
Subject: [PATCH] docs(routing.md): add docs on using caching groups across
 deployments

---
 docs/my-website/docs/routing.md | 254 +++++++-------------------------
 litellm/main.py                 |   1 +
 2 files changed, 58 insertions(+), 197 deletions(-)

diff --git a/docs/my-website/docs/routing.md b/docs/my-website/docs/routing.md
index 3f55ae28e..5239f7ab7 100644
--- a/docs/my-website/docs/routing.md
+++ b/docs/my-website/docs/routing.md
@@ -366,6 +366,63 @@ router = Router(model_list: Optional[list] = None,
                 cache_responses=True)
 ```
 
+## Caching across model groups
+
+If you want to cache across 2 different model groups (e.g. Azure deployments and OpenAI), use caching groups.
+
+```python
+import litellm, asyncio, time, os, traceback
+from litellm import Router
+
+# set os env
+os.environ["OPENAI_API_KEY"] = ""
+os.environ["AZURE_API_KEY"] = ""
+os.environ["AZURE_API_BASE"] = ""
+os.environ["AZURE_API_VERSION"] = ""
+
+async def test_acompletion_caching_on_router_caching_groups():
+    # tests acompletion + caching on router
+    try:
+        litellm.set_verbose = True
+        model_list = [
+            {
+                "model_name": "openai-gpt-3.5-turbo",
+                "litellm_params": {
+                    "model": "gpt-3.5-turbo-0613",
+                    "api_key": os.getenv("OPENAI_API_KEY"),
+                },
+            },
+            {
+                "model_name": "azure-gpt-3.5-turbo",
+                "litellm_params": {
+                    "model": "azure/chatgpt-v-2",
+                    "api_key": os.getenv("AZURE_API_KEY"),
+                    "api_base": os.getenv("AZURE_API_BASE"),
+                    "api_version": os.getenv("AZURE_API_VERSION")
+                },
+            }
+        ]
+
+        messages = [
+            {"role": "user", "content": f"write a one sentence poem {time.time()}?"}
+        ]
+        start_time = time.time()
+        router = Router(model_list=model_list,
+                        cache_responses=True,
+                        caching_groups=[("openai-gpt-3.5-turbo", "azure-gpt-3.5-turbo")])
+        response1 = await router.acompletion(model="openai-gpt-3.5-turbo", messages=messages, temperature=1)
+        print(f"response1: {response1}")
+        await asyncio.sleep(1) # the cache write is async, sleep so the cache entry gets set
+        response2 = await router.acompletion(model="azure-gpt-3.5-turbo", messages=messages, temperature=1)
+        assert response1.id == response2.id
+        assert len(response1.choices[0].message.content) > 0
+        assert response1.choices[0].message.content == response2.choices[0].message.content
+    except Exception as e:
+        traceback.print_exc()
+
+asyncio.run(test_acompletion_caching_on_router_caching_groups())
+```
+
 #### Default litellm.completion/embedding params
 
 You can also set default params for litellm completion/embedding calls. Here's how to do that:
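The default-params example itself sits in the unchanged lines between the two hunks, so it does not appear in this patch. For illustration only (not part of the commit), here is a minimal sketch of the pattern; it assumes `Router` accepts a `default_litellm_params` dict that is merged into every call made through the router, and the parameter values are placeholders:

```python
# Illustrative sketch (not part of the patch). Assumes Router takes a
# `default_litellm_params` dict whose entries are applied to every
# completion/embedding call routed through it.
import os
from litellm import Router

model_list = [
    {
        "model_name": "gpt-3.5-turbo",
        "litellm_params": {
            "model": "gpt-3.5-turbo",
            "api_key": os.getenv("OPENAI_API_KEY"),
        },
    }
]

router = Router(
    model_list=model_list,
    # hypothetical defaults; every call through this router inherits them
    default_litellm_params={"temperature": 0.2, "max_tokens": 256},
)

response = router.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hello, how's it going?"}],
)
print(f"response: {response}")
```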
@@ -391,200 +448,3 @@ print(f"response: {response}")
 ## Deploy Router
 
 If you want a server to load balance across different LLM APIs, use our [OpenAI Proxy Server](./simple_proxy#load-balancing---multiple-instances-of-1-model)
-
-## Queuing (Beta)
-
-**Never fail a request due to rate limits**
-
-The LiteLLM Queuing endpoints can handle 100+ req/s. We use Celery workers to process requests.
-
-:::info
-
-This is pretty new, and might have bugs. Any contributions to improving our implementation are welcome
-
-:::
-
-
-[**See Code**](https://github.com/BerriAI/litellm/blob/fbf9cab5b9e35df524e2c9953180c58d92e4cd97/litellm/proxy/proxy_server.py#L589)
-
-
-### Quick Start
-
-1. Add Redis credentials in a .env file
-
-```python
-REDIS_HOST="my-redis-endpoint"
-REDIS_PORT="my-redis-port"
-REDIS_PASSWORD="my-redis-password" # [OPTIONAL] if self-hosted
-REDIS_USERNAME="default" # [OPTIONAL] if self-hosted
-```
-
-2. Start litellm server with your model config
-
-```bash
-$ litellm --config /path/to/config.yaml --use_queue
-```
-
-Here's an example config for `gpt-3.5-turbo`
-
-**config.yaml** (This will load balance between OpenAI + Azure endpoints)
-```yaml
-model_list: 
-  - model_name: gpt-3.5-turbo
-    litellm_params: 
-      model: gpt-3.5-turbo
-      api_key: 
-  - model_name: gpt-3.5-turbo
-    litellm_params: 
-      model: azure/chatgpt-v-2 # actual model name
-      api_key: 
-      api_version: 2023-07-01-preview
-      api_base: https://openai-gpt-4-test-v-1.openai.azure.com/
-```
-
-3. Test (in another window) → sends 100 simultaneous requests to the queue
-
-```bash
-$ litellm --test_async --num_requests 100
-```
-
-
-### Available Endpoints
-- `/queue/request` - Queues a /chat/completions request. Returns a job id.
-- `/queue/response/{id}` - Returns the status of a job. If completed, returns the response as well. Potential status's are: `queued` and `finished`.
-
-
-## Hosted Request Queing api.litellm.ai
-Queue your LLM API requests to ensure you're under your rate limits
-- Step 1: Step 1 Add a config to the proxy, generate a temp key
-- Step 2: Queue a request to the proxy, using your generated_key
-- Step 3: Poll the request
-
-
-### Step 1 Add a config to the proxy, generate a temp key
-```python
-import requests
-import time
-import os
-
-# Set the base URL as needed
-base_url = "https://api.litellm.ai"
-
-# Step 1 Add a config to the proxy, generate a temp key
-# use the same model_name to load balance
-config = {
-    "model_list": [
-        {
-            "model_name": "gpt-3.5-turbo",
-            "litellm_params": {
-                "model": "gpt-3.5-turbo",
-                "api_key": os.environ['OPENAI_API_KEY'],
-            }
-        },
-        {
-            "model_name": "gpt-3.5-turbo",
-            "litellm_params": {
-                "model": "azure/chatgpt-v-2",
-                "api_key": "",
-                "api_base": "https://openai-gpt-4-test-v-1.openai.azure.com/",
-                "api_version": "2023-07-01-preview"
-            }
-        }
-    ]
-}
-
-response = requests.post(
-    url=f"{base_url}/key/generate",
-    json={
-        "config": config,
-        "duration": "30d" # default to 30d, set it to 30m if you want a temp 30 minute key
-    },
-    headers={
-        "Authorization": "Bearer sk-hosted-litellm" # this is the key to use api.litellm.ai
-    }
-)
-
-print("\nresponse from generating key", response.text)
-print("\n json response from gen key", response.json())
-
-generated_key = response.json()["key"]
-print("\ngenerated key for proxy", generated_key)
-```
-
-#### Output
-```shell
-response from generating key {"key":"sk-...,"expires":"2023-12-22T03:43:57.615000+00:00"}
-```
-
-### Step 2: Queue a request to the proxy, using your generated_key
-```python
-print("Creating a job on the proxy")
-job_response = requests.post(
-    url=f"{base_url}/queue/request",
-    json={
-        'model': 'gpt-3.5-turbo',
-        'messages': [
-            {'role': 'system', 'content': f'You are a helpful assistant. What is your name'},
-        ],
-    },
-    headers={
-        "Authorization": f"Bearer {generated_key}"
-    }
-)
-print(job_response.status_code)
-print(job_response.text)
-print("\nResponse from creating job", job_response.text)
-job_response = job_response.json()
-job_id = job_response["id"]
-polling_url = job_response["url"]
-polling_url = f"{base_url}{polling_url}"
-print("\nCreated Job, Polling Url", polling_url)
-```
-
-#### Output
-```shell
-Response from creating job
-{"id":"0e3d9e98-5d56-4d07-9cc8-c34b7e6658d7","url":"/queue/response/0e3d9e98-5d56-4d07-9cc8-c34b7e6658d7","eta":5,"status":"queued"}
-```
-
-### Step 3: Poll the request
-```python
-while True:
-    try:
-        print("\nPolling URL", polling_url)
-        polling_response = requests.get(
-            url=polling_url,
-            headers={
-                "Authorization": f"Bearer {generated_key}"
-            }
-        )
-        print("\nResponse from polling url", polling_response.text)
-        polling_response = polling_response.json()
-        status = polling_response.get("status", None)
-        if status == "finished":
-            llm_response = polling_response["result"]
-            print("LLM Response")
-            print(llm_response)
-            break
-        time.sleep(0.5)
-    except Exception as e:
-        print("got exception in polling", e)
-        break
-```
-
-#### Output
-```shell
-Polling URL https://api.litellm.ai/queue/response/0e3d9e98-5d56-4d07-9cc8-c34b7e6658d7
-
-Response from polling url {"status":"queued"}
-
-Polling URL https://api.litellm.ai/queue/response/0e3d9e98-5d56-4d07-9cc8-c34b7e6658d7
-
-Response from polling url {"status":"queued"}
-
-Polling URL https://api.litellm.ai/queue/response/0e3d9e98-5d56-4d07-9cc8-c34b7e6658d7
-
-Response from polling url
-{"status":"finished","result":{"id":"chatcmpl-8NYRce4IeI4NzYyodT3NNp8fk5cSW","choices":[{"finish_reason":"stop","index":0,"message":{"content":"I am an AI assistant and do not have a physical presence or personal identity. You can simply refer to me as \"Assistant.\" How may I assist you today?","role":"assistant"}}],"created":1700624639,"model":"gpt-3.5-turbo-0613","object":"chat.completion","system_fingerprint":null,"usage":{"completion_tokens":33,"prompt_tokens":17,"total_tokens":50}}}
-
-```
\ No newline at end of file
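For the self-hosted queue (`litellm --config /path/to/config.yaml --use_queue`), the removed section lists `/queue/request` and `/queue/response/{id}` but only demonstrates the hosted api.litellm.ai flow. Here is a sketch of the same queue-then-poll pattern against a local proxy; the base URL and the absence of an auth header are assumptions, and the payload and polling fields mirror the hosted walkthrough above:

```python
# Hypothetical client for a locally running, queue-enabled proxy
# (`litellm --config /path/to/config.yaml --use_queue`).
# The base URL is an assumption; request/response shapes follow the
# hosted api.litellm.ai example in the section above.
import time
import requests

base_url = "http://0.0.0.0:8000"  # assumed local proxy address

# POST /queue/request -> returns a job id and a relative polling url
job = requests.post(
    url=f"{base_url}/queue/request",
    json={
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user", "content": "Hello, what is your name?"}],
    },
).json()

polling_url = f"{base_url}{job['url']}"

# GET /queue/response/{id} until the status flips from `queued` to `finished`
while True:
    poll = requests.get(polling_url).json()
    if poll.get("status") == "finished":
        print(poll["result"])
        break
    time.sleep(0.5)
```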
diff --git a/litellm/main.py b/litellm/main.py
index d128fc14b..d666cfb2c 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -14,6 +14,7 @@ import dotenv, traceback, random, asyncio, time, contextvars
 from copy import deepcopy
 import httpx
 import litellm
+
 from litellm import (  # type: ignore
     client,
     exception_type,
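The substantive change in this patch is the `caching_groups` documentation added in the first hunk. As a closing illustration (not part of the commit), here is a stripped-down synchronous sketch of the same feature; model names, keys, and the in-memory cache default are placeholders, and the short sleep mirrors the docs example's note that the cache write can be asynchronous:

```python
# Minimal sketch of caching across two model groups (one OpenAI, one Azure deployment).
# Placeholders throughout; the async test added in the first hunk is the authoritative example.
import os, time
from litellm import Router

model_list = [
    {
        "model_name": "openai-gpt-3.5-turbo",
        "litellm_params": {
            "model": "gpt-3.5-turbo",
            "api_key": os.getenv("OPENAI_API_KEY"),
        },
    },
    {
        "model_name": "azure-gpt-3.5-turbo",
        "litellm_params": {
            "model": "azure/chatgpt-v-2",
            "api_key": os.getenv("AZURE_API_KEY"),
            "api_base": os.getenv("AZURE_API_BASE"),
            "api_version": os.getenv("AZURE_API_VERSION"),
        },
    },
]

router = Router(
    model_list=model_list,
    cache_responses=True,  # in-memory cache unless redis settings are passed
    caching_groups=[("openai-gpt-3.5-turbo", "azure-gpt-3.5-turbo")],  # share one cache entry
)

messages = [{"role": "user", "content": "write a one sentence poem"}]

first = router.completion(model="openai-gpt-3.5-turbo", messages=messages)
time.sleep(1)  # give the (possibly async) cache write a moment to land
second = router.completion(model="azure-gpt-3.5-turbo", messages=messages)

print(first.id == second.id)  # True when the second call is served from the shared cache
```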