From 87df233a19d95ac6ce2fdb9a37b6bfc6b27dde40 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Sat, 16 Dec 2023 10:31:46 -0800
Subject: [PATCH] fix(health.md): add background health check details to docs

---
 docs/my-website/docs/proxy/call_hooks.md     |   2 +-
 docs/my-website/docs/proxy/health.md         |  62 ++++++++++
 docs/my-website/docs/proxy/load_balancing.md | 125 -------------------
 docs/my-website/docs/proxy/reliability.md    |  89 ++++++++++++++
 docs/my-website/sidebars.js                  |   2 +
 litellm/llms/ollama.py                       |  60 ----------
 6 files changed, 154 insertions(+), 186 deletions(-)
 create mode 100644 docs/my-website/docs/proxy/health.md
 create mode 100644 docs/my-website/docs/proxy/reliability.md

diff --git a/docs/my-website/docs/proxy/call_hooks.md b/docs/my-website/docs/proxy/call_hooks.md
index 2728529c24..a92b94a865 100644
--- a/docs/my-website/docs/proxy/call_hooks.md
+++ b/docs/my-website/docs/proxy/call_hooks.md
@@ -1,4 +1,4 @@
-# Call Hooks - Modify Data
+# Modify Incoming Data
 
 Modify data just before making litellm completion calls call on proxy
 
diff --git a/docs/my-website/docs/proxy/health.md b/docs/my-website/docs/proxy/health.md
new file mode 100644
index 0000000000..5dffd71000
--- /dev/null
+++ b/docs/my-website/docs/proxy/health.md
@@ -0,0 +1,62 @@
+# Health Checks
+Use this to health check all LLMs defined in your config.yaml
+
+## Summary
+
+The proxy exposes:
+* a `/health` endpoint, which returns the health of the LLM APIs
+* a `/test` endpoint, which pings the litellm server
+
+#### Request
+Make a GET request to `/health` on the proxy
+```shell
+curl --location 'http://0.0.0.0:8000/health'
+```
+
+You can also run `litellm --health`, which makes a GET request to `http://0.0.0.0:8000/health` for you
+```shell
+litellm --health
+```
+#### Response
+```json
+{
+    "healthy_endpoints": [
+        {
+            "model": "azure/gpt-35-turbo",
+            "api_base": "https://my-endpoint-canada-berri992.openai.azure.com/"
+        },
+        {
+            "model": "azure/gpt-35-turbo",
+            "api_base": "https://my-endpoint-europe-berri-992.openai.azure.com/"
+        }
+    ],
+    "unhealthy_endpoints": [
+        {
+            "model": "azure/gpt-35-turbo",
+            "api_base": "https://openai-france-1234.openai.azure.com/"
+        }
+    ]
+}
+```
+
+## Background Health Checks
+
+You can run model health checks in the background, to prevent each model from being queried too frequently via `/health`.
+
+Here's how to use it:
+1. In your config.yaml, add:
+```yaml
+general_settings:
+  background_health_checks: True # enable background health checks
+  health_check_interval: 300 # frequency of background health checks
+```
+
+2. Start the server:
+```shell
+$ litellm --config /path/to/config.yaml
+```
+
+3. Query the health endpoint:
+```shell
+curl --location 'http://0.0.0.0:8000/health'
+```
\ No newline at end of file
diff --git a/docs/my-website/docs/proxy/load_balancing.md b/docs/my-website/docs/proxy/load_balancing.md
index 786e1887fd..e223c2d5a3 100644
--- a/docs/my-website/docs/proxy/load_balancing.md
+++ b/docs/my-website/docs/proxy/load_balancing.md
@@ -96,129 +96,4 @@ router_settings:
   routing_strategy: least-busy # Literal["simple-shuffle", "least-busy", "usage-based-routing", "latency-based-routing"]
   num_retries: 2
   timeout: 30 # 30 seconds
-```
-
-## Fallbacks + Cooldowns + Retries + Timeouts
-
-If a call fails after num_retries, fall back to another model group.
-
-If the error is a context window exceeded error, fall back to a larger model group (if given).
-
-[**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm/router.py)
-
-**Set via config**
-```yaml
-model_list:
-  - model_name: zephyr-beta
-    litellm_params:
-      model: huggingface/HuggingFaceH4/zephyr-7b-beta
-      api_base: http://0.0.0.0:8001
-  - model_name: zephyr-beta
-    litellm_params:
-      model: huggingface/HuggingFaceH4/zephyr-7b-beta
-      api_base: http://0.0.0.0:8002
-  - model_name: zephyr-beta
-    litellm_params:
-      model: huggingface/HuggingFaceH4/zephyr-7b-beta
-      api_base: http://0.0.0.0:8003
-  - model_name: gpt-3.5-turbo
-    litellm_params:
-      model: gpt-3.5-turbo
-      api_key: 
-  - model_name: gpt-3.5-turbo-16k
-    litellm_params:
-      model: gpt-3.5-turbo-16k
-      api_key: 
-
-litellm_settings:
-  num_retries: 3 # retry call 3 times on each model_name (e.g. zephyr-beta)
-  request_timeout: 10 # raise Timeout error if call takes longer than 10s. Sets litellm.request_timeout
-  fallbacks: [{"zephyr-beta": ["gpt-3.5-turbo"]}] # fallback to gpt-3.5-turbo if call fails num_retries
-  context_window_fallbacks: [{"zephyr-beta": ["gpt-3.5-turbo-16k"]}, {"gpt-3.5-turbo": ["gpt-3.5-turbo-16k"]}] # fallback to gpt-3.5-turbo-16k if context window error
-  allowed_fails: 3 # cooldown model if it fails > 1 call in a minute.
-```
-
-**Set dynamically**
-
-```bash
-curl --location 'http://0.0.0.0:8000/chat/completions' \
---header 'Content-Type: application/json' \
---data ' {
-  "model": "zephyr-beta",
-  "messages": [
-      {
-        "role": "user",
-        "content": "what llm are you"
-      }
-  ],
-  "fallbacks": [{"zephyr-beta": ["gpt-3.5-turbo"]}],
-  "context_window_fallbacks": [{"zephyr-beta": ["gpt-3.5-turbo"]}],
-  "num_retries": 2,
-  "timeout": 10
-}
-'
-```
-
-## Custom Timeouts, Stream Timeouts - Per Model
-For each model you can set `timeout` & `stream_timeout` under `litellm_params`
-```yaml
-model_list:
-  - model_name: gpt-3.5-turbo
-    litellm_params:
-      model: azure/gpt-turbo-small-eu
-      api_base: https://my-endpoint-europe-berri-992.openai.azure.com/
-      api_key: 
-      timeout: 0.1 # timeout in (seconds)
-      stream_timeout: 0.01 # timeout for stream requests (seconds)
-      max_retries: 5
-  - model_name: gpt-3.5-turbo
-    litellm_params:
-      model: azure/gpt-turbo-small-ca
-      api_base: https://my-endpoint-canada-berri992.openai.azure.com/
-      api_key: 
-      timeout: 0.1 # timeout in (seconds)
-      stream_timeout: 0.01 # timeout for stream requests (seconds)
-      max_retries: 5
-
-```
-
-#### Start Proxy
-```shell
-$ litellm --config /path/to/config.yaml
-```
-
-
-
-## Health Check LLMs on Proxy
-Use this to health check all LLMs defined in your config.yaml
-#### Request
-Make a GET Request to `/health` on the proxy
-```shell
-curl --location 'http://0.0.0.0:8000/health'
-```
-
-You can also run `litellm -health` it makes a `get` request to `http://0.0.0.0:8000/health` for you
-```
-litellm --health
-```
-#### Response
-```shell
-{
-    "healthy_endpoints": [
-        {
-            "model": "azure/gpt-35-turbo",
-            "api_base": "https://my-endpoint-canada-berri992.openai.azure.com/"
-        },
-        {
-            "model": "azure/gpt-35-turbo",
-            "api_base": "https://my-endpoint-europe-berri-992.openai.azure.com/"
-        }
-    ],
-    "unhealthy_endpoints": [
-        {
-            "model": "azure/gpt-35-turbo",
-            "api_base": "https://openai-france-1234.openai.azure.com/"
-        }
-    ]
-}
 ```
\ No newline at end of file
diff --git a/docs/my-website/docs/proxy/reliability.md b/docs/my-website/docs/proxy/reliability.md
new file mode 100644
index 0000000000..75f43bcdc3
--- /dev/null
+++ b/docs/my-website/docs/proxy/reliability.md
@@ -0,0 +1,89 @@
+# Fallbacks, Retries, Timeouts, Cooldowns
+
+If a call fails after num_retries, fall back to another model group.
+
+If the error is a context window exceeded error, fall back to a larger model group (if given).
+
+[**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm/router.py)
+
+**Set via config**
+```yaml
+model_list:
+  - model_name: zephyr-beta
+    litellm_params:
+      model: huggingface/HuggingFaceH4/zephyr-7b-beta
+      api_base: http://0.0.0.0:8001
+  - model_name: zephyr-beta
+    litellm_params:
+      model: huggingface/HuggingFaceH4/zephyr-7b-beta
+      api_base: http://0.0.0.0:8002
+  - model_name: zephyr-beta
+    litellm_params:
+      model: huggingface/HuggingFaceH4/zephyr-7b-beta
+      api_base: http://0.0.0.0:8003
+  - model_name: gpt-3.5-turbo
+    litellm_params:
+      model: gpt-3.5-turbo
+      api_key: 
+  - model_name: gpt-3.5-turbo-16k
+    litellm_params:
+      model: gpt-3.5-turbo-16k
+      api_key: 
+
+litellm_settings:
+  num_retries: 3 # retry call 3 times on each model_name (e.g. zephyr-beta)
+  request_timeout: 10 # raise Timeout error if call takes longer than 10s. Sets litellm.request_timeout
+  fallbacks: [{"zephyr-beta": ["gpt-3.5-turbo"]}] # fallback to gpt-3.5-turbo if call fails num_retries
+  context_window_fallbacks: [{"zephyr-beta": ["gpt-3.5-turbo-16k"]}, {"gpt-3.5-turbo": ["gpt-3.5-turbo-16k"]}] # fallback to gpt-3.5-turbo-16k if context window error
+  allowed_fails: 3 # cooldown the model if it fails > 3 calls in a minute
+```
+
+**Set dynamically**
+
+```bash
+curl --location 'http://0.0.0.0:8000/chat/completions' \
+--header 'Content-Type: application/json' \
+--data ' {
+  "model": "zephyr-beta",
+  "messages": [
+      {
+        "role": "user",
+        "content": "what llm are you"
+      }
+  ],
+  "fallbacks": [{"zephyr-beta": ["gpt-3.5-turbo"]}],
+  "context_window_fallbacks": [{"zephyr-beta": ["gpt-3.5-turbo"]}],
+  "num_retries": 2,
+  "timeout": 10
+}
+'
+```
+
+## Custom Timeouts, Stream Timeouts - Per Model
+For each model, you can set `timeout` & `stream_timeout` under `litellm_params`.
+```yaml
+model_list:
+  - model_name: gpt-3.5-turbo
+    litellm_params:
+      model: azure/gpt-turbo-small-eu
+      api_base: https://my-endpoint-europe-berri-992.openai.azure.com/
+      api_key: 
+      timeout: 0.1 # timeout (in seconds)
+      stream_timeout: 0.01 # timeout for stream requests (seconds)
+      max_retries: 5
+  - model_name: gpt-3.5-turbo
+    litellm_params:
+      model: azure/gpt-turbo-small-ca
+      api_base: https://my-endpoint-canada-berri992.openai.azure.com/
+      api_key: 
+      timeout: 0.1 # timeout (in seconds)
+      stream_timeout: 0.01 # timeout for stream requests (seconds)
+      max_retries: 5
+
+```
+
+#### Start Proxy
+```shell
+$ litellm --config /path/to/config.yaml
+```
+
diff --git a/docs/my-website/sidebars.js b/docs/my-website/sidebars.js
index 6ac6a0822f..069faa48aa 100644
--- a/docs/my-website/sidebars.js
+++ b/docs/my-website/sidebars.js
@@ -103,6 +103,8 @@ const sidebars = {
         "proxy/load_balancing",
         "proxy/virtual_keys",
         "proxy/model_management",
+        "proxy/reliability",
+        "proxy/health",
         "proxy/call_hooks",
         "proxy/caching",
         "proxy/logging",
diff --git a/litellm/llms/ollama.py b/litellm/llms/ollama.py
index 032c7a048d..30aaa53817 100644
--- a/litellm/llms/ollama.py
+++ b/litellm/llms/ollama.py
@@ -248,63 +248,3 @@ async def ollama_acompletion(url, data, model_response, encoding, logging_obj):
         return model_response
     except Exception as e:
         traceback.print_exc()
-
-    # ollama implementation
-    @async_generator
-    async def async_get_ollama_response_stream(
-        api_base="http://localhost:11434",
-        model="llama2",
-        prompt="Why is the sky blue?",
-        optional_params=None,
-        logging_obj=None,
-    ):
-        url = f"{api_base}/api/generate"
-
-        ## Load Config
-        config=litellm.OllamaConfig.get_config()
-        for k, v in config.items():
-            if k not in optional_params: # completion(top_k=3) > cohere_config(top_k=3) <- allows for dynamic variables to be passed in
-                optional_params[k] = v
-
-        data = {
-            "model": model,
-            "prompt": prompt,
-            **optional_params
-        }
-        ## LOGGING
-        logging_obj.pre_call(
-            input=None,
-            api_key=None,
-            additional_args={"api_base": url, "complete_input_dict": data},
-        )
-        session = requests.Session()
-
-        with session.post(url, json=data, stream=True) as resp:
-            if resp.status_code != 200:
-                raise OllamaError(status_code=resp.status_code, message=resp.text)
-            for line in resp.iter_lines():
-                if line:
-                    try:
-                        json_chunk = line.decode("utf-8")
-                        chunks = json_chunk.split("\n")
-                        for chunk in chunks:
-                            if chunk.strip() != "":
-                                j = json.loads(chunk)
-                                if "error" in j:
-                                    completion_obj = {
-                                        "role": "assistant",
-                                        "content": "",
-                                        "error": j
-                                    }
-                                    await yield_({"choices": [{"delta": completion_obj}]})
-                                if "response" in j:
-                                    completion_obj = {
-                                        "role": "assistant",
-                                        "content": "",
-                                    }
-                                    completion_obj["content"] = j["response"]
-                                    await yield_({"choices": [{"delta": completion_obj}]})
-                    except Exception as e:
-                        import logging
-                        logging.debug(f"Error decoding JSON: {e}")
-        session.close()
\ No newline at end of file