From 87df233a19d95ac6ce2fdb9a37b6bfc6b27dde40 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Sat, 16 Dec 2023 10:31:46 -0800
Subject: [PATCH] fix(health.md): add background health check details to docs

---
 docs/my-website/docs/proxy/call_hooks.md     |   2 +-
 docs/my-website/docs/proxy/health.md         |  62 ++++++++++
 docs/my-website/docs/proxy/load_balancing.md | 125 -------------------
 docs/my-website/docs/proxy/reliability.md    |  89 ++++++++++++++
 docs/my-website/sidebars.js                  |   2 +
 litellm/llms/ollama.py                       |  60 ----------
 6 files changed, 154 insertions(+), 186 deletions(-)
 create mode 100644 docs/my-website/docs/proxy/health.md
 create mode 100644 docs/my-website/docs/proxy/reliability.md

diff --git a/docs/my-website/docs/proxy/call_hooks.md b/docs/my-website/docs/proxy/call_hooks.md
index 2728529c24..a92b94a865 100644
--- a/docs/my-website/docs/proxy/call_hooks.md
+++ b/docs/my-website/docs/proxy/call_hooks.md
@@ -1,4 +1,4 @@
-# Call Hooks - Modify Data
+# Modify Incoming Data
 
 Modify data just before making litellm completion calls call on proxy
 
diff --git a/docs/my-website/docs/proxy/health.md b/docs/my-website/docs/proxy/health.md
new file mode 100644
index 0000000000..5dffd71000
--- /dev/null
+++ b/docs/my-website/docs/proxy/health.md
@@ -0,0 +1,62 @@
+# Health Checks
+Use this to health check all LLMs defined in your config.yaml
+
+## Summary
+
+The proxy exposes:
+* a `/health` endpoint, which returns the health of the LLM APIs
+* a `/test` endpoint, which pings the litellm server
+
+#### Request
+Make a GET request to `/health` on the proxy
+```shell
+curl --location 'http://0.0.0.0:8000/health'
+```
+
+You can also run `litellm --health`, which makes a GET request to `http://0.0.0.0:8000/health` for you
+```shell
+litellm --health
+```
+#### Response
+```json
+{
+    "healthy_endpoints": [
+        {
+            "model": "azure/gpt-35-turbo",
+            "api_base": "https://my-endpoint-canada-berri992.openai.azure.com/"
+        },
+        {
+            "model": "azure/gpt-35-turbo",
+            "api_base": "https://my-endpoint-europe-berri-992.openai.azure.com/"
+        }
+    ],
+    "unhealthy_endpoints": [
+        {
+            "model": "azure/gpt-35-turbo",
+            "api_base": "https://openai-france-1234.openai.azure.com/"
+        }
+    ]
+}
+```
+
+## Background Health Checks
+
+You can run model health checks in the background, to prevent each model from being queried too frequently via `/health`.
+
+Here's how to use it:
+1. In your config.yaml, add:
+```yaml
+general_settings:
+  background_health_checks: True # enable background health checks
+  health_check_interval: 300 # frequency of background health checks
+```
+
+2. Start the server:
+```shell
+$ litellm --config /path/to/config.yaml
+```
+
+3. Query the health endpoint:
+```shell
+curl --location 'http://0.0.0.0:8000/health'
+```
\ No newline at end of file
diff --git a/docs/my-website/docs/proxy/load_balancing.md b/docs/my-website/docs/proxy/load_balancing.md
index 786e1887fd..e223c2d5a3 100644
--- a/docs/my-website/docs/proxy/load_balancing.md
+++ b/docs/my-website/docs/proxy/load_balancing.md
@@ -96,129 +96,4 @@ router_settings:
   routing_strategy: least-busy # Literal["simple-shuffle", "least-busy", "usage-based-routing", "latency-based-routing"]
   num_retries: 2
   timeout: 30 # 30 seconds
-```
-
-## Fallbacks + Cooldowns + Retries + Timeouts
-
-If a call fails after num_retries, fall back to another model group.
-
-If the error is a context window exceeded error, fall back to a larger model group (if given).
-
-[**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm/router.py)
-
-**Set via config**
-```yaml
-model_list:
-  - model_name: zephyr-beta
-    litellm_params:
-      model: huggingface/HuggingFaceH4/zephyr-7b-beta
-      api_base: http://0.0.0.0:8001
-  - model_name: zephyr-beta
-    litellm_params:
-      model: huggingface/HuggingFaceH4/zephyr-7b-beta
-      api_base: http://0.0.0.0:8002
-  - model_name: zephyr-beta
-    litellm_params:
-      model: huggingface/HuggingFaceH4/zephyr-7b-beta
-      api_base: http://0.0.0.0:8003
-  - model_name: gpt-3.5-turbo
-    litellm_params:
-      model: gpt-3.5-turbo
-      api_key: 
-  - model_name: gpt-3.5-turbo-16k
-    litellm_params:
-      model: gpt-3.5-turbo-16k
-      api_key: 
-
-litellm_settings:
-  num_retries: 3 # retry call 3 times on each model_name (e.g. zephyr-beta)
-  request_timeout: 10 # raise Timeout error if call takes longer than 10s. Sets litellm.request_timeout
-  fallbacks: [{"zephyr-beta": ["gpt-3.5-turbo"]}] # fallback to gpt-3.5-turbo if call fails num_retries
-  context_window_fallbacks: [{"zephyr-beta": ["gpt-3.5-turbo-16k"]}, {"gpt-3.5-turbo": ["gpt-3.5-turbo-16k"]}] # fallback to gpt-3.5-turbo-16k if context window error
-  allowed_fails: 3 # cooldown model if it fails > 1 call in a minute.
-```
-
-**Set dynamically**
-
-```bash
-curl --location 'http://0.0.0.0:8000/chat/completions' \
---header 'Content-Type: application/json' \
---data ' {
-  "model": "zephyr-beta",
-  "messages": [
-      {
-        "role": "user",
-        "content": "what llm are you"
-      }
-  ],
-  "fallbacks": [{"zephyr-beta": ["gpt-3.5-turbo"]}],
-  "context_window_fallbacks": [{"zephyr-beta": ["gpt-3.5-turbo"]}],
-  "num_retries": 2,
-  "timeout": 10
-}
-'
-```
-
-## Custom Timeouts, Stream Timeouts - Per Model
-For each model you can set `timeout` & `stream_timeout` under `litellm_params`
-```yaml
-model_list:
-  - model_name: gpt-3.5-turbo
-    litellm_params:
-      model: azure/gpt-turbo-small-eu
-      api_base: https://my-endpoint-europe-berri-992.openai.azure.com/
-      api_key: 
-      timeout: 0.1 # timeout in (seconds)
-      stream_timeout: 0.01 # timeout for stream requests (seconds)
-      max_retries: 5
-  - model_name: gpt-3.5-turbo
-    litellm_params:
-      model: azure/gpt-turbo-small-ca
-      api_base: https://my-endpoint-canada-berri992.openai.azure.com/
-      api_key: 
-      timeout: 0.1 # timeout in (seconds)
-      stream_timeout: 0.01 # timeout for stream requests (seconds)
-      max_retries: 5
-
-```
-
-#### Start Proxy
-```shell
-$ litellm --config /path/to/config.yaml
-```
-
-
-
-## Health Check LLMs on Proxy
-Use this to health check all LLMs defined in your config.yaml
-#### Request
-Make a GET Request to `/health` on the proxy
-```shell
-curl --location 'http://0.0.0.0:8000/health'
-```
-
-You can also run `litellm -health` it makes a `get` request to `http://0.0.0.0:8000/health` for you
-```
-litellm --health
-```
-#### Response
-```shell
-{
-    "healthy_endpoints": [
-        {
-            "model": "azure/gpt-35-turbo",
-            "api_base": "https://my-endpoint-canada-berri992.openai.azure.com/"
-        },
-        {
-            "model": "azure/gpt-35-turbo",
-            "api_base": "https://my-endpoint-europe-berri-992.openai.azure.com/"
-        }
-    ],
-    "unhealthy_endpoints": [
-        {
-            "model": "azure/gpt-35-turbo",
-            "api_base": "https://openai-france-1234.openai.azure.com/"
-        }
-    ]
-}
 ```
\ No newline at end of file
diff --git a/docs/my-website/docs/proxy/reliability.md b/docs/my-website/docs/proxy/reliability.md
new file mode 100644
index 0000000000..75f43bcdc3
--- /dev/null
+++ b/docs/my-website/docs/proxy/reliability.md
@@ -0,0 +1,89 @@
+# Fallbacks, Retries, Timeouts, Cooldowns
+
+If a call fails after num_retries, fall back to another model group.
+
+If the error is a context window exceeded error, fall back to a larger model group (if given).
+
+[**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm/router.py)
+
+**Set via config**
+```yaml
+model_list:
+  - model_name: zephyr-beta
+    litellm_params:
+      model: huggingface/HuggingFaceH4/zephyr-7b-beta
+      api_base: http://0.0.0.0:8001
+  - model_name: zephyr-beta
+    litellm_params:
+      model: huggingface/HuggingFaceH4/zephyr-7b-beta
+      api_base: http://0.0.0.0:8002
+  - model_name: zephyr-beta
+    litellm_params:
+      model: huggingface/HuggingFaceH4/zephyr-7b-beta
+      api_base: http://0.0.0.0:8003
+  - model_name: gpt-3.5-turbo
+    litellm_params:
+      model: gpt-3.5-turbo
+      api_key: 
+  - model_name: gpt-3.5-turbo-16k
+    litellm_params:
+      model: gpt-3.5-turbo-16k
+      api_key: 
+
+litellm_settings:
+  num_retries: 3 # retry call 3 times on each model_name (e.g. zephyr-beta)
+  request_timeout: 10 # raise Timeout error if call takes longer than 10s. Sets litellm.request_timeout
+  fallbacks: [{"zephyr-beta": ["gpt-3.5-turbo"]}] # fallback to gpt-3.5-turbo if call fails num_retries
+  context_window_fallbacks: [{"zephyr-beta": ["gpt-3.5-turbo-16k"]}, {"gpt-3.5-turbo": ["gpt-3.5-turbo-16k"]}] # fallback to gpt-3.5-turbo-16k if context window error
+  allowed_fails: 3 # cooldown the model if it fails > 3 calls in a minute
+```
+
+**Set dynamically**
+
+```bash
+curl --location 'http://0.0.0.0:8000/chat/completions' \
+--header 'Content-Type: application/json' \
+--data ' {
+  "model": "zephyr-beta",
+  "messages": [
+      {
+        "role": "user",
+        "content": "what llm are you"
+      }
+  ],
+  "fallbacks": [{"zephyr-beta": ["gpt-3.5-turbo"]}],
+  "context_window_fallbacks": [{"zephyr-beta": ["gpt-3.5-turbo"]}],
+  "num_retries": 2,
+  "timeout": 10
+}
+'
+```
+
+## Custom Timeouts, Stream Timeouts - Per Model
+For each model, you can set `timeout` & `stream_timeout` under `litellm_params`.
+```yaml
+model_list:
+  - model_name: gpt-3.5-turbo
+    litellm_params:
+      model: azure/gpt-turbo-small-eu
+      api_base: https://my-endpoint-europe-berri-992.openai.azure.com/
+      api_key: 
+      timeout: 0.1 # timeout (in seconds)
+      stream_timeout: 0.01 # timeout for stream requests (seconds)
+      max_retries: 5
+  - model_name: gpt-3.5-turbo
+    litellm_params:
+      model: azure/gpt-turbo-small-ca
+      api_base: https://my-endpoint-canada-berri992.openai.azure.com/
+      api_key: 
+      timeout: 0.1 # timeout (in seconds)
+      stream_timeout: 0.01 # timeout for stream requests (seconds)
+      max_retries: 5
+
+```
+
+#### Start Proxy
+```shell
+$ litellm --config /path/to/config.yaml
+```
+
diff --git a/docs/my-website/sidebars.js b/docs/my-website/sidebars.js
index 6ac6a0822f..069faa48aa 100644
--- a/docs/my-website/sidebars.js
+++ b/docs/my-website/sidebars.js
@@ -103,6 +103,8 @@ const sidebars = {
         "proxy/load_balancing",
         "proxy/virtual_keys",
         "proxy/model_management",
+        "proxy/reliability",
+        "proxy/health",
         "proxy/call_hooks",
         "proxy/caching",
         "proxy/logging",
diff --git a/litellm/llms/ollama.py b/litellm/llms/ollama.py
index 032c7a048d..30aaa53817 100644
--- a/litellm/llms/ollama.py
+++ b/litellm/llms/ollama.py
@@ -248,63 +248,3 @@ async def ollama_acompletion(url, data, model_response, encoding, logging_obj):
         return model_response
     except Exception as e:
         traceback.print_exc()
-
-    # ollama implementation
-    @async_generator
-    async def async_get_ollama_response_stream(
-        api_base="http://localhost:11434",
-        model="llama2",
-        prompt="Why is the sky blue?",
-        optional_params=None,
-        logging_obj=None,
-    ):
-        url = f"{api_base}/api/generate"
-
-        ## Load Config
-        config=litellm.OllamaConfig.get_config()
-        for k, v in config.items():
-            if k not in optional_params: # completion(top_k=3) > cohere_config(top_k=3) <- allows for dynamic variables to be passed in
-                optional_params[k] = v
-
-        data = {
-            "model": model,
-            "prompt": prompt,
-            **optional_params
-        }
-        ## LOGGING
-        logging_obj.pre_call(
-            input=None,
-            api_key=None,
-            additional_args={"api_base": url, "complete_input_dict": data},
-        )
-        session = requests.Session()
-
-        with session.post(url, json=data, stream=True) as resp:
-            if resp.status_code != 200:
-                raise OllamaError(status_code=resp.status_code, message=resp.text)
-            for line in resp.iter_lines():
-                if line:
-                    try:
-                        json_chunk = line.decode("utf-8")
-                        chunks = json_chunk.split("\n")
-                        for chunk in chunks:
-                            if chunk.strip() != "":
-                                j = json.loads(chunk)
-                                if "error" in j:
-                                    completion_obj = {
-                                        "role": "assistant",
-                                        "content": "",
-                                        "error": j
-                                    }
-                                    await yield_({"choices": [{"delta": completion_obj}]})
-                                if "response" in j:
-                                    completion_obj = {
-                                        "role": "assistant",
-                                        "content": "",
-                                    }
-                                    completion_obj["content"] = j["response"]
-                                    await yield_({"choices": [{"delta": completion_obj}]})
-                    except Exception as e:
-                        import logging
-                        logging.debug(f"Error decoding JSON: {e}")
-        session.close()
\ No newline at end of file