From b93b2636a998081260ec9bb47ac23b307f582ff3 Mon Sep 17 00:00:00 2001
From: Ishaan Jaff
Date: Wed, 24 Jul 2024 16:51:40 -0700
Subject: [PATCH 01/34] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 92328b4d5..3ac5f0285 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@
 Deploy on Railway
-Call all LLM APIs using the OpenAI format [Bedrock, Huggingface, VertexAI, TogetherAI, Azure, OpenAI, etc.]
+Call all LLM APIs using the OpenAI format [Bedrock, Huggingface, VertexAI, TogetherAI, Azure, OpenAI, Groq etc.]
 OpenAI Proxy Server | Hosted Proxy (Preview) | Enterprise Tier
From f35af3bf1c631f878b9be8fc207882383af5cf83 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Wed, 24 Jul 2024 18:42:31 -0700 Subject: [PATCH 02/34] test(test_completion.py): update azure extra headers --- litellm/tests/test_completion.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index f62b2b7ef..9061293d5 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -2573,21 +2573,17 @@ def test_completion_azure_extra_headers(): http_client = Client() with patch.object(http_client, "send", new=MagicMock()) as mock_client: - client = AzureOpenAI( - azure_endpoint=os.getenv("AZURE_API_BASE"), - api_version=litellm.AZURE_DEFAULT_API_VERSION, - api_key=os.getenv("AZURE_API_KEY"), - http_client=http_client, - ) + litellm.client_session = http_client try: response = completion( model="azure/chatgpt-v-2", messages=messages, - client=client, + api_base=os.getenv("AZURE_API_BASE"), + api_version="2023-07-01-preview", + api_key=os.getenv("AZURE_API_KEY"), extra_headers={ "Authorization": "my-bad-key", "Ocp-Apim-Subscription-Key": "hello-world-testing", - "api-key": "my-bad-key", }, ) print(response) @@ -2603,8 +2599,10 @@ def test_completion_azure_extra_headers(): print(request.url) # This will print the full URL print(request.headers) # This will print the full URL auth_header = request.headers.get("Authorization") + apim_key = request.headers.get("Ocp-Apim-Subscription-Key") print(auth_header) assert auth_header == "my-bad-key" + assert apim_key == "hello-world-testing" def test_completion_azure_ad_token(): From dd10da4d466ca4c145fe4b320f008efab81a8652 Mon Sep 17 00:00:00 2001 From: wslee Date: Wed, 10 Jul 2024 19:05:38 +0900 Subject: [PATCH 03/34] add support for friendli dedicated endpoint --- docs/my-website/docs/providers/friendliai.md | 60 ++++++++++++++++++++ docs/my-website/sidebars.js | 1 + litellm/utils.py | 5 +- 3 files changed, 65 insertions(+), 1 deletion(-) create mode 100644 docs/my-website/docs/providers/friendliai.md diff --git a/docs/my-website/docs/providers/friendliai.md b/docs/my-website/docs/providers/friendliai.md new file mode 100644 index 000000000..137c3dde3 --- /dev/null +++ b/docs/my-website/docs/providers/friendliai.md @@ -0,0 +1,60 @@ +# FriendliAI +https://suite.friendli.ai/ + +**We support ALL FriendliAI models, just set `friendliai/` as a prefix when sending completion requests** + +## API Key +```python +# env variable +os.environ['FRIENDLI_TOKEN'] +os.environ['FRIENDLI_API_BASE'] # Optional. Set this when using dedicated endpoint. 
+``` + +## Sample Usage +```python +from litellm import completion +import os + +os.environ['FRIENDLI_TOKEN'] = "" +response = completion( + model="friendliai/mixtral-8x7b-instruct-v0-1", + messages=[ + {"role": "user", "content": "hello from litellm"} + ], +) +print(response) +``` + +## Sample Usage - Streaming +```python +from litellm import completion +import os + +os.environ['FRIENDLI_TOKEN'] = "" +response = completion( + model="friendliai/mixtral-8x7b-instruct-v0-1", + messages=[ + {"role": "user", "content": "hello from litellm"} + ], + stream=True +) + +for chunk in response: + print(chunk) +``` + + +## Supported Models +### Serverless Endpoints +We support ALL FriendliAI AI models, just set `friendliai/` as a prefix when sending completion requests + +| Model Name | Function Call | +|--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| mixtral-8x7b-instruct | `completion(model="friendliai/mixtral-8x7b-instruct-v0-1", messages)` | +| meta-llama-3-8b-instruct | `completion(model="friendliai/meta-llama-3-8b-instruct", messages)` | +| meta-llama-3-70b-instruct | `completion(model="friendliai/meta-llama-3-70b-instruct", messages)` | + +### Dedicated Endpoints +``` +model="friendliai/$ENDPOINT_ID:$ADAPTER_ROUTE" +``` diff --git a/docs/my-website/sidebars.js b/docs/my-website/sidebars.js index c3f7e9249..d228e09d2 100644 --- a/docs/my-website/sidebars.js +++ b/docs/my-website/sidebars.js @@ -158,6 +158,7 @@ const sidebars = { "providers/triton-inference-server", "providers/ollama", "providers/perplexity", + "providers/friendliai", "providers/groq", "providers/deepseek", "providers/fireworks_ai", diff --git a/litellm/utils.py b/litellm/utils.py index a6d3d8603..03bbb0e8c 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -4486,7 +4486,10 @@ def get_llm_provider( or get_secret("TOGETHER_AI_TOKEN") ) elif custom_llm_provider == "friendliai": - api_base = "https://inference.friendli.ai/v1" + api_base = ( + get_secret("FRIENDLI_API_BASE") + or "https://inference.friendli.ai/v1" + ) dynamic_api_key = ( api_key or get_secret("FRIENDLIAI_API_KEY") From 40bb165108ddb3bf3a20e5084924c42698023530 Mon Sep 17 00:00:00 2001 From: wslee Date: Mon, 15 Jul 2024 10:24:54 +0900 Subject: [PATCH 04/34] support dynamic api base --- litellm/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/litellm/utils.py b/litellm/utils.py index 03bbb0e8c..f35f1ce4b 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -4487,7 +4487,8 @@ def get_llm_provider( ) elif custom_llm_provider == "friendliai": api_base = ( - get_secret("FRIENDLI_API_BASE") + api_base + or get_secret("FRIENDLI_API_BASE") or "https://inference.friendli.ai/v1" ) dynamic_api_key = ( From 3cd3491920fa64c9e7c9635478f05615d132cafb Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Wed, 24 Jul 2024 19:47:50 -0700 Subject: [PATCH 05/34] test: cleanup testing --- litellm/tests/test_completion.py | 37 ++++++++++++++++++++++++-------- litellm/tests/test_embedding.py | 19 +++++----------- 2 files changed, 33 insertions(+), 23 deletions(-) diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index 9061293d5..6aaf99515 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -2611,18 +2611,37 @@ def test_completion_azure_ad_token(): # If you want to remove it, speak to Ishaan! 
# Ishaan will be very disappointed if this test is removed -> this is a standard way to pass api_key + the router + proxy use this from httpx import Client - from openai import AzureOpenAI from litellm import completion - from litellm.llms.custom_httpx.httpx_handler import HTTPHandler - response = completion( - model="azure/chatgpt-v-2", - messages=messages, - # api_key="my-fake-ad-token", - azure_ad_token=os.getenv("AZURE_API_KEY"), - ) - print(response) + litellm.set_verbose = True + + old_key = os.environ["AZURE_API_KEY"] + os.environ.pop("AZURE_API_KEY", None) + + http_client = Client() + + with patch.object(http_client, "send", new=MagicMock()) as mock_client: + litellm.client_session = http_client + try: + response = completion( + model="azure/chatgpt-v-2", + messages=messages, + azure_ad_token="my-special-token", + ) + print(response) + except Exception as e: + pass + finally: + os.environ["AZURE_API_KEY"] = old_key + + mock_client.assert_called_once() + request = mock_client.call_args[0][0] + print(request.method) # This will print 'POST' + print(request.url) # This will print the full URL + print(request.headers) # This will print the full URL + auth_header = request.headers.get("Authorization") + assert auth_header == "Bearer my-special-token" def test_completion_azure_key_completion_arg(): diff --git a/litellm/tests/test_embedding.py b/litellm/tests/test_embedding.py index e6dd8bbb2..79ba8bc3e 100644 --- a/litellm/tests/test_embedding.py +++ b/litellm/tests/test_embedding.py @@ -206,6 +206,9 @@ def test_openai_azure_embedding_with_oidc_and_cf(): os.environ["AZURE_TENANT_ID"] = "17c0a27a-1246-4aa1-a3b6-d294e80e783c" os.environ["AZURE_CLIENT_ID"] = "4faf5422-b2bd-45e8-a6d7-46543a38acd0" + old_key = os.environ["AZURE_API_KEY"] + os.environ.pop("AZURE_API_KEY", None) + try: response = embedding( model="azure/text-embedding-ada-002", @@ -218,6 +221,8 @@ def test_openai_azure_embedding_with_oidc_and_cf(): except Exception as e: pytest.fail(f"Error occurred: {e}") + finally: + os.environ["AZURE_API_KEY"] = old_key def test_openai_azure_embedding_optional_arg(mocker): @@ -673,17 +678,3 @@ async def test_databricks_embeddings(sync_mode): # print(response) # local_proxy_embeddings() - - -def test_embedding_azure_ad_token(): - # this tests if we can pass api_key to completion, when it's not in the env. - # DO NOT REMOVE THIS TEST. No MATTER WHAT Happens! - # If you want to remove it, speak to Ishaan! 
- # Ishaan will be very disappointed if this test is removed -> this is a standard way to pass api_key + the router + proxy use this - - response = embedding( - model="azure/azure-embedding-model", - input=["good morning from litellm"], - azure_ad_token=os.getenv("AZURE_API_KEY"), - ) - print(response) From 4cd96976b38ec94164d1064f537efa22bd0f1553 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Wed, 24 Jul 2024 20:46:56 -0700 Subject: [PATCH 06/34] feat - add groq/llama-3.1 --- ...odel_prices_and_context_window_backup.json | 30 +++++++++++++++++++ model_prices_and_context_window.json | 30 +++++++++++++++++++ 2 files changed, 60 insertions(+) diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json index 08bc292c9..428d95589 100644 --- a/litellm/model_prices_and_context_window_backup.json +++ b/litellm/model_prices_and_context_window_backup.json @@ -1094,6 +1094,36 @@ "mode": "chat", "supports_function_calling": true }, + "groq/llama-3.1-8b-instant": { + "max_tokens": 8192, + "max_input_tokens": 8192, + "max_output_tokens": 8192, + "input_cost_per_token": 0.00000059, + "output_cost_per_token": 0.00000079, + "litellm_provider": "groq", + "mode": "chat", + "supports_function_calling": true + }, + "groq/llama-3.1-70b-versatile": { + "max_tokens": 8192, + "max_input_tokens": 8192, + "max_output_tokens": 8192, + "input_cost_per_token": 0.00000059, + "output_cost_per_token": 0.00000079, + "litellm_provider": "groq", + "mode": "chat", + "supports_function_calling": true + }, + "groq/llama-3.1-405b-reasoning": { + "max_tokens": 8192, + "max_input_tokens": 8192, + "max_output_tokens": 8192, + "input_cost_per_token": 0.00000059, + "output_cost_per_token": 0.00000079, + "litellm_provider": "groq", + "mode": "chat", + "supports_function_calling": true + }, "groq/mixtral-8x7b-32768": { "max_tokens": 32768, "max_input_tokens": 32768, diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json index 08bc292c9..428d95589 100644 --- a/model_prices_and_context_window.json +++ b/model_prices_and_context_window.json @@ -1094,6 +1094,36 @@ "mode": "chat", "supports_function_calling": true }, + "groq/llama-3.1-8b-instant": { + "max_tokens": 8192, + "max_input_tokens": 8192, + "max_output_tokens": 8192, + "input_cost_per_token": 0.00000059, + "output_cost_per_token": 0.00000079, + "litellm_provider": "groq", + "mode": "chat", + "supports_function_calling": true + }, + "groq/llama-3.1-70b-versatile": { + "max_tokens": 8192, + "max_input_tokens": 8192, + "max_output_tokens": 8192, + "input_cost_per_token": 0.00000059, + "output_cost_per_token": 0.00000079, + "litellm_provider": "groq", + "mode": "chat", + "supports_function_calling": true + }, + "groq/llama-3.1-405b-reasoning": { + "max_tokens": 8192, + "max_input_tokens": 8192, + "max_output_tokens": 8192, + "input_cost_per_token": 0.00000059, + "output_cost_per_token": 0.00000079, + "litellm_provider": "groq", + "mode": "chat", + "supports_function_calling": true + }, "groq/mixtral-8x7b-32768": { "max_tokens": 32768, "max_input_tokens": 32768, From c08d4ca9ec2e0f6cb8ffb725a23d4a899e5cd181 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Wed, 24 Jul 2024 20:49:28 -0700 Subject: [PATCH 07/34] docs groq models --- docs/my-website/docs/providers/groq.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/my-website/docs/providers/groq.md b/docs/my-website/docs/providers/groq.md index bfb944cb4..37d63d031 100644 --- 
a/docs/my-website/docs/providers/groq.md +++ b/docs/my-website/docs/providers/groq.md @@ -148,8 +148,11 @@ print(response) ## Supported Models - ALL Groq Models Supported! We support ALL Groq models, just set `groq/` as a prefix when sending completion requests -| Model Name | Function Call | +| Model Name | Usage | |--------------------|---------------------------------------------------------| +| llama-3.1-8b-instant | `completion(model="groq/llama-3.1-8b-instant", messages)` | +| llama-3.1-70b-versatile | `completion(model="groq/llama-3.1-70b-versatile", messages)` | +| llama-3.1-405b-reasoning | `completion(model="groq/llama-3.1-405b-reasoning", messages)` | | llama3-8b-8192 | `completion(model="groq/llama3-8b-8192", messages)` | | llama3-70b-8192 | `completion(model="groq/llama3-70b-8192", messages)` | | llama2-70b-4096 | `completion(model="groq/llama2-70b-4096", messages)` | From d5a7c654f191a5d320bdbe875941d15bbe18d28b Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Wed, 24 Jul 2024 21:25:31 -0700 Subject: [PATCH 08/34] =?UTF-8?q?bump:=20version=201.42.0=20=E2=86=92=201.?= =?UTF-8?q?42.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 10246abd7..08a41c9ec 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "litellm" -version = "1.42.0" +version = "1.42.1" description = "Library to easily interface with LLM API providers" authors = ["BerriAI"] license = "MIT" @@ -91,7 +91,7 @@ requires = ["poetry-core", "wheel"] build-backend = "poetry.core.masonry.api" [tool.commitizen] -version = "1.42.0" +version = "1.42.1" version_files = [ "pyproject.toml:^version" ] From c77abaa07f274774e1497dced7d59e2df33cddb0 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Wed, 24 Jul 2024 21:31:41 -0700 Subject: [PATCH 09/34] feat - add mistral large 2 --- ...odel_prices_and_context_window_backup.json | 20 ++++++++++++++----- model_prices_and_context_window.json | 20 ++++++++++++++----- 2 files changed, 30 insertions(+), 10 deletions(-) diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json index 428d95589..667745c30 100644 --- a/litellm/model_prices_and_context_window_backup.json +++ b/litellm/model_prices_and_context_window_backup.json @@ -893,11 +893,11 @@ "mode": "chat" }, "mistral/mistral-large-latest": { - "max_tokens": 8191, - "max_input_tokens": 32000, - "max_output_tokens": 8191, - "input_cost_per_token": 0.000004, - "output_cost_per_token": 0.000012, + "max_tokens": 128000, + "max_input_tokens": 128000, + "max_output_tokens": 128000, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.000009, "litellm_provider": "mistral", "mode": "chat", "supports_function_calling": true @@ -912,6 +912,16 @@ "mode": "chat", "supports_function_calling": true }, + "mistral/mistral-large-2407": { + "max_tokens": 128000, + "max_input_tokens": 128000, + "max_output_tokens": 128000, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.000009, + "litellm_provider": "mistral", + "mode": "chat", + "supports_function_calling": true + }, "mistral/open-mistral-7b": { "max_tokens": 8191, "max_input_tokens": 32000, diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json index 428d95589..667745c30 100644 --- a/model_prices_and_context_window.json +++ b/model_prices_and_context_window.json @@ -893,11 
+893,11 @@ "mode": "chat" }, "mistral/mistral-large-latest": { - "max_tokens": 8191, - "max_input_tokens": 32000, - "max_output_tokens": 8191, - "input_cost_per_token": 0.000004, - "output_cost_per_token": 0.000012, + "max_tokens": 128000, + "max_input_tokens": 128000, + "max_output_tokens": 128000, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.000009, "litellm_provider": "mistral", "mode": "chat", "supports_function_calling": true @@ -912,6 +912,16 @@ "mode": "chat", "supports_function_calling": true }, + "mistral/mistral-large-2407": { + "max_tokens": 128000, + "max_input_tokens": 128000, + "max_output_tokens": 128000, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.000009, + "litellm_provider": "mistral", + "mode": "chat", + "supports_function_calling": true + }, "mistral/open-mistral-7b": { "max_tokens": 8191, "max_input_tokens": 32000, From a92a2ca382403f47dbb5f588c9f855b5d7b3d9a5 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Wed, 24 Jul 2024 21:35:34 -0700 Subject: [PATCH 10/34] docs add mistral api large 2 --- docs/my-website/docs/providers/mistral.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/my-website/docs/providers/mistral.md b/docs/my-website/docs/providers/mistral.md index 21e3a9d54..62a91c687 100644 --- a/docs/my-website/docs/providers/mistral.md +++ b/docs/my-website/docs/providers/mistral.md @@ -148,7 +148,8 @@ All models listed here https://docs.mistral.ai/platform/endpoints are supported. |----------------|--------------------------------------------------------------| | Mistral Small | `completion(model="mistral/mistral-small-latest", messages)` | | Mistral Medium | `completion(model="mistral/mistral-medium-latest", messages)`| -| Mistral Large | `completion(model="mistral/mistral-large-latest", messages)` | +| Mistral Large 2 | `completion(model="mistral/mistral-large-2407", messages)` | +| Mistral Large Latest | `completion(model="mistral/mistral-large-latest", messages)` | | Mistral 7B | `completion(model="mistral/open-mistral-7b", messages)` | | Mixtral 8x7B | `completion(model="mistral/open-mixtral-8x7b", messages)` | | Mixtral 8x22B | `completion(model="mistral/open-mixtral-8x22b", messages)` | From b376ee71b01e3e8c6453a3dd21421b365aaaf9f8 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Wed, 24 Jul 2024 21:51:24 -0700 Subject: [PATCH 11/34] fix(internal_user_endpoints.py): support updating budgets for `/user/update` --- .../proxy/management_endpoints/internal_user_endpoints.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/litellm/proxy/management_endpoints/internal_user_endpoints.py b/litellm/proxy/management_endpoints/internal_user_endpoints.py index 280ff2ad2..b132761ae 100644 --- a/litellm/proxy/management_endpoints/internal_user_endpoints.py +++ b/litellm/proxy/management_endpoints/internal_user_endpoints.py @@ -27,6 +27,7 @@ from litellm._logging import verbose_proxy_logger from litellm.proxy._types import * from litellm.proxy.auth.user_api_key_auth import user_api_key_auth from litellm.proxy.management_endpoints.key_management_endpoints import ( + _duration_in_seconds, generate_key_helper_fn, ) from litellm.proxy.management_helpers.utils import ( @@ -486,6 +487,13 @@ async def user_update( ): # models default to [], spend defaults to 0, we should not reset these values non_default_values[k] = v + if "budget_duration" in non_default_values: + duration_s = _duration_in_seconds( + duration=non_default_values["budget_duration"] + ) + user_reset_at = datetime.now(timezone.utc) + 
timedelta(seconds=duration_s) + non_default_values["budget_reset_at"] = user_reset_at + ## ADD USER, IF NEW ## verbose_proxy_logger.debug("/user/update: Received data = %s", data) if data.user_id is not None and len(data.user_id) > 0: From 4e51f712f3c4982833255782a4c70961f3b8b56a Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 25 Jul 2024 09:57:19 -0700 Subject: [PATCH 12/34] fix(main.py): fix calling openai gpt-3.5-turbo-instruct via /completions Fixes https://github.com/BerriAI/litellm/issues/749 --- litellm/main.py | 10 ++++++---- litellm/proxy/_new_secret_config.yaml | 8 ++------ litellm/tests/test_get_llm_provider.py | 14 ++++++++++++-- litellm/tests/test_text_completion.py | 21 ++++++++++++++++++++- litellm/utils.py | 2 +- 5 files changed, 41 insertions(+), 14 deletions(-) diff --git a/litellm/main.py b/litellm/main.py index 35fad5e02..f724a68bd 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -3833,7 +3833,7 @@ def text_completion( optional_params["custom_llm_provider"] = custom_llm_provider # get custom_llm_provider - _, custom_llm_provider, dynamic_api_key, api_base = get_llm_provider(model=model, custom_llm_provider=custom_llm_provider, api_base=api_base) # type: ignore + _model, custom_llm_provider, dynamic_api_key, api_base = get_llm_provider(model=model, custom_llm_provider=custom_llm_provider, api_base=api_base) # type: ignore if custom_llm_provider == "huggingface": # if echo == True, for TGI llms we need to set top_n_tokens to 3 @@ -3916,10 +3916,12 @@ def text_completion( kwargs.pop("prompt", None) - if model is not None and model.startswith( - "openai/" + if ( + _model is not None and custom_llm_provider == "openai" ): # for openai compatible endpoints - e.g. vllm, call the native /v1/completions endpoint for text completion calls - model = model.replace("openai/", "text-completion-openai/") + if _model not in litellm.open_ai_chat_completion_models: + model = "text-completion-openai/" + _model + optional_params.pop("custom_llm_provider", None) kwargs["text_completion"] = True response = completion( diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml index bec92c1e9..c4d2a6441 100644 --- a/litellm/proxy/_new_secret_config.yaml +++ b/litellm/proxy/_new_secret_config.yaml @@ -1,8 +1,4 @@ model_list: - - model_name: "*" # all requests where model not in your config go to this deployment + - model_name: "test-model" litellm_params: - model: "openai/*" # passes our validation check that a real provider is given - api_key: "" - -general_settings: - completion_model: "gpt-3.5-turbo" \ No newline at end of file + model: "openai/gpt-3.5-turbo-instruct-0914" \ No newline at end of file diff --git a/litellm/tests/test_get_llm_provider.py b/litellm/tests/test_get_llm_provider.py index e443830b2..3ec867af4 100644 --- a/litellm/tests/test_get_llm_provider.py +++ b/litellm/tests/test_get_llm_provider.py @@ -1,14 +1,18 @@ -import sys, os +import os +import sys import traceback + from dotenv import load_dotenv load_dotenv() -import os, io +import io +import os sys.path.insert( 0, os.path.abspath("../..") ) # Adds the parent directory to the system path import pytest + import litellm @@ -21,6 +25,12 @@ def test_get_llm_provider(): # test_get_llm_provider() +def test_get_llm_provider_gpt_instruct(): + _, response, _, _ = litellm.get_llm_provider(model="gpt-3.5-turbo-instruct-0914") + + assert response == "text-completion-openai" + + def test_get_llm_provider_mistral_custom_api_base(): model, custom_llm_provider, dynamic_api_key, api_base 
= litellm.get_llm_provider( model="mistral/mistral-large-fr", diff --git a/litellm/tests/test_text_completion.py b/litellm/tests/test_text_completion.py index c6bbf71f2..6a0080b37 100644 --- a/litellm/tests/test_text_completion.py +++ b/litellm/tests/test_text_completion.py @@ -3840,7 +3840,26 @@ def test_completion_chatgpt_prompt(): try: print("\n gpt3.5 test\n") response = text_completion( - model="gpt-3.5-turbo", prompt="What's the weather in SF?" + model="openai/gpt-3.5-turbo", prompt="What's the weather in SF?" + ) + print(response) + response_str = response["choices"][0]["text"] + print("\n", response.choices) + print("\n", response.choices[0]) + # print(response.choices[0].text) + except Exception as e: + pytest.fail(f"Error occurred: {e}") + + +# test_completion_chatgpt_prompt() + + +def test_completion_gpt_instruct(): + try: + response = text_completion( + model="gpt-3.5-turbo-instruct-0914", + prompt="What's the weather in SF?", + custom_llm_provider="openai", ) print(response) response_str = response["choices"][0]["text"] diff --git a/litellm/utils.py b/litellm/utils.py index f35f1ce4b..e104de958 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -2774,7 +2774,7 @@ def get_optional_params( tool_function["parameters"] = new_parameters def _check_valid_arg(supported_params): - verbose_logger.debug( + verbose_logger.info( f"\nLiteLLM completion() model= {model}; provider = {custom_llm_provider}" ) verbose_logger.debug( From 80800b9ec86478003dffe58fd433259fcdd0f021 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 25 Jul 2024 10:01:47 -0700 Subject: [PATCH 13/34] docs(caching.md): update caching docs to include ttl info --- docs/my-website/docs/proxy/caching.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/docs/my-website/docs/proxy/caching.md b/docs/my-website/docs/proxy/caching.md index 6769ec6c5..ded8333f0 100644 --- a/docs/my-website/docs/proxy/caching.md +++ b/docs/my-website/docs/proxy/caching.md @@ -59,6 +59,8 @@ litellm_settings: cache_params: # set cache params for redis type: redis ttl: 600 # will be cached on redis for 600s + # default_in_memory_ttl: Optional[float], default is None. time in seconds. + # default_in_redis_ttl: Optional[float], default is None. time in seconds. 
``` @@ -613,6 +615,11 @@ litellm_settings: ```yaml cache_params: + # ttl + ttl: Optional[float] + default_in_memory_ttl: Optional[float] + default_in_redis_ttl: Optional[float] + # Type of cache (options: "local", "redis", "s3") type: s3 @@ -628,6 +635,8 @@ cache_params: host: localhost # Redis server hostname or IP address port: "6379" # Redis server port (as a string) password: secret_password # Redis server password + namespace: Optional[str] = None, + # S3 cache parameters s3_bucket_name: your_s3_bucket_name # Name of the S3 bucket From 5553f84d511fc352dc95cbf49ad752eefbfeefa5 Mon Sep 17 00:00:00 2001 From: fracapuano Date: Thu, 25 Jul 2024 19:06:07 +0200 Subject: [PATCH 14/34] fix: now supports single tokens prediction --- litellm/llms/replicate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/llms/replicate.py b/litellm/llms/replicate.py index 1dd29fd7d..0d129ce02 100644 --- a/litellm/llms/replicate.py +++ b/litellm/llms/replicate.py @@ -387,7 +387,7 @@ def process_response( result = " " ## Building RESPONSE OBJECT - if len(result) > 1: + if len(result) >= 1: model_response.choices[0].message.content = result # type: ignore # Calculate usage From d91b01a24bf72eec6e38a4325bddc1a2e78a1faa Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 25 Jul 2024 10:08:40 -0700 Subject: [PATCH 15/34] docs(enterprise.md): cleanup docs --- docs/my-website/docs/proxy/enterprise.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/my-website/docs/proxy/enterprise.md b/docs/my-website/docs/proxy/enterprise.md index 5b97dc14e..01bc32783 100644 --- a/docs/my-website/docs/proxy/enterprise.md +++ b/docs/my-website/docs/proxy/enterprise.md @@ -25,7 +25,7 @@ Features: - ✅ [Enforce Required Params for LLM Requests (ex. Reject requests missing ["metadata"]["generation_name"])](#enforce-required-params-for-llm-requests) - **Spend Tracking** - ✅ [Tracking Spend for Custom Tags](#tracking-spend-for-custom-tags) - - ✅ [API Endpoints to get Spend Reports per Team, API Key, Customer](cost_tracking.md#✨-enterprise-api-endpoints-to-get-spend) + - ✅ [`/spend/report` API endpoint](cost_tracking.md#✨-enterprise-api-endpoints-to-get-spend) - **Advanced Metrics** - ✅ [`x-ratelimit-remaining-requests`, `x-ratelimit-remaining-tokens` for LLM APIs on Prometheus](prometheus#✨-enterprise-llm-remaining-requests-and-remaining-tokens) - **Guardrails, PII Masking, Content Moderation** From 397451570e3a97ef12564bb4745828606d5077aa Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 25 Jul 2024 10:09:02 -0700 Subject: [PATCH 16/34] docs(enterprise.md): cleanup docs --- docs/my-website/docs/proxy/enterprise.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/my-website/docs/proxy/enterprise.md b/docs/my-website/docs/proxy/enterprise.md index 01bc32783..3607cb07f 100644 --- a/docs/my-website/docs/proxy/enterprise.md +++ b/docs/my-website/docs/proxy/enterprise.md @@ -23,7 +23,7 @@ Features: - ✅ [Use LiteLLM keys/authentication on Pass Through Endpoints](pass_through#✨-enterprise---use-litellm-keysauthentication-on-pass-through-endpoints) - ✅ Set Max Request / File Size on Requests - ✅ [Enforce Required Params for LLM Requests (ex. 
Reject requests missing ["metadata"]["generation_name"])](#enforce-required-params-for-llm-requests) -- **Spend Tracking** +- **Enterprise Spend Tracking Features** - ✅ [Tracking Spend for Custom Tags](#tracking-spend-for-custom-tags) - ✅ [`/spend/report` API endpoint](cost_tracking.md#✨-enterprise-api-endpoints-to-get-spend) - **Advanced Metrics** From 3293ad745805b65b10d42f26477888d27f462f5c Mon Sep 17 00:00:00 2001 From: David Manouchehri Date: Thu, 25 Jul 2024 19:29:55 +0000 Subject: [PATCH 17/34] Add Llama 3.1 405b for Bedrock --- litellm/llms/bedrock_httpx.py | 1 + litellm/model_prices_and_context_window_backup.json | 9 +++++++++ model_prices_and_context_window.json | 9 +++++++++ 3 files changed, 19 insertions(+) diff --git a/litellm/llms/bedrock_httpx.py b/litellm/llms/bedrock_httpx.py index 16c3f60b7..3f06a50b8 100644 --- a/litellm/llms/bedrock_httpx.py +++ b/litellm/llms/bedrock_httpx.py @@ -78,6 +78,7 @@ BEDROCK_CONVERSE_MODELS = [ "ai21.jamba-instruct-v1:0", "meta.llama3-1-8b-instruct-v1:0", "meta.llama3-1-70b-instruct-v1:0", + "meta.llama3-1-405b-instruct-v1:0", ] diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json index 667745c30..c05256d34 100644 --- a/litellm/model_prices_and_context_window_backup.json +++ b/litellm/model_prices_and_context_window_backup.json @@ -3731,6 +3731,15 @@ "litellm_provider": "bedrock", "mode": "chat" }, + "meta.llama3-1-405b-instruct-v1:0": { + "max_tokens": 128000, + "max_input_tokens": 128000, + "max_output_tokens": 4096, + "input_cost_per_token": 0.00000532, + "output_cost_per_token": 0.000016, + "litellm_provider": "bedrock", + "mode": "chat" + }, "512-x-512/50-steps/stability.stable-diffusion-xl-v0": { "max_tokens": 77, "max_input_tokens": 77, diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json index 667745c30..c05256d34 100644 --- a/model_prices_and_context_window.json +++ b/model_prices_and_context_window.json @@ -3731,6 +3731,15 @@ "litellm_provider": "bedrock", "mode": "chat" }, + "meta.llama3-1-405b-instruct-v1:0": { + "max_tokens": 128000, + "max_input_tokens": 128000, + "max_output_tokens": 4096, + "input_cost_per_token": 0.00000532, + "output_cost_per_token": 0.000016, + "litellm_provider": "bedrock", + "mode": "chat" + }, "512-x-512/50-steps/stability.stable-diffusion-xl-v0": { "max_tokens": 77, "max_input_tokens": 77, From 5c4ee3ef3c042b40b438e87b22b563cc716afa6a Mon Sep 17 00:00:00 2001 From: David Manouchehri Date: Thu, 25 Jul 2024 20:00:29 +0000 Subject: [PATCH 18/34] Add mistral.mistral-large-2407-v1:0 on Amazon Bedrock. 
--- litellm/llms/bedrock_httpx.py | 1 + litellm/model_prices_and_context_window_backup.json | 9 +++++++++ model_prices_and_context_window.json | 9 +++++++++ 3 files changed, 19 insertions(+) diff --git a/litellm/llms/bedrock_httpx.py b/litellm/llms/bedrock_httpx.py index 16c3f60b7..59b8acad0 100644 --- a/litellm/llms/bedrock_httpx.py +++ b/litellm/llms/bedrock_httpx.py @@ -78,6 +78,7 @@ BEDROCK_CONVERSE_MODELS = [ "ai21.jamba-instruct-v1:0", "meta.llama3-1-8b-instruct-v1:0", "meta.llama3-1-70b-instruct-v1:0", + "mistral.mistral-large-2407-v1:0", ] diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json index 667745c30..66a5565f3 100644 --- a/litellm/model_prices_and_context_window_backup.json +++ b/litellm/model_prices_and_context_window_backup.json @@ -2996,6 +2996,15 @@ "litellm_provider": "bedrock", "mode": "chat" }, + "mistral.mistral-large-2407-v1:0": { + "max_tokens": 8191, + "max_input_tokens": 128000, + "max_output_tokens": 8191, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.000009, + "litellm_provider": "bedrock", + "mode": "chat" + }, "bedrock/us-west-2/mistral.mixtral-8x7b-instruct-v0:1": { "max_tokens": 8191, "max_input_tokens": 32000, diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json index 667745c30..66a5565f3 100644 --- a/model_prices_and_context_window.json +++ b/model_prices_and_context_window.json @@ -2996,6 +2996,15 @@ "litellm_provider": "bedrock", "mode": "chat" }, + "mistral.mistral-large-2407-v1:0": { + "max_tokens": 8191, + "max_input_tokens": 128000, + "max_output_tokens": 8191, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.000009, + "litellm_provider": "bedrock", + "mode": "chat" + }, "bedrock/us-west-2/mistral.mixtral-8x7b-instruct-v0:1": { "max_tokens": 8191, "max_input_tokens": 32000, From 22c66991ed671a544bbf2df6aa6bd0bef1122b34 Mon Sep 17 00:00:00 2001 From: David Manouchehri Date: Thu, 25 Jul 2024 20:36:03 +0000 Subject: [PATCH 19/34] Support tool calling for Llama 3.1 on Amazon bedrock. --- litellm/llms/bedrock_httpx.py | 1 + 1 file changed, 1 insertion(+) diff --git a/litellm/llms/bedrock_httpx.py b/litellm/llms/bedrock_httpx.py index 3f06a50b8..cb3832845 100644 --- a/litellm/llms/bedrock_httpx.py +++ b/litellm/llms/bedrock_httpx.py @@ -1316,6 +1316,7 @@ class AmazonConverseConfig: model.startswith("anthropic") or model.startswith("mistral") or model.startswith("cohere") + or model.startswith("meta.llama3-1") ): supported_params.append("tools") From 64adae6e7fd57a89e7c4693d833c705e169ac579 Mon Sep 17 00:00:00 2001 From: David Manouchehri Date: Thu, 25 Jul 2024 21:06:58 +0000 Subject: [PATCH 20/34] Check for converse support first. 
--- litellm/utils.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/litellm/utils.py b/litellm/utils.py index e104de958..a597643a6 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -3121,7 +3121,19 @@ def get_optional_params( supported_params = get_supported_openai_params( model=model, custom_llm_provider=custom_llm_provider ) - if "ai21" in model: + if model in litellm.BEDROCK_CONVERSE_MODELS: + _check_valid_arg(supported_params=supported_params) + optional_params = litellm.AmazonConverseConfig().map_openai_params( + model=model, + non_default_params=non_default_params, + optional_params=optional_params, + drop_params=( + drop_params + if drop_params is not None and isinstance(drop_params, bool) + else False + ), + ) + elif "ai21" in model: _check_valid_arg(supported_params=supported_params) # params "maxTokens":200,"temperature":0,"topP":250,"stop_sequences":[], # https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=j2-ultra @@ -3143,17 +3155,6 @@ def get_optional_params( optional_params=optional_params, ) ) - elif model in litellm.BEDROCK_CONVERSE_MODELS: - optional_params = litellm.AmazonConverseConfig().map_openai_params( - model=model, - non_default_params=non_default_params, - optional_params=optional_params, - drop_params=( - drop_params - if drop_params is not None and isinstance(drop_params, bool) - else False - ), - ) else: optional_params = litellm.AmazonAnthropicConfig().map_openai_params( non_default_params=non_default_params, From bfdda089c8ab36c0920c234ce890c4aba9bea447 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 25 Jul 2024 14:23:07 -0700 Subject: [PATCH 21/34] fix(proxy_server.py): check if input list > 0 before indexing into it resolves 'list index out of range' error --- litellm/proxy/_new_secret_config.yaml | 2 +- litellm/proxy/proxy_server.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml index c4d2a6441..a81d133e5 100644 --- a/litellm/proxy/_new_secret_config.yaml +++ b/litellm/proxy/_new_secret_config.yaml @@ -1,4 +1,4 @@ model_list: - model_name: "test-model" litellm_params: - model: "openai/gpt-3.5-turbo-instruct-0914" \ No newline at end of file + model: "openai/text-embedding-ada-002" \ No newline at end of file diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index 106b95453..f22f25f73 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -3334,6 +3334,7 @@ async def embeddings( if ( "input" in data and isinstance(data["input"], list) + and len(data["input"]) > 0 and isinstance(data["input"][0], list) and isinstance(data["input"][0][0], int) ): # check if array of tokens passed in @@ -3464,8 +3465,8 @@ async def embeddings( litellm_debug_info, ) verbose_proxy_logger.error( - "litellm.proxy.proxy_server.embeddings(): Exception occured - {}".format( - str(e) + "litellm.proxy.proxy_server.embeddings(): Exception occured - {}\n{}".format( + str(e), traceback.format_exc() ) ) verbose_proxy_logger.debug(traceback.format_exc()) From 711496e2600adc8510d6f0cdc8f0a482856b3e4c Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 25 Jul 2024 14:30:46 -0700 Subject: [PATCH 22/34] fix(router.py): add support for diskcache to router --- litellm/router.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/litellm/router.py b/litellm/router.py index 11ad5fd9e..53013a759 100644 --- a/litellm/router.py 
+++ b/litellm/router.py @@ -263,7 +263,9 @@ class Router: ) # names of models under litellm_params. ex. azure/chatgpt-v-2 self.deployment_latency_map = {} ### CACHING ### - cache_type: Literal["local", "redis"] = "local" # default to an in-memory cache + cache_type: Literal["local", "redis", "redis-semantic", "s3", "disk"] = ( + "local" # default to an in-memory cache + ) redis_cache = None cache_config = {} self.client_ttl = client_ttl From 6bf1b9353bbc675390cac2a5821eaa76a4788c28 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 25 Jul 2024 15:33:05 -0700 Subject: [PATCH 23/34] feat(custom_llm.py): initial working commit for writing your own custom LLM handler Fixes https://github.com/BerriAI/litellm/issues/4675 Also Addresses https://github.com/BerriAI/litellm/discussions/4677 --- litellm/__init__.py | 9 ++++ litellm/llms/custom_llm.py | 70 ++++++++++++++++++++++++++++++++ litellm/main.py | 15 +++++++ litellm/tests/test_custom_llm.py | 63 ++++++++++++++++++++++++++++ litellm/types/llms/custom_llm.py | 10 +++++ litellm/utils.py | 16 ++++++++ 6 files changed, 183 insertions(+) create mode 100644 litellm/llms/custom_llm.py create mode 100644 litellm/tests/test_custom_llm.py create mode 100644 litellm/types/llms/custom_llm.py diff --git a/litellm/__init__.py b/litellm/__init__.py index 956834afc..0527ef199 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -813,6 +813,7 @@ from .utils import ( ) from .types.utils import ImageObject +from .llms.custom_llm import CustomLLM from .llms.huggingface_restapi import HuggingfaceConfig from .llms.anthropic import AnthropicConfig from .llms.databricks import DatabricksConfig, DatabricksEmbeddingConfig @@ -909,3 +910,11 @@ from .cost_calculator import response_cost_calculator, cost_per_token from .types.adapter import AdapterItem adapters: List[AdapterItem] = [] + +### CUSTOM LLMs ### +from .types.llms.custom_llm import CustomLLMItem + +custom_provider_map: List[CustomLLMItem] = [] +_custom_providers: List[str] = ( + [] +) # internal helper util, used to track names of custom providers diff --git a/litellm/llms/custom_llm.py b/litellm/llms/custom_llm.py new file mode 100644 index 000000000..fac1eb293 --- /dev/null +++ b/litellm/llms/custom_llm.py @@ -0,0 +1,70 @@ +# What is this? 
+## Handler file for a Custom Chat LLM + +""" +- completion +- acompletion +- streaming +- async_streaming +""" + +import copy +import json +import os +import time +import types +from enum import Enum +from functools import partial +from typing import Callable, List, Literal, Optional, Tuple, Union + +import httpx # type: ignore +import requests # type: ignore + +import litellm +from litellm.litellm_core_utils.core_helpers import map_finish_reason +from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler +from litellm.types.llms.databricks import GenericStreamingChunk +from litellm.types.utils import ProviderField +from litellm.utils import CustomStreamWrapper, EmbeddingResponse, ModelResponse, Usage + +from .base import BaseLLM +from .prompt_templates.factory import custom_prompt, prompt_factory + + +class CustomLLMError(Exception): # use this for all your exceptions + def __init__( + self, + status_code, + message, + ): + self.status_code = status_code + self.message = message + super().__init__( + self.message + ) # Call the base class constructor with the parameters it needs + + +def custom_chat_llm_router(): + """ + Routes call to CustomLLM completion/acompletion/streaming/astreaming functions, based on call type + + Validates if response is in expected format + """ + pass + + +class CustomLLM(BaseLLM): + def __init__(self) -> None: + super().__init__() + + def completion(self, *args, **kwargs) -> ModelResponse: + raise CustomLLMError(status_code=500, message="Not implemented yet!") + + def streaming(self, *args, **kwargs): + raise CustomLLMError(status_code=500, message="Not implemented yet!") + + async def acompletion(self, *args, **kwargs) -> ModelResponse: + raise CustomLLMError(status_code=500, message="Not implemented yet!") + + async def astreaming(self, *args, **kwargs): + raise CustomLLMError(status_code=500, message="Not implemented yet!") diff --git a/litellm/main.py b/litellm/main.py index f724a68bd..539c3d3e1 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -107,6 +107,7 @@ from .llms.anthropic_text import AnthropicTextCompletion from .llms.azure import AzureChatCompletion from .llms.azure_text import AzureTextCompletion from .llms.bedrock_httpx import BedrockConverseLLM, BedrockLLM +from .llms.custom_llm import CustomLLM, custom_chat_llm_router from .llms.databricks import DatabricksChatCompletion from .llms.huggingface_restapi import Huggingface from .llms.openai import OpenAIChatCompletion, OpenAITextCompletion @@ -2690,6 +2691,20 @@ def completion( model_response.created = int(time.time()) model_response.model = model response = model_response + elif ( + custom_llm_provider in litellm._custom_providers + ): # Assume custom LLM provider + # Get the Custom Handler + custom_handler: Optional[CustomLLM] = None + for item in litellm.custom_provider_map: + if item["provider"] == custom_llm_provider: + custom_handler = item["custom_handler"] + + if custom_handler is None: + raise ValueError( + f"Unable to map your input to a model. Check your input - {args}" + ) + response = custom_handler.completion() else: raise ValueError( f"Unable to map your input to a model. Check your input - {args}" diff --git a/litellm/tests/test_custom_llm.py b/litellm/tests/test_custom_llm.py new file mode 100644 index 000000000..0506986eb --- /dev/null +++ b/litellm/tests/test_custom_llm.py @@ -0,0 +1,63 @@ +# What is this? 
+## Unit tests for the CustomLLM class + + +import asyncio +import os +import sys +import time +import traceback + +import openai +import pytest + +sys.path.insert( + 0, os.path.abspath("../..") +) # Adds the parent directory to the system path +import os +from collections import defaultdict +from concurrent.futures import ThreadPoolExecutor +from unittest.mock import AsyncMock, MagicMock, patch + +import httpx +from dotenv import load_dotenv + +import litellm +from litellm import CustomLLM, completion, get_llm_provider + + +class MyCustomLLM(CustomLLM): + def completion(self, *args, **kwargs) -> litellm.ModelResponse: + return litellm.completion( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": "Hello world"}], + mock_response="Hi!", + ) # type: ignore + + +def test_get_llm_provider(): + from litellm.utils import custom_llm_setup + + my_custom_llm = MyCustomLLM() + litellm.custom_provider_map = [ + {"provider": "custom_llm", "custom_handler": my_custom_llm} + ] + + custom_llm_setup() + + model, provider, _, _ = get_llm_provider(model="custom_llm/my-fake-model") + + assert provider == "custom_llm" + + +def test_simple_completion(): + my_custom_llm = MyCustomLLM() + litellm.custom_provider_map = [ + {"provider": "custom_llm", "custom_handler": my_custom_llm} + ] + resp = completion( + model="custom_llm/my-fake-model", + messages=[{"role": "user", "content": "Hello world!"}], + ) + + assert resp.choices[0].message.content == "Hi!" diff --git a/litellm/types/llms/custom_llm.py b/litellm/types/llms/custom_llm.py new file mode 100644 index 000000000..d5499a419 --- /dev/null +++ b/litellm/types/llms/custom_llm.py @@ -0,0 +1,10 @@ +from typing import List + +from typing_extensions import Dict, Required, TypedDict, override + +from litellm.llms.custom_llm import CustomLLM + + +class CustomLLMItem(TypedDict): + provider: str + custom_handler: CustomLLM diff --git a/litellm/utils.py b/litellm/utils.py index e104de958..0f1b0315d 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -330,6 +330,18 @@ class Rules: ####### CLIENT ################### # make it easy to log if completion/embedding runs succeeded or failed + see what happened | Non-Blocking +def custom_llm_setup(): + """ + Add custom_llm provider to provider list + """ + for custom_llm in litellm.custom_provider_map: + if custom_llm["provider"] not in litellm.provider_list: + litellm.provider_list.append(custom_llm["provider"]) + + if custom_llm["provider"] not in litellm._custom_providers: + litellm._custom_providers.append(custom_llm["provider"]) + + def function_setup( original_function: str, rules_obj, start_time, *args, **kwargs ): # just run once to check if user wants to send their data anywhere - PostHog/Sentry/Slack/etc. 
@@ -341,6 +353,10 @@ def function_setup( try: global callback_list, add_breadcrumb, user_logger_fn, Logging + ## CUSTOM LLM SETUP ## + custom_llm_setup() + + ## LOGGING SETUP function_id = kwargs["id"] if "id" in kwargs else None if len(litellm.callbacks) > 0: From 9f97436308de5c1ddc1acf14567b0caf0c23ab2d Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 25 Jul 2024 15:51:39 -0700 Subject: [PATCH 24/34] fix(custom_llm.py): support async completion calls --- litellm/llms/custom_llm.py | 26 +++++++++++++++++--------- litellm/main.py | 10 +++++++++- litellm/tests/test_custom_llm.py | 25 ++++++++++++++++++++++++- 3 files changed, 50 insertions(+), 11 deletions(-) diff --git a/litellm/llms/custom_llm.py b/litellm/llms/custom_llm.py index fac1eb293..5e9933194 100644 --- a/litellm/llms/custom_llm.py +++ b/litellm/llms/custom_llm.py @@ -44,15 +44,6 @@ class CustomLLMError(Exception): # use this for all your exceptions ) # Call the base class constructor with the parameters it needs -def custom_chat_llm_router(): - """ - Routes call to CustomLLM completion/acompletion/streaming/astreaming functions, based on call type - - Validates if response is in expected format - """ - pass - - class CustomLLM(BaseLLM): def __init__(self) -> None: super().__init__() @@ -68,3 +59,20 @@ class CustomLLM(BaseLLM): async def astreaming(self, *args, **kwargs): raise CustomLLMError(status_code=500, message="Not implemented yet!") + + +def custom_chat_llm_router( + async_fn: bool, stream: Optional[bool], custom_llm: CustomLLM +): + """ + Routes call to CustomLLM completion/acompletion/streaming/astreaming functions, based on call type + + Validates if response is in expected format + """ + if async_fn: + if stream: + return custom_llm.astreaming + return custom_llm.acompletion + if stream: + return custom_llm.streaming + return custom_llm.completion diff --git a/litellm/main.py b/litellm/main.py index 539c3d3e1..51e7c611c 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -382,6 +382,7 @@ async def acompletion( or custom_llm_provider == "clarifai" or custom_llm_provider == "watsonx" or custom_llm_provider in litellm.openai_compatible_providers + or custom_llm_provider in litellm._custom_providers ): # currently implemented aiohttp calls for just azure, openai, hf, ollama, vertex ai soon all. init_response = await loop.run_in_executor(None, func_with_context) if isinstance(init_response, dict) or isinstance( @@ -2704,7 +2705,14 @@ def completion( raise ValueError( f"Unable to map your input to a model. Check your input - {args}" ) - response = custom_handler.completion() + + ## ROUTE LLM CALL ## + handler_fn = custom_chat_llm_router( + async_fn=acompletion, stream=stream, custom_llm=custom_handler + ) + + ## CALL FUNCTION + response = handler_fn() else: raise ValueError( f"Unable to map your input to a model. 
Check your input - {args}" diff --git a/litellm/tests/test_custom_llm.py b/litellm/tests/test_custom_llm.py index 0506986eb..fd46c892e 100644 --- a/litellm/tests/test_custom_llm.py +++ b/litellm/tests/test_custom_llm.py @@ -23,7 +23,7 @@ import httpx from dotenv import load_dotenv import litellm -from litellm import CustomLLM, completion, get_llm_provider +from litellm import CustomLLM, acompletion, completion, get_llm_provider class MyCustomLLM(CustomLLM): @@ -35,6 +35,15 @@ class MyCustomLLM(CustomLLM): ) # type: ignore +class MyCustomAsyncLLM(CustomLLM): + async def acompletion(self, *args, **kwargs) -> litellm.ModelResponse: + return litellm.completion( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": "Hello world"}], + mock_response="Hi!", + ) # type: ignore + + def test_get_llm_provider(): from litellm.utils import custom_llm_setup @@ -61,3 +70,17 @@ def test_simple_completion(): ) assert resp.choices[0].message.content == "Hi!" + + +@pytest.mark.asyncio +async def test_simple_acompletion(): + my_custom_llm = MyCustomAsyncLLM() + litellm.custom_provider_map = [ + {"provider": "custom_llm", "custom_handler": my_custom_llm} + ] + resp = await acompletion( + model="custom_llm/my-fake-model", + messages=[{"role": "user", "content": "Hello world!"}], + ) + + assert resp.choices[0].message.content == "Hi!" From b4e3a77ad0b823fb5ab44f6ee92a48e2b929993d Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 25 Jul 2024 16:47:32 -0700 Subject: [PATCH 25/34] feat(utils.py): support sync streaming for custom llm provider --- litellm/__init__.py | 1 + litellm/llms/custom_llm.py | 19 ++++-- litellm/main.py | 8 +++ litellm/tests/test_custom_llm.py | 111 +++++++++++++++++++++++++++++-- litellm/utils.py | 10 ++- 5 files changed, 139 insertions(+), 10 deletions(-) diff --git a/litellm/__init__.py b/litellm/__init__.py index 0527ef199..b6aacad1a 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -913,6 +913,7 @@ adapters: List[AdapterItem] = [] ### CUSTOM LLMs ### from .types.llms.custom_llm import CustomLLMItem +from .types.utils import GenericStreamingChunk custom_provider_map: List[CustomLLMItem] = [] _custom_providers: List[str] = ( diff --git a/litellm/llms/custom_llm.py b/litellm/llms/custom_llm.py index 5e9933194..f00d02ab7 100644 --- a/litellm/llms/custom_llm.py +++ b/litellm/llms/custom_llm.py @@ -15,7 +15,17 @@ import time import types from enum import Enum from functools import partial -from typing import Callable, List, Literal, Optional, Tuple, Union +from typing import ( + Any, + AsyncIterator, + Callable, + Iterator, + List, + Literal, + Optional, + Tuple, + Union, +) import httpx # type: ignore import requests # type: ignore @@ -23,8 +33,7 @@ import requests # type: ignore import litellm from litellm.litellm_core_utils.core_helpers import map_finish_reason from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler -from litellm.types.llms.databricks import GenericStreamingChunk -from litellm.types.utils import ProviderField +from litellm.types.utils import GenericStreamingChunk, ProviderField from litellm.utils import CustomStreamWrapper, EmbeddingResponse, ModelResponse, Usage from .base import BaseLLM @@ -51,13 +60,13 @@ class CustomLLM(BaseLLM): def completion(self, *args, **kwargs) -> ModelResponse: raise CustomLLMError(status_code=500, message="Not implemented yet!") - def streaming(self, *args, **kwargs): + def streaming(self, *args, **kwargs) -> Iterator[GenericStreamingChunk]: raise CustomLLMError(status_code=500, message="Not 
implemented yet!") async def acompletion(self, *args, **kwargs) -> ModelResponse: raise CustomLLMError(status_code=500, message="Not implemented yet!") - async def astreaming(self, *args, **kwargs): + async def astreaming(self, *args, **kwargs) -> AsyncIterator[GenericStreamingChunk]: raise CustomLLMError(status_code=500, message="Not implemented yet!") diff --git a/litellm/main.py b/litellm/main.py index 51e7c611c..c3be01373 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -2713,6 +2713,14 @@ def completion( ## CALL FUNCTION response = handler_fn() + if stream is True: + return CustomStreamWrapper( + completion_stream=response, + model=model, + custom_llm_provider=custom_llm_provider, + logging_obj=logging, + ) + else: raise ValueError( f"Unable to map your input to a model. Check your input - {args}" diff --git a/litellm/tests/test_custom_llm.py b/litellm/tests/test_custom_llm.py index fd46c892e..4cc355e4b 100644 --- a/litellm/tests/test_custom_llm.py +++ b/litellm/tests/test_custom_llm.py @@ -17,13 +17,80 @@ sys.path.insert( import os from collections import defaultdict from concurrent.futures import ThreadPoolExecutor +from typing import Any, AsyncIterator, Iterator, Union from unittest.mock import AsyncMock, MagicMock, patch import httpx from dotenv import load_dotenv import litellm -from litellm import CustomLLM, acompletion, completion, get_llm_provider +from litellm import ( + ChatCompletionDeltaChunk, + ChatCompletionUsageBlock, + CustomLLM, + GenericStreamingChunk, + ModelResponse, + acompletion, + completion, + get_llm_provider, +) +from litellm.utils import ModelResponseIterator + + +class CustomModelResponseIterator: + def __init__(self, streaming_response: Union[Iterator, AsyncIterator]): + self.streaming_response = streaming_response + + def chunk_parser(self, chunk: Any) -> GenericStreamingChunk: + return GenericStreamingChunk( + text="hello world", + tool_use=None, + is_finished=True, + finish_reason="stop", + usage=ChatCompletionUsageBlock( + prompt_tokens=10, completion_tokens=20, total_tokens=30 + ), + index=0, + ) + + # Sync iterator + def __iter__(self): + return self + + def __next__(self) -> GenericStreamingChunk: + try: + chunk: Any = self.streaming_response.__next__() # type: ignore + except StopIteration: + raise StopIteration + except ValueError as e: + raise RuntimeError(f"Error receiving chunk from stream: {e}") + + try: + return self.chunk_parser(chunk=chunk) + except StopIteration: + raise StopIteration + except ValueError as e: + raise RuntimeError(f"Error parsing chunk: {e},\nReceived chunk: {chunk}") + + # Async iterator + def __aiter__(self): + self.async_response_iterator = self.streaming_response.__aiter__() # type: ignore + return self + + async def __anext__(self) -> GenericStreamingChunk: + try: + chunk = await self.async_response_iterator.__anext__() + except StopAsyncIteration: + raise StopAsyncIteration + except ValueError as e: + raise RuntimeError(f"Error receiving chunk from stream: {e}") + + try: + return self.chunk_parser(chunk=chunk) + except StopIteration: + raise StopIteration + except ValueError as e: + raise RuntimeError(f"Error parsing chunk: {e},\nReceived chunk: {chunk}") class MyCustomLLM(CustomLLM): @@ -34,8 +101,6 @@ class MyCustomLLM(CustomLLM): mock_response="Hi!", ) # type: ignore - -class MyCustomAsyncLLM(CustomLLM): async def acompletion(self, *args, **kwargs) -> litellm.ModelResponse: return litellm.completion( model="gpt-3.5-turbo", @@ -43,8 +108,27 @@ class MyCustomAsyncLLM(CustomLLM): mock_response="Hi!", ) # type: 
ignore + def streaming(self, *args, **kwargs) -> Iterator[GenericStreamingChunk]: + generic_streaming_chunk: GenericStreamingChunk = { + "finish_reason": "stop", + "index": 0, + "is_finished": True, + "text": "Hello world", + "tool_use": None, + "usage": {"completion_tokens": 10, "prompt_tokens": 20, "total_tokens": 30}, + } + + completion_stream = ModelResponseIterator( + model_response=generic_streaming_chunk # type: ignore + ) + custom_iterator = CustomModelResponseIterator( + streaming_response=completion_stream + ) + return custom_iterator + def test_get_llm_provider(): + """""" from litellm.utils import custom_llm_setup my_custom_llm = MyCustomLLM() @@ -74,7 +158,7 @@ def test_simple_completion(): @pytest.mark.asyncio async def test_simple_acompletion(): - my_custom_llm = MyCustomAsyncLLM() + my_custom_llm = MyCustomLLM() litellm.custom_provider_map = [ {"provider": "custom_llm", "custom_handler": my_custom_llm} ] @@ -84,3 +168,22 @@ async def test_simple_acompletion(): ) assert resp.choices[0].message.content == "Hi!" + + +def test_simple_completion_streaming(): + my_custom_llm = MyCustomLLM() + litellm.custom_provider_map = [ + {"provider": "custom_llm", "custom_handler": my_custom_llm} + ] + resp = completion( + model="custom_llm/my-fake-model", + messages=[{"role": "user", "content": "Hello world!"}], + stream=True, + ) + + for chunk in resp: + print(chunk) + if chunk.choices[0].finish_reason is None: + assert isinstance(chunk.choices[0].delta.content, str) + else: + assert chunk.choices[0].finish_reason == "stop" diff --git a/litellm/utils.py b/litellm/utils.py index 0f1b0315d..c14ab36dd 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -9262,7 +9262,10 @@ class CustomStreamWrapper: try: # return this for all models completion_obj = {"content": ""} - if self.custom_llm_provider and self.custom_llm_provider == "anthropic": + if self.custom_llm_provider and ( + self.custom_llm_provider == "anthropic" + or self.custom_llm_provider in litellm._custom_providers + ): from litellm.types.utils import GenericStreamingChunk as GChunk if self.received_finish_reason is not None: @@ -10981,3 +10984,8 @@ class ModelResponseIterator: raise StopAsyncIteration self.is_done = True return self.model_response + + +class CustomModelResponseIterator(Iterable): + def __init__(self) -> None: + super().__init__() From 060249c7e0477fee7740a856b4bb7d58ba3c8079 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 25 Jul 2024 17:11:57 -0700 Subject: [PATCH 26/34] feat(utils.py): support async streaming for custom llm provider --- litellm/llms/custom_llm.py | 2 ++ litellm/tests/test_custom_llm.py | 36 ++++++++++++++++++++++++++++++-- litellm/utils.py | 2 ++ 3 files changed, 38 insertions(+), 2 deletions(-) diff --git a/litellm/llms/custom_llm.py b/litellm/llms/custom_llm.py index f00d02ab7..f1b2b28b4 100644 --- a/litellm/llms/custom_llm.py +++ b/litellm/llms/custom_llm.py @@ -17,8 +17,10 @@ from enum import Enum from functools import partial from typing import ( Any, + AsyncGenerator, AsyncIterator, Callable, + Coroutine, Iterator, List, Literal, diff --git a/litellm/tests/test_custom_llm.py b/litellm/tests/test_custom_llm.py index 4cc355e4b..af88b1f3a 100644 --- a/litellm/tests/test_custom_llm.py +++ b/litellm/tests/test_custom_llm.py @@ -17,7 +17,7 @@ sys.path.insert( import os from collections import defaultdict from concurrent.futures import ThreadPoolExecutor -from typing import Any, AsyncIterator, Iterator, Union +from typing import Any, AsyncGenerator, AsyncIterator, Coroutine, Iterator, 
Union from unittest.mock import AsyncMock, MagicMock, patch import httpx @@ -75,7 +75,7 @@ class CustomModelResponseIterator: # Async iterator def __aiter__(self): self.async_response_iterator = self.streaming_response.__aiter__() # type: ignore - return self + return self.streaming_response async def __anext__(self) -> GenericStreamingChunk: try: @@ -126,6 +126,18 @@ class MyCustomLLM(CustomLLM): ) return custom_iterator + async def astreaming(self, *args, **kwargs) -> AsyncIterator[GenericStreamingChunk]: # type: ignore + generic_streaming_chunk: GenericStreamingChunk = { + "finish_reason": "stop", + "index": 0, + "is_finished": True, + "text": "Hello world", + "tool_use": None, + "usage": {"completion_tokens": 10, "prompt_tokens": 20, "total_tokens": 30}, + } + + yield generic_streaming_chunk # type: ignore + def test_get_llm_provider(): """""" @@ -187,3 +199,23 @@ def test_simple_completion_streaming(): assert isinstance(chunk.choices[0].delta.content, str) else: assert chunk.choices[0].finish_reason == "stop" + + +@pytest.mark.asyncio +async def test_simple_completion_async_streaming(): + my_custom_llm = MyCustomLLM() + litellm.custom_provider_map = [ + {"provider": "custom_llm", "custom_handler": my_custom_llm} + ] + resp = await litellm.acompletion( + model="custom_llm/my-fake-model", + messages=[{"role": "user", "content": "Hello world!"}], + stream=True, + ) + + async for chunk in resp: + print(chunk) + if chunk.choices[0].finish_reason is None: + assert isinstance(chunk.choices[0].delta.content, str) + else: + assert chunk.choices[0].finish_reason == "stop" diff --git a/litellm/utils.py b/litellm/utils.py index c14ab36dd..9158afb74 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -10132,6 +10132,7 @@ class CustomStreamWrapper: try: if self.completion_stream is None: await self.fetch_stream() + if ( self.custom_llm_provider == "openai" or self.custom_llm_provider == "azure" @@ -10156,6 +10157,7 @@ class CustomStreamWrapper: or self.custom_llm_provider == "triton" or self.custom_llm_provider == "watsonx" or self.custom_llm_provider in litellm.openai_compatible_endpoints + or self.custom_llm_provider in litellm._custom_providers ): async for chunk in self.completion_stream: print_verbose(f"value of async chunk: {chunk}") From e3142b4294cfd5b0b5219607f99d1b554a2a11ff Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 25 Jul 2024 17:22:57 -0700 Subject: [PATCH 27/34] fix whisper health check with litellm --- litellm/llms/openai.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/litellm/llms/openai.py b/litellm/llms/openai.py index 25e2e518c..2c7a7a4df 100644 --- a/litellm/llms/openai.py +++ b/litellm/llms/openai.py @@ -1,5 +1,6 @@ import hashlib import json +import os import time import traceback import types @@ -1870,6 +1871,16 @@ class OpenAIChatCompletion(BaseLLM): model=model, # type: ignore prompt=prompt, # type: ignore ) + elif mode == "audio_transcription": + # Get the current directory of the file being run + pwd = os.path.dirname(os.path.realpath(__file__)) + file_path = os.path.join(pwd, "../tests/gettysburg.wav") + audio_file = open(file_path, "rb") + completion = await client.audio.transcriptions.with_raw_response.create( + file=audio_file, + model=model, # type: ignore + prompt=prompt, # type: ignore + ) else: raise Exception("mode not set") response = {} From 2432c90515229da4d80d9ec298c315e7c9040a57 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 25 Jul 2024 17:26:14 -0700 Subject: [PATCH 28/34] feat - support health check audio_speech --- 
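Note (below the scissors line, so not applied with the commit): a quick way to exercise the new `audio_speech` health check is to query the proxy `/health` endpoint with the `tts` model added in the proxy_config.yaml hunk below. A minimal sketch, assuming the proxy is started locally from that config on port 4000 and that its `master_key` (`sk-1234`) is used for auth:

```bash
# Assumes: `litellm --config litellm/proxy/proxy_config.yaml` is running locally,
# and the config's master_key (sk-1234) is passed as the bearer token.
curl --location 'http://0.0.0.0:4000/health' \
  -H 'Authorization: Bearer sk-1234'
# The tts deployment should only be reported as healthy if the
# client.audio.speech.with_raw_response.create(...) call added below succeeds.
```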
litellm/llms/openai.py | 9 ++++++++- litellm/proxy/proxy_config.yaml | 6 ++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/litellm/llms/openai.py b/litellm/llms/openai.py index 2c7a7a4df..fae8a448a 100644 --- a/litellm/llms/openai.py +++ b/litellm/llms/openai.py @@ -1881,8 +1881,15 @@ class OpenAIChatCompletion(BaseLLM): model=model, # type: ignore prompt=prompt, # type: ignore ) + elif mode == "audio_speech": + # Get the current directory of the file being run + completion = await client.audio.speech.with_raw_response.create( + model=model, # type: ignore + input=prompt, # type: ignore + voice="alloy", + ) else: - raise Exception("mode not set") + raise ValueError("mode not set, passed in mode: " + mode) response = {} if completion is None or not hasattr(completion, "headers"): diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index 0e3f0826e..bd8f5bfd0 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -8,6 +8,12 @@ model_list: litellm_params: model: fireworks_ai/accounts/fireworks/models/llama-v3-70b-instruct api_key: "os.environ/FIREWORKS" + - model_name: tts + litellm_params: + model: openai/tts-1 + api_key: "os.environ/OPENAI_API_KEY" + model_info: + mode: audio_speech general_settings: master_key: sk-1234 alerting: ["slack"] From 3573b47098c52b1dc506e8918b46f5ee471bca28 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 25 Jul 2024 17:29:28 -0700 Subject: [PATCH 29/34] docs add example on using text to speech models --- docs/my-website/docs/proxy/health.md | 57 +++++++++++++++++----------- 1 file changed, 35 insertions(+), 22 deletions(-) diff --git a/docs/my-website/docs/proxy/health.md b/docs/my-website/docs/proxy/health.md index 6d383fc41..632702b91 100644 --- a/docs/my-website/docs/proxy/health.md +++ b/docs/my-website/docs/proxy/health.md @@ -41,28 +41,6 @@ litellm --health } ``` -### Background Health Checks - -You can enable model health checks being run in the background, to prevent each model from being queried too frequently via `/health`. - -Here's how to use it: -1. in the config.yaml add: -``` -general_settings: - background_health_checks: True # enable background health checks - health_check_interval: 300 # frequency of background health checks -``` - -2. Start server -``` -$ litellm /path/to/config.yaml -``` - -3. Query health endpoint: -``` -curl --location 'http://0.0.0.0:4000/health' -``` - ### Embedding Models We need some way to know if the model is an embedding model when running checks, if you have this in your config, specifying mode it makes an embedding health check @@ -124,6 +102,41 @@ model_list: mode: audio_transcription ``` + +### Text to Speech Models + +```yaml +# OpenAI Text to Speech Models + - model_name: tts + litellm_params: + model: openai/tts-1 + api_key: "os.environ/OPENAI_API_KEY" + model_info: + mode: audio_speech +``` + +## Background Health Checks + +You can enable model health checks being run in the background, to prevent each model from being queried too frequently via `/health`. + +Here's how to use it: +1. in the config.yaml add: +``` +general_settings: + background_health_checks: True # enable background health checks + health_check_interval: 300 # frequency of background health checks +``` + +2. Start server +``` +$ litellm /path/to/config.yaml +``` + +3. 
Query health endpoint: +``` +curl --location 'http://0.0.0.0:4000/health' +``` + ### Hide details The health check response contains details like endpoint URLs, error messages, From f2443996d82d50e88ecfbca4efb045fc0522aa84 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 25 Jul 2024 17:30:15 -0700 Subject: [PATCH 30/34] feat support audio health checks for azure --- litellm/llms/azure.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/litellm/llms/azure.py b/litellm/llms/azure.py index a2928cf20..ec143f3fe 100644 --- a/litellm/llms/azure.py +++ b/litellm/llms/azure.py @@ -1864,6 +1864,23 @@ class AzureChatCompletion(BaseLLM): model=model, # type: ignore prompt=prompt, # type: ignore ) + elif mode == "audio_transcription": + # Get the current directory of the file being run + pwd = os.path.dirname(os.path.realpath(__file__)) + file_path = os.path.join(pwd, "../tests/gettysburg.wav") + audio_file = open(file_path, "rb") + completion = await client.audio.transcriptions.with_raw_response.create( + file=audio_file, + model=model, # type: ignore + prompt=prompt, # type: ignore + ) + elif mode == "audio_speech": + # Get the current directory of the file being run + completion = await client.audio.speech.with_raw_response.create( + model=model, # type: ignore + input=prompt, # type: ignore + voice="alloy", + ) else: raise Exception("mode not set") response = {} From 3814170ae17d748110058a0c411ad7eccc786b6a Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 25 Jul 2024 17:41:16 -0700 Subject: [PATCH 31/34] docs - add info about routing strategy on load balancing docs --- docs/my-website/docs/proxy/reliability.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/docs/my-website/docs/proxy/reliability.md b/docs/my-website/docs/proxy/reliability.md index 2404c744c..a3f03b3d7 100644 --- a/docs/my-website/docs/proxy/reliability.md +++ b/docs/my-website/docs/proxy/reliability.md @@ -31,8 +31,19 @@ model_list: api_base: https://openai-france-1234.openai.azure.com/ api_key: rpm: 1440 +routing_strategy: simple-shuffle # Literal["simple-shuffle", "least-busy", "usage-based-routing","latency-based-routing"], default="simple-shuffle" + model_group_alias: {"gpt-4": "gpt-3.5-turbo"} # all requests with `gpt-4` will be routed to models with `gpt-3.5-turbo` + num_retries: 2 + timeout: 30 # 30 seconds + redis_host: # set this when using multiple litellm proxy deployments, load balancing state stored in redis + redis_password: + redis_port: 1992 ``` +:::info +Detailed information about [routing strategies can be found here](../routing) +::: + #### Step 2: Start Proxy with config ```shell From a2d07cfe64e24f2a42612213f46e49114a94ff8e Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 25 Jul 2024 17:41:19 -0700 Subject: [PATCH 32/34] docs(custom_llm_server.md): add calling custom llm server to docs --- .../docs/providers/custom_llm_server.md | 73 ++++++++++ .../docs/providers/custom_openai_proxy.md | 129 ------------------ docs/my-website/sidebars.js | 3 +- 3 files changed, 75 insertions(+), 130 deletions(-) create mode 100644 docs/my-website/docs/providers/custom_llm_server.md delete mode 100644 docs/my-website/docs/providers/custom_openai_proxy.md diff --git a/docs/my-website/docs/providers/custom_llm_server.md b/docs/my-website/docs/providers/custom_llm_server.md new file mode 100644 index 000000000..f8d5fb551 --- /dev/null +++ b/docs/my-website/docs/providers/custom_llm_server.md @@ -0,0 +1,73 @@ +# Custom API Server (Custom Format) + +LiteLLM allows you to call 
your custom endpoint in the OpenAI ChatCompletion format + + +:::info + +For calling an openai-compatible endpoint, [go here](./openai_compatible.md) +::: + +## Quick Start + +```python +import litellm +from litellm import CustomLLM, completion, get_llm_provider + + +class MyCustomLLM(CustomLLM): + def completion(self, *args, **kwargs) -> litellm.ModelResponse: + return litellm.completion( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": "Hello world"}], + mock_response="Hi!", + ) # type: ignore + +litellm.custom_provider_map = [ # 👈 KEY STEP - REGISTER HANDLER + {"provider": "my-custom-llm", "custom_handler": my_custom_llm} + ] + +resp = completion( + model="my-custom-llm/my-fake-model", + messages=[{"role": "user", "content": "Hello world!"}], + ) + +assert resp.choices[0].message.content == "Hi!" +``` + + +## Custom Handler Spec + +```python +from litellm.types.utils import GenericStreamingChunk, ModelResponse +from typing import Iterator, AsyncIterator +from litellm.llms.base import BaseLLM + +class CustomLLMError(Exception): # use this for all your exceptions + def __init__( + self, + status_code, + message, + ): + self.status_code = status_code + self.message = message + super().__init__( + self.message + ) # Call the base class constructor with the parameters it needs + +class CustomLLM(BaseLLM): + def __init__(self) -> None: + super().__init__() + + def completion(self, *args, **kwargs) -> ModelResponse: + raise CustomLLMError(status_code=500, message="Not implemented yet!") + + def streaming(self, *args, **kwargs) -> Iterator[GenericStreamingChunk]: + raise CustomLLMError(status_code=500, message="Not implemented yet!") + + async def acompletion(self, *args, **kwargs) -> ModelResponse: + raise CustomLLMError(status_code=500, message="Not implemented yet!") + + async def astreaming(self, *args, **kwargs) -> AsyncIterator[GenericStreamingChunk]: + raise CustomLLMError(status_code=500, message="Not implemented yet!") +``` \ No newline at end of file diff --git a/docs/my-website/docs/providers/custom_openai_proxy.md b/docs/my-website/docs/providers/custom_openai_proxy.md deleted file mode 100644 index b6f2eccac..000000000 --- a/docs/my-website/docs/providers/custom_openai_proxy.md +++ /dev/null @@ -1,129 +0,0 @@ -# Custom API Server (OpenAI Format) - -LiteLLM allows you to call your custom endpoint in the OpenAI ChatCompletion format - -## API KEYS -No api keys required - -## Set up your Custom API Server -Your server should have the following Endpoints: - -Here's an example OpenAI proxy server with routes: https://replit.com/@BerriAI/openai-proxy#main.py - -### Required Endpoints -- POST `/chat/completions` - chat completions endpoint - -### Optional Endpoints -- POST `/completions` - completions endpoint -- Get `/models` - available models on server -- POST `/embeddings` - creates an embedding vector representing the input text. 
- - -## Example Usage - -### Call `/chat/completions` -In order to use your custom OpenAI Chat Completion proxy with LiteLLM, ensure you set - -* `api_base` to your proxy url, example "https://openai-proxy.berriai.repl.co" -* `custom_llm_provider` to `openai` this ensures litellm uses the `openai.ChatCompletion` to your api_base - -```python -import os -from litellm import completion - -## set ENV variables -os.environ["OPENAI_API_KEY"] = "anything" #key is not used for proxy - -messages = [{ "content": "Hello, how are you?","role": "user"}] - -response = completion( - model="command-nightly", - messages=[{ "content": "Hello, how are you?","role": "user"}], - api_base="https://openai-proxy.berriai.repl.co", - custom_llm_provider="openai" # litellm will use the openai.ChatCompletion to make the request - -) -print(response) -``` - -#### Response -```json -{ - "object": - "chat.completion", - "choices": [{ - "finish_reason": "stop", - "index": 0, - "message": { - "content": - "The sky, a canvas of blue,\nA work of art, pure and true,\nA", - "role": "assistant" - } - }], - "id": - "chatcmpl-7fbd6077-de10-4cb4-a8a4-3ef11a98b7c8", - "created": - 1699290237.408061, - "model": - "togethercomputer/llama-2-70b-chat", - "usage": { - "completion_tokens": 18, - "prompt_tokens": 14, - "total_tokens": 32 - } - } -``` - - -### Call `/completions` -In order to use your custom OpenAI Completion proxy with LiteLLM, ensure you set - -* `api_base` to your proxy url, example "https://openai-proxy.berriai.repl.co" -* `custom_llm_provider` to `text-completion-openai` this ensures litellm uses the `openai.Completion` to your api_base - -```python -import os -from litellm import completion - -## set ENV variables -os.environ["OPENAI_API_KEY"] = "anything" #key is not used for proxy - -messages = [{ "content": "Hello, how are you?","role": "user"}] - -response = completion( - model="command-nightly", - messages=[{ "content": "Hello, how are you?","role": "user"}], - api_base="https://openai-proxy.berriai.repl.co", - custom_llm_provider="text-completion-openai" # litellm will use the openai.Completion to make the request - -) -print(response) -``` - -#### Response -```json -{ - "warning": - "This model version is deprecated. Migrate before January 4, 2024 to avoid disruption of service. 
Learn more https://platform.openai.com/docs/deprecations", - "id": - "cmpl-8HxHqF5dymQdALmLplS0dWKZVFe3r", - "object": - "text_completion", - "created": - 1699290166, - "model": - "text-davinci-003", - "choices": [{ - "text": - "\n\nThe weather in San Francisco varies depending on what time of year and time", - "index": 0, - "logprobs": None, - "finish_reason": "length" - }], - "usage": { - "prompt_tokens": 7, - "completion_tokens": 16, - "total_tokens": 23 - } - } -``` \ No newline at end of file diff --git a/docs/my-website/sidebars.js b/docs/my-website/sidebars.js index d228e09d2..c1ce83068 100644 --- a/docs/my-website/sidebars.js +++ b/docs/my-website/sidebars.js @@ -175,7 +175,8 @@ const sidebars = { "providers/aleph_alpha", "providers/baseten", "providers/openrouter", - "providers/custom_openai_proxy", + // "providers/custom_openai_proxy", + "providers/custom_llm_server", "providers/petals", ], From bd7af04a725e74290aeb0d87889538041aa0cc3a Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 25 Jul 2024 17:56:34 -0700 Subject: [PATCH 33/34] feat(proxy_server.py): support custom llm handler on proxy --- .../docs/providers/custom_llm_server.md | 97 ++++++++++++++++++- litellm/proxy/_new_secret_config.yaml | 9 +- litellm/proxy/custom_handler.py | 21 ++++ litellm/proxy/proxy_server.py | 15 +++ 4 files changed, 140 insertions(+), 2 deletions(-) create mode 100644 litellm/proxy/custom_handler.py diff --git a/docs/my-website/docs/providers/custom_llm_server.md b/docs/my-website/docs/providers/custom_llm_server.md index f8d5fb551..70fc4cea5 100644 --- a/docs/my-website/docs/providers/custom_llm_server.md +++ b/docs/my-website/docs/providers/custom_llm_server.md @@ -35,6 +35,101 @@ resp = completion( assert resp.choices[0].message.content == "Hi!" ``` +## OpenAI Proxy Usage + +1. Setup your `custom_handler.py` file + +```python +import litellm +from litellm import CustomLLM, completion, get_llm_provider + + +class MyCustomLLM(CustomLLM): + def completion(self, *args, **kwargs) -> litellm.ModelResponse: + return litellm.completion( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": "Hello world"}], + mock_response="Hi!", + ) # type: ignore + + async def acompletion(self, *args, **kwargs) -> litellm.ModelResponse: + return litellm.completion( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": "Hello world"}], + mock_response="Hi!", + ) # type: ignore + + +my_custom_llm = MyCustomLLM() +``` + +2. Add to `config.yaml` + +In the config below, we pass + +python_filename: `custom_handler.py` +custom_handler_instance_name: `my_custom_llm`. This is defined in Step 1 + +custom_handler: `custom_handler.my_custom_llm` + +```yaml +model_list: + - model_name: "test-model" + litellm_params: + model: "openai/text-embedding-ada-002" + - model_name: "my-custom-model" + litellm_params: + model: "my-custom-llm/my-model" + +litellm_settings: + custom_provider_map: + - {"provider": "my-custom-llm", "custom_handler": custom_handler.my_custom_llm} +``` + +```bash +litellm --config /path/to/config.yaml +``` + +3. Test it! 
+ +```bash +curl -X POST 'http://0.0.0.0:4000/chat/completions' \ +-H 'Content-Type: application/json' \ +-H 'Authorization: Bearer sk-1234' \ +-d '{ + "model": "my-custom-model", + "messages": [{"role": "user", "content": "Say \"this is a test\" in JSON!"}], +}' +``` + +Expected Response + +``` +{ + "id": "chatcmpl-06f1b9cd-08bc-43f7-9814-a69173921216", + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "message": { + "content": "Hi!", + "role": "assistant", + "tool_calls": null, + "function_call": null + } + } + ], + "created": 1721955063, + "model": "gpt-3.5-turbo", + "object": "chat.completion", + "system_fingerprint": null, + "usage": { + "prompt_tokens": 10, + "completion_tokens": 20, + "total_tokens": 30 + } +} +``` ## Custom Handler Spec @@ -70,4 +165,4 @@ class CustomLLM(BaseLLM): async def astreaming(self, *args, **kwargs) -> AsyncIterator[GenericStreamingChunk]: raise CustomLLMError(status_code=500, message="Not implemented yet!") -``` \ No newline at end of file +``` diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml index a81d133e5..0854f0901 100644 --- a/litellm/proxy/_new_secret_config.yaml +++ b/litellm/proxy/_new_secret_config.yaml @@ -1,4 +1,11 @@ model_list: - model_name: "test-model" litellm_params: - model: "openai/text-embedding-ada-002" \ No newline at end of file + model: "openai/text-embedding-ada-002" + - model_name: "my-custom-model" + litellm_params: + model: "my-custom-llm/my-model" + +litellm_settings: + custom_provider_map: + - {"provider": "my-custom-llm", "custom_handler": custom_handler.my_custom_llm} \ No newline at end of file diff --git a/litellm/proxy/custom_handler.py b/litellm/proxy/custom_handler.py new file mode 100644 index 000000000..56943c34d --- /dev/null +++ b/litellm/proxy/custom_handler.py @@ -0,0 +1,21 @@ +import litellm +from litellm import CustomLLM, completion, get_llm_provider + + +class MyCustomLLM(CustomLLM): + def completion(self, *args, **kwargs) -> litellm.ModelResponse: + return litellm.completion( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": "Hello world"}], + mock_response="Hi!", + ) # type: ignore + + async def acompletion(self, *args, **kwargs) -> litellm.ModelResponse: + return litellm.completion( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": "Hello world"}], + mock_response="Hi!", + ) # type: ignore + + +my_custom_llm = MyCustomLLM() diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index f22f25f73..bad1abae2 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -1507,6 +1507,21 @@ class ProxyConfig: verbose_proxy_logger.debug( f"litellm.post_call_rules: {litellm.post_call_rules}" ) + elif key == "custom_provider_map": + from litellm.utils import custom_llm_setup + + litellm.custom_provider_map = [ + { + "provider": item["provider"], + "custom_handler": get_instance_fn( + value=item["custom_handler"], + config_file_path=config_file_path, + ), + } + for item in value + ] + + custom_llm_setup() elif key == "success_callback": litellm.success_callback = [] From 41abd5124023c931aa7856271d6e5761804358e6 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 25 Jul 2024 19:03:52 -0700 Subject: [PATCH 34/34] fix(custom_llm.py): pass input params to custom llm --- litellm/llms/custom_llm.py | 80 ++++++++++++++++++++++++++-- litellm/main.py | 21 +++++++- litellm/tests/test_custom_llm.py | 91 ++++++++++++++++++++++++++++++-- 3 files changed, 182 insertions(+), 10 deletions(-) diff --git 
a/litellm/llms/custom_llm.py b/litellm/llms/custom_llm.py index f1b2b28b4..47c5a485c 100644 --- a/litellm/llms/custom_llm.py +++ b/litellm/llms/custom_llm.py @@ -59,16 +59,88 @@ class CustomLLM(BaseLLM): def __init__(self) -> None: super().__init__() - def completion(self, *args, **kwargs) -> ModelResponse: + def completion( + self, + model: str, + messages: list, + api_base: str, + custom_prompt_dict: dict, + model_response: ModelResponse, + print_verbose: Callable, + encoding, + api_key, + logging_obj, + optional_params: dict, + acompletion=None, + litellm_params=None, + logger_fn=None, + headers={}, + timeout: Optional[Union[float, httpx.Timeout]] = None, + client: Optional[HTTPHandler] = None, + ) -> ModelResponse: raise CustomLLMError(status_code=500, message="Not implemented yet!") - def streaming(self, *args, **kwargs) -> Iterator[GenericStreamingChunk]: + def streaming( + self, + model: str, + messages: list, + api_base: str, + custom_prompt_dict: dict, + model_response: ModelResponse, + print_verbose: Callable, + encoding, + api_key, + logging_obj, + optional_params: dict, + acompletion=None, + litellm_params=None, + logger_fn=None, + headers={}, + timeout: Optional[Union[float, httpx.Timeout]] = None, + client: Optional[HTTPHandler] = None, + ) -> Iterator[GenericStreamingChunk]: raise CustomLLMError(status_code=500, message="Not implemented yet!") - async def acompletion(self, *args, **kwargs) -> ModelResponse: + async def acompletion( + self, + model: str, + messages: list, + api_base: str, + custom_prompt_dict: dict, + model_response: ModelResponse, + print_verbose: Callable, + encoding, + api_key, + logging_obj, + optional_params: dict, + acompletion=None, + litellm_params=None, + logger_fn=None, + headers={}, + timeout: Optional[Union[float, httpx.Timeout]] = None, + client: Optional[AsyncHTTPHandler] = None, + ) -> ModelResponse: raise CustomLLMError(status_code=500, message="Not implemented yet!") - async def astreaming(self, *args, **kwargs) -> AsyncIterator[GenericStreamingChunk]: + async def astreaming( + self, + model: str, + messages: list, + api_base: str, + custom_prompt_dict: dict, + model_response: ModelResponse, + print_verbose: Callable, + encoding, + api_key, + logging_obj, + optional_params: dict, + acompletion=None, + litellm_params=None, + logger_fn=None, + headers={}, + timeout: Optional[Union[float, httpx.Timeout]] = None, + client: Optional[AsyncHTTPHandler] = None, + ) -> AsyncIterator[GenericStreamingChunk]: raise CustomLLMError(status_code=500, message="Not implemented yet!") diff --git a/litellm/main.py b/litellm/main.py index c3be01373..672029f69 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -2711,8 +2711,27 @@ def completion( async_fn=acompletion, stream=stream, custom_llm=custom_handler ) + headers = headers or litellm.headers + ## CALL FUNCTION - response = handler_fn() + response = handler_fn( + model=model, + messages=messages, + headers=headers, + model_response=model_response, + print_verbose=print_verbose, + api_key=api_key, + api_base=api_base, + acompletion=acompletion, + logging_obj=logging, + optional_params=optional_params, + litellm_params=litellm_params, + logger_fn=logger_fn, + timeout=timeout, # type: ignore + custom_prompt_dict=custom_prompt_dict, + client=client, # pass AsyncOpenAI, OpenAI client + encoding=encoding, + ) if stream is True: return CustomStreamWrapper( completion_stream=response, diff --git a/litellm/tests/test_custom_llm.py b/litellm/tests/test_custom_llm.py index af88b1f3a..a0f8b569e 100644 --- 
a/litellm/tests/test_custom_llm.py +++ b/litellm/tests/test_custom_llm.py @@ -17,7 +17,16 @@ sys.path.insert( import os from collections import defaultdict from concurrent.futures import ThreadPoolExecutor -from typing import Any, AsyncGenerator, AsyncIterator, Coroutine, Iterator, Union +from typing import ( + Any, + AsyncGenerator, + AsyncIterator, + Callable, + Coroutine, + Iterator, + Optional, + Union, +) from unittest.mock import AsyncMock, MagicMock, patch import httpx @@ -94,21 +103,75 @@ class CustomModelResponseIterator: class MyCustomLLM(CustomLLM): - def completion(self, *args, **kwargs) -> litellm.ModelResponse: + def completion( + self, + model: str, + messages: list, + api_base: str, + custom_prompt_dict: dict, + model_response: ModelResponse, + print_verbose: Callable[..., Any], + encoding, + api_key, + logging_obj, + optional_params: dict, + acompletion=None, + litellm_params=None, + logger_fn=None, + headers={}, + timeout: Optional[Union[float, openai.Timeout]] = None, + client: Optional[litellm.HTTPHandler] = None, + ) -> ModelResponse: return litellm.completion( model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hello world"}], mock_response="Hi!", ) # type: ignore - async def acompletion(self, *args, **kwargs) -> litellm.ModelResponse: + async def acompletion( + self, + model: str, + messages: list, + api_base: str, + custom_prompt_dict: dict, + model_response: ModelResponse, + print_verbose: Callable[..., Any], + encoding, + api_key, + logging_obj, + optional_params: dict, + acompletion=None, + litellm_params=None, + logger_fn=None, + headers={}, + timeout: Optional[Union[float, openai.Timeout]] = None, + client: Optional[litellm.AsyncHTTPHandler] = None, + ) -> litellm.ModelResponse: return litellm.completion( model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hello world"}], mock_response="Hi!", ) # type: ignore - def streaming(self, *args, **kwargs) -> Iterator[GenericStreamingChunk]: + def streaming( + self, + model: str, + messages: list, + api_base: str, + custom_prompt_dict: dict, + model_response: ModelResponse, + print_verbose: Callable[..., Any], + encoding, + api_key, + logging_obj, + optional_params: dict, + acompletion=None, + litellm_params=None, + logger_fn=None, + headers={}, + timeout: Optional[Union[float, openai.Timeout]] = None, + client: Optional[litellm.HTTPHandler] = None, + ) -> Iterator[GenericStreamingChunk]: generic_streaming_chunk: GenericStreamingChunk = { "finish_reason": "stop", "index": 0, @@ -126,7 +189,25 @@ class MyCustomLLM(CustomLLM): ) return custom_iterator - async def astreaming(self, *args, **kwargs) -> AsyncIterator[GenericStreamingChunk]: # type: ignore + async def astreaming( # type: ignore + self, + model: str, + messages: list, + api_base: str, + custom_prompt_dict: dict, + model_response: ModelResponse, + print_verbose: Callable[..., Any], + encoding, + api_key, + logging_obj, + optional_params: dict, + acompletion=None, + litellm_params=None, + logger_fn=None, + headers={}, + timeout: Optional[Union[float, openai.Timeout]] = None, + client: Optional[litellm.AsyncHTTPHandler] = None, + ) -> AsyncIterator[GenericStreamingChunk]: # type: ignore generic_streaming_chunk: GenericStreamingChunk = { "finish_reason": "stop", "index": 0,