diff --git a/README.md b/README.md index 92328b4d5..3ac5f0285 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ Deploy on Railway

-Call all LLM APIs using the OpenAI format [Bedrock, Huggingface, VertexAI, TogetherAI, Azure, OpenAI, etc.]
+Call all LLM APIs using the OpenAI format [Bedrock, Huggingface, VertexAI, TogetherAI, Azure, OpenAI, Groq, etc.]

OpenAI Proxy Server | Hosted Proxy (Preview) | Enterprise Tier

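For readers skimming the README change above, here is a minimal sketch of the OpenAI-format call it refers to. The Groq model name is taken from the provider table updated later in this diff; the API key value is a placeholder, and it is assumed LiteLLM reads the credential from the `GROQ_API_KEY` environment variable.

```python
import os

from litellm import completion

# Placeholder credential - replace with a real key.
os.environ["GROQ_API_KEY"] = "your-groq-key"

# The same OpenAI-style call shape works across providers;
# the "groq/" prefix routes the request to Groq.
response = completion(
    model="groq/llama3-8b-8192",
    messages=[{"role": "user", "content": "Hello from LiteLLM"}],
)
print(response.choices[0].message.content)
```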
diff --git a/docs/my-website/docs/providers/custom_llm_server.md b/docs/my-website/docs/providers/custom_llm_server.md
new file mode 100644
index 000000000..70fc4cea5
--- /dev/null
+++ b/docs/my-website/docs/providers/custom_llm_server.md
@@ -0,0 +1,168 @@
+# Custom API Server (Custom Format)
+
+LiteLLM lets you call your custom endpoint using the OpenAI ChatCompletion format.
+
+:::info
+
+To call an OpenAI-compatible endpoint, [go here](./openai_compatible.md).
+:::
+
+## Quick Start
+
+```python
+import litellm
+from litellm import CustomLLM, completion, get_llm_provider
+
+
+class MyCustomLLM(CustomLLM):
+    def completion(self, *args, **kwargs) -> litellm.ModelResponse:
+        return litellm.completion(
+            model="gpt-3.5-turbo",
+            messages=[{"role": "user", "content": "Hello world"}],
+            mock_response="Hi!",
+        )  # type: ignore
+
+
+my_custom_llm = MyCustomLLM()
+
+litellm.custom_provider_map = [  # 👈 KEY STEP - REGISTER HANDLER
+    {"provider": "my-custom-llm", "custom_handler": my_custom_llm}
+]
+
+resp = completion(
+    model="my-custom-llm/my-fake-model",
+    messages=[{"role": "user", "content": "Hello world!"}],
+)
+
+assert resp.choices[0].message.content == "Hi!"
+```
+
+## OpenAI Proxy Usage
+
+1. Set up your `custom_handler.py` file
+
+```python
+import litellm
+from litellm import CustomLLM, completion, get_llm_provider
+
+
+class MyCustomLLM(CustomLLM):
+    def completion(self, *args, **kwargs) -> litellm.ModelResponse:
+        return litellm.completion(
+            model="gpt-3.5-turbo",
+            messages=[{"role": "user", "content": "Hello world"}],
+            mock_response="Hi!",
+        )  # type: ignore
+
+    async def acompletion(self, *args, **kwargs) -> litellm.ModelResponse:
+        return litellm.completion(
+            model="gpt-3.5-turbo",
+            messages=[{"role": "user", "content": "Hello world"}],
+            mock_response="Hi!",
+        )  # type: ignore
+
+
+my_custom_llm = MyCustomLLM()
+```
+
+2. Add to `config.yaml`
+
+In the config below, we pass:
+
+- python_filename: `custom_handler.py`
+- custom_handler_instance_name: `my_custom_llm` (defined in Step 1)
+- custom_handler: `custom_handler.my_custom_llm`
+
+```yaml
+model_list:
+  - model_name: "test-model"
+    litellm_params:
+      model: "openai/text-embedding-ada-002"
+  - model_name: "my-custom-model"
+    litellm_params:
+      model: "my-custom-llm/my-model"
+
+litellm_settings:
+  custom_provider_map:
+  - {"provider": "my-custom-llm", "custom_handler": custom_handler.my_custom_llm}
+```
+
+```bash
+litellm --config /path/to/config.yaml
+```
+
+3. Test it!
+ +```bash +curl -X POST 'http://0.0.0.0:4000/chat/completions' \ +-H 'Content-Type: application/json' \ +-H 'Authorization: Bearer sk-1234' \ +-d '{ + "model": "my-custom-model", + "messages": [{"role": "user", "content": "Say \"this is a test\" in JSON!"}], +}' +``` + +Expected Response + +``` +{ + "id": "chatcmpl-06f1b9cd-08bc-43f7-9814-a69173921216", + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "message": { + "content": "Hi!", + "role": "assistant", + "tool_calls": null, + "function_call": null + } + } + ], + "created": 1721955063, + "model": "gpt-3.5-turbo", + "object": "chat.completion", + "system_fingerprint": null, + "usage": { + "prompt_tokens": 10, + "completion_tokens": 20, + "total_tokens": 30 + } +} +``` + +## Custom Handler Spec + +```python +from litellm.types.utils import GenericStreamingChunk, ModelResponse +from typing import Iterator, AsyncIterator +from litellm.llms.base import BaseLLM + +class CustomLLMError(Exception): # use this for all your exceptions + def __init__( + self, + status_code, + message, + ): + self.status_code = status_code + self.message = message + super().__init__( + self.message + ) # Call the base class constructor with the parameters it needs + +class CustomLLM(BaseLLM): + def __init__(self) -> None: + super().__init__() + + def completion(self, *args, **kwargs) -> ModelResponse: + raise CustomLLMError(status_code=500, message="Not implemented yet!") + + def streaming(self, *args, **kwargs) -> Iterator[GenericStreamingChunk]: + raise CustomLLMError(status_code=500, message="Not implemented yet!") + + async def acompletion(self, *args, **kwargs) -> ModelResponse: + raise CustomLLMError(status_code=500, message="Not implemented yet!") + + async def astreaming(self, *args, **kwargs) -> AsyncIterator[GenericStreamingChunk]: + raise CustomLLMError(status_code=500, message="Not implemented yet!") +``` diff --git a/docs/my-website/docs/providers/custom_openai_proxy.md b/docs/my-website/docs/providers/custom_openai_proxy.md deleted file mode 100644 index b6f2eccac..000000000 --- a/docs/my-website/docs/providers/custom_openai_proxy.md +++ /dev/null @@ -1,129 +0,0 @@ -# Custom API Server (OpenAI Format) - -LiteLLM allows you to call your custom endpoint in the OpenAI ChatCompletion format - -## API KEYS -No api keys required - -## Set up your Custom API Server -Your server should have the following Endpoints: - -Here's an example OpenAI proxy server with routes: https://replit.com/@BerriAI/openai-proxy#main.py - -### Required Endpoints -- POST `/chat/completions` - chat completions endpoint - -### Optional Endpoints -- POST `/completions` - completions endpoint -- Get `/models` - available models on server -- POST `/embeddings` - creates an embedding vector representing the input text. 
- - -## Example Usage - -### Call `/chat/completions` -In order to use your custom OpenAI Chat Completion proxy with LiteLLM, ensure you set - -* `api_base` to your proxy url, example "https://openai-proxy.berriai.repl.co" -* `custom_llm_provider` to `openai` this ensures litellm uses the `openai.ChatCompletion` to your api_base - -```python -import os -from litellm import completion - -## set ENV variables -os.environ["OPENAI_API_KEY"] = "anything" #key is not used for proxy - -messages = [{ "content": "Hello, how are you?","role": "user"}] - -response = completion( - model="command-nightly", - messages=[{ "content": "Hello, how are you?","role": "user"}], - api_base="https://openai-proxy.berriai.repl.co", - custom_llm_provider="openai" # litellm will use the openai.ChatCompletion to make the request - -) -print(response) -``` - -#### Response -```json -{ - "object": - "chat.completion", - "choices": [{ - "finish_reason": "stop", - "index": 0, - "message": { - "content": - "The sky, a canvas of blue,\nA work of art, pure and true,\nA", - "role": "assistant" - } - }], - "id": - "chatcmpl-7fbd6077-de10-4cb4-a8a4-3ef11a98b7c8", - "created": - 1699290237.408061, - "model": - "togethercomputer/llama-2-70b-chat", - "usage": { - "completion_tokens": 18, - "prompt_tokens": 14, - "total_tokens": 32 - } - } -``` - - -### Call `/completions` -In order to use your custom OpenAI Completion proxy with LiteLLM, ensure you set - -* `api_base` to your proxy url, example "https://openai-proxy.berriai.repl.co" -* `custom_llm_provider` to `text-completion-openai` this ensures litellm uses the `openai.Completion` to your api_base - -```python -import os -from litellm import completion - -## set ENV variables -os.environ["OPENAI_API_KEY"] = "anything" #key is not used for proxy - -messages = [{ "content": "Hello, how are you?","role": "user"}] - -response = completion( - model="command-nightly", - messages=[{ "content": "Hello, how are you?","role": "user"}], - api_base="https://openai-proxy.berriai.repl.co", - custom_llm_provider="text-completion-openai" # litellm will use the openai.Completion to make the request - -) -print(response) -``` - -#### Response -```json -{ - "warning": - "This model version is deprecated. Migrate before January 4, 2024 to avoid disruption of service. Learn more https://platform.openai.com/docs/deprecations", - "id": - "cmpl-8HxHqF5dymQdALmLplS0dWKZVFe3r", - "object": - "text_completion", - "created": - 1699290166, - "model": - "text-davinci-003", - "choices": [{ - "text": - "\n\nThe weather in San Francisco varies depending on what time of year and time", - "index": 0, - "logprobs": None, - "finish_reason": "length" - }], - "usage": { - "prompt_tokens": 7, - "completion_tokens": 16, - "total_tokens": 23 - } - } -``` \ No newline at end of file diff --git a/docs/my-website/docs/providers/friendliai.md b/docs/my-website/docs/providers/friendliai.md new file mode 100644 index 000000000..137c3dde3 --- /dev/null +++ b/docs/my-website/docs/providers/friendliai.md @@ -0,0 +1,60 @@ +# FriendliAI +https://suite.friendli.ai/ + +**We support ALL FriendliAI models, just set `friendliai/` as a prefix when sending completion requests** + +## API Key +```python +# env variable +os.environ['FRIENDLI_TOKEN'] +os.environ['FRIENDLI_API_BASE'] # Optional. Set this when using dedicated endpoint. 
+``` + +## Sample Usage +```python +from litellm import completion +import os + +os.environ['FRIENDLI_TOKEN'] = "" +response = completion( + model="friendliai/mixtral-8x7b-instruct-v0-1", + messages=[ + {"role": "user", "content": "hello from litellm"} + ], +) +print(response) +``` + +## Sample Usage - Streaming +```python +from litellm import completion +import os + +os.environ['FRIENDLI_TOKEN'] = "" +response = completion( + model="friendliai/mixtral-8x7b-instruct-v0-1", + messages=[ + {"role": "user", "content": "hello from litellm"} + ], + stream=True +) + +for chunk in response: + print(chunk) +``` + + +## Supported Models +### Serverless Endpoints +We support ALL FriendliAI AI models, just set `friendliai/` as a prefix when sending completion requests + +| Model Name | Function Call | +|--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| mixtral-8x7b-instruct | `completion(model="friendliai/mixtral-8x7b-instruct-v0-1", messages)` | +| meta-llama-3-8b-instruct | `completion(model="friendliai/meta-llama-3-8b-instruct", messages)` | +| meta-llama-3-70b-instruct | `completion(model="friendliai/meta-llama-3-70b-instruct", messages)` | + +### Dedicated Endpoints +``` +model="friendliai/$ENDPOINT_ID:$ADAPTER_ROUTE" +``` diff --git a/docs/my-website/docs/providers/groq.md b/docs/my-website/docs/providers/groq.md index bfb944cb4..37d63d031 100644 --- a/docs/my-website/docs/providers/groq.md +++ b/docs/my-website/docs/providers/groq.md @@ -148,8 +148,11 @@ print(response) ## Supported Models - ALL Groq Models Supported! We support ALL Groq models, just set `groq/` as a prefix when sending completion requests -| Model Name | Function Call | +| Model Name | Usage | |--------------------|---------------------------------------------------------| +| llama-3.1-8b-instant | `completion(model="groq/llama-3.1-8b-instant", messages)` | +| llama-3.1-70b-versatile | `completion(model="groq/llama-3.1-70b-versatile", messages)` | +| llama-3.1-405b-reasoning | `completion(model="groq/llama-3.1-405b-reasoning", messages)` | | llama3-8b-8192 | `completion(model="groq/llama3-8b-8192", messages)` | | llama3-70b-8192 | `completion(model="groq/llama3-70b-8192", messages)` | | llama2-70b-4096 | `completion(model="groq/llama2-70b-4096", messages)` | diff --git a/docs/my-website/docs/providers/mistral.md b/docs/my-website/docs/providers/mistral.md index 21e3a9d54..62a91c687 100644 --- a/docs/my-website/docs/providers/mistral.md +++ b/docs/my-website/docs/providers/mistral.md @@ -148,7 +148,8 @@ All models listed here https://docs.mistral.ai/platform/endpoints are supported. 
|----------------|--------------------------------------------------------------| | Mistral Small | `completion(model="mistral/mistral-small-latest", messages)` | | Mistral Medium | `completion(model="mistral/mistral-medium-latest", messages)`| -| Mistral Large | `completion(model="mistral/mistral-large-latest", messages)` | +| Mistral Large 2 | `completion(model="mistral/mistral-large-2407", messages)` | +| Mistral Large Latest | `completion(model="mistral/mistral-large-latest", messages)` | | Mistral 7B | `completion(model="mistral/open-mistral-7b", messages)` | | Mixtral 8x7B | `completion(model="mistral/open-mixtral-8x7b", messages)` | | Mixtral 8x22B | `completion(model="mistral/open-mixtral-8x22b", messages)` | diff --git a/docs/my-website/docs/proxy/caching.md b/docs/my-website/docs/proxy/caching.md index 6769ec6c5..ded8333f0 100644 --- a/docs/my-website/docs/proxy/caching.md +++ b/docs/my-website/docs/proxy/caching.md @@ -59,6 +59,8 @@ litellm_settings: cache_params: # set cache params for redis type: redis ttl: 600 # will be cached on redis for 600s + # default_in_memory_ttl: Optional[float], default is None. time in seconds. + # default_in_redis_ttl: Optional[float], default is None. time in seconds. ``` @@ -613,6 +615,11 @@ litellm_settings: ```yaml cache_params: + # ttl + ttl: Optional[float] + default_in_memory_ttl: Optional[float] + default_in_redis_ttl: Optional[float] + # Type of cache (options: "local", "redis", "s3") type: s3 @@ -628,6 +635,8 @@ cache_params: host: localhost # Redis server hostname or IP address port: "6379" # Redis server port (as a string) password: secret_password # Redis server password + namespace: Optional[str] = None, + # S3 cache parameters s3_bucket_name: your_s3_bucket_name # Name of the S3 bucket diff --git a/docs/my-website/docs/proxy/enterprise.md b/docs/my-website/docs/proxy/enterprise.md index 5b97dc14e..3607cb07f 100644 --- a/docs/my-website/docs/proxy/enterprise.md +++ b/docs/my-website/docs/proxy/enterprise.md @@ -23,9 +23,9 @@ Features: - ✅ [Use LiteLLM keys/authentication on Pass Through Endpoints](pass_through#✨-enterprise---use-litellm-keysauthentication-on-pass-through-endpoints) - ✅ Set Max Request / File Size on Requests - ✅ [Enforce Required Params for LLM Requests (ex. Reject requests missing ["metadata"]["generation_name"])](#enforce-required-params-for-llm-requests) -- **Spend Tracking** +- **Enterprise Spend Tracking Features** - ✅ [Tracking Spend for Custom Tags](#tracking-spend-for-custom-tags) - - ✅ [API Endpoints to get Spend Reports per Team, API Key, Customer](cost_tracking.md#✨-enterprise-api-endpoints-to-get-spend) + - ✅ [`/spend/report` API endpoint](cost_tracking.md#✨-enterprise-api-endpoints-to-get-spend) - **Advanced Metrics** - ✅ [`x-ratelimit-remaining-requests`, `x-ratelimit-remaining-tokens` for LLM APIs on Prometheus](prometheus#✨-enterprise-llm-remaining-requests-and-remaining-tokens) - **Guardrails, PII Masking, Content Moderation** diff --git a/docs/my-website/docs/proxy/health.md b/docs/my-website/docs/proxy/health.md index 6d383fc41..632702b91 100644 --- a/docs/my-website/docs/proxy/health.md +++ b/docs/my-website/docs/proxy/health.md @@ -41,28 +41,6 @@ litellm --health } ``` -### Background Health Checks - -You can enable model health checks being run in the background, to prevent each model from being queried too frequently via `/health`. - -Here's how to use it: -1. 
in the config.yaml add: -``` -general_settings: - background_health_checks: True # enable background health checks - health_check_interval: 300 # frequency of background health checks -``` - -2. Start server -``` -$ litellm /path/to/config.yaml -``` - -3. Query health endpoint: -``` -curl --location 'http://0.0.0.0:4000/health' -``` - ### Embedding Models We need some way to know if the model is an embedding model when running checks, if you have this in your config, specifying mode it makes an embedding health check @@ -124,6 +102,41 @@ model_list: mode: audio_transcription ``` + +### Text to Speech Models + +```yaml +# OpenAI Text to Speech Models + - model_name: tts + litellm_params: + model: openai/tts-1 + api_key: "os.environ/OPENAI_API_KEY" + model_info: + mode: audio_speech +``` + +## Background Health Checks + +You can enable model health checks being run in the background, to prevent each model from being queried too frequently via `/health`. + +Here's how to use it: +1. in the config.yaml add: +``` +general_settings: + background_health_checks: True # enable background health checks + health_check_interval: 300 # frequency of background health checks +``` + +2. Start server +``` +$ litellm /path/to/config.yaml +``` + +3. Query health endpoint: +``` +curl --location 'http://0.0.0.0:4000/health' +``` + ### Hide details The health check response contains details like endpoint URLs, error messages, diff --git a/docs/my-website/docs/proxy/reliability.md b/docs/my-website/docs/proxy/reliability.md index 2404c744c..a3f03b3d7 100644 --- a/docs/my-website/docs/proxy/reliability.md +++ b/docs/my-website/docs/proxy/reliability.md @@ -31,8 +31,19 @@ model_list: api_base: https://openai-france-1234.openai.azure.com/ api_key: rpm: 1440 +routing_strategy: simple-shuffle # Literal["simple-shuffle", "least-busy", "usage-based-routing","latency-based-routing"], default="simple-shuffle" + model_group_alias: {"gpt-4": "gpt-3.5-turbo"} # all requests with `gpt-4` will be routed to models with `gpt-3.5-turbo` + num_retries: 2 + timeout: 30 # 30 seconds + redis_host: # set this when using multiple litellm proxy deployments, load balancing state stored in redis + redis_password: + redis_port: 1992 ``` +:::info +Detailed information about [routing strategies can be found here](../routing) +::: + #### Step 2: Start Proxy with config ```shell diff --git a/docs/my-website/sidebars.js b/docs/my-website/sidebars.js index c3f7e9249..c1ce83068 100644 --- a/docs/my-website/sidebars.js +++ b/docs/my-website/sidebars.js @@ -158,6 +158,7 @@ const sidebars = { "providers/triton-inference-server", "providers/ollama", "providers/perplexity", + "providers/friendliai", "providers/groq", "providers/deepseek", "providers/fireworks_ai", @@ -174,7 +175,8 @@ const sidebars = { "providers/aleph_alpha", "providers/baseten", "providers/openrouter", - "providers/custom_openai_proxy", + // "providers/custom_openai_proxy", + "providers/custom_llm_server", "providers/petals", ], diff --git a/litellm/__init__.py b/litellm/__init__.py index 956834afc..b6aacad1a 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -813,6 +813,7 @@ from .utils import ( ) from .types.utils import ImageObject +from .llms.custom_llm import CustomLLM from .llms.huggingface_restapi import HuggingfaceConfig from .llms.anthropic import AnthropicConfig from .llms.databricks import DatabricksConfig, DatabricksEmbeddingConfig @@ -909,3 +910,12 @@ from .cost_calculator import response_cost_calculator, cost_per_token from .types.adapter import AdapterItem 
adapters: List[AdapterItem] = [] + +### CUSTOM LLMs ### +from .types.llms.custom_llm import CustomLLMItem +from .types.utils import GenericStreamingChunk + +custom_provider_map: List[CustomLLMItem] = [] +_custom_providers: List[str] = ( + [] +) # internal helper util, used to track names of custom providers diff --git a/litellm/llms/azure.py b/litellm/llms/azure.py index a2928cf20..ec143f3fe 100644 --- a/litellm/llms/azure.py +++ b/litellm/llms/azure.py @@ -1864,6 +1864,23 @@ class AzureChatCompletion(BaseLLM): model=model, # type: ignore prompt=prompt, # type: ignore ) + elif mode == "audio_transcription": + # Get the current directory of the file being run + pwd = os.path.dirname(os.path.realpath(__file__)) + file_path = os.path.join(pwd, "../tests/gettysburg.wav") + audio_file = open(file_path, "rb") + completion = await client.audio.transcriptions.with_raw_response.create( + file=audio_file, + model=model, # type: ignore + prompt=prompt, # type: ignore + ) + elif mode == "audio_speech": + # Get the current directory of the file being run + completion = await client.audio.speech.with_raw_response.create( + model=model, # type: ignore + input=prompt, # type: ignore + voice="alloy", + ) else: raise Exception("mode not set") response = {} diff --git a/litellm/llms/bedrock_httpx.py b/litellm/llms/bedrock_httpx.py index 16c3f60b7..fbb51fb93 100644 --- a/litellm/llms/bedrock_httpx.py +++ b/litellm/llms/bedrock_httpx.py @@ -78,6 +78,8 @@ BEDROCK_CONVERSE_MODELS = [ "ai21.jamba-instruct-v1:0", "meta.llama3-1-8b-instruct-v1:0", "meta.llama3-1-70b-instruct-v1:0", + "meta.llama3-1-405b-instruct-v1:0", + "mistral.mistral-large-2407-v1:0", ] @@ -1315,6 +1317,7 @@ class AmazonConverseConfig: model.startswith("anthropic") or model.startswith("mistral") or model.startswith("cohere") + or model.startswith("meta.llama3-1") ): supported_params.append("tools") diff --git a/litellm/llms/custom_llm.py b/litellm/llms/custom_llm.py new file mode 100644 index 000000000..47c5a485c --- /dev/null +++ b/litellm/llms/custom_llm.py @@ -0,0 +1,161 @@ +# What is this? 
+## Handler file for a Custom Chat LLM + +""" +- completion +- acompletion +- streaming +- async_streaming +""" + +import copy +import json +import os +import time +import types +from enum import Enum +from functools import partial +from typing import ( + Any, + AsyncGenerator, + AsyncIterator, + Callable, + Coroutine, + Iterator, + List, + Literal, + Optional, + Tuple, + Union, +) + +import httpx # type: ignore +import requests # type: ignore + +import litellm +from litellm.litellm_core_utils.core_helpers import map_finish_reason +from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler +from litellm.types.utils import GenericStreamingChunk, ProviderField +from litellm.utils import CustomStreamWrapper, EmbeddingResponse, ModelResponse, Usage + +from .base import BaseLLM +from .prompt_templates.factory import custom_prompt, prompt_factory + + +class CustomLLMError(Exception): # use this for all your exceptions + def __init__( + self, + status_code, + message, + ): + self.status_code = status_code + self.message = message + super().__init__( + self.message + ) # Call the base class constructor with the parameters it needs + + +class CustomLLM(BaseLLM): + def __init__(self) -> None: + super().__init__() + + def completion( + self, + model: str, + messages: list, + api_base: str, + custom_prompt_dict: dict, + model_response: ModelResponse, + print_verbose: Callable, + encoding, + api_key, + logging_obj, + optional_params: dict, + acompletion=None, + litellm_params=None, + logger_fn=None, + headers={}, + timeout: Optional[Union[float, httpx.Timeout]] = None, + client: Optional[HTTPHandler] = None, + ) -> ModelResponse: + raise CustomLLMError(status_code=500, message="Not implemented yet!") + + def streaming( + self, + model: str, + messages: list, + api_base: str, + custom_prompt_dict: dict, + model_response: ModelResponse, + print_verbose: Callable, + encoding, + api_key, + logging_obj, + optional_params: dict, + acompletion=None, + litellm_params=None, + logger_fn=None, + headers={}, + timeout: Optional[Union[float, httpx.Timeout]] = None, + client: Optional[HTTPHandler] = None, + ) -> Iterator[GenericStreamingChunk]: + raise CustomLLMError(status_code=500, message="Not implemented yet!") + + async def acompletion( + self, + model: str, + messages: list, + api_base: str, + custom_prompt_dict: dict, + model_response: ModelResponse, + print_verbose: Callable, + encoding, + api_key, + logging_obj, + optional_params: dict, + acompletion=None, + litellm_params=None, + logger_fn=None, + headers={}, + timeout: Optional[Union[float, httpx.Timeout]] = None, + client: Optional[AsyncHTTPHandler] = None, + ) -> ModelResponse: + raise CustomLLMError(status_code=500, message="Not implemented yet!") + + async def astreaming( + self, + model: str, + messages: list, + api_base: str, + custom_prompt_dict: dict, + model_response: ModelResponse, + print_verbose: Callable, + encoding, + api_key, + logging_obj, + optional_params: dict, + acompletion=None, + litellm_params=None, + logger_fn=None, + headers={}, + timeout: Optional[Union[float, httpx.Timeout]] = None, + client: Optional[AsyncHTTPHandler] = None, + ) -> AsyncIterator[GenericStreamingChunk]: + raise CustomLLMError(status_code=500, message="Not implemented yet!") + + +def custom_chat_llm_router( + async_fn: bool, stream: Optional[bool], custom_llm: CustomLLM +): + """ + Routes call to CustomLLM completion/acompletion/streaming/astreaming functions, based on call type + + Validates if response is in expected format + """ + if 
async_fn: + if stream: + return custom_llm.astreaming + return custom_llm.acompletion + if stream: + return custom_llm.streaming + return custom_llm.completion diff --git a/litellm/llms/openai.py b/litellm/llms/openai.py index 25e2e518c..fae8a448a 100644 --- a/litellm/llms/openai.py +++ b/litellm/llms/openai.py @@ -1,5 +1,6 @@ import hashlib import json +import os import time import traceback import types @@ -1870,8 +1871,25 @@ class OpenAIChatCompletion(BaseLLM): model=model, # type: ignore prompt=prompt, # type: ignore ) + elif mode == "audio_transcription": + # Get the current directory of the file being run + pwd = os.path.dirname(os.path.realpath(__file__)) + file_path = os.path.join(pwd, "../tests/gettysburg.wav") + audio_file = open(file_path, "rb") + completion = await client.audio.transcriptions.with_raw_response.create( + file=audio_file, + model=model, # type: ignore + prompt=prompt, # type: ignore + ) + elif mode == "audio_speech": + # Get the current directory of the file being run + completion = await client.audio.speech.with_raw_response.create( + model=model, # type: ignore + input=prompt, # type: ignore + voice="alloy", + ) else: - raise Exception("mode not set") + raise ValueError("mode not set, passed in mode: " + mode) response = {} if completion is None or not hasattr(completion, "headers"): diff --git a/litellm/llms/replicate.py b/litellm/llms/replicate.py index 1dd29fd7d..0d129ce02 100644 --- a/litellm/llms/replicate.py +++ b/litellm/llms/replicate.py @@ -387,7 +387,7 @@ def process_response( result = " " ## Building RESPONSE OBJECT - if len(result) > 1: + if len(result) >= 1: model_response.choices[0].message.content = result # type: ignore # Calculate usage diff --git a/litellm/main.py b/litellm/main.py index 35fad5e02..672029f69 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -107,6 +107,7 @@ from .llms.anthropic_text import AnthropicTextCompletion from .llms.azure import AzureChatCompletion from .llms.azure_text import AzureTextCompletion from .llms.bedrock_httpx import BedrockConverseLLM, BedrockLLM +from .llms.custom_llm import CustomLLM, custom_chat_llm_router from .llms.databricks import DatabricksChatCompletion from .llms.huggingface_restapi import Huggingface from .llms.openai import OpenAIChatCompletion, OpenAITextCompletion @@ -381,6 +382,7 @@ async def acompletion( or custom_llm_provider == "clarifai" or custom_llm_provider == "watsonx" or custom_llm_provider in litellm.openai_compatible_providers + or custom_llm_provider in litellm._custom_providers ): # currently implemented aiohttp calls for just azure, openai, hf, ollama, vertex ai soon all. init_response = await loop.run_in_executor(None, func_with_context) if isinstance(init_response, dict) or isinstance( @@ -2690,6 +2692,54 @@ def completion( model_response.created = int(time.time()) model_response.model = model response = model_response + elif ( + custom_llm_provider in litellm._custom_providers + ): # Assume custom LLM provider + # Get the Custom Handler + custom_handler: Optional[CustomLLM] = None + for item in litellm.custom_provider_map: + if item["provider"] == custom_llm_provider: + custom_handler = item["custom_handler"] + + if custom_handler is None: + raise ValueError( + f"Unable to map your input to a model. 
Check your input - {args}" + ) + + ## ROUTE LLM CALL ## + handler_fn = custom_chat_llm_router( + async_fn=acompletion, stream=stream, custom_llm=custom_handler + ) + + headers = headers or litellm.headers + + ## CALL FUNCTION + response = handler_fn( + model=model, + messages=messages, + headers=headers, + model_response=model_response, + print_verbose=print_verbose, + api_key=api_key, + api_base=api_base, + acompletion=acompletion, + logging_obj=logging, + optional_params=optional_params, + litellm_params=litellm_params, + logger_fn=logger_fn, + timeout=timeout, # type: ignore + custom_prompt_dict=custom_prompt_dict, + client=client, # pass AsyncOpenAI, OpenAI client + encoding=encoding, + ) + if stream is True: + return CustomStreamWrapper( + completion_stream=response, + model=model, + custom_llm_provider=custom_llm_provider, + logging_obj=logging, + ) + else: raise ValueError( f"Unable to map your input to a model. Check your input - {args}" @@ -3833,7 +3883,7 @@ def text_completion( optional_params["custom_llm_provider"] = custom_llm_provider # get custom_llm_provider - _, custom_llm_provider, dynamic_api_key, api_base = get_llm_provider(model=model, custom_llm_provider=custom_llm_provider, api_base=api_base) # type: ignore + _model, custom_llm_provider, dynamic_api_key, api_base = get_llm_provider(model=model, custom_llm_provider=custom_llm_provider, api_base=api_base) # type: ignore if custom_llm_provider == "huggingface": # if echo == True, for TGI llms we need to set top_n_tokens to 3 @@ -3916,10 +3966,12 @@ def text_completion( kwargs.pop("prompt", None) - if model is not None and model.startswith( - "openai/" + if ( + _model is not None and custom_llm_provider == "openai" ): # for openai compatible endpoints - e.g. vllm, call the native /v1/completions endpoint for text completion calls - model = model.replace("openai/", "text-completion-openai/") + if _model not in litellm.open_ai_chat_completion_models: + model = "text-completion-openai/" + _model + optional_params.pop("custom_llm_provider", None) kwargs["text_completion"] = True response = completion( diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json index 08bc292c9..d4985bffd 100644 --- a/litellm/model_prices_and_context_window_backup.json +++ b/litellm/model_prices_and_context_window_backup.json @@ -893,11 +893,11 @@ "mode": "chat" }, "mistral/mistral-large-latest": { - "max_tokens": 8191, - "max_input_tokens": 32000, - "max_output_tokens": 8191, - "input_cost_per_token": 0.000004, - "output_cost_per_token": 0.000012, + "max_tokens": 128000, + "max_input_tokens": 128000, + "max_output_tokens": 128000, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.000009, "litellm_provider": "mistral", "mode": "chat", "supports_function_calling": true @@ -912,6 +912,16 @@ "mode": "chat", "supports_function_calling": true }, + "mistral/mistral-large-2407": { + "max_tokens": 128000, + "max_input_tokens": 128000, + "max_output_tokens": 128000, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.000009, + "litellm_provider": "mistral", + "mode": "chat", + "supports_function_calling": true + }, "mistral/open-mistral-7b": { "max_tokens": 8191, "max_input_tokens": 32000, @@ -1094,6 +1104,36 @@ "mode": "chat", "supports_function_calling": true }, + "groq/llama-3.1-8b-instant": { + "max_tokens": 8192, + "max_input_tokens": 8192, + "max_output_tokens": 8192, + "input_cost_per_token": 0.00000059, + "output_cost_per_token": 0.00000079, + 
"litellm_provider": "groq", + "mode": "chat", + "supports_function_calling": true + }, + "groq/llama-3.1-70b-versatile": { + "max_tokens": 8192, + "max_input_tokens": 8192, + "max_output_tokens": 8192, + "input_cost_per_token": 0.00000059, + "output_cost_per_token": 0.00000079, + "litellm_provider": "groq", + "mode": "chat", + "supports_function_calling": true + }, + "groq/llama-3.1-405b-reasoning": { + "max_tokens": 8192, + "max_input_tokens": 8192, + "max_output_tokens": 8192, + "input_cost_per_token": 0.00000059, + "output_cost_per_token": 0.00000079, + "litellm_provider": "groq", + "mode": "chat", + "supports_function_calling": true + }, "groq/mixtral-8x7b-32768": { "max_tokens": 32768, "max_input_tokens": 32768, @@ -2956,6 +2996,15 @@ "litellm_provider": "bedrock", "mode": "chat" }, + "mistral.mistral-large-2407-v1:0": { + "max_tokens": 8191, + "max_input_tokens": 128000, + "max_output_tokens": 8191, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.000009, + "litellm_provider": "bedrock", + "mode": "chat" + }, "bedrock/us-west-2/mistral.mixtral-8x7b-instruct-v0:1": { "max_tokens": 8191, "max_input_tokens": 32000, @@ -3691,6 +3740,15 @@ "litellm_provider": "bedrock", "mode": "chat" }, + "meta.llama3-1-405b-instruct-v1:0": { + "max_tokens": 128000, + "max_input_tokens": 128000, + "max_output_tokens": 4096, + "input_cost_per_token": 0.00000532, + "output_cost_per_token": 0.000016, + "litellm_provider": "bedrock", + "mode": "chat" + }, "512-x-512/50-steps/stability.stable-diffusion-xl-v0": { "max_tokens": 77, "max_input_tokens": 77, diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml index 13babaac6..173624c25 100644 --- a/litellm/proxy/_new_secret_config.yaml +++ b/litellm/proxy/_new_secret_config.yaml @@ -1,9 +1,11 @@ model_list: - - model_name: "*" # all requests where model not in your config go to this deployment + - model_name: "test-model" litellm_params: - model: "openai/*" # passes our validation check that a real provider is given - api_key: "" + model: "openai/text-embedding-ada-002" + - model_name: "my-custom-model" + litellm_params: + model: "my-custom-llm/my-model" litellm_settings: - cache: True - \ No newline at end of file + custom_provider_map: + - {"provider": "my-custom-llm", "custom_handler": custom_handler.my_custom_llm} diff --git a/litellm/proxy/custom_handler.py b/litellm/proxy/custom_handler.py new file mode 100644 index 000000000..56943c34d --- /dev/null +++ b/litellm/proxy/custom_handler.py @@ -0,0 +1,21 @@ +import litellm +from litellm import CustomLLM, completion, get_llm_provider + + +class MyCustomLLM(CustomLLM): + def completion(self, *args, **kwargs) -> litellm.ModelResponse: + return litellm.completion( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": "Hello world"}], + mock_response="Hi!", + ) # type: ignore + + async def acompletion(self, *args, **kwargs) -> litellm.ModelResponse: + return litellm.completion( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": "Hello world"}], + mock_response="Hi!", + ) # type: ignore + + +my_custom_llm = MyCustomLLM() diff --git a/litellm/proxy/management_endpoints/internal_user_endpoints.py b/litellm/proxy/management_endpoints/internal_user_endpoints.py index 280ff2ad2..b132761ae 100644 --- a/litellm/proxy/management_endpoints/internal_user_endpoints.py +++ b/litellm/proxy/management_endpoints/internal_user_endpoints.py @@ -27,6 +27,7 @@ from litellm._logging import verbose_proxy_logger from litellm.proxy._types import * from 
litellm.proxy.auth.user_api_key_auth import user_api_key_auth from litellm.proxy.management_endpoints.key_management_endpoints import ( + _duration_in_seconds, generate_key_helper_fn, ) from litellm.proxy.management_helpers.utils import ( @@ -486,6 +487,13 @@ async def user_update( ): # models default to [], spend defaults to 0, we should not reset these values non_default_values[k] = v + if "budget_duration" in non_default_values: + duration_s = _duration_in_seconds( + duration=non_default_values["budget_duration"] + ) + user_reset_at = datetime.now(timezone.utc) + timedelta(seconds=duration_s) + non_default_values["budget_reset_at"] = user_reset_at + ## ADD USER, IF NEW ## verbose_proxy_logger.debug("/user/update: Received data = %s", data) if data.user_id is not None and len(data.user_id) > 0: diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index 0e3f0826e..bd8f5bfd0 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -8,6 +8,12 @@ model_list: litellm_params: model: fireworks_ai/accounts/fireworks/models/llama-v3-70b-instruct api_key: "os.environ/FIREWORKS" + - model_name: tts + litellm_params: + model: openai/tts-1 + api_key: "os.environ/OPENAI_API_KEY" + model_info: + mode: audio_speech general_settings: master_key: sk-1234 alerting: ["slack"] diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index 106b95453..bad1abae2 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -1507,6 +1507,21 @@ class ProxyConfig: verbose_proxy_logger.debug( f"litellm.post_call_rules: {litellm.post_call_rules}" ) + elif key == "custom_provider_map": + from litellm.utils import custom_llm_setup + + litellm.custom_provider_map = [ + { + "provider": item["provider"], + "custom_handler": get_instance_fn( + value=item["custom_handler"], + config_file_path=config_file_path, + ), + } + for item in value + ] + + custom_llm_setup() elif key == "success_callback": litellm.success_callback = [] @@ -3334,6 +3349,7 @@ async def embeddings( if ( "input" in data and isinstance(data["input"], list) + and len(data["input"]) > 0 and isinstance(data["input"][0], list) and isinstance(data["input"][0][0], int) ): # check if array of tokens passed in @@ -3464,8 +3480,8 @@ async def embeddings( litellm_debug_info, ) verbose_proxy_logger.error( - "litellm.proxy.proxy_server.embeddings(): Exception occured - {}".format( - str(e) + "litellm.proxy.proxy_server.embeddings(): Exception occured - {}\n{}".format( + str(e), traceback.format_exc() ) ) verbose_proxy_logger.debug(traceback.format_exc()) diff --git a/litellm/router.py b/litellm/router.py index 11ad5fd9e..53013a759 100644 --- a/litellm/router.py +++ b/litellm/router.py @@ -263,7 +263,9 @@ class Router: ) # names of models under litellm_params. ex. 
azure/chatgpt-v-2 self.deployment_latency_map = {} ### CACHING ### - cache_type: Literal["local", "redis"] = "local" # default to an in-memory cache + cache_type: Literal["local", "redis", "redis-semantic", "s3", "disk"] = ( + "local" # default to an in-memory cache + ) redis_cache = None cache_config = {} self.client_ttl = client_ttl diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index f62b2b7ef..6aaf99515 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -2573,21 +2573,17 @@ def test_completion_azure_extra_headers(): http_client = Client() with patch.object(http_client, "send", new=MagicMock()) as mock_client: - client = AzureOpenAI( - azure_endpoint=os.getenv("AZURE_API_BASE"), - api_version=litellm.AZURE_DEFAULT_API_VERSION, - api_key=os.getenv("AZURE_API_KEY"), - http_client=http_client, - ) + litellm.client_session = http_client try: response = completion( model="azure/chatgpt-v-2", messages=messages, - client=client, + api_base=os.getenv("AZURE_API_BASE"), + api_version="2023-07-01-preview", + api_key=os.getenv("AZURE_API_KEY"), extra_headers={ "Authorization": "my-bad-key", "Ocp-Apim-Subscription-Key": "hello-world-testing", - "api-key": "my-bad-key", }, ) print(response) @@ -2603,8 +2599,10 @@ def test_completion_azure_extra_headers(): print(request.url) # This will print the full URL print(request.headers) # This will print the full URL auth_header = request.headers.get("Authorization") + apim_key = request.headers.get("Ocp-Apim-Subscription-Key") print(auth_header) assert auth_header == "my-bad-key" + assert apim_key == "hello-world-testing" def test_completion_azure_ad_token(): @@ -2613,18 +2611,37 @@ def test_completion_azure_ad_token(): # If you want to remove it, speak to Ishaan! # Ishaan will be very disappointed if this test is removed -> this is a standard way to pass api_key + the router + proxy use this from httpx import Client - from openai import AzureOpenAI from litellm import completion - from litellm.llms.custom_httpx.httpx_handler import HTTPHandler - response = completion( - model="azure/chatgpt-v-2", - messages=messages, - # api_key="my-fake-ad-token", - azure_ad_token=os.getenv("AZURE_API_KEY"), - ) - print(response) + litellm.set_verbose = True + + old_key = os.environ["AZURE_API_KEY"] + os.environ.pop("AZURE_API_KEY", None) + + http_client = Client() + + with patch.object(http_client, "send", new=MagicMock()) as mock_client: + litellm.client_session = http_client + try: + response = completion( + model="azure/chatgpt-v-2", + messages=messages, + azure_ad_token="my-special-token", + ) + print(response) + except Exception as e: + pass + finally: + os.environ["AZURE_API_KEY"] = old_key + + mock_client.assert_called_once() + request = mock_client.call_args[0][0] + print(request.method) # This will print 'POST' + print(request.url) # This will print the full URL + print(request.headers) # This will print the full URL + auth_header = request.headers.get("Authorization") + assert auth_header == "Bearer my-special-token" def test_completion_azure_key_completion_arg(): diff --git a/litellm/tests/test_custom_llm.py b/litellm/tests/test_custom_llm.py new file mode 100644 index 000000000..a0f8b569e --- /dev/null +++ b/litellm/tests/test_custom_llm.py @@ -0,0 +1,302 @@ +# What is this? 
+## Unit tests for the CustomLLM class + + +import asyncio +import os +import sys +import time +import traceback + +import openai +import pytest + +sys.path.insert( + 0, os.path.abspath("../..") +) # Adds the parent directory to the system path +import os +from collections import defaultdict +from concurrent.futures import ThreadPoolExecutor +from typing import ( + Any, + AsyncGenerator, + AsyncIterator, + Callable, + Coroutine, + Iterator, + Optional, + Union, +) +from unittest.mock import AsyncMock, MagicMock, patch + +import httpx +from dotenv import load_dotenv + +import litellm +from litellm import ( + ChatCompletionDeltaChunk, + ChatCompletionUsageBlock, + CustomLLM, + GenericStreamingChunk, + ModelResponse, + acompletion, + completion, + get_llm_provider, +) +from litellm.utils import ModelResponseIterator + + +class CustomModelResponseIterator: + def __init__(self, streaming_response: Union[Iterator, AsyncIterator]): + self.streaming_response = streaming_response + + def chunk_parser(self, chunk: Any) -> GenericStreamingChunk: + return GenericStreamingChunk( + text="hello world", + tool_use=None, + is_finished=True, + finish_reason="stop", + usage=ChatCompletionUsageBlock( + prompt_tokens=10, completion_tokens=20, total_tokens=30 + ), + index=0, + ) + + # Sync iterator + def __iter__(self): + return self + + def __next__(self) -> GenericStreamingChunk: + try: + chunk: Any = self.streaming_response.__next__() # type: ignore + except StopIteration: + raise StopIteration + except ValueError as e: + raise RuntimeError(f"Error receiving chunk from stream: {e}") + + try: + return self.chunk_parser(chunk=chunk) + except StopIteration: + raise StopIteration + except ValueError as e: + raise RuntimeError(f"Error parsing chunk: {e},\nReceived chunk: {chunk}") + + # Async iterator + def __aiter__(self): + self.async_response_iterator = self.streaming_response.__aiter__() # type: ignore + return self.streaming_response + + async def __anext__(self) -> GenericStreamingChunk: + try: + chunk = await self.async_response_iterator.__anext__() + except StopAsyncIteration: + raise StopAsyncIteration + except ValueError as e: + raise RuntimeError(f"Error receiving chunk from stream: {e}") + + try: + return self.chunk_parser(chunk=chunk) + except StopIteration: + raise StopIteration + except ValueError as e: + raise RuntimeError(f"Error parsing chunk: {e},\nReceived chunk: {chunk}") + + +class MyCustomLLM(CustomLLM): + def completion( + self, + model: str, + messages: list, + api_base: str, + custom_prompt_dict: dict, + model_response: ModelResponse, + print_verbose: Callable[..., Any], + encoding, + api_key, + logging_obj, + optional_params: dict, + acompletion=None, + litellm_params=None, + logger_fn=None, + headers={}, + timeout: Optional[Union[float, openai.Timeout]] = None, + client: Optional[litellm.HTTPHandler] = None, + ) -> ModelResponse: + return litellm.completion( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": "Hello world"}], + mock_response="Hi!", + ) # type: ignore + + async def acompletion( + self, + model: str, + messages: list, + api_base: str, + custom_prompt_dict: dict, + model_response: ModelResponse, + print_verbose: Callable[..., Any], + encoding, + api_key, + logging_obj, + optional_params: dict, + acompletion=None, + litellm_params=None, + logger_fn=None, + headers={}, + timeout: Optional[Union[float, openai.Timeout]] = None, + client: Optional[litellm.AsyncHTTPHandler] = None, + ) -> litellm.ModelResponse: + return litellm.completion( + model="gpt-3.5-turbo", 
+ messages=[{"role": "user", "content": "Hello world"}], + mock_response="Hi!", + ) # type: ignore + + def streaming( + self, + model: str, + messages: list, + api_base: str, + custom_prompt_dict: dict, + model_response: ModelResponse, + print_verbose: Callable[..., Any], + encoding, + api_key, + logging_obj, + optional_params: dict, + acompletion=None, + litellm_params=None, + logger_fn=None, + headers={}, + timeout: Optional[Union[float, openai.Timeout]] = None, + client: Optional[litellm.HTTPHandler] = None, + ) -> Iterator[GenericStreamingChunk]: + generic_streaming_chunk: GenericStreamingChunk = { + "finish_reason": "stop", + "index": 0, + "is_finished": True, + "text": "Hello world", + "tool_use": None, + "usage": {"completion_tokens": 10, "prompt_tokens": 20, "total_tokens": 30}, + } + + completion_stream = ModelResponseIterator( + model_response=generic_streaming_chunk # type: ignore + ) + custom_iterator = CustomModelResponseIterator( + streaming_response=completion_stream + ) + return custom_iterator + + async def astreaming( # type: ignore + self, + model: str, + messages: list, + api_base: str, + custom_prompt_dict: dict, + model_response: ModelResponse, + print_verbose: Callable[..., Any], + encoding, + api_key, + logging_obj, + optional_params: dict, + acompletion=None, + litellm_params=None, + logger_fn=None, + headers={}, + timeout: Optional[Union[float, openai.Timeout]] = None, + client: Optional[litellm.AsyncHTTPHandler] = None, + ) -> AsyncIterator[GenericStreamingChunk]: # type: ignore + generic_streaming_chunk: GenericStreamingChunk = { + "finish_reason": "stop", + "index": 0, + "is_finished": True, + "text": "Hello world", + "tool_use": None, + "usage": {"completion_tokens": 10, "prompt_tokens": 20, "total_tokens": 30}, + } + + yield generic_streaming_chunk # type: ignore + + +def test_get_llm_provider(): + """""" + from litellm.utils import custom_llm_setup + + my_custom_llm = MyCustomLLM() + litellm.custom_provider_map = [ + {"provider": "custom_llm", "custom_handler": my_custom_llm} + ] + + custom_llm_setup() + + model, provider, _, _ = get_llm_provider(model="custom_llm/my-fake-model") + + assert provider == "custom_llm" + + +def test_simple_completion(): + my_custom_llm = MyCustomLLM() + litellm.custom_provider_map = [ + {"provider": "custom_llm", "custom_handler": my_custom_llm} + ] + resp = completion( + model="custom_llm/my-fake-model", + messages=[{"role": "user", "content": "Hello world!"}], + ) + + assert resp.choices[0].message.content == "Hi!" + + +@pytest.mark.asyncio +async def test_simple_acompletion(): + my_custom_llm = MyCustomLLM() + litellm.custom_provider_map = [ + {"provider": "custom_llm", "custom_handler": my_custom_llm} + ] + resp = await acompletion( + model="custom_llm/my-fake-model", + messages=[{"role": "user", "content": "Hello world!"}], + ) + + assert resp.choices[0].message.content == "Hi!" 
+ + +def test_simple_completion_streaming(): + my_custom_llm = MyCustomLLM() + litellm.custom_provider_map = [ + {"provider": "custom_llm", "custom_handler": my_custom_llm} + ] + resp = completion( + model="custom_llm/my-fake-model", + messages=[{"role": "user", "content": "Hello world!"}], + stream=True, + ) + + for chunk in resp: + print(chunk) + if chunk.choices[0].finish_reason is None: + assert isinstance(chunk.choices[0].delta.content, str) + else: + assert chunk.choices[0].finish_reason == "stop" + + +@pytest.mark.asyncio +async def test_simple_completion_async_streaming(): + my_custom_llm = MyCustomLLM() + litellm.custom_provider_map = [ + {"provider": "custom_llm", "custom_handler": my_custom_llm} + ] + resp = await litellm.acompletion( + model="custom_llm/my-fake-model", + messages=[{"role": "user", "content": "Hello world!"}], + stream=True, + ) + + async for chunk in resp: + print(chunk) + if chunk.choices[0].finish_reason is None: + assert isinstance(chunk.choices[0].delta.content, str) + else: + assert chunk.choices[0].finish_reason == "stop" diff --git a/litellm/tests/test_embedding.py b/litellm/tests/test_embedding.py index e6dd8bbb2..79ba8bc3e 100644 --- a/litellm/tests/test_embedding.py +++ b/litellm/tests/test_embedding.py @@ -206,6 +206,9 @@ def test_openai_azure_embedding_with_oidc_and_cf(): os.environ["AZURE_TENANT_ID"] = "17c0a27a-1246-4aa1-a3b6-d294e80e783c" os.environ["AZURE_CLIENT_ID"] = "4faf5422-b2bd-45e8-a6d7-46543a38acd0" + old_key = os.environ["AZURE_API_KEY"] + os.environ.pop("AZURE_API_KEY", None) + try: response = embedding( model="azure/text-embedding-ada-002", @@ -218,6 +221,8 @@ def test_openai_azure_embedding_with_oidc_and_cf(): except Exception as e: pytest.fail(f"Error occurred: {e}") + finally: + os.environ["AZURE_API_KEY"] = old_key def test_openai_azure_embedding_optional_arg(mocker): @@ -673,17 +678,3 @@ async def test_databricks_embeddings(sync_mode): # print(response) # local_proxy_embeddings() - - -def test_embedding_azure_ad_token(): - # this tests if we can pass api_key to completion, when it's not in the env. - # DO NOT REMOVE THIS TEST. No MATTER WHAT Happens! - # If you want to remove it, speak to Ishaan! 
- # Ishaan will be very disappointed if this test is removed -> this is a standard way to pass api_key + the router + proxy use this - - response = embedding( - model="azure/azure-embedding-model", - input=["good morning from litellm"], - azure_ad_token=os.getenv("AZURE_API_KEY"), - ) - print(response) diff --git a/litellm/tests/test_get_llm_provider.py b/litellm/tests/test_get_llm_provider.py index e443830b2..3ec867af4 100644 --- a/litellm/tests/test_get_llm_provider.py +++ b/litellm/tests/test_get_llm_provider.py @@ -1,14 +1,18 @@ -import sys, os +import os +import sys import traceback + from dotenv import load_dotenv load_dotenv() -import os, io +import io +import os sys.path.insert( 0, os.path.abspath("../..") ) # Adds the parent directory to the system path import pytest + import litellm @@ -21,6 +25,12 @@ def test_get_llm_provider(): # test_get_llm_provider() +def test_get_llm_provider_gpt_instruct(): + _, response, _, _ = litellm.get_llm_provider(model="gpt-3.5-turbo-instruct-0914") + + assert response == "text-completion-openai" + + def test_get_llm_provider_mistral_custom_api_base(): model, custom_llm_provider, dynamic_api_key, api_base = litellm.get_llm_provider( model="mistral/mistral-large-fr", diff --git a/litellm/tests/test_text_completion.py b/litellm/tests/test_text_completion.py index c6bbf71f2..6a0080b37 100644 --- a/litellm/tests/test_text_completion.py +++ b/litellm/tests/test_text_completion.py @@ -3840,7 +3840,26 @@ def test_completion_chatgpt_prompt(): try: print("\n gpt3.5 test\n") response = text_completion( - model="gpt-3.5-turbo", prompt="What's the weather in SF?" + model="openai/gpt-3.5-turbo", prompt="What's the weather in SF?" + ) + print(response) + response_str = response["choices"][0]["text"] + print("\n", response.choices) + print("\n", response.choices[0]) + # print(response.choices[0].text) + except Exception as e: + pytest.fail(f"Error occurred: {e}") + + +# test_completion_chatgpt_prompt() + + +def test_completion_gpt_instruct(): + try: + response = text_completion( + model="gpt-3.5-turbo-instruct-0914", + prompt="What's the weather in SF?", + custom_llm_provider="openai", ) print(response) response_str = response["choices"][0]["text"] diff --git a/litellm/types/llms/custom_llm.py b/litellm/types/llms/custom_llm.py new file mode 100644 index 000000000..d5499a419 --- /dev/null +++ b/litellm/types/llms/custom_llm.py @@ -0,0 +1,10 @@ +from typing import List + +from typing_extensions import Dict, Required, TypedDict, override + +from litellm.llms.custom_llm import CustomLLM + + +class CustomLLMItem(TypedDict): + provider: str + custom_handler: CustomLLM diff --git a/litellm/utils.py b/litellm/utils.py index a6d3d8603..5e4dc4479 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -330,6 +330,18 @@ class Rules: ####### CLIENT ################### # make it easy to log if completion/embedding runs succeeded or failed + see what happened | Non-Blocking +def custom_llm_setup(): + """ + Add custom_llm provider to provider list + """ + for custom_llm in litellm.custom_provider_map: + if custom_llm["provider"] not in litellm.provider_list: + litellm.provider_list.append(custom_llm["provider"]) + + if custom_llm["provider"] not in litellm._custom_providers: + litellm._custom_providers.append(custom_llm["provider"]) + + def function_setup( original_function: str, rules_obj, start_time, *args, **kwargs ): # just run once to check if user wants to send their data anywhere - PostHog/Sentry/Slack/etc. 
@@ -341,6 +353,10 @@ def function_setup( try: global callback_list, add_breadcrumb, user_logger_fn, Logging + ## CUSTOM LLM SETUP ## + custom_llm_setup() + + ## LOGGING SETUP function_id = kwargs["id"] if "id" in kwargs else None if len(litellm.callbacks) > 0: @@ -2774,7 +2790,7 @@ def get_optional_params( tool_function["parameters"] = new_parameters def _check_valid_arg(supported_params): - verbose_logger.debug( + verbose_logger.info( f"\nLiteLLM completion() model= {model}; provider = {custom_llm_provider}" ) verbose_logger.debug( @@ -3121,7 +3137,19 @@ def get_optional_params( supported_params = get_supported_openai_params( model=model, custom_llm_provider=custom_llm_provider ) - if "ai21" in model: + if model in litellm.BEDROCK_CONVERSE_MODELS: + _check_valid_arg(supported_params=supported_params) + optional_params = litellm.AmazonConverseConfig().map_openai_params( + model=model, + non_default_params=non_default_params, + optional_params=optional_params, + drop_params=( + drop_params + if drop_params is not None and isinstance(drop_params, bool) + else False + ), + ) + elif "ai21" in model: _check_valid_arg(supported_params=supported_params) # params "maxTokens":200,"temperature":0,"topP":250,"stop_sequences":[], # https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=j2-ultra @@ -3143,17 +3171,6 @@ def get_optional_params( optional_params=optional_params, ) ) - elif model in litellm.BEDROCK_CONVERSE_MODELS: - optional_params = litellm.AmazonConverseConfig().map_openai_params( - model=model, - non_default_params=non_default_params, - optional_params=optional_params, - drop_params=( - drop_params - if drop_params is not None and isinstance(drop_params, bool) - else False - ), - ) else: optional_params = litellm.AmazonAnthropicConfig().map_openai_params( non_default_params=non_default_params, @@ -4486,7 +4503,11 @@ def get_llm_provider( or get_secret("TOGETHER_AI_TOKEN") ) elif custom_llm_provider == "friendliai": - api_base = "https://inference.friendli.ai/v1" + api_base = ( + api_base + or get_secret("FRIENDLI_API_BASE") + or "https://inference.friendli.ai/v1" + ) dynamic_api_key = ( api_key or get_secret("FRIENDLIAI_API_KEY") @@ -9242,7 +9263,10 @@ class CustomStreamWrapper: try: # return this for all models completion_obj = {"content": ""} - if self.custom_llm_provider and self.custom_llm_provider == "anthropic": + if self.custom_llm_provider and ( + self.custom_llm_provider == "anthropic" + or self.custom_llm_provider in litellm._custom_providers + ): from litellm.types.utils import GenericStreamingChunk as GChunk if self.received_finish_reason is not None: @@ -10109,6 +10133,7 @@ class CustomStreamWrapper: try: if self.completion_stream is None: await self.fetch_stream() + if ( self.custom_llm_provider == "openai" or self.custom_llm_provider == "azure" @@ -10133,6 +10158,7 @@ class CustomStreamWrapper: or self.custom_llm_provider == "triton" or self.custom_llm_provider == "watsonx" or self.custom_llm_provider in litellm.openai_compatible_endpoints + or self.custom_llm_provider in litellm._custom_providers ): async for chunk in self.completion_stream: print_verbose(f"value of async chunk: {chunk}") @@ -10961,3 +10987,8 @@ class ModelResponseIterator: raise StopAsyncIteration self.is_done = True return self.model_response + + +class CustomModelResponseIterator(Iterable): + def __init__(self) -> None: + super().__init__() diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json index 08bc292c9..d4985bffd 100644 --- 
a/model_prices_and_context_window.json +++ b/model_prices_and_context_window.json @@ -893,11 +893,11 @@ "mode": "chat" }, "mistral/mistral-large-latest": { - "max_tokens": 8191, - "max_input_tokens": 32000, - "max_output_tokens": 8191, - "input_cost_per_token": 0.000004, - "output_cost_per_token": 0.000012, + "max_tokens": 128000, + "max_input_tokens": 128000, + "max_output_tokens": 128000, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.000009, "litellm_provider": "mistral", "mode": "chat", "supports_function_calling": true @@ -912,6 +912,16 @@ "mode": "chat", "supports_function_calling": true }, + "mistral/mistral-large-2407": { + "max_tokens": 128000, + "max_input_tokens": 128000, + "max_output_tokens": 128000, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.000009, + "litellm_provider": "mistral", + "mode": "chat", + "supports_function_calling": true + }, "mistral/open-mistral-7b": { "max_tokens": 8191, "max_input_tokens": 32000, @@ -1094,6 +1104,36 @@ "mode": "chat", "supports_function_calling": true }, + "groq/llama-3.1-8b-instant": { + "max_tokens": 8192, + "max_input_tokens": 8192, + "max_output_tokens": 8192, + "input_cost_per_token": 0.00000059, + "output_cost_per_token": 0.00000079, + "litellm_provider": "groq", + "mode": "chat", + "supports_function_calling": true + }, + "groq/llama-3.1-70b-versatile": { + "max_tokens": 8192, + "max_input_tokens": 8192, + "max_output_tokens": 8192, + "input_cost_per_token": 0.00000059, + "output_cost_per_token": 0.00000079, + "litellm_provider": "groq", + "mode": "chat", + "supports_function_calling": true + }, + "groq/llama-3.1-405b-reasoning": { + "max_tokens": 8192, + "max_input_tokens": 8192, + "max_output_tokens": 8192, + "input_cost_per_token": 0.00000059, + "output_cost_per_token": 0.00000079, + "litellm_provider": "groq", + "mode": "chat", + "supports_function_calling": true + }, "groq/mixtral-8x7b-32768": { "max_tokens": 32768, "max_input_tokens": 32768, @@ -2956,6 +2996,15 @@ "litellm_provider": "bedrock", "mode": "chat" }, + "mistral.mistral-large-2407-v1:0": { + "max_tokens": 8191, + "max_input_tokens": 128000, + "max_output_tokens": 8191, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.000009, + "litellm_provider": "bedrock", + "mode": "chat" + }, "bedrock/us-west-2/mistral.mixtral-8x7b-instruct-v0:1": { "max_tokens": 8191, "max_input_tokens": 32000, @@ -3691,6 +3740,15 @@ "litellm_provider": "bedrock", "mode": "chat" }, + "meta.llama3-1-405b-instruct-v1:0": { + "max_tokens": 128000, + "max_input_tokens": 128000, + "max_output_tokens": 4096, + "input_cost_per_token": 0.00000532, + "output_cost_per_token": 0.000016, + "litellm_provider": "bedrock", + "mode": "chat" + }, "512-x-512/50-steps/stability.stable-diffusion-xl-v0": { "max_tokens": 77, "max_input_tokens": 77, diff --git a/pyproject.toml b/pyproject.toml index 10246abd7..08a41c9ec 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "litellm" -version = "1.42.0" +version = "1.42.1" description = "Library to easily interface with LLM API providers" authors = ["BerriAI"] license = "MIT" @@ -91,7 +91,7 @@ requires = ["poetry-core", "wheel"] build-backend = "poetry.core.masonry.api" [tool.commitizen] -version = "1.42.0" +version = "1.42.1" version_files = [ "pyproject.toml:^version" ]