forked from phoenix/litellm-mirror
Merge branch 'main' into litellm_redis_team_object
commit c2086300b7
34 changed files with 1182 additions and 232 deletions
|
@ -8,7 +8,7 @@
|
|||
<img src="https://railway.app/button.svg" alt="Deploy on Railway">
|
||||
</a>
|
||||
</p>
|
||||
<p align="center">Call all LLM APIs using the OpenAI format [Bedrock, Huggingface, VertexAI, TogetherAI, Azure, OpenAI, etc.]
|
||||
<p align="center">Call all LLM APIs using the OpenAI format [Bedrock, Huggingface, VertexAI, TogetherAI, Azure, OpenAI, Groq etc.]
|
||||
<br>
|
||||
</p>
|
||||
<h4 align="center"><a href="https://docs.litellm.ai/docs/simple_proxy" target="_blank">OpenAI Proxy Server</a> | <a href="https://docs.litellm.ai/docs/hosted" target="_blank"> Hosted Proxy (Preview)</a> | <a href="https://docs.litellm.ai/docs/enterprise" target="_blank">Enterprise Tier</a></h4>
|
||||
|
|
168 docs/my-website/docs/providers/custom_llm_server.md Normal file
|
@ -0,0 +1,168 @@
|
|||
# Custom API Server (Custom Format)
|
||||
|
||||
LiteLLM allows you to call your custom endpoint using the OpenAI ChatCompletion format
|
||||
|
||||
|
||||
:::info
|
||||
|
||||
For calling an openai-compatible endpoint, [go here](./openai_compatible.md)
|
||||
:::
|
||||
|
||||
## Quick Start
|
||||
|
||||
```python
|
||||
import litellm
|
||||
from litellm import CustomLLM, completion, get_llm_provider
|
||||
|
||||
|
||||
class MyCustomLLM(CustomLLM):
|
||||
def completion(self, *args, **kwargs) -> litellm.ModelResponse:
|
||||
return litellm.completion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=[{"role": "user", "content": "Hello world"}],
|
||||
mock_response="Hi!",
|
||||
) # type: ignore
|
||||
|
||||
my_custom_llm = MyCustomLLM()

litellm.custom_provider_map = [ # 👈 KEY STEP - REGISTER HANDLER
|
||||
{"provider": "my-custom-llm", "custom_handler": my_custom_llm}
|
||||
]
|
||||
|
||||
resp = completion(
|
||||
model="my-custom-llm/my-fake-model",
|
||||
messages=[{"role": "user", "content": "Hello world!"}],
|
||||
)
|
||||
|
||||
assert resp.choices[0].message.content == "Hi!"
|
||||
```
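If your handler also implements `acompletion` (like the proxy example below), the same registration works for async calls. A minimal sketch, reusing the handler registered above:

```python
import asyncio

import litellm


async def main():
    # assumes `my_custom_llm` was registered via litellm.custom_provider_map (see above)
    resp = await litellm.acompletion(
        model="my-custom-llm/my-fake-model",
        messages=[{"role": "user", "content": "Hello world!"}],
    )
    print(resp.choices[0].message.content)


asyncio.run(main())
```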
|
||||
|
||||
## OpenAI Proxy Usage
|
||||
|
||||
1. Setup your `custom_handler.py` file
|
||||
|
||||
```python
|
||||
import litellm
|
||||
from litellm import CustomLLM, completion, get_llm_provider
|
||||
|
||||
|
||||
class MyCustomLLM(CustomLLM):
|
||||
def completion(self, *args, **kwargs) -> litellm.ModelResponse:
|
||||
return litellm.completion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=[{"role": "user", "content": "Hello world"}],
|
||||
mock_response="Hi!",
|
||||
) # type: ignore
|
||||
|
||||
async def acompletion(self, *args, **kwargs) -> litellm.ModelResponse:
|
||||
return litellm.completion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=[{"role": "user", "content": "Hello world"}],
|
||||
mock_response="Hi!",
|
||||
) # type: ignore
|
||||
|
||||
|
||||
my_custom_llm = MyCustomLLM()
|
||||
```
|
||||
|
||||
2. Add to `config.yaml`
|
||||
|
||||
In the config below, we pass:

* python_filename: `custom_handler.py`
* custom_handler_instance_name: `my_custom_llm` (defined in Step 1)
* custom_handler: `custom_handler.my_custom_llm`
|
||||
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: "test-model"
|
||||
litellm_params:
|
||||
model: "openai/text-embedding-ada-002"
|
||||
- model_name: "my-custom-model"
|
||||
litellm_params:
|
||||
model: "my-custom-llm/my-model"
|
||||
|
||||
litellm_settings:
|
||||
custom_provider_map:
|
||||
- {"provider": "my-custom-llm", "custom_handler": custom_handler.my_custom_llm}
|
||||
```
|
||||
|
||||
```bash
|
||||
litellm --config /path/to/config.yaml
|
||||
```
|
||||
|
||||
3. Test it!
|
||||
|
||||
```bash
|
||||
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'Authorization: Bearer sk-1234' \
|
||||
-d '{
|
||||
"model": "my-custom-model",
|
||||
"messages": [{"role": "user", "content": "Say \"this is a test\" in JSON!"}],
|
||||
}'
|
||||
```
|
||||
|
||||
Expected Response
|
||||
|
||||
```
|
||||
{
|
||||
"id": "chatcmpl-06f1b9cd-08bc-43f7-9814-a69173921216",
|
||||
"choices": [
|
||||
{
|
||||
"finish_reason": "stop",
|
||||
"index": 0,
|
||||
"message": {
|
||||
"content": "Hi!",
|
||||
"role": "assistant",
|
||||
"tool_calls": null,
|
||||
"function_call": null
|
||||
}
|
||||
}
|
||||
],
|
||||
"created": 1721955063,
|
||||
"model": "gpt-3.5-turbo",
|
||||
"object": "chat.completion",
|
||||
"system_fingerprint": null,
|
||||
"usage": {
|
||||
"prompt_tokens": 10,
|
||||
"completion_tokens": 20,
|
||||
"total_tokens": 30
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Custom Handler Spec
|
||||
|
||||
```python
|
||||
from litellm.types.utils import GenericStreamingChunk, ModelResponse
|
||||
from typing import Iterator, AsyncIterator
|
||||
from litellm.llms.base import BaseLLM
|
||||
|
||||
class CustomLLMError(Exception): # use this for all your exceptions
|
||||
def __init__(
|
||||
self,
|
||||
status_code,
|
||||
message,
|
||||
):
|
||||
self.status_code = status_code
|
||||
self.message = message
|
||||
super().__init__(
|
||||
self.message
|
||||
) # Call the base class constructor with the parameters it needs
|
||||
|
||||
class CustomLLM(BaseLLM):
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
|
||||
def completion(self, *args, **kwargs) -> ModelResponse:
|
||||
raise CustomLLMError(status_code=500, message="Not implemented yet!")
|
||||
|
||||
def streaming(self, *args, **kwargs) -> Iterator[GenericStreamingChunk]:
|
||||
raise CustomLLMError(status_code=500, message="Not implemented yet!")
|
||||
|
||||
async def acompletion(self, *args, **kwargs) -> ModelResponse:
|
||||
raise CustomLLMError(status_code=500, message="Not implemented yet!")
|
||||
|
||||
async def astreaming(self, *args, **kwargs) -> AsyncIterator[GenericStreamingChunk]:
|
||||
raise CustomLLMError(status_code=500, message="Not implemented yet!")
|
||||
```
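For streaming, a handler can yield `GenericStreamingChunk` dicts from `streaming` / `astreaming`. A minimal sketch (the single hard-coded chunk below is a placeholder for chunks you'd build from your backend's stream):

```python
from typing import AsyncIterator, Iterator

from litellm import CustomLLM
from litellm.types.utils import GenericStreamingChunk


class MyStreamingLLM(CustomLLM):
    def streaming(self, *args, **kwargs) -> Iterator[GenericStreamingChunk]:
        # a real handler would yield one chunk per token/delta
        yield {
            "finish_reason": "stop",
            "index": 0,
            "is_finished": True,
            "text": "Hello world",
            "tool_use": None,
            "usage": {"completion_tokens": 10, "prompt_tokens": 20, "total_tokens": 30},
        }

    async def astreaming(self, *args, **kwargs) -> AsyncIterator[GenericStreamingChunk]:
        yield {
            "finish_reason": "stop",
            "index": 0,
            "is_finished": True,
            "text": "Hello world",
            "tool_use": None,
            "usage": {"completion_tokens": 10, "prompt_tokens": 20, "total_tokens": 30},
        }
```

With the handler registered in `litellm.custom_provider_map`, calling `completion(..., stream=True)` (or `acompletion(..., stream=True)`) returns these chunks wrapped in LiteLLM's standard streaming response.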
|
|
@ -1,129 +0,0 @@
|
|||
# Custom API Server (OpenAI Format)
|
||||
|
||||
LiteLLM allows you to call your custom endpoint in the OpenAI ChatCompletion format
|
||||
|
||||
## API KEYS
|
||||
No api keys required
|
||||
|
||||
## Set up your Custom API Server
|
||||
Your server should have the following Endpoints:
|
||||
|
||||
Here's an example OpenAI proxy server with routes: https://replit.com/@BerriAI/openai-proxy#main.py
|
||||
|
||||
### Required Endpoints
|
||||
- POST `/chat/completions` - chat completions endpoint
|
||||
|
||||
### Optional Endpoints
|
||||
- POST `/completions` - completions endpoint
|
||||
- Get `/models` - available models on server
|
||||
- POST `/embeddings` - creates an embedding vector representing the input text.
|
||||
|
||||
|
||||
## Example Usage
|
||||
|
||||
### Call `/chat/completions`
|
||||
In order to use your custom OpenAI Chat Completion proxy with LiteLLM, ensure you set
|
||||
|
||||
* `api_base` to your proxy url, example "https://openai-proxy.berriai.repl.co"
|
||||
* `custom_llm_provider` to `openai` this ensures litellm uses the `openai.ChatCompletion` to your api_base
|
||||
|
||||
```python
|
||||
import os
|
||||
from litellm import completion
|
||||
|
||||
## set ENV variables
|
||||
os.environ["OPENAI_API_KEY"] = "anything" #key is not used for proxy
|
||||
|
||||
messages = [{ "content": "Hello, how are you?","role": "user"}]
|
||||
|
||||
response = completion(
|
||||
model="command-nightly",
|
||||
messages=[{ "content": "Hello, how are you?","role": "user"}],
|
||||
api_base="https://openai-proxy.berriai.repl.co",
|
||||
custom_llm_provider="openai" # litellm will use the openai.ChatCompletion to make the request
|
||||
|
||||
)
|
||||
print(response)
|
||||
```
|
||||
|
||||
#### Response
|
||||
```json
|
||||
{
|
||||
"object":
|
||||
"chat.completion",
|
||||
"choices": [{
|
||||
"finish_reason": "stop",
|
||||
"index": 0,
|
||||
"message": {
|
||||
"content":
|
||||
"The sky, a canvas of blue,\nA work of art, pure and true,\nA",
|
||||
"role": "assistant"
|
||||
}
|
||||
}],
|
||||
"id":
|
||||
"chatcmpl-7fbd6077-de10-4cb4-a8a4-3ef11a98b7c8",
|
||||
"created":
|
||||
1699290237.408061,
|
||||
"model":
|
||||
"togethercomputer/llama-2-70b-chat",
|
||||
"usage": {
|
||||
"completion_tokens": 18,
|
||||
"prompt_tokens": 14,
|
||||
"total_tokens": 32
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
### Call `/completions`
|
||||
In order to use your custom OpenAI Completion proxy with LiteLLM, ensure you set
|
||||
|
||||
* `api_base` to your proxy url, example "https://openai-proxy.berriai.repl.co"
|
||||
* `custom_llm_provider` to `text-completion-openai` this ensures litellm uses the `openai.Completion` to your api_base
|
||||
|
||||
```python
|
||||
import os
|
||||
from litellm import completion
|
||||
|
||||
## set ENV variables
|
||||
os.environ["OPENAI_API_KEY"] = "anything" #key is not used for proxy
|
||||
|
||||
messages = [{ "content": "Hello, how are you?","role": "user"}]
|
||||
|
||||
response = completion(
|
||||
model="command-nightly",
|
||||
messages=[{ "content": "Hello, how are you?","role": "user"}],
|
||||
api_base="https://openai-proxy.berriai.repl.co",
|
||||
custom_llm_provider="text-completion-openai" # litellm will use the openai.Completion to make the request
|
||||
|
||||
)
|
||||
print(response)
|
||||
```
|
||||
|
||||
#### Response
|
||||
```json
|
||||
{
|
||||
"warning":
|
||||
"This model version is deprecated. Migrate before January 4, 2024 to avoid disruption of service. Learn more https://platform.openai.com/docs/deprecations",
|
||||
"id":
|
||||
"cmpl-8HxHqF5dymQdALmLplS0dWKZVFe3r",
|
||||
"object":
|
||||
"text_completion",
|
||||
"created":
|
||||
1699290166,
|
||||
"model":
|
||||
"text-davinci-003",
|
||||
"choices": [{
|
||||
"text":
|
||||
"\n\nThe weather in San Francisco varies depending on what time of year and time",
|
||||
"index": 0,
|
||||
"logprobs": None,
|
||||
"finish_reason": "length"
|
||||
}],
|
||||
"usage": {
|
||||
"prompt_tokens": 7,
|
||||
"completion_tokens": 16,
|
||||
"total_tokens": 23
|
||||
}
|
||||
}
|
||||
```
|
60 docs/my-website/docs/providers/friendliai.md Normal file
|
@ -0,0 +1,60 @@
|
|||
# FriendliAI
|
||||
https://suite.friendli.ai/
|
||||
|
||||
**We support ALL FriendliAI models, just set `friendliai/` as a prefix when sending completion requests**
|
||||
|
||||
## API Key
|
||||
```python
|
||||
# env variable
|
||||
os.environ['FRIENDLI_TOKEN']
|
||||
os.environ['FRIENDLI_API_BASE'] # Optional. Set this when using dedicated endpoint.
|
||||
```
|
||||
|
||||
## Sample Usage
|
||||
```python
|
||||
from litellm import completion
|
||||
import os
|
||||
|
||||
os.environ['FRIENDLI_TOKEN'] = ""
|
||||
response = completion(
|
||||
model="friendliai/mixtral-8x7b-instruct-v0-1",
|
||||
messages=[
|
||||
{"role": "user", "content": "hello from litellm"}
|
||||
],
|
||||
)
|
||||
print(response)
|
||||
```
|
||||
|
||||
## Sample Usage - Streaming
|
||||
```python
|
||||
from litellm import completion
|
||||
import os
|
||||
|
||||
os.environ['FRIENDLI_TOKEN'] = ""
|
||||
response = completion(
|
||||
model="friendliai/mixtral-8x7b-instruct-v0-1",
|
||||
messages=[
|
||||
{"role": "user", "content": "hello from litellm"}
|
||||
],
|
||||
stream=True
|
||||
)
|
||||
|
||||
for chunk in response:
|
||||
print(chunk)
|
||||
```
|
||||
|
||||
|
||||
## Supported Models
|
||||
### Serverless Endpoints
|
||||
We support ALL FriendliAI models, just set `friendliai/` as a prefix when sending completion requests
|
||||
|
||||
| Model Name | Function Call |
|
||||
|--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| mixtral-8x7b-instruct | `completion(model="friendliai/mixtral-8x7b-instruct-v0-1", messages)` |
|
||||
| meta-llama-3-8b-instruct | `completion(model="friendliai/meta-llama-3-8b-instruct", messages)` |
|
||||
| meta-llama-3-70b-instruct | `completion(model="friendliai/meta-llama-3-70b-instruct", messages)` |
|
||||
|
||||
### Dedicated Endpoints
|
||||
```
|
||||
model="friendliai/$ENDPOINT_ID:$ADAPTER_ROUTE"
|
||||
```
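A sketch of calling a dedicated endpoint (the endpoint ID and adapter route are placeholders for your own values, and `FRIENDLI_API_BASE` should point at your dedicated endpoint):

```python
import os

from litellm import completion

os.environ["FRIENDLI_TOKEN"] = ""      # your FriendliAI token
os.environ["FRIENDLI_API_BASE"] = ""   # base URL of your dedicated endpoint

response = completion(
    # replace $ENDPOINT_ID / $ADAPTER_ROUTE with your own values
    model="friendliai/$ENDPOINT_ID:$ADAPTER_ROUTE",
    messages=[{"role": "user", "content": "hello from litellm"}],
)
print(response)
```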
|
|
@ -148,8 +148,11 @@ print(response)
|
|||
## Supported Models - ALL Groq Models Supported!
|
||||
We support ALL Groq models, just set `groq/` as a prefix when sending completion requests
|
||||
|
||||
| Model Name | Function Call |
|
||||
| Model Name | Usage |
|
||||
|--------------------|---------------------------------------------------------|
|
||||
| llama-3.1-8b-instant | `completion(model="groq/llama-3.1-8b-instant", messages)` |
|
||||
| llama-3.1-70b-versatile | `completion(model="groq/llama-3.1-70b-versatile", messages)` |
|
||||
| llama-3.1-405b-reasoning | `completion(model="groq/llama-3.1-405b-reasoning", messages)` |
|
||||
| llama3-8b-8192 | `completion(model="groq/llama3-8b-8192", messages)` |
|
||||
| llama3-70b-8192 | `completion(model="groq/llama3-70b-8192", messages)` |
|
||||
| llama2-70b-4096 | `completion(model="groq/llama2-70b-4096", messages)` |
|
||||
|
|
|
@ -148,7 +148,8 @@ All models listed here https://docs.mistral.ai/platform/endpoints are supported.
|
|||
|----------------|--------------------------------------------------------------|
|
||||
| Mistral Small | `completion(model="mistral/mistral-small-latest", messages)` |
|
||||
| Mistral Medium | `completion(model="mistral/mistral-medium-latest", messages)`|
|
||||
| Mistral Large | `completion(model="mistral/mistral-large-latest", messages)` |
|
||||
| Mistral Large 2 | `completion(model="mistral/mistral-large-2407", messages)` |
|
||||
| Mistral Large Latest | `completion(model="mistral/mistral-large-latest", messages)` |
|
||||
| Mistral 7B | `completion(model="mistral/open-mistral-7b", messages)` |
|
||||
| Mixtral 8x7B | `completion(model="mistral/open-mixtral-8x7b", messages)` |
|
||||
| Mixtral 8x22B | `completion(model="mistral/open-mixtral-8x22b", messages)` |
|
||||
|
|
|
@ -59,6 +59,8 @@ litellm_settings:
|
|||
cache_params: # set cache params for redis
|
||||
type: redis
|
||||
ttl: 600 # will be cached on redis for 600s
|
||||
# default_in_memory_ttl: Optional[float], default is None. time in seconds.
|
||||
# default_in_redis_ttl: Optional[float], default is None. time in seconds.
|
||||
```
|
||||
|
||||
|
||||
|
@ -613,6 +615,11 @@ litellm_settings:
|
|||
|
||||
```yaml
|
||||
cache_params:
|
||||
# ttl
|
||||
ttl: Optional[float]
|
||||
default_in_memory_ttl: Optional[float]
|
||||
default_in_redis_ttl: Optional[float]
|
||||
|
||||
# Type of cache (options: "local", "redis", "s3")
|
||||
type: s3
|
||||
|
||||
|
@ -628,6 +635,8 @@ cache_params:
|
|||
host: localhost # Redis server hostname or IP address
|
||||
port: "6379" # Redis server port (as a string)
|
||||
password: secret_password # Redis server password
|
||||
namespace: Optional[str] = None,
|
||||
|
||||
|
||||
# S3 cache parameters
|
||||
s3_bucket_name: your_s3_bucket_name # Name of the S3 bucket
|
||||
|
|
|
@ -23,9 +23,9 @@ Features:
|
|||
- ✅ [Use LiteLLM keys/authentication on Pass Through Endpoints](pass_through#✨-enterprise---use-litellm-keysauthentication-on-pass-through-endpoints)
|
||||
- ✅ Set Max Request / File Size on Requests
|
||||
- ✅ [Enforce Required Params for LLM Requests (ex. Reject requests missing ["metadata"]["generation_name"])](#enforce-required-params-for-llm-requests)
|
||||
- **Spend Tracking**
|
||||
- **Enterprise Spend Tracking Features**
|
||||
- ✅ [Tracking Spend for Custom Tags](#tracking-spend-for-custom-tags)
|
||||
- ✅ [API Endpoints to get Spend Reports per Team, API Key, Customer](cost_tracking.md#✨-enterprise-api-endpoints-to-get-spend)
|
||||
- ✅ [`/spend/report` API endpoint](cost_tracking.md#✨-enterprise-api-endpoints-to-get-spend)
|
||||
- **Advanced Metrics**
|
||||
- ✅ [`x-ratelimit-remaining-requests`, `x-ratelimit-remaining-tokens` for LLM APIs on Prometheus](prometheus#✨-enterprise-llm-remaining-requests-and-remaining-tokens)
|
||||
- **Guardrails, PII Masking, Content Moderation**
|
||||
|
|
|
@ -41,28 +41,6 @@ litellm --health
|
|||
}
|
||||
```
|
||||
|
||||
### Background Health Checks
|
||||
|
||||
You can run model health checks in the background, to prevent each model from being queried too frequently via `/health`.
|
||||
|
||||
Here's how to use it:
|
||||
1. in the config.yaml add:
|
||||
```
|
||||
general_settings:
|
||||
background_health_checks: True # enable background health checks
|
||||
health_check_interval: 300 # frequency of background health checks
|
||||
```
|
||||
|
||||
2. Start server
|
||||
```
|
||||
$ litellm /path/to/config.yaml
|
||||
```
|
||||
|
||||
3. Query health endpoint:
|
||||
```
|
||||
curl --location 'http://0.0.0.0:4000/health'
|
||||
```
|
||||
|
||||
### Embedding Models
|
||||
|
||||
We need some way to know that a model is an embedding model when running checks. If you specify `mode` for it in your config, `/health` runs an embedding health check for that model.
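A sketch of such an entry (the model and key below are placeholders; it follows the same `model_info.mode` convention as the audio models shown below):

```yaml
model_list:
  - model_name: text-embedding-ada-002
    litellm_params:
      model: openai/text-embedding-ada-002
      api_key: "os.environ/OPENAI_API_KEY"
    model_info:
      mode: embedding # run an embedding health check for this model
```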
|
||||
|
@ -124,6 +102,41 @@ model_list:
|
|||
mode: audio_transcription
|
||||
```
|
||||
|
||||
|
||||
### Text to Speech Models
|
||||
|
||||
```yaml
|
||||
# OpenAI Text to Speech Models
|
||||
- model_name: tts
|
||||
litellm_params:
|
||||
model: openai/tts-1
|
||||
api_key: "os.environ/OPENAI_API_KEY"
|
||||
model_info:
|
||||
mode: audio_speech
|
||||
```
|
||||
|
||||
## Background Health Checks
|
||||
|
||||
You can run model health checks in the background, to prevent each model from being queried too frequently via `/health`.
|
||||
|
||||
Here's how to use it:
|
||||
1. in the config.yaml add:
|
||||
```
|
||||
general_settings:
|
||||
background_health_checks: True # enable background health checks
|
||||
health_check_interval: 300 # frequency of background health checks
|
||||
```
|
||||
|
||||
2. Start server
|
||||
```
|
||||
$ litellm /path/to/config.yaml
|
||||
```
|
||||
|
||||
3. Query health endpoint:
|
||||
```
|
||||
curl --location 'http://0.0.0.0:4000/health'
|
||||
```
|
||||
|
||||
### Hide details
|
||||
|
||||
The health check response contains details like endpoint URLs, error messages,
|
||||
|
|
|
@ -31,8 +31,19 @@ model_list:
|
|||
api_base: https://openai-france-1234.openai.azure.com/
|
||||
api_key: <your-azure-api-key>
|
||||
rpm: 1440
|
||||
routing_strategy: simple-shuffle # Literal["simple-shuffle", "least-busy", "usage-based-routing","latency-based-routing"], default="simple-shuffle"
|
||||
model_group_alias: {"gpt-4": "gpt-3.5-turbo"} # all requests with `gpt-4` will be routed to models with `gpt-3.5-turbo`
|
||||
num_retries: 2
|
||||
timeout: 30 # 30 seconds
|
||||
redis_host: <your redis host> # set this when using multiple litellm proxy deployments, load balancing state stored in redis
|
||||
redis_password: <your redis password>
|
||||
redis_port: 1992
|
||||
```
|
||||
|
||||
:::info
|
||||
Detailed information about [routing strategies can be found here](../routing)
|
||||
:::
|
||||
|
||||
#### Step 2: Start Proxy with config
|
||||
|
||||
```shell
|
||||
|
|
|
@ -158,6 +158,7 @@ const sidebars = {
|
|||
"providers/triton-inference-server",
|
||||
"providers/ollama",
|
||||
"providers/perplexity",
|
||||
"providers/friendliai",
|
||||
"providers/groq",
|
||||
"providers/deepseek",
|
||||
"providers/fireworks_ai",
|
||||
|
@ -174,7 +175,8 @@ const sidebars = {
|
|||
"providers/aleph_alpha",
|
||||
"providers/baseten",
|
||||
"providers/openrouter",
|
||||
"providers/custom_openai_proxy",
|
||||
// "providers/custom_openai_proxy",
|
||||
"providers/custom_llm_server",
|
||||
"providers/petals",
|
||||
|
||||
],
|
||||
|
|
|
@ -813,6 +813,7 @@ from .utils import (
|
|||
)
|
||||
|
||||
from .types.utils import ImageObject
|
||||
from .llms.custom_llm import CustomLLM
|
||||
from .llms.huggingface_restapi import HuggingfaceConfig
|
||||
from .llms.anthropic import AnthropicConfig
|
||||
from .llms.databricks import DatabricksConfig, DatabricksEmbeddingConfig
|
||||
|
@ -909,3 +910,12 @@ from .cost_calculator import response_cost_calculator, cost_per_token
|
|||
from .types.adapter import AdapterItem
|
||||
|
||||
adapters: List[AdapterItem] = []
|
||||
|
||||
### CUSTOM LLMs ###
|
||||
from .types.llms.custom_llm import CustomLLMItem
|
||||
from .types.utils import GenericStreamingChunk
|
||||
|
||||
custom_provider_map: List[CustomLLMItem] = []
|
||||
_custom_providers: List[str] = (
|
||||
[]
|
||||
) # internal helper util, used to track names of custom providers
|
||||
|
|
|
@ -1864,6 +1864,23 @@ class AzureChatCompletion(BaseLLM):
|
|||
model=model, # type: ignore
|
||||
prompt=prompt, # type: ignore
|
||||
)
|
||||
elif mode == "audio_transcription":
|
||||
# Get the current directory of the file being run
|
||||
pwd = os.path.dirname(os.path.realpath(__file__))
|
||||
file_path = os.path.join(pwd, "../tests/gettysburg.wav")
|
||||
audio_file = open(file_path, "rb")
|
||||
completion = await client.audio.transcriptions.with_raw_response.create(
|
||||
file=audio_file,
|
||||
model=model, # type: ignore
|
||||
prompt=prompt, # type: ignore
|
||||
)
|
||||
elif mode == "audio_speech":
|
||||
# Get the current directory of the file being run
|
||||
completion = await client.audio.speech.with_raw_response.create(
|
||||
model=model, # type: ignore
|
||||
input=prompt, # type: ignore
|
||||
voice="alloy",
|
||||
)
|
||||
else:
|
||||
raise Exception("mode not set")
|
||||
response = {}
|
||||
|
|
|
@ -78,6 +78,8 @@ BEDROCK_CONVERSE_MODELS = [
|
|||
"ai21.jamba-instruct-v1:0",
|
||||
"meta.llama3-1-8b-instruct-v1:0",
|
||||
"meta.llama3-1-70b-instruct-v1:0",
|
||||
"meta.llama3-1-405b-instruct-v1:0",
|
||||
"mistral.mistral-large-2407-v1:0",
|
||||
]
|
||||
|
||||
|
||||
|
@ -1315,6 +1317,7 @@ class AmazonConverseConfig:
|
|||
model.startswith("anthropic")
|
||||
or model.startswith("mistral")
|
||||
or model.startswith("cohere")
|
||||
or model.startswith("meta.llama3-1")
|
||||
):
|
||||
supported_params.append("tools")
|
||||
|
||||
|
|
161 litellm/llms/custom_llm.py Normal file
|
@ -0,0 +1,161 @@
|
|||
# What is this?
|
||||
## Handler file for a Custom Chat LLM
|
||||
|
||||
"""
|
||||
- completion
|
||||
- acompletion
|
||||
- streaming
|
||||
- async_streaming
|
||||
"""
|
||||
|
||||
import copy
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
import types
|
||||
from enum import Enum
|
||||
from functools import partial
|
||||
from typing import (
|
||||
Any,
|
||||
AsyncGenerator,
|
||||
AsyncIterator,
|
||||
Callable,
|
||||
Coroutine,
|
||||
Iterator,
|
||||
List,
|
||||
Literal,
|
||||
Optional,
|
||||
Tuple,
|
||||
Union,
|
||||
)
|
||||
|
||||
import httpx # type: ignore
|
||||
import requests # type: ignore
|
||||
|
||||
import litellm
|
||||
from litellm.litellm_core_utils.core_helpers import map_finish_reason
|
||||
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
|
||||
from litellm.types.utils import GenericStreamingChunk, ProviderField
|
||||
from litellm.utils import CustomStreamWrapper, EmbeddingResponse, ModelResponse, Usage
|
||||
|
||||
from .base import BaseLLM
|
||||
from .prompt_templates.factory import custom_prompt, prompt_factory
|
||||
|
||||
|
||||
class CustomLLMError(Exception): # use this for all your exceptions
|
||||
def __init__(
|
||||
self,
|
||||
status_code,
|
||||
message,
|
||||
):
|
||||
self.status_code = status_code
|
||||
self.message = message
|
||||
super().__init__(
|
||||
self.message
|
||||
) # Call the base class constructor with the parameters it needs
|
||||
|
||||
|
||||
class CustomLLM(BaseLLM):
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
|
||||
def completion(
|
||||
self,
|
||||
model: str,
|
||||
messages: list,
|
||||
api_base: str,
|
||||
custom_prompt_dict: dict,
|
||||
model_response: ModelResponse,
|
||||
print_verbose: Callable,
|
||||
encoding,
|
||||
api_key,
|
||||
logging_obj,
|
||||
optional_params: dict,
|
||||
acompletion=None,
|
||||
litellm_params=None,
|
||||
logger_fn=None,
|
||||
headers={},
|
||||
timeout: Optional[Union[float, httpx.Timeout]] = None,
|
||||
client: Optional[HTTPHandler] = None,
|
||||
) -> ModelResponse:
|
||||
raise CustomLLMError(status_code=500, message="Not implemented yet!")
|
||||
|
||||
def streaming(
|
||||
self,
|
||||
model: str,
|
||||
messages: list,
|
||||
api_base: str,
|
||||
custom_prompt_dict: dict,
|
||||
model_response: ModelResponse,
|
||||
print_verbose: Callable,
|
||||
encoding,
|
||||
api_key,
|
||||
logging_obj,
|
||||
optional_params: dict,
|
||||
acompletion=None,
|
||||
litellm_params=None,
|
||||
logger_fn=None,
|
||||
headers={},
|
||||
timeout: Optional[Union[float, httpx.Timeout]] = None,
|
||||
client: Optional[HTTPHandler] = None,
|
||||
) -> Iterator[GenericStreamingChunk]:
|
||||
raise CustomLLMError(status_code=500, message="Not implemented yet!")
|
||||
|
||||
async def acompletion(
|
||||
self,
|
||||
model: str,
|
||||
messages: list,
|
||||
api_base: str,
|
||||
custom_prompt_dict: dict,
|
||||
model_response: ModelResponse,
|
||||
print_verbose: Callable,
|
||||
encoding,
|
||||
api_key,
|
||||
logging_obj,
|
||||
optional_params: dict,
|
||||
acompletion=None,
|
||||
litellm_params=None,
|
||||
logger_fn=None,
|
||||
headers={},
|
||||
timeout: Optional[Union[float, httpx.Timeout]] = None,
|
||||
client: Optional[AsyncHTTPHandler] = None,
|
||||
) -> ModelResponse:
|
||||
raise CustomLLMError(status_code=500, message="Not implemented yet!")
|
||||
|
||||
async def astreaming(
|
||||
self,
|
||||
model: str,
|
||||
messages: list,
|
||||
api_base: str,
|
||||
custom_prompt_dict: dict,
|
||||
model_response: ModelResponse,
|
||||
print_verbose: Callable,
|
||||
encoding,
|
||||
api_key,
|
||||
logging_obj,
|
||||
optional_params: dict,
|
||||
acompletion=None,
|
||||
litellm_params=None,
|
||||
logger_fn=None,
|
||||
headers={},
|
||||
timeout: Optional[Union[float, httpx.Timeout]] = None,
|
||||
client: Optional[AsyncHTTPHandler] = None,
|
||||
) -> AsyncIterator[GenericStreamingChunk]:
|
||||
raise CustomLLMError(status_code=500, message="Not implemented yet!")
|
||||
|
||||
|
||||
def custom_chat_llm_router(
|
||||
async_fn: bool, stream: Optional[bool], custom_llm: CustomLLM
|
||||
):
|
||||
"""
|
||||
Routes call to CustomLLM completion/acompletion/streaming/astreaming functions, based on call type
|
||||
|
||||
Validates if response is in expected format
|
||||
"""
|
||||
if async_fn:
|
||||
if stream:
|
||||
return custom_llm.astreaming
|
||||
return custom_llm.acompletion
|
||||
if stream:
|
||||
return custom_llm.streaming
|
||||
return custom_llm.completion
|
|
@ -1,5 +1,6 @@
|
|||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
import traceback
|
||||
import types
|
||||
|
@ -1870,8 +1871,25 @@ class OpenAIChatCompletion(BaseLLM):
|
|||
model=model, # type: ignore
|
||||
prompt=prompt, # type: ignore
|
||||
)
|
||||
elif mode == "audio_transcription":
|
||||
# Get the current directory of the file being run
|
||||
pwd = os.path.dirname(os.path.realpath(__file__))
|
||||
file_path = os.path.join(pwd, "../tests/gettysburg.wav")
|
||||
audio_file = open(file_path, "rb")
|
||||
completion = await client.audio.transcriptions.with_raw_response.create(
|
||||
file=audio_file,
|
||||
model=model, # type: ignore
|
||||
prompt=prompt, # type: ignore
|
||||
)
|
||||
elif mode == "audio_speech":
|
||||
# Get the current directory of the file being run
|
||||
completion = await client.audio.speech.with_raw_response.create(
|
||||
model=model, # type: ignore
|
||||
input=prompt, # type: ignore
|
||||
voice="alloy",
|
||||
)
|
||||
else:
|
||||
raise Exception("mode not set")
|
||||
raise ValueError("mode not set, passed in mode: " + mode)
|
||||
response = {}
|
||||
|
||||
if completion is None or not hasattr(completion, "headers"):
|
||||
|
|
|
@ -387,7 +387,7 @@ def process_response(
|
|||
result = " "
|
||||
|
||||
## Building RESPONSE OBJECT
|
||||
if len(result) > 1:
|
||||
if len(result) >= 1:
|
||||
model_response.choices[0].message.content = result # type: ignore
|
||||
|
||||
# Calculate usage
|
||||
|
|
|
@ -107,6 +107,7 @@ from .llms.anthropic_text import AnthropicTextCompletion
|
|||
from .llms.azure import AzureChatCompletion
|
||||
from .llms.azure_text import AzureTextCompletion
|
||||
from .llms.bedrock_httpx import BedrockConverseLLM, BedrockLLM
|
||||
from .llms.custom_llm import CustomLLM, custom_chat_llm_router
|
||||
from .llms.databricks import DatabricksChatCompletion
|
||||
from .llms.huggingface_restapi import Huggingface
|
||||
from .llms.openai import OpenAIChatCompletion, OpenAITextCompletion
|
||||
|
@ -381,6 +382,7 @@ async def acompletion(
|
|||
or custom_llm_provider == "clarifai"
|
||||
or custom_llm_provider == "watsonx"
|
||||
or custom_llm_provider in litellm.openai_compatible_providers
|
||||
or custom_llm_provider in litellm._custom_providers
|
||||
): # currently implemented aiohttp calls for just azure, openai, hf, ollama, vertex ai soon all.
|
||||
init_response = await loop.run_in_executor(None, func_with_context)
|
||||
if isinstance(init_response, dict) or isinstance(
|
||||
|
@ -2690,6 +2692,54 @@ def completion(
|
|||
model_response.created = int(time.time())
|
||||
model_response.model = model
|
||||
response = model_response
|
||||
elif (
|
||||
custom_llm_provider in litellm._custom_providers
|
||||
): # Assume custom LLM provider
|
||||
# Get the Custom Handler
|
||||
custom_handler: Optional[CustomLLM] = None
|
||||
for item in litellm.custom_provider_map:
|
||||
if item["provider"] == custom_llm_provider:
|
||||
custom_handler = item["custom_handler"]
|
||||
|
||||
if custom_handler is None:
|
||||
raise ValueError(
|
||||
f"Unable to map your input to a model. Check your input - {args}"
|
||||
)
|
||||
|
||||
## ROUTE LLM CALL ##
|
||||
handler_fn = custom_chat_llm_router(
|
||||
async_fn=acompletion, stream=stream, custom_llm=custom_handler
|
||||
)
|
||||
|
||||
headers = headers or litellm.headers
|
||||
|
||||
## CALL FUNCTION
|
||||
response = handler_fn(
|
||||
model=model,
|
||||
messages=messages,
|
||||
headers=headers,
|
||||
model_response=model_response,
|
||||
print_verbose=print_verbose,
|
||||
api_key=api_key,
|
||||
api_base=api_base,
|
||||
acompletion=acompletion,
|
||||
logging_obj=logging,
|
||||
optional_params=optional_params,
|
||||
litellm_params=litellm_params,
|
||||
logger_fn=logger_fn,
|
||||
timeout=timeout, # type: ignore
|
||||
custom_prompt_dict=custom_prompt_dict,
|
||||
client=client, # pass AsyncOpenAI, OpenAI client
|
||||
encoding=encoding,
|
||||
)
|
||||
if stream is True:
|
||||
return CustomStreamWrapper(
|
||||
completion_stream=response,
|
||||
model=model,
|
||||
custom_llm_provider=custom_llm_provider,
|
||||
logging_obj=logging,
|
||||
)
|
||||
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Unable to map your input to a model. Check your input - {args}"
|
||||
|
@ -3833,7 +3883,7 @@ def text_completion(
|
|||
optional_params["custom_llm_provider"] = custom_llm_provider
|
||||
|
||||
# get custom_llm_provider
|
||||
_, custom_llm_provider, dynamic_api_key, api_base = get_llm_provider(model=model, custom_llm_provider=custom_llm_provider, api_base=api_base) # type: ignore
|
||||
_model, custom_llm_provider, dynamic_api_key, api_base = get_llm_provider(model=model, custom_llm_provider=custom_llm_provider, api_base=api_base) # type: ignore
|
||||
|
||||
if custom_llm_provider == "huggingface":
|
||||
# if echo == True, for TGI llms we need to set top_n_tokens to 3
|
||||
|
@ -3916,10 +3966,12 @@ def text_completion(
|
|||
|
||||
kwargs.pop("prompt", None)
|
||||
|
||||
if model is not None and model.startswith(
|
||||
"openai/"
|
||||
if (
|
||||
_model is not None and custom_llm_provider == "openai"
|
||||
): # for openai compatible endpoints - e.g. vllm, call the native /v1/completions endpoint for text completion calls
|
||||
model = model.replace("openai/", "text-completion-openai/")
|
||||
if _model not in litellm.open_ai_chat_completion_models:
|
||||
model = "text-completion-openai/" + _model
|
||||
optional_params.pop("custom_llm_provider", None)
|
||||
|
||||
kwargs["text_completion"] = True
|
||||
response = completion(
|
||||
|
|
|
@ -893,11 +893,11 @@
|
|||
"mode": "chat"
|
||||
},
|
||||
"mistral/mistral-large-latest": {
|
||||
"max_tokens": 8191,
|
||||
"max_input_tokens": 32000,
|
||||
"max_output_tokens": 8191,
|
||||
"input_cost_per_token": 0.000004,
|
||||
"output_cost_per_token": 0.000012,
|
||||
"max_tokens": 128000,
|
||||
"max_input_tokens": 128000,
|
||||
"max_output_tokens": 128000,
|
||||
"input_cost_per_token": 0.000003,
|
||||
"output_cost_per_token": 0.000009,
|
||||
"litellm_provider": "mistral",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true
|
||||
|
@ -912,6 +912,16 @@
|
|||
"mode": "chat",
|
||||
"supports_function_calling": true
|
||||
},
|
||||
"mistral/mistral-large-2407": {
|
||||
"max_tokens": 128000,
|
||||
"max_input_tokens": 128000,
|
||||
"max_output_tokens": 128000,
|
||||
"input_cost_per_token": 0.000003,
|
||||
"output_cost_per_token": 0.000009,
|
||||
"litellm_provider": "mistral",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true
|
||||
},
|
||||
"mistral/open-mistral-7b": {
|
||||
"max_tokens": 8191,
|
||||
"max_input_tokens": 32000,
|
||||
|
@ -1094,6 +1104,36 @@
|
|||
"mode": "chat",
|
||||
"supports_function_calling": true
|
||||
},
|
||||
"groq/llama-3.1-8b-instant": {
|
||||
"max_tokens": 8192,
|
||||
"max_input_tokens": 8192,
|
||||
"max_output_tokens": 8192,
|
||||
"input_cost_per_token": 0.00000059,
|
||||
"output_cost_per_token": 0.00000079,
|
||||
"litellm_provider": "groq",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true
|
||||
},
|
||||
"groq/llama-3.1-70b-versatile": {
|
||||
"max_tokens": 8192,
|
||||
"max_input_tokens": 8192,
|
||||
"max_output_tokens": 8192,
|
||||
"input_cost_per_token": 0.00000059,
|
||||
"output_cost_per_token": 0.00000079,
|
||||
"litellm_provider": "groq",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true
|
||||
},
|
||||
"groq/llama-3.1-405b-reasoning": {
|
||||
"max_tokens": 8192,
|
||||
"max_input_tokens": 8192,
|
||||
"max_output_tokens": 8192,
|
||||
"input_cost_per_token": 0.00000059,
|
||||
"output_cost_per_token": 0.00000079,
|
||||
"litellm_provider": "groq",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true
|
||||
},
|
||||
"groq/mixtral-8x7b-32768": {
|
||||
"max_tokens": 32768,
|
||||
"max_input_tokens": 32768,
|
||||
|
@ -2956,6 +2996,15 @@
|
|||
"litellm_provider": "bedrock",
|
||||
"mode": "chat"
|
||||
},
|
||||
"mistral.mistral-large-2407-v1:0": {
|
||||
"max_tokens": 8191,
|
||||
"max_input_tokens": 128000,
|
||||
"max_output_tokens": 8191,
|
||||
"input_cost_per_token": 0.000003,
|
||||
"output_cost_per_token": 0.000009,
|
||||
"litellm_provider": "bedrock",
|
||||
"mode": "chat"
|
||||
},
|
||||
"bedrock/us-west-2/mistral.mixtral-8x7b-instruct-v0:1": {
|
||||
"max_tokens": 8191,
|
||||
"max_input_tokens": 32000,
|
||||
|
@ -3691,6 +3740,15 @@
|
|||
"litellm_provider": "bedrock",
|
||||
"mode": "chat"
|
||||
},
|
||||
"meta.llama3-1-405b-instruct-v1:0": {
|
||||
"max_tokens": 128000,
|
||||
"max_input_tokens": 128000,
|
||||
"max_output_tokens": 4096,
|
||||
"input_cost_per_token": 0.00000532,
|
||||
"output_cost_per_token": 0.000016,
|
||||
"litellm_provider": "bedrock",
|
||||
"mode": "chat"
|
||||
},
|
||||
"512-x-512/50-steps/stability.stable-diffusion-xl-v0": {
|
||||
"max_tokens": 77,
|
||||
"max_input_tokens": 77,
|
||||
|
|
|
@ -1,9 +1,11 @@
|
|||
model_list:
|
||||
- model_name: "*" # all requests where model not in your config go to this deployment
|
||||
- model_name: "test-model"
|
||||
litellm_params:
|
||||
model: "openai/*" # passes our validation check that a real provider is given
|
||||
api_key: ""
|
||||
model: "openai/text-embedding-ada-002"
|
||||
- model_name: "my-custom-model"
|
||||
litellm_params:
|
||||
model: "my-custom-llm/my-model"
|
||||
|
||||
litellm_settings:
|
||||
cache: True
|
||||
|
||||
custom_provider_map:
|
||||
- {"provider": "my-custom-llm", "custom_handler": custom_handler.my_custom_llm}
|
||||
|
|
21 litellm/proxy/custom_handler.py Normal file
|
@ -0,0 +1,21 @@
|
|||
import litellm
|
||||
from litellm import CustomLLM, completion, get_llm_provider
|
||||
|
||||
|
||||
class MyCustomLLM(CustomLLM):
|
||||
def completion(self, *args, **kwargs) -> litellm.ModelResponse:
|
||||
return litellm.completion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=[{"role": "user", "content": "Hello world"}],
|
||||
mock_response="Hi!",
|
||||
) # type: ignore
|
||||
|
||||
async def acompletion(self, *args, **kwargs) -> litellm.ModelResponse:
|
||||
return litellm.completion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=[{"role": "user", "content": "Hello world"}],
|
||||
mock_response="Hi!",
|
||||
) # type: ignore
|
||||
|
||||
|
||||
my_custom_llm = MyCustomLLM()
|
|
@ -27,6 +27,7 @@ from litellm._logging import verbose_proxy_logger
|
|||
from litellm.proxy._types import *
|
||||
from litellm.proxy.auth.user_api_key_auth import user_api_key_auth
|
||||
from litellm.proxy.management_endpoints.key_management_endpoints import (
|
||||
_duration_in_seconds,
|
||||
generate_key_helper_fn,
|
||||
)
|
||||
from litellm.proxy.management_helpers.utils import (
|
||||
|
@ -486,6 +487,13 @@ async def user_update(
|
|||
): # models default to [], spend defaults to 0, we should not reset these values
|
||||
non_default_values[k] = v
|
||||
|
||||
if "budget_duration" in non_default_values:
|
||||
duration_s = _duration_in_seconds(
|
||||
duration=non_default_values["budget_duration"]
|
||||
)
|
||||
user_reset_at = datetime.now(timezone.utc) + timedelta(seconds=duration_s)
|
||||
non_default_values["budget_reset_at"] = user_reset_at
|
||||
|
||||
## ADD USER, IF NEW ##
|
||||
verbose_proxy_logger.debug("/user/update: Received data = %s", data)
|
||||
if data.user_id is not None and len(data.user_id) > 0:
|
||||
|
|
|
@ -8,6 +8,12 @@ model_list:
|
|||
litellm_params:
|
||||
model: fireworks_ai/accounts/fireworks/models/llama-v3-70b-instruct
|
||||
api_key: "os.environ/FIREWORKS"
|
||||
- model_name: tts
|
||||
litellm_params:
|
||||
model: openai/tts-1
|
||||
api_key: "os.environ/OPENAI_API_KEY"
|
||||
model_info:
|
||||
mode: audio_speech
|
||||
general_settings:
|
||||
master_key: sk-1234
|
||||
alerting: ["slack"]
|
||||
|
|
|
@ -1507,6 +1507,21 @@ class ProxyConfig:
|
|||
verbose_proxy_logger.debug(
|
||||
f"litellm.post_call_rules: {litellm.post_call_rules}"
|
||||
)
|
||||
elif key == "custom_provider_map":
|
||||
from litellm.utils import custom_llm_setup
|
||||
|
||||
litellm.custom_provider_map = [
|
||||
{
|
||||
"provider": item["provider"],
|
||||
"custom_handler": get_instance_fn(
|
||||
value=item["custom_handler"],
|
||||
config_file_path=config_file_path,
|
||||
),
|
||||
}
|
||||
for item in value
|
||||
]
|
||||
|
||||
custom_llm_setup()
|
||||
elif key == "success_callback":
|
||||
litellm.success_callback = []
|
||||
|
||||
|
@ -3334,6 +3349,7 @@ async def embeddings(
|
|||
if (
|
||||
"input" in data
|
||||
and isinstance(data["input"], list)
|
||||
and len(data["input"]) > 0
|
||||
and isinstance(data["input"][0], list)
|
||||
and isinstance(data["input"][0][0], int)
|
||||
): # check if array of tokens passed in
|
||||
|
@ -3464,8 +3480,8 @@ async def embeddings(
|
|||
litellm_debug_info,
|
||||
)
|
||||
verbose_proxy_logger.error(
|
||||
"litellm.proxy.proxy_server.embeddings(): Exception occured - {}".format(
|
||||
str(e)
|
||||
"litellm.proxy.proxy_server.embeddings(): Exception occured - {}\n{}".format(
|
||||
str(e), traceback.format_exc()
|
||||
)
|
||||
)
|
||||
verbose_proxy_logger.debug(traceback.format_exc())
|
||||
|
|
|
@ -263,7 +263,9 @@ class Router:
|
|||
) # names of models under litellm_params. ex. azure/chatgpt-v-2
|
||||
self.deployment_latency_map = {}
|
||||
### CACHING ###
|
||||
cache_type: Literal["local", "redis"] = "local" # default to an in-memory cache
|
||||
cache_type: Literal["local", "redis", "redis-semantic", "s3", "disk"] = (
|
||||
"local" # default to an in-memory cache
|
||||
)
|
||||
redis_cache = None
|
||||
cache_config = {}
|
||||
self.client_ttl = client_ttl
|
||||
|
|
|
@ -2573,21 +2573,17 @@ def test_completion_azure_extra_headers():
|
|||
http_client = Client()
|
||||
|
||||
with patch.object(http_client, "send", new=MagicMock()) as mock_client:
|
||||
client = AzureOpenAI(
|
||||
azure_endpoint=os.getenv("AZURE_API_BASE"),
|
||||
api_version=litellm.AZURE_DEFAULT_API_VERSION,
|
||||
api_key=os.getenv("AZURE_API_KEY"),
|
||||
http_client=http_client,
|
||||
)
|
||||
litellm.client_session = http_client
|
||||
try:
|
||||
response = completion(
|
||||
model="azure/chatgpt-v-2",
|
||||
messages=messages,
|
||||
client=client,
|
||||
api_base=os.getenv("AZURE_API_BASE"),
|
||||
api_version="2023-07-01-preview",
|
||||
api_key=os.getenv("AZURE_API_KEY"),
|
||||
extra_headers={
|
||||
"Authorization": "my-bad-key",
|
||||
"Ocp-Apim-Subscription-Key": "hello-world-testing",
|
||||
"api-key": "my-bad-key",
|
||||
},
|
||||
)
|
||||
print(response)
|
||||
|
@ -2603,8 +2599,10 @@ def test_completion_azure_extra_headers():
|
|||
print(request.url) # This will print the full URL
|
||||
print(request.headers) # This will print the full URL
|
||||
auth_header = request.headers.get("Authorization")
|
||||
apim_key = request.headers.get("Ocp-Apim-Subscription-Key")
|
||||
print(auth_header)
|
||||
assert auth_header == "my-bad-key"
|
||||
assert apim_key == "hello-world-testing"
|
||||
|
||||
|
||||
def test_completion_azure_ad_token():
|
||||
|
@ -2613,18 +2611,37 @@ def test_completion_azure_ad_token():
|
|||
# If you want to remove it, speak to Ishaan!
|
||||
# Ishaan will be very disappointed if this test is removed -> this is a standard way to pass api_key + the router + proxy use this
|
||||
from httpx import Client
|
||||
from openai import AzureOpenAI
|
||||
|
||||
from litellm import completion
|
||||
from litellm.llms.custom_httpx.httpx_handler import HTTPHandler
|
||||
|
||||
litellm.set_verbose = True
|
||||
|
||||
old_key = os.environ["AZURE_API_KEY"]
|
||||
os.environ.pop("AZURE_API_KEY", None)
|
||||
|
||||
http_client = Client()
|
||||
|
||||
with patch.object(http_client, "send", new=MagicMock()) as mock_client:
|
||||
litellm.client_session = http_client
|
||||
try:
|
||||
response = completion(
|
||||
model="azure/chatgpt-v-2",
|
||||
messages=messages,
|
||||
# api_key="my-fake-ad-token",
|
||||
azure_ad_token=os.getenv("AZURE_API_KEY"),
|
||||
azure_ad_token="my-special-token",
|
||||
)
|
||||
print(response)
|
||||
except Exception as e:
|
||||
pass
|
||||
finally:
|
||||
os.environ["AZURE_API_KEY"] = old_key
|
||||
|
||||
mock_client.assert_called_once()
|
||||
request = mock_client.call_args[0][0]
|
||||
print(request.method) # This will print 'POST'
|
||||
print(request.url) # This will print the full URL
|
||||
print(request.headers) # This will print the full URL
|
||||
auth_header = request.headers.get("Authorization")
|
||||
assert auth_header == "Bearer my-special-token"
|
||||
|
||||
|
||||
def test_completion_azure_key_completion_arg():
|
||||
|
|
302 litellm/tests/test_custom_llm.py Normal file
|
@ -0,0 +1,302 @@
|
|||
# What is this?
|
||||
## Unit tests for the CustomLLM class
|
||||
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import traceback
|
||||
|
||||
import openai
|
||||
import pytest
|
||||
|
||||
sys.path.insert(
|
||||
0, os.path.abspath("../..")
|
||||
) # Adds the parent directory to the system path
|
||||
import os
|
||||
from collections import defaultdict
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from typing import (
|
||||
Any,
|
||||
AsyncGenerator,
|
||||
AsyncIterator,
|
||||
Callable,
|
||||
Coroutine,
|
||||
Iterator,
|
||||
Optional,
|
||||
Union,
|
||||
)
|
||||
from unittest.mock import AsyncMock, MagicMock, patch
|
||||
|
||||
import httpx
|
||||
from dotenv import load_dotenv
|
||||
|
||||
import litellm
|
||||
from litellm import (
|
||||
ChatCompletionDeltaChunk,
|
||||
ChatCompletionUsageBlock,
|
||||
CustomLLM,
|
||||
GenericStreamingChunk,
|
||||
ModelResponse,
|
||||
acompletion,
|
||||
completion,
|
||||
get_llm_provider,
|
||||
)
|
||||
from litellm.utils import ModelResponseIterator
|
||||
|
||||
|
||||
class CustomModelResponseIterator:
|
||||
def __init__(self, streaming_response: Union[Iterator, AsyncIterator]):
|
||||
self.streaming_response = streaming_response
|
||||
|
||||
def chunk_parser(self, chunk: Any) -> GenericStreamingChunk:
|
||||
return GenericStreamingChunk(
|
||||
text="hello world",
|
||||
tool_use=None,
|
||||
is_finished=True,
|
||||
finish_reason="stop",
|
||||
usage=ChatCompletionUsageBlock(
|
||||
prompt_tokens=10, completion_tokens=20, total_tokens=30
|
||||
),
|
||||
index=0,
|
||||
)
|
||||
|
||||
# Sync iterator
|
||||
def __iter__(self):
|
||||
return self
|
||||
|
||||
def __next__(self) -> GenericStreamingChunk:
|
||||
try:
|
||||
chunk: Any = self.streaming_response.__next__() # type: ignore
|
||||
except StopIteration:
|
||||
raise StopIteration
|
||||
except ValueError as e:
|
||||
raise RuntimeError(f"Error receiving chunk from stream: {e}")
|
||||
|
||||
try:
|
||||
return self.chunk_parser(chunk=chunk)
|
||||
except StopIteration:
|
||||
raise StopIteration
|
||||
except ValueError as e:
|
||||
raise RuntimeError(f"Error parsing chunk: {e},\nReceived chunk: {chunk}")
|
||||
|
||||
# Async iterator
|
||||
def __aiter__(self):
|
||||
self.async_response_iterator = self.streaming_response.__aiter__() # type: ignore
|
||||
return self.streaming_response
|
||||
|
||||
async def __anext__(self) -> GenericStreamingChunk:
|
||||
try:
|
||||
chunk = await self.async_response_iterator.__anext__()
|
||||
except StopAsyncIteration:
|
||||
raise StopAsyncIteration
|
||||
except ValueError as e:
|
||||
raise RuntimeError(f"Error receiving chunk from stream: {e}")
|
||||
|
||||
try:
|
||||
return self.chunk_parser(chunk=chunk)
|
||||
except StopIteration:
|
||||
raise StopIteration
|
||||
except ValueError as e:
|
||||
raise RuntimeError(f"Error parsing chunk: {e},\nReceived chunk: {chunk}")
|
||||
|
||||
|
||||
class MyCustomLLM(CustomLLM):
|
||||
def completion(
|
||||
self,
|
||||
model: str,
|
||||
messages: list,
|
||||
api_base: str,
|
||||
custom_prompt_dict: dict,
|
||||
model_response: ModelResponse,
|
||||
print_verbose: Callable[..., Any],
|
||||
encoding,
|
||||
api_key,
|
||||
logging_obj,
|
||||
optional_params: dict,
|
||||
acompletion=None,
|
||||
litellm_params=None,
|
||||
logger_fn=None,
|
||||
headers={},
|
||||
timeout: Optional[Union[float, openai.Timeout]] = None,
|
||||
client: Optional[litellm.HTTPHandler] = None,
|
||||
) -> ModelResponse:
|
||||
return litellm.completion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=[{"role": "user", "content": "Hello world"}],
|
||||
mock_response="Hi!",
|
||||
) # type: ignore
|
||||
|
||||
async def acompletion(
|
||||
self,
|
||||
model: str,
|
||||
messages: list,
|
||||
api_base: str,
|
||||
custom_prompt_dict: dict,
|
||||
model_response: ModelResponse,
|
||||
print_verbose: Callable[..., Any],
|
||||
encoding,
|
||||
api_key,
|
||||
logging_obj,
|
||||
optional_params: dict,
|
||||
acompletion=None,
|
||||
litellm_params=None,
|
||||
logger_fn=None,
|
||||
headers={},
|
||||
timeout: Optional[Union[float, openai.Timeout]] = None,
|
||||
client: Optional[litellm.AsyncHTTPHandler] = None,
|
||||
) -> litellm.ModelResponse:
|
||||
return litellm.completion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=[{"role": "user", "content": "Hello world"}],
|
||||
mock_response="Hi!",
|
||||
) # type: ignore
|
||||
|
||||
def streaming(
|
||||
self,
|
||||
model: str,
|
||||
messages: list,
|
||||
api_base: str,
|
||||
custom_prompt_dict: dict,
|
||||
model_response: ModelResponse,
|
||||
print_verbose: Callable[..., Any],
|
||||
encoding,
|
||||
api_key,
|
||||
logging_obj,
|
||||
optional_params: dict,
|
||||
acompletion=None,
|
||||
litellm_params=None,
|
||||
logger_fn=None,
|
||||
headers={},
|
||||
timeout: Optional[Union[float, openai.Timeout]] = None,
|
||||
client: Optional[litellm.HTTPHandler] = None,
|
||||
) -> Iterator[GenericStreamingChunk]:
|
||||
generic_streaming_chunk: GenericStreamingChunk = {
|
||||
"finish_reason": "stop",
|
||||
"index": 0,
|
||||
"is_finished": True,
|
||||
"text": "Hello world",
|
||||
"tool_use": None,
|
||||
"usage": {"completion_tokens": 10, "prompt_tokens": 20, "total_tokens": 30},
|
||||
}
|
||||
|
||||
completion_stream = ModelResponseIterator(
|
||||
model_response=generic_streaming_chunk # type: ignore
|
||||
)
|
||||
custom_iterator = CustomModelResponseIterator(
|
||||
streaming_response=completion_stream
|
||||
)
|
||||
return custom_iterator
|
||||
|
||||
async def astreaming( # type: ignore
|
||||
self,
|
||||
model: str,
|
||||
messages: list,
|
||||
api_base: str,
|
||||
custom_prompt_dict: dict,
|
||||
model_response: ModelResponse,
|
||||
print_verbose: Callable[..., Any],
|
||||
encoding,
|
||||
api_key,
|
||||
logging_obj,
|
||||
optional_params: dict,
|
||||
acompletion=None,
|
||||
litellm_params=None,
|
||||
logger_fn=None,
|
||||
headers={},
|
||||
timeout: Optional[Union[float, openai.Timeout]] = None,
|
||||
client: Optional[litellm.AsyncHTTPHandler] = None,
|
||||
) -> AsyncIterator[GenericStreamingChunk]: # type: ignore
|
||||
generic_streaming_chunk: GenericStreamingChunk = {
|
||||
"finish_reason": "stop",
|
||||
"index": 0,
|
||||
"is_finished": True,
|
||||
"text": "Hello world",
|
||||
"tool_use": None,
|
||||
"usage": {"completion_tokens": 10, "prompt_tokens": 20, "total_tokens": 30},
|
||||
}
|
||||
|
||||
yield generic_streaming_chunk # type: ignore
|
||||
|
||||
|
||||
def test_get_llm_provider():
|
||||
""""""
|
||||
from litellm.utils import custom_llm_setup
|
||||
|
||||
my_custom_llm = MyCustomLLM()
|
||||
litellm.custom_provider_map = [
|
||||
{"provider": "custom_llm", "custom_handler": my_custom_llm}
|
||||
]
|
||||
|
||||
custom_llm_setup()
|
||||
|
||||
model, provider, _, _ = get_llm_provider(model="custom_llm/my-fake-model")
|
||||
|
||||
assert provider == "custom_llm"
|
||||
|
||||
|
||||
def test_simple_completion():
|
||||
my_custom_llm = MyCustomLLM()
|
||||
litellm.custom_provider_map = [
|
||||
{"provider": "custom_llm", "custom_handler": my_custom_llm}
|
||||
]
|
||||
resp = completion(
|
||||
model="custom_llm/my-fake-model",
|
||||
messages=[{"role": "user", "content": "Hello world!"}],
|
||||
)
|
||||
|
||||
assert resp.choices[0].message.content == "Hi!"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_simple_acompletion():
|
||||
my_custom_llm = MyCustomLLM()
|
||||
litellm.custom_provider_map = [
|
||||
{"provider": "custom_llm", "custom_handler": my_custom_llm}
|
||||
]
|
||||
resp = await acompletion(
|
||||
model="custom_llm/my-fake-model",
|
||||
messages=[{"role": "user", "content": "Hello world!"}],
|
||||
)
|
||||
|
||||
assert resp.choices[0].message.content == "Hi!"
|
||||
|
||||
|
||||
def test_simple_completion_streaming():
|
||||
my_custom_llm = MyCustomLLM()
|
||||
litellm.custom_provider_map = [
|
||||
{"provider": "custom_llm", "custom_handler": my_custom_llm}
|
||||
]
|
||||
resp = completion(
|
||||
model="custom_llm/my-fake-model",
|
||||
messages=[{"role": "user", "content": "Hello world!"}],
|
||||
stream=True,
|
||||
)
|
||||
|
||||
for chunk in resp:
|
||||
print(chunk)
|
||||
if chunk.choices[0].finish_reason is None:
|
||||
assert isinstance(chunk.choices[0].delta.content, str)
|
||||
else:
|
||||
assert chunk.choices[0].finish_reason == "stop"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_simple_completion_async_streaming():
|
||||
my_custom_llm = MyCustomLLM()
|
||||
litellm.custom_provider_map = [
|
||||
{"provider": "custom_llm", "custom_handler": my_custom_llm}
|
||||
]
|
||||
resp = await litellm.acompletion(
|
||||
model="custom_llm/my-fake-model",
|
||||
messages=[{"role": "user", "content": "Hello world!"}],
|
||||
stream=True,
|
||||
)
|
||||
|
||||
async for chunk in resp:
|
||||
print(chunk)
|
||||
if chunk.choices[0].finish_reason is None:
|
||||
assert isinstance(chunk.choices[0].delta.content, str)
|
||||
else:
|
||||
assert chunk.choices[0].finish_reason == "stop"
|
|
@ -206,6 +206,9 @@ def test_openai_azure_embedding_with_oidc_and_cf():
|
|||
os.environ["AZURE_TENANT_ID"] = "17c0a27a-1246-4aa1-a3b6-d294e80e783c"
|
||||
os.environ["AZURE_CLIENT_ID"] = "4faf5422-b2bd-45e8-a6d7-46543a38acd0"
|
||||
|
||||
old_key = os.environ["AZURE_API_KEY"]
|
||||
os.environ.pop("AZURE_API_KEY", None)
|
||||
|
||||
try:
|
||||
response = embedding(
|
||||
model="azure/text-embedding-ada-002",
|
||||
|
@ -218,6 +221,8 @@ def test_openai_azure_embedding_with_oidc_and_cf():
|
|||
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
finally:
|
||||
os.environ["AZURE_API_KEY"] = old_key
|
||||
|
||||
|
||||
def test_openai_azure_embedding_optional_arg(mocker):
|
||||
|
@ -673,17 +678,3 @@ async def test_databricks_embeddings(sync_mode):
|
|||
# print(response)
|
||||
|
||||
# local_proxy_embeddings()
|
||||
|
||||
|
||||
def test_embedding_azure_ad_token():
|
||||
# this tests if we can pass api_key to completion, when it's not in the env.
|
||||
# DO NOT REMOVE THIS TEST. No MATTER WHAT Happens!
|
||||
# If you want to remove it, speak to Ishaan!
|
||||
# Ishaan will be very disappointed if this test is removed -> this is a standard way to pass api_key + the router + proxy use this
|
||||
|
||||
response = embedding(
|
||||
model="azure/azure-embedding-model",
|
||||
input=["good morning from litellm"],
|
||||
azure_ad_token=os.getenv("AZURE_API_KEY"),
|
||||
)
|
||||
print(response)
|
||||
|
|
|
@ -1,14 +1,18 @@
|
|||
import sys, os
|
||||
import os
|
||||
import sys
|
||||
import traceback
|
||||
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
import os, io
|
||||
import io
|
||||
import os
|
||||
|
||||
sys.path.insert(
|
||||
0, os.path.abspath("../..")
|
||||
) # Adds the parent directory to the system path
|
||||
import pytest
|
||||
|
||||
import litellm
|
||||
|
||||
|
||||
|
@ -21,6 +25,12 @@ def test_get_llm_provider():
|
|||
# test_get_llm_provider()
|
||||
|
||||
|
||||
def test_get_llm_provider_gpt_instruct():
|
||||
_, response, _, _ = litellm.get_llm_provider(model="gpt-3.5-turbo-instruct-0914")
|
||||
|
||||
assert response == "text-completion-openai"
|
||||
|
||||
|
||||
def test_get_llm_provider_mistral_custom_api_base():
|
||||
model, custom_llm_provider, dynamic_api_key, api_base = litellm.get_llm_provider(
|
||||
model="mistral/mistral-large-fr",
|
||||
|
|
|
@ -3840,7 +3840,26 @@ def test_completion_chatgpt_prompt():
|
|||
try:
|
||||
print("\n gpt3.5 test\n")
|
||||
response = text_completion(
|
||||
model="gpt-3.5-turbo", prompt="What's the weather in SF?"
|
||||
model="openai/gpt-3.5-turbo", prompt="What's the weather in SF?"
|
||||
)
|
||||
print(response)
|
||||
response_str = response["choices"][0]["text"]
|
||||
print("\n", response.choices)
|
||||
print("\n", response.choices[0])
|
||||
# print(response.choices[0].text)
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
||||
# test_completion_chatgpt_prompt()
|
||||
|
||||
|
||||
def test_completion_gpt_instruct():
|
||||
try:
|
||||
response = text_completion(
|
||||
model="gpt-3.5-turbo-instruct-0914",
|
||||
prompt="What's the weather in SF?",
|
||||
custom_llm_provider="openai",
|
||||
)
|
||||
print(response)
|
||||
response_str = response["choices"][0]["text"]
|
||||
|
|
10 litellm/types/llms/custom_llm.py Normal file
|
@ -0,0 +1,10 @@
|
|||
from typing import List
|
||||
|
||||
from typing_extensions import Dict, Required, TypedDict, override
|
||||
|
||||
from litellm.llms.custom_llm import CustomLLM
|
||||
|
||||
|
||||
class CustomLLMItem(TypedDict):
|
||||
provider: str
|
||||
custom_handler: CustomLLM
|
|
@@ -330,6 +330,18 @@ class Rules:

####### CLIENT ###################
# make it easy to log if completion/embedding runs succeeded or failed + see what happened | Non-Blocking
def custom_llm_setup():
    """
    Add custom_llm provider to provider list
    """
    for custom_llm in litellm.custom_provider_map:
        if custom_llm["provider"] not in litellm.provider_list:
            litellm.provider_list.append(custom_llm["provider"])

        if custom_llm["provider"] not in litellm._custom_providers:
            litellm._custom_providers.append(custom_llm["provider"])


def function_setup(
    original_function: str, rules_obj, start_time, *args, **kwargs
):  # just run once to check if user wants to send their data anywhere - PostHog/Sentry/Slack/etc.

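`custom_llm_setup()` simply mirrors `litellm.custom_provider_map` into `litellm.provider_list` and `litellm._custom_providers`, so later provider checks recognize the registered name. A rough sketch of the observable effect; the import path is assumed from this diff and the provider name is a placeholder:

```python
import litellm
from litellm.llms.custom_llm import CustomLLM
from litellm.utils import custom_llm_setup  # module path assumed from this diff

litellm.custom_provider_map = [
    {"provider": "placeholder-provider", "custom_handler": CustomLLM()}
]
custom_llm_setup()

# Both registries now know about the custom provider.
assert "placeholder-provider" in litellm.provider_list
assert "placeholder-provider" in litellm._custom_providers
```
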
@@ -341,6 +353,10 @@ def function_setup(
    try:
        global callback_list, add_breadcrumb, user_logger_fn, Logging

        ## CUSTOM LLM SETUP ##
        custom_llm_setup()

        ## LOGGING SETUP
        function_id = kwargs["id"] if "id" in kwargs else None

        if len(litellm.callbacks) > 0:

@@ -2774,7 +2790,7 @@ def get_optional_params(
                tool_function["parameters"] = new_parameters

    def _check_valid_arg(supported_params):
        verbose_logger.debug(
        verbose_logger.info(
            f"\nLiteLLM completion() model= {model}; provider = {custom_llm_provider}"
        )
        verbose_logger.debug(

@@ -3121,7 +3137,19 @@ def get_optional_params(
        supported_params = get_supported_openai_params(
            model=model, custom_llm_provider=custom_llm_provider
        )
        if "ai21" in model:
        if model in litellm.BEDROCK_CONVERSE_MODELS:
            _check_valid_arg(supported_params=supported_params)
            optional_params = litellm.AmazonConverseConfig().map_openai_params(
                model=model,
                non_default_params=non_default_params,
                optional_params=optional_params,
                drop_params=(
                    drop_params
                    if drop_params is not None and isinstance(drop_params, bool)
                    else False
                ),
            )
        elif "ai21" in model:
            _check_valid_arg(supported_params=supported_params)
            # params "maxTokens":200,"temperature":0,"topP":250,"stop_sequences":[],
            # https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=j2-ultra

@@ -3143,17 +3171,6 @@ def get_optional_params(
                optional_params=optional_params,
            )
        )
        elif model in litellm.BEDROCK_CONVERSE_MODELS:
            optional_params = litellm.AmazonConverseConfig().map_openai_params(
                model=model,
                non_default_params=non_default_params,
                optional_params=optional_params,
                drop_params=(
                    drop_params
                    if drop_params is not None and isinstance(drop_params, bool)
                    else False
                ),
            )
        else:
            optional_params = litellm.AmazonAnthropicConfig().map_openai_params(
                non_default_params=non_default_params,

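These two hunks move the `BEDROCK_CONVERSE_MODELS` branch ahead of the `"ai21"` substring check, so Converse-capable models are always mapped by `AmazonConverseConfig`. A hedged sketch of the call path this affects; the model id is only an example of a Converse model, and AWS credentials are assumed to be configured:

```python
import litellm

# Unsupported OpenAI-style params are dropped rather than raising when
# drop_params=True, via AmazonConverseConfig.map_openai_params above.
response = litellm.completion(
    model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0",  # example Converse model
    messages=[{"role": "user", "content": "Hello"}],
    temperature=0.2,
    drop_params=True,
)
print(response.choices[0].message.content)
```
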
@@ -4486,7 +4503,11 @@ def get_llm_provider(
                or get_secret("TOGETHER_AI_TOKEN")
            )
        elif custom_llm_provider == "friendliai":
            api_base = "https://inference.friendli.ai/v1"
            api_base = (
                api_base
                or get_secret("FRIENDLI_API_BASE")
                or "https://inference.friendli.ai/v1"
            )
            dynamic_api_key = (
                api_key
                or get_secret("FRIENDLIAI_API_KEY")

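With this hunk, the FriendliAI base URL resolves from the call-site `api_base`, then the `FRIENDLI_API_BASE` secret, and only then the public endpoint. A minimal sketch; the endpoint URL, key, and model id are placeholders:

```python
import os

import litellm

os.environ["FRIENDLI_API_BASE"] = "https://my-dedicated-endpoint.example.com/v1"  # placeholder
os.environ["FRIENDLIAI_API_KEY"] = "flp_..."  # placeholder

response = litellm.completion(
    model="friendliai/meta-llama-3-8b-instruct",  # example model id
    messages=[{"role": "user", "content": "Hello"}],
)
```
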
@@ -9242,7 +9263,10 @@ class CustomStreamWrapper:
        try:
            # return this for all models
            completion_obj = {"content": ""}
            if self.custom_llm_provider and self.custom_llm_provider == "anthropic":
            if self.custom_llm_provider and (
                self.custom_llm_provider == "anthropic"
                or self.custom_llm_provider in litellm._custom_providers
            ):
                from litellm.types.utils import GenericStreamingChunk as GChunk

                if self.received_finish_reason is not None:

@@ -10109,6 +10133,7 @@ class CustomStreamWrapper:
        try:
            if self.completion_stream is None:
                await self.fetch_stream()

            if (
                self.custom_llm_provider == "openai"
                or self.custom_llm_provider == "azure"

@@ -10133,6 +10158,7 @@ class CustomStreamWrapper:
                or self.custom_llm_provider == "triton"
                or self.custom_llm_provider == "watsonx"
                or self.custom_llm_provider in litellm.openai_compatible_endpoints
                or self.custom_llm_provider in litellm._custom_providers
            ):
                async for chunk in self.completion_stream:
                    print_verbose(f"value of async chunk: {chunk}")

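Taken together, these streaming hunks let providers registered in `litellm._custom_providers` flow through the same chunk-parsing and async-iteration paths as built-in providers. A hedged consumer-side sketch, assuming a handler is already registered under the hypothetical name `my-custom-provider` in `litellm.custom_provider_map`:

```python
import litellm

# Stream from a registered custom provider exactly like a built-in one.
for chunk in litellm.completion(
    model="my-custom-provider/my-model",  # placeholder provider/model
    messages=[{"role": "user", "content": "stream please"}],
    stream=True,
):
    print(chunk.choices[0].delta.content or "", end="")
```
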
@@ -10961,3 +10987,8 @@ class ModelResponseIterator:
            raise StopAsyncIteration
        self.is_done = True
        return self.model_response


class CustomModelResponseIterator(Iterable):
    def __init__(self) -> None:
        super().__init__()

@@ -893,11 +893,11 @@
        "mode": "chat"
    },
    "mistral/mistral-large-latest": {
        "max_tokens": 8191,
        "max_input_tokens": 32000,
        "max_output_tokens": 8191,
        "input_cost_per_token": 0.000004,
        "output_cost_per_token": 0.000012,
        "max_tokens": 128000,
        "max_input_tokens": 128000,
        "max_output_tokens": 128000,
        "input_cost_per_token": 0.000003,
        "output_cost_per_token": 0.000009,
        "litellm_provider": "mistral",
        "mode": "chat",
        "supports_function_calling": true

@@ -912,6 +912,16 @@
        "mode": "chat",
        "supports_function_calling": true
    },
    "mistral/mistral-large-2407": {
        "max_tokens": 128000,
        "max_input_tokens": 128000,
        "max_output_tokens": 128000,
        "input_cost_per_token": 0.000003,
        "output_cost_per_token": 0.000009,
        "litellm_provider": "mistral",
        "mode": "chat",
        "supports_function_calling": true
    },
    "mistral/open-mistral-7b": {
        "max_tokens": 8191,
        "max_input_tokens": 32000,

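These price-map entries feed LiteLLM's cost tracking. A quick sketch of reading the new `mistral/mistral-large-2407` pricing back out; the token counts are arbitrary:

```python
import litellm

# 1,000 input tokens and 500 output tokens at the per-token prices above.
prompt_cost, completion_cost = litellm.cost_per_token(
    model="mistral/mistral-large-2407",
    prompt_tokens=1000,
    completion_tokens=500,
)
print(prompt_cost, completion_cost)  # roughly 0.003 and 0.0045 USD
```
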
@@ -1094,6 +1104,36 @@
        "mode": "chat",
        "supports_function_calling": true
    },
    "groq/llama-3.1-8b-instant": {
        "max_tokens": 8192,
        "max_input_tokens": 8192,
        "max_output_tokens": 8192,
        "input_cost_per_token": 0.00000059,
        "output_cost_per_token": 0.00000079,
        "litellm_provider": "groq",
        "mode": "chat",
        "supports_function_calling": true
    },
    "groq/llama-3.1-70b-versatile": {
        "max_tokens": 8192,
        "max_input_tokens": 8192,
        "max_output_tokens": 8192,
        "input_cost_per_token": 0.00000059,
        "output_cost_per_token": 0.00000079,
        "litellm_provider": "groq",
        "mode": "chat",
        "supports_function_calling": true
    },
    "groq/llama-3.1-405b-reasoning": {
        "max_tokens": 8192,
        "max_input_tokens": 8192,
        "max_output_tokens": 8192,
        "input_cost_per_token": 0.00000059,
        "output_cost_per_token": 0.00000079,
        "litellm_provider": "groq",
        "mode": "chat",
        "supports_function_calling": true
    },
    "groq/mixtral-8x7b-32768": {
        "max_tokens": 32768,
        "max_input_tokens": 32768,

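With these entries in place, the Groq Llama 3.1 models get context-window and pricing metadata like any other mapped model. A minimal usage sketch; it assumes a real `GROQ_API_KEY`:

```python
import os

import litellm

os.environ["GROQ_API_KEY"] = "gsk_..."  # placeholder

response = litellm.completion(
    model="groq/llama-3.1-8b-instant",
    messages=[{"role": "user", "content": "Say hi in one word."}],
)
print(response.choices[0].message.content)
```
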
@@ -2956,6 +2996,15 @@
        "litellm_provider": "bedrock",
        "mode": "chat"
    },
    "mistral.mistral-large-2407-v1:0": {
        "max_tokens": 8191,
        "max_input_tokens": 128000,
        "max_output_tokens": 8191,
        "input_cost_per_token": 0.000003,
        "output_cost_per_token": 0.000009,
        "litellm_provider": "bedrock",
        "mode": "chat"
    },
    "bedrock/us-west-2/mistral.mixtral-8x7b-instruct-v0:1": {
        "max_tokens": 8191,
        "max_input_tokens": 32000,

@@ -3691,6 +3740,15 @@
        "litellm_provider": "bedrock",
        "mode": "chat"
    },
    "meta.llama3-1-405b-instruct-v1:0": {
        "max_tokens": 128000,
        "max_input_tokens": 128000,
        "max_output_tokens": 4096,
        "input_cost_per_token": 0.00000532,
        "output_cost_per_token": 0.000016,
        "litellm_provider": "bedrock",
        "mode": "chat"
    },
    "512-x-512/50-steps/stability.stable-diffusion-xl-v0": {
        "max_tokens": 77,
        "max_input_tokens": 77,

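A brief sketch of calling the newly mapped Bedrock Llama 3.1 405B model; region and credential setup are assumptions, while the model id matches the entry above:

```python
import litellm

# Assumes AWS credentials are configured (env vars or profile) and the model
# is enabled in the chosen region.
response = litellm.completion(
    model="bedrock/meta.llama3-1-405b-instruct-v1:0",
    messages=[{"role": "user", "content": "Hello"}],
    aws_region_name="us-west-2",  # example region
)
print(response.choices[0].message.content)
```
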
@@ -1,6 +1,6 @@
[tool.poetry]
name = "litellm"
version = "1.42.0"
version = "1.42.1"
description = "Library to easily interface with LLM API providers"
authors = ["BerriAI"]
license = "MIT"

@@ -91,7 +91,7 @@ requires = ["poetry-core", "wheel"]
build-backend = "poetry.core.masonry.api"

[tool.commitizen]
version = "1.42.0"
version = "1.42.1"
version_files = [
    "pyproject.toml:^version"
]