From cece76c4eefd4fc445ff80d214a805b6e7b7931a Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 18 Jul 2024 18:24:06 -0700 Subject: [PATCH 01/27] feat(bedrock_httpx.py): add ai21 jamba instruct as converse model initial commit for adding ai21 jamba instruct support through bedrock converse --- litellm/llms/bedrock_httpx.py | 2 +- litellm/model_prices_and_context_window_backup.json | 10 ++++++++++ model_prices_and_context_window.json | 10 ++++++++++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/litellm/llms/bedrock_httpx.py b/litellm/llms/bedrock_httpx.py index b41dd542b..1461cfd90 100644 --- a/litellm/llms/bedrock_httpx.py +++ b/litellm/llms/bedrock_httpx.py @@ -74,7 +74,7 @@ BEDROCK_CONVERSE_MODELS = [ "anthropic.claude-v2", "anthropic.claude-v2:1", "anthropic.claude-v1", - "anthropic.claude-instant-v1", + "ai21.jamba-instruct-v1:0", ] diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json index 5b11b8360..98bb161e1 100644 --- a/litellm/model_prices_and_context_window_backup.json +++ b/litellm/model_prices_and_context_window_backup.json @@ -2812,6 +2812,16 @@ "litellm_provider": "bedrock", "mode": "chat" }, + "ai21.jamba-instruct-v1:0": { + "max_tokens": 4096, + "max_input_tokens": 70000, + "max_output_tokens": 4096, + "input_cost_per_token": 0.0000005, + "output_cost_per_token": 0.0000007, + "litellm_provider": "bedrock", + "mode": "chat", + "supports_system_messages": true + }, "amazon.titan-text-lite-v1": { "max_tokens": 4000, "max_input_tokens": 42000, diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json index 5b11b8360..98bb161e1 100644 --- a/model_prices_and_context_window.json +++ b/model_prices_and_context_window.json @@ -2812,6 +2812,16 @@ "litellm_provider": "bedrock", "mode": "chat" }, + "ai21.jamba-instruct-v1:0": { + "max_tokens": 4096, + "max_input_tokens": 70000, + "max_output_tokens": 4096, + "input_cost_per_token": 0.0000005, + "output_cost_per_token": 0.0000007, + "litellm_provider": "bedrock", + "mode": "chat", + "supports_system_messages": true + }, "amazon.titan-text-lite-v1": { "max_tokens": 4000, "max_input_tokens": 42000, From 96471c145e8596b3ef4af2b7bc0e3c20ab915c89 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 18 Jul 2024 19:36:50 -0700 Subject: [PATCH 02/27] fix(bedrock_httpx.py): support jamba streaming --- litellm/llms/bedrock_httpx.py | 128 ++++++++++++++++++++++++++++---- litellm/tests/test_streaming.py | 24 +++--- 2 files changed, 125 insertions(+), 27 deletions(-) diff --git a/litellm/llms/bedrock_httpx.py b/litellm/llms/bedrock_httpx.py index 1461cfd90..c3a563ce4 100644 --- a/litellm/llms/bedrock_httpx.py +++ b/litellm/llms/bedrock_httpx.py @@ -74,6 +74,7 @@ BEDROCK_CONVERSE_MODELS = [ "anthropic.claude-v2", "anthropic.claude-v2:1", "anthropic.claude-v1", + "anthropic.claude-instant-v1", "ai21.jamba-instruct-v1:0", ] @@ -195,13 +196,39 @@ async def make_call( if client is None: client = _get_async_httpx_client() # Create a new client if none provided - response = await client.post(api_base, headers=headers, data=data, stream=True) + response = await client.post( + api_base, + headers=headers, + data=data, + stream=True if "ai21" not in api_base else False, + ) if response.status_code != 200: raise BedrockError(status_code=response.status_code, message=response.text) - decoder = AWSEventStreamDecoder(model=model) - completion_stream = decoder.aiter_bytes(response.aiter_bytes(chunk_size=1024)) + if "ai21" in api_base: + 
aws_bedrock_process_response = BedrockConverseLLM() + model_response: ( + ModelResponse + ) = aws_bedrock_process_response.process_response( + model=model, + response=response, + model_response=litellm.ModelResponse(), + stream=True, + logging_obj=logging_obj, + optional_params={}, + api_key="", + data=data, + messages=messages, + print_verbose=litellm.print_verbose, + encoding=litellm.encoding, + ) # type: ignore + completion_stream: Any = MockResponseIterator(model_response=model_response) + else: + decoder = AWSEventStreamDecoder(model=model) + completion_stream = decoder.aiter_bytes( + response.aiter_bytes(chunk_size=1024) + ) # LOGGING logging_obj.post_call( @@ -233,13 +260,35 @@ def make_sync_call( if client is None: client = _get_httpx_client() # Create a new client if none provided - response = client.post(api_base, headers=headers, data=data, stream=True) + response = client.post( + api_base, + headers=headers, + data=data, + stream=True if "ai21" not in api_base else False, + ) if response.status_code != 200: raise BedrockError(status_code=response.status_code, message=response.read()) - decoder = AWSEventStreamDecoder(model=model) - completion_stream = decoder.iter_bytes(response.iter_bytes(chunk_size=1024)) + if "ai21" in api_base: + aws_bedrock_process_response = BedrockConverseLLM() + model_response: ModelResponse = aws_bedrock_process_response.process_response( + model=model, + response=response, + model_response=litellm.ModelResponse(), + stream=True, + logging_obj=logging_obj, + optional_params={}, + api_key="", + data=data, + messages=messages, + print_verbose=litellm.print_verbose, + encoding=litellm.encoding, + ) # type: ignore + completion_stream: Any = MockResponseIterator(model_response=model_response) + else: + decoder = AWSEventStreamDecoder(model=model) + completion_stream = decoder.iter_bytes(response.iter_bytes(chunk_size=1024)) # LOGGING logging_obj.post_call( @@ -1348,7 +1397,7 @@ class BedrockConverseLLM(BaseLLM): response: Union[requests.Response, httpx.Response], model_response: ModelResponse, stream: bool, - logging_obj: Logging, + logging_obj: Optional[Logging], optional_params: dict, api_key: str, data: Union[dict, str], @@ -1358,12 +1407,13 @@ class BedrockConverseLLM(BaseLLM): ) -> Union[ModelResponse, CustomStreamWrapper]: ## LOGGING - logging_obj.post_call( - input=messages, - api_key=api_key, - original_response=response.text, - additional_args={"complete_input_dict": data}, - ) + if logging_obj is not None: + logging_obj.post_call( + input=messages, + api_key=api_key, + original_response=response.text, + additional_args={"complete_input_dict": data}, + ) print_verbose(f"raw model_response: {response.text}") ## RESPONSE OBJECT @@ -1900,7 +1950,7 @@ class BedrockConverseLLM(BaseLLM): if acompletion: if isinstance(client, HTTPHandler): client = None - if stream is True and provider != "ai21": + if stream is True: return self.async_streaming( model=model, messages=messages, @@ -1937,7 +1987,7 @@ class BedrockConverseLLM(BaseLLM): client=client, ) # type: ignore - if (stream is not None and stream is True) and provider != "ai21": + if stream is not None and stream is True: streaming_response = CustomStreamWrapper( completion_stream=None, @@ -1981,7 +2031,7 @@ class BedrockConverseLLM(BaseLLM): model=model, response=response, model_response=model_response, - stream=stream, + stream=stream if isinstance(stream, bool) else False, logging_obj=logging_obj, optional_params=optional_params, api_key="", @@ -2168,3 +2218,49 @@ class AWSEventStreamDecoder: return 
None return chunk.decode() # type: ignore[no-any-return] + + +class MockResponseIterator: # for returning ai21 streaming responses + def __init__(self, model_response): + self.model_response = model_response + self.is_done = False + + # Sync iterator + def __iter__(self): + return self + + def _chunk_parser(self, chunk_data: ModelResponse) -> GenericStreamingChunk: + + try: + chunk_usage: litellm.Usage = getattr(chunk_data, "usage") + processed_chunk = GenericStreamingChunk( + text=chunk_data.choices[0].message.content or "", # type: ignore + tool_use=None, + is_finished=True, + finish_reason=chunk_data.choices[0].finish_reason, # type: ignore + usage=ConverseTokenUsageBlock( + inputTokens=chunk_usage.prompt_tokens, + outputTokens=chunk_usage.completion_tokens, + totalTokens=chunk_usage.total_tokens, + ), + index=0, + ) + return processed_chunk + except Exception: + raise ValueError(f"Failed to decode chunk: {chunk_data}") + + def __next__(self): + if self.is_done: + raise StopIteration + self.is_done = True + return self._chunk_parser(self.model_response) + + # Async iterator + def __aiter__(self): + return self + + async def __anext__(self): + if self.is_done: + raise StopAsyncIteration + self.is_done = True + return self._chunk_parser(self.model_response) diff --git a/litellm/tests/test_streaming.py b/litellm/tests/test_streaming.py index 8c7943893..d07aa681d 100644 --- a/litellm/tests/test_streaming.py +++ b/litellm/tests/test_streaming.py @@ -1312,22 +1312,22 @@ async def test_completion_replicate_llama3_streaming(sync_mode): # pytest.fail(f"Error occurred: {e}") -@pytest.mark.parametrize("sync_mode", [True]) # False +@pytest.mark.parametrize("sync_mode", [True, False]) # @pytest.mark.parametrize( - "model", + "model, region", [ - "bedrock/cohere.command-r-plus-v1:0", - "anthropic.claude-3-sonnet-20240229-v1:0", - "anthropic.claude-instant-v1", - "bedrock/ai21.j2-mid", - "mistral.mistral-7b-instruct-v0:2", - "bedrock/amazon.titan-tg1-large", - "meta.llama3-8b-instruct-v1:0", - "cohere.command-text-v14", + ["bedrock/ai21.jamba-instruct-v1:0", "us-east-1"], + ["bedrock/cohere.command-r-plus-v1:0", None], + ["anthropic.claude-3-sonnet-20240229-v1:0", None], + ["anthropic.claude-instant-v1", None], + ["mistral.mistral-7b-instruct-v0:2", None], + ["bedrock/amazon.titan-tg1-large", None], + ["meta.llama3-8b-instruct-v1:0", None], + ["cohere.command-text-v14", None], ], ) @pytest.mark.asyncio -async def test_bedrock_httpx_streaming(sync_mode, model): +async def test_bedrock_httpx_streaming(sync_mode, model, region): try: litellm.set_verbose = True if sync_mode: @@ -1337,6 +1337,7 @@ async def test_bedrock_httpx_streaming(sync_mode, model): messages=messages, max_tokens=10, # type: ignore stream=True, + aws_region_name=region, ) complete_response = "" # Add any assertions here to check the response @@ -1358,6 +1359,7 @@ async def test_bedrock_httpx_streaming(sync_mode, model): messages=messages, max_tokens=100, # type: ignore stream=True, + aws_region_name=region, ) complete_response = "" # Add any assertions here to check the response From dfc674622bcc1bdd369d183cf4fad0c180b5a123 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 18 Jul 2024 16:55:50 -0700 Subject: [PATCH 03/27] litellm router - use free / paid tier --- litellm/types/router.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/litellm/types/router.py b/litellm/types/router.py index e7b8971bc..df9947c26 100644 --- a/litellm/types/router.py +++ b/litellm/types/router.py @@ -91,6 +91,7 @@ class ModelInfo(BaseModel): 
base_model: Optional[str] = ( None # specify if the base model is azure/gpt-3.5-turbo etc for accurate cost tracking ) + tier: Optional[Literal["free", "paid"]] = None def __init__(self, id: Optional[Union[str, int]] = None, **params): if id is None: @@ -328,6 +329,7 @@ class LiteLLMParamsTypedDict(TypedDict, total=False): class DeploymentTypedDict(TypedDict): model_name: str litellm_params: LiteLLMParamsTypedDict + model_info: ModelInfo SPECIAL_MODEL_INFO_PARAMS = [ From 229b7a649378715e7e6ed605a6738dfc92dd7764 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 18 Jul 2024 17:06:06 -0700 Subject: [PATCH 04/27] helper to get_deployments_for_tier --- litellm/router_strategy/free_paid_tiers.py | 64 ++++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 litellm/router_strategy/free_paid_tiers.py diff --git a/litellm/router_strategy/free_paid_tiers.py b/litellm/router_strategy/free_paid_tiers.py new file mode 100644 index 000000000..4328bd84c --- /dev/null +++ b/litellm/router_strategy/free_paid_tiers.py @@ -0,0 +1,64 @@ +""" +Use this to route requests between free and paid tiers +""" + +from typing import Any, Dict, List, Literal, Optional, TypedDict, Union, cast + +from litellm._logging import verbose_logger +from litellm.types.router import DeploymentTypedDict + + +class ModelInfo(TypedDict): + tier: Literal["free", "paid"] + + +class Deployment(TypedDict): + model_info: ModelInfo + + +async def get_deployments_for_tier( + request_kwargs: dict, + healthy_deployments: Optional[ + Union[List[DeploymentTypedDict], List[Dict[str, Any]]] + ] = None, +): + """ + if request_kwargs contains {"metadata": {"tier": "free"}} or {"metadata": {"tier": "paid"}}, then routes the request to free/paid tier models + """ + verbose_logger.debug("request metadata: %s", request_kwargs.get("metadata")) + if "metadata" in request_kwargs: + metadata = request_kwargs["metadata"] + if "tier" in metadata: + selected_tier: Literal["free", "paid"] = metadata["tier"] + if healthy_deployments is None: + return None + + if selected_tier == "free": + # get all deployments where model_info has tier = free + free_deployments: List[Any] = [] + verbose_logger.debug( + "Getting deployments in free tier, all_deployments: %s", + healthy_deployments, + ) + for deployment in healthy_deployments: + typed_deployment = cast(Deployment, deployment) + if typed_deployment["model_info"]["tier"] == "free": + free_deployments.append(deployment) + verbose_logger.debug("free_deployments: %s", free_deployments) + return free_deployments + + elif selected_tier == "paid": + # get all deployments where model_info has tier = paid + paid_deployments: List[Any] = [] + for deployment in healthy_deployments: + typed_deployment = cast(Deployment, deployment) + if typed_deployment["model_info"]["tier"] == "paid": + paid_deployments.append(deployment) + verbose_logger.debug("paid_deployments: %s", paid_deployments) + return paid_deployments + + verbose_logger.debug( + "no tier found in metadata, returning healthy_deployments: %s", + healthy_deployments, + ) + return healthy_deployments From 0e70b5df14c5dc25c29a2424923cb7d801a1500d Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 18 Jul 2024 17:09:42 -0700 Subject: [PATCH 05/27] router - use free paid tier routing --- litellm/router.py | 7 ++ litellm/router_strategy/free_paid_tiers.py | 13 +++- litellm/tests/test_router_tiers.py | 90 ++++++++++++++++++++++ 3 files changed, 106 insertions(+), 4 deletions(-) create mode 100644 litellm/tests/test_router_tiers.py diff --git 
a/litellm/router.py b/litellm/router.py index 2f72b8142..487d5fd6a 100644 --- a/litellm/router.py +++ b/litellm/router.py @@ -47,6 +47,7 @@ from litellm.assistants.main import AssistantDeleted from litellm.caching import DualCache, InMemoryCache, RedisCache from litellm.integrations.custom_logger import CustomLogger from litellm.llms.azure import get_azure_ad_token_from_oidc +from litellm.router_strategy.free_paid_tiers import get_deployments_for_tier from litellm.router_strategy.least_busy import LeastBusyLoggingHandler from litellm.router_strategy.lowest_cost import LowestCostLoggingHandler from litellm.router_strategy.lowest_latency import LowestLatencyLoggingHandler @@ -4481,6 +4482,12 @@ class Router: request_kwargs=request_kwargs, ) + # check free / paid tier for each deployment + healthy_deployments = await get_deployments_for_tier( + request_kwargs=request_kwargs, + healthy_deployments=healthy_deployments, + ) + if len(healthy_deployments) == 0: if _allowed_model_region is None: _allowed_model_region = "n/a" diff --git a/litellm/router_strategy/free_paid_tiers.py b/litellm/router_strategy/free_paid_tiers.py index 4328bd84c..82e38b4f5 100644 --- a/litellm/router_strategy/free_paid_tiers.py +++ b/litellm/router_strategy/free_paid_tiers.py @@ -17,14 +17,19 @@ class Deployment(TypedDict): async def get_deployments_for_tier( - request_kwargs: dict, - healthy_deployments: Optional[ - Union[List[DeploymentTypedDict], List[Dict[str, Any]]] - ] = None, + request_kwargs: Optional[Dict[Any, Any]] = None, + healthy_deployments: Optional[Union[List[Any], Dict[Any, Any]]] = None, ): """ if request_kwargs contains {"metadata": {"tier": "free"}} or {"metadata": {"tier": "paid"}}, then routes the request to free/paid tier models """ + if request_kwargs is None: + verbose_logger.debug( + "get_deployments_for_tier: request_kwargs is None returning healthy_deployments: %s", + healthy_deployments, + ) + return healthy_deployments + verbose_logger.debug("request metadata: %s", request_kwargs.get("metadata")) if "metadata" in request_kwargs: metadata = request_kwargs["metadata"] diff --git a/litellm/tests/test_router_tiers.py b/litellm/tests/test_router_tiers.py new file mode 100644 index 000000000..54e67ded3 --- /dev/null +++ b/litellm/tests/test_router_tiers.py @@ -0,0 +1,90 @@ +#### What this tests #### +# This tests litellm router + +import asyncio +import os +import sys +import time +import traceback + +import openai +import pytest + +sys.path.insert( + 0, os.path.abspath("../..") +) # Adds the parent directory to the system path +import logging +import os +from collections import defaultdict +from concurrent.futures import ThreadPoolExecutor +from unittest.mock import AsyncMock, MagicMock, patch + +import httpx +from dotenv import load_dotenv + +import litellm +from litellm import Router +from litellm._logging import verbose_logger + +verbose_logger.setLevel(logging.DEBUG) + + +load_dotenv() + + +@pytest.mark.asyncio() +async def test_router_free_paid_tier(): + """ + Pass list of orgs in 1 model definition, + expect a unique deployment for each to be created + """ + router = litellm.Router( + model_list=[ + { + "model_name": "gpt-4", + "litellm_params": { + "model": "gpt-4o", + "api_base": "https://exampleopenaiendpoint-production.up.railway.app/", + }, + "model_info": {"tier": "paid", "id": "very-expensive-model"}, + }, + { + "model_name": "gpt-4", + "litellm_params": { + "model": "gpt-4o-mini", + "api_base": "https://exampleopenaiendpoint-production.up.railway.app/", + }, + "model_info": {"tier": 
"free", "id": "very-cheap-model"}, + }, + ] + ) + + for _ in range(5): + # this should pick model with id == very-cheap-model + response = await router.acompletion( + model="gpt-4", + messages=[{"role": "user", "content": "Tell me a joke."}], + metadata={"tier": "free"}, + ) + + print("Response: ", response) + + response_extra_info = response._hidden_params + print("response_extra_info: ", response_extra_info) + + assert response_extra_info["model_id"] == "very-cheap-model" + + for _ in range(5): + # this should pick model with id == very-cheap-model + response = await router.acompletion( + model="gpt-4", + messages=[{"role": "user", "content": "Tell me a joke."}], + metadata={"tier": "paid"}, + ) + + print("Response: ", response) + + response_extra_info = response._hidden_params + print("response_extra_info: ", response_extra_info) + + assert response_extra_info["model_id"] == "very-expensive-model" From de8c92b11db8877241edd2b77ab4af2f1515a420 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 18 Jul 2024 17:15:47 -0700 Subject: [PATCH 06/27] feat - enterprise --- litellm/proxy/litellm_pre_call_utils.py | 29 ++++++++++++++++--------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/litellm/proxy/litellm_pre_call_utils.py b/litellm/proxy/litellm_pre_call_utils.py index eaa2303ba..283f31e3c 100644 --- a/litellm/proxy/litellm_pre_call_utils.py +++ b/litellm/proxy/litellm_pre_call_utils.py @@ -4,7 +4,7 @@ from typing import TYPE_CHECKING, Any, Dict, Optional from fastapi import Request from litellm._logging import verbose_logger, verbose_proxy_logger -from litellm.proxy._types import UserAPIKeyAuth +from litellm.proxy._types import CommonProxyErrors, UserAPIKeyAuth from litellm.types.utils import SupportedCacheControls if TYPE_CHECKING: @@ -95,15 +95,6 @@ async def add_litellm_data_to_request( cache_dict = parse_cache_control(cache_control_header) data["ttl"] = cache_dict.get("s-maxage") - ### KEY-LEVEL CACHNG - key_metadata = user_api_key_dict.metadata - if "cache" in key_metadata: - data["cache"] = {} - if isinstance(key_metadata["cache"], dict): - for k, v in key_metadata["cache"].items(): - if k in SupportedCacheControls: - data["cache"][k] = v - verbose_proxy_logger.debug("receiving data: %s", data) _metadata_variable_name = _get_metadata_variable_name(request) @@ -133,6 +124,24 @@ async def add_litellm_data_to_request( user_api_key_dict, "team_alias", None ) + ### KEY-LEVEL Contorls + key_metadata = user_api_key_dict.metadata + if "cache" in key_metadata: + data["cache"] = {} + if isinstance(key_metadata["cache"], dict): + for k, v in key_metadata["cache"].items(): + if k in SupportedCacheControls: + data["cache"][k] = v + if "tier" in key_metadata: + if premium_user is not True: + verbose_logger.warning( + "Trying to use free/paid tier feature. 
This will not be applied %s", + CommonProxyErrors.not_premium_user.value, + ) + + # add request tier to metadata + data[_metadata_variable_name]["tier"] = key_metadata["tier"] + # Team spend, budget - used by prometheus.py data[_metadata_variable_name][ "user_api_key_team_max_budget" From 59d599d5fdafe92ecef79c8311f8caf21ec01b6a Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 18 Jul 2024 17:31:02 -0700 Subject: [PATCH 07/27] test adding free / paid tier to metadata --- litellm/tests/test_litellm_pre_call_utils.py | 60 ++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 litellm/tests/test_litellm_pre_call_utils.py diff --git a/litellm/tests/test_litellm_pre_call_utils.py b/litellm/tests/test_litellm_pre_call_utils.py new file mode 100644 index 000000000..7f56d693d --- /dev/null +++ b/litellm/tests/test_litellm_pre_call_utils.py @@ -0,0 +1,60 @@ +""" +Tests litellm pre_call_utils +""" + +import os +import sys +import traceback +import uuid +from datetime import datetime + +from dotenv import load_dotenv +from fastapi import Request +from fastapi.routing import APIRoute + +from litellm.proxy._types import UserAPIKeyAuth +from litellm.proxy.litellm_pre_call_utils import add_litellm_data_to_request +from litellm.proxy.proxy_server import ProxyConfig, chat_completion + +load_dotenv() +import io +import os +import time + +import pytest + +# this file is to test litellm/proxy + +sys.path.insert( + 0, os.path.abspath("../..") +) # Adds the parent directory to the system path + + +@pytest.mark.parametrize("tier", ["free", "paid"]) +@pytest.mark.asyncio() +async def test_adding_key_tier_to_request_metadata(tier): + """ + Tests if we can add tier: free/paid from key metadata to the request metadata + """ + data = {} + + api_route = APIRoute(path="/chat/completions", endpoint=chat_completion) + request = Request( + { + "type": "http", + "method": "POST", + "route": api_route, + "path": api_route.path, + "headers": [], + } + ) + new_data = await add_litellm_data_to_request( + data=data, + request=request, + user_api_key_dict=UserAPIKeyAuth(metadata={"tier": tier}), + proxy_config=ProxyConfig(), + ) + + print("new_data", new_data) + + assert new_data["metadata"]["tier"] == tier From 9f02fb5a33b9a9ddbb4557f2695e7b8c702251d4 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 18 Jul 2024 18:01:21 -0700 Subject: [PATCH 08/27] docs using free, paid tier --- docs/my-website/docs/proxy/free_paid_tier.md | 102 +++++++++++++++++++ docs/my-website/sidebars.js | 3 +- 2 files changed, 104 insertions(+), 1 deletion(-) create mode 100644 docs/my-website/docs/proxy/free_paid_tier.md diff --git a/docs/my-website/docs/proxy/free_paid_tier.md b/docs/my-website/docs/proxy/free_paid_tier.md new file mode 100644 index 000000000..01230e1f0 --- /dev/null +++ b/docs/my-website/docs/proxy/free_paid_tier.md @@ -0,0 +1,102 @@ +# πŸ’Έ Free, Paid Tier Routing + +Route Virtual Keys on `free tier` to cheaper models + +### 1. 
Define free, paid tier models on config.yaml + +:::info +Requests with `model=gpt-4` will be routed to either `openai/fake` or `openai/gpt-4o` depending on which tier the virtual key is on +::: + +```yaml +model_list: + - model_name: gpt-4 + litellm_params: + model: openai/fake + api_key: fake-key + api_base: https://exampleopenaiendpoint-production.up.railway.app/ + model_info: + tier: free # πŸ‘ˆ Key Change - set `tier to paid or free` + - model_name: gpt-4 + litellm_params: + model: openai/gpt-4o + api_key: os.environ/OPENAI_API_KEY + model_info: + tier: paid # πŸ‘ˆ Key Change - set `tier to paid or free` + +general_settings: + master_key: sk-1234 +``` + +### 2. Create Virtual Keys with pricing `tier=free` + +```shell +curl --location 'http://0.0.0.0:4000/key/generate' \ + --header 'Authorization: Bearer sk-1234' \ + --header 'Content-Type: application/json' \ + --data '{ + "metadata": {"tier": "free"} +}' +``` + +### 3. Make Request with Key on `Free Tier` + +```shell +curl -i http://localhost:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ +curl -i http://localhost:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-inxzoSurQsjog9gPrVOCcA" \ + -d '{ + "model": "gpt-4", + "messages": [ + {"role": "user", "content": "Hello, Claude gm!"} + ] + }' +``` + +**Expected Response** + +If this worked as expected then `x-litellm-model-api-base` should be `https://exampleopenaiendpoint-production.up.railway.app/` in the response headers + +```shell +x-litellm-model-api-base: https://exampleopenaiendpoint-production.up.railway.app/ + +{"id":"chatcmpl-657b750f581240c1908679ed94b31bfe","choices":[{"finish_reason":"stop","index":0,"message":{"content":"\n\nHello there, how may I assist you today?","role":"assistant","tool_calls":null,"function_call":null}}],"created":1677652288,"model":"gpt-3.5-turbo-0125","object":"chat.completion","system_fingerprint":"fp_44709d6fcb","usage":{"completion_tokens":12,"prompt_tokens":9,"total_tokens":21}}% +``` + + +### 4. Create Virtual Keys with pricing `tier=paid` + +```shell +curl --location 'http://0.0.0.0:4000/key/generate' \ + --header 'Authorization: Bearer sk-1234' \ + --header 'Content-Type: application/json' \ + --data '{ + "metadata": {"tier": "paid"} + }' +``` + +### 5. Make Request with Key on `Paid Tier` + +```shell +curl -i http://localhost:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-mnJoeSc6jFjzZr256q-iqA" \ + -d '{ + "model": "gpt-4", + "messages": [ + {"role": "user", "content": "Hello, Claude gm!"} + ] + }' +``` + +**Expected Response** + +If this worked as expected then `x-litellm-model-api-base` should be `https://api.openai.com` in the response headers + +```shell +x-litellm-model-api-base: https://api.openai.com + +{"id":"chatcmpl-9mW75EbJCgwmLcO0M5DmwxpiBgWdc","choices":[{"finish_reason":"stop","index":0,"message":{"content":"Good morning! 
How can I assist you today?","role":"assistant","tool_calls":null,"function_call":null}}],"created":1721350215,"model":"gpt-4o-2024-05-13","object":"chat.completion","system_fingerprint":"fp_c4e5b6fa31","usage":{"completion_tokens":10,"prompt_tokens":12,"total_tokens":22}} +``` diff --git a/docs/my-website/sidebars.js b/docs/my-website/sidebars.js index d2179cafc..a74543c87 100644 --- a/docs/my-website/sidebars.js +++ b/docs/my-website/sidebars.js @@ -43,11 +43,12 @@ const sidebars = { "proxy/reliability", "proxy/cost_tracking", "proxy/self_serve", + "proxy/virtual_keys", + "proxy/free_paid_tier", "proxy/users", "proxy/team_budgets", "proxy/customers", "proxy/billing", - "proxy/virtual_keys", "proxy/guardrails", "proxy/token_auth", "proxy/alerting", From 946db012d4990dec3ee38ad4556692d2401ae55d Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 18 Jul 2024 18:01:30 -0700 Subject: [PATCH 09/27] docs free/paid tier --- litellm/proxy/proxy_config.yaml | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index 3f3b0858e..7e78cf317 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -1,23 +1,19 @@ model_list: - - model_name: fake-openai-endpoint + - model_name: gpt-4 litellm_params: model: openai/fake api_key: fake-key api_base: https://exampleopenaiendpoint-production.up.railway.app/ - - model_name: gemini-flash - litellm_params: - model: gemini/gemini-1.5-flash - - model_name: whisper - litellm_params: - model: whisper-1 - api_key: sk-******* - max_file_size_mb: 1000 model_info: - mode: audio_transcription + tier: free # πŸ‘ˆ Key Change - set `tier` + - model_name: gpt-4 + litellm_params: + model: openai/gpt-4o + api_key: os.environ/OPENAI_API_KEY + model_info: + tier: paid # πŸ‘ˆ Key Change - set `tier` general_settings: master_key: sk-1234 -litellm_settings: - success_callback: ["langsmith"] From f8bdfe7cc382787d75a907dbdd86e20a85476857 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 18 Jul 2024 18:16:00 -0700 Subject: [PATCH 10/27] fix test amazing vertex medlm --- .../tests/test_amazing_vertex_completion.py | 66 +++++++------------ 1 file changed, 22 insertions(+), 44 deletions(-) diff --git a/litellm/tests/test_amazing_vertex_completion.py b/litellm/tests/test_amazing_vertex_completion.py index 6a381022e..b8ba54cb4 100644 --- a/litellm/tests/test_amazing_vertex_completion.py +++ b/litellm/tests/test_amazing_vertex_completion.py @@ -36,6 +36,20 @@ litellm.cache = None user_message = "Write a short poem about the sky" messages = [{"content": user_message, "role": "user"}] +VERTEX_MODELS_TO_NOT_TEST = [ + "medlm-medium", + "medlm-large", + "code-gecko", + "code-gecko@001", + "code-gecko@002", + "code-gecko@latest", + "codechat-bison@latest", + "code-bison@001", + "text-bison@001", + "gemini-1.5-pro", + "gemini-1.5-pro-preview-0215", +] + def get_vertex_ai_creds_json() -> dict: # Define the path to the vertex_key.json file @@ -327,17 +341,7 @@ def test_vertex_ai(): test_models += litellm.vertex_language_models # always test gemini-pro for model in test_models: try: - if model in [ - "code-gecko", - "code-gecko@001", - "code-gecko@002", - "code-gecko@latest", - "codechat-bison@latest", - "code-bison@001", - "text-bison@001", - "gemini-1.5-pro", - "gemini-1.5-pro-preview-0215", - ] or ( + if model in VERTEX_MODELS_TO_NOT_TEST or ( "gecko" in model or "32k" in model or "ultra" in model or "002" in model ): # our account does not have access to this 
model @@ -382,17 +386,7 @@ def test_vertex_ai_stream(): test_models += litellm.vertex_language_models # always test gemini-pro for model in test_models: try: - if model in [ - "code-gecko", - "code-gecko@001", - "code-gecko@002", - "code-gecko@latest", - "codechat-bison@latest", - "code-bison@001", - "text-bison@001", - "gemini-1.5-pro", - "gemini-1.5-pro-preview-0215", - ] or ( + if model in VERTEX_MODELS_TO_NOT_TEST or ( "gecko" in model or "32k" in model or "ultra" in model or "002" in model ): # our account does not have access to this model @@ -437,17 +431,9 @@ async def test_async_vertexai_response(): test_models += litellm.vertex_language_models # always test gemini-pro for model in test_models: print(f"model being tested in async call: {model}") - if model in [ - "code-gecko", - "code-gecko@001", - "code-gecko@002", - "code-gecko@latest", - "codechat-bison@latest", - "code-bison@001", - "text-bison@001", - "gemini-1.5-pro", - "gemini-1.5-pro-preview-0215", - ] or ("gecko" in model or "32k" in model or "ultra" in model or "002" in model): + if model in VERTEX_MODELS_TO_NOT_TEST or ( + "gecko" in model or "32k" in model or "ultra" in model or "002" in model + ): # our account does not have access to this model continue try: @@ -484,17 +470,9 @@ async def test_async_vertexai_streaming_response(): test_models = random.sample(test_models, 1) test_models += litellm.vertex_language_models # always test gemini-pro for model in test_models: - if model in [ - "code-gecko", - "code-gecko@001", - "code-gecko@002", - "code-gecko@latest", - "codechat-bison@latest", - "code-bison@001", - "text-bison@001", - "gemini-1.5-pro", - "gemini-1.5-pro-preview-0215", - ] or ("gecko" in model or "32k" in model or "ultra" in model or "002" in model): + if model in VERTEX_MODELS_TO_NOT_TEST or ( + "gecko" in model or "32k" in model or "ultra" in model or "002" in model + ): # our account does not have access to this model continue try: From f2401d6d5eadef39036feacb034dba6f5e102a74 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 18 Jul 2024 16:57:38 -0700 Subject: [PATCH 11/27] feat(vertex_ai_anthropic.py): support response_schema for vertex ai anthropic calls allows passing response_schema for anthropic calls. supports schema validation. 
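A rough caller-side sketch of what this change enables; the model name, region, and schema below are illustrative values (mirroring the docs and test fixtures in this series), not part of the diff itself:

```python
# Illustrative only: passing response_schema to a Vertex AI Anthropic model.
# Model name, region, and schema are example values; vertex credentials are
# assumed to already be configured in the environment.
from litellm import completion

response_schema = {
    "type": "array",
    "items": {
        "type": "object",
        "properties": {"recipe_name": {"type": "string"}},
        "required": ["recipe_name"],
    },
}

resp = completion(
    model="vertex_ai/claude-3-5-sonnet@20240620",
    messages=[{"role": "user", "content": "List 5 cookie recipes"}],
    response_format={
        "type": "json_object",
        "response_schema": response_schema,
        "enforce_validation": True,  # validate the returned JSON client-side
    },
    vertex_location="us-east5",
)
print(resp.choices[0].message.content)
```
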
--- .../json_validation_rule.py | 7 +- litellm/llms/anthropic.py | 71 ++++++++---- litellm/llms/vertex_ai_anthropic.py | 36 +++++- litellm/main.py | 5 + litellm/proxy/_new_secret_config.yaml | 14 ++- .../tests/test_amazing_vertex_completion.py | 104 ++++++++++++++---- 6 files changed, 189 insertions(+), 48 deletions(-) diff --git a/litellm/litellm_core_utils/json_validation_rule.py b/litellm/litellm_core_utils/json_validation_rule.py index f19144aaf..0f37e6737 100644 --- a/litellm/litellm_core_utils/json_validation_rule.py +++ b/litellm/litellm_core_utils/json_validation_rule.py @@ -13,7 +13,12 @@ def validate_schema(schema: dict, response: str): from litellm import JSONSchemaValidationError - response_dict = json.loads(response) + try: + response_dict = json.loads(response) + except json.JSONDecodeError: + raise JSONSchemaValidationError( + model="", llm_provider="", raw_response=response, schema=response + ) try: validate(response_dict, schema=schema) diff --git a/litellm/llms/anthropic.py b/litellm/llms/anthropic.py index af5ccf828..b666d9494 100644 --- a/litellm/llms/anthropic.py +++ b/litellm/llms/anthropic.py @@ -16,6 +16,7 @@ from litellm import verbose_logger from litellm.litellm_core_utils.core_helpers import map_finish_reason from litellm.llms.custom_httpx.http_handler import ( AsyncHTTPHandler, + HTTPHandler, _get_async_httpx_client, _get_httpx_client, ) @@ -538,7 +539,7 @@ class AnthropicChatCompletion(BaseLLM): def __init__(self) -> None: super().__init__() - def process_response( + def _process_response( self, model: str, response: Union[requests.Response, httpx.Response], @@ -551,6 +552,7 @@ class AnthropicChatCompletion(BaseLLM): messages: List, print_verbose, encoding, + json_mode: bool, ) -> ModelResponse: ## LOGGING logging_obj.post_call( @@ -574,27 +576,40 @@ class AnthropicChatCompletion(BaseLLM): ) else: text_content = "" - tool_calls = [] - for content in completion_response["content"]: + tool_calls: List[ChatCompletionToolCallChunk] = [] + for idx, content in enumerate(completion_response["content"]): if content["type"] == "text": text_content += content["text"] ## TOOL CALLING elif content["type"] == "tool_use": tool_calls.append( - { - "id": content["id"], - "type": "function", - "function": { - "name": content["name"], - "arguments": json.dumps(content["input"]), - }, - } + ChatCompletionToolCallChunk( + id=content["id"], + type="function", + function=ChatCompletionToolCallFunctionChunk( + name=content["name"], + arguments=json.dumps(content["input"]), + ), + index=idx, + ) ) _message = litellm.Message( tool_calls=tool_calls, content=text_content or None, ) + + ## HANDLE JSON MODE - anthropic returns single function call + if json_mode and len(tool_calls) == 1: + json_mode_content_str: Optional[str] = tool_calls[0]["function"].get( + "arguments" + ) + if json_mode_content_str is not None: + args = json.loads(json_mode_content_str) + values: Optional[dict] = args.get("values") + if values is not None: + _message = litellm.Message(content=json.dumps(values)) + completion_response["stop_reason"] = "stop" model_response.choices[0].message = _message # type: ignore model_response._hidden_params["original_response"] = completion_response[ "content" @@ -687,9 +702,11 @@ class AnthropicChatCompletion(BaseLLM): _is_function_call, data: dict, optional_params: dict, + json_mode: bool, litellm_params=None, logger_fn=None, headers={}, + client=None, ) -> Union[ModelResponse, CustomStreamWrapper]: async_handler = _get_async_httpx_client() @@ -705,7 +722,7 @@ class 
AnthropicChatCompletion(BaseLLM): ) raise e - return self.process_response( + return self._process_response( model=model, response=response, model_response=model_response, @@ -717,6 +734,7 @@ class AnthropicChatCompletion(BaseLLM): print_verbose=print_verbose, optional_params=optional_params, encoding=encoding, + json_mode=json_mode, ) def completion( @@ -731,10 +749,12 @@ class AnthropicChatCompletion(BaseLLM): api_key, logging_obj, optional_params: dict, + timeout: Union[float, httpx.Timeout], acompletion=None, litellm_params=None, logger_fn=None, headers={}, + client=None, ): headers = validate_environment(api_key, headers, model) _is_function_call = False @@ -787,14 +807,18 @@ class AnthropicChatCompletion(BaseLLM): anthropic_tools = [] for tool in optional_params["tools"]: - new_tool = tool["function"] - new_tool["input_schema"] = new_tool.pop("parameters") # rename key - anthropic_tools.append(new_tool) + if "input_schema" in tool: # assume in anthropic format + anthropic_tools.append(tool) + else: # assume openai tool call + new_tool = tool["function"] + new_tool["input_schema"] = new_tool.pop("parameters") # rename key + anthropic_tools.append(new_tool) optional_params["tools"] = anthropic_tools stream = optional_params.pop("stream", None) is_vertex_request: bool = optional_params.pop("is_vertex_request", False) + json_mode: bool = optional_params.pop("json_mode", False) data = { "messages": messages, @@ -815,7 +839,7 @@ class AnthropicChatCompletion(BaseLLM): }, ) print_verbose(f"_is_function_call: {_is_function_call}") - if acompletion == True: + if acompletion is True: if ( stream is True ): # if function call - fake the streaming (need complete blocks for output parsing in openai format) @@ -857,15 +881,21 @@ class AnthropicChatCompletion(BaseLLM): litellm_params=litellm_params, logger_fn=logger_fn, headers=headers, + client=client, + json_mode=json_mode, ) else: ## COMPLETION CALL + if client is None or isinstance(client, AsyncHTTPHandler): + client = HTTPHandler(timeout=timeout) # type: ignore + else: + client = client if ( stream is True ): # if function call - fake the streaming (need complete blocks for output parsing in openai format) print_verbose("makes anthropic streaming POST request") data["stream"] = stream - response = requests.post( + response = client.post( api_base, headers=headers, data=json.dumps(data), @@ -889,15 +919,13 @@ class AnthropicChatCompletion(BaseLLM): return streaming_response else: - response = requests.post( - api_base, headers=headers, data=json.dumps(data) - ) + response = client.post(api_base, headers=headers, data=json.dumps(data)) if response.status_code != 200: raise AnthropicError( status_code=response.status_code, message=response.text ) - return self.process_response( + return self._process_response( model=model, response=response, model_response=model_response, @@ -909,6 +937,7 @@ class AnthropicChatCompletion(BaseLLM): print_verbose=print_verbose, optional_params=optional_params, encoding=encoding, + json_mode=json_mode, ) def embedding(self): diff --git a/litellm/llms/vertex_ai_anthropic.py b/litellm/llms/vertex_ai_anthropic.py index b8362d5a5..900e7795f 100644 --- a/litellm/llms/vertex_ai_anthropic.py +++ b/litellm/llms/vertex_ai_anthropic.py @@ -7,7 +7,7 @@ import time import types import uuid from enum import Enum -from typing import Any, Callable, List, Optional, Tuple +from typing import Any, Callable, List, Optional, Tuple, Union import httpx # type: ignore import requests # type: ignore @@ -15,7 +15,14 @@ import requests # 
type: ignore import litellm from litellm.litellm_core_utils.core_helpers import map_finish_reason from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler -from litellm.types.llms.anthropic import AnthropicMessagesToolChoice +from litellm.types.llms.anthropic import ( + AnthropicMessagesTool, + AnthropicMessagesToolChoice, +) +from litellm.types.llms.openai import ( + ChatCompletionToolParam, + ChatCompletionToolParamFunctionChunk, +) from litellm.types.utils import ResponseFormatChunk from litellm.utils import CustomStreamWrapper, ModelResponse, Usage @@ -142,7 +149,27 @@ class VertexAIAnthropicConfig: if param == "top_p": optional_params["top_p"] = value if param == "response_format" and "response_schema" in value: - optional_params["response_format"] = ResponseFormatChunk(**value) # type: ignore + """ + When using tools in this way: - https://docs.anthropic.com/en/docs/build-with-claude/tool-use#json-mode + - You usually want to provide a single tool + - You should set tool_choice (see Forcing tool use) to instruct the model to explicitly use that tool + - Remember that the model will pass the input to the tool, so the name of the tool and description should be from the model’s perspective. + """ + _tool_choice = None + _tool_choice = {"name": "json_tool_call", "type": "tool"} + + _tool = AnthropicMessagesTool( + name="json_tool_call", + input_schema={ + "type": "object", + "properties": {"values": value["response_schema"]}, # type: ignore + }, + ) + + optional_params["tools"] = [_tool] + optional_params["tool_choice"] = _tool_choice + optional_params["json_mode"] = True + return optional_params @@ -222,6 +249,7 @@ def completion( optional_params: dict, custom_prompt_dict: dict, headers: Optional[dict], + timeout: Union[float, httpx.Timeout], vertex_project=None, vertex_location=None, vertex_credentials=None, @@ -301,6 +329,8 @@ def completion( litellm_params=litellm_params, logger_fn=logger_fn, headers=vertex_headers, + client=client, + timeout=timeout, ) except Exception as e: diff --git a/litellm/main.py b/litellm/main.py index e01603b7e..69c845ad8 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -1528,6 +1528,8 @@ def completion( api_key=api_key, logging_obj=logging, headers=headers, + timeout=timeout, + client=client, ) if optional_params.get("stream", False) or acompletion == True: ## LOGGING @@ -2046,7 +2048,10 @@ def completion( acompletion=acompletion, headers=headers, custom_prompt_dict=custom_prompt_dict, + timeout=timeout, + client=client, ) + else: model_response = vertex_ai.completion( model=model, diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml index 641c70ebc..1bd421f8d 100644 --- a/litellm/proxy/_new_secret_config.yaml +++ b/litellm/proxy/_new_secret_config.yaml @@ -1,5 +1,13 @@ model_list: - - model_name: llama-3 + - model_name: bad-azure-model litellm_params: - model: gpt-4 - request_timeout: 1 + model: azure/chatgpt-v-2 + azure_ad_token: "" + api_base: os.environ/AZURE_API_BASE + + - model_name: good-openai-model + litellm_params: + model: gpt-3.5-turbo + +litellm_settings: + fallbacks: [{"bad-azure-model": ["good-openai-model"]}] diff --git a/litellm/tests/test_amazing_vertex_completion.py b/litellm/tests/test_amazing_vertex_completion.py index b8ba54cb4..3def5a1ec 100644 --- a/litellm/tests/test_amazing_vertex_completion.py +++ b/litellm/tests/test_amazing_vertex_completion.py @@ -1128,6 +1128,39 @@ def vertex_httpx_mock_post_valid_response(*args, **kwargs): return mock_response +def 
vertex_httpx_mock_post_valid_response_anthropic(*args, **kwargs): + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.headers = {"Content-Type": "application/json"} + mock_response.json.return_value = { + "id": "msg_vrtx_013Wki5RFQXAspL7rmxRFjZg", + "type": "message", + "role": "assistant", + "model": "claude-3-5-sonnet-20240620", + "content": [ + { + "type": "tool_use", + "id": "toolu_vrtx_01YMnYZrToPPfcmY2myP2gEB", + "name": "json_tool_call", + "input": { + "values": [ + {"recipe_name": "Chocolate Chip Cookies"}, + {"recipe_name": "Oatmeal Raisin Cookies"}, + {"recipe_name": "Peanut Butter Cookies"}, + {"recipe_name": "Snickerdoodle Cookies"}, + {"recipe_name": "Sugar Cookies"}, + ] + }, + } + ], + "stop_reason": "tool_use", + "stop_sequence": None, + "usage": {"input_tokens": 368, "output_tokens": 118}, + } + + return mock_response + + def vertex_httpx_mock_post_invalid_schema_response(*args, **kwargs): mock_response = MagicMock() mock_response.status_code = 200 @@ -1183,11 +1216,29 @@ def vertex_httpx_mock_post_invalid_schema_response(*args, **kwargs): return mock_response +def vertex_httpx_mock_post_invalid_schema_response_anthropic(*args, **kwargs): + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.headers = {"Content-Type": "application/json"} + mock_response.json.return_value = { + "id": "msg_vrtx_013Wki5RFQXAspL7rmxRFjZg", + "type": "message", + "role": "assistant", + "model": "claude-3-5-sonnet-20240620", + "content": [{"text": "Hi! My name is Claude.", "type": "text"}], + "stop_reason": "end_turn", + "stop_sequence": None, + "usage": {"input_tokens": 368, "output_tokens": 118}, + } + return mock_response + + @pytest.mark.parametrize( "model, vertex_location, supports_response_schema", [ ("vertex_ai_beta/gemini-1.5-pro-001", "us-central1", True), ("vertex_ai_beta/gemini-1.5-flash", "us-central1", False), + ("vertex_ai/claude-3-5-sonnet@20240620", "us-east5", False), ], ) @pytest.mark.parametrize( @@ -1231,12 +1282,21 @@ async def test_gemini_pro_json_schema_args_sent_httpx( httpx_response = MagicMock() if invalid_response is True: - httpx_response.side_effect = vertex_httpx_mock_post_invalid_schema_response + if "claude" in model: + httpx_response.side_effect = ( + vertex_httpx_mock_post_invalid_schema_response_anthropic + ) + else: + httpx_response.side_effect = vertex_httpx_mock_post_invalid_schema_response else: - httpx_response.side_effect = vertex_httpx_mock_post_valid_response + if "claude" in model: + httpx_response.side_effect = vertex_httpx_mock_post_valid_response_anthropic + else: + httpx_response.side_effect = vertex_httpx_mock_post_valid_response with patch.object(client, "post", new=httpx_response) as mock_call: + print("SENDING CLIENT POST={}".format(client.post)) try: - _ = completion( + resp = completion( model=model, messages=messages, response_format={ @@ -1247,30 +1307,34 @@ async def test_gemini_pro_json_schema_args_sent_httpx( vertex_location=vertex_location, client=client, ) + print("Received={}".format(resp)) if invalid_response is True and enforce_validation is True: pytest.fail("Expected this to fail") except litellm.JSONSchemaValidationError as e: - if invalid_response is False and "claude-3" not in model: + if invalid_response is False: pytest.fail("Expected this to pass. 
Got={}".format(e)) mock_call.assert_called_once() - print(mock_call.call_args.kwargs) - print(mock_call.call_args.kwargs["json"]["generationConfig"]) + if "claude" not in model: + print(mock_call.call_args.kwargs) + print(mock_call.call_args.kwargs["json"]["generationConfig"]) - if supports_response_schema: - assert ( - "response_schema" - in mock_call.call_args.kwargs["json"]["generationConfig"] - ) - else: - assert ( - "response_schema" - not in mock_call.call_args.kwargs["json"]["generationConfig"] - ) - assert ( - "Use this JSON schema:" - in mock_call.call_args.kwargs["json"]["contents"][0]["parts"][1]["text"] - ) + if supports_response_schema: + assert ( + "response_schema" + in mock_call.call_args.kwargs["json"]["generationConfig"] + ) + else: + assert ( + "response_schema" + not in mock_call.call_args.kwargs["json"]["generationConfig"] + ) + assert ( + "Use this JSON schema:" + in mock_call.call_args.kwargs["json"]["contents"][0]["parts"][1][ + "text" + ] + ) @pytest.mark.parametrize("provider", ["vertex_ai_beta"]) # "vertex_ai", From af0d30e41eddd6b39369588d3c79b0cc07b7ec67 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 18 Jul 2024 17:20:19 -0700 Subject: [PATCH 12/27] docs(json_mode.md): add json mode to docs --- docs/my-website/docs/completion/json_mode.md | 137 +++++++++++++++++++ docs/my-website/sidebars.js | 1 + 2 files changed, 138 insertions(+) create mode 100644 docs/my-website/docs/completion/json_mode.md diff --git a/docs/my-website/docs/completion/json_mode.md b/docs/my-website/docs/completion/json_mode.md new file mode 100644 index 000000000..0e7e64a8e --- /dev/null +++ b/docs/my-website/docs/completion/json_mode.md @@ -0,0 +1,137 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# JSON Mode + +## Quick Start + + + + +```python +from litellm import completion +import os + +os.environ["OPENAI_API_KEY"] = "" + +response = completion( + model="gpt-4o-mini", + response_format={ "type": "json_object" }, + messages=[ + {"role": "system", "content": "You are a helpful assistant designed to output JSON."}, + {"role": "user", "content": "Who won the world series in 2020?"} + ] +) +print(response.choices[0].message.content) +``` + + + +```bash +curl http://0.0.0.0:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer $LITELLM_KEY" \ + -d '{ + "model": "gpt-4o-mini", + "response_format": { "type": "json_object" }, + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant designed to output JSON." + }, + { + "role": "user", + "content": "Who won the world series in 2020?" + } + ] + }' +``` + + + +## Check Model Support + +Call `litellm.get_supported_openai_params` to check if a model/provider supports `response_format`. + +```python +from litellm import get_supported_openai_params + +params = get_supported_openai_params(model="anthropic.claude-3", custom_llm_provider="bedrock") + +assert "response_format" in params +``` + +## Validate JSON Schema + +For VertexAI models, LiteLLM supports passing the `response_schema` and validating the JSON output. + +This works across Gemini (`vertex_ai_beta/`) + Anthropic (`vertex_ai/`) models. 
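When `enforce_validation` is set and the returned output does not satisfy `response_schema` (or is not valid JSON), LiteLLM raises `litellm.JSONSchemaValidationError`, which you can handle client-side. A minimal sketch, assuming vertex credentials are already configured (e.g. via `gcloud auth application-default login`); how you retry or fall back is up to your application:

```python
import litellm
from litellm import completion

try:
    resp = completion(
        model="vertex_ai_beta/gemini-1.5-pro",
        messages=[{"role": "user", "content": "List 5 cookie recipes"}],
        response_format={
            "type": "json_object",
            "response_schema": {"type": "array", "items": {"type": "object"}},
            "enforce_validation": True,
        },
    )
except litellm.JSONSchemaValidationError:
    # the model returned JSON that does not satisfy the schema, or non-JSON text
    resp = None
```
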
+ + + + + +```python +# !gcloud auth application-default login - run this to add vertex credentials to your env + +from litellm import completion + +messages = [{"role": "user", "content": "List 5 cookie recipes"}] + +response_schema = { + "type": "array", + "items": { + "type": "object", + "properties": { + "recipe_name": { + "type": "string", + }, + }, + "required": ["recipe_name"], + }, +} + +resp = completion( + model="vertex_ai_beta/gemini-1.5-pro", + messages=messages, + response_format={ + "type": "json_object", + "response_schema": response_schema, + "enforce_validation": True, # client-side json schema validation + }, + vertex_location="us-east5", +) + +print("Received={}".format(resp)) +``` + + + +```bash +curl http://0.0.0.0:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer $LITELLM_API_KEY" \ + -d '{ + "model": "vertex_ai_beta/gemini-1.5-pro", + "messages": [{"role": "user", "content": "List 5 cookie recipes"}] + "response_format": { + "type": "json_object", + "enforce_validation: true, + "response_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { + "recipe_name": { + "type": "string", + }, + }, + "required": ["recipe_name"], + }, + } + }, + }' +``` + + + \ No newline at end of file diff --git a/docs/my-website/sidebars.js b/docs/my-website/sidebars.js index a74543c87..eea863d8e 100644 --- a/docs/my-website/sidebars.js +++ b/docs/my-website/sidebars.js @@ -92,6 +92,7 @@ const sidebars = { items: [ "completion/input", "completion/provider_specific_params", + "completion/json_mode", "completion/drop_params", "completion/prompt_formatting", "completion/output", From b0f0898f2fea066f60ec324ecccddd1fd357cff8 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 18 Jul 2024 17:06:06 -0700 Subject: [PATCH 13/27] helper to get_deployments_for_tier --- litellm/router_strategy/free_paid_tiers.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/litellm/router_strategy/free_paid_tiers.py b/litellm/router_strategy/free_paid_tiers.py index 82e38b4f5..4328bd84c 100644 --- a/litellm/router_strategy/free_paid_tiers.py +++ b/litellm/router_strategy/free_paid_tiers.py @@ -17,19 +17,14 @@ class Deployment(TypedDict): async def get_deployments_for_tier( - request_kwargs: Optional[Dict[Any, Any]] = None, - healthy_deployments: Optional[Union[List[Any], Dict[Any, Any]]] = None, + request_kwargs: dict, + healthy_deployments: Optional[ + Union[List[DeploymentTypedDict], List[Dict[str, Any]]] + ] = None, ): """ if request_kwargs contains {"metadata": {"tier": "free"}} or {"metadata": {"tier": "paid"}}, then routes the request to free/paid tier models """ - if request_kwargs is None: - verbose_logger.debug( - "get_deployments_for_tier: request_kwargs is None returning healthy_deployments: %s", - healthy_deployments, - ) - return healthy_deployments - verbose_logger.debug("request metadata: %s", request_kwargs.get("metadata")) if "metadata" in request_kwargs: metadata = request_kwargs["metadata"] From d42963a0ae5d77729c05320e8ac49e2290d76e88 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 18 Jul 2024 17:09:42 -0700 Subject: [PATCH 14/27] router - use free paid tier routing --- litellm/router_strategy/free_paid_tiers.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/litellm/router_strategy/free_paid_tiers.py b/litellm/router_strategy/free_paid_tiers.py index 4328bd84c..82e38b4f5 100644 --- a/litellm/router_strategy/free_paid_tiers.py +++ 
b/litellm/router_strategy/free_paid_tiers.py @@ -17,14 +17,19 @@ class Deployment(TypedDict): async def get_deployments_for_tier( - request_kwargs: dict, - healthy_deployments: Optional[ - Union[List[DeploymentTypedDict], List[Dict[str, Any]]] - ] = None, + request_kwargs: Optional[Dict[Any, Any]] = None, + healthy_deployments: Optional[Union[List[Any], Dict[Any, Any]]] = None, ): """ if request_kwargs contains {"metadata": {"tier": "free"}} or {"metadata": {"tier": "paid"}}, then routes the request to free/paid tier models """ + if request_kwargs is None: + verbose_logger.debug( + "get_deployments_for_tier: request_kwargs is None returning healthy_deployments: %s", + healthy_deployments, + ) + return healthy_deployments + verbose_logger.debug("request metadata: %s", request_kwargs.get("metadata")) if "metadata" in request_kwargs: metadata = request_kwargs["metadata"] From db3063d3d3ce0330ebbe95a6d6ce366826e5e9f7 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 18 Jul 2024 22:05:10 -0700 Subject: [PATCH 15/27] test: fix test --- litellm/tests/test_proxy_exception_mapping.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/litellm/tests/test_proxy_exception_mapping.py b/litellm/tests/test_proxy_exception_mapping.py index 4fb1e7134..07ae7f5a8 100644 --- a/litellm/tests/test_proxy_exception_mapping.py +++ b/litellm/tests/test_proxy_exception_mapping.py @@ -75,8 +75,8 @@ def test_chat_completion_exception(client): print("ERROR=", json_response["error"]) assert isinstance(json_response["error"]["message"], str) assert ( - json_response["error"]["message"] - == "litellm.AuthenticationError: AuthenticationError: OpenAIException - Incorrect API key provided: bad-key. You can find your API key at https://platform.openai.com/account/api-keys." + "litellm.AuthenticationError: AuthenticationError: OpenAIException - Incorrect API key provided: bad-key. You can find your API key at https://platform.openai.com/account/api-keys." 
+ in json_response["error"]["message"] ) # make an openai client to call _make_status_error_from_response From 38c50e674eee6da0e18b951cc78f8ed7d7832433 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 18 Jul 2024 18:22:40 -0700 Subject: [PATCH 16/27] fix ui make ui session last 24 hours --- litellm/proxy/proxy_server.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index d2337c37f..592d95777 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -1734,9 +1734,7 @@ class ProxyConfig: "background_health_checks", False ) health_check_interval = general_settings.get("health_check_interval", 300) - health_check_details = general_settings.get( - "health_check_details", True - ) + health_check_details = general_settings.get("health_check_details", True) ## check if user has set a premium feature in general_settings if ( @@ -7793,7 +7791,7 @@ async def login(request: Request): request_type="key", **{ "user_role": LitellmUserRoles.PROXY_ADMIN, - "duration": "2hr", + "duration": "12hr", "key_max_budget": 5, "models": [], "aliases": {}, @@ -7857,7 +7855,7 @@ async def login(request: Request): request_type="key", **{ # type: ignore "user_role": user_role, - "duration": "2hr", + "duration": "12hr", "key_max_budget": 5, "models": [], "aliases": {}, @@ -7998,7 +7996,7 @@ async def onboarding(invite_link: str): request_type="key", **{ "user_role": user_obj.user_role, - "duration": "2hr", + "duration": "12hr", "key_max_budget": 5, "models": [], "aliases": {}, @@ -8336,7 +8334,7 @@ async def auth_callback(request: Request): # User might not be already created on first generation of key # But if it is, we want their models preferences default_ui_key_values = { - "duration": "2hr", + "duration": "12hr", "key_max_budget": 0.01, "aliases": {}, "config": {}, From ad46e6a61f20652efdbfe351af1d576a7910609b Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 18 Jul 2024 19:22:09 -0700 Subject: [PATCH 17/27] router - refactor to tag based routing --- litellm/router.py | 6 +- litellm/router_strategy/free_paid_tiers.py | 69 ------------------- litellm/router_strategy/tag_based_routing.py | 68 ++++++++++++++++++ ...er_tiers.py => test_router_tag_routing.py} | 10 +-- litellm/types/router.py | 4 ++ 5 files changed, 81 insertions(+), 76 deletions(-) delete mode 100644 litellm/router_strategy/free_paid_tiers.py create mode 100644 litellm/router_strategy/tag_based_routing.py rename litellm/tests/{test_router_tiers.py => test_router_tag_routing.py} (89%) diff --git a/litellm/router.py b/litellm/router.py index 487d5fd6a..44c02f126 100644 --- a/litellm/router.py +++ b/litellm/router.py @@ -47,12 +47,12 @@ from litellm.assistants.main import AssistantDeleted from litellm.caching import DualCache, InMemoryCache, RedisCache from litellm.integrations.custom_logger import CustomLogger from litellm.llms.azure import get_azure_ad_token_from_oidc -from litellm.router_strategy.free_paid_tiers import get_deployments_for_tier from litellm.router_strategy.least_busy import LeastBusyLoggingHandler from litellm.router_strategy.lowest_cost import LowestCostLoggingHandler from litellm.router_strategy.lowest_latency import LowestLatencyLoggingHandler from litellm.router_strategy.lowest_tpm_rpm import LowestTPMLoggingHandler from litellm.router_strategy.lowest_tpm_rpm_v2 import LowestTPMLoggingHandler_v2 +from litellm.router_strategy.tag_based_routing import get_deployments_for_tag from 
litellm.router_utils.client_initalization_utils import ( set_client, should_initialize_sync_client, @@ -4482,8 +4482,8 @@ class Router: request_kwargs=request_kwargs, ) - # check free / paid tier for each deployment - healthy_deployments = await get_deployments_for_tier( + # check if user wants to do tag based routing + healthy_deployments = await get_deployments_for_tag( request_kwargs=request_kwargs, healthy_deployments=healthy_deployments, ) diff --git a/litellm/router_strategy/free_paid_tiers.py b/litellm/router_strategy/free_paid_tiers.py deleted file mode 100644 index 82e38b4f5..000000000 --- a/litellm/router_strategy/free_paid_tiers.py +++ /dev/null @@ -1,69 +0,0 @@ -""" -Use this to route requests between free and paid tiers -""" - -from typing import Any, Dict, List, Literal, Optional, TypedDict, Union, cast - -from litellm._logging import verbose_logger -from litellm.types.router import DeploymentTypedDict - - -class ModelInfo(TypedDict): - tier: Literal["free", "paid"] - - -class Deployment(TypedDict): - model_info: ModelInfo - - -async def get_deployments_for_tier( - request_kwargs: Optional[Dict[Any, Any]] = None, - healthy_deployments: Optional[Union[List[Any], Dict[Any, Any]]] = None, -): - """ - if request_kwargs contains {"metadata": {"tier": "free"}} or {"metadata": {"tier": "paid"}}, then routes the request to free/paid tier models - """ - if request_kwargs is None: - verbose_logger.debug( - "get_deployments_for_tier: request_kwargs is None returning healthy_deployments: %s", - healthy_deployments, - ) - return healthy_deployments - - verbose_logger.debug("request metadata: %s", request_kwargs.get("metadata")) - if "metadata" in request_kwargs: - metadata = request_kwargs["metadata"] - if "tier" in metadata: - selected_tier: Literal["free", "paid"] = metadata["tier"] - if healthy_deployments is None: - return None - - if selected_tier == "free": - # get all deployments where model_info has tier = free - free_deployments: List[Any] = [] - verbose_logger.debug( - "Getting deployments in free tier, all_deployments: %s", - healthy_deployments, - ) - for deployment in healthy_deployments: - typed_deployment = cast(Deployment, deployment) - if typed_deployment["model_info"]["tier"] == "free": - free_deployments.append(deployment) - verbose_logger.debug("free_deployments: %s", free_deployments) - return free_deployments - - elif selected_tier == "paid": - # get all deployments where model_info has tier = paid - paid_deployments: List[Any] = [] - for deployment in healthy_deployments: - typed_deployment = cast(Deployment, deployment) - if typed_deployment["model_info"]["tier"] == "paid": - paid_deployments.append(deployment) - verbose_logger.debug("paid_deployments: %s", paid_deployments) - return paid_deployments - - verbose_logger.debug( - "no tier found in metadata, returning healthy_deployments: %s", - healthy_deployments, - ) - return healthy_deployments diff --git a/litellm/router_strategy/tag_based_routing.py b/litellm/router_strategy/tag_based_routing.py new file mode 100644 index 000000000..11bad19a3 --- /dev/null +++ b/litellm/router_strategy/tag_based_routing.py @@ -0,0 +1,68 @@ +""" +Use this to route requests between free and paid tiers +""" + +from typing import Any, Dict, List, Literal, Optional, TypedDict, Union, cast + +from litellm._logging import verbose_logger +from litellm.types.router import DeploymentTypedDict + + +async def get_deployments_for_tag( + request_kwargs: Optional[Dict[Any, Any]] = None, + healthy_deployments: Optional[Union[List[Any], 
Dict[Any, Any]]] = None, +): + """ + if request_kwargs contains {"metadata": {"tier": "free"}} or {"metadata": {"tier": "paid"}}, then routes the request to free/paid tier models + """ + if request_kwargs is None: + verbose_logger.debug( + "get_deployments_for_tier: request_kwargs is None returning healthy_deployments: %s", + healthy_deployments, + ) + return healthy_deployments + + if healthy_deployments is None: + verbose_logger.debug( + "get_deployments_for_tier: healthy_deployments is None returning healthy_deployments" + ) + return healthy_deployments + + verbose_logger.debug("request metadata: %s", request_kwargs.get("metadata")) + if "metadata" in request_kwargs: + metadata = request_kwargs["metadata"] + request_tags = metadata.get("tags") + + new_healthy_deployments = [] + if request_tags: + verbose_logger.debug("parameter routing: router_keys: %s", request_tags) + # example this can be router_keys=["free", "custom"] + # get all deployments that have a superset of these router keys + for deployment in healthy_deployments: + deployment_litellm_params = deployment.get("litellm_params") + deployment_tags = deployment_litellm_params.get("tags") + + verbose_logger.debug( + "deployment: %s, deployment_router_keys: %s", + deployment, + deployment_tags, + ) + + if deployment_tags is None: + continue + + if set(request_tags).issubset(set(deployment_tags)): + verbose_logger.debug( + "adding deployment with tags: %s, request tags: %s", + deployment_tags, + request_tags, + ) + new_healthy_deployments.append(deployment) + + return new_healthy_deployments + + verbose_logger.debug( + "no tier found in metadata, returning healthy_deployments: %s", + healthy_deployments, + ) + return healthy_deployments diff --git a/litellm/tests/test_router_tiers.py b/litellm/tests/test_router_tag_routing.py similarity index 89% rename from litellm/tests/test_router_tiers.py rename to litellm/tests/test_router_tag_routing.py index 54e67ded3..feb67c0e9 100644 --- a/litellm/tests/test_router_tiers.py +++ b/litellm/tests/test_router_tag_routing.py @@ -45,16 +45,18 @@ async def test_router_free_paid_tier(): "litellm_params": { "model": "gpt-4o", "api_base": "https://exampleopenaiendpoint-production.up.railway.app/", + "tags": ["free"], }, - "model_info": {"tier": "paid", "id": "very-expensive-model"}, + "model_info": {"id": "very-cheap-model"}, }, { "model_name": "gpt-4", "litellm_params": { "model": "gpt-4o-mini", "api_base": "https://exampleopenaiendpoint-production.up.railway.app/", + "tags": ["paid"], }, - "model_info": {"tier": "free", "id": "very-cheap-model"}, + "model_info": {"id": "very-expensive-model"}, }, ] ) @@ -64,7 +66,7 @@ async def test_router_free_paid_tier(): response = await router.acompletion( model="gpt-4", messages=[{"role": "user", "content": "Tell me a joke."}], - metadata={"tier": "free"}, + metadata={"tags": ["free"]}, ) print("Response: ", response) @@ -79,7 +81,7 @@ async def test_router_free_paid_tier(): response = await router.acompletion( model="gpt-4", messages=[{"role": "user", "content": "Tell me a joke."}], - metadata={"tier": "paid"}, + metadata={"tags": ["paid"]}, ) print("Response: ", response) diff --git a/litellm/types/router.py b/litellm/types/router.py index df9947c26..78dfbc4c1 100644 --- a/litellm/types/router.py +++ b/litellm/types/router.py @@ -325,6 +325,10 @@ class LiteLLMParamsTypedDict(TypedDict, total=False): ## MOCK RESPONSES ## mock_response: Optional[Union[str, ModelResponse, Exception]] + # routing params + # use this for tag-based routing + tags: 
Optional[List[str]] + class DeploymentTypedDict(TypedDict): model_name: str From 79c5788ad99ffee06b0850f90a8bbfc1421b5a68 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 18 Jul 2024 19:24:13 -0700 Subject: [PATCH 18/27] fix remove previous code on free/paid tier --- litellm/proxy/litellm_pre_call_utils.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/litellm/proxy/litellm_pre_call_utils.py b/litellm/proxy/litellm_pre_call_utils.py index 283f31e3c..e0e875308 100644 --- a/litellm/proxy/litellm_pre_call_utils.py +++ b/litellm/proxy/litellm_pre_call_utils.py @@ -132,15 +132,6 @@ async def add_litellm_data_to_request( for k, v in key_metadata["cache"].items(): if k in SupportedCacheControls: data["cache"][k] = v - if "tier" in key_metadata: - if premium_user is not True: - verbose_logger.warning( - "Trying to use free/paid tier feature. This will not be applied %s", - CommonProxyErrors.not_premium_user.value, - ) - - # add request tier to metadata - data[_metadata_variable_name]["tier"] = key_metadata["tier"] # Team spend, budget - used by prometheus.py data[_metadata_variable_name][ From e29851503473667416311d52efa5360b2ad89236 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 18 Jul 2024 19:34:45 -0700 Subject: [PATCH 19/27] fix use tags as a litellm param --- litellm/main.py | 5 +++++ litellm/proxy/proxy_config.yaml | 6 ++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/litellm/main.py b/litellm/main.py index 69c845ad8..3889d1bc8 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -735,6 +735,7 @@ def completion( ] litellm_params = [ "metadata", + "tags", "acompletion", "atext_completion", "text_completion", @@ -3155,6 +3156,7 @@ def embedding( "allowed_model_region", "model_config", "cooldown_time", + "tags", ] default_params = openai_params + litellm_params non_default_params = { @@ -4384,6 +4386,8 @@ def transcription( proxy_server_request = kwargs.get("proxy_server_request", None) model_info = kwargs.get("model_info", None) metadata = kwargs.get("metadata", {}) + tags = kwargs.pop("tags", []) + drop_params = kwargs.get("drop_params", None) client: Optional[ Union[ @@ -4556,6 +4560,7 @@ def speech( ) -> HttpxBinaryResponseContent: model, custom_llm_provider, dynamic_api_key, api_base = get_llm_provider(model=model, custom_llm_provider=custom_llm_provider, api_base=api_base) # type: ignore + tags = kwargs.pop("tags", []) optional_params = {} if response_format is not None: diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index 7e78cf317..81ed12c07 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -4,14 +4,12 @@ model_list: model: openai/fake api_key: fake-key api_base: https://exampleopenaiendpoint-production.up.railway.app/ - model_info: - tier: free # πŸ‘ˆ Key Change - set `tier` + tags: ["free"] - model_name: gpt-4 litellm_params: model: openai/gpt-4o api_key: os.environ/OPENAI_API_KEY - model_info: - tier: paid # πŸ‘ˆ Key Change - set `tier` + tags: ["paid"] general_settings: master_key: sk-1234 From 52d0f6a808e53fdfdda6b26648c02586c6f49ff8 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 18 Jul 2024 19:39:04 -0700 Subject: [PATCH 20/27] control using enable_tag_filtering --- litellm/router.py | 3 + litellm/router_strategy/tag_based_routing.py | 13 ++++- litellm/tests/test_litellm_pre_call_utils.py | 60 -------------------- 3 files changed, 15 insertions(+), 61 deletions(-) delete mode 100644 litellm/tests/test_litellm_pre_call_utils.py diff --git a/litellm/router.py 
b/litellm/router.py index 44c02f126..0e693e188 100644 --- a/litellm/router.py +++ b/litellm/router.py @@ -145,6 +145,7 @@ class Router: content_policy_fallbacks: List = [], model_group_alias: Optional[dict] = {}, enable_pre_call_checks: bool = False, + enable_tag_filtering: bool = False, retry_after: int = 0, # min time to wait before retrying a failed request retry_policy: Optional[ RetryPolicy @@ -246,6 +247,7 @@ class Router: self.set_verbose = set_verbose self.debug_level = debug_level self.enable_pre_call_checks = enable_pre_call_checks + self.enable_tag_filtering = enable_tag_filtering if self.set_verbose == True: if debug_level == "INFO": verbose_router_logger.setLevel(logging.INFO) @@ -4484,6 +4486,7 @@ class Router: # check if user wants to do tag based routing healthy_deployments = await get_deployments_for_tag( + llm_router_instance=self, request_kwargs=request_kwargs, healthy_deployments=healthy_deployments, ) diff --git a/litellm/router_strategy/tag_based_routing.py b/litellm/router_strategy/tag_based_routing.py index 11bad19a3..2dbc5cb93 100644 --- a/litellm/router_strategy/tag_based_routing.py +++ b/litellm/router_strategy/tag_based_routing.py @@ -2,19 +2,30 @@ Use this to route requests between free and paid tiers """ -from typing import Any, Dict, List, Literal, Optional, TypedDict, Union, cast +from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, TypedDict, Union from litellm._logging import verbose_logger from litellm.types.router import DeploymentTypedDict +if TYPE_CHECKING: + from litellm.router import Router as _Router + + LitellmRouter = _Router +else: + LitellmRouter = Any + async def get_deployments_for_tag( + llm_router_instance: LitellmRouter, request_kwargs: Optional[Dict[Any, Any]] = None, healthy_deployments: Optional[Union[List[Any], Dict[Any, Any]]] = None, ): """ if request_kwargs contains {"metadata": {"tier": "free"}} or {"metadata": {"tier": "paid"}}, then routes the request to free/paid tier models """ + if llm_router_instance.enable_tag_filtering is not True: + return healthy_deployments + if request_kwargs is None: verbose_logger.debug( "get_deployments_for_tier: request_kwargs is None returning healthy_deployments: %s", diff --git a/litellm/tests/test_litellm_pre_call_utils.py b/litellm/tests/test_litellm_pre_call_utils.py deleted file mode 100644 index 7f56d693d..000000000 --- a/litellm/tests/test_litellm_pre_call_utils.py +++ /dev/null @@ -1,60 +0,0 @@ -""" -Tests litellm pre_call_utils -""" - -import os -import sys -import traceback -import uuid -from datetime import datetime - -from dotenv import load_dotenv -from fastapi import Request -from fastapi.routing import APIRoute - -from litellm.proxy._types import UserAPIKeyAuth -from litellm.proxy.litellm_pre_call_utils import add_litellm_data_to_request -from litellm.proxy.proxy_server import ProxyConfig, chat_completion - -load_dotenv() -import io -import os -import time - -import pytest - -# this file is to test litellm/proxy - -sys.path.insert( - 0, os.path.abspath("../..") -) # Adds the parent directory to the system path - - -@pytest.mark.parametrize("tier", ["free", "paid"]) -@pytest.mark.asyncio() -async def test_adding_key_tier_to_request_metadata(tier): - """ - Tests if we can add tier: free/paid from key metadata to the request metadata - """ - data = {} - - api_route = APIRoute(path="/chat/completions", endpoint=chat_completion) - request = Request( - { - "type": "http", - "method": "POST", - "route": api_route, - "path": api_route.path, - "headers": [], - } - ) - 
new_data = await add_litellm_data_to_request( - data=data, - request=request, - user_api_key_dict=UserAPIKeyAuth(metadata={"tier": tier}), - proxy_config=ProxyConfig(), - ) - - print("new_data", new_data) - - assert new_data["metadata"]["tier"] == tier From 1ab5c1a22700f845840a1ab888c1eda84178d12f Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 18 Jul 2024 20:10:45 -0700 Subject: [PATCH 21/27] check if using tag based routing --- litellm/proxy/litellm_pre_call_utils.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/litellm/proxy/litellm_pre_call_utils.py b/litellm/proxy/litellm_pre_call_utils.py index e0e875308..e6bce5392 100644 --- a/litellm/proxy/litellm_pre_call_utils.py +++ b/litellm/proxy/litellm_pre_call_utils.py @@ -75,7 +75,7 @@ async def add_litellm_data_to_request( dict: The modified data dictionary. """ - from litellm.proxy.proxy_server import premium_user + from litellm.proxy.proxy_server import llm_router, premium_user safe_add_api_version_from_query_params(data, request) @@ -166,7 +166,8 @@ async def add_litellm_data_to_request( if user_api_key_dict.allowed_model_region is not None: data["allowed_model_region"] = user_api_key_dict.allowed_model_region - ## [Enterprise Only] Add User-IP Address + ## [Enterprise Only] + # Add User-IP Address requester_ip_address = "" if premium_user is True: # Only set the IP Address for Enterprise Users @@ -179,6 +180,15 @@ async def add_litellm_data_to_request( requester_ip_address = request.client.host data[_metadata_variable_name]["requester_ip_address"] = requester_ip_address + # Enterprise Only - Check if using tag based routing + if llm_router and llm_router.enable_tag_filtering is True: + if premium_user is not True: + verbose_proxy_logger.warning( + "router.enable_tag_filtering is on %s \n switched off router.enable_tag_filtering", + CommonProxyErrors.not_premium_user.value, + ) + llm_router.enable_tag_filtering = False + ### TEAM-SPECIFIC PARAMS ### if user_api_key_dict.team_id is not None: team_config = await proxy_config.load_team_config( From 56489ad9cc97dd9e3691118eded480dadf29cbdf Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 18 Jul 2024 21:48:24 -0700 Subject: [PATCH 22/27] rename doc --- docs/my-website/docs/proxy/free_paid_tier.md | 102 ------------------- docs/my-website/docs/proxy/tag_routing.md | 38 +++++++ docs/my-website/sidebars.js | 2 +- 3 files changed, 39 insertions(+), 103 deletions(-) delete mode 100644 docs/my-website/docs/proxy/free_paid_tier.md create mode 100644 docs/my-website/docs/proxy/tag_routing.md diff --git a/docs/my-website/docs/proxy/free_paid_tier.md b/docs/my-website/docs/proxy/free_paid_tier.md deleted file mode 100644 index 01230e1f0..000000000 --- a/docs/my-website/docs/proxy/free_paid_tier.md +++ /dev/null @@ -1,102 +0,0 @@ -# πŸ’Έ Free, Paid Tier Routing - -Route Virtual Keys on `free tier` to cheaper models - -### 1. 
Define free, paid tier models on config.yaml - -:::info -Requests with `model=gpt-4` will be routed to either `openai/fake` or `openai/gpt-4o` depending on which tier the virtual key is on -::: - -```yaml -model_list: - - model_name: gpt-4 - litellm_params: - model: openai/fake - api_key: fake-key - api_base: https://exampleopenaiendpoint-production.up.railway.app/ - model_info: - tier: free # πŸ‘ˆ Key Change - set `tier to paid or free` - - model_name: gpt-4 - litellm_params: - model: openai/gpt-4o - api_key: os.environ/OPENAI_API_KEY - model_info: - tier: paid # πŸ‘ˆ Key Change - set `tier to paid or free` - -general_settings: - master_key: sk-1234 -``` - -### 2. Create Virtual Keys with pricing `tier=free` - -```shell -curl --location 'http://0.0.0.0:4000/key/generate' \ - --header 'Authorization: Bearer sk-1234' \ - --header 'Content-Type: application/json' \ - --data '{ - "metadata": {"tier": "free"} -}' -``` - -### 3. Make Request with Key on `Free Tier` - -```shell -curl -i http://localhost:4000/v1/chat/completions \ - -H "Content-Type: application/json" \ -curl -i http://localhost:4000/v1/chat/completions \ - -H "Content-Type: application/json" \ - -H "Authorization: Bearer sk-inxzoSurQsjog9gPrVOCcA" \ - -d '{ - "model": "gpt-4", - "messages": [ - {"role": "user", "content": "Hello, Claude gm!"} - ] - }' -``` - -**Expected Response** - -If this worked as expected then `x-litellm-model-api-base` should be `https://exampleopenaiendpoint-production.up.railway.app/` in the response headers - -```shell -x-litellm-model-api-base: https://exampleopenaiendpoint-production.up.railway.app/ - -{"id":"chatcmpl-657b750f581240c1908679ed94b31bfe","choices":[{"finish_reason":"stop","index":0,"message":{"content":"\n\nHello there, how may I assist you today?","role":"assistant","tool_calls":null,"function_call":null}}],"created":1677652288,"model":"gpt-3.5-turbo-0125","object":"chat.completion","system_fingerprint":"fp_44709d6fcb","usage":{"completion_tokens":12,"prompt_tokens":9,"total_tokens":21}}% -``` - - -### 4. Create Virtual Keys with pricing `tier=paid` - -```shell -curl --location 'http://0.0.0.0:4000/key/generate' \ - --header 'Authorization: Bearer sk-1234' \ - --header 'Content-Type: application/json' \ - --data '{ - "metadata": {"tier": "paid"} - }' -``` - -### 5. Make Request with Key on `Paid Tier` - -```shell -curl -i http://localhost:4000/v1/chat/completions \ - -H "Content-Type: application/json" \ - -H "Authorization: Bearer sk-mnJoeSc6jFjzZr256q-iqA" \ - -d '{ - "model": "gpt-4", - "messages": [ - {"role": "user", "content": "Hello, Claude gm!"} - ] - }' -``` - -**Expected Response** - -If this worked as expected then `x-litellm-model-api-base` should be `https://api.openai.com` in the response headers - -```shell -x-litellm-model-api-base: https://api.openai.com - -{"id":"chatcmpl-9mW75EbJCgwmLcO0M5DmwxpiBgWdc","choices":[{"finish_reason":"stop","index":0,"message":{"content":"Good morning! How can I assist you today?","role":"assistant","tool_calls":null,"function_call":null}}],"created":1721350215,"model":"gpt-4o-2024-05-13","object":"chat.completion","system_fingerprint":"fp_c4e5b6fa31","usage":{"completion_tokens":10,"prompt_tokens":12,"total_tokens":22}} -``` diff --git a/docs/my-website/docs/proxy/tag_routing.md b/docs/my-website/docs/proxy/tag_routing.md new file mode 100644 index 000000000..c33bce315 --- /dev/null +++ b/docs/my-website/docs/proxy/tag_routing.md @@ -0,0 +1,38 @@ +# πŸ’Έ Tag Based Routing + +Route requests based on tags + +### 1. 
Define free, paid tier models on config.yaml + +```yaml +model_list: + - model_name: gpt-4 + litellm_params: + model: openai/fake + api_key: fake-key + api_base: https://exampleopenaiendpoint-production.up.railway.app/ + tags: ["free"] + - model_name: gpt-4 + litellm_params: + model: openai/gpt-4o + api_key: os.environ/OPENAI_API_KEY + tags: ["paid"] + +general_settings: + master_key: sk-1234 +``` + +### Make Request with Key on `Free Tier` + +```shell +curl -i http://localhost:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-1234" \ + -d '{ + "model": "gpt-4", + "metadata": {"tags": ["paid"]}, + "messages": [ + {"role": "user", "content": "Hello, Claude gm!"} + ] + }' +``` diff --git a/docs/my-website/sidebars.js b/docs/my-website/sidebars.js index eea863d8e..204c27394 100644 --- a/docs/my-website/sidebars.js +++ b/docs/my-website/sidebars.js @@ -44,7 +44,7 @@ const sidebars = { "proxy/cost_tracking", "proxy/self_serve", "proxy/virtual_keys", - "proxy/free_paid_tier", + "proxy/tag_routing", "proxy/users", "proxy/team_budgets", "proxy/customers", From fa26d3f96f2b5ff09b53d146006771b0086b5b50 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 18 Jul 2024 21:49:36 -0700 Subject: [PATCH 23/27] fix test --- litellm/tests/test_router_tag_routing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/litellm/tests/test_router_tag_routing.py b/litellm/tests/test_router_tag_routing.py index feb67c0e9..67f100d79 100644 --- a/litellm/tests/test_router_tag_routing.py +++ b/litellm/tests/test_router_tag_routing.py @@ -58,7 +58,8 @@ async def test_router_free_paid_tier(): }, "model_info": {"id": "very-expensive-model"}, }, - ] + ], + enable_tag_filtering=True, ) for _ in range(5): From d9c051adfffe1cbee1df767105b918397e4d3ed3 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 18 Jul 2024 21:55:53 -0700 Subject: [PATCH 24/27] add tags to metadata --- litellm/proxy/litellm_pre_call_utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/litellm/proxy/litellm_pre_call_utils.py b/litellm/proxy/litellm_pre_call_utils.py index e6bce5392..1014a325a 100644 --- a/litellm/proxy/litellm_pre_call_utils.py +++ b/litellm/proxy/litellm_pre_call_utils.py @@ -188,6 +188,9 @@ async def add_litellm_data_to_request( CommonProxyErrors.not_premium_user.value, ) llm_router.enable_tag_filtering = False + else: + if "tags" in data: + data[_metadata_variable_name]["tags"] = data["tags"] ### TEAM-SPECIFIC PARAMS ### if user_api_key_dict.team_id is not None: From 6f393be66b860999874b5d60d6f020881596fd84 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 18 Jul 2024 22:18:10 -0700 Subject: [PATCH 25/27] docs - tag based routing --- docs/my-website/docs/proxy/tag_routing.md | 109 ++++++++++++++++++++-- litellm/proxy/proxy_config.yaml | 10 +- 2 files changed, 107 insertions(+), 12 deletions(-) diff --git a/docs/my-website/docs/proxy/tag_routing.md b/docs/my-website/docs/proxy/tag_routing.md index c33bce315..763d50918 100644 --- a/docs/my-website/docs/proxy/tag_routing.md +++ b/docs/my-website/docs/proxy/tag_routing.md @@ -1,8 +1,12 @@ # πŸ’Έ Tag Based Routing -Route requests based on tags +Route requests based on tags. +This is useful for implementing free / paid tiers for users -### 1. Define free, paid tier models on config.yaml +### 1. 
Define tags on config.yaml + +- A request with `tags=["free"]` will get routed to `openai/fake` +- A request with `tags=["paid"]` will get routed to `openai/gpt-4o` ```yaml model_list: @@ -11,18 +15,22 @@ model_list: model: openai/fake api_key: fake-key api_base: https://exampleopenaiendpoint-production.up.railway.app/ - tags: ["free"] + tags: ["free"] # πŸ‘ˆ Key Change - model_name: gpt-4 litellm_params: model: openai/gpt-4o api_key: os.environ/OPENAI_API_KEY - tags: ["paid"] + tags: ["paid"] # πŸ‘ˆ Key Change +router_settings: + enable_tag_filtering: True # πŸ‘ˆ Key Change general_settings: master_key: sk-1234 ``` -### Make Request with Key on `Free Tier` +### 2. Make Request with `tags=["free"]` + +This request includes "tags": ["free"], which routes it to `openai/fake` ```shell curl -i http://localhost:4000/v1/chat/completions \ @@ -30,9 +38,96 @@ curl -i http://localhost:4000/v1/chat/completions \ -H "Authorization: Bearer sk-1234" \ -d '{ "model": "gpt-4", - "metadata": {"tags": ["paid"]}, "messages": [ {"role": "user", "content": "Hello, Claude gm!"} - ] + ], + "tags": ["free"] }' ``` +**Expected Response** + +Expect to see the following response header when this works +```shell +x-litellm-model-api-base: https://exampleopenaiendpoint-production.up.railway.app/ +``` + +Response +```shell +{ + "id": "chatcmpl-33c534e3d70148218e2d62496b81270b", + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "message": { + "content": "\n\nHello there, how may I assist you today?", + "role": "assistant", + "tool_calls": null, + "function_call": null + } + } + ], + "created": 1677652288, + "model": "gpt-3.5-turbo-0125", + "object": "chat.completion", + "system_fingerprint": "fp_44709d6fcb", + "usage": { + "completion_tokens": 12, + "prompt_tokens": 9, + "total_tokens": 21 + } +} +``` + + +### 3. Make Request with `tags=["paid"]` + +This request includes "tags": ["paid"], which routes it to `openai/gpt-4` + +```shell +curl -i http://localhost:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-1234" \ + -d '{ + "model": "gpt-4", + "messages": [ + {"role": "user", "content": "Hello, Claude gm!"} + ], + "tags": ["paid"] + }' +``` + +**Expected Response** + +Expect to see the following response header when this works +```shell +x-litellm-model-api-base: https://api.openai.com +``` + +Response +```shell +{ + "id": "chatcmpl-9maCcqQYTqdJrtvfakIawMOIUbEZx", + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "message": { + "content": "Good morning! 
How can I assist you today?", + "role": "assistant", + "tool_calls": null, + "function_call": null + } + } + ], + "created": 1721365934, + "model": "gpt-4o-2024-05-13", + "object": "chat.completion", + "system_fingerprint": "fp_c4e5b6fa31", + "usage": { + "completion_tokens": 10, + "prompt_tokens": 12, + "total_tokens": 22 + } +} +``` \ No newline at end of file diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index 81ed12c07..f20c780cc 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -4,14 +4,14 @@ model_list: model: openai/fake api_key: fake-key api_base: https://exampleopenaiendpoint-production.up.railway.app/ - tags: ["free"] + tags: ["free"] # πŸ‘ˆ Key Change - model_name: gpt-4 litellm_params: model: openai/gpt-4o api_key: os.environ/OPENAI_API_KEY - tags: ["paid"] + tags: ["paid"] # πŸ‘ˆ Key Change +router_settings: + enable_tag_filtering: True # πŸ‘ˆ Key Change general_settings: - master_key: sk-1234 - - + master_key: sk-1234 \ No newline at end of file From 780a6293dca895e7a381ef2253b3622502dc20df Mon Sep 17 00:00:00 2001 From: Marc Abramowitz Date: Thu, 18 Jul 2024 14:29:32 -0700 Subject: [PATCH 26/27] Alias /health/liveliness as /health/liveness The latter is the more common term in Kubernetes, so it's nice to support that. --- litellm/proxy/_types.py | 1 + litellm/proxy/health_endpoints/_health_endpoints.py | 12 +++++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/litellm/proxy/_types.py b/litellm/proxy/_types.py index 7464714db..7acd38e8b 100644 --- a/litellm/proxy/_types.py +++ b/litellm/proxy/_types.py @@ -304,6 +304,7 @@ class LiteLLMRoutes(enum.Enum): "/routes", "/", "/health/liveliness", + "/health/liveness", "/health/readiness", "/test", "/config/yaml", diff --git a/litellm/proxy/health_endpoints/_health_endpoints.py b/litellm/proxy/health_endpoints/_health_endpoints.py index 494d9aa09..fa9edcdc8 100644 --- a/litellm/proxy/health_endpoints/_health_endpoints.py +++ b/litellm/proxy/health_endpoints/_health_endpoints.py @@ -483,7 +483,12 @@ async def health_readiness(): @router.get( - "/health/liveliness", + "/health/liveliness", # Historical LiteLLM name; doesn't match k8s terminology but kept for backwards compatibility + tags=["health"], + dependencies=[Depends(user_api_key_auth)], +) +@router.get( + "/health/liveness", # Kubernetes has "liveness" probes (https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/#define-a-liveness-command) tags=["health"], dependencies=[Depends(user_api_key_auth)], ) @@ -516,6 +521,11 @@ async def health_readiness_options(): tags=["health"], dependencies=[Depends(user_api_key_auth)], ) +@router.options( + "/health/liveness", # Kubernetes has "liveness" probes (https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/#define-a-liveness-command) + tags=["health"], + dependencies=[Depends(user_api_key_auth)], +) async def health_liveliness_options(): """ Options endpoint for health/liveliness check. 
From 086486c5c35ed1025d314cf1ef9db5431bcbaf2d Mon Sep 17 00:00:00 2001
From: Ishaan Jaff
Date: Thu, 18 Jul 2024 22:20:34 -0700
Subject: [PATCH 27/27] bump: version 1.41.24 → 1.41.25
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 pyproject.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 4e5d51a76..5a1d6066d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "litellm"
-version = "1.41.24"
+version = "1.41.25"
 description = "Library to easily interface with LLM API providers"
 authors = ["BerriAI"]
 license = "MIT"
@@ -91,7 +91,7 @@ requires = ["poetry-core", "wheel"]
 build-backend = "poetry.core.masonry.api"

 [tool.commitizen]
-version = "1.41.25"
 version_files = [
    "pyproject.toml:^version"
]
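
A minimal end-to-end sketch of the tag-based routing introduced in this series, adapted from `litellm/tests/test_router_tag_routing.py`; the example `api_base` and the `fake-key` placeholder are taken from the test and proxy config above and are illustrative only:

```python
# Sketch: route requests by tag using the Router flag added in this series.
import asyncio

from litellm import Router

router = Router(
    model_list=[
        {
            "model_name": "gpt-4",
            "litellm_params": {
                "model": "openai/fake",
                "api_key": "fake-key",  # placeholder; the example endpoint ignores it
                "api_base": "https://exampleopenaiendpoint-production.up.railway.app/",
                "tags": ["free"],
            },
            "model_info": {"id": "very-cheap-model"},
        },
        {
            "model_name": "gpt-4",
            "litellm_params": {
                "model": "openai/gpt-4o-mini",
                "api_key": "fake-key",  # placeholder; the example endpoint ignores it
                "api_base": "https://exampleopenaiendpoint-production.up.railway.app/",
                "tags": ["paid"],
            },
            "model_info": {"id": "very-expensive-model"},
        },
    ],
    enable_tag_filtering=True,  # opt-in flag added in this series
)


async def main() -> None:
    # A request tagged "free" only sees deployments whose litellm_params tags
    # are a superset of the request tags, so it lands on "very-cheap-model".
    response = await router.acompletion(
        model="gpt-4",
        messages=[{"role": "user", "content": "Tell me a joke."}],
        metadata={"tags": ["free"]},
    )
    print(response._hidden_params["model_id"])  # expected: very-cheap-model


asyncio.run(main())
```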