From 16b5de07afb22faeaaa8ab7a382b63b3e1c3a70d Mon Sep 17 00:00:00 2001 From: Krish Dholakia Date: Thu, 30 Jan 2025 22:56:41 -0800 Subject: [PATCH] Doc updates + management endpoint fixes (#8138) * Litellm dev 01 29 2025 p4 (#8107) * fix(key_management_endpoints.py): always get db team Fixes https://github.com/BerriAI/litellm/issues/7983 * test(test_key_management.py): add unit test enforcing check_db_only is always true on key generate checks * test: fix test * test: skip gemini thinking * Litellm dev 01 29 2025 p3 (#8106) * fix(__init__.py): reduces size of __init__.py and reduces scope for errors by using correct param * refactor(__init__.py): refactor init by cleaning up redundant params * refactor(__init__.py): move more constants into constants.py cleanup root * refactor(__init__.py): more cleanup * feat(__init__.py): expose new 'disable_hf_tokenizer_download' param enables hf model usage in offline env * docs(config_settings.md): document new disable_hf_tokenizer_download param * fix: fix linting error * fix: fix unsafe comparison * test: fix test * docs(public_teams.md): add doc showing how to expose public teams for users to join * docs: add beta disclaimer on public teams * test: update tests --- docs/my-website/docs/proxy/config_settings.md | 1 + docs/my-website/docs/proxy/public_teams.md | 40 ++ docs/my-website/docs/proxy/ui.md | 5 - docs/my-website/sidebars.js | 1 + litellm/__init__.py | 347 ++---------------- litellm/constants.py | 223 +++++++++++ .../litellm_core_utils/get_model_cost_map.py | 45 +++ litellm/proxy/_experimental/out/404.html | 1 - .../proxy/_experimental/out/model_hub.html | 1 - .../proxy/_experimental/out/onboarding.html | 1 - .../key_management_endpoints.py | 2 + litellm/types/utils.py | 5 + litellm/utils.py | 60 +-- .../test_amazing_vertex_completion.py | 8 +- tests/local_testing/test_token_counter.py | 11 + .../test_key_management.py | 26 ++ 16 files changed, 428 insertions(+), 349 deletions(-) create mode 100644 docs/my-website/docs/proxy/public_teams.md create mode 100644 litellm/litellm_core_utils/get_model_cost_map.py delete mode 100644 litellm/proxy/_experimental/out/404.html delete mode 100644 litellm/proxy/_experimental/out/model_hub.html delete mode 100644 litellm/proxy/_experimental/out/onboarding.html diff --git a/docs/my-website/docs/proxy/config_settings.md b/docs/my-website/docs/proxy/config_settings.md index 5ba26031b8..4a10cea7ab 100644 --- a/docs/my-website/docs/proxy/config_settings.md +++ b/docs/my-website/docs/proxy/config_settings.md @@ -139,6 +139,7 @@ general_settings: | disable_end_user_cost_tracking_prometheus_only | boolean | If true, turns off end user cost tracking on prometheus metrics only. | | key_generation_settings | object | Restricts who can generate keys. [Further docs](./virtual_keys.md#restricting-key-generation) | | disable_add_transform_inline_image_block | boolean | For Fireworks AI models - if true, turns off the auto-add of `#transform=inline` to the url of the image_url, if the model is not a vision model. | +| disable_hf_tokenizer_download | boolean | If true, it defaults to using the openai tokenizer for all models (including huggingface models). | ### general_settings - Reference diff --git a/docs/my-website/docs/proxy/public_teams.md b/docs/my-website/docs/proxy/public_teams.md new file mode 100644 index 0000000000..6ff2258308 --- /dev/null +++ b/docs/my-website/docs/proxy/public_teams.md @@ -0,0 +1,40 @@ +# [BETA] Public Teams + +Expose available teams to your users to join on signup. 
+ + + + +## Quick Start + +1. Create a team on LiteLLM + +```bash +curl -X POST '/team/new' \ +-H 'Content-Type: application/json' \ +-H 'Authorization: Bearer ' \ +-d '{"name": "My Team", "team_id": "team_id_1"}' +``` + +2. Expose the team to your users + +```yaml +litellm_settings: + default_internal_user_params: + available_teams: ["team_id_1"] # 👈 Make team available to new SSO users +``` + +3. Test it! + +```bash +curl -L -X POST 'http://0.0.0.0:4000/team/member_add' \ +-H 'Authorization: Bearer sk-' \ +-H 'Content-Type: application/json' \ +--data-raw '{ + "team_id": "team_id_1", + "member": [{"role": "user", "user_id": "my-test-user"}] +}' +``` + + + diff --git a/docs/my-website/docs/proxy/ui.md b/docs/my-website/docs/proxy/ui.md index f32f8ffa2d..a093b226a2 100644 --- a/docs/my-website/docs/proxy/ui.md +++ b/docs/my-website/docs/proxy/ui.md @@ -6,11 +6,6 @@ import TabItem from '@theme/TabItem'; Create keys, track spend, add models without worrying about the config / CRUD endpoints. -:::info - -This is in beta, so things may change. If you have feedback, [let us know](https://discord.com/invite/wuPM9dRgDw) - -::: diff --git a/docs/my-website/sidebars.js b/docs/my-website/sidebars.js index cda84067ba..41febfd564 100644 --- a/docs/my-website/sidebars.js +++ b/docs/my-website/sidebars.js @@ -98,6 +98,7 @@ const sidebars = { "proxy/ui", "proxy/admin_ui_sso", "proxy/self_serve", + "proxy/public_teams", "proxy/custom_sso" ], }, diff --git a/litellm/__init__.py b/litellm/__init__.py index 76ab021a0a..3032d1b8c6 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -9,7 +9,12 @@ from typing import Callable, List, Optional, Dict, Union, Any, Literal, get_args from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler from litellm.caching.caching import Cache, DualCache, RedisCache, InMemoryCache from litellm.types.llms.bedrock import COHERE_EMBEDDING_INPUT_TYPES -from litellm.types.utils import ImageObject, BudgetConfig +from litellm.types.utils import ( + ImageObject, + BudgetConfig, + all_litellm_params, + all_litellm_params as _litellm_completion_params, +) # maintain backwards compatibility for root param from litellm._logging import ( set_verbose, _turn_on_debug, @@ -29,6 +34,24 @@ from litellm.constants import ( LITELLM_CHAT_PROVIDERS, HUMANLOOP_PROMPT_CACHE_TTL_SECONDS, OPENAI_CHAT_COMPLETION_PARAMS, + OPENAI_CHAT_COMPLETION_PARAMS as _openai_completion_params, # backwards compatibility + OPENAI_FINISH_REASONS, + OPENAI_FINISH_REASONS as _openai_finish_reasons, # backwards compatibility + openai_compatible_endpoints, + openai_compatible_providers, + openai_text_completion_compatible_providers, + _openai_like_providers, + replicate_models, + clarifai_models, + huggingface_models, + empower_models, + together_ai_models, + baseten_models, + REPEATED_STREAMING_CHUNK_LIMIT, + request_timeout, + open_ai_embedding_models, + cohere_embedding_models, + bedrock_embedding_models, ) from litellm.types.guardrails import GuardrailItem from litellm.proxy._types import ( @@ -217,75 +240,8 @@ default_soft_budget: float = ( 50.0 # by default all litellm proxy keys have a soft budget of 50.0 ) forward_traceparent_to_llm_provider: bool = False -_openai_finish_reasons = ["stop", "length", "function_call", "content_filter", "null"] -_openai_completion_params = [ - "functions", - "function_call", - "temperature", - "temperature", - "top_p", - "n", - "stream", - "stop", - "max_tokens", - "presence_penalty", - "frequency_penalty", - "logit_bias", - "user", - "request_timeout", - 
"api_base", - "api_version", - "api_key", - "deployment_id", - "organization", - "base_url", - "default_headers", - "timeout", - "response_format", - "seed", - "tools", - "tool_choice", - "max_retries", -] -_litellm_completion_params = [ - "metadata", - "acompletion", - "caching", - "mock_response", - "api_key", - "api_version", - "api_base", - "force_timeout", - "logger_fn", - "verbose", - "custom_llm_provider", - "litellm_logging_obj", - "litellm_call_id", - "use_client", - "id", - "fallbacks", - "azure", - "headers", - "model_list", - "num_retries", - "context_window_fallback_dict", - "roles", - "final_prompt_value", - "bos_token", - "eos_token", - "request_timeout", - "complete_response", - "self", - "client", - "rpm", - "tpm", - "input_cost_per_token", - "output_cost_per_token", - "hf_model_name", - "model_info", - "proxy_server_request", - "preset_cache_key", -] + + _current_cost = 0.0 # private variable, used if max budget is set error_logs: Dict = {} add_function_to_prompt: bool = ( @@ -318,11 +274,8 @@ disable_end_user_cost_tracking_prometheus_only: Optional[bool] = None custom_prometheus_metadata_labels: List[str] = [] #### REQUEST PRIORITIZATION #### priority_reservation: Optional[Dict[str, float]] = None -#### RELIABILITY #### -REPEATED_STREAMING_CHUNK_LIMIT = 100 # catch if model starts looping the same chunk while streaming. Uses high default to prevent false positives. -#### Networking settings #### -request_timeout: float = 6000 # time in seconds + force_ipv4: bool = ( False # when True, litellm will force ipv4 for all LLM requests. Some users have seen httpx ConnectionError when using ipv6. ) @@ -352,39 +305,7 @@ _key_management_settings: KeyManagementSettings = KeyManagementSettings() #### PII MASKING #### output_parse_pii: bool = False ############################################# - - -def get_model_cost_map(url: str): - if ( - os.getenv("LITELLM_LOCAL_MODEL_COST_MAP", False) == True - or os.getenv("LITELLM_LOCAL_MODEL_COST_MAP", False) == "True" - ): - import importlib.resources - import json - - with importlib.resources.open_text( - "litellm", "model_prices_and_context_window_backup.json" - ) as f: - content = json.load(f) - return content - - try: - response = httpx.get( - url, timeout=5 - ) # set a 5 second timeout for the get request - response.raise_for_status() # Raise an exception if the request is unsuccessful - content = response.json() - return content - except Exception: - import importlib.resources - import json - - with importlib.resources.open_text( - "litellm", "model_prices_and_context_window_backup.json" - ) as f: - content = json.load(f) - return content - +from litellm.litellm_core_utils.get_model_cost_map import get_model_cost_map model_cost = get_model_cost_map(url=model_cost_map_url) custom_prompt_dict: Dict[str, dict] = {} @@ -446,7 +367,6 @@ cohere_chat_models: List = [] mistral_chat_models: List = [] text_completion_codestral_models: List = [] anthropic_models: List = [] -empower_models: List = [] openrouter_models: List = [] vertex_language_models: List = [] vertex_vision_models: List = [] @@ -641,202 +561,8 @@ def add_known_models(): add_known_models() # known openai compatible endpoints - we'll eventually move this list to the model_prices_and_context_window.json dictionary -openai_compatible_endpoints: List = [ - "api.perplexity.ai", - "api.endpoints.anyscale.com/v1", - "api.deepinfra.com/v1/openai", - "api.mistral.ai/v1", - "codestral.mistral.ai/v1/chat/completions", - "codestral.mistral.ai/v1/fim/completions", - "api.groq.com/openai/v1", - 
"https://integrate.api.nvidia.com/v1", - "api.deepseek.com/v1", - "api.together.xyz/v1", - "app.empower.dev/api/v1", - "https://api.friendli.ai/serverless/v1", - "api.sambanova.ai/v1", - "api.x.ai/v1", - "api.galadriel.ai/v1", -] # this is maintained for Exception Mapping -openai_compatible_providers: List = [ - "anyscale", - "mistral", - "groq", - "nvidia_nim", - "cerebras", - "sambanova", - "ai21_chat", - "ai21", - "volcengine", - "codestral", - "deepseek", - "deepinfra", - "perplexity", - "xinference", - "xai", - "together_ai", - "fireworks_ai", - "empower", - "friendliai", - "azure_ai", - "github", - "litellm_proxy", - "hosted_vllm", - "lm_studio", - "galadriel", -] -openai_text_completion_compatible_providers: List = ( - [ # providers that support `/v1/completions` - "together_ai", - "fireworks_ai", - "hosted_vllm", - ] -) -_openai_like_providers: List = [ - "predibase", - "databricks", - "watsonx", -] # private helper. similar to openai but require some custom auth / endpoint handling, so can't use the openai sdk -# well supported replicate llms -replicate_models: List = [ - # llama replicate supported LLMs - "replicate/llama-2-70b-chat:2796ee9483c3fd7aa2e171d38f4ca12251a30609463dcfd4cd76703f22e96cdf", - "a16z-infra/llama-2-13b-chat:2a7f981751ec7fdf87b5b91ad4db53683a98082e9ff7bfd12c8cd5ea85980a52", - "meta/codellama-13b:1c914d844307b0588599b8393480a3ba917b660c7e9dfae681542b5325f228db", - # Vicuna - "replicate/vicuna-13b:6282abe6a492de4145d7bb601023762212f9ddbbe78278bd6771c8b3b2f2a13b", - "joehoover/instructblip-vicuna13b:c4c54e3c8c97cd50c2d2fec9be3b6065563ccf7d43787fb99f84151b867178fe", - # Flan T-5 - "daanelson/flan-t5-large:ce962b3f6792a57074a601d3979db5839697add2e4e02696b3ced4c022d4767f", - # Others - "replicate/dolly-v2-12b:ef0e1aefc61f8e096ebe4db6b2bacc297daf2ef6899f0f7e001ec445893500e5", - "replit/replit-code-v1-3b:b84f4c074b807211cd75e3e8b1589b6399052125b4c27106e43d47189e8415ad", -] - -clarifai_models: List = [ - "clarifai/meta.Llama-3.Llama-3-8B-Instruct", - "clarifai/gcp.generate.gemma-1_1-7b-it", - "clarifai/mistralai.completion.mixtral-8x22B", - "clarifai/cohere.generate.command-r-plus", - "clarifai/databricks.drbx.dbrx-instruct", - "clarifai/mistralai.completion.mistral-large", - "clarifai/mistralai.completion.mistral-medium", - "clarifai/mistralai.completion.mistral-small", - "clarifai/mistralai.completion.mixtral-8x7B-Instruct-v0_1", - "clarifai/gcp.generate.gemma-2b-it", - "clarifai/gcp.generate.gemma-7b-it", - "clarifai/deci.decilm.deciLM-7B-instruct", - "clarifai/mistralai.completion.mistral-7B-Instruct", - "clarifai/gcp.generate.gemini-pro", - "clarifai/anthropic.completion.claude-v1", - "clarifai/anthropic.completion.claude-instant-1_2", - "clarifai/anthropic.completion.claude-instant", - "clarifai/anthropic.completion.claude-v2", - "clarifai/anthropic.completion.claude-2_1", - "clarifai/meta.Llama-2.codeLlama-70b-Python", - "clarifai/meta.Llama-2.codeLlama-70b-Instruct", - "clarifai/openai.completion.gpt-3_5-turbo-instruct", - "clarifai/meta.Llama-2.llama2-7b-chat", - "clarifai/meta.Llama-2.llama2-13b-chat", - "clarifai/meta.Llama-2.llama2-70b-chat", - "clarifai/openai.chat-completion.gpt-4-turbo", - "clarifai/microsoft.text-generation.phi-2", - "clarifai/meta.Llama-2.llama2-7b-chat-vllm", - "clarifai/upstage.solar.solar-10_7b-instruct", - "clarifai/openchat.openchat.openchat-3_5-1210", - "clarifai/togethercomputer.stripedHyena.stripedHyena-Nous-7B", - "clarifai/gcp.generate.text-bison", - "clarifai/meta.Llama-2.llamaGuard-7b", - 
"clarifai/fblgit.una-cybertron.una-cybertron-7b-v2", - "clarifai/openai.chat-completion.GPT-4", - "clarifai/openai.chat-completion.GPT-3_5-turbo", - "clarifai/ai21.complete.Jurassic2-Grande", - "clarifai/ai21.complete.Jurassic2-Grande-Instruct", - "clarifai/ai21.complete.Jurassic2-Jumbo-Instruct", - "clarifai/ai21.complete.Jurassic2-Jumbo", - "clarifai/ai21.complete.Jurassic2-Large", - "clarifai/cohere.generate.cohere-generate-command", - "clarifai/wizardlm.generate.wizardCoder-Python-34B", - "clarifai/wizardlm.generate.wizardLM-70B", - "clarifai/tiiuae.falcon.falcon-40b-instruct", - "clarifai/togethercomputer.RedPajama.RedPajama-INCITE-7B-Chat", - "clarifai/gcp.generate.code-gecko", - "clarifai/gcp.generate.code-bison", - "clarifai/mistralai.completion.mistral-7B-OpenOrca", - "clarifai/mistralai.completion.openHermes-2-mistral-7B", - "clarifai/wizardlm.generate.wizardLM-13B", - "clarifai/huggingface-research.zephyr.zephyr-7B-alpha", - "clarifai/wizardlm.generate.wizardCoder-15B", - "clarifai/microsoft.text-generation.phi-1_5", - "clarifai/databricks.Dolly-v2.dolly-v2-12b", - "clarifai/bigcode.code.StarCoder", - "clarifai/salesforce.xgen.xgen-7b-8k-instruct", - "clarifai/mosaicml.mpt.mpt-7b-instruct", - "clarifai/anthropic.completion.claude-3-opus", - "clarifai/anthropic.completion.claude-3-sonnet", - "clarifai/gcp.generate.gemini-1_5-pro", - "clarifai/gcp.generate.imagen-2", - "clarifai/salesforce.blip.general-english-image-caption-blip-2", -] - - -huggingface_models: List = [ - "meta-llama/Llama-2-7b-hf", - "meta-llama/Llama-2-7b-chat-hf", - "meta-llama/Llama-2-13b-hf", - "meta-llama/Llama-2-13b-chat-hf", - "meta-llama/Llama-2-70b-hf", - "meta-llama/Llama-2-70b-chat-hf", - "meta-llama/Llama-2-7b", - "meta-llama/Llama-2-7b-chat", - "meta-llama/Llama-2-13b", - "meta-llama/Llama-2-13b-chat", - "meta-llama/Llama-2-70b", - "meta-llama/Llama-2-70b-chat", -] # these have been tested on extensively. But by default all text2text-generation and text-generation models are supported by liteLLM. - https://docs.litellm.ai/docs/providers -empower_models = [ - "empower/empower-functions", - "empower/empower-functions-small", -] - -together_ai_models: List = [ - # llama llms - chat - "togethercomputer/llama-2-70b-chat", - # llama llms - language / instruct - "togethercomputer/llama-2-70b", - "togethercomputer/LLaMA-2-7B-32K", - "togethercomputer/Llama-2-7B-32K-Instruct", - "togethercomputer/llama-2-7b", - # falcon llms - "togethercomputer/falcon-40b-instruct", - "togethercomputer/falcon-7b-instruct", - # alpaca - "togethercomputer/alpaca-7b", - # chat llms - "HuggingFaceH4/starchat-alpha", - # code llms - "togethercomputer/CodeLlama-34b", - "togethercomputer/CodeLlama-34b-Instruct", - "togethercomputer/CodeLlama-34b-Python", - "defog/sqlcoder", - "NumbersStation/nsql-llama-2-7B", - "WizardLM/WizardCoder-15B-V1.0", - "WizardLM/WizardCoder-Python-34B-V1.0", - # language llms - "NousResearch/Nous-Hermes-Llama2-13b", - "Austism/chronos-hermes-13b", - "upstage/SOLAR-0-70b-16bit", - "WizardLM/WizardLM-70B-V1.0", -] # supports all together ai models, just pass in the model id e.g. completion(model="together_computer/replit_code_3b",...) 
- - -baseten_models: List = [ - "qvv0xeq", - "q841o8w", - "31dxrj3", -] # FALCON 7B # WizardLM # Mosaic ML # used for Cost Tracking & Token counting @@ -980,20 +706,6 @@ longer_context_model_fallback_dict: dict = { } ####### EMBEDDING MODELS ################### -open_ai_embedding_models: List = ["text-embedding-ada-002"] -cohere_embedding_models: List = [ - "embed-english-v3.0", - "embed-english-light-v3.0", - "embed-multilingual-v3.0", - "embed-english-v2.0", - "embed-english-light-v2.0", - "embed-multilingual-v2.0", -] -bedrock_embedding_models: List = [ - "amazon.titan-embed-text-v1", - "cohere.embed-english-v3", - "cohere.embed-multilingual-v3", -] all_embedding_models = ( open_ai_embedding_models @@ -1277,4 +989,7 @@ custom_provider_map: List[CustomLLMItem] = [] _custom_providers: List[str] = ( [] ) # internal helper util, used to track names of custom providers +disable_hf_tokenizer_download: Optional[bool] = ( + None # disable huggingface tokenizer download. Defaults to openai clk100 +) global_disable_no_log_param: bool = False diff --git a/litellm/constants.py b/litellm/constants.py index 0a3b4ee4c7..36d45060e9 100644 --- a/litellm/constants.py +++ b/litellm/constants.py @@ -1,3 +1,5 @@ +from typing import List + ROUTER_MAX_FALLBACKS = 5 DEFAULT_BATCH_SIZE = 512 DEFAULT_FLUSH_INTERVAL_SECONDS = 5 @@ -12,6 +14,11 @@ DEFAULT_IMAGE_TOKEN_COUNT = 250 DEFAULT_IMAGE_WIDTH = 300 DEFAULT_IMAGE_HEIGHT = 300 SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD = 1000 # Minimum number of requests to consider "reasonable traffic". Used for single-deployment cooldown logic. +#### RELIABILITY #### +REPEATED_STREAMING_CHUNK_LIMIT = 100 # catch if model starts looping the same chunk while streaming. Uses high default to prevent false positives. +#### Networking settings #### +request_timeout: float = 6000 # time in seconds + LITELLM_CHAT_PROVIDERS = [ "openai", "openai_like", @@ -113,6 +120,222 @@ OPENAI_CHAT_COMPLETION_PARAMS = [ "top_logprobs", "extra_headers", ] + +openai_compatible_endpoints: List = [ + "api.perplexity.ai", + "api.endpoints.anyscale.com/v1", + "api.deepinfra.com/v1/openai", + "api.mistral.ai/v1", + "codestral.mistral.ai/v1/chat/completions", + "codestral.mistral.ai/v1/fim/completions", + "api.groq.com/openai/v1", + "https://integrate.api.nvidia.com/v1", + "api.deepseek.com/v1", + "api.together.xyz/v1", + "app.empower.dev/api/v1", + "https://api.friendli.ai/serverless/v1", + "api.sambanova.ai/v1", + "api.x.ai/v1", + "api.galadriel.ai/v1", +] + + +openai_compatible_providers: List = [ + "anyscale", + "mistral", + "groq", + "nvidia_nim", + "cerebras", + "sambanova", + "ai21_chat", + "ai21", + "volcengine", + "codestral", + "deepseek", + "deepinfra", + "perplexity", + "xinference", + "xai", + "together_ai", + "fireworks_ai", + "empower", + "friendliai", + "azure_ai", + "github", + "litellm_proxy", + "hosted_vllm", + "lm_studio", + "galadriel", +] +openai_text_completion_compatible_providers: List = ( + [ # providers that support `/v1/completions` + "together_ai", + "fireworks_ai", + "hosted_vllm", + ] +) +_openai_like_providers: List = [ + "predibase", + "databricks", + "watsonx", +] # private helper. 
similar to openai but require some custom auth / endpoint handling, so can't use the openai sdk +# well supported replicate llms +replicate_models: List = [ + # llama replicate supported LLMs + "replicate/llama-2-70b-chat:2796ee9483c3fd7aa2e171d38f4ca12251a30609463dcfd4cd76703f22e96cdf", + "a16z-infra/llama-2-13b-chat:2a7f981751ec7fdf87b5b91ad4db53683a98082e9ff7bfd12c8cd5ea85980a52", + "meta/codellama-13b:1c914d844307b0588599b8393480a3ba917b660c7e9dfae681542b5325f228db", + # Vicuna + "replicate/vicuna-13b:6282abe6a492de4145d7bb601023762212f9ddbbe78278bd6771c8b3b2f2a13b", + "joehoover/instructblip-vicuna13b:c4c54e3c8c97cd50c2d2fec9be3b6065563ccf7d43787fb99f84151b867178fe", + # Flan T-5 + "daanelson/flan-t5-large:ce962b3f6792a57074a601d3979db5839697add2e4e02696b3ced4c022d4767f", + # Others + "replicate/dolly-v2-12b:ef0e1aefc61f8e096ebe4db6b2bacc297daf2ef6899f0f7e001ec445893500e5", + "replit/replit-code-v1-3b:b84f4c074b807211cd75e3e8b1589b6399052125b4c27106e43d47189e8415ad", +] + +clarifai_models: List = [ + "clarifai/meta.Llama-3.Llama-3-8B-Instruct", + "clarifai/gcp.generate.gemma-1_1-7b-it", + "clarifai/mistralai.completion.mixtral-8x22B", + "clarifai/cohere.generate.command-r-plus", + "clarifai/databricks.drbx.dbrx-instruct", + "clarifai/mistralai.completion.mistral-large", + "clarifai/mistralai.completion.mistral-medium", + "clarifai/mistralai.completion.mistral-small", + "clarifai/mistralai.completion.mixtral-8x7B-Instruct-v0_1", + "clarifai/gcp.generate.gemma-2b-it", + "clarifai/gcp.generate.gemma-7b-it", + "clarifai/deci.decilm.deciLM-7B-instruct", + "clarifai/mistralai.completion.mistral-7B-Instruct", + "clarifai/gcp.generate.gemini-pro", + "clarifai/anthropic.completion.claude-v1", + "clarifai/anthropic.completion.claude-instant-1_2", + "clarifai/anthropic.completion.claude-instant", + "clarifai/anthropic.completion.claude-v2", + "clarifai/anthropic.completion.claude-2_1", + "clarifai/meta.Llama-2.codeLlama-70b-Python", + "clarifai/meta.Llama-2.codeLlama-70b-Instruct", + "clarifai/openai.completion.gpt-3_5-turbo-instruct", + "clarifai/meta.Llama-2.llama2-7b-chat", + "clarifai/meta.Llama-2.llama2-13b-chat", + "clarifai/meta.Llama-2.llama2-70b-chat", + "clarifai/openai.chat-completion.gpt-4-turbo", + "clarifai/microsoft.text-generation.phi-2", + "clarifai/meta.Llama-2.llama2-7b-chat-vllm", + "clarifai/upstage.solar.solar-10_7b-instruct", + "clarifai/openchat.openchat.openchat-3_5-1210", + "clarifai/togethercomputer.stripedHyena.stripedHyena-Nous-7B", + "clarifai/gcp.generate.text-bison", + "clarifai/meta.Llama-2.llamaGuard-7b", + "clarifai/fblgit.una-cybertron.una-cybertron-7b-v2", + "clarifai/openai.chat-completion.GPT-4", + "clarifai/openai.chat-completion.GPT-3_5-turbo", + "clarifai/ai21.complete.Jurassic2-Grande", + "clarifai/ai21.complete.Jurassic2-Grande-Instruct", + "clarifai/ai21.complete.Jurassic2-Jumbo-Instruct", + "clarifai/ai21.complete.Jurassic2-Jumbo", + "clarifai/ai21.complete.Jurassic2-Large", + "clarifai/cohere.generate.cohere-generate-command", + "clarifai/wizardlm.generate.wizardCoder-Python-34B", + "clarifai/wizardlm.generate.wizardLM-70B", + "clarifai/tiiuae.falcon.falcon-40b-instruct", + "clarifai/togethercomputer.RedPajama.RedPajama-INCITE-7B-Chat", + "clarifai/gcp.generate.code-gecko", + "clarifai/gcp.generate.code-bison", + "clarifai/mistralai.completion.mistral-7B-OpenOrca", + "clarifai/mistralai.completion.openHermes-2-mistral-7B", + "clarifai/wizardlm.generate.wizardLM-13B", + "clarifai/huggingface-research.zephyr.zephyr-7B-alpha", + 
"clarifai/wizardlm.generate.wizardCoder-15B", + "clarifai/microsoft.text-generation.phi-1_5", + "clarifai/databricks.Dolly-v2.dolly-v2-12b", + "clarifai/bigcode.code.StarCoder", + "clarifai/salesforce.xgen.xgen-7b-8k-instruct", + "clarifai/mosaicml.mpt.mpt-7b-instruct", + "clarifai/anthropic.completion.claude-3-opus", + "clarifai/anthropic.completion.claude-3-sonnet", + "clarifai/gcp.generate.gemini-1_5-pro", + "clarifai/gcp.generate.imagen-2", + "clarifai/salesforce.blip.general-english-image-caption-blip-2", +] + + +huggingface_models: List = [ + "meta-llama/Llama-2-7b-hf", + "meta-llama/Llama-2-7b-chat-hf", + "meta-llama/Llama-2-13b-hf", + "meta-llama/Llama-2-13b-chat-hf", + "meta-llama/Llama-2-70b-hf", + "meta-llama/Llama-2-70b-chat-hf", + "meta-llama/Llama-2-7b", + "meta-llama/Llama-2-7b-chat", + "meta-llama/Llama-2-13b", + "meta-llama/Llama-2-13b-chat", + "meta-llama/Llama-2-70b", + "meta-llama/Llama-2-70b-chat", +] # these have been tested on extensively. But by default all text2text-generation and text-generation models are supported by liteLLM. - https://docs.litellm.ai/docs/providers +empower_models = [ + "empower/empower-functions", + "empower/empower-functions-small", +] + +together_ai_models: List = [ + # llama llms - chat + "togethercomputer/llama-2-70b-chat", + # llama llms - language / instruct + "togethercomputer/llama-2-70b", + "togethercomputer/LLaMA-2-7B-32K", + "togethercomputer/Llama-2-7B-32K-Instruct", + "togethercomputer/llama-2-7b", + # falcon llms + "togethercomputer/falcon-40b-instruct", + "togethercomputer/falcon-7b-instruct", + # alpaca + "togethercomputer/alpaca-7b", + # chat llms + "HuggingFaceH4/starchat-alpha", + # code llms + "togethercomputer/CodeLlama-34b", + "togethercomputer/CodeLlama-34b-Instruct", + "togethercomputer/CodeLlama-34b-Python", + "defog/sqlcoder", + "NumbersStation/nsql-llama-2-7B", + "WizardLM/WizardCoder-15B-V1.0", + "WizardLM/WizardCoder-Python-34B-V1.0", + # language llms + "NousResearch/Nous-Hermes-Llama2-13b", + "Austism/chronos-hermes-13b", + "upstage/SOLAR-0-70b-16bit", + "WizardLM/WizardLM-70B-V1.0", +] # supports all together ai models, just pass in the model id e.g. completion(model="together_computer/replit_code_3b",...) + + +baseten_models: List = [ + "qvv0xeq", + "q841o8w", + "31dxrj3", +] # FALCON 7B # WizardLM # Mosaic ML + + +open_ai_embedding_models: List = ["text-embedding-ada-002"] +cohere_embedding_models: List = [ + "embed-english-v3.0", + "embed-english-light-v3.0", + "embed-multilingual-v3.0", + "embed-english-v2.0", + "embed-english-light-v2.0", + "embed-multilingual-v2.0", +] +bedrock_embedding_models: List = [ + "amazon.titan-embed-text-v1", + "cohere.embed-english-v3", + "cohere.embed-multilingual-v3", +] + + +OPENAI_FINISH_REASONS = ["stop", "length", "function_call", "content_filter", "null"] HUMANLOOP_PROMPT_CACHE_TTL_SECONDS = 60 # 1 minute RESPONSE_FORMAT_TOOL_NAME = "json_tool_call" # default tool name used when converting response format to tool call diff --git a/litellm/litellm_core_utils/get_model_cost_map.py b/litellm/litellm_core_utils/get_model_cost_map.py new file mode 100644 index 0000000000..b8bdaee19c --- /dev/null +++ b/litellm/litellm_core_utils/get_model_cost_map.py @@ -0,0 +1,45 @@ +""" +Pulls the cost + context window + provider route for known models from https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json + +This can be disabled by setting the LITELLM_LOCAL_MODEL_COST_MAP environment variable to True. 
+ +``` +export LITELLM_LOCAL_MODEL_COST_MAP=True +``` +""" + +import os + +import httpx + + +def get_model_cost_map(url: str): + if ( + os.getenv("LITELLM_LOCAL_MODEL_COST_MAP", False) + or os.getenv("LITELLM_LOCAL_MODEL_COST_MAP", False) == "True" + ): + import importlib.resources + import json + + with importlib.resources.open_text( + "litellm", "model_prices_and_context_window_backup.json" + ) as f: + content = json.load(f) + return content + + try: + response = httpx.get( + url, timeout=5 + ) # set a 5 second timeout for the get request + response.raise_for_status() # Raise an exception if the request is unsuccessful + content = response.json() + return content + except Exception: + import importlib.resources + import json + + with importlib.resources.open_text( + "litellm", "model_prices_and_context_window_backup.json" + ) as f: + content = json.load(f) + return content diff --git a/litellm/proxy/_experimental/out/404.html b/litellm/proxy/_experimental/out/404.html deleted file mode 100644 index 7b62e04c0c..0000000000 --- a/litellm/proxy/_experimental/out/404.html +++ /dev/null @@ -1 +0,0 @@ -404: This page could not be found.LiteLLM Dashboard

404

This page could not be found.

\ No newline at end of file diff --git a/litellm/proxy/_experimental/out/model_hub.html b/litellm/proxy/_experimental/out/model_hub.html deleted file mode 100644 index 0117fa6155..0000000000 --- a/litellm/proxy/_experimental/out/model_hub.html +++ /dev/null @@ -1 +0,0 @@ -LiteLLM Dashboard \ No newline at end of file diff --git a/litellm/proxy/_experimental/out/onboarding.html b/litellm/proxy/_experimental/out/onboarding.html deleted file mode 100644 index 90b5bf2129..0000000000 --- a/litellm/proxy/_experimental/out/onboarding.html +++ /dev/null @@ -1 +0,0 @@ -LiteLLM Dashboard \ No newline at end of file diff --git a/litellm/proxy/management_endpoints/key_management_endpoints.py b/litellm/proxy/management_endpoints/key_management_endpoints.py index 8761e1ac9f..b9b462a4e8 100644 --- a/litellm/proxy/management_endpoints/key_management_endpoints.py +++ b/litellm/proxy/management_endpoints/key_management_endpoints.py @@ -61,6 +61,7 @@ def _get_user_in_team( for member in team_table.members_with_roles: if member.user_id is not None and member.user_id == user_id: return member + return None @@ -366,6 +367,7 @@ async def generate_key_fn( # noqa: PLR0915 prisma_client=prisma_client, user_api_key_cache=user_api_key_cache, parent_otel_span=user_api_key_dict.parent_otel_span, + check_db_only=True, ) except Exception as e: verbose_proxy_logger.debug( diff --git a/litellm/types/utils.py b/litellm/types/utils.py index a1c19dab1b..559588b7c7 100644 --- a/litellm/types/utils.py +++ b/litellm/types/utils.py @@ -1889,3 +1889,8 @@ class HttpHandlerRequestFields(TypedDict, total=False): class ProviderSpecificHeader(TypedDict): custom_llm_provider: str extra_headers: dict + + +class SelectTokenizerResponse(TypedDict): + type: Literal["openai_tokenizer", "huggingface_tokenizer"] + tokenizer: Any diff --git a/litellm/utils.py b/litellm/utils.py index b1e683113a..5396e008f0 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -150,6 +150,7 @@ from litellm.types.utils import ( ModelResponseStream, ProviderField, ProviderSpecificModelInfo, + SelectTokenizerResponse, StreamingChoices, TextChoices, TextCompletionResponse, @@ -1440,34 +1441,47 @@ def _select_tokenizer( @lru_cache(maxsize=128) -def _select_tokenizer_helper(model: str): +def _select_tokenizer_helper(model: str) -> SelectTokenizerResponse: + + if litellm.disable_hf_tokenizer_download is True: + return _return_openai_tokenizer(model) + try: - if model in litellm.cohere_models and "command-r" in model: - # cohere - cohere_tokenizer = Tokenizer.from_pretrained( - "Xenova/c4ai-command-r-v01-tokenizer" - ) - return {"type": "huggingface_tokenizer", "tokenizer": cohere_tokenizer} - # anthropic - elif model in litellm.anthropic_models and "claude-3" not in model: - claude_tokenizer = Tokenizer.from_str(claude_json_str) - return {"type": "huggingface_tokenizer", "tokenizer": claude_tokenizer} - # llama2 - elif "llama-2" in model.lower() or "replicate" in model.lower(): - tokenizer = Tokenizer.from_pretrained("hf-internal-testing/llama-tokenizer") - return {"type": "huggingface_tokenizer", "tokenizer": tokenizer} - # llama3 - elif "llama-3" in model.lower(): - tokenizer = Tokenizer.from_pretrained("Xenova/llama-3-tokenizer") - return {"type": "huggingface_tokenizer", "tokenizer": tokenizer} + result = _return_huggingface_tokenizer(model) + if result is not None: + return result except Exception as e: verbose_logger.debug(f"Error selecting tokenizer: {e}") # default - tiktoken - return { - "type": "openai_tokenizer", - "tokenizer": encoding, - } # default to 
openai tokenizer + return _return_openai_tokenizer(model) + + +def _return_openai_tokenizer(model: str) -> SelectTokenizerResponse: + return {"type": "openai_tokenizer", "tokenizer": encoding} + + +def _return_huggingface_tokenizer(model: str) -> Optional[SelectTokenizerResponse]: + if model in litellm.cohere_models and "command-r" in model: + # cohere + cohere_tokenizer = Tokenizer.from_pretrained( + "Xenova/c4ai-command-r-v01-tokenizer" + ) + return {"type": "huggingface_tokenizer", "tokenizer": cohere_tokenizer} + # anthropic + elif model in litellm.anthropic_models and "claude-3" not in model: + claude_tokenizer = Tokenizer.from_str(claude_json_str) + return {"type": "huggingface_tokenizer", "tokenizer": claude_tokenizer} + # llama2 + elif "llama-2" in model.lower() or "replicate" in model.lower(): + tokenizer = Tokenizer.from_pretrained("hf-internal-testing/llama-tokenizer") + return {"type": "huggingface_tokenizer", "tokenizer": tokenizer} + # llama3 + elif "llama-3" in model.lower(): + tokenizer = Tokenizer.from_pretrained("Xenova/llama-3-tokenizer") + return {"type": "huggingface_tokenizer", "tokenizer": tokenizer} + else: + return None def encode(model="", text="", custom_tokenizer: Optional[dict] = None): diff --git a/tests/local_testing/test_amazing_vertex_completion.py b/tests/local_testing/test_amazing_vertex_completion.py index c8e66f767b..0fd82ad7bf 100644 --- a/tests/local_testing/test_amazing_vertex_completion.py +++ b/tests/local_testing/test_amazing_vertex_completion.py @@ -450,7 +450,7 @@ async def test_async_vertexai_response(): or "32k" in model or "ultra" in model or "002" in model - or "gemini-2.0-flash-thinking-exp" == model + or "gemini-2.0-flash-thinking-exp" in model ): # our account does not have access to this model continue @@ -492,7 +492,11 @@ async def test_async_vertexai_streaming_response(): test_models += litellm.vertex_language_models # always test gemini-pro for model in test_models: if model in VERTEX_MODELS_TO_NOT_TEST or ( - "gecko" in model or "32k" in model or "ultra" in model or "002" in model + "gecko" in model + or "32k" in model + or "ultra" in model + or "002" in model + or "gemini-2.0-flash-thinking-exp" in model ): # our account does not have access to this model continue diff --git a/tests/local_testing/test_token_counter.py b/tests/local_testing/test_token_counter.py index e1e2c36e9f..d572fa8014 100644 --- a/tests/local_testing/test_token_counter.py +++ b/tests/local_testing/test_token_counter.py @@ -459,3 +459,14 @@ class TestTokenizerSelection(unittest.TestCase): # Verify fallback to OpenAI tokenizer self.assertEqual(result["type"], "openai_tokenizer") self.assertEqual(result["tokenizer"], encoding) + + @patch("litellm.utils._return_huggingface_tokenizer") + def test_disable_hf_tokenizer_download(self, mock_return_huggingface_tokenizer): + # Use pytest.MonkeyPatch() directly instead of fixture + monkeypatch = pytest.MonkeyPatch() + monkeypatch.setattr(litellm, "disable_hf_tokenizer_download", True) + + result = _select_tokenizer_helper("grok-32r22r") + mock_return_huggingface_tokenizer.assert_not_called() + assert result["type"] == "openai_tokenizer" + assert result["tokenizer"] == encoding diff --git a/tests/proxy_admin_ui_tests/test_key_management.py b/tests/proxy_admin_ui_tests/test_key_management.py index f443d29715..12c50ac0cc 100644 --- a/tests/proxy_admin_ui_tests/test_key_management.py +++ b/tests/proxy_admin_ui_tests/test_key_management.py @@ -8,6 +8,7 @@ from datetime import datetime from dotenv import load_dotenv from fastapi 
import Request from fastapi.routing import APIRoute +from unittest.mock import MagicMock, patch load_dotenv() import io @@ -988,3 +989,28 @@ async def test_list_key_helper(prisma_client): user_id="admin", ), ) + + +@pytest.mark.asyncio +@patch("litellm.proxy.management_endpoints.key_management_endpoints.get_team_object") +async def test_key_generate_always_db_team(mock_get_team_object): + from litellm.proxy.management_endpoints.key_management_endpoints import ( + generate_key_fn, + ) + + setattr(litellm.proxy.proxy_server, "prisma_client", MagicMock()) + mock_get_team_object.return_value = None + try: + await generate_key_fn( + data=GenerateKeyRequest(team_id="1234"), + user_api_key_dict=UserAPIKeyAuth( + user_role=LitellmUserRoles.PROXY_ADMIN, + api_key="sk-1234", + user_id="admin", + ), + ) + except Exception as e: + print(f"Error: {e}") + + mock_get_team_object.assert_called_once() + assert mock_get_team_object.call_args.kwargs["check_db_only"] == True
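---

The snippet below is an illustrative sketch (not part of the patch) of the new `disable_hf_tokenizer_download` flag; it mirrors the unit test added in `tests/local_testing/test_token_counter.py`, and the model name is just an example Hugging Face-style id.

```python
# Sketch: force the OpenAI (tiktoken) tokenizer even for Hugging Face-style
# model names, so no tokenizer download is attempted in offline environments.
import litellm
from litellm.utils import _select_tokenizer_helper

# Set the flag before the first lookup: _select_tokenizer_helper is wrapped in
# @lru_cache, so an earlier result for the same model would be reused.
litellm.disable_hf_tokenizer_download = True

result = _select_tokenizer_helper("meta-llama/Llama-2-7b-chat-hf")
assert result["type"] == "openai_tokenizer"  # Hugging Face tokenizer path is skipped
```

On the proxy, the same behavior is enabled via the `disable_hf_tokenizer_download` setting documented in `config_settings.md` above.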
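Also for reference, a minimal sketch of the relocated `litellm/litellm_core_utils/get_model_cost_map.py` helper running fully offline, as described in its docstring. It assumes `model_cost_map_url` is the module-level URL that `litellm/__init__.py` passes to this function.

```python
# Sketch: load the model cost map from the packaged backup JSON instead of
# fetching model_prices_and_context_window.json over the network.
import os

# Must be set before importing litellm, which builds `model_cost` at import time.
os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"

import litellm
from litellm.litellm_core_utils.get_model_cost_map import get_model_cost_map

cost_map = get_model_cost_map(url=litellm.model_cost_map_url)
print(f"{len(cost_map)} entries loaded from model_prices_and_context_window_backup.json")
```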
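Finally, a hedged Python equivalent of the curl calls in the new `public_teams.md` page, for readers who prefer a script. It assumes a proxy running locally on port 4000 and an admin key exported as `LITELLM_MASTER_KEY`, and reuses the endpoints and payloads shown in the doc.

```python
# Sketch: create a team, then add a test user to it, against a local LiteLLM proxy.
import os

import httpx

BASE_URL = "http://0.0.0.0:4000"
HEADERS = {
    "Authorization": f"Bearer {os.environ['LITELLM_MASTER_KEY']}",
    "Content-Type": "application/json",
}

# 1. Create the team that the config later exposes via `available_teams`.
httpx.post(
    f"{BASE_URL}/team/new",
    headers=HEADERS,
    json={"name": "My Team", "team_id": "team_id_1"},
).raise_for_status()

# 2. Add a member, matching the `/team/member_add` curl example in the doc.
resp = httpx.post(
    f"{BASE_URL}/team/member_add",
    headers=HEADERS,
    json={"team_id": "team_id_1", "member": [{"role": "user", "user_id": "my-test-user"}]},
)
resp.raise_for_status()
print(resp.json())
```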