Mirror of https://github.com/BerriAI/litellm.git (synced 2025-04-26 03:04:13 +00:00)
Doc updates + management endpoint fixes (#8138)
* Litellm dev 01 29 2025 p4 (#8107)
* fix(key_management_endpoints.py): always get db team. Fixes https://github.com/BerriAI/litellm/issues/7983
* test(test_key_management.py): add unit test enforcing check_db_only is always true on key generate checks
* test: fix test
* test: skip gemini thinking
* Litellm dev 01 29 2025 p3 (#8106)
* fix(__init__.py): reduces size of __init__.py and reduces scope for errors by using correct param
* refactor(__init__.py): refactor init by cleaning up redundant params
* refactor(__init__.py): move more constants into constants.py, cleanup root
* refactor(__init__.py): more cleanup
* feat(__init__.py): expose new 'disable_hf_tokenizer_download' param, enables hf model usage in offline env
* docs(config_settings.md): document new disable_hf_tokenizer_download param
* fix: fix linting error
* fix: fix unsafe comparison
* test: fix test
* docs(public_teams.md): add doc showing how to expose public teams for users to join
* docs: add beta disclaimer on public teams
* test: update tests
This commit is contained in: parent 2eee7f978f, commit 16b5de07af
16 changed files with 428 additions and 349 deletions
@@ -139,6 +139,7 @@ general_settings:
| disable_end_user_cost_tracking_prometheus_only | boolean | If true, turns off end user cost tracking on prometheus metrics only. |
| key_generation_settings | object | Restricts who can generate keys. [Further docs](./virtual_keys.md#restricting-key-generation) |
| disable_add_transform_inline_image_block | boolean | For Fireworks AI models - if true, turns off the auto-add of `#transform=inline` to the url of the image_url, if the model is not a vision model. |
| disable_hf_tokenizer_download | boolean | If true, it defaults to using the openai tokenizer for all models (including huggingface models). |

### general_settings - Reference
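For context, here is a minimal proxy config sketch for the new flag. It is illustrative only and assumes the flag is set under `litellm_settings`, like other module-level litellm params.

```yaml
litellm_settings:
  # assumption: skip HuggingFace tokenizer downloads and fall back to the OpenAI (tiktoken) tokenizer
  disable_hf_tokenizer_download: true
```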
40	docs/my-website/docs/proxy/public_teams.md	Normal file
@@ -0,0 +1,40 @@
# [BETA] Public Teams

Expose available teams to your users to join on signup.

<iframe width="840" height="500" src="https://www.loom.com/embed/7871ea15035a48d2a118b7486c2f7598?sid=267cd0ab-d92b-42fa-b97a-9f385ef8930c" frameborder="0" webkitallowfullscreen mozallowfullscreen allowfullscreen></iframe>

## Quick Start

1. Create a team on LiteLLM

```bash
curl -X POST '<PROXY_BASE_URL>/team/new' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer <MASTER_KEY>' \
-d '{"name": "My Team", "team_id": "team_id_1"}'
```

2. Expose the team to your users

```yaml
litellm_settings:
  default_internal_user_params:
    available_teams: ["team_id_1"] # 👈 Make team available to new SSO users
```

3. Test it!

```bash
curl -L -X POST 'http://0.0.0.0:4000/team/member_add' \
-H 'Authorization: Bearer sk-<USER_KEY>' \
-H 'Content-Type: application/json' \
--data-raw '{
    "team_id": "team_id_1",
    "member": [{"role": "user", "user_id": "my-test-user"}]
}'
```
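As an optional sanity check (a sketch only, not part of this commit; it assumes the proxy's `/user/info` endpoint is enabled and that `my-test-user` already exists), you can inspect the user and confirm the membership took effect:

```bash
# hypothetical verification step: inspect the user's teams after /team/member_add
curl -L -X GET 'http://0.0.0.0:4000/user/info?user_id=my-test-user' \
-H 'Authorization: Bearer <MASTER_KEY>'
```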
@@ -6,11 +6,6 @@ import TabItem from '@theme/TabItem';

Create keys, track spend, add models without worrying about the config / CRUD endpoints.

:::info

This is in beta, so things may change. If you have feedback, [let us know](https://discord.com/invite/wuPM9dRgDw)

:::

<Image img={require('../../img/litellm_ui_create_key.png')} />
|
|
|
@@ -98,6 +98,7 @@ const sidebars = {
        "proxy/ui",
        "proxy/admin_ui_sso",
        "proxy/self_serve",
        "proxy/public_teams",
        "proxy/custom_sso"
      ],
    },
|
|
|
@@ -9,7 +9,12 @@ from typing import Callable, List, Optional, Dict, Union, Any, Literal, get_args
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.caching.caching import Cache, DualCache, RedisCache, InMemoryCache
from litellm.types.llms.bedrock import COHERE_EMBEDDING_INPUT_TYPES
from litellm.types.utils import ImageObject, BudgetConfig
from litellm.types.utils import (
    ImageObject,
    BudgetConfig,
    all_litellm_params,
    all_litellm_params as _litellm_completion_params,
)  # maintain backwards compatibility for root param
from litellm._logging import (
    set_verbose,
    _turn_on_debug,
@@ -29,6 +34,24 @@ from litellm.constants (
    LITELLM_CHAT_PROVIDERS,
    HUMANLOOP_PROMPT_CACHE_TTL_SECONDS,
    OPENAI_CHAT_COMPLETION_PARAMS,
    OPENAI_CHAT_COMPLETION_PARAMS as _openai_completion_params,  # backwards compatibility
    OPENAI_FINISH_REASONS,
    OPENAI_FINISH_REASONS as _openai_finish_reasons,  # backwards compatibility
    openai_compatible_endpoints,
    openai_compatible_providers,
    openai_text_completion_compatible_providers,
    _openai_like_providers,
    replicate_models,
    clarifai_models,
    huggingface_models,
    empower_models,
    together_ai_models,
    baseten_models,
    REPEATED_STREAMING_CHUNK_LIMIT,
    request_timeout,
    open_ai_embedding_models,
    cohere_embedding_models,
    bedrock_embedding_models,
)
from litellm.types.guardrails import GuardrailItem
from litellm.proxy._types import (
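A short sketch (not part of the diff) of what these aliases preserve: after the move, the constants still resolve from the package root, so code that referenced the old private names keeps working. The attribute names below are taken from the import list above.

```python
# sketch: constants now live in litellm.constants but remain reachable from the root package
import litellm
from litellm import constants

assert litellm.request_timeout == constants.request_timeout
assert litellm.OPENAI_FINISH_REASONS == constants.OPENAI_FINISH_REASONS
# the old private aliases are re-exported for backwards compatibility
assert litellm._openai_finish_reasons == constants.OPENAI_FINISH_REASONS
```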
|
@ -217,75 +240,8 @@ default_soft_budget: float = (
|
|||
50.0 # by default all litellm proxy keys have a soft budget of 50.0
|
||||
)
|
||||
forward_traceparent_to_llm_provider: bool = False
|
||||
_openai_finish_reasons = ["stop", "length", "function_call", "content_filter", "null"]
|
||||
_openai_completion_params = [
|
||||
"functions",
|
||||
"function_call",
|
||||
"temperature",
|
||||
"temperature",
|
||||
"top_p",
|
||||
"n",
|
||||
"stream",
|
||||
"stop",
|
||||
"max_tokens",
|
||||
"presence_penalty",
|
||||
"frequency_penalty",
|
||||
"logit_bias",
|
||||
"user",
|
||||
"request_timeout",
|
||||
"api_base",
|
||||
"api_version",
|
||||
"api_key",
|
||||
"deployment_id",
|
||||
"organization",
|
||||
"base_url",
|
||||
"default_headers",
|
||||
"timeout",
|
||||
"response_format",
|
||||
"seed",
|
||||
"tools",
|
||||
"tool_choice",
|
||||
"max_retries",
|
||||
]
|
||||
_litellm_completion_params = [
|
||||
"metadata",
|
||||
"acompletion",
|
||||
"caching",
|
||||
"mock_response",
|
||||
"api_key",
|
||||
"api_version",
|
||||
"api_base",
|
||||
"force_timeout",
|
||||
"logger_fn",
|
||||
"verbose",
|
||||
"custom_llm_provider",
|
||||
"litellm_logging_obj",
|
||||
"litellm_call_id",
|
||||
"use_client",
|
||||
"id",
|
||||
"fallbacks",
|
||||
"azure",
|
||||
"headers",
|
||||
"model_list",
|
||||
"num_retries",
|
||||
"context_window_fallback_dict",
|
||||
"roles",
|
||||
"final_prompt_value",
|
||||
"bos_token",
|
||||
"eos_token",
|
||||
"request_timeout",
|
||||
"complete_response",
|
||||
"self",
|
||||
"client",
|
||||
"rpm",
|
||||
"tpm",
|
||||
"input_cost_per_token",
|
||||
"output_cost_per_token",
|
||||
"hf_model_name",
|
||||
"model_info",
|
||||
"proxy_server_request",
|
||||
"preset_cache_key",
|
||||
]
|
||||
|
||||
|
||||
_current_cost = 0.0 # private variable, used if max budget is set
|
||||
error_logs: Dict = {}
|
||||
add_function_to_prompt: bool = (
|
||||
|
@ -318,11 +274,8 @@ disable_end_user_cost_tracking_prometheus_only: Optional[bool] = None
|
|||
custom_prometheus_metadata_labels: List[str] = []
|
||||
#### REQUEST PRIORITIZATION ####
|
||||
priority_reservation: Optional[Dict[str, float]] = None
|
||||
#### RELIABILITY ####
|
||||
REPEATED_STREAMING_CHUNK_LIMIT = 100 # catch if model starts looping the same chunk while streaming. Uses high default to prevent false positives.
|
||||
|
||||
#### Networking settings ####
|
||||
request_timeout: float = 6000 # time in seconds
|
||||
|
||||
force_ipv4: bool = (
|
||||
False # when True, litellm will force ipv4 for all LLM requests. Some users have seen httpx ConnectionError when using ipv6.
|
||||
)
|
||||
|
@ -352,39 +305,7 @@ _key_management_settings: KeyManagementSettings = KeyManagementSettings()
|
|||
#### PII MASKING ####
|
||||
output_parse_pii: bool = False
|
||||
#############################################
|
||||
|
||||
|
||||
def get_model_cost_map(url: str):
|
||||
if (
|
||||
os.getenv("LITELLM_LOCAL_MODEL_COST_MAP", False) == True
|
||||
or os.getenv("LITELLM_LOCAL_MODEL_COST_MAP", False) == "True"
|
||||
):
|
||||
import importlib.resources
|
||||
import json
|
||||
|
||||
with importlib.resources.open_text(
|
||||
"litellm", "model_prices_and_context_window_backup.json"
|
||||
) as f:
|
||||
content = json.load(f)
|
||||
return content
|
||||
|
||||
try:
|
||||
response = httpx.get(
|
||||
url, timeout=5
|
||||
) # set a 5 second timeout for the get request
|
||||
response.raise_for_status() # Raise an exception if the request is unsuccessful
|
||||
content = response.json()
|
||||
return content
|
||||
except Exception:
|
||||
import importlib.resources
|
||||
import json
|
||||
|
||||
with importlib.resources.open_text(
|
||||
"litellm", "model_prices_and_context_window_backup.json"
|
||||
) as f:
|
||||
content = json.load(f)
|
||||
return content
|
||||
|
||||
from litellm.litellm_core_utils.get_model_cost_map import get_model_cost_map
|
||||
|
||||
model_cost = get_model_cost_map(url=model_cost_map_url)
|
||||
custom_prompt_dict: Dict[str, dict] = {}
|
||||
|
@ -446,7 +367,6 @@ cohere_chat_models: List = []
|
|||
mistral_chat_models: List = []
|
||||
text_completion_codestral_models: List = []
|
||||
anthropic_models: List = []
|
||||
empower_models: List = []
|
||||
openrouter_models: List = []
|
||||
vertex_language_models: List = []
|
||||
vertex_vision_models: List = []
|
||||
|
@ -641,202 +561,8 @@ def add_known_models():
|
|||
|
||||
add_known_models()
|
||||
# known openai compatible endpoints - we'll eventually move this list to the model_prices_and_context_window.json dictionary
|
||||
openai_compatible_endpoints: List = [
|
||||
"api.perplexity.ai",
|
||||
"api.endpoints.anyscale.com/v1",
|
||||
"api.deepinfra.com/v1/openai",
|
||||
"api.mistral.ai/v1",
|
||||
"codestral.mistral.ai/v1/chat/completions",
|
||||
"codestral.mistral.ai/v1/fim/completions",
|
||||
"api.groq.com/openai/v1",
|
||||
"https://integrate.api.nvidia.com/v1",
|
||||
"api.deepseek.com/v1",
|
||||
"api.together.xyz/v1",
|
||||
"app.empower.dev/api/v1",
|
||||
"https://api.friendli.ai/serverless/v1",
|
||||
"api.sambanova.ai/v1",
|
||||
"api.x.ai/v1",
|
||||
"api.galadriel.ai/v1",
|
||||
]
|
||||
|
||||
# this is maintained for Exception Mapping
|
||||
openai_compatible_providers: List = [
|
||||
"anyscale",
|
||||
"mistral",
|
||||
"groq",
|
||||
"nvidia_nim",
|
||||
"cerebras",
|
||||
"sambanova",
|
||||
"ai21_chat",
|
||||
"ai21",
|
||||
"volcengine",
|
||||
"codestral",
|
||||
"deepseek",
|
||||
"deepinfra",
|
||||
"perplexity",
|
||||
"xinference",
|
||||
"xai",
|
||||
"together_ai",
|
||||
"fireworks_ai",
|
||||
"empower",
|
||||
"friendliai",
|
||||
"azure_ai",
|
||||
"github",
|
||||
"litellm_proxy",
|
||||
"hosted_vllm",
|
||||
"lm_studio",
|
||||
"galadriel",
|
||||
]
|
||||
openai_text_completion_compatible_providers: List = (
|
||||
[ # providers that support `/v1/completions`
|
||||
"together_ai",
|
||||
"fireworks_ai",
|
||||
"hosted_vllm",
|
||||
]
|
||||
)
|
||||
_openai_like_providers: List = [
|
||||
"predibase",
|
||||
"databricks",
|
||||
"watsonx",
|
||||
] # private helper. similar to openai but require some custom auth / endpoint handling, so can't use the openai sdk
|
||||
# well supported replicate llms
|
||||
replicate_models: List = [
|
||||
# llama replicate supported LLMs
|
||||
"replicate/llama-2-70b-chat:2796ee9483c3fd7aa2e171d38f4ca12251a30609463dcfd4cd76703f22e96cdf",
|
||||
"a16z-infra/llama-2-13b-chat:2a7f981751ec7fdf87b5b91ad4db53683a98082e9ff7bfd12c8cd5ea85980a52",
|
||||
"meta/codellama-13b:1c914d844307b0588599b8393480a3ba917b660c7e9dfae681542b5325f228db",
|
||||
# Vicuna
|
||||
"replicate/vicuna-13b:6282abe6a492de4145d7bb601023762212f9ddbbe78278bd6771c8b3b2f2a13b",
|
||||
"joehoover/instructblip-vicuna13b:c4c54e3c8c97cd50c2d2fec9be3b6065563ccf7d43787fb99f84151b867178fe",
|
||||
# Flan T-5
|
||||
"daanelson/flan-t5-large:ce962b3f6792a57074a601d3979db5839697add2e4e02696b3ced4c022d4767f",
|
||||
# Others
|
||||
"replicate/dolly-v2-12b:ef0e1aefc61f8e096ebe4db6b2bacc297daf2ef6899f0f7e001ec445893500e5",
|
||||
"replit/replit-code-v1-3b:b84f4c074b807211cd75e3e8b1589b6399052125b4c27106e43d47189e8415ad",
|
||||
]
|
||||
|
||||
clarifai_models: List = [
|
||||
"clarifai/meta.Llama-3.Llama-3-8B-Instruct",
|
||||
"clarifai/gcp.generate.gemma-1_1-7b-it",
|
||||
"clarifai/mistralai.completion.mixtral-8x22B",
|
||||
"clarifai/cohere.generate.command-r-plus",
|
||||
"clarifai/databricks.drbx.dbrx-instruct",
|
||||
"clarifai/mistralai.completion.mistral-large",
|
||||
"clarifai/mistralai.completion.mistral-medium",
|
||||
"clarifai/mistralai.completion.mistral-small",
|
||||
"clarifai/mistralai.completion.mixtral-8x7B-Instruct-v0_1",
|
||||
"clarifai/gcp.generate.gemma-2b-it",
|
||||
"clarifai/gcp.generate.gemma-7b-it",
|
||||
"clarifai/deci.decilm.deciLM-7B-instruct",
|
||||
"clarifai/mistralai.completion.mistral-7B-Instruct",
|
||||
"clarifai/gcp.generate.gemini-pro",
|
||||
"clarifai/anthropic.completion.claude-v1",
|
||||
"clarifai/anthropic.completion.claude-instant-1_2",
|
||||
"clarifai/anthropic.completion.claude-instant",
|
||||
"clarifai/anthropic.completion.claude-v2",
|
||||
"clarifai/anthropic.completion.claude-2_1",
|
||||
"clarifai/meta.Llama-2.codeLlama-70b-Python",
|
||||
"clarifai/meta.Llama-2.codeLlama-70b-Instruct",
|
||||
"clarifai/openai.completion.gpt-3_5-turbo-instruct",
|
||||
"clarifai/meta.Llama-2.llama2-7b-chat",
|
||||
"clarifai/meta.Llama-2.llama2-13b-chat",
|
||||
"clarifai/meta.Llama-2.llama2-70b-chat",
|
||||
"clarifai/openai.chat-completion.gpt-4-turbo",
|
||||
"clarifai/microsoft.text-generation.phi-2",
|
||||
"clarifai/meta.Llama-2.llama2-7b-chat-vllm",
|
||||
"clarifai/upstage.solar.solar-10_7b-instruct",
|
||||
"clarifai/openchat.openchat.openchat-3_5-1210",
|
||||
"clarifai/togethercomputer.stripedHyena.stripedHyena-Nous-7B",
|
||||
"clarifai/gcp.generate.text-bison",
|
||||
"clarifai/meta.Llama-2.llamaGuard-7b",
|
||||
"clarifai/fblgit.una-cybertron.una-cybertron-7b-v2",
|
||||
"clarifai/openai.chat-completion.GPT-4",
|
||||
"clarifai/openai.chat-completion.GPT-3_5-turbo",
|
||||
"clarifai/ai21.complete.Jurassic2-Grande",
|
||||
"clarifai/ai21.complete.Jurassic2-Grande-Instruct",
|
||||
"clarifai/ai21.complete.Jurassic2-Jumbo-Instruct",
|
||||
"clarifai/ai21.complete.Jurassic2-Jumbo",
|
||||
"clarifai/ai21.complete.Jurassic2-Large",
|
||||
"clarifai/cohere.generate.cohere-generate-command",
|
||||
"clarifai/wizardlm.generate.wizardCoder-Python-34B",
|
||||
"clarifai/wizardlm.generate.wizardLM-70B",
|
||||
"clarifai/tiiuae.falcon.falcon-40b-instruct",
|
||||
"clarifai/togethercomputer.RedPajama.RedPajama-INCITE-7B-Chat",
|
||||
"clarifai/gcp.generate.code-gecko",
|
||||
"clarifai/gcp.generate.code-bison",
|
||||
"clarifai/mistralai.completion.mistral-7B-OpenOrca",
|
||||
"clarifai/mistralai.completion.openHermes-2-mistral-7B",
|
||||
"clarifai/wizardlm.generate.wizardLM-13B",
|
||||
"clarifai/huggingface-research.zephyr.zephyr-7B-alpha",
|
||||
"clarifai/wizardlm.generate.wizardCoder-15B",
|
||||
"clarifai/microsoft.text-generation.phi-1_5",
|
||||
"clarifai/databricks.Dolly-v2.dolly-v2-12b",
|
||||
"clarifai/bigcode.code.StarCoder",
|
||||
"clarifai/salesforce.xgen.xgen-7b-8k-instruct",
|
||||
"clarifai/mosaicml.mpt.mpt-7b-instruct",
|
||||
"clarifai/anthropic.completion.claude-3-opus",
|
||||
"clarifai/anthropic.completion.claude-3-sonnet",
|
||||
"clarifai/gcp.generate.gemini-1_5-pro",
|
||||
"clarifai/gcp.generate.imagen-2",
|
||||
"clarifai/salesforce.blip.general-english-image-caption-blip-2",
|
||||
]
|
||||
|
||||
|
||||
huggingface_models: List = [
|
||||
"meta-llama/Llama-2-7b-hf",
|
||||
"meta-llama/Llama-2-7b-chat-hf",
|
||||
"meta-llama/Llama-2-13b-hf",
|
||||
"meta-llama/Llama-2-13b-chat-hf",
|
||||
"meta-llama/Llama-2-70b-hf",
|
||||
"meta-llama/Llama-2-70b-chat-hf",
|
||||
"meta-llama/Llama-2-7b",
|
||||
"meta-llama/Llama-2-7b-chat",
|
||||
"meta-llama/Llama-2-13b",
|
||||
"meta-llama/Llama-2-13b-chat",
|
||||
"meta-llama/Llama-2-70b",
|
||||
"meta-llama/Llama-2-70b-chat",
|
||||
] # these have been tested on extensively. But by default all text2text-generation and text-generation models are supported by liteLLM. - https://docs.litellm.ai/docs/providers
|
||||
empower_models = [
|
||||
"empower/empower-functions",
|
||||
"empower/empower-functions-small",
|
||||
]
|
||||
|
||||
together_ai_models: List = [
|
||||
# llama llms - chat
|
||||
"togethercomputer/llama-2-70b-chat",
|
||||
# llama llms - language / instruct
|
||||
"togethercomputer/llama-2-70b",
|
||||
"togethercomputer/LLaMA-2-7B-32K",
|
||||
"togethercomputer/Llama-2-7B-32K-Instruct",
|
||||
"togethercomputer/llama-2-7b",
|
||||
# falcon llms
|
||||
"togethercomputer/falcon-40b-instruct",
|
||||
"togethercomputer/falcon-7b-instruct",
|
||||
# alpaca
|
||||
"togethercomputer/alpaca-7b",
|
||||
# chat llms
|
||||
"HuggingFaceH4/starchat-alpha",
|
||||
# code llms
|
||||
"togethercomputer/CodeLlama-34b",
|
||||
"togethercomputer/CodeLlama-34b-Instruct",
|
||||
"togethercomputer/CodeLlama-34b-Python",
|
||||
"defog/sqlcoder",
|
||||
"NumbersStation/nsql-llama-2-7B",
|
||||
"WizardLM/WizardCoder-15B-V1.0",
|
||||
"WizardLM/WizardCoder-Python-34B-V1.0",
|
||||
# language llms
|
||||
"NousResearch/Nous-Hermes-Llama2-13b",
|
||||
"Austism/chronos-hermes-13b",
|
||||
"upstage/SOLAR-0-70b-16bit",
|
||||
"WizardLM/WizardLM-70B-V1.0",
|
||||
] # supports all together ai models, just pass in the model id e.g. completion(model="together_computer/replit_code_3b",...)
|
||||
|
||||
|
||||
baseten_models: List = [
|
||||
"qvv0xeq",
|
||||
"q841o8w",
|
||||
"31dxrj3",
|
||||
] # FALCON 7B # WizardLM # Mosaic ML
|
||||
|
||||
|
||||
# used for Cost Tracking & Token counting
|
||||
|
@ -980,20 +706,6 @@ longer_context_model_fallback_dict: dict = {
|
|||
}
|
||||
|
||||
####### EMBEDDING MODELS ###################
|
||||
open_ai_embedding_models: List = ["text-embedding-ada-002"]
|
||||
cohere_embedding_models: List = [
|
||||
"embed-english-v3.0",
|
||||
"embed-english-light-v3.0",
|
||||
"embed-multilingual-v3.0",
|
||||
"embed-english-v2.0",
|
||||
"embed-english-light-v2.0",
|
||||
"embed-multilingual-v2.0",
|
||||
]
|
||||
bedrock_embedding_models: List = [
|
||||
"amazon.titan-embed-text-v1",
|
||||
"cohere.embed-english-v3",
|
||||
"cohere.embed-multilingual-v3",
|
||||
]
|
||||
|
||||
all_embedding_models = (
|
||||
open_ai_embedding_models
|
||||
|
@@ -1277,4 +989,7 @@ custom_provider_map: List[CustomLLMItem] = []
_custom_providers: List[str] = (
    []
)  # internal helper util, used to track names of custom providers
disable_hf_tokenizer_download: Optional[bool] = (
    None  # disable huggingface tokenizer download. Defaults to openai clk100
)
global_disable_no_log_param: bool = False
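A minimal usage sketch for the new root-level flag (illustrative; the model name and text are placeholders, and it assumes `litellm.token_counter` routes through the tokenizer-selection helper changed later in this diff):

```python
import litellm

# run token counting in an offline / air-gapped environment without HuggingFace downloads
litellm.disable_hf_tokenizer_download = True

# placeholder model + text; with the flag set, the OpenAI tokenizer is used even for HF-style model names
num_tokens = litellm.token_counter(model="meta-llama/Llama-2-7b-chat", text="hello world")
print(num_tokens)
```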
@@ -1,3 +1,5 @@
from typing import List

ROUTER_MAX_FALLBACKS = 5
DEFAULT_BATCH_SIZE = 512
DEFAULT_FLUSH_INTERVAL_SECONDS = 5
@@ -12,6 +14,11 @@ DEFAULT_IMAGE_TOKEN_COUNT = 250
DEFAULT_IMAGE_WIDTH = 300
DEFAULT_IMAGE_HEIGHT = 300
SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD = 1000  # Minimum number of requests to consider "reasonable traffic". Used for single-deployment cooldown logic.
#### RELIABILITY ####
REPEATED_STREAMING_CHUNK_LIMIT = 100  # catch if model starts looping the same chunk while streaming. Uses high default to prevent false positives.
#### Networking settings ####
request_timeout: float = 6000  # time in seconds

LITELLM_CHAT_PROVIDERS = [
    "openai",
    "openai_like",
|
||||
|
@ -113,6 +120,222 @@ OPENAI_CHAT_COMPLETION_PARAMS = [
|
|||
"top_logprobs",
|
||||
"extra_headers",
|
||||
]
|
||||
|
||||
openai_compatible_endpoints: List = [
|
||||
"api.perplexity.ai",
|
||||
"api.endpoints.anyscale.com/v1",
|
||||
"api.deepinfra.com/v1/openai",
|
||||
"api.mistral.ai/v1",
|
||||
"codestral.mistral.ai/v1/chat/completions",
|
||||
"codestral.mistral.ai/v1/fim/completions",
|
||||
"api.groq.com/openai/v1",
|
||||
"https://integrate.api.nvidia.com/v1",
|
||||
"api.deepseek.com/v1",
|
||||
"api.together.xyz/v1",
|
||||
"app.empower.dev/api/v1",
|
||||
"https://api.friendli.ai/serverless/v1",
|
||||
"api.sambanova.ai/v1",
|
||||
"api.x.ai/v1",
|
||||
"api.galadriel.ai/v1",
|
||||
]
|
||||
|
||||
|
||||
openai_compatible_providers: List = [
|
||||
"anyscale",
|
||||
"mistral",
|
||||
"groq",
|
||||
"nvidia_nim",
|
||||
"cerebras",
|
||||
"sambanova",
|
||||
"ai21_chat",
|
||||
"ai21",
|
||||
"volcengine",
|
||||
"codestral",
|
||||
"deepseek",
|
||||
"deepinfra",
|
||||
"perplexity",
|
||||
"xinference",
|
||||
"xai",
|
||||
"together_ai",
|
||||
"fireworks_ai",
|
||||
"empower",
|
||||
"friendliai",
|
||||
"azure_ai",
|
||||
"github",
|
||||
"litellm_proxy",
|
||||
"hosted_vllm",
|
||||
"lm_studio",
|
||||
"galadriel",
|
||||
]
|
||||
openai_text_completion_compatible_providers: List = (
|
||||
[ # providers that support `/v1/completions`
|
||||
"together_ai",
|
||||
"fireworks_ai",
|
||||
"hosted_vllm",
|
||||
]
|
||||
)
|
||||
_openai_like_providers: List = [
|
||||
"predibase",
|
||||
"databricks",
|
||||
"watsonx",
|
||||
] # private helper. similar to openai but require some custom auth / endpoint handling, so can't use the openai sdk
|
||||
# well supported replicate llms
|
||||
replicate_models: List = [
|
||||
# llama replicate supported LLMs
|
||||
"replicate/llama-2-70b-chat:2796ee9483c3fd7aa2e171d38f4ca12251a30609463dcfd4cd76703f22e96cdf",
|
||||
"a16z-infra/llama-2-13b-chat:2a7f981751ec7fdf87b5b91ad4db53683a98082e9ff7bfd12c8cd5ea85980a52",
|
||||
"meta/codellama-13b:1c914d844307b0588599b8393480a3ba917b660c7e9dfae681542b5325f228db",
|
||||
# Vicuna
|
||||
"replicate/vicuna-13b:6282abe6a492de4145d7bb601023762212f9ddbbe78278bd6771c8b3b2f2a13b",
|
||||
"joehoover/instructblip-vicuna13b:c4c54e3c8c97cd50c2d2fec9be3b6065563ccf7d43787fb99f84151b867178fe",
|
||||
# Flan T-5
|
||||
"daanelson/flan-t5-large:ce962b3f6792a57074a601d3979db5839697add2e4e02696b3ced4c022d4767f",
|
||||
# Others
|
||||
"replicate/dolly-v2-12b:ef0e1aefc61f8e096ebe4db6b2bacc297daf2ef6899f0f7e001ec445893500e5",
|
||||
"replit/replit-code-v1-3b:b84f4c074b807211cd75e3e8b1589b6399052125b4c27106e43d47189e8415ad",
|
||||
]
|
||||
|
||||
clarifai_models: List = [
|
||||
"clarifai/meta.Llama-3.Llama-3-8B-Instruct",
|
||||
"clarifai/gcp.generate.gemma-1_1-7b-it",
|
||||
"clarifai/mistralai.completion.mixtral-8x22B",
|
||||
"clarifai/cohere.generate.command-r-plus",
|
||||
"clarifai/databricks.drbx.dbrx-instruct",
|
||||
"clarifai/mistralai.completion.mistral-large",
|
||||
"clarifai/mistralai.completion.mistral-medium",
|
||||
"clarifai/mistralai.completion.mistral-small",
|
||||
"clarifai/mistralai.completion.mixtral-8x7B-Instruct-v0_1",
|
||||
"clarifai/gcp.generate.gemma-2b-it",
|
||||
"clarifai/gcp.generate.gemma-7b-it",
|
||||
"clarifai/deci.decilm.deciLM-7B-instruct",
|
||||
"clarifai/mistralai.completion.mistral-7B-Instruct",
|
||||
"clarifai/gcp.generate.gemini-pro",
|
||||
"clarifai/anthropic.completion.claude-v1",
|
||||
"clarifai/anthropic.completion.claude-instant-1_2",
|
||||
"clarifai/anthropic.completion.claude-instant",
|
||||
"clarifai/anthropic.completion.claude-v2",
|
||||
"clarifai/anthropic.completion.claude-2_1",
|
||||
"clarifai/meta.Llama-2.codeLlama-70b-Python",
|
||||
"clarifai/meta.Llama-2.codeLlama-70b-Instruct",
|
||||
"clarifai/openai.completion.gpt-3_5-turbo-instruct",
|
||||
"clarifai/meta.Llama-2.llama2-7b-chat",
|
||||
"clarifai/meta.Llama-2.llama2-13b-chat",
|
||||
"clarifai/meta.Llama-2.llama2-70b-chat",
|
||||
"clarifai/openai.chat-completion.gpt-4-turbo",
|
||||
"clarifai/microsoft.text-generation.phi-2",
|
||||
"clarifai/meta.Llama-2.llama2-7b-chat-vllm",
|
||||
"clarifai/upstage.solar.solar-10_7b-instruct",
|
||||
"clarifai/openchat.openchat.openchat-3_5-1210",
|
||||
"clarifai/togethercomputer.stripedHyena.stripedHyena-Nous-7B",
|
||||
"clarifai/gcp.generate.text-bison",
|
||||
"clarifai/meta.Llama-2.llamaGuard-7b",
|
||||
"clarifai/fblgit.una-cybertron.una-cybertron-7b-v2",
|
||||
"clarifai/openai.chat-completion.GPT-4",
|
||||
"clarifai/openai.chat-completion.GPT-3_5-turbo",
|
||||
"clarifai/ai21.complete.Jurassic2-Grande",
|
||||
"clarifai/ai21.complete.Jurassic2-Grande-Instruct",
|
||||
"clarifai/ai21.complete.Jurassic2-Jumbo-Instruct",
|
||||
"clarifai/ai21.complete.Jurassic2-Jumbo",
|
||||
"clarifai/ai21.complete.Jurassic2-Large",
|
||||
"clarifai/cohere.generate.cohere-generate-command",
|
||||
"clarifai/wizardlm.generate.wizardCoder-Python-34B",
|
||||
"clarifai/wizardlm.generate.wizardLM-70B",
|
||||
"clarifai/tiiuae.falcon.falcon-40b-instruct",
|
||||
"clarifai/togethercomputer.RedPajama.RedPajama-INCITE-7B-Chat",
|
||||
"clarifai/gcp.generate.code-gecko",
|
||||
"clarifai/gcp.generate.code-bison",
|
||||
"clarifai/mistralai.completion.mistral-7B-OpenOrca",
|
||||
"clarifai/mistralai.completion.openHermes-2-mistral-7B",
|
||||
"clarifai/wizardlm.generate.wizardLM-13B",
|
||||
"clarifai/huggingface-research.zephyr.zephyr-7B-alpha",
|
||||
"clarifai/wizardlm.generate.wizardCoder-15B",
|
||||
"clarifai/microsoft.text-generation.phi-1_5",
|
||||
"clarifai/databricks.Dolly-v2.dolly-v2-12b",
|
||||
"clarifai/bigcode.code.StarCoder",
|
||||
"clarifai/salesforce.xgen.xgen-7b-8k-instruct",
|
||||
"clarifai/mosaicml.mpt.mpt-7b-instruct",
|
||||
"clarifai/anthropic.completion.claude-3-opus",
|
||||
"clarifai/anthropic.completion.claude-3-sonnet",
|
||||
"clarifai/gcp.generate.gemini-1_5-pro",
|
||||
"clarifai/gcp.generate.imagen-2",
|
||||
"clarifai/salesforce.blip.general-english-image-caption-blip-2",
|
||||
]
|
||||
|
||||
|
||||
huggingface_models: List = [
|
||||
"meta-llama/Llama-2-7b-hf",
|
||||
"meta-llama/Llama-2-7b-chat-hf",
|
||||
"meta-llama/Llama-2-13b-hf",
|
||||
"meta-llama/Llama-2-13b-chat-hf",
|
||||
"meta-llama/Llama-2-70b-hf",
|
||||
"meta-llama/Llama-2-70b-chat-hf",
|
||||
"meta-llama/Llama-2-7b",
|
||||
"meta-llama/Llama-2-7b-chat",
|
||||
"meta-llama/Llama-2-13b",
|
||||
"meta-llama/Llama-2-13b-chat",
|
||||
"meta-llama/Llama-2-70b",
|
||||
"meta-llama/Llama-2-70b-chat",
|
||||
] # these have been tested on extensively. But by default all text2text-generation and text-generation models are supported by liteLLM. - https://docs.litellm.ai/docs/providers
|
||||
empower_models = [
|
||||
"empower/empower-functions",
|
||||
"empower/empower-functions-small",
|
||||
]
|
||||
|
||||
together_ai_models: List = [
|
||||
# llama llms - chat
|
||||
"togethercomputer/llama-2-70b-chat",
|
||||
# llama llms - language / instruct
|
||||
"togethercomputer/llama-2-70b",
|
||||
"togethercomputer/LLaMA-2-7B-32K",
|
||||
"togethercomputer/Llama-2-7B-32K-Instruct",
|
||||
"togethercomputer/llama-2-7b",
|
||||
# falcon llms
|
||||
"togethercomputer/falcon-40b-instruct",
|
||||
"togethercomputer/falcon-7b-instruct",
|
||||
# alpaca
|
||||
"togethercomputer/alpaca-7b",
|
||||
# chat llms
|
||||
"HuggingFaceH4/starchat-alpha",
|
||||
# code llms
|
||||
"togethercomputer/CodeLlama-34b",
|
||||
"togethercomputer/CodeLlama-34b-Instruct",
|
||||
"togethercomputer/CodeLlama-34b-Python",
|
||||
"defog/sqlcoder",
|
||||
"NumbersStation/nsql-llama-2-7B",
|
||||
"WizardLM/WizardCoder-15B-V1.0",
|
||||
"WizardLM/WizardCoder-Python-34B-V1.0",
|
||||
# language llms
|
||||
"NousResearch/Nous-Hermes-Llama2-13b",
|
||||
"Austism/chronos-hermes-13b",
|
||||
"upstage/SOLAR-0-70b-16bit",
|
||||
"WizardLM/WizardLM-70B-V1.0",
|
||||
] # supports all together ai models, just pass in the model id e.g. completion(model="together_computer/replit_code_3b",...)
|
||||
|
||||
|
||||
baseten_models: List = [
|
||||
"qvv0xeq",
|
||||
"q841o8w",
|
||||
"31dxrj3",
|
||||
] # FALCON 7B # WizardLM # Mosaic ML
|
||||
|
||||
|
||||
open_ai_embedding_models: List = ["text-embedding-ada-002"]
|
||||
cohere_embedding_models: List = [
|
||||
"embed-english-v3.0",
|
||||
"embed-english-light-v3.0",
|
||||
"embed-multilingual-v3.0",
|
||||
"embed-english-v2.0",
|
||||
"embed-english-light-v2.0",
|
||||
"embed-multilingual-v2.0",
|
||||
]
|
||||
bedrock_embedding_models: List = [
|
||||
"amazon.titan-embed-text-v1",
|
||||
"cohere.embed-english-v3",
|
||||
"cohere.embed-multilingual-v3",
|
||||
]
|
||||
|
||||
|
||||
OPENAI_FINISH_REASONS = ["stop", "length", "function_call", "content_filter", "null"]
|
||||
HUMANLOOP_PROMPT_CACHE_TTL_SECONDS = 60 # 1 minute
|
||||
RESPONSE_FORMAT_TOOL_NAME = "json_tool_call" # default tool name used when converting response format to tool call
|
||||
|
||||
|
|
45	litellm/litellm_core_utils/get_model_cost_map.py	Normal file
@@ -0,0 +1,45 @@
"""
Pulls the cost + context window + provider route for known models from https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json

This can be disabled by setting the LITELLM_LOCAL_MODEL_COST_MAP environment variable to True.

```
export LITELLM_LOCAL_MODEL_COST_MAP=True
```
"""

import os

import httpx


def get_model_cost_map(url: str):
    if (
        os.getenv("LITELLM_LOCAL_MODEL_COST_MAP", False)
        or os.getenv("LITELLM_LOCAL_MODEL_COST_MAP", False) == "True"
    ):
        import importlib.resources
        import json

        with importlib.resources.open_text(
            "litellm", "model_prices_and_context_window_backup.json"
        ) as f:
            content = json.load(f)
            return content

    try:
        response = httpx.get(
            url, timeout=5
        )  # set a 5 second timeout for the get request
        response.raise_for_status()  # Raise an exception if the request is unsuccessful
        content = response.json()
        return content
    except Exception:
        import importlib.resources
        import json

        with importlib.resources.open_text(
            "litellm", "model_prices_and_context_window_backup.json"
        ) as f:
            content = json.load(f)
            return content
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@@ -61,6 +61,7 @@ def _get_user_in_team(
    for member in team_table.members_with_roles:
        if member.user_id is not None and member.user_id == user_id:
            return member

    return None
@@ -366,6 +367,7 @@ async def generate_key_fn(  # noqa: PLR0915
            prisma_client=prisma_client,
            user_api_key_cache=user_api_key_cache,
            parent_otel_span=user_api_key_dict.parent_otel_span,
            check_db_only=True,
        )
    except Exception as e:
        verbose_proxy_logger.debug(
@@ -1889,3 +1889,8 @@ class HttpHandlerRequestFields(TypedDict, total=False):
class ProviderSpecificHeader(TypedDict):
    custom_llm_provider: str
    extra_headers: dict


class SelectTokenizerResponse(TypedDict):
    type: Literal["openai_tokenizer", "huggingface_tokenizer"]
    tokenizer: Any
@@ -150,6 +150,7 @@ from litellm.types.utils import (
    ModelResponseStream,
    ProviderField,
    ProviderSpecificModelInfo,
    SelectTokenizerResponse,
    StreamingChoices,
    TextChoices,
    TextCompletionResponse,
@@ -1440,8 +1441,27 @@ def _select_tokenizer(


@lru_cache(maxsize=128)
def _select_tokenizer_helper(model: str):
def _select_tokenizer_helper(model: str) -> SelectTokenizerResponse:

    if litellm.disable_hf_tokenizer_download is True:
        return _return_openai_tokenizer(model)

    try:
        result = _return_huggingface_tokenizer(model)
        if result is not None:
            return result
    except Exception as e:
        verbose_logger.debug(f"Error selecting tokenizer: {e}")

    # default - tiktoken
    return _return_openai_tokenizer(model)


def _return_openai_tokenizer(model: str) -> SelectTokenizerResponse:
    return {"type": "openai_tokenizer", "tokenizer": encoding}


def _return_huggingface_tokenizer(model: str) -> Optional[SelectTokenizerResponse]:
    if model in litellm.cohere_models and "command-r" in model:
        # cohere
        cohere_tokenizer = Tokenizer.from_pretrained(
@@ -1460,14 +1480,8 @@ def _select_tokenizer_helper(model: str):
        elif "llama-3" in model.lower():
            tokenizer = Tokenizer.from_pretrained("Xenova/llama-3-tokenizer")
            return {"type": "huggingface_tokenizer", "tokenizer": tokenizer}
    except Exception as e:
        verbose_logger.debug(f"Error selecting tokenizer: {e}")

    # default - tiktoken
    return {
        "type": "openai_tokenizer",
        "tokenizer": encoding,
    }  # default to openai tokenizer
    else:
        return None


def encode(model="", text="", custom_tokenizer: Optional[dict] = None):
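To illustrate the new control flow (a sketch mirroring the unit test further down; it assumes `_select_tokenizer_helper` is importable from `litellm.utils`, which is where the test patches it):

```python
import litellm
from litellm.utils import _select_tokenizer_helper

# with the flag set, the HuggingFace download path is skipped entirely
litellm.disable_hf_tokenizer_download = True
result = _select_tokenizer_helper("any-model-name")  # placeholder model name
assert result["type"] == "openai_tokenizer"
```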
@@ -450,7 +450,7 @@ async def test_async_vertexai_response():
            or "32k" in model
            or "ultra" in model
            or "002" in model
            or "gemini-2.0-flash-thinking-exp" == model
            or "gemini-2.0-flash-thinking-exp" in model
        ):
            # our account does not have access to this model
            continue
@@ -492,7 +492,11 @@ async def test_async_vertexai_streaming_response():
    test_models += litellm.vertex_language_models  # always test gemini-pro
    for model in test_models:
        if model in VERTEX_MODELS_TO_NOT_TEST or (
            "gecko" in model or "32k" in model or "ultra" in model or "002" in model
            "gecko" in model
            or "32k" in model
            or "ultra" in model
            or "002" in model
            or "gemini-2.0-flash-thinking-exp" in model
        ):
            # our account does not have access to this model
            continue
@@ -459,3 +459,14 @@ class TestTokenizerSelection(unittest.TestCase):
        # Verify fallback to OpenAI tokenizer
        self.assertEqual(result["type"], "openai_tokenizer")
        self.assertEqual(result["tokenizer"], encoding)

    @patch("litellm.utils._return_huggingface_tokenizer")
    def test_disable_hf_tokenizer_download(self, mock_return_huggingface_tokenizer):
        # Use pytest.MonkeyPatch() directly instead of fixture
        monkeypatch = pytest.MonkeyPatch()
        monkeypatch.setattr(litellm, "disable_hf_tokenizer_download", True)

        result = _select_tokenizer_helper("grok-32r22r")
        mock_return_huggingface_tokenizer.assert_not_called()
        assert result["type"] == "openai_tokenizer"
        assert result["tokenizer"] == encoding
@@ -8,6 +8,7 @@ from datetime import datetime
from dotenv import load_dotenv
from fastapi import Request
from fastapi.routing import APIRoute
from unittest.mock import MagicMock, patch

load_dotenv()
import io
@@ -988,3 +989,28 @@ async def test_list_key_helper(prisma_client):
            user_id="admin",
        ),
    )


@pytest.mark.asyncio
@patch("litellm.proxy.management_endpoints.key_management_endpoints.get_team_object")
async def test_key_generate_always_db_team(mock_get_team_object):
    from litellm.proxy.management_endpoints.key_management_endpoints import (
        generate_key_fn,
    )

    setattr(litellm.proxy.proxy_server, "prisma_client", MagicMock())
    mock_get_team_object.return_value = None
    try:
        await generate_key_fn(
            data=GenerateKeyRequest(team_id="1234"),
            user_api_key_dict=UserAPIKeyAuth(
                user_role=LitellmUserRoles.PROXY_ADMIN,
                api_key="sk-1234",
                user_id="admin",
            ),
        )
    except Exception as e:
        print(f"Error: {e}")

    mock_get_team_object.assert_called_once()
    assert mock_get_team_object.call_args.kwargs["check_db_only"] == True