Doc updates + management endpoint fixes (#8138)

* Litellm dev 01 29 2025 p4 (#8107)

* fix(key_management_endpoints.py): always get db team

Fixes https://github.com/BerriAI/litellm/issues/7983

* test(test_key_management.py): add unit test enforcing check_db_only is always true on key generate checks

* test: fix test

* test: skip gemini thinking

* Litellm dev 01 29 2025 p3 (#8106)

* fix(__init__.py): reduce the size of __init__.py and the scope for errors by using the correct param

* refactor(__init__.py): refactor init by cleaning up redundant params

* refactor(__init__.py): move more constants into constants.py

clean up the root module

* refactor(__init__.py): more cleanup

* feat(__init__.py): expose new 'disable_hf_tokenizer_download' param

enables HF model usage in offline environments

* docs(config_settings.md): document new disable_hf_tokenizer_download param

* fix: fix linting error

* fix: fix unsafe comparison

* test: fix test

* docs(public_teams.md): add doc showing how to expose public teams for users to join

* docs: add beta disclaimer on public teams

* test: update tests
Krish Dholakia authored this commit on 2025-01-30 22:56:41 -08:00 (committed by GitHub)
commit 16b5de07af · parent 2eee7f978f
16 changed files with 428 additions and 349 deletions

View file

@@ -139,6 +139,7 @@ general_settings:
| disable_end_user_cost_tracking_prometheus_only | boolean | If true, turns off end user cost tracking on prometheus metrics only. |
| key_generation_settings | object | Restricts who can generate keys. [Further docs](./virtual_keys.md#restricting-key-generation) |
| disable_add_transform_inline_image_block | boolean | For Fireworks AI models - if true, turns off the auto-add of `#transform=inline` to the url of the image_url, if the model is not a vision model. |
| disable_hf_tokenizer_download | boolean | If true, it defaults to using the openai tokenizer for all models (including huggingface models). |
### general_settings - Reference
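The new `disable_hf_tokenizer_download` flag is also exposed as a module-level attribute, so it can be toggled directly from Python. A minimal sketch (the model name is illustrative, and `litellm.token_counter` is assumed as the usual token-counting entry point):

```python
import litellm

# Force the OpenAI (tiktoken) tokenizer for every model, so no Hugging Face
# tokenizer download is attempted - useful in offline environments.
litellm.disable_hf_tokenizer_download = True

# Token counting for a llama-2 style model now falls back to the OpenAI
# tokenizer instead of fetching "hf-internal-testing/llama-tokenizer" from the Hub.
num_tokens = litellm.token_counter(
    model="meta-llama/Llama-2-7b-chat-hf",
    messages=[{"role": "user", "content": "Hello, world!"}],
)
print(num_tokens)
```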

View file

@@ -0,0 +1,40 @@
# [BETA] Public Teams
Expose available teams that your users can join on signup.
<iframe width="840" height="500" src="https://www.loom.com/embed/7871ea15035a48d2a118b7486c2f7598?sid=267cd0ab-d92b-42fa-b97a-9f385ef8930c" frameborder="0" webkitallowfullscreen mozallowfullscreen allowfullscreen></iframe>
## Quick Start
1. Create a team on LiteLLM
```bash
curl -X POST '<PROXY_BASE_URL>/team/new' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer <MASTER_KEY>' \
-d '{"name": "My Team", "team_id": "team_id_1"}'
```
2. Expose the team to your users
```yaml
litellm_settings:
default_internal_user_params:
available_teams: ["team_id_1"] # 👈 Make team available to new SSO users
```
3. Test it!
```bash
curl -L -X POST 'http://0.0.0.0:4000/team/member_add' \
-H 'Authorization: Bearer sk-<USER_KEY>' \
-H 'Content-Type: application/json' \
--data-raw '{
"team_id": "team_id_1",
"member": [{"role": "user", "user_id": "my-test-user"}]
}'
```

View file

@@ -6,11 +6,6 @@ import TabItem from '@theme/TabItem';
Create keys, track spend, add models without worrying about the config / CRUD endpoints.
:::info
This is in beta, so things may change. If you have feedback, [let us know](https://discord.com/invite/wuPM9dRgDw)
:::
<Image img={require('../../img/litellm_ui_create_key.png')} />

View file

@@ -98,6 +98,7 @@ const sidebars = {
"proxy/ui",
"proxy/admin_ui_sso",
"proxy/self_serve",
"proxy/public_teams",
"proxy/custom_sso"
],
},

View file

@@ -9,7 +9,12 @@ from typing import Callable, List, Optional, Dict, Union, Any, Literal, get_args
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.caching.caching import Cache, DualCache, RedisCache, InMemoryCache
from litellm.types.llms.bedrock import COHERE_EMBEDDING_INPUT_TYPES
-from litellm.types.utils import ImageObject, BudgetConfig
+from litellm.types.utils import (
+    ImageObject,
+    BudgetConfig,
+    all_litellm_params,
+    all_litellm_params as _litellm_completion_params,
+)  # maintain backwards compatibility for root param
from litellm._logging import (
set_verbose,
_turn_on_debug,
@@ -29,6 +34,24 @@ from litellm.constants import (
LITELLM_CHAT_PROVIDERS,
HUMANLOOP_PROMPT_CACHE_TTL_SECONDS,
OPENAI_CHAT_COMPLETION_PARAMS,
OPENAI_CHAT_COMPLETION_PARAMS as _openai_completion_params, # backwards compatibility
OPENAI_FINISH_REASONS,
OPENAI_FINISH_REASONS as _openai_finish_reasons, # backwards compatibility
openai_compatible_endpoints,
openai_compatible_providers,
openai_text_completion_compatible_providers,
_openai_like_providers,
replicate_models,
clarifai_models,
huggingface_models,
empower_models,
together_ai_models,
baseten_models,
REPEATED_STREAMING_CHUNK_LIMIT,
request_timeout,
open_ai_embedding_models,
cohere_embedding_models,
bedrock_embedding_models,
)
from litellm.types.guardrails import GuardrailItem
from litellm.proxy._types import (
@@ -217,75 +240,8 @@ default_soft_budget: float = (
50.0 # by default all litellm proxy keys have a soft budget of 50.0
)
forward_traceparent_to_llm_provider: bool = False
_openai_finish_reasons = ["stop", "length", "function_call", "content_filter", "null"]
_openai_completion_params = [
"functions",
"function_call",
"temperature",
"temperature",
"top_p",
"n",
"stream",
"stop",
"max_tokens",
"presence_penalty",
"frequency_penalty",
"logit_bias",
"user",
"request_timeout",
"api_base",
"api_version",
"api_key",
"deployment_id",
"organization",
"base_url",
"default_headers",
"timeout",
"response_format",
"seed",
"tools",
"tool_choice",
"max_retries",
]
_litellm_completion_params = [
"metadata",
"acompletion",
"caching",
"mock_response",
"api_key",
"api_version",
"api_base",
"force_timeout",
"logger_fn",
"verbose",
"custom_llm_provider",
"litellm_logging_obj",
"litellm_call_id",
"use_client",
"id",
"fallbacks",
"azure",
"headers",
"model_list",
"num_retries",
"context_window_fallback_dict",
"roles",
"final_prompt_value",
"bos_token",
"eos_token",
"request_timeout",
"complete_response",
"self",
"client",
"rpm",
"tpm",
"input_cost_per_token",
"output_cost_per_token",
"hf_model_name",
"model_info",
"proxy_server_request",
"preset_cache_key",
]
_current_cost = 0.0 # private variable, used if max budget is set
error_logs: Dict = {}
add_function_to_prompt: bool = (
@@ -318,11 +274,8 @@ disable_end_user_cost_tracking_prometheus_only: Optional[bool] = None
custom_prometheus_metadata_labels: List[str] = []
#### REQUEST PRIORITIZATION ####
priority_reservation: Optional[Dict[str, float]] = None
#### RELIABILITY ####
REPEATED_STREAMING_CHUNK_LIMIT = 100 # catch if model starts looping the same chunk while streaming. Uses high default to prevent false positives.
#### Networking settings ####
request_timeout: float = 6000 # time in seconds
force_ipv4: bool = (
False # when True, litellm will force ipv4 for all LLM requests. Some users have seen httpx ConnectionError when using ipv6.
)
@@ -352,39 +305,7 @@ _key_management_settings: KeyManagementSettings = KeyManagementSettings()
#### PII MASKING ####
output_parse_pii: bool = False
#############################################
from litellm.litellm_core_utils.get_model_cost_map import get_model_cost_map
def get_model_cost_map(url: str):
if (
os.getenv("LITELLM_LOCAL_MODEL_COST_MAP", False) == True
or os.getenv("LITELLM_LOCAL_MODEL_COST_MAP", False) == "True"
):
import importlib.resources
import json
with importlib.resources.open_text(
"litellm", "model_prices_and_context_window_backup.json"
) as f:
content = json.load(f)
return content
try:
response = httpx.get(
url, timeout=5
) # set a 5 second timeout for the get request
response.raise_for_status() # Raise an exception if the request is unsuccessful
content = response.json()
return content
except Exception:
import importlib.resources
import json
with importlib.resources.open_text(
"litellm", "model_prices_and_context_window_backup.json"
) as f:
content = json.load(f)
return content
model_cost = get_model_cost_map(url=model_cost_map_url)
custom_prompt_dict: Dict[str, dict] = {}
@@ -446,7 +367,6 @@ cohere_chat_models: List = []
mistral_chat_models: List = []
text_completion_codestral_models: List = []
anthropic_models: List = []
empower_models: List = []
openrouter_models: List = []
vertex_language_models: List = []
vertex_vision_models: List = []
@@ -641,202 +561,8 @@ def add_known_models():
add_known_models()
# known openai compatible endpoints - we'll eventually move this list to the model_prices_and_context_window.json dictionary
openai_compatible_endpoints: List = [
"api.perplexity.ai",
"api.endpoints.anyscale.com/v1",
"api.deepinfra.com/v1/openai",
"api.mistral.ai/v1",
"codestral.mistral.ai/v1/chat/completions",
"codestral.mistral.ai/v1/fim/completions",
"api.groq.com/openai/v1",
"https://integrate.api.nvidia.com/v1",
"api.deepseek.com/v1",
"api.together.xyz/v1",
"app.empower.dev/api/v1",
"https://api.friendli.ai/serverless/v1",
"api.sambanova.ai/v1",
"api.x.ai/v1",
"api.galadriel.ai/v1",
]
# this is maintained for Exception Mapping
openai_compatible_providers: List = [
"anyscale",
"mistral",
"groq",
"nvidia_nim",
"cerebras",
"sambanova",
"ai21_chat",
"ai21",
"volcengine",
"codestral",
"deepseek",
"deepinfra",
"perplexity",
"xinference",
"xai",
"together_ai",
"fireworks_ai",
"empower",
"friendliai",
"azure_ai",
"github",
"litellm_proxy",
"hosted_vllm",
"lm_studio",
"galadriel",
]
openai_text_completion_compatible_providers: List = (
[ # providers that support `/v1/completions`
"together_ai",
"fireworks_ai",
"hosted_vllm",
]
)
_openai_like_providers: List = [
"predibase",
"databricks",
"watsonx",
] # private helper. similar to openai but require some custom auth / endpoint handling, so can't use the openai sdk
# well supported replicate llms
replicate_models: List = [
# llama replicate supported LLMs
"replicate/llama-2-70b-chat:2796ee9483c3fd7aa2e171d38f4ca12251a30609463dcfd4cd76703f22e96cdf",
"a16z-infra/llama-2-13b-chat:2a7f981751ec7fdf87b5b91ad4db53683a98082e9ff7bfd12c8cd5ea85980a52",
"meta/codellama-13b:1c914d844307b0588599b8393480a3ba917b660c7e9dfae681542b5325f228db",
# Vicuna
"replicate/vicuna-13b:6282abe6a492de4145d7bb601023762212f9ddbbe78278bd6771c8b3b2f2a13b",
"joehoover/instructblip-vicuna13b:c4c54e3c8c97cd50c2d2fec9be3b6065563ccf7d43787fb99f84151b867178fe",
# Flan T-5
"daanelson/flan-t5-large:ce962b3f6792a57074a601d3979db5839697add2e4e02696b3ced4c022d4767f",
# Others
"replicate/dolly-v2-12b:ef0e1aefc61f8e096ebe4db6b2bacc297daf2ef6899f0f7e001ec445893500e5",
"replit/replit-code-v1-3b:b84f4c074b807211cd75e3e8b1589b6399052125b4c27106e43d47189e8415ad",
]
clarifai_models: List = [
"clarifai/meta.Llama-3.Llama-3-8B-Instruct",
"clarifai/gcp.generate.gemma-1_1-7b-it",
"clarifai/mistralai.completion.mixtral-8x22B",
"clarifai/cohere.generate.command-r-plus",
"clarifai/databricks.drbx.dbrx-instruct",
"clarifai/mistralai.completion.mistral-large",
"clarifai/mistralai.completion.mistral-medium",
"clarifai/mistralai.completion.mistral-small",
"clarifai/mistralai.completion.mixtral-8x7B-Instruct-v0_1",
"clarifai/gcp.generate.gemma-2b-it",
"clarifai/gcp.generate.gemma-7b-it",
"clarifai/deci.decilm.deciLM-7B-instruct",
"clarifai/mistralai.completion.mistral-7B-Instruct",
"clarifai/gcp.generate.gemini-pro",
"clarifai/anthropic.completion.claude-v1",
"clarifai/anthropic.completion.claude-instant-1_2",
"clarifai/anthropic.completion.claude-instant",
"clarifai/anthropic.completion.claude-v2",
"clarifai/anthropic.completion.claude-2_1",
"clarifai/meta.Llama-2.codeLlama-70b-Python",
"clarifai/meta.Llama-2.codeLlama-70b-Instruct",
"clarifai/openai.completion.gpt-3_5-turbo-instruct",
"clarifai/meta.Llama-2.llama2-7b-chat",
"clarifai/meta.Llama-2.llama2-13b-chat",
"clarifai/meta.Llama-2.llama2-70b-chat",
"clarifai/openai.chat-completion.gpt-4-turbo",
"clarifai/microsoft.text-generation.phi-2",
"clarifai/meta.Llama-2.llama2-7b-chat-vllm",
"clarifai/upstage.solar.solar-10_7b-instruct",
"clarifai/openchat.openchat.openchat-3_5-1210",
"clarifai/togethercomputer.stripedHyena.stripedHyena-Nous-7B",
"clarifai/gcp.generate.text-bison",
"clarifai/meta.Llama-2.llamaGuard-7b",
"clarifai/fblgit.una-cybertron.una-cybertron-7b-v2",
"clarifai/openai.chat-completion.GPT-4",
"clarifai/openai.chat-completion.GPT-3_5-turbo",
"clarifai/ai21.complete.Jurassic2-Grande",
"clarifai/ai21.complete.Jurassic2-Grande-Instruct",
"clarifai/ai21.complete.Jurassic2-Jumbo-Instruct",
"clarifai/ai21.complete.Jurassic2-Jumbo",
"clarifai/ai21.complete.Jurassic2-Large",
"clarifai/cohere.generate.cohere-generate-command",
"clarifai/wizardlm.generate.wizardCoder-Python-34B",
"clarifai/wizardlm.generate.wizardLM-70B",
"clarifai/tiiuae.falcon.falcon-40b-instruct",
"clarifai/togethercomputer.RedPajama.RedPajama-INCITE-7B-Chat",
"clarifai/gcp.generate.code-gecko",
"clarifai/gcp.generate.code-bison",
"clarifai/mistralai.completion.mistral-7B-OpenOrca",
"clarifai/mistralai.completion.openHermes-2-mistral-7B",
"clarifai/wizardlm.generate.wizardLM-13B",
"clarifai/huggingface-research.zephyr.zephyr-7B-alpha",
"clarifai/wizardlm.generate.wizardCoder-15B",
"clarifai/microsoft.text-generation.phi-1_5",
"clarifai/databricks.Dolly-v2.dolly-v2-12b",
"clarifai/bigcode.code.StarCoder",
"clarifai/salesforce.xgen.xgen-7b-8k-instruct",
"clarifai/mosaicml.mpt.mpt-7b-instruct",
"clarifai/anthropic.completion.claude-3-opus",
"clarifai/anthropic.completion.claude-3-sonnet",
"clarifai/gcp.generate.gemini-1_5-pro",
"clarifai/gcp.generate.imagen-2",
"clarifai/salesforce.blip.general-english-image-caption-blip-2",
]
huggingface_models: List = [
"meta-llama/Llama-2-7b-hf",
"meta-llama/Llama-2-7b-chat-hf",
"meta-llama/Llama-2-13b-hf",
"meta-llama/Llama-2-13b-chat-hf",
"meta-llama/Llama-2-70b-hf",
"meta-llama/Llama-2-70b-chat-hf",
"meta-llama/Llama-2-7b",
"meta-llama/Llama-2-7b-chat",
"meta-llama/Llama-2-13b",
"meta-llama/Llama-2-13b-chat",
"meta-llama/Llama-2-70b",
"meta-llama/Llama-2-70b-chat",
] # these have been tested on extensively. But by default all text2text-generation and text-generation models are supported by liteLLM. - https://docs.litellm.ai/docs/providers
empower_models = [
"empower/empower-functions",
"empower/empower-functions-small",
]
together_ai_models: List = [
# llama llms - chat
"togethercomputer/llama-2-70b-chat",
# llama llms - language / instruct
"togethercomputer/llama-2-70b",
"togethercomputer/LLaMA-2-7B-32K",
"togethercomputer/Llama-2-7B-32K-Instruct",
"togethercomputer/llama-2-7b",
# falcon llms
"togethercomputer/falcon-40b-instruct",
"togethercomputer/falcon-7b-instruct",
# alpaca
"togethercomputer/alpaca-7b",
# chat llms
"HuggingFaceH4/starchat-alpha",
# code llms
"togethercomputer/CodeLlama-34b",
"togethercomputer/CodeLlama-34b-Instruct",
"togethercomputer/CodeLlama-34b-Python",
"defog/sqlcoder",
"NumbersStation/nsql-llama-2-7B",
"WizardLM/WizardCoder-15B-V1.0",
"WizardLM/WizardCoder-Python-34B-V1.0",
# language llms
"NousResearch/Nous-Hermes-Llama2-13b",
"Austism/chronos-hermes-13b",
"upstage/SOLAR-0-70b-16bit",
"WizardLM/WizardLM-70B-V1.0",
] # supports all together ai models, just pass in the model id e.g. completion(model="together_computer/replit_code_3b",...)
baseten_models: List = [
"qvv0xeq",
"q841o8w",
"31dxrj3",
] # FALCON 7B # WizardLM # Mosaic ML
# used for Cost Tracking & Token counting
@@ -980,20 +706,6 @@ longer_context_model_fallback_dict: dict = {
}
####### EMBEDDING MODELS ###################
open_ai_embedding_models: List = ["text-embedding-ada-002"]
cohere_embedding_models: List = [
"embed-english-v3.0",
"embed-english-light-v3.0",
"embed-multilingual-v3.0",
"embed-english-v2.0",
"embed-english-light-v2.0",
"embed-multilingual-v2.0",
]
bedrock_embedding_models: List = [
"amazon.titan-embed-text-v1",
"cohere.embed-english-v3",
"cohere.embed-multilingual-v3",
]
all_embedding_models = (
open_ai_embedding_models
@@ -1277,4 +989,7 @@ custom_provider_map: List[CustomLLMItem] = []
_custom_providers: List[str] = (
[]
) # internal helper util, used to track names of custom providers
disable_hf_tokenizer_download: Optional[bool] = (
None # disable huggingface tokenizer download. Defaults to openai cl100k_base
)
global_disable_no_log_param: bool = False
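The aliases imported at the top of this diff (`_openai_completion_params`, `_openai_finish_reasons`, and the re-exported constants) keep old root-level references working. A small sketch of what that buys, assuming this version of the package is installed:

```python
import litellm
from litellm.constants import OPENAI_CHAT_COMPLETION_PARAMS, OPENAI_FINISH_REASONS

# The legacy root-level names now point at the values that moved to constants.py,
# so existing code that referenced them keeps working after the cleanup.
assert litellm._openai_completion_params is OPENAI_CHAT_COMPLETION_PARAMS
assert litellm._openai_finish_reasons is OPENAI_FINISH_REASONS
assert litellm.REPEATED_STREAMING_CHUNK_LIMIT == 100  # moved, still re-exported at the root
```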

View file

@@ -1,3 +1,5 @@
from typing import List
ROUTER_MAX_FALLBACKS = 5
DEFAULT_BATCH_SIZE = 512
DEFAULT_FLUSH_INTERVAL_SECONDS = 5
@@ -12,6 +14,11 @@ DEFAULT_IMAGE_TOKEN_COUNT = 250
DEFAULT_IMAGE_WIDTH = 300
DEFAULT_IMAGE_HEIGHT = 300
SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD = 1000 # Minimum number of requests to consider "reasonable traffic". Used for single-deployment cooldown logic.
#### RELIABILITY ####
REPEATED_STREAMING_CHUNK_LIMIT = 100 # catch if model starts looping the same chunk while streaming. Uses high default to prevent false positives.
#### Networking settings ####
request_timeout: float = 6000 # time in seconds
LITELLM_CHAT_PROVIDERS = [
"openai",
"openai_like",
@@ -113,6 +120,222 @@ OPENAI_CHAT_COMPLETION_PARAMS = [
"top_logprobs",
"extra_headers",
]
openai_compatible_endpoints: List = [
"api.perplexity.ai",
"api.endpoints.anyscale.com/v1",
"api.deepinfra.com/v1/openai",
"api.mistral.ai/v1",
"codestral.mistral.ai/v1/chat/completions",
"codestral.mistral.ai/v1/fim/completions",
"api.groq.com/openai/v1",
"https://integrate.api.nvidia.com/v1",
"api.deepseek.com/v1",
"api.together.xyz/v1",
"app.empower.dev/api/v1",
"https://api.friendli.ai/serverless/v1",
"api.sambanova.ai/v1",
"api.x.ai/v1",
"api.galadriel.ai/v1",
]
openai_compatible_providers: List = [
"anyscale",
"mistral",
"groq",
"nvidia_nim",
"cerebras",
"sambanova",
"ai21_chat",
"ai21",
"volcengine",
"codestral",
"deepseek",
"deepinfra",
"perplexity",
"xinference",
"xai",
"together_ai",
"fireworks_ai",
"empower",
"friendliai",
"azure_ai",
"github",
"litellm_proxy",
"hosted_vllm",
"lm_studio",
"galadriel",
]
openai_text_completion_compatible_providers: List = (
[ # providers that support `/v1/completions`
"together_ai",
"fireworks_ai",
"hosted_vllm",
]
)
_openai_like_providers: List = [
"predibase",
"databricks",
"watsonx",
] # private helper. similar to openai but require some custom auth / endpoint handling, so can't use the openai sdk
# well supported replicate llms
replicate_models: List = [
# llama replicate supported LLMs
"replicate/llama-2-70b-chat:2796ee9483c3fd7aa2e171d38f4ca12251a30609463dcfd4cd76703f22e96cdf",
"a16z-infra/llama-2-13b-chat:2a7f981751ec7fdf87b5b91ad4db53683a98082e9ff7bfd12c8cd5ea85980a52",
"meta/codellama-13b:1c914d844307b0588599b8393480a3ba917b660c7e9dfae681542b5325f228db",
# Vicuna
"replicate/vicuna-13b:6282abe6a492de4145d7bb601023762212f9ddbbe78278bd6771c8b3b2f2a13b",
"joehoover/instructblip-vicuna13b:c4c54e3c8c97cd50c2d2fec9be3b6065563ccf7d43787fb99f84151b867178fe",
# Flan T-5
"daanelson/flan-t5-large:ce962b3f6792a57074a601d3979db5839697add2e4e02696b3ced4c022d4767f",
# Others
"replicate/dolly-v2-12b:ef0e1aefc61f8e096ebe4db6b2bacc297daf2ef6899f0f7e001ec445893500e5",
"replit/replit-code-v1-3b:b84f4c074b807211cd75e3e8b1589b6399052125b4c27106e43d47189e8415ad",
]
clarifai_models: List = [
"clarifai/meta.Llama-3.Llama-3-8B-Instruct",
"clarifai/gcp.generate.gemma-1_1-7b-it",
"clarifai/mistralai.completion.mixtral-8x22B",
"clarifai/cohere.generate.command-r-plus",
"clarifai/databricks.drbx.dbrx-instruct",
"clarifai/mistralai.completion.mistral-large",
"clarifai/mistralai.completion.mistral-medium",
"clarifai/mistralai.completion.mistral-small",
"clarifai/mistralai.completion.mixtral-8x7B-Instruct-v0_1",
"clarifai/gcp.generate.gemma-2b-it",
"clarifai/gcp.generate.gemma-7b-it",
"clarifai/deci.decilm.deciLM-7B-instruct",
"clarifai/mistralai.completion.mistral-7B-Instruct",
"clarifai/gcp.generate.gemini-pro",
"clarifai/anthropic.completion.claude-v1",
"clarifai/anthropic.completion.claude-instant-1_2",
"clarifai/anthropic.completion.claude-instant",
"clarifai/anthropic.completion.claude-v2",
"clarifai/anthropic.completion.claude-2_1",
"clarifai/meta.Llama-2.codeLlama-70b-Python",
"clarifai/meta.Llama-2.codeLlama-70b-Instruct",
"clarifai/openai.completion.gpt-3_5-turbo-instruct",
"clarifai/meta.Llama-2.llama2-7b-chat",
"clarifai/meta.Llama-2.llama2-13b-chat",
"clarifai/meta.Llama-2.llama2-70b-chat",
"clarifai/openai.chat-completion.gpt-4-turbo",
"clarifai/microsoft.text-generation.phi-2",
"clarifai/meta.Llama-2.llama2-7b-chat-vllm",
"clarifai/upstage.solar.solar-10_7b-instruct",
"clarifai/openchat.openchat.openchat-3_5-1210",
"clarifai/togethercomputer.stripedHyena.stripedHyena-Nous-7B",
"clarifai/gcp.generate.text-bison",
"clarifai/meta.Llama-2.llamaGuard-7b",
"clarifai/fblgit.una-cybertron.una-cybertron-7b-v2",
"clarifai/openai.chat-completion.GPT-4",
"clarifai/openai.chat-completion.GPT-3_5-turbo",
"clarifai/ai21.complete.Jurassic2-Grande",
"clarifai/ai21.complete.Jurassic2-Grande-Instruct",
"clarifai/ai21.complete.Jurassic2-Jumbo-Instruct",
"clarifai/ai21.complete.Jurassic2-Jumbo",
"clarifai/ai21.complete.Jurassic2-Large",
"clarifai/cohere.generate.cohere-generate-command",
"clarifai/wizardlm.generate.wizardCoder-Python-34B",
"clarifai/wizardlm.generate.wizardLM-70B",
"clarifai/tiiuae.falcon.falcon-40b-instruct",
"clarifai/togethercomputer.RedPajama.RedPajama-INCITE-7B-Chat",
"clarifai/gcp.generate.code-gecko",
"clarifai/gcp.generate.code-bison",
"clarifai/mistralai.completion.mistral-7B-OpenOrca",
"clarifai/mistralai.completion.openHermes-2-mistral-7B",
"clarifai/wizardlm.generate.wizardLM-13B",
"clarifai/huggingface-research.zephyr.zephyr-7B-alpha",
"clarifai/wizardlm.generate.wizardCoder-15B",
"clarifai/microsoft.text-generation.phi-1_5",
"clarifai/databricks.Dolly-v2.dolly-v2-12b",
"clarifai/bigcode.code.StarCoder",
"clarifai/salesforce.xgen.xgen-7b-8k-instruct",
"clarifai/mosaicml.mpt.mpt-7b-instruct",
"clarifai/anthropic.completion.claude-3-opus",
"clarifai/anthropic.completion.claude-3-sonnet",
"clarifai/gcp.generate.gemini-1_5-pro",
"clarifai/gcp.generate.imagen-2",
"clarifai/salesforce.blip.general-english-image-caption-blip-2",
]
huggingface_models: List = [
"meta-llama/Llama-2-7b-hf",
"meta-llama/Llama-2-7b-chat-hf",
"meta-llama/Llama-2-13b-hf",
"meta-llama/Llama-2-13b-chat-hf",
"meta-llama/Llama-2-70b-hf",
"meta-llama/Llama-2-70b-chat-hf",
"meta-llama/Llama-2-7b",
"meta-llama/Llama-2-7b-chat",
"meta-llama/Llama-2-13b",
"meta-llama/Llama-2-13b-chat",
"meta-llama/Llama-2-70b",
"meta-llama/Llama-2-70b-chat",
] # these have been tested on extensively. But by default all text2text-generation and text-generation models are supported by liteLLM. - https://docs.litellm.ai/docs/providers
empower_models = [
"empower/empower-functions",
"empower/empower-functions-small",
]
together_ai_models: List = [
# llama llms - chat
"togethercomputer/llama-2-70b-chat",
# llama llms - language / instruct
"togethercomputer/llama-2-70b",
"togethercomputer/LLaMA-2-7B-32K",
"togethercomputer/Llama-2-7B-32K-Instruct",
"togethercomputer/llama-2-7b",
# falcon llms
"togethercomputer/falcon-40b-instruct",
"togethercomputer/falcon-7b-instruct",
# alpaca
"togethercomputer/alpaca-7b",
# chat llms
"HuggingFaceH4/starchat-alpha",
# code llms
"togethercomputer/CodeLlama-34b",
"togethercomputer/CodeLlama-34b-Instruct",
"togethercomputer/CodeLlama-34b-Python",
"defog/sqlcoder",
"NumbersStation/nsql-llama-2-7B",
"WizardLM/WizardCoder-15B-V1.0",
"WizardLM/WizardCoder-Python-34B-V1.0",
# language llms
"NousResearch/Nous-Hermes-Llama2-13b",
"Austism/chronos-hermes-13b",
"upstage/SOLAR-0-70b-16bit",
"WizardLM/WizardLM-70B-V1.0",
] # supports all together ai models, just pass in the model id e.g. completion(model="together_computer/replit_code_3b",...)
baseten_models: List = [
"qvv0xeq",
"q841o8w",
"31dxrj3",
] # FALCON 7B # WizardLM # Mosaic ML
open_ai_embedding_models: List = ["text-embedding-ada-002"]
cohere_embedding_models: List = [
"embed-english-v3.0",
"embed-english-light-v3.0",
"embed-multilingual-v3.0",
"embed-english-v2.0",
"embed-english-light-v2.0",
"embed-multilingual-v2.0",
]
bedrock_embedding_models: List = [
"amazon.titan-embed-text-v1",
"cohere.embed-english-v3",
"cohere.embed-multilingual-v3",
]
OPENAI_FINISH_REASONS = ["stop", "length", "function_call", "content_filter", "null"]
HUMANLOOP_PROMPT_CACHE_TTL_SECONDS = 60 # 1 minute
RESPONSE_FORMAT_TOOL_NAME = "json_tool_call" # default tool name used when converting response format to tool call
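Code that wants the canonical values can now import them from `litellm/constants.py` directly; a minimal sketch using a few of the constants added in this hunk:

```python
from litellm.constants import (
    OPENAI_FINISH_REASONS,
    REPEATED_STREAMING_CHUNK_LIMIT,
    request_timeout,
    cohere_embedding_models,
)

# Same values the root __init__.py used to define inline before this refactor.
print(OPENAI_FINISH_REASONS)           # ['stop', 'length', 'function_call', 'content_filter', 'null']
print(REPEATED_STREAMING_CHUNK_LIMIT)  # 100
print(request_timeout)                 # 6000 (seconds)
print(len(cohere_embedding_models))    # 6
```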

View file

@@ -0,0 +1,45 @@
"""
Pulls the cost + context window + provider route for known models from https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json
This can be disabled by setting the LITELLM_LOCAL_MODEL_COST_MAP environment variable to True.
```
export LITELLM_LOCAL_MODEL_COST_MAP=True
```
"""
import os
import httpx
def get_model_cost_map(url: str):
if (
os.getenv("LITELLM_LOCAL_MODEL_COST_MAP", False)
or os.getenv("LITELLM_LOCAL_MODEL_COST_MAP", False) == "True"
):
import importlib.resources
import json
with importlib.resources.open_text(
"litellm", "model_prices_and_context_window_backup.json"
) as f:
content = json.load(f)
return content
try:
response = httpx.get(
url, timeout=5
) # set a 5 second timeout for the get request
response.raise_for_status() # Raise an exception if the request is unsuccessful
content = response.json()
return content
except Exception:
import importlib.resources
import json
with importlib.resources.open_text(
"litellm", "model_prices_and_context_window_backup.json"
) as f:
content = json.load(f)
return content
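A minimal usage sketch of the extracted helper, assuming the same default URL the root module passes in (`litellm.model_cost_map_url`):

```python
import litellm
from litellm.litellm_core_utils.get_model_cost_map import get_model_cost_map

# Fetches the remote price / context-window map; falls back to the bundled
# backup JSON on any network error, or when LITELLM_LOCAL_MODEL_COST_MAP is set.
cost_map = get_model_cost_map(url=litellm.model_cost_map_url)
print(cost_map.get("gpt-4o", {}).get("max_input_tokens"))
```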

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@@ -61,6 +61,7 @@ def _get_user_in_team(
for member in team_table.members_with_roles:
if member.user_id is not None and member.user_id == user_id:
return member
return None
@@ -366,6 +367,7 @@ async def generate_key_fn( # noqa: PLR0915
prisma_client=prisma_client,
user_api_key_cache=user_api_key_cache,
parent_otel_span=user_api_key_dict.parent_otel_span,
check_db_only=True,
)
except Exception as e:
verbose_proxy_logger.debug(

View file

@@ -1889,3 +1889,8 @@ class HttpHandlerRequestFields(TypedDict, total=False):
class ProviderSpecificHeader(TypedDict):
custom_llm_provider: str
extra_headers: dict
class SelectTokenizerResponse(TypedDict):
type: Literal["openai_tokenizer", "huggingface_tokenizer"]
tokenizer: Any
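A minimal sketch of how downstream code can lean on the new TypedDict for static type checking (the helper function here is hypothetical, not part of the commit):

```python
from litellm.types.utils import SelectTokenizerResponse

def tokenizer_kind(resp: SelectTokenizerResponse) -> str:
    # Either "openai_tokenizer" or "huggingface_tokenizer", per the Literal type.
    return resp["type"]
```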

View file

@@ -150,6 +150,7 @@ from litellm.types.utils import (
ModelResponseStream,
ProviderField,
ProviderSpecificModelInfo,
SelectTokenizerResponse,
StreamingChoices,
TextChoices,
TextCompletionResponse,
@@ -1440,34 +1441,47 @@ def _select_tokenizer(
@lru_cache(maxsize=128)
-def _select_tokenizer_helper(model: str):
+def _select_tokenizer_helper(model: str) -> SelectTokenizerResponse:
+    if litellm.disable_hf_tokenizer_download is True:
+        return _return_openai_tokenizer(model)
    try:
-        if model in litellm.cohere_models and "command-r" in model:
-            # cohere
-            cohere_tokenizer = Tokenizer.from_pretrained(
-                "Xenova/c4ai-command-r-v01-tokenizer"
-            )
-            return {"type": "huggingface_tokenizer", "tokenizer": cohere_tokenizer}
-        # anthropic
-        elif model in litellm.anthropic_models and "claude-3" not in model:
-            claude_tokenizer = Tokenizer.from_str(claude_json_str)
-            return {"type": "huggingface_tokenizer", "tokenizer": claude_tokenizer}
-        # llama2
-        elif "llama-2" in model.lower() or "replicate" in model.lower():
-            tokenizer = Tokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
-            return {"type": "huggingface_tokenizer", "tokenizer": tokenizer}
-        # llama3
-        elif "llama-3" in model.lower():
-            tokenizer = Tokenizer.from_pretrained("Xenova/llama-3-tokenizer")
-            return {"type": "huggingface_tokenizer", "tokenizer": tokenizer}
+        result = _return_huggingface_tokenizer(model)
+        if result is not None:
+            return result
    except Exception as e:
        verbose_logger.debug(f"Error selecting tokenizer: {e}")
    # default - tiktoken
-    return {
-        "type": "openai_tokenizer",
-        "tokenizer": encoding,
-    }  # default to openai tokenizer
+    return _return_openai_tokenizer(model)
+
+
+def _return_openai_tokenizer(model: str) -> SelectTokenizerResponse:
+    return {"type": "openai_tokenizer", "tokenizer": encoding}
+
+
+def _return_huggingface_tokenizer(model: str) -> Optional[SelectTokenizerResponse]:
+    if model in litellm.cohere_models and "command-r" in model:
+        # cohere
+        cohere_tokenizer = Tokenizer.from_pretrained(
+            "Xenova/c4ai-command-r-v01-tokenizer"
+        )
+        return {"type": "huggingface_tokenizer", "tokenizer": cohere_tokenizer}
+    # anthropic
+    elif model in litellm.anthropic_models and "claude-3" not in model:
+        claude_tokenizer = Tokenizer.from_str(claude_json_str)
+        return {"type": "huggingface_tokenizer", "tokenizer": claude_tokenizer}
+    # llama2
+    elif "llama-2" in model.lower() or "replicate" in model.lower():
+        tokenizer = Tokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
+        return {"type": "huggingface_tokenizer", "tokenizer": tokenizer}
+    # llama3
+    elif "llama-3" in model.lower():
+        tokenizer = Tokenizer.from_pretrained("Xenova/llama-3-tokenizer")
+        return {"type": "huggingface_tokenizer", "tokenizer": tokenizer}
+    else:
+        return None

def encode(model="", text="", custom_tokenizer: Optional[dict] = None):
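A small sketch of the new behavior, mirroring the unit test added further down: with the flag set, the helper never reaches the Hugging Face branch and returns the tiktoken-backed tokenizer.

```python
import litellm
from litellm.utils import _select_tokenizer_helper

litellm.disable_hf_tokenizer_download = True

# No Hugging Face download is attempted, even for a model name that would
# normally map to an HF tokenizer (e.g. a llama-2 model).
result = _select_tokenizer_helper("meta-llama/Llama-2-7b-chat-hf")
assert result["type"] == "openai_tokenizer"
```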

View file

@@ -450,7 +450,7 @@ async def test_async_vertexai_response():
or "32k" in model
or "ultra" in model
or "002" in model
-or "gemini-2.0-flash-thinking-exp" == model
+or "gemini-2.0-flash-thinking-exp" in model
):
# our account does not have access to this model
continue
@@ -492,7 +492,11 @@ async def test_async_vertexai_streaming_response():
test_models += litellm.vertex_language_models # always test gemini-pro
for model in test_models:
if model in VERTEX_MODELS_TO_NOT_TEST or (
-"gecko" in model or "32k" in model or "ultra" in model or "002" in model
+"gecko" in model
+or "32k" in model
+or "ultra" in model
+or "002" in model
+or "gemini-2.0-flash-thinking-exp" in model
):
# our account does not have access to this model
continue

View file

@@ -459,3 +459,14 @@ class TestTokenizerSelection(unittest.TestCase):
# Verify fallback to OpenAI tokenizer
self.assertEqual(result["type"], "openai_tokenizer")
self.assertEqual(result["tokenizer"], encoding)
@patch("litellm.utils._return_huggingface_tokenizer")
def test_disable_hf_tokenizer_download(self, mock_return_huggingface_tokenizer):
# Use pytest.MonkeyPatch() directly instead of fixture
monkeypatch = pytest.MonkeyPatch()
monkeypatch.setattr(litellm, "disable_hf_tokenizer_download", True)
result = _select_tokenizer_helper("grok-32r22r")
mock_return_huggingface_tokenizer.assert_not_called()
assert result["type"] == "openai_tokenizer"
assert result["tokenizer"] == encoding

View file

@@ -8,6 +8,7 @@ from datetime import datetime
from dotenv import load_dotenv
from fastapi import Request
from fastapi.routing import APIRoute
from unittest.mock import MagicMock, patch
load_dotenv()
import io
@@ -988,3 +989,28 @@ async def test_list_key_helper(prisma_client):
user_id="admin",
),
)
@pytest.mark.asyncio
@patch("litellm.proxy.management_endpoints.key_management_endpoints.get_team_object")
async def test_key_generate_always_db_team(mock_get_team_object):
from litellm.proxy.management_endpoints.key_management_endpoints import (
generate_key_fn,
)
setattr(litellm.proxy.proxy_server, "prisma_client", MagicMock())
mock_get_team_object.return_value = None
try:
await generate_key_fn(
data=GenerateKeyRequest(team_id="1234"),
user_api_key_dict=UserAPIKeyAuth(
user_role=LitellmUserRoles.PROXY_ADMIN,
api_key="sk-1234",
user_id="admin",
),
)
except Exception as e:
print(f"Error: {e}")
mock_get_team_object.assert_called_once()
assert mock_get_team_object.call_args.kwargs["check_db_only"] == True