Mirror of https://github.com/BerriAI/litellm.git (synced 2025-04-26 03:04:13 +00:00)
Doc updates + management endpoint fixes (#8138)
* Litellm dev 01 29 2025 p4 (#8107)
* fix(key_management_endpoints.py): always get db team. Fixes https://github.com/BerriAI/litellm/issues/7983
* test(test_key_management.py): add unit test enforcing check_db_only is always true on key generate checks
* test: fix test
* test: skip gemini thinking
* Litellm dev 01 29 2025 p3 (#8106)
* fix(__init__.py): reduces size of __init__.py and reduces scope for errors by using correct param
* refactor(__init__.py): refactor init by cleaning up redundant params
* refactor(__init__.py): move more constants into constants.py, cleanup root
* refactor(__init__.py): more cleanup
* feat(__init__.py): expose new 'disable_hf_tokenizer_download' param, enables hf model usage in offline env
* docs(config_settings.md): document new disable_hf_tokenizer_download param
* fix: fix linting error
* fix: fix unsafe comparison
* test: fix test
* docs(public_teams.md): add doc showing how to expose public teams for users to join
* docs: add beta disclaimer on public teams
* test: update tests
This commit is contained in: parent 2eee7f978f, commit 16b5de07af
16 changed files with 428 additions and 349 deletions
@@ -139,6 +139,7 @@ general_settings:
| disable_end_user_cost_tracking_prometheus_only | boolean | If true, turns off end user cost tracking on prometheus metrics only. |
| key_generation_settings | object | Restricts who can generate keys. [Further docs](./virtual_keys.md#restricting-key-generation) |
| disable_add_transform_inline_image_block | boolean | For Fireworks AI models - if true, turns off the auto-add of `#transform=inline` to the url of the image_url, if the model is not a vision model. |
| disable_hf_tokenizer_download | boolean | If true, it defaults to using the openai tokenizer for all models (including huggingface models). |

### general_settings - Reference
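For context, here is a minimal proxy config sketch for the new flag. It is illustrative only and assumes the flag is set under `litellm_settings`, like other module-level litellm params.

```yaml
litellm_settings:
  # assumption: skip HuggingFace tokenizer downloads and fall back to the OpenAI (tiktoken) tokenizer
  disable_hf_tokenizer_download: true
```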
40	docs/my-website/docs/proxy/public_teams.md	Normal file
@@ -0,0 +1,40 @@
# [BETA] Public Teams

Expose available teams to your users to join on signup.

<iframe width="840" height="500" src="https://www.loom.com/embed/7871ea15035a48d2a118b7486c2f7598?sid=267cd0ab-d92b-42fa-b97a-9f385ef8930c" frameborder="0" webkitallowfullscreen mozallowfullscreen allowfullscreen></iframe>

## Quick Start

1. Create a team on LiteLLM

```bash
curl -X POST '<PROXY_BASE_URL>/team/new' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer <MASTER_KEY>' \
-d '{"name": "My Team", "team_id": "team_id_1"}'
```

2. Expose the team to your users

```yaml
litellm_settings:
  default_internal_user_params:
    available_teams: ["team_id_1"] # 👈 Make team available to new SSO users
```

3. Test it!

```bash
curl -L -X POST 'http://0.0.0.0:4000/team/member_add' \
-H 'Authorization: Bearer sk-<USER_KEY>' \
-H 'Content-Type: application/json' \
--data-raw '{
    "team_id": "team_id_1",
    "member": [{"role": "user", "user_id": "my-test-user"}]
}'
```
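As an optional sanity check (a sketch only, not part of this commit; it assumes the proxy's `/user/info` endpoint is enabled and that `my-test-user` already exists), you can inspect the user and confirm the membership took effect:

```bash
# hypothetical verification step: inspect the user's teams after /team/member_add
curl -L -X GET 'http://0.0.0.0:4000/user/info?user_id=my-test-user' \
-H 'Authorization: Bearer <MASTER_KEY>'
```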
@@ -6,11 +6,6 @@ import TabItem from '@theme/TabItem';

Create keys, track spend, add models without worrying about the config / CRUD endpoints.

:::info

This is in beta, so things may change. If you have feedback, [let us know](https://discord.com/invite/wuPM9dRgDw)

:::

<Image img={require('../../img/litellm_ui_create_key.png')} />
|
|
|
@@ -98,6 +98,7 @@ const sidebars = {
        "proxy/ui",
        "proxy/admin_ui_sso",
        "proxy/self_serve",
        "proxy/public_teams",
        "proxy/custom_sso"
      ],
    },
|
|
|
@@ -9,7 +9,12 @@ from typing import Callable, List, Optional, Dict, Union, Any, Literal, get_args
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.caching.caching import Cache, DualCache, RedisCache, InMemoryCache
from litellm.types.llms.bedrock import COHERE_EMBEDDING_INPUT_TYPES
from litellm.types.utils import ImageObject, BudgetConfig
from litellm.types.utils import (
    ImageObject,
    BudgetConfig,
    all_litellm_params,
    all_litellm_params as _litellm_completion_params,
)  # maintain backwards compatibility for root param
from litellm._logging import (
    set_verbose,
    _turn_on_debug,
@@ -29,6 +34,24 @@ from litellm.constants (
    LITELLM_CHAT_PROVIDERS,
    HUMANLOOP_PROMPT_CACHE_TTL_SECONDS,
    OPENAI_CHAT_COMPLETION_PARAMS,
    OPENAI_CHAT_COMPLETION_PARAMS as _openai_completion_params,  # backwards compatibility
    OPENAI_FINISH_REASONS,
    OPENAI_FINISH_REASONS as _openai_finish_reasons,  # backwards compatibility
    openai_compatible_endpoints,
    openai_compatible_providers,
    openai_text_completion_compatible_providers,
    _openai_like_providers,
    replicate_models,
    clarifai_models,
    huggingface_models,
    empower_models,
    together_ai_models,
    baseten_models,
    REPEATED_STREAMING_CHUNK_LIMIT,
    request_timeout,
    open_ai_embedding_models,
    cohere_embedding_models,
    bedrock_embedding_models,
)
from litellm.types.guardrails import GuardrailItem
from litellm.proxy._types import (
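A short sketch (not part of the diff) of what these aliases preserve: after the move, the constants still resolve from the package root, so code that referenced the old private names keeps working. The attribute names below are taken from the import list above.

```python
# sketch: constants now live in litellm.constants but remain reachable from the root package
import litellm
from litellm import constants

assert litellm.request_timeout == constants.request_timeout
assert litellm.OPENAI_FINISH_REASONS == constants.OPENAI_FINISH_REASONS
# the old private aliases are re-exported for backwards compatibility
assert litellm._openai_finish_reasons == constants.OPENAI_FINISH_REASONS
```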
|
@ -217,75 +240,8 @@ default_soft_budget: float = (
|
|||
50.0 # by default all litellm proxy keys have a soft budget of 50.0
|
||||
)
|
||||
forward_traceparent_to_llm_provider: bool = False
|
||||
_openai_finish_reasons = ["stop", "length", "function_call", "content_filter", "null"]
|
||||
_openai_completion_params = [
|
||||
"functions",
|
||||
"function_call",
|
||||
"temperature",
|
||||
"temperature",
|
||||
"top_p",
|
||||
"n",
|
||||
"stream",
|
||||
"stop",
|
||||
"max_tokens",
|
||||
"presence_penalty",
|
||||
"frequency_penalty",
|
||||
"logit_bias",
|
||||
"user",
|
||||
"request_timeout",
|
||||
"api_base",
|
||||
"api_version",
|
||||
"api_key",
|
||||
"deployment_id",
|
||||
"organization",
|
||||
"base_url",
|
||||
"default_headers",
|
||||
"timeout",
|
||||
"response_format",
|
||||
"seed",
|
||||
"tools",
|
||||
"tool_choice",
|
||||
"max_retries",
|
||||
]
|
||||
_litellm_completion_params = [
|
||||
"metadata",
|
||||
"acompletion",
|
||||
"caching",
|
||||
"mock_response",
|
||||
"api_key",
|
||||
"api_version",
|
||||
"api_base",
|
||||
"force_timeout",
|
||||
"logger_fn",
|
||||
"verbose",
|
||||
"custom_llm_provider",
|
||||
"litellm_logging_obj",
|
||||
"litellm_call_id",
|
||||
"use_client",
|
||||
"id",
|
||||
"fallbacks",
|
||||
"azure",
|
||||
"headers",
|
||||
"model_list",
|
||||
"num_retries",
|
||||
"context_window_fallback_dict",
|
||||
"roles",
|
||||
"final_prompt_value",
|
||||
"bos_token",
|
||||
"eos_token",
|
||||
"request_timeout",
|
||||
"complete_response",
|
||||
"self",
|
||||
"client",
|
||||
"rpm",
|
||||
"tpm",
|
||||
"input_cost_per_token",
|
||||
"output_cost_per_token",
|
||||
"hf_model_name",
|
||||
"model_info",
|
||||
"proxy_server_request",
|
||||
"preset_cache_key",
|
||||
]
|
||||
|
||||
|
||||
_current_cost = 0.0 # private variable, used if max budget is set
|
||||
error_logs: Dict = {}
|
||||
add_function_to_prompt: bool = (
|
||||
|
@ -318,11 +274,8 @@ disable_end_user_cost_tracking_prometheus_only: Optional[bool] = None
|
|||
custom_prometheus_metadata_labels: List[str] = []
|
||||
#### REQUEST PRIORITIZATION ####
|
||||
priority_reservation: Optional[Dict[str, float]] = None
|
||||
#### RELIABILITY ####
|
||||
REPEATED_STREAMING_CHUNK_LIMIT = 100 # catch if model starts looping the same chunk while streaming. Uses high default to prevent false positives.
|
||||
|
||||
#### Networking settings ####
|
||||
request_timeout: float = 6000 # time in seconds
|
||||
|
||||
force_ipv4: bool = (
|
||||
False # when True, litellm will force ipv4 for all LLM requests. Some users have seen httpx ConnectionError when using ipv6.
|
||||
)
|
||||
|
@ -352,39 +305,7 @@ _key_management_settings: KeyManagementSettings = KeyManagementSettings()
|
|||
#### PII MASKING ####
|
||||
output_parse_pii: bool = False
|
||||
#############################################
|
||||
|
||||
|
||||
def get_model_cost_map(url: str):
|
||||
if (
|
||||
os.getenv("LITELLM_LOCAL_MODEL_COST_MAP", False) == True
|
||||
or os.getenv("LITELLM_LOCAL_MODEL_COST_MAP", False) == "True"
|
||||
):
|
||||
import importlib.resources
|
||||
import json
|
||||
|
||||
with importlib.resources.open_text(
|
||||
"litellm", "model_prices_and_context_window_backup.json"
|
||||
) as f:
|
||||
content = json.load(f)
|
||||
return content
|
||||
|
||||
try:
|
||||
response = httpx.get(
|
||||
url, timeout=5
|
||||
) # set a 5 second timeout for the get request
|
||||
response.raise_for_status() # Raise an exception if the request is unsuccessful
|
||||
content = response.json()
|
||||
return content
|
||||
except Exception:
|
||||
import importlib.resources
|
||||
import json
|
||||
|
||||
with importlib.resources.open_text(
|
||||
"litellm", "model_prices_and_context_window_backup.json"
|
||||
) as f:
|
||||
content = json.load(f)
|
||||
return content
|
||||
|
||||
from litellm.litellm_core_utils.get_model_cost_map import get_model_cost_map
|
||||
|
||||
model_cost = get_model_cost_map(url=model_cost_map_url)
|
||||
custom_prompt_dict: Dict[str, dict] = {}
|
||||
|
@ -446,7 +367,6 @@ cohere_chat_models: List = []
|
|||
mistral_chat_models: List = []
|
||||
text_completion_codestral_models: List = []
|
||||
anthropic_models: List = []
|
||||
empower_models: List = []
|
||||
openrouter_models: List = []
|
||||
vertex_language_models: List = []
|
||||
vertex_vision_models: List = []
|
||||
|
@ -641,202 +561,8 @@ def add_known_models():
|
|||
|
||||
add_known_models()
|
||||
# known openai compatible endpoints - we'll eventually move this list to the model_prices_and_context_window.json dictionary
|
||||
openai_compatible_endpoints: List = [
|
||||
"api.perplexity.ai",
|
||||
"api.endpoints.anyscale.com/v1",
|
||||
"api.deepinfra.com/v1/openai",
|
||||
"api.mistral.ai/v1",
|
||||
"codestral.mistral.ai/v1/chat/completions",
|
||||
"codestral.mistral.ai/v1/fim/completions",
|
||||
"api.groq.com/openai/v1",
|
||||
"https://integrate.api.nvidia.com/v1",
|
||||
"api.deepseek.com/v1",
|
||||
"api.together.xyz/v1",
|
||||
"app.empower.dev/api/v1",
|
||||
"https://api.friendli.ai/serverless/v1",
|
||||
"api.sambanova.ai/v1",
|
||||
"api.x.ai/v1",
|
||||
"api.galadriel.ai/v1",
|
||||
]
|
||||
|
||||
# this is maintained for Exception Mapping
|
||||
openai_compatible_providers: List = [
|
||||
"anyscale",
|
||||
"mistral",
|
||||
"groq",
|
||||
"nvidia_nim",
|
||||
"cerebras",
|
||||
"sambanova",
|
||||
"ai21_chat",
|
||||
"ai21",
|
||||
"volcengine",
|
||||
"codestral",
|
||||
"deepseek",
|
||||
"deepinfra",
|
||||
"perplexity",
|
||||
"xinference",
|
||||
"xai",
|
||||
"together_ai",
|
||||
"fireworks_ai",
|
||||
"empower",
|
||||
"friendliai",
|
||||
"azure_ai",
|
||||
"github",
|
||||
"litellm_proxy",
|
||||
"hosted_vllm",
|
||||
"lm_studio",
|
||||
"galadriel",
|
||||
]
|
||||
openai_text_completion_compatible_providers: List = (
|
||||
[ # providers that support `/v1/completions`
|
||||
"together_ai",
|
||||
"fireworks_ai",
|
||||
"hosted_vllm",
|
||||
]
|
||||
)
|
||||
_openai_like_providers: List = [
|
||||
"predibase",
|
||||
"databricks",
|
||||
"watsonx",
|
||||
] # private helper. similar to openai but require some custom auth / endpoint handling, so can't use the openai sdk
|
||||
# well supported replicate llms
|
||||
replicate_models: List = [
|
||||
# llama replicate supported LLMs
|
||||
"replicate/llama-2-70b-chat:2796ee9483c3fd7aa2e171d38f4ca12251a30609463dcfd4cd76703f22e96cdf",
|
||||
"a16z-infra/llama-2-13b-chat:2a7f981751ec7fdf87b5b91ad4db53683a98082e9ff7bfd12c8cd5ea85980a52",
|
||||
"meta/codellama-13b:1c914d844307b0588599b8393480a3ba917b660c7e9dfae681542b5325f228db",
|
||||
# Vicuna
|
||||
"replicate/vicuna-13b:6282abe6a492de4145d7bb601023762212f9ddbbe78278bd6771c8b3b2f2a13b",
|
||||
"joehoover/instructblip-vicuna13b:c4c54e3c8c97cd50c2d2fec9be3b6065563ccf7d43787fb99f84151b867178fe",
|
||||
# Flan T-5
|
||||
"daanelson/flan-t5-large:ce962b3f6792a57074a601d3979db5839697add2e4e02696b3ced4c022d4767f",
|
||||
# Others
|
||||
"replicate/dolly-v2-12b:ef0e1aefc61f8e096ebe4db6b2bacc297daf2ef6899f0f7e001ec445893500e5",
|
||||
"replit/replit-code-v1-3b:b84f4c074b807211cd75e3e8b1589b6399052125b4c27106e43d47189e8415ad",
|
||||
]
|
||||
|
||||
clarifai_models: List = [
|
||||
"clarifai/meta.Llama-3.Llama-3-8B-Instruct",
|
||||
"clarifai/gcp.generate.gemma-1_1-7b-it",
|
||||
"clarifai/mistralai.completion.mixtral-8x22B",
|
||||
"clarifai/cohere.generate.command-r-plus",
|
||||
"clarifai/databricks.drbx.dbrx-instruct",
|
||||
"clarifai/mistralai.completion.mistral-large",
|
||||
"clarifai/mistralai.completion.mistral-medium",
|
||||
"clarifai/mistralai.completion.mistral-small",
|
||||
"clarifai/mistralai.completion.mixtral-8x7B-Instruct-v0_1",
|
||||
"clarifai/gcp.generate.gemma-2b-it",
|
||||
"clarifai/gcp.generate.gemma-7b-it",
|
||||
"clarifai/deci.decilm.deciLM-7B-instruct",
|
||||
"clarifai/mistralai.completion.mistral-7B-Instruct",
|
||||
"clarifai/gcp.generate.gemini-pro",
|
||||
"clarifai/anthropic.completion.claude-v1",
|
||||
"clarifai/anthropic.completion.claude-instant-1_2",
|
||||
"clarifai/anthropic.completion.claude-instant",
|
||||
"clarifai/anthropic.completion.claude-v2",
|
||||
"clarifai/anthropic.completion.claude-2_1",
|
||||
"clarifai/meta.Llama-2.codeLlama-70b-Python",
|
||||
"clarifai/meta.Llama-2.codeLlama-70b-Instruct",
|
||||
"clarifai/openai.completion.gpt-3_5-turbo-instruct",
|
||||
"clarifai/meta.Llama-2.llama2-7b-chat",
|
||||
"clarifai/meta.Llama-2.llama2-13b-chat",
|
||||
"clarifai/meta.Llama-2.llama2-70b-chat",
|
||||
"clarifai/openai.chat-completion.gpt-4-turbo",
|
||||
"clarifai/microsoft.text-generation.phi-2",
|
||||
"clarifai/meta.Llama-2.llama2-7b-chat-vllm",
|
||||
"clarifai/upstage.solar.solar-10_7b-instruct",
|
||||
"clarifai/openchat.openchat.openchat-3_5-1210",
|
||||
"clarifai/togethercomputer.stripedHyena.stripedHyena-Nous-7B",
|
||||
"clarifai/gcp.generate.text-bison",
|
||||
"clarifai/meta.Llama-2.llamaGuard-7b",
|
||||
"clarifai/fblgit.una-cybertron.una-cybertron-7b-v2",
|
||||
"clarifai/openai.chat-completion.GPT-4",
|
||||
"clarifai/openai.chat-completion.GPT-3_5-turbo",
|
||||
"clarifai/ai21.complete.Jurassic2-Grande",
|
||||
"clarifai/ai21.complete.Jurassic2-Grande-Instruct",
|
||||
"clarifai/ai21.complete.Jurassic2-Jumbo-Instruct",
|
||||
"clarifai/ai21.complete.Jurassic2-Jumbo",
|
||||
"clarifai/ai21.complete.Jurassic2-Large",
|
||||
"clarifai/cohere.generate.cohere-generate-command",
|
||||
"clarifai/wizardlm.generate.wizardCoder-Python-34B",
|
||||
"clarifai/wizardlm.generate.wizardLM-70B",
|
||||
"clarifai/tiiuae.falcon.falcon-40b-instruct",
|
||||
"clarifai/togethercomputer.RedPajama.RedPajama-INCITE-7B-Chat",
|
||||
"clarifai/gcp.generate.code-gecko",
|
||||
"clarifai/gcp.generate.code-bison",
|
||||
"clarifai/mistralai.completion.mistral-7B-OpenOrca",
|
||||
"clarifai/mistralai.completion.openHermes-2-mistral-7B",
|
||||
"clarifai/wizardlm.generate.wizardLM-13B",
|
||||
"clarifai/huggingface-research.zephyr.zephyr-7B-alpha",
|
||||
"clarifai/wizardlm.generate.wizardCoder-15B",
|
||||
"clarifai/microsoft.text-generation.phi-1_5",
|
||||
"clarifai/databricks.Dolly-v2.dolly-v2-12b",
|
||||
"clarifai/bigcode.code.StarCoder",
|
||||
"clarifai/salesforce.xgen.xgen-7b-8k-instruct",
|
||||
"clarifai/mosaicml.mpt.mpt-7b-instruct",
|
||||
"clarifai/anthropic.completion.claude-3-opus",
|
||||
"clarifai/anthropic.completion.claude-3-sonnet",
|
||||
"clarifai/gcp.generate.gemini-1_5-pro",
|
||||
"clarifai/gcp.generate.imagen-2",
|
||||
"clarifai/salesforce.blip.general-english-image-caption-blip-2",
|
||||
]
|
||||
|
||||
|
||||
huggingface_models: List = [
|
||||
"meta-llama/Llama-2-7b-hf",
|
||||
"meta-llama/Llama-2-7b-chat-hf",
|
||||
"meta-llama/Llama-2-13b-hf",
|
||||
"meta-llama/Llama-2-13b-chat-hf",
|
||||
"meta-llama/Llama-2-70b-hf",
|
||||
"meta-llama/Llama-2-70b-chat-hf",
|
||||
"meta-llama/Llama-2-7b",
|
||||
"meta-llama/Llama-2-7b-chat",
|
||||
"meta-llama/Llama-2-13b",
|
||||
"meta-llama/Llama-2-13b-chat",
|
||||
"meta-llama/Llama-2-70b",
|
||||
"meta-llama/Llama-2-70b-chat",
|
||||
] # these have been tested on extensively. But by default all text2text-generation and text-generation models are supported by liteLLM. - https://docs.litellm.ai/docs/providers
|
||||
empower_models = [
|
||||
"empower/empower-functions",
|
||||
"empower/empower-functions-small",
|
||||
]
|
||||
|
||||
together_ai_models: List = [
|
||||
# llama llms - chat
|
||||
"togethercomputer/llama-2-70b-chat",
|
||||
# llama llms - language / instruct
|
||||
"togethercomputer/llama-2-70b",
|
||||
"togethercomputer/LLaMA-2-7B-32K",
|
||||
"togethercomputer/Llama-2-7B-32K-Instruct",
|
||||
"togethercomputer/llama-2-7b",
|
||||
# falcon llms
|
||||
"togethercomputer/falcon-40b-instruct",
|
||||
"togethercomputer/falcon-7b-instruct",
|
||||
# alpaca
|
||||
"togethercomputer/alpaca-7b",
|
||||
# chat llms
|
||||
"HuggingFaceH4/starchat-alpha",
|
||||
# code llms
|
||||
"togethercomputer/CodeLlama-34b",
|
||||
"togethercomputer/CodeLlama-34b-Instruct",
|
||||
"togethercomputer/CodeLlama-34b-Python",
|
||||
"defog/sqlcoder",
|
||||
"NumbersStation/nsql-llama-2-7B",
|
||||
"WizardLM/WizardCoder-15B-V1.0",
|
||||
"WizardLM/WizardCoder-Python-34B-V1.0",
|
||||
# language llms
|
||||
"NousResearch/Nous-Hermes-Llama2-13b",
|
||||
"Austism/chronos-hermes-13b",
|
||||
"upstage/SOLAR-0-70b-16bit",
|
||||
"WizardLM/WizardLM-70B-V1.0",
|
||||
] # supports all together ai models, just pass in the model id e.g. completion(model="together_computer/replit_code_3b",...)
|
||||
|
||||
|
||||
baseten_models: List = [
|
||||
"qvv0xeq",
|
||||
"q841o8w",
|
||||
"31dxrj3",
|
||||
] # FALCON 7B # WizardLM # Mosaic ML
|
||||
|
||||
|
||||
# used for Cost Tracking & Token counting
|
||||
|
@ -980,20 +706,6 @@ longer_context_model_fallback_dict: dict = {
|
|||
}
|
||||
|
||||
####### EMBEDDING MODELS ###################
|
||||
open_ai_embedding_models: List = ["text-embedding-ada-002"]
|
||||
cohere_embedding_models: List = [
|
||||
"embed-english-v3.0",
|
||||
"embed-english-light-v3.0",
|
||||
"embed-multilingual-v3.0",
|
||||
"embed-english-v2.0",
|
||||
"embed-english-light-v2.0",
|
||||
"embed-multilingual-v2.0",
|
||||
]
|
||||
bedrock_embedding_models: List = [
|
||||
"amazon.titan-embed-text-v1",
|
||||
"cohere.embed-english-v3",
|
||||
"cohere.embed-multilingual-v3",
|
||||
]
|
||||
|
||||
all_embedding_models = (
|
||||
open_ai_embedding_models
|
||||
|
@@ -1277,4 +989,7 @@ custom_provider_map: List[CustomLLMItem] = []
_custom_providers: List[str] = (
    []
)  # internal helper util, used to track names of custom providers
disable_hf_tokenizer_download: Optional[bool] = (
    None  # disable huggingface tokenizer download. Defaults to openai clk100
)
global_disable_no_log_param: bool = False
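A minimal usage sketch for the new root-level flag (illustrative; the model name and text are placeholders, and it assumes `litellm.token_counter` routes through the tokenizer-selection helper changed later in this diff):

```python
import litellm

# run token counting in an offline / air-gapped environment without HuggingFace downloads
litellm.disable_hf_tokenizer_download = True

# placeholder model + text; with the flag set, the OpenAI tokenizer is used even for HF-style model names
num_tokens = litellm.token_counter(model="meta-llama/Llama-2-7b-chat", text="hello world")
print(num_tokens)
```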
@@ -1,3 +1,5 @@
from typing import List

ROUTER_MAX_FALLBACKS = 5
DEFAULT_BATCH_SIZE = 512
DEFAULT_FLUSH_INTERVAL_SECONDS = 5
@@ -12,6 +14,11 @@ DEFAULT_IMAGE_TOKEN_COUNT = 250
DEFAULT_IMAGE_WIDTH = 300
DEFAULT_IMAGE_HEIGHT = 300
SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD = 1000  # Minimum number of requests to consider "reasonable traffic". Used for single-deployment cooldown logic.
#### RELIABILITY ####
REPEATED_STREAMING_CHUNK_LIMIT = 100  # catch if model starts looping the same chunk while streaming. Uses high default to prevent false positives.
#### Networking settings ####
request_timeout: float = 6000  # time in seconds

LITELLM_CHAT_PROVIDERS = [
    "openai",
    "openai_like",
|
||||
|
@ -113,6 +120,222 @@ OPENAI_CHAT_COMPLETION_PARAMS = [
|
|||
"top_logprobs",
|
||||
"extra_headers",
|
||||
]
|
||||
|
||||
openai_compatible_endpoints: List = [
|
||||
"api.perplexity.ai",
|
||||
"api.endpoints.anyscale.com/v1",
|
||||
"api.deepinfra.com/v1/openai",
|
||||
"api.mistral.ai/v1",
|
||||
"codestral.mistral.ai/v1/chat/completions",
|
||||
"codestral.mistral.ai/v1/fim/completions",
|
||||
"api.groq.com/openai/v1",
|
||||
"https://integrate.api.nvidia.com/v1",
|
||||
"api.deepseek.com/v1",
|
||||
"api.together.xyz/v1",
|
||||
"app.empower.dev/api/v1",
|
||||
"https://api.friendli.ai/serverless/v1",
|
||||
"api.sambanova.ai/v1",
|
||||
"api.x.ai/v1",
|
||||
"api.galadriel.ai/v1",
|
||||
]
|
||||
|
||||
|
||||
openai_compatible_providers: List = [
|
||||
"anyscale",
|
||||
"mistral",
|
||||
"groq",
|
||||
"nvidia_nim",
|
||||
"cerebras",
|
||||
"sambanova",
|
||||
"ai21_chat",
|
||||
"ai21",
|
||||
"volcengine",
|
||||
"codestral",
|
||||
"deepseek",
|
||||
"deepinfra",
|
||||
"perplexity",
|
||||
"xinference",
|
||||
"xai",
|
||||
"together_ai",
|
||||
"fireworks_ai",
|
||||
"empower",
|
||||
"friendliai",
|
||||
"azure_ai",
|
||||
"github",
|
||||
"litellm_proxy",
|
||||
"hosted_vllm",
|
||||
"lm_studio",
|
||||
"galadriel",
|
||||
]
|
||||
openai_text_completion_compatible_providers: List = (
|
||||
[ # providers that support `/v1/completions`
|
||||
"together_ai",
|
||||
"fireworks_ai",
|
||||
"hosted_vllm",
|
||||
]
|
||||
)
|
||||
_openai_like_providers: List = [
|
||||
"predibase",
|
||||
"databricks",
|
||||
"watsonx",
|
||||
] # private helper. similar to openai but require some custom auth / endpoint handling, so can't use the openai sdk
|
||||
# well supported replicate llms
|
||||
replicate_models: List = [
|
||||
# llama replicate supported LLMs
|
||||
"replicate/llama-2-70b-chat:2796ee9483c3fd7aa2e171d38f4ca12251a30609463dcfd4cd76703f22e96cdf",
|
||||
"a16z-infra/llama-2-13b-chat:2a7f981751ec7fdf87b5b91ad4db53683a98082e9ff7bfd12c8cd5ea85980a52",
|
||||
"meta/codellama-13b:1c914d844307b0588599b8393480a3ba917b660c7e9dfae681542b5325f228db",
|
||||
# Vicuna
|
||||
"replicate/vicuna-13b:6282abe6a492de4145d7bb601023762212f9ddbbe78278bd6771c8b3b2f2a13b",
|
||||
"joehoover/instructblip-vicuna13b:c4c54e3c8c97cd50c2d2fec9be3b6065563ccf7d43787fb99f84151b867178fe",
|
||||
# Flan T-5
|
||||
"daanelson/flan-t5-large:ce962b3f6792a57074a601d3979db5839697add2e4e02696b3ced4c022d4767f",
|
||||
# Others
|
||||
"replicate/dolly-v2-12b:ef0e1aefc61f8e096ebe4db6b2bacc297daf2ef6899f0f7e001ec445893500e5",
|
||||
"replit/replit-code-v1-3b:b84f4c074b807211cd75e3e8b1589b6399052125b4c27106e43d47189e8415ad",
|
||||
]
|
||||
|
||||
clarifai_models: List = [
|
||||
"clarifai/meta.Llama-3.Llama-3-8B-Instruct",
|
||||
"clarifai/gcp.generate.gemma-1_1-7b-it",
|
||||
"clarifai/mistralai.completion.mixtral-8x22B",
|
||||
"clarifai/cohere.generate.command-r-plus",
|
||||
"clarifai/databricks.drbx.dbrx-instruct",
|
||||
"clarifai/mistralai.completion.mistral-large",
|
||||
"clarifai/mistralai.completion.mistral-medium",
|
||||
"clarifai/mistralai.completion.mistral-small",
|
||||
"clarifai/mistralai.completion.mixtral-8x7B-Instruct-v0_1",
|
||||
"clarifai/gcp.generate.gemma-2b-it",
|
||||
"clarifai/gcp.generate.gemma-7b-it",
|
||||
"clarifai/deci.decilm.deciLM-7B-instruct",
|
||||
"clarifai/mistralai.completion.mistral-7B-Instruct",
|
||||
"clarifai/gcp.generate.gemini-pro",
|
||||
"clarifai/anthropic.completion.claude-v1",
|
||||
"clarifai/anthropic.completion.claude-instant-1_2",
|
||||
"clarifai/anthropic.completion.claude-instant",
|
||||
"clarifai/anthropic.completion.claude-v2",
|
||||
"clarifai/anthropic.completion.claude-2_1",
|
||||
"clarifai/meta.Llama-2.codeLlama-70b-Python",
|
||||
"clarifai/meta.Llama-2.codeLlama-70b-Instruct",
|
||||
"clarifai/openai.completion.gpt-3_5-turbo-instruct",
|
||||
"clarifai/meta.Llama-2.llama2-7b-chat",
|
||||
"clarifai/meta.Llama-2.llama2-13b-chat",
|
||||
"clarifai/meta.Llama-2.llama2-70b-chat",
|
||||
"clarifai/openai.chat-completion.gpt-4-turbo",
|
||||
"clarifai/microsoft.text-generation.phi-2",
|
||||
"clarifai/meta.Llama-2.llama2-7b-chat-vllm",
|
||||
"clarifai/upstage.solar.solar-10_7b-instruct",
|
||||
"clarifai/openchat.openchat.openchat-3_5-1210",
|
||||
"clarifai/togethercomputer.stripedHyena.stripedHyena-Nous-7B",
|
||||
"clarifai/gcp.generate.text-bison",
|
||||
"clarifai/meta.Llama-2.llamaGuard-7b",
|
||||
"clarifai/fblgit.una-cybertron.una-cybertron-7b-v2",
|
||||
"clarifai/openai.chat-completion.GPT-4",
|
||||
"clarifai/openai.chat-completion.GPT-3_5-turbo",
|
||||
"clarifai/ai21.complete.Jurassic2-Grande",
|
||||
"clarifai/ai21.complete.Jurassic2-Grande-Instruct",
|
||||
"clarifai/ai21.complete.Jurassic2-Jumbo-Instruct",
|
||||
"clarifai/ai21.complete.Jurassic2-Jumbo",
|
||||
"clarifai/ai21.complete.Jurassic2-Large",
|
||||
"clarifai/cohere.generate.cohere-generate-command",
|
||||
"clarifai/wizardlm.generate.wizardCoder-Python-34B",
|
||||
"clarifai/wizardlm.generate.wizardLM-70B",
|
||||
"clarifai/tiiuae.falcon.falcon-40b-instruct",
|
||||
"clarifai/togethercomputer.RedPajama.RedPajama-INCITE-7B-Chat",
|
||||
"clarifai/gcp.generate.code-gecko",
|
||||
"clarifai/gcp.generate.code-bison",
|
||||
"clarifai/mistralai.completion.mistral-7B-OpenOrca",
|
||||
"clarifai/mistralai.completion.openHermes-2-mistral-7B",
|
||||
"clarifai/wizardlm.generate.wizardLM-13B",
|
||||
"clarifai/huggingface-research.zephyr.zephyr-7B-alpha",
|
||||
"clarifai/wizardlm.generate.wizardCoder-15B",
|
||||
"clarifai/microsoft.text-generation.phi-1_5",
|
||||
"clarifai/databricks.Dolly-v2.dolly-v2-12b",
|
||||
"clarifai/bigcode.code.StarCoder",
|
||||
"clarifai/salesforce.xgen.xgen-7b-8k-instruct",
|
||||
"clarifai/mosaicml.mpt.mpt-7b-instruct",
|
||||
"clarifai/anthropic.completion.claude-3-opus",
|
||||
"clarifai/anthropic.completion.claude-3-sonnet",
|
||||
"clarifai/gcp.generate.gemini-1_5-pro",
|
||||
"clarifai/gcp.generate.imagen-2",
|
||||
"clarifai/salesforce.blip.general-english-image-caption-blip-2",
|
||||
]
|
||||
|
||||
|
||||
huggingface_models: List = [
|
||||
"meta-llama/Llama-2-7b-hf",
|
||||
"meta-llama/Llama-2-7b-chat-hf",
|
||||
"meta-llama/Llama-2-13b-hf",
|
||||
"meta-llama/Llama-2-13b-chat-hf",
|
||||
"meta-llama/Llama-2-70b-hf",
|
||||
"meta-llama/Llama-2-70b-chat-hf",
|
||||
"meta-llama/Llama-2-7b",
|
||||
"meta-llama/Llama-2-7b-chat",
|
||||
"meta-llama/Llama-2-13b",
|
||||
"meta-llama/Llama-2-13b-chat",
|
||||
"meta-llama/Llama-2-70b",
|
||||
"meta-llama/Llama-2-70b-chat",
|
||||
] # these have been tested on extensively. But by default all text2text-generation and text-generation models are supported by liteLLM. - https://docs.litellm.ai/docs/providers
|
||||
empower_models = [
|
||||
"empower/empower-functions",
|
||||
"empower/empower-functions-small",
|
||||
]
|
||||
|
||||
together_ai_models: List = [
|
||||
# llama llms - chat
|
||||
"togethercomputer/llama-2-70b-chat",
|
||||
# llama llms - language / instruct
|
||||
"togethercomputer/llama-2-70b",
|
||||
"togethercomputer/LLaMA-2-7B-32K",
|
||||
"togethercomputer/Llama-2-7B-32K-Instruct",
|
||||
"togethercomputer/llama-2-7b",
|
||||
# falcon llms
|
||||
"togethercomputer/falcon-40b-instruct",
|
||||
"togethercomputer/falcon-7b-instruct",
|
||||
# alpaca
|
||||
"togethercomputer/alpaca-7b",
|
||||
# chat llms
|
||||
"HuggingFaceH4/starchat-alpha",
|
||||
# code llms
|
||||
"togethercomputer/CodeLlama-34b",
|
||||
"togethercomputer/CodeLlama-34b-Instruct",
|
||||
"togethercomputer/CodeLlama-34b-Python",
|
||||
"defog/sqlcoder",
|
||||
"NumbersStation/nsql-llama-2-7B",
|
||||
"WizardLM/WizardCoder-15B-V1.0",
|
||||
"WizardLM/WizardCoder-Python-34B-V1.0",
|
||||
# language llms
|
||||
"NousResearch/Nous-Hermes-Llama2-13b",
|
||||
"Austism/chronos-hermes-13b",
|
||||
"upstage/SOLAR-0-70b-16bit",
|
||||
"WizardLM/WizardLM-70B-V1.0",
|
||||
] # supports all together ai models, just pass in the model id e.g. completion(model="together_computer/replit_code_3b",...)
|
||||
|
||||
|
||||
baseten_models: List = [
|
||||
"qvv0xeq",
|
||||
"q841o8w",
|
||||
"31dxrj3",
|
||||
] # FALCON 7B # WizardLM # Mosaic ML
|
||||
|
||||
|
||||
open_ai_embedding_models: List = ["text-embedding-ada-002"]
|
||||
cohere_embedding_models: List = [
|
||||
"embed-english-v3.0",
|
||||
"embed-english-light-v3.0",
|
||||
"embed-multilingual-v3.0",
|
||||
"embed-english-v2.0",
|
||||
"embed-english-light-v2.0",
|
||||
"embed-multilingual-v2.0",
|
||||
]
|
||||
bedrock_embedding_models: List = [
|
||||
"amazon.titan-embed-text-v1",
|
||||
"cohere.embed-english-v3",
|
||||
"cohere.embed-multilingual-v3",
|
||||
]
|
||||
|
||||
|
||||
OPENAI_FINISH_REASONS = ["stop", "length", "function_call", "content_filter", "null"]
|
||||
HUMANLOOP_PROMPT_CACHE_TTL_SECONDS = 60 # 1 minute
|
||||
RESPONSE_FORMAT_TOOL_NAME = "json_tool_call" # default tool name used when converting response format to tool call
|
||||
|
||||
|
|
45	litellm/litellm_core_utils/get_model_cost_map.py	Normal file
@@ -0,0 +1,45 @@
"""
Pulls the cost + context window + provider route for known models from https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json

This can be disabled by setting the LITELLM_LOCAL_MODEL_COST_MAP environment variable to True.

```
export LITELLM_LOCAL_MODEL_COST_MAP=True
```
"""

import os

import httpx


def get_model_cost_map(url: str):
    if (
        os.getenv("LITELLM_LOCAL_MODEL_COST_MAP", False)
        or os.getenv("LITELLM_LOCAL_MODEL_COST_MAP", False) == "True"
    ):
        import importlib.resources
        import json

        with importlib.resources.open_text(
            "litellm", "model_prices_and_context_window_backup.json"
        ) as f:
            content = json.load(f)
            return content

    try:
        response = httpx.get(
            url, timeout=5
        )  # set a 5 second timeout for the get request
        response.raise_for_status()  # Raise an exception if the request is unsuccessful
        content = response.json()
        return content
    except Exception:
        import importlib.resources
        import json

        with importlib.resources.open_text(
            "litellm", "model_prices_and_context_window_backup.json"
        ) as f:
            content = json.load(f)
            return content
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@@ -61,6 +61,7 @@ def _get_user_in_team(
    for member in team_table.members_with_roles:
        if member.user_id is not None and member.user_id == user_id:
            return member

    return None
@@ -366,6 +367,7 @@ async def generate_key_fn(  # noqa: PLR0915
            prisma_client=prisma_client,
            user_api_key_cache=user_api_key_cache,
            parent_otel_span=user_api_key_dict.parent_otel_span,
            check_db_only=True,
        )
    except Exception as e:
        verbose_proxy_logger.debug(
@@ -1889,3 +1889,8 @@ class HttpHandlerRequestFields(TypedDict, total=False):
class ProviderSpecificHeader(TypedDict):
    custom_llm_provider: str
    extra_headers: dict


class SelectTokenizerResponse(TypedDict):
    type: Literal["openai_tokenizer", "huggingface_tokenizer"]
    tokenizer: Any
@@ -150,6 +150,7 @@ from litellm.types.utils import (
    ModelResponseStream,
    ProviderField,
    ProviderSpecificModelInfo,
    SelectTokenizerResponse,
    StreamingChoices,
    TextChoices,
    TextCompletionResponse,
@@ -1440,8 +1441,27 @@ def _select_tokenizer(


@lru_cache(maxsize=128)
def _select_tokenizer_helper(model: str):
def _select_tokenizer_helper(model: str) -> SelectTokenizerResponse:

    if litellm.disable_hf_tokenizer_download is True:
        return _return_openai_tokenizer(model)

    try:
        result = _return_huggingface_tokenizer(model)
        if result is not None:
            return result
    except Exception as e:
        verbose_logger.debug(f"Error selecting tokenizer: {e}")

    # default - tiktoken
    return _return_openai_tokenizer(model)


def _return_openai_tokenizer(model: str) -> SelectTokenizerResponse:
    return {"type": "openai_tokenizer", "tokenizer": encoding}


def _return_huggingface_tokenizer(model: str) -> Optional[SelectTokenizerResponse]:
    if model in litellm.cohere_models and "command-r" in model:
        # cohere
        cohere_tokenizer = Tokenizer.from_pretrained(
@@ -1460,14 +1480,8 @@ def _select_tokenizer_helper(model: str):
        elif "llama-3" in model.lower():
            tokenizer = Tokenizer.from_pretrained("Xenova/llama-3-tokenizer")
            return {"type": "huggingface_tokenizer", "tokenizer": tokenizer}
    except Exception as e:
        verbose_logger.debug(f"Error selecting tokenizer: {e}")

    # default - tiktoken
    return {
        "type": "openai_tokenizer",
        "tokenizer": encoding,
    }  # default to openai tokenizer
    else:
        return None


def encode(model="", text="", custom_tokenizer: Optional[dict] = None):
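To illustrate the new control flow (a sketch mirroring the unit test further down; it assumes `_select_tokenizer_helper` is importable from `litellm.utils`, which is where the test patches it):

```python
import litellm
from litellm.utils import _select_tokenizer_helper

# with the flag set, the HuggingFace download path is skipped entirely
litellm.disable_hf_tokenizer_download = True
result = _select_tokenizer_helper("any-model-name")  # placeholder model name
assert result["type"] == "openai_tokenizer"
```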
@@ -450,7 +450,7 @@ async def test_async_vertexai_response():
            or "32k" in model
            or "ultra" in model
            or "002" in model
            or "gemini-2.0-flash-thinking-exp" == model
            or "gemini-2.0-flash-thinking-exp" in model
        ):
            # our account does not have access to this model
            continue
@@ -492,7 +492,11 @@ async def test_async_vertexai_streaming_response():
    test_models += litellm.vertex_language_models  # always test gemini-pro
    for model in test_models:
        if model in VERTEX_MODELS_TO_NOT_TEST or (
            "gecko" in model or "32k" in model or "ultra" in model or "002" in model
            "gecko" in model
            or "32k" in model
            or "ultra" in model
            or "002" in model
            or "gemini-2.0-flash-thinking-exp" in model
        ):
            # our account does not have access to this model
            continue
@@ -459,3 +459,14 @@ class TestTokenizerSelection(unittest.TestCase):
        # Verify fallback to OpenAI tokenizer
        self.assertEqual(result["type"], "openai_tokenizer")
        self.assertEqual(result["tokenizer"], encoding)

    @patch("litellm.utils._return_huggingface_tokenizer")
    def test_disable_hf_tokenizer_download(self, mock_return_huggingface_tokenizer):
        # Use pytest.MonkeyPatch() directly instead of fixture
        monkeypatch = pytest.MonkeyPatch()
        monkeypatch.setattr(litellm, "disable_hf_tokenizer_download", True)

        result = _select_tokenizer_helper("grok-32r22r")
        mock_return_huggingface_tokenizer.assert_not_called()
        assert result["type"] == "openai_tokenizer"
        assert result["tokenizer"] == encoding
@@ -8,6 +8,7 @@ from datetime import datetime
from dotenv import load_dotenv
from fastapi import Request
from fastapi.routing import APIRoute
from unittest.mock import MagicMock, patch

load_dotenv()
import io
@@ -988,3 +989,28 @@ async def test_list_key_helper(prisma_client):
            user_id="admin",
        ),
    )


@pytest.mark.asyncio
@patch("litellm.proxy.management_endpoints.key_management_endpoints.get_team_object")
async def test_key_generate_always_db_team(mock_get_team_object):
    from litellm.proxy.management_endpoints.key_management_endpoints import (
        generate_key_fn,
    )

    setattr(litellm.proxy.proxy_server, "prisma_client", MagicMock())
    mock_get_team_object.return_value = None
    try:
        await generate_key_fn(
            data=GenerateKeyRequest(team_id="1234"),
            user_api_key_dict=UserAPIKeyAuth(
                user_role=LitellmUserRoles.PROXY_ADMIN,
                api_key="sk-1234",
                user_id="admin",
            ),
        )
    except Exception as e:
        print(f"Error: {e}")

    mock_get_team_object.assert_called_once()
    assert mock_get_team_object.call_args.kwargs["check_db_only"] == True