From 16b5de07afb22faeaaa8ab7a382b63b3e1c3a70d Mon Sep 17 00:00:00 2001 From: Krish Dholakia Date: Thu, 30 Jan 2025 22:56:41 -0800 Subject: [PATCH] Doc updates + management endpoint fixes (#8138) * Litellm dev 01 29 2025 p4 (#8107) * fix(key_management_endpoints.py): always get db team Fixes https://github.com/BerriAI/litellm/issues/7983 * test(test_key_management.py): add unit test enforcing check_db_only is always true on key generate checks * test: fix test * test: skip gemini thinking * Litellm dev 01 29 2025 p3 (#8106) * fix(__init__.py): reduces size of __init__.py and reduces scope for errors by using correct param * refactor(__init__.py): refactor init by cleaning up redundant params * refactor(__init__.py): move more constants into constants.py cleanup root * refactor(__init__.py): more cleanup * feat(__init__.py): expose new 'disable_hf_tokenizer_download' param enables hf model usage in offline env * docs(config_settings.md): document new disable_hf_tokenizer_download param * fix: fix linting error * fix: fix unsafe comparison * test: fix test * docs(public_teams.md): add doc showing how to expose public teams for users to join * docs: add beta disclaimer on public teams * test: update tests --- docs/my-website/docs/proxy/config_settings.md | 1 + docs/my-website/docs/proxy/public_teams.md | 40 ++ docs/my-website/docs/proxy/ui.md | 5 - docs/my-website/sidebars.js | 1 + litellm/__init__.py | 347 ++---------------- litellm/constants.py | 223 +++++++++++ .../litellm_core_utils/get_model_cost_map.py | 45 +++ litellm/proxy/_experimental/out/404.html | 1 - .../proxy/_experimental/out/model_hub.html | 1 - .../proxy/_experimental/out/onboarding.html | 1 - .../key_management_endpoints.py | 2 + litellm/types/utils.py | 5 + litellm/utils.py | 60 +-- .../test_amazing_vertex_completion.py | 8 +- tests/local_testing/test_token_counter.py | 11 + .../test_key_management.py | 26 ++ 16 files changed, 428 insertions(+), 349 deletions(-) create mode 100644 docs/my-website/docs/proxy/public_teams.md create mode 100644 litellm/litellm_core_utils/get_model_cost_map.py delete mode 100644 litellm/proxy/_experimental/out/404.html delete mode 100644 litellm/proxy/_experimental/out/model_hub.html delete mode 100644 litellm/proxy/_experimental/out/onboarding.html diff --git a/docs/my-website/docs/proxy/config_settings.md b/docs/my-website/docs/proxy/config_settings.md index 5ba26031b8..4a10cea7ab 100644 --- a/docs/my-website/docs/proxy/config_settings.md +++ b/docs/my-website/docs/proxy/config_settings.md @@ -139,6 +139,7 @@ general_settings: | disable_end_user_cost_tracking_prometheus_only | boolean | If true, turns off end user cost tracking on prometheus metrics only. | | key_generation_settings | object | Restricts who can generate keys. [Further docs](./virtual_keys.md#restricting-key-generation) | | disable_add_transform_inline_image_block | boolean | For Fireworks AI models - if true, turns off the auto-add of `#transform=inline` to the url of the image_url, if the model is not a vision model. | +| disable_hf_tokenizer_download | boolean | If true, it defaults to using the openai tokenizer for all models (including huggingface models). | ### general_settings - Reference diff --git a/docs/my-website/docs/proxy/public_teams.md b/docs/my-website/docs/proxy/public_teams.md new file mode 100644 index 0000000000..6ff2258308 --- /dev/null +++ b/docs/my-website/docs/proxy/public_teams.md @@ -0,0 +1,40 @@ +# [BETA] Public Teams + +Expose available teams to your users to join on signup. 
+ + + + +## Quick Start + +1. Create a team on LiteLLM + +```bash +curl -X POST '/team/new' \ +-H 'Content-Type: application/json' \ +-H 'Authorization: Bearer ' \ +-d '{"name": "My Team", "team_id": "team_id_1"}' +``` + +2. Expose the team to your users + +```yaml +litellm_settings: + default_internal_user_params: + available_teams: ["team_id_1"] # 👈 Make team available to new SSO users +``` + +3. Test it! + +```bash +curl -L -X POST 'http://0.0.0.0:4000/team/member_add' \ +-H 'Authorization: Bearer sk-' \ +-H 'Content-Type: application/json' \ +--data-raw '{ + "team_id": "team_id_1", + "member": [{"role": "user", "user_id": "my-test-user"}] +}' +``` + + + diff --git a/docs/my-website/docs/proxy/ui.md b/docs/my-website/docs/proxy/ui.md index f32f8ffa2d..a093b226a2 100644 --- a/docs/my-website/docs/proxy/ui.md +++ b/docs/my-website/docs/proxy/ui.md @@ -6,11 +6,6 @@ import TabItem from '@theme/TabItem'; Create keys, track spend, add models without worrying about the config / CRUD endpoints. -:::info - -This is in beta, so things may change. If you have feedback, [let us know](https://discord.com/invite/wuPM9dRgDw) - -::: diff --git a/docs/my-website/sidebars.js b/docs/my-website/sidebars.js index cda84067ba..41febfd564 100644 --- a/docs/my-website/sidebars.js +++ b/docs/my-website/sidebars.js @@ -98,6 +98,7 @@ const sidebars = { "proxy/ui", "proxy/admin_ui_sso", "proxy/self_serve", + "proxy/public_teams", "proxy/custom_sso" ], }, diff --git a/litellm/__init__.py b/litellm/__init__.py index 76ab021a0a..3032d1b8c6 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -9,7 +9,12 @@ from typing import Callable, List, Optional, Dict, Union, Any, Literal, get_args from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler from litellm.caching.caching import Cache, DualCache, RedisCache, InMemoryCache from litellm.types.llms.bedrock import COHERE_EMBEDDING_INPUT_TYPES -from litellm.types.utils import ImageObject, BudgetConfig +from litellm.types.utils import ( + ImageObject, + BudgetConfig, + all_litellm_params, + all_litellm_params as _litellm_completion_params, +) # maintain backwards compatibility for root param from litellm._logging import ( set_verbose, _turn_on_debug, @@ -29,6 +34,24 @@ from litellm.constants import ( LITELLM_CHAT_PROVIDERS, HUMANLOOP_PROMPT_CACHE_TTL_SECONDS, OPENAI_CHAT_COMPLETION_PARAMS, + OPENAI_CHAT_COMPLETION_PARAMS as _openai_completion_params, # backwards compatibility + OPENAI_FINISH_REASONS, + OPENAI_FINISH_REASONS as _openai_finish_reasons, # backwards compatibility + openai_compatible_endpoints, + openai_compatible_providers, + openai_text_completion_compatible_providers, + _openai_like_providers, + replicate_models, + clarifai_models, + huggingface_models, + empower_models, + together_ai_models, + baseten_models, + REPEATED_STREAMING_CHUNK_LIMIT, + request_timeout, + open_ai_embedding_models, + cohere_embedding_models, + bedrock_embedding_models, ) from litellm.types.guardrails import GuardrailItem from litellm.proxy._types import ( @@ -217,75 +240,8 @@ default_soft_budget: float = ( 50.0 # by default all litellm proxy keys have a soft budget of 50.0 ) forward_traceparent_to_llm_provider: bool = False -_openai_finish_reasons = ["stop", "length", "function_call", "content_filter", "null"] -_openai_completion_params = [ - "functions", - "function_call", - "temperature", - "temperature", - "top_p", - "n", - "stream", - "stop", - "max_tokens", - "presence_penalty", - "frequency_penalty", - "logit_bias", - "user", - "request_timeout", - 
"api_base", - "api_version", - "api_key", - "deployment_id", - "organization", - "base_url", - "default_headers", - "timeout", - "response_format", - "seed", - "tools", - "tool_choice", - "max_retries", -] -_litellm_completion_params = [ - "metadata", - "acompletion", - "caching", - "mock_response", - "api_key", - "api_version", - "api_base", - "force_timeout", - "logger_fn", - "verbose", - "custom_llm_provider", - "litellm_logging_obj", - "litellm_call_id", - "use_client", - "id", - "fallbacks", - "azure", - "headers", - "model_list", - "num_retries", - "context_window_fallback_dict", - "roles", - "final_prompt_value", - "bos_token", - "eos_token", - "request_timeout", - "complete_response", - "self", - "client", - "rpm", - "tpm", - "input_cost_per_token", - "output_cost_per_token", - "hf_model_name", - "model_info", - "proxy_server_request", - "preset_cache_key", -] + + _current_cost = 0.0 # private variable, used if max budget is set error_logs: Dict = {} add_function_to_prompt: bool = ( @@ -318,11 +274,8 @@ disable_end_user_cost_tracking_prometheus_only: Optional[bool] = None custom_prometheus_metadata_labels: List[str] = [] #### REQUEST PRIORITIZATION #### priority_reservation: Optional[Dict[str, float]] = None -#### RELIABILITY #### -REPEATED_STREAMING_CHUNK_LIMIT = 100 # catch if model starts looping the same chunk while streaming. Uses high default to prevent false positives. -#### Networking settings #### -request_timeout: float = 6000 # time in seconds + force_ipv4: bool = ( False # when True, litellm will force ipv4 for all LLM requests. Some users have seen httpx ConnectionError when using ipv6. ) @@ -352,39 +305,7 @@ _key_management_settings: KeyManagementSettings = KeyManagementSettings() #### PII MASKING #### output_parse_pii: bool = False ############################################# - - -def get_model_cost_map(url: str): - if ( - os.getenv("LITELLM_LOCAL_MODEL_COST_MAP", False) == True - or os.getenv("LITELLM_LOCAL_MODEL_COST_MAP", False) == "True" - ): - import importlib.resources - import json - - with importlib.resources.open_text( - "litellm", "model_prices_and_context_window_backup.json" - ) as f: - content = json.load(f) - return content - - try: - response = httpx.get( - url, timeout=5 - ) # set a 5 second timeout for the get request - response.raise_for_status() # Raise an exception if the request is unsuccessful - content = response.json() - return content - except Exception: - import importlib.resources - import json - - with importlib.resources.open_text( - "litellm", "model_prices_and_context_window_backup.json" - ) as f: - content = json.load(f) - return content - +from litellm.litellm_core_utils.get_model_cost_map import get_model_cost_map model_cost = get_model_cost_map(url=model_cost_map_url) custom_prompt_dict: Dict[str, dict] = {} @@ -446,7 +367,6 @@ cohere_chat_models: List = [] mistral_chat_models: List = [] text_completion_codestral_models: List = [] anthropic_models: List = [] -empower_models: List = [] openrouter_models: List = [] vertex_language_models: List = [] vertex_vision_models: List = [] @@ -641,202 +561,8 @@ def add_known_models(): add_known_models() # known openai compatible endpoints - we'll eventually move this list to the model_prices_and_context_window.json dictionary -openai_compatible_endpoints: List = [ - "api.perplexity.ai", - "api.endpoints.anyscale.com/v1", - "api.deepinfra.com/v1/openai", - "api.mistral.ai/v1", - "codestral.mistral.ai/v1/chat/completions", - "codestral.mistral.ai/v1/fim/completions", - "api.groq.com/openai/v1", - 
"https://integrate.api.nvidia.com/v1", - "api.deepseek.com/v1", - "api.together.xyz/v1", - "app.empower.dev/api/v1", - "https://api.friendli.ai/serverless/v1", - "api.sambanova.ai/v1", - "api.x.ai/v1", - "api.galadriel.ai/v1", -] # this is maintained for Exception Mapping -openai_compatible_providers: List = [ - "anyscale", - "mistral", - "groq", - "nvidia_nim", - "cerebras", - "sambanova", - "ai21_chat", - "ai21", - "volcengine", - "codestral", - "deepseek", - "deepinfra", - "perplexity", - "xinference", - "xai", - "together_ai", - "fireworks_ai", - "empower", - "friendliai", - "azure_ai", - "github", - "litellm_proxy", - "hosted_vllm", - "lm_studio", - "galadriel", -] -openai_text_completion_compatible_providers: List = ( - [ # providers that support `/v1/completions` - "together_ai", - "fireworks_ai", - "hosted_vllm", - ] -) -_openai_like_providers: List = [ - "predibase", - "databricks", - "watsonx", -] # private helper. similar to openai but require some custom auth / endpoint handling, so can't use the openai sdk -# well supported replicate llms -replicate_models: List = [ - # llama replicate supported LLMs - "replicate/llama-2-70b-chat:2796ee9483c3fd7aa2e171d38f4ca12251a30609463dcfd4cd76703f22e96cdf", - "a16z-infra/llama-2-13b-chat:2a7f981751ec7fdf87b5b91ad4db53683a98082e9ff7bfd12c8cd5ea85980a52", - "meta/codellama-13b:1c914d844307b0588599b8393480a3ba917b660c7e9dfae681542b5325f228db", - # Vicuna - "replicate/vicuna-13b:6282abe6a492de4145d7bb601023762212f9ddbbe78278bd6771c8b3b2f2a13b", - "joehoover/instructblip-vicuna13b:c4c54e3c8c97cd50c2d2fec9be3b6065563ccf7d43787fb99f84151b867178fe", - # Flan T-5 - "daanelson/flan-t5-large:ce962b3f6792a57074a601d3979db5839697add2e4e02696b3ced4c022d4767f", - # Others - "replicate/dolly-v2-12b:ef0e1aefc61f8e096ebe4db6b2bacc297daf2ef6899f0f7e001ec445893500e5", - "replit/replit-code-v1-3b:b84f4c074b807211cd75e3e8b1589b6399052125b4c27106e43d47189e8415ad", -] - -clarifai_models: List = [ - "clarifai/meta.Llama-3.Llama-3-8B-Instruct", - "clarifai/gcp.generate.gemma-1_1-7b-it", - "clarifai/mistralai.completion.mixtral-8x22B", - "clarifai/cohere.generate.command-r-plus", - "clarifai/databricks.drbx.dbrx-instruct", - "clarifai/mistralai.completion.mistral-large", - "clarifai/mistralai.completion.mistral-medium", - "clarifai/mistralai.completion.mistral-small", - "clarifai/mistralai.completion.mixtral-8x7B-Instruct-v0_1", - "clarifai/gcp.generate.gemma-2b-it", - "clarifai/gcp.generate.gemma-7b-it", - "clarifai/deci.decilm.deciLM-7B-instruct", - "clarifai/mistralai.completion.mistral-7B-Instruct", - "clarifai/gcp.generate.gemini-pro", - "clarifai/anthropic.completion.claude-v1", - "clarifai/anthropic.completion.claude-instant-1_2", - "clarifai/anthropic.completion.claude-instant", - "clarifai/anthropic.completion.claude-v2", - "clarifai/anthropic.completion.claude-2_1", - "clarifai/meta.Llama-2.codeLlama-70b-Python", - "clarifai/meta.Llama-2.codeLlama-70b-Instruct", - "clarifai/openai.completion.gpt-3_5-turbo-instruct", - "clarifai/meta.Llama-2.llama2-7b-chat", - "clarifai/meta.Llama-2.llama2-13b-chat", - "clarifai/meta.Llama-2.llama2-70b-chat", - "clarifai/openai.chat-completion.gpt-4-turbo", - "clarifai/microsoft.text-generation.phi-2", - "clarifai/meta.Llama-2.llama2-7b-chat-vllm", - "clarifai/upstage.solar.solar-10_7b-instruct", - "clarifai/openchat.openchat.openchat-3_5-1210", - "clarifai/togethercomputer.stripedHyena.stripedHyena-Nous-7B", - "clarifai/gcp.generate.text-bison", - "clarifai/meta.Llama-2.llamaGuard-7b", - 
"clarifai/fblgit.una-cybertron.una-cybertron-7b-v2", - "clarifai/openai.chat-completion.GPT-4", - "clarifai/openai.chat-completion.GPT-3_5-turbo", - "clarifai/ai21.complete.Jurassic2-Grande", - "clarifai/ai21.complete.Jurassic2-Grande-Instruct", - "clarifai/ai21.complete.Jurassic2-Jumbo-Instruct", - "clarifai/ai21.complete.Jurassic2-Jumbo", - "clarifai/ai21.complete.Jurassic2-Large", - "clarifai/cohere.generate.cohere-generate-command", - "clarifai/wizardlm.generate.wizardCoder-Python-34B", - "clarifai/wizardlm.generate.wizardLM-70B", - "clarifai/tiiuae.falcon.falcon-40b-instruct", - "clarifai/togethercomputer.RedPajama.RedPajama-INCITE-7B-Chat", - "clarifai/gcp.generate.code-gecko", - "clarifai/gcp.generate.code-bison", - "clarifai/mistralai.completion.mistral-7B-OpenOrca", - "clarifai/mistralai.completion.openHermes-2-mistral-7B", - "clarifai/wizardlm.generate.wizardLM-13B", - "clarifai/huggingface-research.zephyr.zephyr-7B-alpha", - "clarifai/wizardlm.generate.wizardCoder-15B", - "clarifai/microsoft.text-generation.phi-1_5", - "clarifai/databricks.Dolly-v2.dolly-v2-12b", - "clarifai/bigcode.code.StarCoder", - "clarifai/salesforce.xgen.xgen-7b-8k-instruct", - "clarifai/mosaicml.mpt.mpt-7b-instruct", - "clarifai/anthropic.completion.claude-3-opus", - "clarifai/anthropic.completion.claude-3-sonnet", - "clarifai/gcp.generate.gemini-1_5-pro", - "clarifai/gcp.generate.imagen-2", - "clarifai/salesforce.blip.general-english-image-caption-blip-2", -] - - -huggingface_models: List = [ - "meta-llama/Llama-2-7b-hf", - "meta-llama/Llama-2-7b-chat-hf", - "meta-llama/Llama-2-13b-hf", - "meta-llama/Llama-2-13b-chat-hf", - "meta-llama/Llama-2-70b-hf", - "meta-llama/Llama-2-70b-chat-hf", - "meta-llama/Llama-2-7b", - "meta-llama/Llama-2-7b-chat", - "meta-llama/Llama-2-13b", - "meta-llama/Llama-2-13b-chat", - "meta-llama/Llama-2-70b", - "meta-llama/Llama-2-70b-chat", -] # these have been tested on extensively. But by default all text2text-generation and text-generation models are supported by liteLLM. - https://docs.litellm.ai/docs/providers -empower_models = [ - "empower/empower-functions", - "empower/empower-functions-small", -] - -together_ai_models: List = [ - # llama llms - chat - "togethercomputer/llama-2-70b-chat", - # llama llms - language / instruct - "togethercomputer/llama-2-70b", - "togethercomputer/LLaMA-2-7B-32K", - "togethercomputer/Llama-2-7B-32K-Instruct", - "togethercomputer/llama-2-7b", - # falcon llms - "togethercomputer/falcon-40b-instruct", - "togethercomputer/falcon-7b-instruct", - # alpaca - "togethercomputer/alpaca-7b", - # chat llms - "HuggingFaceH4/starchat-alpha", - # code llms - "togethercomputer/CodeLlama-34b", - "togethercomputer/CodeLlama-34b-Instruct", - "togethercomputer/CodeLlama-34b-Python", - "defog/sqlcoder", - "NumbersStation/nsql-llama-2-7B", - "WizardLM/WizardCoder-15B-V1.0", - "WizardLM/WizardCoder-Python-34B-V1.0", - # language llms - "NousResearch/Nous-Hermes-Llama2-13b", - "Austism/chronos-hermes-13b", - "upstage/SOLAR-0-70b-16bit", - "WizardLM/WizardLM-70B-V1.0", -] # supports all together ai models, just pass in the model id e.g. completion(model="together_computer/replit_code_3b",...) 
- - -baseten_models: List = [ - "qvv0xeq", - "q841o8w", - "31dxrj3", -] # FALCON 7B # WizardLM # Mosaic ML # used for Cost Tracking & Token counting @@ -980,20 +706,6 @@ longer_context_model_fallback_dict: dict = { } ####### EMBEDDING MODELS ################### -open_ai_embedding_models: List = ["text-embedding-ada-002"] -cohere_embedding_models: List = [ - "embed-english-v3.0", - "embed-english-light-v3.0", - "embed-multilingual-v3.0", - "embed-english-v2.0", - "embed-english-light-v2.0", - "embed-multilingual-v2.0", -] -bedrock_embedding_models: List = [ - "amazon.titan-embed-text-v1", - "cohere.embed-english-v3", - "cohere.embed-multilingual-v3", -] all_embedding_models = ( open_ai_embedding_models @@ -1277,4 +989,7 @@ custom_provider_map: List[CustomLLMItem] = [] _custom_providers: List[str] = ( [] ) # internal helper util, used to track names of custom providers +disable_hf_tokenizer_download: Optional[bool] = ( + None # disable huggingface tokenizer download. Defaults to openai clk100 +) global_disable_no_log_param: bool = False diff --git a/litellm/constants.py b/litellm/constants.py index 0a3b4ee4c7..36d45060e9 100644 --- a/litellm/constants.py +++ b/litellm/constants.py @@ -1,3 +1,5 @@ +from typing import List + ROUTER_MAX_FALLBACKS = 5 DEFAULT_BATCH_SIZE = 512 DEFAULT_FLUSH_INTERVAL_SECONDS = 5 @@ -12,6 +14,11 @@ DEFAULT_IMAGE_TOKEN_COUNT = 250 DEFAULT_IMAGE_WIDTH = 300 DEFAULT_IMAGE_HEIGHT = 300 SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD = 1000 # Minimum number of requests to consider "reasonable traffic". Used for single-deployment cooldown logic. +#### RELIABILITY #### +REPEATED_STREAMING_CHUNK_LIMIT = 100 # catch if model starts looping the same chunk while streaming. Uses high default to prevent false positives. +#### Networking settings #### +request_timeout: float = 6000 # time in seconds + LITELLM_CHAT_PROVIDERS = [ "openai", "openai_like", @@ -113,6 +120,222 @@ OPENAI_CHAT_COMPLETION_PARAMS = [ "top_logprobs", "extra_headers", ] + +openai_compatible_endpoints: List = [ + "api.perplexity.ai", + "api.endpoints.anyscale.com/v1", + "api.deepinfra.com/v1/openai", + "api.mistral.ai/v1", + "codestral.mistral.ai/v1/chat/completions", + "codestral.mistral.ai/v1/fim/completions", + "api.groq.com/openai/v1", + "https://integrate.api.nvidia.com/v1", + "api.deepseek.com/v1", + "api.together.xyz/v1", + "app.empower.dev/api/v1", + "https://api.friendli.ai/serverless/v1", + "api.sambanova.ai/v1", + "api.x.ai/v1", + "api.galadriel.ai/v1", +] + + +openai_compatible_providers: List = [ + "anyscale", + "mistral", + "groq", + "nvidia_nim", + "cerebras", + "sambanova", + "ai21_chat", + "ai21", + "volcengine", + "codestral", + "deepseek", + "deepinfra", + "perplexity", + "xinference", + "xai", + "together_ai", + "fireworks_ai", + "empower", + "friendliai", + "azure_ai", + "github", + "litellm_proxy", + "hosted_vllm", + "lm_studio", + "galadriel", +] +openai_text_completion_compatible_providers: List = ( + [ # providers that support `/v1/completions` + "together_ai", + "fireworks_ai", + "hosted_vllm", + ] +) +_openai_like_providers: List = [ + "predibase", + "databricks", + "watsonx", +] # private helper. 
similar to openai but require some custom auth / endpoint handling, so can't use the openai sdk +# well supported replicate llms +replicate_models: List = [ + # llama replicate supported LLMs + "replicate/llama-2-70b-chat:2796ee9483c3fd7aa2e171d38f4ca12251a30609463dcfd4cd76703f22e96cdf", + "a16z-infra/llama-2-13b-chat:2a7f981751ec7fdf87b5b91ad4db53683a98082e9ff7bfd12c8cd5ea85980a52", + "meta/codellama-13b:1c914d844307b0588599b8393480a3ba917b660c7e9dfae681542b5325f228db", + # Vicuna + "replicate/vicuna-13b:6282abe6a492de4145d7bb601023762212f9ddbbe78278bd6771c8b3b2f2a13b", + "joehoover/instructblip-vicuna13b:c4c54e3c8c97cd50c2d2fec9be3b6065563ccf7d43787fb99f84151b867178fe", + # Flan T-5 + "daanelson/flan-t5-large:ce962b3f6792a57074a601d3979db5839697add2e4e02696b3ced4c022d4767f", + # Others + "replicate/dolly-v2-12b:ef0e1aefc61f8e096ebe4db6b2bacc297daf2ef6899f0f7e001ec445893500e5", + "replit/replit-code-v1-3b:b84f4c074b807211cd75e3e8b1589b6399052125b4c27106e43d47189e8415ad", +] + +clarifai_models: List = [ + "clarifai/meta.Llama-3.Llama-3-8B-Instruct", + "clarifai/gcp.generate.gemma-1_1-7b-it", + "clarifai/mistralai.completion.mixtral-8x22B", + "clarifai/cohere.generate.command-r-plus", + "clarifai/databricks.drbx.dbrx-instruct", + "clarifai/mistralai.completion.mistral-large", + "clarifai/mistralai.completion.mistral-medium", + "clarifai/mistralai.completion.mistral-small", + "clarifai/mistralai.completion.mixtral-8x7B-Instruct-v0_1", + "clarifai/gcp.generate.gemma-2b-it", + "clarifai/gcp.generate.gemma-7b-it", + "clarifai/deci.decilm.deciLM-7B-instruct", + "clarifai/mistralai.completion.mistral-7B-Instruct", + "clarifai/gcp.generate.gemini-pro", + "clarifai/anthropic.completion.claude-v1", + "clarifai/anthropic.completion.claude-instant-1_2", + "clarifai/anthropic.completion.claude-instant", + "clarifai/anthropic.completion.claude-v2", + "clarifai/anthropic.completion.claude-2_1", + "clarifai/meta.Llama-2.codeLlama-70b-Python", + "clarifai/meta.Llama-2.codeLlama-70b-Instruct", + "clarifai/openai.completion.gpt-3_5-turbo-instruct", + "clarifai/meta.Llama-2.llama2-7b-chat", + "clarifai/meta.Llama-2.llama2-13b-chat", + "clarifai/meta.Llama-2.llama2-70b-chat", + "clarifai/openai.chat-completion.gpt-4-turbo", + "clarifai/microsoft.text-generation.phi-2", + "clarifai/meta.Llama-2.llama2-7b-chat-vllm", + "clarifai/upstage.solar.solar-10_7b-instruct", + "clarifai/openchat.openchat.openchat-3_5-1210", + "clarifai/togethercomputer.stripedHyena.stripedHyena-Nous-7B", + "clarifai/gcp.generate.text-bison", + "clarifai/meta.Llama-2.llamaGuard-7b", + "clarifai/fblgit.una-cybertron.una-cybertron-7b-v2", + "clarifai/openai.chat-completion.GPT-4", + "clarifai/openai.chat-completion.GPT-3_5-turbo", + "clarifai/ai21.complete.Jurassic2-Grande", + "clarifai/ai21.complete.Jurassic2-Grande-Instruct", + "clarifai/ai21.complete.Jurassic2-Jumbo-Instruct", + "clarifai/ai21.complete.Jurassic2-Jumbo", + "clarifai/ai21.complete.Jurassic2-Large", + "clarifai/cohere.generate.cohere-generate-command", + "clarifai/wizardlm.generate.wizardCoder-Python-34B", + "clarifai/wizardlm.generate.wizardLM-70B", + "clarifai/tiiuae.falcon.falcon-40b-instruct", + "clarifai/togethercomputer.RedPajama.RedPajama-INCITE-7B-Chat", + "clarifai/gcp.generate.code-gecko", + "clarifai/gcp.generate.code-bison", + "clarifai/mistralai.completion.mistral-7B-OpenOrca", + "clarifai/mistralai.completion.openHermes-2-mistral-7B", + "clarifai/wizardlm.generate.wizardLM-13B", + "clarifai/huggingface-research.zephyr.zephyr-7B-alpha", + 
"clarifai/wizardlm.generate.wizardCoder-15B", + "clarifai/microsoft.text-generation.phi-1_5", + "clarifai/databricks.Dolly-v2.dolly-v2-12b", + "clarifai/bigcode.code.StarCoder", + "clarifai/salesforce.xgen.xgen-7b-8k-instruct", + "clarifai/mosaicml.mpt.mpt-7b-instruct", + "clarifai/anthropic.completion.claude-3-opus", + "clarifai/anthropic.completion.claude-3-sonnet", + "clarifai/gcp.generate.gemini-1_5-pro", + "clarifai/gcp.generate.imagen-2", + "clarifai/salesforce.blip.general-english-image-caption-blip-2", +] + + +huggingface_models: List = [ + "meta-llama/Llama-2-7b-hf", + "meta-llama/Llama-2-7b-chat-hf", + "meta-llama/Llama-2-13b-hf", + "meta-llama/Llama-2-13b-chat-hf", + "meta-llama/Llama-2-70b-hf", + "meta-llama/Llama-2-70b-chat-hf", + "meta-llama/Llama-2-7b", + "meta-llama/Llama-2-7b-chat", + "meta-llama/Llama-2-13b", + "meta-llama/Llama-2-13b-chat", + "meta-llama/Llama-2-70b", + "meta-llama/Llama-2-70b-chat", +] # these have been tested on extensively. But by default all text2text-generation and text-generation models are supported by liteLLM. - https://docs.litellm.ai/docs/providers +empower_models = [ + "empower/empower-functions", + "empower/empower-functions-small", +] + +together_ai_models: List = [ + # llama llms - chat + "togethercomputer/llama-2-70b-chat", + # llama llms - language / instruct + "togethercomputer/llama-2-70b", + "togethercomputer/LLaMA-2-7B-32K", + "togethercomputer/Llama-2-7B-32K-Instruct", + "togethercomputer/llama-2-7b", + # falcon llms + "togethercomputer/falcon-40b-instruct", + "togethercomputer/falcon-7b-instruct", + # alpaca + "togethercomputer/alpaca-7b", + # chat llms + "HuggingFaceH4/starchat-alpha", + # code llms + "togethercomputer/CodeLlama-34b", + "togethercomputer/CodeLlama-34b-Instruct", + "togethercomputer/CodeLlama-34b-Python", + "defog/sqlcoder", + "NumbersStation/nsql-llama-2-7B", + "WizardLM/WizardCoder-15B-V1.0", + "WizardLM/WizardCoder-Python-34B-V1.0", + # language llms + "NousResearch/Nous-Hermes-Llama2-13b", + "Austism/chronos-hermes-13b", + "upstage/SOLAR-0-70b-16bit", + "WizardLM/WizardLM-70B-V1.0", +] # supports all together ai models, just pass in the model id e.g. completion(model="together_computer/replit_code_3b",...) + + +baseten_models: List = [ + "qvv0xeq", + "q841o8w", + "31dxrj3", +] # FALCON 7B # WizardLM # Mosaic ML + + +open_ai_embedding_models: List = ["text-embedding-ada-002"] +cohere_embedding_models: List = [ + "embed-english-v3.0", + "embed-english-light-v3.0", + "embed-multilingual-v3.0", + "embed-english-v2.0", + "embed-english-light-v2.0", + "embed-multilingual-v2.0", +] +bedrock_embedding_models: List = [ + "amazon.titan-embed-text-v1", + "cohere.embed-english-v3", + "cohere.embed-multilingual-v3", +] + + +OPENAI_FINISH_REASONS = ["stop", "length", "function_call", "content_filter", "null"] HUMANLOOP_PROMPT_CACHE_TTL_SECONDS = 60 # 1 minute RESPONSE_FORMAT_TOOL_NAME = "json_tool_call" # default tool name used when converting response format to tool call diff --git a/litellm/litellm_core_utils/get_model_cost_map.py b/litellm/litellm_core_utils/get_model_cost_map.py new file mode 100644 index 0000000000..b8bdaee19c --- /dev/null +++ b/litellm/litellm_core_utils/get_model_cost_map.py @@ -0,0 +1,45 @@ +""" +Pulls the cost + context window + provider route for known models from https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json + +This can be disabled by setting the LITELLM_LOCAL_MODEL_COST_MAP environment variable to True. 
+ +``` +export LITELLM_LOCAL_MODEL_COST_MAP=True +``` +""" + +import os + +import httpx + + +def get_model_cost_map(url: str): + if ( + os.getenv("LITELLM_LOCAL_MODEL_COST_MAP", False) + or os.getenv("LITELLM_LOCAL_MODEL_COST_MAP", False) == "True" + ): + import importlib.resources + import json + + with importlib.resources.open_text( + "litellm", "model_prices_and_context_window_backup.json" + ) as f: + content = json.load(f) + return content + + try: + response = httpx.get( + url, timeout=5 + ) # set a 5 second timeout for the get request + response.raise_for_status() # Raise an exception if the request is unsuccessful + content = response.json() + return content + except Exception: + import importlib.resources + import json + + with importlib.resources.open_text( + "litellm", "model_prices_and_context_window_backup.json" + ) as f: + content = json.load(f) + return content diff --git a/litellm/proxy/_experimental/out/404.html b/litellm/proxy/_experimental/out/404.html deleted file mode 100644 index 7b62e04c0c..0000000000 --- a/litellm/proxy/_experimental/out/404.html +++ /dev/null @@ -1 +0,0 @@ -404: This page could not be found.LiteLLM Dashboard

404

This page could not be found.

\ No newline at end of file diff --git a/litellm/proxy/_experimental/out/model_hub.html b/litellm/proxy/_experimental/out/model_hub.html deleted file mode 100644 index 0117fa6155..0000000000 --- a/litellm/proxy/_experimental/out/model_hub.html +++ /dev/null @@ -1 +0,0 @@ -LiteLLM Dashboard \ No newline at end of file diff --git a/litellm/proxy/_experimental/out/onboarding.html b/litellm/proxy/_experimental/out/onboarding.html deleted file mode 100644 index 90b5bf2129..0000000000 --- a/litellm/proxy/_experimental/out/onboarding.html +++ /dev/null @@ -1 +0,0 @@ -LiteLLM Dashboard \ No newline at end of file diff --git a/litellm/proxy/management_endpoints/key_management_endpoints.py b/litellm/proxy/management_endpoints/key_management_endpoints.py index 8761e1ac9f..b9b462a4e8 100644 --- a/litellm/proxy/management_endpoints/key_management_endpoints.py +++ b/litellm/proxy/management_endpoints/key_management_endpoints.py @@ -61,6 +61,7 @@ def _get_user_in_team( for member in team_table.members_with_roles: if member.user_id is not None and member.user_id == user_id: return member + return None @@ -366,6 +367,7 @@ async def generate_key_fn( # noqa: PLR0915 prisma_client=prisma_client, user_api_key_cache=user_api_key_cache, parent_otel_span=user_api_key_dict.parent_otel_span, + check_db_only=True, ) except Exception as e: verbose_proxy_logger.debug( diff --git a/litellm/types/utils.py b/litellm/types/utils.py index a1c19dab1b..559588b7c7 100644 --- a/litellm/types/utils.py +++ b/litellm/types/utils.py @@ -1889,3 +1889,8 @@ class HttpHandlerRequestFields(TypedDict, total=False): class ProviderSpecificHeader(TypedDict): custom_llm_provider: str extra_headers: dict + + +class SelectTokenizerResponse(TypedDict): + type: Literal["openai_tokenizer", "huggingface_tokenizer"] + tokenizer: Any diff --git a/litellm/utils.py b/litellm/utils.py index b1e683113a..5396e008f0 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -150,6 +150,7 @@ from litellm.types.utils import ( ModelResponseStream, ProviderField, ProviderSpecificModelInfo, + SelectTokenizerResponse, StreamingChoices, TextChoices, TextCompletionResponse, @@ -1440,34 +1441,47 @@ def _select_tokenizer( @lru_cache(maxsize=128) -def _select_tokenizer_helper(model: str): +def _select_tokenizer_helper(model: str) -> SelectTokenizerResponse: + + if litellm.disable_hf_tokenizer_download is True: + return _return_openai_tokenizer(model) + try: - if model in litellm.cohere_models and "command-r" in model: - # cohere - cohere_tokenizer = Tokenizer.from_pretrained( - "Xenova/c4ai-command-r-v01-tokenizer" - ) - return {"type": "huggingface_tokenizer", "tokenizer": cohere_tokenizer} - # anthropic - elif model in litellm.anthropic_models and "claude-3" not in model: - claude_tokenizer = Tokenizer.from_str(claude_json_str) - return {"type": "huggingface_tokenizer", "tokenizer": claude_tokenizer} - # llama2 - elif "llama-2" in model.lower() or "replicate" in model.lower(): - tokenizer = Tokenizer.from_pretrained("hf-internal-testing/llama-tokenizer") - return {"type": "huggingface_tokenizer", "tokenizer": tokenizer} - # llama3 - elif "llama-3" in model.lower(): - tokenizer = Tokenizer.from_pretrained("Xenova/llama-3-tokenizer") - return {"type": "huggingface_tokenizer", "tokenizer": tokenizer} + result = _return_huggingface_tokenizer(model) + if result is not None: + return result except Exception as e: verbose_logger.debug(f"Error selecting tokenizer: {e}") # default - tiktoken - return { - "type": "openai_tokenizer", - "tokenizer": encoding, - } # default to 
openai tokenizer + return _return_openai_tokenizer(model) + + +def _return_openai_tokenizer(model: str) -> SelectTokenizerResponse: + return {"type": "openai_tokenizer", "tokenizer": encoding} + + +def _return_huggingface_tokenizer(model: str) -> Optional[SelectTokenizerResponse]: + if model in litellm.cohere_models and "command-r" in model: + # cohere + cohere_tokenizer = Tokenizer.from_pretrained( + "Xenova/c4ai-command-r-v01-tokenizer" + ) + return {"type": "huggingface_tokenizer", "tokenizer": cohere_tokenizer} + # anthropic + elif model in litellm.anthropic_models and "claude-3" not in model: + claude_tokenizer = Tokenizer.from_str(claude_json_str) + return {"type": "huggingface_tokenizer", "tokenizer": claude_tokenizer} + # llama2 + elif "llama-2" in model.lower() or "replicate" in model.lower(): + tokenizer = Tokenizer.from_pretrained("hf-internal-testing/llama-tokenizer") + return {"type": "huggingface_tokenizer", "tokenizer": tokenizer} + # llama3 + elif "llama-3" in model.lower(): + tokenizer = Tokenizer.from_pretrained("Xenova/llama-3-tokenizer") + return {"type": "huggingface_tokenizer", "tokenizer": tokenizer} + else: + return None def encode(model="", text="", custom_tokenizer: Optional[dict] = None): diff --git a/tests/local_testing/test_amazing_vertex_completion.py b/tests/local_testing/test_amazing_vertex_completion.py index c8e66f767b..0fd82ad7bf 100644 --- a/tests/local_testing/test_amazing_vertex_completion.py +++ b/tests/local_testing/test_amazing_vertex_completion.py @@ -450,7 +450,7 @@ async def test_async_vertexai_response(): or "32k" in model or "ultra" in model or "002" in model - or "gemini-2.0-flash-thinking-exp" == model + or "gemini-2.0-flash-thinking-exp" in model ): # our account does not have access to this model continue @@ -492,7 +492,11 @@ async def test_async_vertexai_streaming_response(): test_models += litellm.vertex_language_models # always test gemini-pro for model in test_models: if model in VERTEX_MODELS_TO_NOT_TEST or ( - "gecko" in model or "32k" in model or "ultra" in model or "002" in model + "gecko" in model + or "32k" in model + or "ultra" in model + or "002" in model + or "gemini-2.0-flash-thinking-exp" in model ): # our account does not have access to this model continue diff --git a/tests/local_testing/test_token_counter.py b/tests/local_testing/test_token_counter.py index e1e2c36e9f..d572fa8014 100644 --- a/tests/local_testing/test_token_counter.py +++ b/tests/local_testing/test_token_counter.py @@ -459,3 +459,14 @@ class TestTokenizerSelection(unittest.TestCase): # Verify fallback to OpenAI tokenizer self.assertEqual(result["type"], "openai_tokenizer") self.assertEqual(result["tokenizer"], encoding) + + @patch("litellm.utils._return_huggingface_tokenizer") + def test_disable_hf_tokenizer_download(self, mock_return_huggingface_tokenizer): + # Use pytest.MonkeyPatch() directly instead of fixture + monkeypatch = pytest.MonkeyPatch() + monkeypatch.setattr(litellm, "disable_hf_tokenizer_download", True) + + result = _select_tokenizer_helper("grok-32r22r") + mock_return_huggingface_tokenizer.assert_not_called() + assert result["type"] == "openai_tokenizer" + assert result["tokenizer"] == encoding diff --git a/tests/proxy_admin_ui_tests/test_key_management.py b/tests/proxy_admin_ui_tests/test_key_management.py index f443d29715..12c50ac0cc 100644 --- a/tests/proxy_admin_ui_tests/test_key_management.py +++ b/tests/proxy_admin_ui_tests/test_key_management.py @@ -8,6 +8,7 @@ from datetime import datetime from dotenv import load_dotenv from fastapi 
import Request from fastapi.routing import APIRoute +from unittest.mock import MagicMock, patch load_dotenv() import io @@ -988,3 +989,28 @@ async def test_list_key_helper(prisma_client): user_id="admin", ), ) + + +@pytest.mark.asyncio +@patch("litellm.proxy.management_endpoints.key_management_endpoints.get_team_object") +async def test_key_generate_always_db_team(mock_get_team_object): + from litellm.proxy.management_endpoints.key_management_endpoints import ( + generate_key_fn, + ) + + setattr(litellm.proxy.proxy_server, "prisma_client", MagicMock()) + mock_get_team_object.return_value = None + try: + await generate_key_fn( + data=GenerateKeyRequest(team_id="1234"), + user_api_key_dict=UserAPIKeyAuth( + user_role=LitellmUserRoles.PROXY_ADMIN, + api_key="sk-1234", + user_id="admin", + ), + ) + except Exception as e: + print(f"Error: {e}") + + mock_get_team_object.assert_called_once() + assert mock_get_team_object.call_args.kwargs["check_db_only"] == True
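---

The snippet below is an illustrative sketch (not part of the patch) of the new `disable_hf_tokenizer_download` flag; it mirrors the unit test added in `tests/local_testing/test_token_counter.py`, and the model name is just an example Hugging Face-style id.

```python
# Sketch: force the OpenAI (tiktoken) tokenizer even for Hugging Face-style
# model names, so no tokenizer download is attempted in offline environments.
import litellm
from litellm.utils import _select_tokenizer_helper

# Set the flag before the first lookup: _select_tokenizer_helper is wrapped in
# @lru_cache, so an earlier result for the same model would be reused.
litellm.disable_hf_tokenizer_download = True

result = _select_tokenizer_helper("meta-llama/Llama-2-7b-chat-hf")
assert result["type"] == "openai_tokenizer"  # Hugging Face tokenizer path is skipped
```

On the proxy, the same behavior is enabled via the `disable_hf_tokenizer_download` setting documented in `config_settings.md` above.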
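Also for reference, a minimal sketch of the relocated `litellm/litellm_core_utils/get_model_cost_map.py` helper running fully offline, as described in its docstring. It assumes `model_cost_map_url` is the module-level URL that `litellm/__init__.py` passes to this function.

```python
# Sketch: load the model cost map from the packaged backup JSON instead of
# fetching model_prices_and_context_window.json over the network.
import os

# Must be set before importing litellm, which builds `model_cost` at import time.
os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"

import litellm
from litellm.litellm_core_utils.get_model_cost_map import get_model_cost_map

cost_map = get_model_cost_map(url=litellm.model_cost_map_url)
print(f"{len(cost_map)} entries loaded from model_prices_and_context_window_backup.json")
```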
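Finally, a hedged Python equivalent of the curl calls in the new `public_teams.md` page, for readers who prefer a script. It assumes a proxy running locally on port 4000 and an admin key exported as `LITELLM_MASTER_KEY`, and reuses the endpoints and payloads shown in the doc.

```python
# Sketch: create a team, then add a test user to it, against a local LiteLLM proxy.
import os

import httpx

BASE_URL = "http://0.0.0.0:4000"
HEADERS = {
    "Authorization": f"Bearer {os.environ['LITELLM_MASTER_KEY']}",
    "Content-Type": "application/json",
}

# 1. Create the team that the config later exposes via `available_teams`.
httpx.post(
    f"{BASE_URL}/team/new",
    headers=HEADERS,
    json={"name": "My Team", "team_id": "team_id_1"},
).raise_for_status()

# 2. Add a member, matching the `/team/member_add` curl example in the doc.
resp = httpx.post(
    f"{BASE_URL}/team/member_add",
    headers=HEADERS,
    json={"team_id": "team_id_1", "member": [{"role": "user", "user_id": "my-test-user"}]},
)
resp.raise_for_status()
print(resp.json())
```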