diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 000000000..51c578971 --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,47 @@ + + + +## Title + + + +## Relevant issues + + + +## Type + + + + +๐Ÿ†• New Feature +๐Ÿ› Bug Fix +๐Ÿงน Refactoring +๐Ÿ“– Documentation +๐Ÿ’ป Development Environment +๐Ÿš„ Infrastructure +โœ… Test + +## Changes + + + +## Testing + + + +## Notes + + + + + +## Pre-Submission Checklist (optional but appreciated): + +- [ ] I have included relevant documentation updates (stored in /docs/my-website) + +## OS Tests (optional but appreciated): + +- [ ] Tested on Windows +- [ ] Tested on MacOS +- [ ] Tested on Linux diff --git a/README.md b/README.md index 38a166935..9344c0f22 100644 --- a/README.md +++ b/README.md @@ -248,7 +248,7 @@ Step 2: Navigate into the project, and install dependencies: ``` cd litellm -poetry install +poetry install -E extra_proxy -E proxy ``` Step 3: Test your change: diff --git a/docs/my-website/docs/completion/token_usage.md b/docs/my-website/docs/completion/token_usage.md index 626973c57..807ccfd91 100644 --- a/docs/my-website/docs/completion/token_usage.md +++ b/docs/my-website/docs/completion/token_usage.md @@ -1,7 +1,7 @@ # Completion Token Usage & Cost By default LiteLLM returns token usage in all completion requests ([See here](https://litellm.readthedocs.io/en/latest/output/)) -However, we also expose 5 helper functions + **[NEW]** an API to calculate token usage across providers: +However, we also expose some helper functions + **[NEW]** an API to calculate token usage across providers: - `encode`: This encodes the text passed in, using the model-specific tokenizer. [**Jump to code**](#1-encode) @@ -9,17 +9,19 @@ However, we also expose 5 helper functions + **[NEW]** an API to calculate token - `token_counter`: This returns the number of tokens for a given input - it uses the tokenizer based on the model, and defaults to tiktoken if no model-specific tokenizer is available. [**Jump to code**](#3-token_counter) -- `cost_per_token`: This returns the cost (in USD) for prompt (input) and completion (output) tokens. Uses the live list from `api.litellm.ai`. [**Jump to code**](#4-cost_per_token) +- `create_pretrained_tokenizer` and `create_tokenizer`: LiteLLM provides default tokenizer support for OpenAI, Cohere, Anthropic, Llama2, and Llama3 models. If you are using a different model, you can create a custom tokenizer and pass it as `custom_tokenizer` to the `encode`, `decode`, and `token_counter` methods. [**Jump to code**](#4-create_pretrained_tokenizer-and-create_tokenizer) -- `completion_cost`: This returns the overall cost (in USD) for a given LLM API Call. It combines `token_counter` and `cost_per_token` to return the cost for that query (counting both cost of input and output). [**Jump to code**](#5-completion_cost) +- `cost_per_token`: This returns the cost (in USD) for prompt (input) and completion (output) tokens. Uses the live list from `api.litellm.ai`. [**Jump to code**](#5-cost_per_token) -- `get_max_tokens`: This returns the maximum number of tokens allowed for the given model. [**Jump to code**](#6-get_max_tokens) +- `completion_cost`: This returns the overall cost (in USD) for a given LLM API Call. It combines `token_counter` and `cost_per_token` to return the cost for that query (counting both cost of input and output). 
[**Jump to code**](#6-completion_cost) -- `model_cost`: This returns a dictionary for all models, with their max_tokens, input_cost_per_token and output_cost_per_token. It uses the `api.litellm.ai` call shown below. [**Jump to code**](#7-model_cost) +- `get_max_tokens`: This returns the maximum number of tokens allowed for the given model. [**Jump to code**](#7-get_max_tokens) -- `register_model`: This registers new / overrides existing models (and their pricing details) in the model cost dictionary. [**Jump to code**](#8-register_model) +- `model_cost`: This returns a dictionary for all models, with their max_tokens, input_cost_per_token and output_cost_per_token. It uses the `api.litellm.ai` call shown below. [**Jump to code**](#8-model_cost) -- `api.litellm.ai`: Live token + price count across [all supported models](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json). [**Jump to code**](#9-apilitellmai) +- `register_model`: This registers new / overrides existing models (and their pricing details) in the model cost dictionary. [**Jump to code**](#9-register_model) + +- `api.litellm.ai`: Live token + price count across [all supported models](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json). [**Jump to code**](#10-apilitellmai) ๐Ÿ“ฃ This is a community maintained list. Contributions are welcome! โค๏ธ @@ -60,7 +62,24 @@ messages = [{"user": "role", "content": "Hey, how's it going"}] print(token_counter(model="gpt-3.5-turbo", messages=messages)) ``` -### 4. `cost_per_token` +### 4. `create_pretrained_tokenizer` and `create_tokenizer` + +```python +from litellm import create_pretrained_tokenizer, create_tokenizer + +# get tokenizer from huggingface repo +custom_tokenizer_1 = create_pretrained_tokenizer("Xenova/llama-3-tokenizer") + +# use tokenizer from json file +with open("tokenizer.json") as f: + json_data = json.load(f) + +json_str = json.dumps(json_data) + +custom_tokenizer_2 = create_tokenizer(json_str) +``` + +### 5. `cost_per_token` ```python from litellm import cost_per_token @@ -72,7 +91,7 @@ prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar = cost_per_toke print(prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar) ``` -### 5. `completion_cost` +### 6. `completion_cost` * Input: Accepts a `litellm.completion()` response **OR** prompt + completion strings * Output: Returns a `float` of cost for the `completion` call @@ -99,7 +118,7 @@ cost = completion_cost(model="bedrock/anthropic.claude-v2", prompt="Hey!", compl formatted_string = f"${float(cost):.10f}" print(formatted_string) ``` -### 6. `get_max_tokens` +### 7. `get_max_tokens` Input: Accepts a model name - e.g., gpt-3.5-turbo (to get a complete list, call litellm.model_list). Output: Returns the maximum number of tokens allowed for the given model @@ -112,7 +131,7 @@ model = "gpt-3.5-turbo" print(get_max_tokens(model)) # Output: 4097 ``` -### 7. `model_cost` +### 8. `model_cost` * Output: Returns a dict object containing the max_tokens, input_cost_per_token, output_cost_per_token for all models on [community-maintained list](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json) @@ -122,7 +141,7 @@ from litellm import model_cost print(model_cost) # {'gpt-3.5-turbo': {'max_tokens': 4000, 'input_cost_per_token': 1.5e-06, 'output_cost_per_token': 2e-06}, ...} ``` -### 8. `register_model` +### 9. 
`register_model` * Input: Provide EITHER a model cost dictionary or a url to a hosted json blob * Output: Returns updated model_cost dictionary + updates litellm.model_cost with model details. @@ -157,5 +176,3 @@ export LITELLM_LOCAL_MODEL_COST_MAP="True" ``` Note: this means you will need to upgrade to get updated pricing, and newer models. - - diff --git a/docs/my-website/docs/observability/greenscale_integration.md b/docs/my-website/docs/observability/greenscale_integration.md index 8fc2b7ea3..0dd673226 100644 --- a/docs/my-website/docs/observability/greenscale_integration.md +++ b/docs/my-website/docs/observability/greenscale_integration.md @@ -1,4 +1,4 @@ -# Greenscale Tutorial +# Greenscale - Track LLM Spend and Responsible Usage [Greenscale](https://greenscale.ai/) is a production monitoring platform for your LLM-powered app that provides you granular key insights into your GenAI spending and responsible usage. Greenscale only captures metadata to minimize the exposure risk of personally identifiable information (PII). diff --git a/docs/my-website/sidebars.js b/docs/my-website/sidebars.js index ef2ddb57e..f5777d6e7 100644 --- a/docs/my-website/sidebars.js +++ b/docs/my-website/sidebars.js @@ -178,6 +178,7 @@ const sidebars = { "observability/traceloop_integration", "observability/athina_integration", "observability/lunary_integration", + "observability/greenscale_integration", "observability/helicone_integration", "observability/supabase_integration", `observability/telemetry`, diff --git a/litellm-js/spend-logs/package-lock.json b/litellm-js/spend-logs/package-lock.json index ef8cb1da0..cb4b599d3 100644 --- a/litellm-js/spend-logs/package-lock.json +++ b/litellm-js/spend-logs/package-lock.json @@ -5,7 +5,7 @@ "packages": { "": { "dependencies": { - "@hono/node-server": "^1.9.0", + "@hono/node-server": "^1.10.1", "hono": "^4.2.7" }, "devDependencies": { @@ -382,9 +382,9 @@ } }, "node_modules/@hono/node-server": { - "version": "1.9.0", - "resolved": "https://registry.npmjs.org/@hono/node-server/-/node-server-1.9.0.tgz", - "integrity": "sha512-oJjk7WXBlENeHhWiMqSyxPIZ3Kmf5ZYxqdlcSIXyN8Rn50bNJsPl99G4POBS03Jxh56FdfRJ0SEnC8mAVIiavQ==", + "version": "1.10.1", + "resolved": "https://registry.npmjs.org/@hono/node-server/-/node-server-1.10.1.tgz", + "integrity": "sha512-5BKW25JH5PQKPDkTcIgv3yNUPtOAbnnjFFgWvIxxAY/B/ZNeYjjWoAeDmqhIiCgOAJ3Tauuw+0G+VainhuZRYQ==", "engines": { "node": ">=18.14.1" } diff --git a/litellm-js/spend-logs/package.json b/litellm-js/spend-logs/package.json index 92839a01b..d9543220b 100644 --- a/litellm-js/spend-logs/package.json +++ b/litellm-js/spend-logs/package.json @@ -3,7 +3,7 @@ "dev": "tsx watch src/index.ts" }, "dependencies": { - "@hono/node-server": "^1.9.0", + "@hono/node-server": "^1.10.1", "hono": "^4.2.7" }, "devDependencies": { diff --git a/litellm/__init__.py b/litellm/__init__.py index 5cc4d2316..dc640f0e9 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -542,7 +542,11 @@ models_by_provider: dict = { "together_ai": together_ai_models, "baseten": baseten_models, "openrouter": openrouter_models, - "vertex_ai": vertex_chat_models + vertex_text_models, + "vertex_ai": vertex_chat_models + + vertex_text_models + + vertex_anthropic_models + + vertex_vision_models + + vertex_language_models, "ai21": ai21_models, "bedrock": bedrock_models, "petals": petals_models, @@ -609,6 +613,8 @@ from .utils import ( get_optional_params, modify_integration, token_counter, + create_pretrained_tokenizer, + create_tokenizer, cost_per_token, completion_cost, 
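A minimal usage sketch of the tokenizer helpers these hunks export (illustrative only, not part of the patch; it assumes the `Xenova/llama-3-tokenizer` Hugging Face repo referenced elsewhere in this PR is reachable):

```python
import litellm

# Build a reusable tokenizer dict once from a Hugging Face repo's tokenizer.json.
llama3_tokenizer = litellm.create_pretrained_tokenizer("Xenova/llama-3-tokenizer")

# token_counter (and encode/decode) accept it via `custom_tokenizer` instead of a model name.
print(litellm.token_counter(custom_tokenizer=llama3_tokenizer, text="Hey, how's it going?"))
```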
supports_function_calling, diff --git a/litellm/integrations/openmeter.py b/litellm/integrations/openmeter.py index 2ed551c8d..248b83f4d 100644 --- a/litellm/integrations/openmeter.py +++ b/litellm/integrations/openmeter.py @@ -38,7 +38,7 @@ class OpenMeterLogger(CustomLogger): in the environment """ missing_keys = [] - if litellm.get_secret("OPENMETER_API_KEY", None) is None: + if os.getenv("OPENMETER_API_KEY", None) is None: missing_keys.append("OPENMETER_API_KEY") if len(missing_keys) > 0: @@ -71,15 +71,13 @@ class OpenMeterLogger(CustomLogger): } def log_success_event(self, kwargs, response_obj, start_time, end_time): - _url = litellm.get_secret( - "OPENMETER_API_ENDPOINT", default_value="https://openmeter.cloud" - ) + _url = os.getenv("OPENMETER_API_ENDPOINT", "https://openmeter.cloud") if _url.endswith("/"): _url += "api/v1/events" else: _url += "/api/v1/events" - api_key = litellm.get_secret("OPENMETER_API_KEY") + api_key = os.getenv("OPENMETER_API_KEY") _data = self._common_logic(kwargs=kwargs, response_obj=response_obj) self.sync_http_handler.post( @@ -92,15 +90,13 @@ class OpenMeterLogger(CustomLogger): ) async def async_log_success_event(self, kwargs, response_obj, start_time, end_time): - _url = litellm.get_secret( - "OPENMETER_API_ENDPOINT", default_value="https://openmeter.cloud" - ) + _url = os.getenv("OPENMETER_API_ENDPOINT", "https://openmeter.cloud") if _url.endswith("/"): _url += "api/v1/events" else: _url += "/api/v1/events" - api_key = litellm.get_secret("OPENMETER_API_KEY") + api_key = os.getenv("OPENMETER_API_KEY") _data = self._common_logic(kwargs=kwargs, response_obj=response_obj) _headers = { @@ -117,7 +113,6 @@ class OpenMeterLogger(CustomLogger): response.raise_for_status() except Exception as e: - print(f"\nAn Exception Occurred - {str(e)}") if hasattr(response, "text"): - print(f"\nError Message: {response.text}") + litellm.print_verbose(f"\nError Message: {response.text}") raise e diff --git a/litellm/integrations/slack_alerting.py b/litellm/integrations/slack_alerting.py index 8f8ce712e..a9aba2f1c 100644 --- a/litellm/integrations/slack_alerting.py +++ b/litellm/integrations/slack_alerting.py @@ -48,19 +48,6 @@ class SlackAlerting: self.internal_usage_cache = DualCache() self.async_http_handler = AsyncHTTPHandler() self.alert_to_webhook_url = alert_to_webhook_url - self.langfuse_logger = None - - try: - from litellm.integrations.langfuse import LangFuseLogger - - self.langfuse_logger = LangFuseLogger( - os.getenv("LANGFUSE_PUBLIC_KEY"), - os.getenv("LANGFUSE_SECRET_KEY"), - flush_interval=1, - ) - except: - pass - pass def update_values( @@ -110,62 +97,8 @@ class SlackAlerting: start_time: Optional[datetime.datetime] = None, end_time: Optional[datetime.datetime] = None, ): - import uuid - - # For now: do nothing as we're debugging why this is not working as expected - if request_data is not None: - trace_id = request_data.get("metadata", {}).get( - "trace_id", None - ) # get langfuse trace id - if trace_id is None: - trace_id = "litellm-alert-trace-" + str(uuid.uuid4()) - request_data["metadata"]["trace_id"] = trace_id - elif kwargs is not None: - _litellm_params = kwargs.get("litellm_params", {}) - trace_id = _litellm_params.get("metadata", {}).get( - "trace_id", None - ) # get langfuse trace id - if trace_id is None: - trace_id = "litellm-alert-trace-" + str(uuid.uuid4()) - _litellm_params["metadata"]["trace_id"] = trace_id - - # Log hanging request as an error on langfuse - if type == "hanging_request": - if self.langfuse_logger is not None: - 
_logging_kwargs = copy.deepcopy(request_data) - if _logging_kwargs is None: - _logging_kwargs = {} - _logging_kwargs["litellm_params"] = {} - request_data = request_data or {} - _logging_kwargs["litellm_params"]["metadata"] = request_data.get( - "metadata", {} - ) - # log to langfuse in a separate thread - import threading - - threading.Thread( - target=self.langfuse_logger.log_event, - args=( - _logging_kwargs, - None, - start_time, - end_time, - None, - print, - "ERROR", - "Requests is hanging", - ), - ).start() - - _langfuse_host = os.environ.get("LANGFUSE_HOST", "https://cloud.langfuse.com") - _langfuse_project_id = os.environ.get("LANGFUSE_PROJECT_ID") - - # langfuse urls look like: https://us.cloud.langfuse.com/project/************/traces/litellm-alert-trace-ididi9dk-09292-************ - - _langfuse_url = ( - f"{_langfuse_host}/project/{_langfuse_project_id}/traces/{trace_id}" - ) - request_info += f"\n๐Ÿชข Langfuse Trace: {_langfuse_url}" + # do nothing for now + pass return request_info def _response_taking_too_long_callback( @@ -242,10 +175,6 @@ class SlackAlerting: request_info = f"\nRequest Model: `{model}`\nAPI Base: `{api_base}`\nMessages: `{messages}`" slow_message = f"`Responses are slow - {round(time_difference_float,2)}s response time > Alerting threshold: {self.alerting_threshold}s`" if time_difference_float > self.alerting_threshold: - if "langfuse" in litellm.success_callback: - request_info = self._add_langfuse_trace_id_to_alert( - request_info=request_info, kwargs=kwargs, type="slow_response" - ) # add deployment latencies to alert if ( kwargs is not None diff --git a/litellm/main.py b/litellm/main.py index 8fc07b9bf..98295de72 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -34,6 +34,8 @@ from litellm.utils import ( async_mock_completion_streaming_obj, convert_to_model_response_object, token_counter, + create_pretrained_tokenizer, + create_tokenizer, Usage, get_optional_params_embeddings, get_optional_params_image_gen, diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json index ce6f9b800..7fcd425bb 100644 --- a/litellm/model_prices_and_context_window_backup.json +++ b/litellm/model_prices_and_context_window_backup.json @@ -338,6 +338,18 @@ "output_cost_per_second": 0.0001, "litellm_provider": "azure" }, + "azure/gpt-4-turbo-2024-04-09": { + "max_tokens": 4096, + "max_input_tokens": 128000, + "max_output_tokens": 4096, + "input_cost_per_token": 0.00001, + "output_cost_per_token": 0.00003, + "litellm_provider": "azure", + "mode": "chat", + "supports_function_calling": true, + "supports_parallel_function_calling": true, + "supports_vision": true + }, "azure/gpt-4-0125-preview": { "max_tokens": 4096, "max_input_tokens": 128000, @@ -813,6 +825,7 @@ "litellm_provider": "anthropic", "mode": "chat", "supports_function_calling": true, + "supports_vision": true, "tool_use_system_prompt_tokens": 264 }, "claude-3-opus-20240229": { @@ -824,6 +837,7 @@ "litellm_provider": "anthropic", "mode": "chat", "supports_function_calling": true, + "supports_vision": true, "tool_use_system_prompt_tokens": 395 }, "claude-3-sonnet-20240229": { @@ -835,6 +849,7 @@ "litellm_provider": "anthropic", "mode": "chat", "supports_function_calling": true, + "supports_vision": true, "tool_use_system_prompt_tokens": 159 }, "text-bison": { @@ -1142,7 +1157,8 @@ "output_cost_per_token": 0.000015, "litellm_provider": "vertex_ai-anthropic_models", "mode": "chat", - "supports_function_calling": true + "supports_function_calling": true, + 
"supports_vision": true }, "vertex_ai/claude-3-haiku@20240307": { "max_tokens": 4096, @@ -1152,7 +1168,8 @@ "output_cost_per_token": 0.00000125, "litellm_provider": "vertex_ai-anthropic_models", "mode": "chat", - "supports_function_calling": true + "supports_function_calling": true, + "supports_vision": true }, "vertex_ai/claude-3-opus@20240229": { "max_tokens": 4096, @@ -1162,7 +1179,8 @@ "output_cost_per_token": 0.0000075, "litellm_provider": "vertex_ai-anthropic_models", "mode": "chat", - "supports_function_calling": true + "supports_function_calling": true, + "supports_vision": true }, "textembedding-gecko": { "max_tokens": 3072, @@ -1581,6 +1599,7 @@ "litellm_provider": "openrouter", "mode": "chat", "supports_function_calling": true, + "supports_vision": true, "tool_use_system_prompt_tokens": 395 }, "openrouter/google/palm-2-chat-bison": { @@ -1929,7 +1948,8 @@ "output_cost_per_token": 0.000015, "litellm_provider": "bedrock", "mode": "chat", - "supports_function_calling": true + "supports_function_calling": true, + "supports_vision": true }, "anthropic.claude-3-haiku-20240307-v1:0": { "max_tokens": 4096, @@ -1939,7 +1959,8 @@ "output_cost_per_token": 0.00000125, "litellm_provider": "bedrock", "mode": "chat", - "supports_function_calling": true + "supports_function_calling": true, + "supports_vision": true }, "anthropic.claude-3-opus-20240229-v1:0": { "max_tokens": 4096, @@ -1949,7 +1970,8 @@ "output_cost_per_token": 0.000075, "litellm_provider": "bedrock", "mode": "chat", - "supports_function_calling": true + "supports_function_calling": true, + "supports_vision": true }, "anthropic.claude-v1": { "max_tokens": 8191, diff --git a/litellm/proxy/_super_secret_config.yaml b/litellm/proxy/_super_secret_config.yaml index 9f2f6ec17..d90fb13fd 100644 --- a/litellm/proxy/_super_secret_config.yaml +++ b/litellm/proxy/_super_secret_config.yaml @@ -11,5 +11,12 @@ router_settings: redis_password: os.environ/REDIS_PASSWORD redis_port: os.environ/REDIS_PORT +router_settings: + routing_strategy: "latency-based-routing" + litellm_settings: - success_callback: ["openmeter"] \ No newline at end of file + success_callback: ["openmeter"] + +general_settings: + alerting: ["slack"] + alert_types: ["llm_exceptions"] \ No newline at end of file diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index 9cc871966..26987f478 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -3446,172 +3446,6 @@ def model_list( ) -@router.post( - "/v1/completions", dependencies=[Depends(user_api_key_auth)], tags=["completions"] -) -@router.post( - "/completions", dependencies=[Depends(user_api_key_auth)], tags=["completions"] -) -@router.post( - "/engines/{model:path}/completions", - dependencies=[Depends(user_api_key_auth)], - tags=["completions"], -) -@router.post( - "/openai/deployments/{model:path}/completions", - dependencies=[Depends(user_api_key_auth)], - tags=["completions"], -) -async def completion( - request: Request, - fastapi_response: Response, - model: Optional[str] = None, - user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth), -): - global user_temperature, user_request_timeout, user_max_tokens, user_api_base - try: - body = await request.body() - body_str = body.decode() - try: - data = ast.literal_eval(body_str) - except: - data = json.loads(body_str) - - data["user"] = data.get("user", user_api_key_dict.user_id) - data["model"] = ( - general_settings.get("completion_model", None) # server default - or user_model # model name passed via cli args - 
or model # for azure deployments - or data["model"] # default passed in http request - ) - if user_model: - data["model"] = user_model - if "metadata" not in data: - data["metadata"] = {} - data["metadata"]["user_api_key"] = user_api_key_dict.api_key - data["metadata"]["user_api_key_metadata"] = user_api_key_dict.metadata - data["metadata"]["user_api_key_alias"] = getattr( - user_api_key_dict, "key_alias", None - ) - data["metadata"]["user_api_key_user_id"] = user_api_key_dict.user_id - data["metadata"]["user_api_key_team_id"] = getattr( - user_api_key_dict, "team_id", None - ) - data["metadata"]["user_api_key_team_alias"] = getattr( - user_api_key_dict, "team_alias", None - ) - _headers = dict(request.headers) - _headers.pop( - "authorization", None - ) # do not store the original `sk-..` api key in the db - data["metadata"]["headers"] = _headers - data["metadata"]["endpoint"] = str(request.url) - - # override with user settings, these are params passed via cli - if user_temperature: - data["temperature"] = user_temperature - if user_request_timeout: - data["request_timeout"] = user_request_timeout - if user_max_tokens: - data["max_tokens"] = user_max_tokens - if user_api_base: - data["api_base"] = user_api_base - - ### MODEL ALIAS MAPPING ### - # check if model name in model alias map - # get the actual model name - if data["model"] in litellm.model_alias_map: - data["model"] = litellm.model_alias_map[data["model"]] - - ### CALL HOOKS ### - modify incoming data before calling the model - data = await proxy_logging_obj.pre_call_hook( - user_api_key_dict=user_api_key_dict, data=data, call_type="completion" - ) - - ### ROUTE THE REQUESTs ### - router_model_names = llm_router.model_names if llm_router is not None else [] - # skip router if user passed their key - if "api_key" in data: - response = await litellm.atext_completion(**data) - elif ( - llm_router is not None and data["model"] in router_model_names - ): # model in router model list - response = await llm_router.atext_completion(**data) - elif ( - llm_router is not None - and llm_router.model_group_alias is not None - and data["model"] in llm_router.model_group_alias - ): # model set in model_group_alias - response = await llm_router.atext_completion(**data) - elif ( - llm_router is not None and data["model"] in llm_router.deployment_names - ): # model in router deployments, calling a specific deployment on the router - response = await llm_router.atext_completion( - **data, specific_deployment=True - ) - elif ( - llm_router is not None - and data["model"] not in router_model_names - and llm_router.default_deployment is not None - ): # model in router deployments, calling a specific deployment on the router - response = await llm_router.atext_completion(**data) - elif user_model is not None: # `litellm --model ` - response = await litellm.atext_completion(**data) - else: - raise HTTPException( - status_code=status.HTTP_400_BAD_REQUEST, - detail={ - "error": "Invalid model name passed in model=" - + data.get("model", "") - }, - ) - - if hasattr(response, "_hidden_params"): - model_id = response._hidden_params.get("model_id", None) or "" - original_response = ( - response._hidden_params.get("original_response", None) or "" - ) - else: - model_id = "" - original_response = "" - - verbose_proxy_logger.debug("final response: %s", response) - if ( - "stream" in data and data["stream"] == True - ): # use generate_responses to stream responses - custom_headers = { - "x-litellm-model-id": model_id, - } - selected_data_generator = 
select_data_generator( - response=response, user_api_key_dict=user_api_key_dict - ) - - return StreamingResponse( - selected_data_generator, - media_type="text/event-stream", - headers=custom_headers, - ) - - fastapi_response.headers["x-litellm-model-id"] = model_id - return response - except Exception as e: - data["litellm_status"] = "fail" # used for alerting - verbose_proxy_logger.debug("EXCEPTION RAISED IN PROXY MAIN.PY") - verbose_proxy_logger.debug( - "\033[1;31mAn error occurred: %s\n\n Debug this by setting `--debug`, e.g. `litellm --model gpt-3.5-turbo --debug`", - e, - ) - traceback.print_exc() - error_traceback = traceback.format_exc() - error_msg = f"{str(e)}" - raise ProxyException( - message=getattr(e, "message", error_msg), - type=getattr(e, "type", "None"), - param=getattr(e, "param", "None"), - code=getattr(e, "status_code", 500), - ) - - @router.post( "/v1/chat/completions", dependencies=[Depends(user_api_key_auth)], @@ -3810,7 +3644,7 @@ async def chat_completion( raise HTTPException( status_code=status.HTTP_400_BAD_REQUEST, detail={ - "error": "Invalid model name passed in model=" + "error": "chat_completion: Invalid model name passed in model=" + data.get("model", "") }, ) @@ -3884,6 +3718,172 @@ async def chat_completion( ) +@router.post( + "/v1/completions", dependencies=[Depends(user_api_key_auth)], tags=["completions"] +) +@router.post( + "/completions", dependencies=[Depends(user_api_key_auth)], tags=["completions"] +) +@router.post( + "/engines/{model:path}/completions", + dependencies=[Depends(user_api_key_auth)], + tags=["completions"], +) +@router.post( + "/openai/deployments/{model:path}/completions", + dependencies=[Depends(user_api_key_auth)], + tags=["completions"], +) +async def completion( + request: Request, + fastapi_response: Response, + model: Optional[str] = None, + user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth), +): + global user_temperature, user_request_timeout, user_max_tokens, user_api_base + try: + body = await request.body() + body_str = body.decode() + try: + data = ast.literal_eval(body_str) + except: + data = json.loads(body_str) + + data["user"] = data.get("user", user_api_key_dict.user_id) + data["model"] = ( + general_settings.get("completion_model", None) # server default + or user_model # model name passed via cli args + or model # for azure deployments + or data["model"] # default passed in http request + ) + if user_model: + data["model"] = user_model + if "metadata" not in data: + data["metadata"] = {} + data["metadata"]["user_api_key"] = user_api_key_dict.api_key + data["metadata"]["user_api_key_metadata"] = user_api_key_dict.metadata + data["metadata"]["user_api_key_alias"] = getattr( + user_api_key_dict, "key_alias", None + ) + data["metadata"]["user_api_key_user_id"] = user_api_key_dict.user_id + data["metadata"]["user_api_key_team_id"] = getattr( + user_api_key_dict, "team_id", None + ) + data["metadata"]["user_api_key_team_alias"] = getattr( + user_api_key_dict, "team_alias", None + ) + _headers = dict(request.headers) + _headers.pop( + "authorization", None + ) # do not store the original `sk-..` api key in the db + data["metadata"]["headers"] = _headers + data["metadata"]["endpoint"] = str(request.url) + + # override with user settings, these are params passed via cli + if user_temperature: + data["temperature"] = user_temperature + if user_request_timeout: + data["request_timeout"] = user_request_timeout + if user_max_tokens: + data["max_tokens"] = user_max_tokens + if user_api_base: + data["api_base"] = 
user_api_base + + ### MODEL ALIAS MAPPING ### + # check if model name in model alias map + # get the actual model name + if data["model"] in litellm.model_alias_map: + data["model"] = litellm.model_alias_map[data["model"]] + + ### CALL HOOKS ### - modify incoming data before calling the model + data = await proxy_logging_obj.pre_call_hook( + user_api_key_dict=user_api_key_dict, data=data, call_type="completion" + ) + + ### ROUTE THE REQUESTs ### + router_model_names = llm_router.model_names if llm_router is not None else [] + # skip router if user passed their key + if "api_key" in data: + response = await litellm.atext_completion(**data) + elif ( + llm_router is not None and data["model"] in router_model_names + ): # model in router model list + response = await llm_router.atext_completion(**data) + elif ( + llm_router is not None + and llm_router.model_group_alias is not None + and data["model"] in llm_router.model_group_alias + ): # model set in model_group_alias + response = await llm_router.atext_completion(**data) + elif ( + llm_router is not None and data["model"] in llm_router.deployment_names + ): # model in router deployments, calling a specific deployment on the router + response = await llm_router.atext_completion( + **data, specific_deployment=True + ) + elif ( + llm_router is not None + and data["model"] not in router_model_names + and llm_router.default_deployment is not None + ): # model in router deployments, calling a specific deployment on the router + response = await llm_router.atext_completion(**data) + elif user_model is not None: # `litellm --model ` + response = await litellm.atext_completion(**data) + else: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail={ + "error": "completion: Invalid model name passed in model=" + + data.get("model", "") + }, + ) + + if hasattr(response, "_hidden_params"): + model_id = response._hidden_params.get("model_id", None) or "" + original_response = ( + response._hidden_params.get("original_response", None) or "" + ) + else: + model_id = "" + original_response = "" + + verbose_proxy_logger.debug("final response: %s", response) + if ( + "stream" in data and data["stream"] == True + ): # use generate_responses to stream responses + custom_headers = { + "x-litellm-model-id": model_id, + } + selected_data_generator = select_data_generator( + response=response, user_api_key_dict=user_api_key_dict + ) + + return StreamingResponse( + selected_data_generator, + media_type="text/event-stream", + headers=custom_headers, + ) + + fastapi_response.headers["x-litellm-model-id"] = model_id + return response + except Exception as e: + data["litellm_status"] = "fail" # used for alerting + verbose_proxy_logger.debug("EXCEPTION RAISED IN PROXY MAIN.PY") + verbose_proxy_logger.debug( + "\033[1;31mAn error occurred: %s\n\n Debug this by setting `--debug`, e.g. 
`litellm --model gpt-3.5-turbo --debug`", + e, + ) + traceback.print_exc() + error_traceback = traceback.format_exc() + error_msg = f"{str(e)}" + raise ProxyException( + message=getattr(e, "message", error_msg), + type=getattr(e, "type", "None"), + param=getattr(e, "param", "None"), + code=getattr(e, "status_code", 500), + ) + + @router.post( "/v1/embeddings", dependencies=[Depends(user_api_key_auth)], @@ -4041,7 +4041,7 @@ async def embeddings( raise HTTPException( status_code=status.HTTP_400_BAD_REQUEST, detail={ - "error": "Invalid model name passed in model=" + "error": "embeddings: Invalid model name passed in model=" + data.get("model", "") }, ) @@ -4197,7 +4197,7 @@ async def image_generation( raise HTTPException( status_code=status.HTTP_400_BAD_REQUEST, detail={ - "error": "Invalid model name passed in model=" + "error": "image_generation: Invalid model name passed in model=" + data.get("model", "") }, ) @@ -4372,7 +4372,7 @@ async def audio_transcriptions( raise HTTPException( status_code=status.HTTP_400_BAD_REQUEST, detail={ - "error": "Invalid model name passed in model=" + "error": "audio_transcriptions: Invalid model name passed in model=" + data.get("model", "") }, ) @@ -4538,7 +4538,7 @@ async def moderations( raise HTTPException( status_code=status.HTTP_400_BAD_REQUEST, detail={ - "error": "Invalid model name passed in model=" + "error": "moderations: Invalid model name passed in model=" + data.get("model", "") }, ) diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py index b5db81b31..1048c6727 100644 --- a/litellm/proxy/utils.py +++ b/litellm/proxy/utils.py @@ -387,15 +387,21 @@ class ProxyLogging: """ ### ALERTING ### - if "llm_exceptions" not in self.alert_types: - return - asyncio.create_task( - self.alerting_handler( - message=f"LLM API call failed: {str(original_exception)}", - level="High", - alert_type="llm_exceptions", + if "llm_exceptions" in self.alert_types and not isinstance( + original_exception, HTTPException + ): + """ + Just alert on LLM API exceptions. 
Do not alert on user errors + + Related issue - https://github.com/BerriAI/litellm/issues/3395 + """ + asyncio.create_task( + self.alerting_handler( + message=f"LLM API call failed: {str(original_exception)}", + level="High", + alert_type="llm_exceptions", + ) ) - ) for callback in litellm.callbacks: try: @@ -679,8 +685,8 @@ class PrismaClient: @backoff.on_exception( backoff.expo, Exception, # base exception to catch for the backoff - max_tries=3, # maximum number of retries - max_time=10, # maximum total time to retry for + max_tries=1, # maximum number of retries + max_time=2, # maximum total time to retry for on_backoff=on_backoff, # specifying the function to call on backoff ) async def get_generic_data( @@ -718,7 +724,8 @@ class PrismaClient: import traceback error_msg = f"LiteLLM Prisma Client Exception get_generic_data: {str(e)}" - print_verbose(error_msg) + verbose_proxy_logger.error(error_msg) + error_msg = error_msg + "\nException Type: {}".format(type(e)) error_traceback = error_msg + "\n" + traceback.format_exc() end_time = time.time() _duration = end_time - start_time diff --git a/litellm/router.py b/litellm/router.py index 15fdbd4b8..7acf75e8e 100644 --- a/litellm/router.py +++ b/litellm/router.py @@ -2590,6 +2590,16 @@ class Router: return model return None + def get_model_info(self, id: str) -> Optional[dict]: + """ + For a given model id, return the model info + """ + for model in self.model_list: + if "model_info" in model and "id" in model["model_info"]: + if id == model["model_info"]["id"]: + return model + return None + def get_model_ids(self): ids = [] for model in self.model_list: @@ -2904,15 +2914,10 @@ class Router: m for m in self.model_list if m["litellm_params"]["model"] == model ] - verbose_router_logger.debug( - f"initial list of deployments: {healthy_deployments}" - ) + litellm.print_verbose(f"initial list of deployments: {healthy_deployments}") - verbose_router_logger.debug( - f"healthy deployments: length {len(healthy_deployments)} {healthy_deployments}" - ) if len(healthy_deployments) == 0: - raise ValueError(f"No healthy deployment available, passed model={model}") + raise ValueError(f"No healthy deployment available, passed model={model}. ") if litellm.model_alias_map and model in litellm.model_alias_map: model = litellm.model_alias_map[ model diff --git a/litellm/router_strategy/lowest_tpm_rpm_v2.py b/litellm/router_strategy/lowest_tpm_rpm_v2.py index 4bcf1eec1..f7a55d970 100644 --- a/litellm/router_strategy/lowest_tpm_rpm_v2.py +++ b/litellm/router_strategy/lowest_tpm_rpm_v2.py @@ -79,10 +79,12 @@ class LowestTPMLoggingHandler_v2(CustomLogger): model=deployment.get("litellm_params", {}).get("model"), response=httpx.Response( status_code=429, - content="{} rpm limit={}. current usage={}".format( + content="{} rpm limit={}. current usage={}. id={}, model_group={}. 
Get the model info by calling 'router.get_model_info(id)".format( RouterErrors.user_defined_ratelimit_error.value, deployment_rpm, local_result, + model_id, + deployment.get("model_name", ""), ), request=httpx.Request(method="tpm_rpm_limits", url="https://github.com/BerriAI/litellm"), # type: ignore ), diff --git a/litellm/tests/langfuse.log b/litellm/tests/langfuse.log index e69de29bb..f47590a29 100644 --- a/litellm/tests/langfuse.log +++ b/litellm/tests/langfuse.log @@ -0,0 +1,88 @@ +int() argument must be a string, a bytes-like object or a real number, not 'NoneType' +Traceback (most recent call last): + File "/opt/homebrew/lib/python3.11/site-packages/langfuse/client.py", line 778, in generation + "usage": _convert_usage_input(usage) if usage is not None else None, + ^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/opt/homebrew/lib/python3.11/site-packages/langfuse/utils.py", line 77, in _convert_usage_input + "totalCost": extract_by_priority(usage, ["totalCost", "total_cost"]), + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/opt/homebrew/lib/python3.11/site-packages/langfuse/utils.py", line 32, in extract_by_priority + return int(usage[key]) + ^^^^^^^^^^^^^^^ +TypeError: int() argument must be a string, a bytes-like object or a real number, not 'NoneType' +int() argument must be a string, a bytes-like object or a real number, not 'NoneType' +Traceback (most recent call last): + File "/opt/homebrew/lib/python3.11/site-packages/langfuse/client.py", line 778, in generation + "usage": _convert_usage_input(usage) if usage is not None else None, + ^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/opt/homebrew/lib/python3.11/site-packages/langfuse/utils.py", line 77, in _convert_usage_input + "totalCost": extract_by_priority(usage, ["totalCost", "total_cost"]), + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/opt/homebrew/lib/python3.11/site-packages/langfuse/utils.py", line 32, in extract_by_priority + return int(usage[key]) + ^^^^^^^^^^^^^^^ +TypeError: int() argument must be a string, a bytes-like object or a real number, not 'NoneType' +int() argument must be a string, a bytes-like object or a real number, not 'NoneType' +Traceback (most recent call last): + File "/opt/homebrew/lib/python3.11/site-packages/langfuse/client.py", line 778, in generation + "usage": _convert_usage_input(usage) if usage is not None else None, + ^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/opt/homebrew/lib/python3.11/site-packages/langfuse/utils.py", line 77, in _convert_usage_input + "totalCost": extract_by_priority(usage, ["totalCost", "total_cost"]), + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/opt/homebrew/lib/python3.11/site-packages/langfuse/utils.py", line 32, in extract_by_priority + return int(usage[key]) + ^^^^^^^^^^^^^^^ +TypeError: int() argument must be a string, a bytes-like object or a real number, not 'NoneType' +int() argument must be a string, a bytes-like object or a real number, not 'NoneType' +Traceback (most recent call last): + File "/opt/homebrew/lib/python3.11/site-packages/langfuse/client.py", line 778, in generation + "usage": _convert_usage_input(usage) if usage is not None else None, + ^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/opt/homebrew/lib/python3.11/site-packages/langfuse/utils.py", line 77, in _convert_usage_input + "totalCost": extract_by_priority(usage, ["totalCost", "total_cost"]), + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/opt/homebrew/lib/python3.11/site-packages/langfuse/utils.py", line 32, in extract_by_priority + return 
int(usage[key]) + ^^^^^^^^^^^^^^^ +TypeError: int() argument must be a string, a bytes-like object or a real number, not 'NoneType' +int() argument must be a string, a bytes-like object or a real number, not 'NoneType' +Traceback (most recent call last): + File "/opt/homebrew/lib/python3.11/site-packages/langfuse/client.py", line 778, in generation + "usage": _convert_usage_input(usage) if usage is not None else None, + ^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/opt/homebrew/lib/python3.11/site-packages/langfuse/utils.py", line 77, in _convert_usage_input + "totalCost": extract_by_priority(usage, ["totalCost", "total_cost"]), + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/opt/homebrew/lib/python3.11/site-packages/langfuse/utils.py", line 32, in extract_by_priority + return int(usage[key]) + ^^^^^^^^^^^^^^^ +TypeError: int() argument must be a string, a bytes-like object or a real number, not 'NoneType' +consumer is running... +Getting observations... None, None, None, None, litellm-test-98e1cc75-bef8-4280-a2b9-e08633b81acd, None, GENERATION +consumer is running... +Getting observations... None, None, None, None, litellm-test-532d2bc8-f8d6-42fd-8f78-416bae79925d, None, GENERATION +joining 1 consumer threads +consumer thread 0 joined +joining 1 consumer threads +consumer thread 0 joined +joining 1 consumer threads +consumer thread 0 joined +joining 1 consumer threads +consumer thread 0 joined +joining 1 consumer threads +consumer thread 0 joined +joining 1 consumer threads +consumer thread 0 joined +joining 1 consumer threads +consumer thread 0 joined +joining 1 consumer threads +consumer thread 0 joined +joining 1 consumer threads +consumer thread 0 joined +joining 1 consumer threads +consumer thread 0 joined +joining 1 consumer threads +consumer thread 0 joined +joining 1 consumer threads +consumer thread 0 joined diff --git a/litellm/tests/test_alangfuse.py b/litellm/tests/test_alangfuse.py index 6c3830935..29718d474 100644 --- a/litellm/tests/test_alangfuse.py +++ b/litellm/tests/test_alangfuse.py @@ -205,8 +205,6 @@ async def test_langfuse_logging_without_request_response(stream): assert _trace_data[0].output == { "role": "assistant", "content": "redacted-by-litellm", - "function_call": None, - "tool_calls": None, } except Exception as e: diff --git a/litellm/tests/test_alerting.py b/litellm/tests/test_alerting.py index ff3e8f8c7..a74e25910 100644 --- a/litellm/tests/test_alerting.py +++ b/litellm/tests/test_alerting.py @@ -3,7 +3,7 @@ import sys import os -import io, asyncio +import io, asyncio, httpx from datetime import datetime, timedelta # import logging @@ -17,6 +17,61 @@ import asyncio from unittest.mock import patch, MagicMock from litellm.caching import DualCache from litellm.integrations.slack_alerting import SlackAlerting +from litellm.proxy._types import UserAPIKeyAuth +from litellm.proxy.proxy_server import HTTPException + + +@pytest.mark.parametrize("exception_type", ["llm-exception", "non-llm-exception"]) +@pytest.mark.asyncio +async def test_slack_alerting_llm_exceptions(exception_type, monkeypatch): + """ + Test if non-llm exception -> No request + Test if llm exception -> Request triggered + """ + _pl = ProxyLogging(user_api_key_cache=DualCache()) + _pl.update_values( + alerting=["slack"], + alerting_threshold=100, + redis_cache=None, + alert_types=["llm_exceptions"], + ) + + async def mock_alerting_handler(message, level, alert_type): + global exception_type + + if exception_type == "llm-exception": + pass + elif exception_type == "non-llm-exception": + 
pytest.fail("Function should not have been called") + + monkeypatch.setattr(_pl, "alerting_handler", mock_alerting_handler) + + if exception_type == "llm-exception": + await _pl.post_call_failure_hook( + original_exception=litellm.APIError( + status_code=500, + message="This is a test exception", + llm_provider="openai", + model="gpt-3.5-turbo", + request=httpx.Request( + method="completion", url="https://github.com/BerriAI/litellm" + ), + ), + user_api_key_dict=UserAPIKeyAuth(), + ) + + await asyncio.sleep(2) + + elif exception_type == "non-llm-exception": + await _pl.post_call_failure_hook( + original_exception=HTTPException( + status_code=400, + detail={"error": "this is a test exception"}, + ), + user_api_key_dict=UserAPIKeyAuth(), + ) + + await asyncio.sleep(2) @pytest.mark.asyncio diff --git a/litellm/tests/test_proxy_exception_mapping.py b/litellm/tests/test_proxy_exception_mapping.py index 82957b658..0cc7b0d30 100644 --- a/litellm/tests/test_proxy_exception_mapping.py +++ b/litellm/tests/test_proxy_exception_mapping.py @@ -169,7 +169,7 @@ def test_chat_completion_exception_any_model(client): ) assert isinstance(openai_exception, openai.BadRequestError) _error_message = openai_exception.message - assert "Invalid model name passed in model=Lite-GPT-12" in str(_error_message) + assert "chat_completion: Invalid model name passed in model=Lite-GPT-12" in str(_error_message) except Exception as e: pytest.fail(f"LiteLLM Proxy test failed. Exception {str(e)}") @@ -197,7 +197,7 @@ def test_embedding_exception_any_model(client): print("Exception raised=", openai_exception) assert isinstance(openai_exception, openai.BadRequestError) _error_message = openai_exception.message - assert "Invalid model name passed in model=Lite-GPT-12" in str(_error_message) + assert "embeddings: Invalid model name passed in model=Lite-GPT-12" in str(_error_message) except Exception as e: pytest.fail(f"LiteLLM Proxy test failed. Exception {str(e)}") diff --git a/litellm/tests/test_proxy_server.py b/litellm/tests/test_proxy_server.py index 052646db8..43a070556 100644 --- a/litellm/tests/test_proxy_server.py +++ b/litellm/tests/test_proxy_server.py @@ -1,5 +1,6 @@ import sys, os import traceback +from unittest import mock from dotenv import load_dotenv load_dotenv() @@ -35,6 +36,77 @@ token = "sk-1234" headers = {"Authorization": f"Bearer {token}"} +example_completion_result = { + "choices": [ + { + "message": { + "content": "Whispers of the wind carry dreams to me.", + "role": "assistant" + } + } + ], +} +example_embedding_result = { + "object": "list", + "data": [ + { + "object": "embedding", + "index": 0, + "embedding": [ + -0.006929283495992422, + -0.005336422007530928, + -4.547132266452536e-05, + -0.024047505110502243, + -0.006929283495992422, + -0.005336422007530928, + -4.547132266452536e-05, + -0.024047505110502243, + -0.006929283495992422, + -0.005336422007530928, + -4.547132266452536e-05, + -0.024047505110502243, + ], + } + ], + "model": "text-embedding-3-small", + "usage": { + "prompt_tokens": 5, + "total_tokens": 5 + } +} +example_image_generation_result = { + "created": 1589478378, + "data": [ + { + "url": "https://..." + }, + { + "url": "https://..." 
+ } + ] +} + + +def mock_patch_acompletion(): + return mock.patch( + "litellm.proxy.proxy_server.llm_router.acompletion", + return_value=example_completion_result, + ) + + +def mock_patch_aembedding(): + return mock.patch( + "litellm.proxy.proxy_server.llm_router.aembedding", + return_value=example_embedding_result, + ) + + +def mock_patch_aimage_generation(): + return mock.patch( + "litellm.proxy.proxy_server.llm_router.aimage_generation", + return_value=example_image_generation_result, + ) + @pytest.fixture(scope="function") def client_no_auth(): @@ -52,7 +124,8 @@ def client_no_auth(): return TestClient(app) -def test_chat_completion(client_no_auth): +@mock_patch_acompletion() +def test_chat_completion(mock_acompletion, client_no_auth): global headers try: # Your test data @@ -66,6 +139,19 @@ def test_chat_completion(client_no_auth): print("testing proxy server with chat completions") response = client_no_auth.post("/v1/chat/completions", json=test_data) + mock_acompletion.assert_called_once_with( + model="gpt-3.5-turbo", + messages=[ + {"role": "user", "content": "hi"}, + ], + max_tokens=10, + litellm_call_id=mock.ANY, + litellm_logging_obj=mock.ANY, + request_timeout=mock.ANY, + specific_deployment=True, + metadata=mock.ANY, + proxy_server_request=mock.ANY, + ) print(f"response - {response.text}") assert response.status_code == 200 result = response.json() @@ -77,7 +163,8 @@ def test_chat_completion(client_no_auth): # Run the test -def test_chat_completion_azure(client_no_auth): +@mock_patch_acompletion() +def test_chat_completion_azure(mock_acompletion, client_no_auth): global headers try: # Your test data @@ -92,6 +179,19 @@ def test_chat_completion_azure(client_no_auth): print("testing proxy server with Azure Request /chat/completions") response = client_no_auth.post("/v1/chat/completions", json=test_data) + mock_acompletion.assert_called_once_with( + model="azure/chatgpt-v-2", + messages=[ + {"role": "user", "content": "write 1 sentence poem"}, + ], + max_tokens=10, + litellm_call_id=mock.ANY, + litellm_logging_obj=mock.ANY, + request_timeout=mock.ANY, + specific_deployment=True, + metadata=mock.ANY, + proxy_server_request=mock.ANY, + ) assert response.status_code == 200 result = response.json() print(f"Received response: {result}") @@ -104,8 +204,51 @@ def test_chat_completion_azure(client_no_auth): # test_chat_completion_azure() +@mock_patch_acompletion() +def test_openai_deployments_model_chat_completions_azure(mock_acompletion, client_no_auth): + global headers + try: + # Your test data + test_data = { + "model": "azure/chatgpt-v-2", + "messages": [ + {"role": "user", "content": "write 1 sentence poem"}, + ], + "max_tokens": 10, + } + + url = "/openai/deployments/azure/chatgpt-v-2/chat/completions" + print(f"testing proxy server with Azure Request {url}") + response = client_no_auth.post(url, json=test_data) + + mock_acompletion.assert_called_once_with( + model="azure/chatgpt-v-2", + messages=[ + {"role": "user", "content": "write 1 sentence poem"}, + ], + max_tokens=10, + litellm_call_id=mock.ANY, + litellm_logging_obj=mock.ANY, + request_timeout=mock.ANY, + specific_deployment=True, + metadata=mock.ANY, + proxy_server_request=mock.ANY, + ) + assert response.status_code == 200 + result = response.json() + print(f"Received response: {result}") + assert len(result["choices"][0]["message"]["content"]) > 0 + except Exception as e: + pytest.fail(f"LiteLLM Proxy test failed. 
Exception - {str(e)}") + + +# Run the test +# test_openai_deployments_model_chat_completions_azure() + + ### EMBEDDING -def test_embedding(client_no_auth): +@mock_patch_aembedding() +def test_embedding(mock_aembedding, client_no_auth): global headers from litellm.proxy.proxy_server import user_custom_auth @@ -117,6 +260,13 @@ def test_embedding(client_no_auth): response = client_no_auth.post("/v1/embeddings", json=test_data) + mock_aembedding.assert_called_once_with( + model="azure/azure-embedding-model", + input=["good morning from litellm"], + specific_deployment=True, + metadata=mock.ANY, + proxy_server_request=mock.ANY, + ) assert response.status_code == 200 result = response.json() print(len(result["data"][0]["embedding"])) @@ -125,7 +275,8 @@ def test_embedding(client_no_auth): pytest.fail(f"LiteLLM Proxy test failed. Exception - {str(e)}") -def test_bedrock_embedding(client_no_auth): +@mock_patch_aembedding() +def test_bedrock_embedding(mock_aembedding, client_no_auth): global headers from litellm.proxy.proxy_server import user_custom_auth @@ -137,6 +288,12 @@ def test_bedrock_embedding(client_no_auth): response = client_no_auth.post("/v1/embeddings", json=test_data) + mock_aembedding.assert_called_once_with( + model="amazon-embeddings", + input=["good morning from litellm"], + metadata=mock.ANY, + proxy_server_request=mock.ANY, + ) assert response.status_code == 200 result = response.json() print(len(result["data"][0]["embedding"])) @@ -171,7 +328,8 @@ def test_sagemaker_embedding(client_no_auth): #### IMAGE GENERATION -def test_img_gen(client_no_auth): +@mock_patch_aimage_generation() +def test_img_gen(mock_aimage_generation, client_no_auth): global headers from litellm.proxy.proxy_server import user_custom_auth @@ -185,6 +343,14 @@ def test_img_gen(client_no_auth): response = client_no_auth.post("/v1/images/generations", json=test_data) + mock_aimage_generation.assert_called_once_with( + model='dall-e-3', + prompt='A cute baby sea otter', + n=1, + size='1024x1024', + metadata=mock.ANY, + proxy_server_request=mock.ANY, + ) assert response.status_code == 200 result = response.json() print(len(result["data"][0]["url"])) @@ -249,7 +415,8 @@ class MyCustomHandler(CustomLogger): customHandler = MyCustomHandler() -def test_chat_completion_optional_params(client_no_auth): +@mock_patch_acompletion() +def test_chat_completion_optional_params(mock_acompletion, client_no_auth): # [PROXY: PROD TEST] - DO NOT DELETE # This tests if all the /chat/completion params are passed to litellm try: @@ -267,6 +434,20 @@ def test_chat_completion_optional_params(client_no_auth): litellm.callbacks = [customHandler] print("testing proxy server: optional params") response = client_no_auth.post("/v1/chat/completions", json=test_data) + mock_acompletion.assert_called_once_with( + model="gpt-3.5-turbo", + messages=[ + {"role": "user", "content": "hi"}, + ], + max_tokens=10, + user="proxy-user", + litellm_call_id=mock.ANY, + litellm_logging_obj=mock.ANY, + request_timeout=mock.ANY, + specific_deployment=True, + metadata=mock.ANY, + proxy_server_request=mock.ANY, + ) assert response.status_code == 200 result = response.json() print(f"Received response: {result}") diff --git a/litellm/tests/test_token_counter.py b/litellm/tests/test_token_counter.py index af0db487e..4d759d4cf 100644 --- a/litellm/tests/test_token_counter.py +++ b/litellm/tests/test_token_counter.py @@ -9,7 +9,7 @@ sys.path.insert( 0, os.path.abspath("../..") ) # Adds the parent directory to the system path import time -from litellm import 
token_counter, encode, decode +from litellm import token_counter, create_pretrained_tokenizer, encode, decode def test_token_counter_normal_plus_function_calling(): @@ -69,15 +69,23 @@ def test_tokenizers(): model="meta-llama/Llama-2-7b-chat", text=sample_text ) + # llama3 tokenizer (also testing custom tokenizer) + llama3_tokens_1 = token_counter(model="meta-llama/llama-3-70b-instruct", text=sample_text) + + llama3_tokenizer = create_pretrained_tokenizer("Xenova/llama-3-tokenizer") + llama3_tokens_2 = token_counter(custom_tokenizer=llama3_tokenizer, text=sample_text) + print( - f"openai tokens: {openai_tokens}; claude tokens: {claude_tokens}; cohere tokens: {cohere_tokens}; llama2 tokens: {llama2_tokens}" + f"openai tokens: {openai_tokens}; claude tokens: {claude_tokens}; cohere tokens: {cohere_tokens}; llama2 tokens: {llama2_tokens}; llama3 tokens: {llama3_tokens_1}" ) # assert that all token values are different assert ( - openai_tokens != cohere_tokens != llama2_tokens + openai_tokens != cohere_tokens != llama2_tokens != llama3_tokens_1 ), "Token values are not different." + assert llama3_tokens_1 == llama3_tokens_2, "Custom tokenizer is not being used! It has been configured to use the same tokenizer as the built in llama3 tokenizer and the results should be the same." + print("test tokenizer: It worked!") except Exception as e: pytest.fail(f"An exception occured: {e}") diff --git a/litellm/tests/test_utils.py b/litellm/tests/test_utils.py index 44fb1607c..57b93df9c 100644 --- a/litellm/tests/test_utils.py +++ b/litellm/tests/test_utils.py @@ -20,6 +20,8 @@ from litellm.utils import ( validate_environment, function_to_dict, token_counter, + create_pretrained_tokenizer, + create_tokenizer, ) # Assuming your trim_messages, shorten_message_to_fit_limit, and get_token_count functions are all in a module named 'message_utils' diff --git a/litellm/utils.py b/litellm/utils.py index c4117bdb3..ec296e9dc 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -378,16 +378,13 @@ class Message(OpenAIObject): super(Message, self).__init__(**params) self.content = content self.role = role - self.tool_calls = None - self.function_call = None - if function_call is not None: self.function_call = FunctionCall(**function_call) if tool_calls is not None: - self.tool_calls = [ - ChatCompletionMessageToolCall(**tool_call) for tool_call in tool_calls - ] + self.tool_calls = [] + for tool_call in tool_calls: + self.tool_calls.append(ChatCompletionMessageToolCall(**tool_call)) if logprobs is not None: self._logprobs = ChoiceLogprobs(**logprobs) @@ -413,8 +410,6 @@ class Message(OpenAIObject): class Delta(OpenAIObject): - tool_calls: Optional[List[ChatCompletionDeltaToolCall]] = None - def __init__( self, content=None, @@ -1700,10 +1695,17 @@ class Logging: print_verbose("reaches langfuse for streaming logging!") result = kwargs["complete_streaming_response"] if langFuseLogger is None or ( - self.langfuse_public_key != langFuseLogger.public_key - and self.langfuse_secret != langFuseLogger.secret_key + ( + self.langfuse_public_key is not None + and self.langfuse_public_key + != langFuseLogger.public_key + ) + and ( + self.langfuse_public_key is not None + and self.langfuse_public_key + != langFuseLogger.public_key + ) ): - print_verbose("Instantiates langfuse client") langFuseLogger = LangFuseLogger( langfuse_public_key=self.langfuse_public_key, langfuse_secret=self.langfuse_secret, @@ -3773,29 +3775,34 @@ def _select_tokenizer(model: str): elif "llama-2" in model.lower() or "replicate" in model.lower(): 
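        # llama-2 (and Replicate-hosted) models reuse the Hugging Face llama tokenizer below;
        # the new `elif "llama-3"` branch added in this hunk does the same with
        # "Xenova/llama-3-tokenizer", and any model that matches no branch falls
        # through to the tiktoken default at the end of _select_tokenizer.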
tokenizer = Tokenizer.from_pretrained("hf-internal-testing/llama-tokenizer") return {"type": "huggingface_tokenizer", "tokenizer": tokenizer} + # llama3 + elif "llama-3" in model.lower(): + tokenizer = Tokenizer.from_pretrained("Xenova/llama-3-tokenizer") + return {"type": "huggingface_tokenizer", "tokenizer": tokenizer} # default - tiktoken else: return {"type": "openai_tokenizer", "tokenizer": encoding} -def encode(model: str, text: str): +def encode(model="", text="", custom_tokenizer: Optional[dict] = None): """ Encodes the given text using the specified model. Args: model (str): The name of the model to use for tokenization. + custom_tokenizer (Optional[dict]): A custom tokenizer created with the `create_pretrained_tokenizer` or `create_tokenizer` method. Must be a dictionary with a string value for `type` and Tokenizer for `tokenizer`. Default is None. text (str): The text to be encoded. Returns: enc: The encoded text. """ - tokenizer_json = _select_tokenizer(model=model) + tokenizer_json = custom_tokenizer or _select_tokenizer(model=model) enc = tokenizer_json["tokenizer"].encode(text) return enc -def decode(model: str, tokens: List[int]): - tokenizer_json = _select_tokenizer(model=model) +def decode(model="", tokens: List[int] = [], custom_tokenizer: Optional[dict] = None): + tokenizer_json = custom_tokenizer or _select_tokenizer(model=model) dec = tokenizer_json["tokenizer"].decode(tokens) return dec @@ -3967,10 +3974,47 @@ def calculage_img_tokens( tile_tokens = (base_tokens * 2) * tiles_needed_high_res total_tokens = base_tokens + tile_tokens return total_tokens + + +def create_pretrained_tokenizer( + identifier: str, + revision="main", + auth_token: Optional[str] = None +): + """ + Creates a tokenizer from an existing file on a HuggingFace repository to be used with `token_counter`. + + Args: + identifier (str): The identifier of a Model on the Hugging Face Hub, that contains a tokenizer.json file + revision (str, defaults to main): A branch or commit id + auth_token (str, optional, defaults to None): An optional auth token used to access private repositories on the Hugging Face Hub + + Returns: + dict: A dictionary with the tokenizer and its type. + """ + + tokenizer = Tokenizer.from_pretrained(identifier, revision=revision, auth_token=auth_token) + return {"type": "huggingface_tokenizer", "tokenizer": tokenizer} + + +def create_tokenizer(json: str): + """ + Creates a tokenizer from a valid JSON string for use with `token_counter`. + + Args: + json (str): A valid JSON string representing a previously serialized tokenizer + + Returns: + dict: A dictionary with the tokenizer and its type. + """ + + tokenizer = Tokenizer.from_str(json) + return {"type": "huggingface_tokenizer", "tokenizer": tokenizer} def token_counter( model="", + custom_tokenizer: Optional[dict] = None, text: Optional[Union[str, List[str]]] = None, messages: Optional[List] = None, count_response_tokens: Optional[bool] = False, @@ -3980,13 +4024,14 @@ def token_counter( Args: model (str): The name of the model to use for tokenization. Default is an empty string. + custom_tokenizer (Optional[dict]): A custom tokenizer created with the `create_pretrained_tokenizer` or `create_tokenizer` method. Must be a dictionary with a string value for `type` and Tokenizer for `tokenizer`. Default is None. text (str): The raw text string to be passed to the model. Default is None. messages (Optional[List[Dict[str, str]]]): Alternative to passing in text. 
A list of dictionaries representing messages with "role" and "content" keys. Default is None. Returns: int: The number of tokens in the text. """ - # use tiktoken, anthropic, cohere or llama2's tokenizer depending on the model + # use tiktoken, anthropic, cohere, llama2, or llama3's tokenizer depending on the model is_tool_call = False num_tokens = 0 if text == None: @@ -4028,8 +4073,8 @@ def token_counter( elif isinstance(text, str): count_response_tokens = True # user just trying to count tokens for a text. don't add the chat_ml +3 tokens to this - if model is not None: - tokenizer_json = _select_tokenizer(model=model) + if model is not None or custom_tokenizer is not None: + tokenizer_json = custom_tokenizer or _select_tokenizer(model=model) if tokenizer_json["type"] == "huggingface_tokenizer": print_verbose( f"Token Counter - using hugging face token counter, for model={model}" @@ -6768,7 +6813,7 @@ def validate_environment(model: Optional[str] = None) -> dict: keys_in_environment = True else: missing_keys.append("NLP_CLOUD_API_KEY") - elif custom_llm_provider == "bedrock": + elif custom_llm_provider == "bedrock" or custom_llm_provider == "sagemaker": if ( "AWS_ACCESS_KEY_ID" in os.environ and "AWS_SECRET_ACCESS_KEY" in os.environ @@ -6782,11 +6827,72 @@ def validate_environment(model: Optional[str] = None) -> dict: keys_in_environment = True else: missing_keys.append("OLLAMA_API_BASE") + elif custom_llm_provider == "anyscale": + if "ANYSCALE_API_KEY" in os.environ: + keys_in_environment = True + else: + missing_keys.append("ANYSCALE_API_KEY") + elif custom_llm_provider == "deepinfra": + if "DEEPINFRA_API_KEY" in os.environ: + keys_in_environment = True + else: + missing_keys.append("DEEPINFRA_API_KEY") + elif custom_llm_provider == "gemini": + if "GEMINI_API_KEY" in os.environ: + keys_in_environment = True + else: + missing_keys.append("GEMINI_API_KEY") + elif custom_llm_provider == "groq": + if "GROQ_API_KEY" in os.environ: + keys_in_environment = True + else: + missing_keys.append("GROQ_API_KEY") + elif custom_llm_provider == "mistral": + if "MISTRAL_API_KEY" in os.environ: + keys_in_environment = True + else: + missing_keys.append("MISTRAL_API_KEY") + elif custom_llm_provider == "palm": + if "PALM_API_KEY" in os.environ: + keys_in_environment = True + else: + missing_keys.append("PALM_API_KEY") + elif custom_llm_provider == "perplexity": + if "PERPLEXITYAI_API_KEY" in os.environ: + keys_in_environment = True + else: + missing_keys.append("PERPLEXITYAI_API_KEY") + elif custom_llm_provider == "voyage": + if "VOYAGE_API_KEY" in os.environ: + keys_in_environment = True + else: + missing_keys.append("VOYAGE_API_KEY") + elif custom_llm_provider == "fireworks_ai": + if ( + "FIREWORKS_AI_API_KEY" in os.environ + or "FIREWORKS_API_KEY" in os.environ + or "FIREWORKSAI_API_KEY" in os.environ + or "FIREWORKS_AI_TOKEN" in os.environ + ): + keys_in_environment = True + else: + missing_keys.append("FIREWORKS_AI_API_KEY") + elif custom_llm_provider == "cloudflare": + if "CLOUDFLARE_API_KEY" in os.environ and ( + "CLOUDFLARE_ACCOUNT_ID" in os.environ + or "CLOUDFLARE_API_BASE" in os.environ + ): + keys_in_environment = True + else: + missing_keys.append("CLOUDFLARE_API_KEY") + missing_keys.append("CLOUDFLARE_API_BASE") else: ## openai - chatcompletion + text completion if ( model in litellm.open_ai_chat_completion_models or model in litellm.open_ai_text_completion_models + or model in litellm.open_ai_embedding_models + or model in litellm.openai_image_generation_models ): if "OPENAI_API_KEY" in 
os.environ: keys_in_environment = True @@ -6817,7 +6923,11 @@ def validate_environment(model: Optional[str] = None) -> dict: else: missing_keys.append("OPENROUTER_API_KEY") ## vertex - text + chat models - elif model in litellm.vertex_chat_models or model in litellm.vertex_text_models: + elif ( + model in litellm.vertex_chat_models + or model in litellm.vertex_text_models + or model in litellm.models_by_provider["vertex_ai"] + ): if "VERTEXAI_PROJECT" in os.environ and "VERTEXAI_LOCATION" in os.environ: keys_in_environment = True else: diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json index ce6f9b800..7fcd425bb 100644 --- a/model_prices_and_context_window.json +++ b/model_prices_and_context_window.json @@ -338,6 +338,18 @@ "output_cost_per_second": 0.0001, "litellm_provider": "azure" }, + "azure/gpt-4-turbo-2024-04-09": { + "max_tokens": 4096, + "max_input_tokens": 128000, + "max_output_tokens": 4096, + "input_cost_per_token": 0.00001, + "output_cost_per_token": 0.00003, + "litellm_provider": "azure", + "mode": "chat", + "supports_function_calling": true, + "supports_parallel_function_calling": true, + "supports_vision": true + }, "azure/gpt-4-0125-preview": { "max_tokens": 4096, "max_input_tokens": 128000, @@ -813,6 +825,7 @@ "litellm_provider": "anthropic", "mode": "chat", "supports_function_calling": true, + "supports_vision": true, "tool_use_system_prompt_tokens": 264 }, "claude-3-opus-20240229": { @@ -824,6 +837,7 @@ "litellm_provider": "anthropic", "mode": "chat", "supports_function_calling": true, + "supports_vision": true, "tool_use_system_prompt_tokens": 395 }, "claude-3-sonnet-20240229": { @@ -835,6 +849,7 @@ "litellm_provider": "anthropic", "mode": "chat", "supports_function_calling": true, + "supports_vision": true, "tool_use_system_prompt_tokens": 159 }, "text-bison": { @@ -1142,7 +1157,8 @@ "output_cost_per_token": 0.000015, "litellm_provider": "vertex_ai-anthropic_models", "mode": "chat", - "supports_function_calling": true + "supports_function_calling": true, + "supports_vision": true }, "vertex_ai/claude-3-haiku@20240307": { "max_tokens": 4096, @@ -1152,7 +1168,8 @@ "output_cost_per_token": 0.00000125, "litellm_provider": "vertex_ai-anthropic_models", "mode": "chat", - "supports_function_calling": true + "supports_function_calling": true, + "supports_vision": true }, "vertex_ai/claude-3-opus@20240229": { "max_tokens": 4096, @@ -1162,7 +1179,8 @@ "output_cost_per_token": 0.0000075, "litellm_provider": "vertex_ai-anthropic_models", "mode": "chat", - "supports_function_calling": true + "supports_function_calling": true, + "supports_vision": true }, "textembedding-gecko": { "max_tokens": 3072, @@ -1581,6 +1599,7 @@ "litellm_provider": "openrouter", "mode": "chat", "supports_function_calling": true, + "supports_vision": true, "tool_use_system_prompt_tokens": 395 }, "openrouter/google/palm-2-chat-bison": { @@ -1929,7 +1948,8 @@ "output_cost_per_token": 0.000015, "litellm_provider": "bedrock", "mode": "chat", - "supports_function_calling": true + "supports_function_calling": true, + "supports_vision": true }, "anthropic.claude-3-haiku-20240307-v1:0": { "max_tokens": 4096, @@ -1939,7 +1959,8 @@ "output_cost_per_token": 0.00000125, "litellm_provider": "bedrock", "mode": "chat", - "supports_function_calling": true + "supports_function_calling": true, + "supports_vision": true }, "anthropic.claude-3-opus-20240229-v1:0": { "max_tokens": 4096, @@ -1949,7 +1970,8 @@ "output_cost_per_token": 0.000075, "litellm_provider": "bedrock", "mode": "chat", - 
"supports_function_calling": true + "supports_function_calling": true, + "supports_vision": true }, "anthropic.claude-v1": { "max_tokens": 8191, diff --git a/poetry.lock b/poetry.lock index 817a7e968..d699425e6 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1153,13 +1153,13 @@ typing = ["types-PyYAML", "types-requests", "types-simplejson", "types-toml", "t [[package]] name = "idna" -version = "3.6" +version = "3.7" description = "Internationalized Domain Names in Applications (IDNA)" optional = false python-versions = ">=3.5" files = [ - {file = "idna-3.6-py3-none-any.whl", hash = "sha256:c05567e9c24a6b9faaa835c4821bad0590fbb9d5779e7caa6e1cc4978e7eb24f"}, - {file = "idna-3.6.tar.gz", hash = "sha256:9ecdbbd083b06798ae1e86adcbfe8ab1479cf864e4ee30fe4e46a003d12491ca"}, + {file = "idna-3.7-py3-none-any.whl", hash = "sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0"}, + {file = "idna-3.7.tar.gz", hash = "sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc"}, ] [[package]]