diff --git a/docs/my-website/docs/assistants.md b/docs/my-website/docs/assistants.md index 2380fe5c6..1af780500 100644 --- a/docs/my-website/docs/assistants.md +++ b/docs/my-website/docs/assistants.md @@ -150,7 +150,7 @@ $ litellm --config /path/to/config.yaml ```bash curl "http://0.0.0.0:4000/v1/assistants?order=desc&limit=20" \ -H "Content-Type: application/json" \ - -H "Authorization: Bearer sk-1234" \ + -H "Authorization: Bearer sk-1234" ``` **Create a Thread** @@ -162,6 +162,14 @@ curl http://0.0.0.0:4000/v1/threads \ -d '' ``` +**Get a Thread** + +```bash +curl http://0.0.0.0:4000/v1/threads/{thread_id} \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-1234" +``` + **Add Messages to the Thread** ```bash diff --git a/docs/my-website/docs/caching/all_caches.md b/docs/my-website/docs/caching/all_caches.md index eb309f9b8..1b8bbd8e0 100644 --- a/docs/my-website/docs/caching/all_caches.md +++ b/docs/my-website/docs/caching/all_caches.md @@ -212,6 +212,94 @@ If you run the code two times, response1 will use the cache from the first run t + + +## Switch Cache On / Off Per LiteLLM Call + +LiteLLM supports 4 cache-controls: + +- `no-cache`: *Optional(bool)* When `True`, Will not return a cached response, but instead call the actual endpoint. +- `no-store`: *Optional(bool)* When `True`, Will not cache the response. +- `ttl`: *Optional(int)* - Will cache the response for the user-defined amount of time (in seconds). +- `s-maxage`: *Optional(int)* Will only accept cached responses that are within user-defined range (in seconds). + +[Let us know if you need more](https://github.com/BerriAI/litellm/issues/1218) + + + +Example usage `no-cache` - When `True`, Will not return a cached response + +```python +response = litellm.completion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": "hello who are you" + } + ], + cache={"no-cache": True}, + ) +``` + + + + + +Example usage `no-store` - When `True`, Will not cache the response. + +```python +response = litellm.completion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": "hello who are you" + } + ], + cache={"no-store": True}, + ) +``` + + + + +Example usage `ttl` - cache the response for 10 seconds + +```python +response = litellm.completion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": "hello who are you" + } + ], + cache={"ttl": 10}, + ) +``` + + + + +Example usage `s-maxage` - Will only accept cached responses for 60 seconds + +```python +response = litellm.completion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": "hello who are you" + } + ], + cache={"s-maxage": 60}, + ) +``` + + + + ## Cache Context Manager - Enable, Disable, Update Cache diff --git a/docs/my-website/docs/projects/llmcord.py (Discord LLM Chatbot).md b/docs/my-website/docs/projects/llm_cord.md similarity index 93% rename from docs/my-website/docs/projects/llmcord.py (Discord LLM Chatbot).md rename to docs/my-website/docs/projects/llm_cord.md index f8acb9383..6a28d5c88 100644 --- a/docs/my-website/docs/projects/llmcord.py (Discord LLM Chatbot).md +++ b/docs/my-website/docs/projects/llm_cord.md @@ -1,3 +1,5 @@ +# llmcord.py + llmcord.py lets you and your friends chat with LLMs directly in your Discord server. It works with practically any LLM, remote or locally hosted. 
Github: https://github.com/jakobdylanc/discord-llm-chatbot diff --git a/docs/my-website/docs/proxy/cost_tracking.md b/docs/my-website/docs/proxy/cost_tracking.md index de1a63a4c..b63fab106 100644 --- a/docs/my-website/docs/proxy/cost_tracking.md +++ b/docs/my-website/docs/proxy/cost_tracking.md @@ -138,14 +138,22 @@ Navigate to the Usage Tab on the LiteLLM UI (found on https://your-proxy-endpoin ## API Endpoints to get Spend -#### Getting Spend Reports - To Charge Other Teams, API Keys +#### Getting Spend Reports - To Charge Other Teams, Customers -Use the `/global/spend/report` endpoint to get daily spend per team, with a breakdown of spend per API Key, Model +Use the `/global/spend/report` endpoint to get daily spend report per +- team +- customer [this is `user` passed to `/chat/completions` request](#how-to-track-spend-with-litellm) + + + + ##### Example Request +👉 Key Change: Specify `group_by=team` + ```shell -curl -X GET 'http://localhost:4000/global/spend/report?start_date=2024-04-01&end_date=2024-06-30' \ +curl -X GET 'http://localhost:4000/global/spend/report?start_date=2024-04-01&end_date=2024-06-30&group_by=team' \ -H 'Authorization: Bearer sk-1234' ``` @@ -254,6 +262,69 @@ Output from script ``` + + + + + + + + + +##### Example Request + +👉 Key Change: Specify `group_by=customer` + + +```shell +curl -X GET 'http://localhost:4000/global/spend/report?start_date=2024-04-01&end_date=2024-06-30&group_by=customer' \ + -H 'Authorization: Bearer sk-1234' +``` + +##### Example Response + + +```shell +[ + { + "group_by_day": "2024-04-30T00:00:00+00:00", + "customers": [ + { + "customer": "palantir", + "total_spend": 0.0015265, + "metadata": [ # see the spend by unique(key + model) + { + "model": "gpt-4", + "spend": 0.00123, + "total_tokens": 28, + "api_key": "88dc28.." # the hashed api key + }, + { + "model": "gpt-4", + "spend": 0.00123, + "total_tokens": 28, + "api_key": "a73dc2.." # the hashed api key + }, + { + "model": "chatgpt-v-2", + "spend": 0.000214, + "total_tokens": 122, + "api_key": "898c28.." # the hashed api key + }, + { + "model": "gpt-3.5-turbo", + "spend": 0.0000825, + "total_tokens": 85, + "api_key": "84dc28.." # the hashed api key + } + ] + } + ] + } +] +``` + + diff --git a/docs/my-website/docs/proxy/debugging.md b/docs/my-website/docs/proxy/debugging.md index b9f2ba8da..571a97c0e 100644 --- a/docs/my-website/docs/proxy/debugging.md +++ b/docs/my-website/docs/proxy/debugging.md @@ -42,6 +42,14 @@ Set `JSON_LOGS="True"` in your env: ```bash export JSON_LOGS="True" ``` +**OR** + +Set `json_logs: true` in your yaml: + +```yaml +litellm_settings: + json_logs: true +``` Start proxy @@ -49,4 +57,35 @@ Start proxy $ litellm ``` -The proxy will now all logs in json format. \ No newline at end of file +The proxy will now all logs in json format. + +## Control Log Output + +Turn off fastapi's default 'INFO' logs + +1. Turn on 'json logs' +```yaml +litellm_settings: + json_logs: true +``` + +2. Set `LITELLM_LOG` to 'ERROR' + +Only get logs if an error occurs. + +```bash +LITELLM_LOG="ERROR" +``` + +3. Start proxy + + +```bash +$ litellm +``` + +Expected Output: + +```bash +# no info statements +``` \ No newline at end of file diff --git a/docs/my-website/docs/proxy/multiple_admins.md b/docs/my-website/docs/proxy/multiple_admins.md index 388df0d60..376ff0174 100644 --- a/docs/my-website/docs/proxy/multiple_admins.md +++ b/docs/my-website/docs/proxy/multiple_admins.md @@ -2,11 +2,21 @@ Call management endpoints on behalf of a user. 
(Useful when connecting proxy to your development platform). -:::info -Requires Enterprise License for usage. -::: -## Set `LiteLLM-Changed-By` in request headers +:::tip + +Requires Enterprise License, Get in touch with us [here](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat) + +::: + +## 1. Switch on audit Logs +Add `store_audit_logs` to your litellm config.yaml and then start the proxy +```shell +litellm_settings: + store_audit_logs: true +``` + +## 2. Set `LiteLLM-Changed-By` in request headers Set the 'user_id' in request headers, when calling a management endpoint. [View Full List](https://litellm-api.up.railway.app/#/team%20management). @@ -26,7 +36,7 @@ curl -X POST 'http://0.0.0.0:4000/team/update' \ }' ``` -## Emitted Audit Log +## 3. Emitted Audit Log ```bash { diff --git a/docs/my-website/docs/proxy/prod.md b/docs/my-website/docs/proxy/prod.md index 35c8c575b..587164fe6 100644 --- a/docs/my-website/docs/proxy/prod.md +++ b/docs/my-website/docs/proxy/prod.md @@ -21,6 +21,7 @@ general_settings: litellm_settings: set_verbose: False # Switch off Debug Logging, ensure your logs do not have any debugging on + json_logs: true # Get debug logs in json format ``` Set slack webhook url in your env @@ -28,6 +29,11 @@ Set slack webhook url in your env export SLACK_WEBHOOK_URL="https://hooks.slack.com/services/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" ``` +Turn off FASTAPI's default info logs +```bash +export LITELLM_LOG="ERROR" +``` + :::info Need Help or want dedicated support ? Talk to a founder [here]: (https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat) diff --git a/docs/my-website/docs/proxy/reliability.md b/docs/my-website/docs/proxy/reliability.md index e39a6765f..ace94251d 100644 --- a/docs/my-website/docs/proxy/reliability.md +++ b/docs/my-website/docs/proxy/reliability.md @@ -2,18 +2,13 @@ import Image from '@theme/IdealImage'; import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; -# 🔥 Fallbacks, Retries, Timeouts, Load Balancing +# 🔥 Load Balancing, Fallbacks, Retries, Timeouts -Retry call with multiple instances of the same model. - -If a call fails after num_retries, fall back to another model group. - -If the error is a context window exceeded error, fall back to a larger model group (if given). - -[**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm/router.py) +- Quick Start [load balancing](#test---load-balancing) +- Quick Start [client side fallbacks](#test---client-side-fallbacks) ## Quick Start - Load Balancing -### Step 1 - Set deployments on config +#### Step 1 - Set deployments on config **Example config below**. 
Here requests with `model=gpt-3.5-turbo` will be routed across multiple instances of `azure/gpt-3.5-turbo` ```yaml @@ -38,50 +33,220 @@ model_list: rpm: 1440 ``` -### Step 2: Start Proxy with config +#### Step 2: Start Proxy with config ```shell $ litellm --config /path/to/config.yaml ``` -### Step 3: Use proxy - Call a model group [Load Balancing] -Curl Command +### Test - Load Balancing + +Here requests with model=gpt-3.5-turbo will be routed across multiple instances of azure/gpt-3.5-turbo + +👉 Key Change: `model="gpt-3.5-turbo"` + +**Check the `model_id` in Response Headers to make sure the requests are being load balanced** + + + + + +```python +import openai +client = openai.OpenAI( + api_key="anything", + base_url="http://0.0.0.0:4000" +) + +response = client.chat.completions.create( + model="gpt-3.5-turbo", + messages = [ + { + "role": "user", + "content": "this is a test request, write a short poem" + } + ] +) + +print(response) +``` + + + + +Pass `metadata` as part of the request body + ```shell curl --location 'http://0.0.0.0:4000/chat/completions' \ ---header 'Content-Type: application/json' \ ---data ' { - "model": "gpt-3.5-turbo", - "messages": [ + --header 'Content-Type: application/json' \ + --data '{ + "model": "gpt-3.5-turbo", + "messages": [ { - "role": "user", - "content": "what llm are you" + "role": "user", + "content": "what llm are you" } - ], - } -' + ] +}' +``` + + + +```python +from langchain.chat_models import ChatOpenAI +from langchain.prompts.chat import ( + ChatPromptTemplate, + HumanMessagePromptTemplate, + SystemMessagePromptTemplate, +) +from langchain.schema import HumanMessage, SystemMessage +import os + +os.environ["OPENAI_API_KEY"] = "anything" + +chat = ChatOpenAI( + openai_api_base="http://0.0.0.0:4000", + model="gpt-3.5-turbo", +) + +messages = [ + SystemMessage( + content="You are a helpful assistant that im using to make a test request to." + ), + HumanMessage( + content="test from litellm. tell me why it's amazing in 1 sentence" + ), +] +response = chat(messages) + +print(response) ``` -### Usage - Call a specific model deployment -If you want to call a specific model defined in the `config.yaml`, you can call the `litellm_params: model` + + + + + +### Test - Client Side Fallbacks +In this request the following will occur: +1. The request to `model="zephyr-beta"` will fail +2. litellm proxy will loop through all the model_groups specified in `fallbacks=["gpt-3.5-turbo"]` +3. 
The request to `model="gpt-3.5-turbo"` will succeed and the client making the request will get a response from gpt-3.5-turbo + +👉 Key Change: `"fallbacks": ["gpt-3.5-turbo"]` + + + + + +```python +import openai +client = openai.OpenAI( + api_key="anything", + base_url="http://0.0.0.0:4000" +) + +response = client.chat.completions.create( + model="zephyr-beta", + messages = [ + { + "role": "user", + "content": "this is a test request, write a short poem" + } + ], + extra_body={ + "metadata": { + "fallbacks": ["gpt-3.5-turbo"] + } + } +) + +print(response) +``` + + + + +Pass `metadata` as part of the request body + +```shell +curl --location 'http://0.0.0.0:4000/chat/completions' \ + --header 'Content-Type: application/json' \ + --data '{ + "model": "zephyr-beta"", + "messages": [ + { + "role": "user", + "content": "what llm are you" + } + ], + "metadata": { + "fallbacks": ["gpt-3.5-turbo"] + } +}' +``` + + + +```python +from langchain.chat_models import ChatOpenAI +from langchain.prompts.chat import ( + ChatPromptTemplate, + HumanMessagePromptTemplate, + SystemMessagePromptTemplate, +) +from langchain.schema import HumanMessage, SystemMessage +import os + +os.environ["OPENAI_API_KEY"] = "anything" + +chat = ChatOpenAI( + openai_api_base="http://0.0.0.0:4000", + model="zephyr-beta", + extra_body={ + "metadata": { + "fallbacks": ["gpt-3.5-turbo"] + } + } +) + +messages = [ + SystemMessage( + content="You are a helpful assistant that im using to make a test request to." + ), + HumanMessage( + content="test from litellm. tell me why it's amazing in 1 sentence" + ), +] +response = chat(messages) + +print(response) +``` + + + + + + + + -## Fallbacks + Retries + Timeouts + Cooldowns +## Advanced +### Fallbacks + Retries + Timeouts + Cooldowns **Set via config** ```yaml @@ -114,44 +279,7 @@ litellm_settings: context_window_fallbacks: [{"zephyr-beta": ["gpt-3.5-turbo-16k"]}, {"gpt-3.5-turbo": ["gpt-3.5-turbo-16k"]}] # fallback to gpt-3.5-turbo-16k if context window error allowed_fails: 3 # cooldown model if it fails > 1 call in a minute. ``` - -**Set dynamically** - -```bash -curl --location 'http://0.0.0.0:4000/chat/completions' \ ---header 'Content-Type: application/json' \ ---data ' { - "model": "zephyr-beta", - "messages": [ - { - "role": "user", - "content": "what llm are you" - } - ], - "fallbacks": [{"zephyr-beta": ["gpt-3.5-turbo"]}], - "context_window_fallbacks": [{"zephyr-beta": ["gpt-3.5-turbo"]}], - "num_retries": 2, - "timeout": 10 - } -' -``` - -### Test it! - - -```bash -curl --location 'http://0.0.0.0:4000/chat/completions' \ - --header 'Content-Type: application/json' \ - --data-raw '{ - "model": "zephyr-beta", # 👈 MODEL NAME to fallback from - "messages": [ - {"role": "user", "content": "what color is red"} - ], - "mock_testing_fallbacks": true - }' -``` - -## Advanced - Context Window Fallbacks (Pre-Call Checks + Fallbacks) +### Context Window Fallbacks (Pre-Call Checks + Fallbacks) **Before call is made** check if a call is within model context window with **`enable_pre_call_checks: true`**. @@ -287,7 +415,7 @@ print(response) -## Advanced - EU-Region Filtering (Pre-Call Checks) +### EU-Region Filtering (Pre-Call Checks) **Before call is made** check if a call is within model context window with **`enable_pre_call_checks: true`**. 
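A minimal client-side sketch for this section (the proxy URL and virtual key are placeholders, and the `x-litellm-model-api-base` header name is taken from the example further down): reading the raw response headers is the easiest way to confirm which deployment the proxy actually routed to, e.g. that a region-pinned key only ever hit the expected endpoints.

```python
import openai

# Placeholder proxy URL + virtual key - swap in your own values
client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")

# with_raw_response exposes the HTTP headers returned by the proxy
raw = client.chat.completions.with_raw_response.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Who are you?"}],
)

# The proxy reports the upstream deployment it routed to in this header
print(raw.headers.get("x-litellm-model-api-base"))

response = raw.parse()  # the usual ChatCompletion object
print(response.choices[0].message.content)
```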
@@ -350,7 +478,7 @@ print(response) print(f"response.headers.get('x-litellm-model-api-base')") ``` -## Advanced - Custom Timeouts, Stream Timeouts - Per Model +### Custom Timeouts, Stream Timeouts - Per Model For each model you can set `timeout` & `stream_timeout` under `litellm_params` ```yaml model_list: @@ -379,7 +507,7 @@ $ litellm --config /path/to/config.yaml ``` -## Advanced - Setting Dynamic Timeouts - Per Request +### Setting Dynamic Timeouts - Per Request LiteLLM Proxy supports setting a `timeout` per request diff --git a/docs/my-website/sidebars.js b/docs/my-website/sidebars.js index b6b597d30..ff110bb62 100644 --- a/docs/my-website/sidebars.js +++ b/docs/my-website/sidebars.js @@ -255,6 +255,7 @@ const sidebars = { "projects/GPT Migrate", "projects/YiVal", "projects/LiteLLM Proxy", + "projects/llm_cord", ], }, ], diff --git a/litellm/__init__.py b/litellm/__init__.py index b6e6d97dc..e92ae355e 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -709,6 +709,7 @@ all_embedding_models = ( openai_image_generation_models = ["dall-e-2", "dall-e-3"] from .timeout import timeout +from .cost_calculator import completion_cost from .utils import ( client, exception_type, @@ -718,7 +719,6 @@ from .utils import ( create_pretrained_tokenizer, create_tokenizer, cost_per_token, - completion_cost, supports_function_calling, supports_parallel_function_calling, supports_vision, diff --git a/litellm/cost_calculator.py b/litellm/cost_calculator.py index 75717378b..9a763d63e 100644 --- a/litellm/cost_calculator.py +++ b/litellm/cost_calculator.py @@ -1,6 +1,7 @@ # What is this? ## File for 'response_cost' calculation in Logging -from typing import Optional, Union, Literal +from typing import Optional, Union, Literal, List +import litellm._logging from litellm.utils import ( ModelResponse, EmbeddingResponse, @@ -8,10 +9,281 @@ from litellm.utils import ( TranscriptionResponse, TextCompletionResponse, CallTypes, - completion_cost, + cost_per_token, print_verbose, + CostPerToken, + token_counter, ) import litellm +from litellm import verbose_logger + + +# Extract the number of billion parameters from the model name +# only used for together_computer LLMs +def get_model_params_and_category(model_name) -> str: + """ + Helper function for calculating together ai pricing. 
+ + Returns + - str - model pricing category if mapped else received model name + """ + import re + + model_name = model_name.lower() + re_params_match = re.search( + r"(\d+b)", model_name + ) # catch all decimals like 3b, 70b, etc + category = None + if re_params_match is not None: + params_match = str(re_params_match.group(1)) + params_match = params_match.replace("b", "") + if params_match is not None: + params_billion = float(params_match) + else: + return model_name + # Determine the category based on the number of parameters + if params_billion <= 4.0: + category = "together-ai-up-to-4b" + elif params_billion <= 8.0: + category = "together-ai-4.1b-8b" + elif params_billion <= 21.0: + category = "together-ai-8.1b-21b" + elif params_billion <= 41.0: + category = "together-ai-21.1b-41b" + elif params_billion <= 80.0: + category = "together-ai-41.1b-80b" + elif params_billion <= 110.0: + category = "together-ai-81.1b-110b" + if category is not None: + return category + + return model_name + + +def get_replicate_completion_pricing(completion_response=None, total_time=0.0): + # see https://replicate.com/pricing + # for all litellm currently supported LLMs, almost all requests go to a100_80gb + a100_80gb_price_per_second_public = ( + 0.001400 # assume all calls sent to A100 80GB for now + ) + if total_time == 0.0: # total time is in ms + start_time = completion_response["created"] + end_time = getattr(completion_response, "ended", time.time()) + total_time = end_time - start_time + + return a100_80gb_price_per_second_public * total_time / 1000 + + +def completion_cost( + completion_response=None, + model: Optional[str] = None, + prompt="", + messages: List = [], + completion="", + total_time=0.0, # used for replicate, sagemaker + call_type: Literal[ + "embedding", + "aembedding", + "completion", + "acompletion", + "atext_completion", + "text_completion", + "image_generation", + "aimage_generation", + "moderation", + "amoderation", + "atranscription", + "transcription", + "aspeech", + "speech", + ] = "completion", + ### REGION ### + custom_llm_provider=None, + region_name=None, # used for bedrock pricing + ### IMAGE GEN ### + size=None, + quality=None, + n=None, # number of images + ### CUSTOM PRICING ### + custom_cost_per_token: Optional[CostPerToken] = None, + custom_cost_per_second: Optional[float] = None, +) -> float: + """ + Calculate the cost of a given completion call fot GPT-3.5-turbo, llama2, any litellm supported llm. + + Parameters: + completion_response (litellm.ModelResponses): [Required] The response received from a LiteLLM completion request. + + [OPTIONAL PARAMS] + model (str): Optional. The name of the language model used in the completion calls + prompt (str): Optional. The input prompt passed to the llm + completion (str): Optional. The output completion text from the llm + total_time (float): Optional. (Only used for Replicate LLMs) The total time used for the request in seconds + custom_cost_per_token: Optional[CostPerToken]: the cost per input + output token for the llm api call. + custom_cost_per_second: Optional[float]: the cost per second for the llm api call. + + Returns: + float: The cost in USD dollars for the completion based on the provided parameters. + + Exceptions: + Raises exception if model not in the litellm model cost map. 
Register model, via custom pricing or PR - https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json + + + Note: + - If completion_response is provided, the function extracts token information and the model name from it. + - If completion_response is not provided, the function calculates token counts based on the model and input text. + - The cost is calculated based on the model, prompt tokens, and completion tokens. + - For certain models containing "togethercomputer" in the name, prices are based on the model size. + - For un-mapped Replicate models, the cost is calculated based on the total time used for the request. + """ + try: + if ( + (call_type == "aimage_generation" or call_type == "image_generation") + and model is not None + and isinstance(model, str) + and len(model) == 0 + and custom_llm_provider == "azure" + ): + model = "dall-e-2" # for dall-e-2, azure expects an empty model name + # Handle Inputs to completion_cost + prompt_tokens = 0 + completion_tokens = 0 + custom_llm_provider = None + if completion_response is not None: + # get input/output tokens from completion_response + prompt_tokens = completion_response.get("usage", {}).get("prompt_tokens", 0) + completion_tokens = completion_response.get("usage", {}).get( + "completion_tokens", 0 + ) + total_time = completion_response.get("_response_ms", 0) + verbose_logger.debug( + f"completion_response response ms: {completion_response.get('_response_ms')} " + ) + model = model or completion_response.get( + "model", None + ) # check if user passed an override for model, if it's none check completion_response['model'] + if hasattr(completion_response, "_hidden_params"): + if ( + completion_response._hidden_params.get("model", None) is not None + and len(completion_response._hidden_params["model"]) > 0 + ): + model = completion_response._hidden_params.get("model", model) + custom_llm_provider = completion_response._hidden_params.get( + "custom_llm_provider", "" + ) + region_name = completion_response._hidden_params.get( + "region_name", region_name + ) + size = completion_response._hidden_params.get( + "optional_params", {} + ).get( + "size", "1024-x-1024" + ) # openai default + quality = completion_response._hidden_params.get( + "optional_params", {} + ).get( + "quality", "standard" + ) # openai default + n = completion_response._hidden_params.get("optional_params", {}).get( + "n", 1 + ) # openai default + else: + if len(messages) > 0: + prompt_tokens = token_counter(model=model, messages=messages) + elif len(prompt) > 0: + prompt_tokens = token_counter(model=model, text=prompt) + completion_tokens = token_counter(model=model, text=completion) + if model is None: + raise ValueError( + f"Model is None and does not exist in passed completion_response. 
Passed completion_response={completion_response}, model={model}" + ) + + if ( + call_type == CallTypes.image_generation.value + or call_type == CallTypes.aimage_generation.value + ): + ### IMAGE GENERATION COST CALCULATION ### + if custom_llm_provider == "vertex_ai": + # https://cloud.google.com/vertex-ai/generative-ai/pricing + # Vertex Charges Flat $0.20 per image + return 0.020 + + # fix size to match naming convention + if "x" in size and "-x-" not in size: + size = size.replace("x", "-x-") + image_gen_model_name = f"{size}/{model}" + image_gen_model_name_with_quality = image_gen_model_name + if quality is not None: + image_gen_model_name_with_quality = f"{quality}/{image_gen_model_name}" + size = size.split("-x-") + height = int(size[0]) # if it's 1024-x-1024 vs. 1024x1024 + width = int(size[1]) + verbose_logger.debug(f"image_gen_model_name: {image_gen_model_name}") + verbose_logger.debug( + f"image_gen_model_name_with_quality: {image_gen_model_name_with_quality}" + ) + if image_gen_model_name in litellm.model_cost: + return ( + litellm.model_cost[image_gen_model_name]["input_cost_per_pixel"] + * height + * width + * n + ) + elif image_gen_model_name_with_quality in litellm.model_cost: + return ( + litellm.model_cost[image_gen_model_name_with_quality][ + "input_cost_per_pixel" + ] + * height + * width + * n + ) + else: + raise Exception( + f"Model={image_gen_model_name} not found in completion cost model map" + ) + # Calculate cost based on prompt_tokens, completion_tokens + if ( + "togethercomputer" in model + or "together_ai" in model + or custom_llm_provider == "together_ai" + ): + # together ai prices based on size of llm + # get_model_params_and_category takes a model name and returns the category of LLM size it is in model_prices_and_context_window.json + model = get_model_params_and_category(model) + # replicate llms are calculate based on time for request running + # see https://replicate.com/pricing + elif ( + model in litellm.replicate_models or "replicate" in model + ) and model not in litellm.model_cost: + # for unmapped replicate model, default to replicate's time tracking logic + return get_replicate_completion_pricing(completion_response, total_time) + + if model is None: + raise ValueError( + f"Model is None and does not exist in passed completion_response. 
Passed completion_response={completion_response}, model={model}" + ) + + ( + prompt_tokens_cost_usd_dollar, + completion_tokens_cost_usd_dollar, + ) = cost_per_token( + model=model, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + custom_llm_provider=custom_llm_provider, + response_time_ms=total_time, + region_name=region_name, + custom_cost_per_second=custom_cost_per_second, + custom_cost_per_token=custom_cost_per_token, + ) + _final_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar + print_verbose( + f"final cost: {_final_cost}; prompt_tokens_cost_usd_dollar: {prompt_tokens_cost_usd_dollar}; completion_tokens_cost_usd_dollar: {completion_tokens_cost_usd_dollar}" + ) + return _final_cost + except Exception as e: + raise e def response_cost_calculator( @@ -47,7 +319,7 @@ def response_cost_calculator( ) -> Optional[float]: try: response_cost: float = 0.0 - if cache_hit is not None and cache_hit == True: + if cache_hit is not None and cache_hit is True: response_cost = 0.0 else: response_object._hidden_params["optional_params"] = optional_params @@ -62,9 +334,11 @@ def response_cost_calculator( if ( model in litellm.model_cost and custom_pricing is not None - and custom_llm_provider == True + and custom_llm_provider is True ): # override defaults if custom pricing is set base_model = model + elif base_model is None: + base_model = model # base_model defaults to None if not set on model_info response_cost = completion_cost( completion_response=response_object, diff --git a/litellm/exceptions.py b/litellm/exceptions.py index 484e843b6..886b5889d 100644 --- a/litellm/exceptions.py +++ b/litellm/exceptions.py @@ -20,7 +20,7 @@ class AuthenticationError(openai.AuthenticationError): # type: ignore message, llm_provider, model, - response: httpx.Response, + response: Optional[httpx.Response] = None, litellm_debug_info: Optional[str] = None, max_retries: Optional[int] = None, num_retries: Optional[int] = None, @@ -32,8 +32,14 @@ class AuthenticationError(openai.AuthenticationError): # type: ignore self.litellm_debug_info = litellm_debug_info self.max_retries = max_retries self.num_retries = num_retries + self.response = response or httpx.Response( + status_code=self.status_code, + request=httpx.Request( + method="GET", url="https://litellm.ai" + ), # mock request object + ) super().__init__( - self.message, response=response, body=None + self.message, response=self.response, body=None ) # Call the base class constructor with the parameters it needs def __str__(self): @@ -60,7 +66,7 @@ class NotFoundError(openai.NotFoundError): # type: ignore message, model, llm_provider, - response: httpx.Response, + response: Optional[httpx.Response] = None, litellm_debug_info: Optional[str] = None, max_retries: Optional[int] = None, num_retries: Optional[int] = None, @@ -72,8 +78,14 @@ class NotFoundError(openai.NotFoundError): # type: ignore self.litellm_debug_info = litellm_debug_info self.max_retries = max_retries self.num_retries = num_retries + self.response = response or httpx.Response( + status_code=self.status_code, + request=httpx.Request( + method="GET", url="https://litellm.ai" + ), # mock request object + ) super().__init__( - self.message, response=response, body=None + self.message, response=self.response, body=None ) # Call the base class constructor with the parameters it needs def __str__(self): @@ -262,7 +274,7 @@ class RateLimitError(openai.RateLimitError): # type: ignore message, llm_provider, model, - response: httpx.Response, + response: 
Optional[httpx.Response] = None, litellm_debug_info: Optional[str] = None, max_retries: Optional[int] = None, num_retries: Optional[int] = None, @@ -274,8 +286,18 @@ class RateLimitError(openai.RateLimitError): # type: ignore self.litellm_debug_info = litellm_debug_info self.max_retries = max_retries self.num_retries = num_retries + if response is None: + self.response = httpx.Response( + status_code=429, + request=httpx.Request( + method="POST", + url=" https://cloud.google.com/vertex-ai/", + ), + ) + else: + self.response = response super().__init__( - self.message, response=response, body=None + self.message, response=self.response, body=None ) # Call the base class constructor with the parameters it needs def __str__(self): @@ -421,7 +443,7 @@ class ServiceUnavailableError(openai.APIStatusError): # type: ignore message, llm_provider, model, - response: httpx.Response, + response: Optional[httpx.Response] = None, litellm_debug_info: Optional[str] = None, max_retries: Optional[int] = None, num_retries: Optional[int] = None, @@ -433,8 +455,18 @@ class ServiceUnavailableError(openai.APIStatusError): # type: ignore self.litellm_debug_info = litellm_debug_info self.max_retries = max_retries self.num_retries = num_retries + if response is None: + self.response = httpx.Response( + status_code=self.status_code, + request=httpx.Request( + method="POST", + url=" https://cloud.google.com/vertex-ai/", + ), + ) + else: + self.response = response super().__init__( - self.message, response=response, body=None + self.message, response=self.response, body=None ) # Call the base class constructor with the parameters it needs def __str__(self): @@ -460,7 +492,7 @@ class InternalServerError(openai.InternalServerError): # type: ignore message, llm_provider, model, - response: httpx.Response, + response: Optional[httpx.Response] = None, litellm_debug_info: Optional[str] = None, max_retries: Optional[int] = None, num_retries: Optional[int] = None, @@ -472,8 +504,18 @@ class InternalServerError(openai.InternalServerError): # type: ignore self.litellm_debug_info = litellm_debug_info self.max_retries = max_retries self.num_retries = num_retries + if response is None: + self.response = httpx.Response( + status_code=self.status_code, + request=httpx.Request( + method="POST", + url=" https://cloud.google.com/vertex-ai/", + ), + ) + else: + self.response = response super().__init__( - self.message, response=response, body=None + self.message, response=self.response, body=None ) # Call the base class constructor with the parameters it needs def __str__(self): diff --git a/litellm/integrations/opentelemetry.py b/litellm/integrations/opentelemetry.py index b5fbacdf3..bb9e34b1a 100644 --- a/litellm/integrations/opentelemetry.py +++ b/litellm/integrations/opentelemetry.py @@ -366,8 +366,6 @@ class OpenTelemetry(CustomLogger): ) message = choice.get("message") - if not isinstance(message, dict): - message = message.dict() tool_calls = message.get("tool_calls") if tool_calls: span.set_attribute( diff --git a/litellm/integrations/test_httpx.py b/litellm/integrations/test_httpx.py new file mode 100644 index 000000000..e69de29bb diff --git a/litellm/llms/predibase.py b/litellm/llms/predibase.py index a3245cdac..66c28acee 100644 --- a/litellm/llms/predibase.py +++ b/litellm/llms/predibase.py @@ -3,6 +3,7 @@ from functools import partial import os, types +import traceback import json from enum import Enum import requests, copy # type: ignore @@ -242,12 +243,12 @@ class PredibaseChatCompletion(BaseLLM): "details" in 
completion_response and "tokens" in completion_response["details"] ): - model_response.choices[0].finish_reason = completion_response[ - "details" - ]["finish_reason"] + model_response.choices[0].finish_reason = map_finish_reason( + completion_response["details"]["finish_reason"] + ) sum_logprob = 0 for token in completion_response["details"]["tokens"]: - if token["logprob"] != None: + if token["logprob"] is not None: sum_logprob += token["logprob"] model_response["choices"][0][ "message" @@ -265,7 +266,7 @@ class PredibaseChatCompletion(BaseLLM): ): sum_logprob = 0 for token in item["tokens"]: - if token["logprob"] != None: + if token["logprob"] is not None: sum_logprob += token["logprob"] if len(item["generated_text"]) > 0: message_obj = Message( @@ -275,7 +276,7 @@ class PredibaseChatCompletion(BaseLLM): else: message_obj = Message(content=None) choice_obj = Choices( - finish_reason=item["finish_reason"], + finish_reason=map_finish_reason(item["finish_reason"]), index=idx + 1, message=message_obj, ) @@ -285,10 +286,8 @@ class PredibaseChatCompletion(BaseLLM): ## CALCULATING USAGE prompt_tokens = 0 try: - prompt_tokens = len( - encoding.encode(model_response["choices"][0]["message"]["content"]) - ) ##[TODO] use a model-specific tokenizer here - except: + prompt_tokens = litellm.token_counter(messages=messages) + except Exception: # this should remain non blocking we should not block a response returning if calculating usage fails pass output_text = model_response["choices"][0]["message"].get("content", "") @@ -331,6 +330,7 @@ class PredibaseChatCompletion(BaseLLM): logging_obj, optional_params: dict, tenant_id: str, + timeout: Union[float, httpx.Timeout], acompletion=None, litellm_params=None, logger_fn=None, @@ -340,6 +340,7 @@ class PredibaseChatCompletion(BaseLLM): completion_url = "" input_text = "" base_url = "https://serving.app.predibase.com" + if "https" in model: completion_url = model elif api_base: @@ -349,7 +350,7 @@ class PredibaseChatCompletion(BaseLLM): completion_url = f"{base_url}/{tenant_id}/deployments/v2/llms/{model}" - if optional_params.get("stream", False) == True: + if optional_params.get("stream", False) is True: completion_url += "/generate_stream" else: completion_url += "/generate" @@ -393,9 +394,9 @@ class PredibaseChatCompletion(BaseLLM): }, ) ## COMPLETION CALL - if acompletion == True: + if acompletion is True: ### ASYNC STREAMING - if stream == True: + if stream is True: return self.async_streaming( model=model, messages=messages, @@ -410,6 +411,7 @@ class PredibaseChatCompletion(BaseLLM): litellm_params=litellm_params, logger_fn=logger_fn, headers=headers, + timeout=timeout, ) # type: ignore else: ### ASYNC COMPLETION @@ -428,10 +430,11 @@ class PredibaseChatCompletion(BaseLLM): litellm_params=litellm_params, logger_fn=logger_fn, headers=headers, + timeout=timeout, ) # type: ignore ### SYNC STREAMING - if stream == True: + if stream is True: response = requests.post( completion_url, headers=headers, @@ -452,7 +455,6 @@ class PredibaseChatCompletion(BaseLLM): headers=headers, data=json.dumps(data), ) - return self.process_response( model=model, response=response, @@ -480,23 +482,26 @@ class PredibaseChatCompletion(BaseLLM): stream, data: dict, optional_params: dict, + timeout: Union[float, httpx.Timeout], litellm_params=None, logger_fn=None, headers={}, ) -> ModelResponse: - self.async_handler = AsyncHTTPHandler( - timeout=httpx.Timeout(timeout=600.0, connect=5.0) - ) + + async_handler = AsyncHTTPHandler(timeout=httpx.Timeout(timeout=timeout)) try: - 
response = await self.async_handler.post( + response = await async_handler.post( api_base, headers=headers, data=json.dumps(data) ) except httpx.HTTPStatusError as e: raise PredibaseError( - status_code=e.response.status_code, message=e.response.text + status_code=e.response.status_code, + message="HTTPStatusError - {}".format(e.response.text), ) except Exception as e: - raise PredibaseError(status_code=500, message=str(e)) + raise PredibaseError( + status_code=500, message="{}\n{}".format(str(e), traceback.format_exc()) + ) return self.process_response( model=model, response=response, @@ -522,6 +527,7 @@ class PredibaseChatCompletion(BaseLLM): api_key, logging_obj, data: dict, + timeout: Union[float, httpx.Timeout], optional_params=None, litellm_params=None, logger_fn=None, diff --git a/litellm/main.py b/litellm/main.py index dd1fdb9f9..2c906e990 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -432,9 +432,9 @@ def mock_completion( if isinstance(mock_response, openai.APIError): raise mock_response raise litellm.APIError( - status_code=500, # type: ignore - message=str(mock_response), - llm_provider="openai", # type: ignore + status_code=getattr(mock_response, "status_code", 500), # type: ignore + message=getattr(mock_response, "text", str(mock_response)), + llm_provider=getattr(mock_response, "llm_provider", "openai"), # type: ignore model=model, # type: ignore request=httpx.Request(method="POST", url="https://api.openai.com/v1/"), ) @@ -1949,7 +1949,8 @@ def completion( ) api_base = ( - optional_params.pop("api_base", None) + api_base + or optional_params.pop("api_base", None) or optional_params.pop("base_url", None) or litellm.api_base or get_secret("PREDIBASE_API_BASE") @@ -1977,12 +1978,13 @@ def completion( custom_prompt_dict=custom_prompt_dict, api_key=api_key, tenant_id=tenant_id, + timeout=timeout, ) if ( "stream" in optional_params - and optional_params["stream"] == True - and acompletion == False + and optional_params["stream"] is True + and acompletion is False ): return _model_response response = _model_response diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json index 3fe089a6b..f2b292c92 100644 --- a/litellm/model_prices_and_context_window_backup.json +++ b/litellm/model_prices_and_context_window_backup.json @@ -3009,32 +3009,37 @@ "litellm_provider": "sagemaker", "mode": "chat" }, - "together-ai-up-to-3b": { + "together-ai-up-to-4b": { "input_cost_per_token": 0.0000001, "output_cost_per_token": 0.0000001, "litellm_provider": "together_ai" }, - "together-ai-3.1b-7b": { + "together-ai-4.1b-8b": { "input_cost_per_token": 0.0000002, "output_cost_per_token": 0.0000002, "litellm_provider": "together_ai" }, - "together-ai-7.1b-20b": { + "together-ai-8.1b-21b": { "max_tokens": 1000, - "input_cost_per_token": 0.0000004, - "output_cost_per_token": 0.0000004, + "input_cost_per_token": 0.0000003, + "output_cost_per_token": 0.0000003, "litellm_provider": "together_ai" }, - "together-ai-20.1b-40b": { + "together-ai-21.1b-41b": { "input_cost_per_token": 0.0000008, "output_cost_per_token": 0.0000008, "litellm_provider": "together_ai" }, - "together-ai-40.1b-70b": { + "together-ai-41.1b-80b": { "input_cost_per_token": 0.0000009, "output_cost_per_token": 0.0000009, "litellm_provider": "together_ai" }, + "together-ai-81.1b-110b": { + "input_cost_per_token": 0.0000018, + "output_cost_per_token": 0.0000018, + "litellm_provider": "together_ai" + }, "together_ai/mistralai/Mixtral-8x7B-Instruct-v0.1": { "input_cost_per_token": 
0.0000006, "output_cost_per_token": 0.0000006, diff --git a/litellm/proxy/_logging.py b/litellm/proxy/_logging.py index f453cef39..655da7b29 100644 --- a/litellm/proxy/_logging.py +++ b/litellm/proxy/_logging.py @@ -1,7 +1,12 @@ import json import logging from logging import Formatter -import sys +import os +from litellm import json_logs + +# Set default log level to INFO +log_level = os.getenv("LITELLM_LOG", "INFO") +numeric_level: str = getattr(logging, log_level.upper()) class JsonFormatter(Formatter): @@ -16,6 +21,14 @@ class JsonFormatter(Formatter): logger = logging.root handler = logging.StreamHandler() -handler.setFormatter(JsonFormatter()) +if json_logs: + handler.setFormatter(JsonFormatter()) +else: + formatter = logging.Formatter( + "\033[92m%(asctime)s - %(name)s:%(levelname)s\033[0m: %(filename)s:%(lineno)s - %(message)s", + datefmt="%H:%M:%S", + ) + + handler.setFormatter(formatter) logger.handlers = [handler] -logger.setLevel(logging.INFO) +logger.setLevel(numeric_level) diff --git a/litellm/proxy/_super_secret_config.yaml b/litellm/proxy/_super_secret_config.yaml index 450d77b0a..5674abfe2 100644 --- a/litellm/proxy/_super_secret_config.yaml +++ b/litellm/proxy/_super_secret_config.yaml @@ -8,6 +8,17 @@ model_list: - model_name: llama3-70b-8192 litellm_params: model: groq/llama3-70b-8192 +- model_name: fake-openai-endpoint + litellm_params: + model: predibase/llama-3-8b-instruct + api_base: "http://0.0.0.0:8081" + api_key: os.environ/PREDIBASE_API_KEY + tenant_id: os.environ/PREDIBASE_TENANT_ID + max_retries: 0 + temperature: 0.1 + max_new_tokens: 256 + return_full_text: false + # - litellm_params: # api_base: https://my-endpoint-europe-berri-992.openai.azure.com/ # api_key: os.environ/AZURE_EUROPE_API_KEY @@ -57,6 +68,8 @@ router_settings: litellm_settings: success_callback: ["langfuse"] cache: True + failure_callback: ["langfuse"] + general_settings: alerting: ["email"] diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index 140948e51..924125b47 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -160,6 +160,7 @@ from litellm.proxy.auth.auth_checks import ( get_user_object, allowed_routes_check, get_actual_routes, + log_to_opentelemetry, ) from litellm.llms.custom_httpx.httpx_handler import HTTPHandler from litellm.exceptions import RejectedRequestError @@ -368,6 +369,11 @@ from typing import Dict api_key_header = APIKeyHeader( name="Authorization", auto_error=False, description="Bearer token" ) +azure_api_key_header = APIKeyHeader( + name="API-Key", + auto_error=False, + description="Some older versions of the openai Python package will send an API-Key header with just the API key ", +) user_api_base = None user_model = None user_debug = False @@ -508,13 +514,19 @@ async def check_request_disconnection(request: Request, llm_api_call_task): async def user_api_key_auth( - request: Request, api_key: str = fastapi.Security(api_key_header) + request: Request, + api_key: str = fastapi.Security(api_key_header), + azure_api_key_header: str = fastapi.Security(azure_api_key_header), ) -> UserAPIKeyAuth: global master_key, prisma_client, llm_model_list, user_custom_auth, custom_db_client, general_settings, proxy_logging_obj try: if isinstance(api_key, str): passed_in_key = api_key api_key = _get_bearer_token(api_key=api_key) + + elif isinstance(azure_api_key_header, str): + api_key = azure_api_key_header + parent_otel_span: Optional[Span] = None if open_telemetry_logger is not None: parent_otel_span = 
open_telemetry_logger.tracer.start_span( @@ -1495,7 +1507,7 @@ async def user_api_key_auth( ) if valid_token is None: # No token was found when looking up in the DB - raise Exception("Invalid token passed") + raise Exception("Invalid proxy server token passed") if valid_token_dict is not None: if user_id_information is not None and _is_user_proxy_admin( user_id_information @@ -1528,6 +1540,14 @@ async def user_api_key_auth( str(e) ) ) + + # Log this exception to OTEL + if open_telemetry_logger is not None: + await open_telemetry_logger.async_post_call_failure_hook( + original_exception=e, + user_api_key_dict=UserAPIKeyAuth(parent_otel_span=parent_otel_span), + ) + verbose_proxy_logger.debug(traceback.format_exc()) if isinstance(e, litellm.BudgetExceededError): raise ProxyException( @@ -7803,6 +7823,10 @@ async def get_global_spend_report( default=None, description="Time till which to view spend", ), + group_by: Optional[Literal["team", "customer"]] = fastapi.Query( + default="team", + description="Group spend by internal team or customer", + ), ): """ Get Daily Spend per Team, based on specific startTime and endTime. Per team, view usage by each key, model @@ -7849,69 +7873,130 @@ async def get_global_spend_report( f"Database not connected. Connect a database to your proxy - https://docs.litellm.ai/docs/simple_proxy#managing-auth---virtual-keys" ) - # first get data from spend logs -> SpendByModelApiKey - # then read data from "SpendByModelApiKey" to format the response obj - sql_query = """ + if group_by == "team": + # first get data from spend logs -> SpendByModelApiKey + # then read data from "SpendByModelApiKey" to format the response obj + sql_query = """ - WITH SpendByModelApiKey AS ( - SELECT - date_trunc('day', sl."startTime") AS group_by_day, - COALESCE(tt.team_alias, 'Unassigned Team') AS team_name, - sl.model, - sl.api_key, - SUM(sl.spend) AS model_api_spend, - SUM(sl.total_tokens) AS model_api_tokens - FROM - "LiteLLM_SpendLogs" sl - LEFT JOIN - "LiteLLM_TeamTable" tt - ON - sl.team_id = tt.team_id - WHERE - sl."startTime" BETWEEN $1::date AND $2::date - GROUP BY - date_trunc('day', sl."startTime"), - tt.team_alias, - sl.model, - sl.api_key - ) + WITH SpendByModelApiKey AS ( + SELECT + date_trunc('day', sl."startTime") AS group_by_day, + COALESCE(tt.team_alias, 'Unassigned Team') AS team_name, + sl.model, + sl.api_key, + SUM(sl.spend) AS model_api_spend, + SUM(sl.total_tokens) AS model_api_tokens + FROM + "LiteLLM_SpendLogs" sl + LEFT JOIN + "LiteLLM_TeamTable" tt + ON + sl.team_id = tt.team_id + WHERE + sl."startTime" BETWEEN $1::date AND $2::date + GROUP BY + date_trunc('day', sl."startTime"), + tt.team_alias, + sl.model, + sl.api_key + ) + SELECT + group_by_day, + jsonb_agg(jsonb_build_object( + 'team_name', team_name, + 'total_spend', total_spend, + 'metadata', metadata + )) AS teams + FROM ( + SELECT + group_by_day, + team_name, + SUM(model_api_spend) AS total_spend, + jsonb_agg(jsonb_build_object( + 'model', model, + 'api_key', api_key, + 'spend', model_api_spend, + 'total_tokens', model_api_tokens + )) AS metadata + FROM + SpendByModelApiKey + GROUP BY + group_by_day, + team_name + ) AS aggregated + GROUP BY + group_by_day + ORDER BY + group_by_day; + """ + + db_response = await prisma_client.db.query_raw( + sql_query, start_date_obj, end_date_obj + ) + if db_response is None: + return [] + + return db_response + + elif group_by == "customer": + sql_query = """ + + WITH SpendByModelApiKey AS ( + SELECT + date_trunc('day', sl."startTime") AS group_by_day, + sl.end_user AS 
customer, + sl.model, + sl.api_key, + SUM(sl.spend) AS model_api_spend, + SUM(sl.total_tokens) AS model_api_tokens + FROM + "LiteLLM_SpendLogs" sl + WHERE + sl."startTime" BETWEEN $1::date AND $2::date + GROUP BY + date_trunc('day', sl."startTime"), + customer, + sl.model, + sl.api_key + ) SELECT group_by_day, jsonb_agg(jsonb_build_object( - 'team_name', team_name, + 'customer', customer, 'total_spend', total_spend, 'metadata', metadata - )) AS teams - FROM ( - SELECT - group_by_day, - team_name, - SUM(model_api_spend) AS total_spend, - jsonb_agg(jsonb_build_object( - 'model', model, - 'api_key', api_key, - 'spend', model_api_spend, - 'total_tokens', model_api_tokens - )) AS metadata - FROM - SpendByModelApiKey - GROUP BY - group_by_day, - team_name - ) AS aggregated + )) AS customers + FROM + ( + SELECT + group_by_day, + customer, + SUM(model_api_spend) AS total_spend, + jsonb_agg(jsonb_build_object( + 'model', model, + 'api_key', api_key, + 'spend', model_api_spend, + 'total_tokens', model_api_tokens + )) AS metadata + FROM + SpendByModelApiKey + GROUP BY + group_by_day, + customer + ) AS aggregated GROUP BY group_by_day ORDER BY group_by_day; - """ + """ - db_response = await prisma_client.db.query_raw( - sql_query, start_date_obj, end_date_obj - ) - if db_response is None: - return [] + db_response = await prisma_client.db.query_raw( + sql_query, start_date_obj, end_date_obj + ) + if db_response is None: + return [] - return db_response + return db_response except Exception as e: raise HTTPException( diff --git a/litellm/router.py b/litellm/router.py index bfd1dafe9..adf8f4897 100644 --- a/litellm/router.py +++ b/litellm/router.py @@ -2056,12 +2056,15 @@ class Router: verbose_router_logger.debug(f"inside model fallbacks: {fallbacks}") generic_fallback_idx: Optional[int] = None ## check for specific model group-specific fallbacks - for idx, item in enumerate(fallbacks): - if list(item.keys())[0] == model_group: - fallback_model_group = item[model_group] - break - elif list(item.keys())[0] == "*": - generic_fallback_idx = idx + if isinstance(fallbacks, list): + fallback_model_group = fallbacks + elif isinstance(fallbacks, dict): + for idx, item in enumerate(fallbacks): + if list(item.keys())[0] == model_group: + fallback_model_group = item[model_group] + break + elif list(item.keys())[0] == "*": + generic_fallback_idx = idx ## if none, check for generic fallback if ( fallback_model_group is None @@ -2310,13 +2313,16 @@ class Router: verbose_router_logger.debug(f"inside model fallbacks: {fallbacks}") fallback_model_group = None generic_fallback_idx: Optional[int] = None - ## check for specific model group-specific fallbacks - for idx, item in enumerate(fallbacks): - if list(item.keys())[0] == model_group: - fallback_model_group = item[model_group] - break - elif list(item.keys())[0] == "*": - generic_fallback_idx = idx + if isinstance(fallbacks, list): + fallback_model_group = fallbacks + elif isinstance(fallbacks, dict): + ## check for specific model group-specific fallbacks + for idx, item in enumerate(fallbacks): + if list(item.keys())[0] == model_group: + fallback_model_group = item[model_group] + break + elif list(item.keys())[0] == "*": + generic_fallback_idx = idx ## if none, check for generic fallback if ( fallback_model_group is None diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index 4ac727cd2..2428cbf48 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -345,7 +345,7 @@ def 
test_completion_claude_3_function_call(model): drop_params=True, ) - # Add any assertions, here to check response args + # Add any assertions here to check response args print(response) assert isinstance(response.choices[0].message.tool_calls[0].function.name, str) assert isinstance( @@ -530,6 +530,7 @@ def test_completion_cohere_command_r_plus_function_call(): messages=messages, tools=tools, tool_choice="auto", + force_single_step=True, ) print(second_response) except Exception as e: diff --git a/litellm/tests/test_completion_cost.py b/litellm/tests/test_completion_cost.py index 7820e2af3..c0be350f9 100644 --- a/litellm/tests/test_completion_cost.py +++ b/litellm/tests/test_completion_cost.py @@ -517,3 +517,51 @@ def test_groq_response_cost_tracking(is_streaming): assert response_cost > 0.0 print(f"response_cost: {response_cost}") + + +def test_together_ai_qwen_completion_cost(): + input_kwargs = { + "completion_response": litellm.ModelResponse( + **{ + "id": "890db0c33c4ef94b-SJC", + "choices": [ + { + "finish_reason": "eos", + "index": 0, + "message": { + "content": "I am Qwen, a large language model created by Alibaba Cloud.", + "role": "assistant", + }, + } + ], + "created": 1717900130, + "model": "together_ai/qwen/Qwen2-72B-Instruct", + "object": "chat.completion", + "system_fingerprint": None, + "usage": { + "completion_tokens": 15, + "prompt_tokens": 23, + "total_tokens": 38, + }, + } + ), + "model": "qwen/Qwen2-72B-Instruct", + "prompt": "", + "messages": [], + "completion": "", + "total_time": 0.0, + "call_type": "completion", + "custom_llm_provider": "together_ai", + "region_name": None, + "size": None, + "quality": None, + "n": None, + "custom_cost_per_token": None, + "custom_cost_per_second": None, + } + + response = litellm.cost_calculator.get_model_params_and_category( + model_name="qwen/Qwen2-72B-Instruct" + ) + + assert response == "together-ai-41.1b-80b" diff --git a/litellm/tests/test_exceptions.py b/litellm/tests/test_exceptions.py index ee695dcd7..1082dd2f8 100644 --- a/litellm/tests/test_exceptions.py +++ b/litellm/tests/test_exceptions.py @@ -3,6 +3,7 @@ import os import sys import traceback import subprocess, asyncio +from typing import Any sys.path.insert( 0, os.path.abspath("../..") @@ -19,6 +20,7 @@ from litellm import ( ) from concurrent.futures import ThreadPoolExecutor import pytest +from unittest.mock import patch, MagicMock litellm.vertex_project = "pathrise-convert-1606954137718" litellm.vertex_location = "us-central1" @@ -655,3 +657,47 @@ def test_litellm_predibase_exception(): # accuracy_score = counts[True]/(counts[True] + counts[False]) # print(f"accuracy_score: {accuracy_score}") + + +@pytest.mark.parametrize("provider", ["predibase"]) +def test_exception_mapping(provider): + """ + For predibase, run through a set of mock exceptions + + assert that they are being mapped correctly + """ + litellm.set_verbose = True + error_map = { + 400: litellm.BadRequestError, + 401: litellm.AuthenticationError, + 404: litellm.NotFoundError, + 408: litellm.Timeout, + 429: litellm.RateLimitError, + 500: litellm.InternalServerError, + 503: litellm.ServiceUnavailableError, + } + + for code, expected_exception in error_map.items(): + mock_response = Exception() + setattr(mock_response, "text", "This is an error message") + setattr(mock_response, "llm_provider", provider) + setattr(mock_response, "status_code", code) + + response: Any = None + try: + response = completion( + model="{}/test-model".format(provider), + messages=[{"role": "user", "content": "Hey, how's it 
going?"}], + mock_response=mock_response, + ) + except expected_exception: + continue + except Exception as e: + response = "{}\n{}".format(str(e), traceback.format_exc()) + pytest.fail( + "Did not raise expected exception. Expected={}, Return={},".format( + expected_exception, response + ) + ) + + pass diff --git a/litellm/tests/test_key_generate_prisma.py b/litellm/tests/test_key_generate_prisma.py index 083d84c2b..2f439862e 100644 --- a/litellm/tests/test_key_generate_prisma.py +++ b/litellm/tests/test_key_generate_prisma.py @@ -272,7 +272,7 @@ def test_call_with_invalid_key(prisma_client): except Exception as e: print("Got Exception", e) print(e.message) - assert "Authentication Error, Invalid token passed" in e.message + assert "Authentication Error, Invalid proxy server token passed" in e.message pass diff --git a/litellm/tests/test_router_fallbacks.py b/litellm/tests/test_router_fallbacks.py index 6e483b9fe..c6e0e5411 100644 --- a/litellm/tests/test_router_fallbacks.py +++ b/litellm/tests/test_router_fallbacks.py @@ -1059,3 +1059,53 @@ async def test_default_model_fallbacks(sync_mode, litellm_module_fallbacks): assert isinstance(response, litellm.ModelResponse) assert response.model is not None and response.model == "gpt-4o" + + +@pytest.mark.parametrize("sync_mode", [True, False]) +@pytest.mark.asyncio +async def test_client_side_fallbacks_list(sync_mode): + """ + + Tests Client Side Fallbacks + + User can pass "fallbacks": ["gpt-3.5-turbo"] and this should work + + """ + router = Router( + model_list=[ + { + "model_name": "bad-model", + "litellm_params": { + "model": "openai/my-bad-model", + "api_key": "my-bad-api-key", + }, + }, + { + "model_name": "my-good-model", + "litellm_params": { + "model": "gpt-4o", + "api_key": os.getenv("OPENAI_API_KEY"), + }, + }, + ], + ) + + if sync_mode: + response = router.completion( + model="bad-model", + messages=[{"role": "user", "content": "Hey, how's it going?"}], + fallbacks=["my-good-model"], + mock_testing_fallbacks=True, + mock_response="Hey! nice day", + ) + else: + response = await router.acompletion( + model="bad-model", + messages=[{"role": "user", "content": "Hey, how's it going?"}], + fallbacks=["my-good-model"], + mock_testing_fallbacks=True, + mock_response="Hey! 
nice day", + ) + + assert isinstance(response, litellm.ModelResponse) + assert response.model is not None and response.model == "gpt-4o" diff --git a/litellm/utils.py b/litellm/utils.py index 52e94f28f..4602d8651 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -326,6 +326,22 @@ class Function(OpenAIObject): super(Function, self).__init__(**data) + def __contains__(self, key): + # Define custom behavior for the 'in' operator + return hasattr(self, key) + + def get(self, key, default=None): + # Custom .get() method to access attributes with a default value if the attribute doesn't exist + return getattr(self, key, default) + + def __getitem__(self, key): + # Allow dictionary-style access to attributes + return getattr(self, key) + + def __setitem__(self, key, value): + # Allow dictionary-style assignment of attributes + setattr(self, key, value) + class ChatCompletionDeltaToolCall(OpenAIObject): id: Optional[str] = None @@ -385,6 +401,22 @@ class ChatCompletionMessageToolCall(OpenAIObject): else: self.type = "function" + def __contains__(self, key): + # Define custom behavior for the 'in' operator + return hasattr(self, key) + + def get(self, key, default=None): + # Custom .get() method to access attributes with a default value if the attribute doesn't exist + return getattr(self, key, default) + + def __getitem__(self, key): + # Allow dictionary-style access to attributes + return getattr(self, key) + + def __setitem__(self, key, value): + # Allow dictionary-style assignment of attributes + setattr(self, key, value) + class Message(OpenAIObject): def __init__( @@ -3929,54 +3961,6 @@ def client(original_function): return wrapper -####### USAGE CALCULATOR ################ - - -# Extract the number of billion parameters from the model name -# only used for together_computer LLMs -def get_model_params_and_category(model_name): - import re - - model_name = model_name.lower() - params_match = re.search( - r"(\d+b)", model_name - ) # catch all decimals like 3b, 70b, etc - category = None - if params_match != None: - params_match = params_match.group(1) - params_match = params_match.replace("b", "") - params_billion = float(params_match) - # Determine the category based on the number of parameters - if params_billion <= 3.0: - category = "together-ai-up-to-3b" - elif params_billion <= 7.0: - category = "together-ai-3.1b-7b" - elif params_billion <= 20.0: - category = "together-ai-7.1b-20b" - elif params_billion <= 40.0: - category = "together-ai-20.1b-40b" - elif params_billion <= 70.0: - category = "together-ai-40.1b-70b" - return category - - return None - - -def get_replicate_completion_pricing(completion_response=None, total_time=0.0): - # see https://replicate.com/pricing - a100_40gb_price_per_second_public = 0.001150 - # for all litellm currently supported LLMs, almost all requests go to a100_80gb - a100_80gb_price_per_second_public = ( - 0.001400 # assume all calls sent to A100 80GB for now - ) - if total_time == 0.0: # total time is in ms - start_time = completion_response["created"] - end_time = getattr(completion_response, "ended", time.time()) - total_time = end_time - start_time - - return a100_80gb_price_per_second_public * total_time / 1000 - - @lru_cache(maxsize=128) def _select_tokenizer(model: str): if model in litellm.cohere_models and "command-r" in model: @@ -4363,7 +4347,7 @@ def _cost_per_token_custom_pricing_helper( def cost_per_token( - model="", + model: str = "", prompt_tokens=0, completion_tokens=0, response_time_ms=None, @@ -4388,6 +4372,8 @@ def cost_per_token( 
Returns: tuple: A tuple containing the cost in USD dollars for prompt tokens and completion tokens, respectively. """ + if model is None: + raise Exception("Invalid arg. Model cannot be none.") ## CUSTOM PRICING ## response_cost = _cost_per_token_custom_pricing_helper( prompt_tokens=prompt_tokens, @@ -4560,213 +4546,6 @@ def cost_per_token( ) -def completion_cost( - completion_response=None, - model=None, - prompt="", - messages: List = [], - completion="", - total_time=0.0, # used for replicate, sagemaker - call_type: Literal[ - "embedding", - "aembedding", - "completion", - "acompletion", - "atext_completion", - "text_completion", - "image_generation", - "aimage_generation", - "moderation", - "amoderation", - "atranscription", - "transcription", - "aspeech", - "speech", - ] = "completion", - ### REGION ### - custom_llm_provider=None, - region_name=None, # used for bedrock pricing - ### IMAGE GEN ### - size=None, - quality=None, - n=None, # number of images - ### CUSTOM PRICING ### - custom_cost_per_token: Optional[CostPerToken] = None, - custom_cost_per_second: Optional[float] = None, -) -> float: - """ - Calculate the cost of a given completion call fot GPT-3.5-turbo, llama2, any litellm supported llm. - - Parameters: - completion_response (litellm.ModelResponses): [Required] The response received from a LiteLLM completion request. - - [OPTIONAL PARAMS] - model (str): Optional. The name of the language model used in the completion calls - prompt (str): Optional. The input prompt passed to the llm - completion (str): Optional. The output completion text from the llm - total_time (float): Optional. (Only used for Replicate LLMs) The total time used for the request in seconds - custom_cost_per_token: Optional[CostPerToken]: the cost per input + output token for the llm api call. - custom_cost_per_second: Optional[float]: the cost per second for the llm api call. - - Returns: - float: The cost in USD dollars for the completion based on the provided parameters. - - Exceptions: - Raises exception if model not in the litellm model cost map. Register model, via custom pricing or PR - https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json - - - Note: - - If completion_response is provided, the function extracts token information and the model name from it. - - If completion_response is not provided, the function calculates token counts based on the model and input text. - - The cost is calculated based on the model, prompt tokens, and completion tokens. - - For certain models containing "togethercomputer" in the name, prices are based on the model size. - - For un-mapped Replicate models, the cost is calculated based on the total time used for the request. 
- """ - try: - if ( - (call_type == "aimage_generation" or call_type == "image_generation") - and model is not None - and isinstance(model, str) - and len(model) == 0 - and custom_llm_provider == "azure" - ): - model = "dall-e-2" # for dall-e-2, azure expects an empty model name - # Handle Inputs to completion_cost - prompt_tokens = 0 - completion_tokens = 0 - custom_llm_provider = None - if completion_response is not None: - # get input/output tokens from completion_response - prompt_tokens = completion_response.get("usage", {}).get("prompt_tokens", 0) - completion_tokens = completion_response.get("usage", {}).get( - "completion_tokens", 0 - ) - total_time = completion_response.get("_response_ms", 0) - verbose_logger.debug( - f"completion_response response ms: {completion_response.get('_response_ms')} " - ) - model = model or completion_response.get( - "model", None - ) # check if user passed an override for model, if it's none check completion_response['model'] - if hasattr(completion_response, "_hidden_params"): - if ( - completion_response._hidden_params.get("model", None) is not None - and len(completion_response._hidden_params["model"]) > 0 - ): - model = completion_response._hidden_params.get("model", model) - custom_llm_provider = completion_response._hidden_params.get( - "custom_llm_provider", "" - ) - region_name = completion_response._hidden_params.get( - "region_name", region_name - ) - size = completion_response._hidden_params.get( - "optional_params", {} - ).get( - "size", "1024-x-1024" - ) # openai default - quality = completion_response._hidden_params.get( - "optional_params", {} - ).get( - "quality", "standard" - ) # openai default - n = completion_response._hidden_params.get("optional_params", {}).get( - "n", 1 - ) # openai default - else: - if len(messages) > 0: - prompt_tokens = token_counter(model=model, messages=messages) - elif len(prompt) > 0: - prompt_tokens = token_counter(model=model, text=prompt) - completion_tokens = token_counter(model=model, text=completion) - if model == None: - raise ValueError( - f"Model is None and does not exist in passed completion_response. Passed completion_response={completion_response}, model={model}" - ) - - if ( - call_type == CallTypes.image_generation.value - or call_type == CallTypes.aimage_generation.value - ): - ### IMAGE GENERATION COST CALCULATION ### - if custom_llm_provider == "vertex_ai": - # https://cloud.google.com/vertex-ai/generative-ai/pricing - # Vertex Charges Flat $0.20 per image - return 0.020 - - # fix size to match naming convention - if "x" in size and "-x-" not in size: - size = size.replace("x", "-x-") - image_gen_model_name = f"{size}/{model}" - image_gen_model_name_with_quality = image_gen_model_name - if quality is not None: - image_gen_model_name_with_quality = f"{quality}/{image_gen_model_name}" - size = size.split("-x-") - height = int(size[0]) # if it's 1024-x-1024 vs. 
1024x1024 - width = int(size[1]) - verbose_logger.debug(f"image_gen_model_name: {image_gen_model_name}") - verbose_logger.debug( - f"image_gen_model_name_with_quality: {image_gen_model_name_with_quality}" - ) - if image_gen_model_name in litellm.model_cost: - return ( - litellm.model_cost[image_gen_model_name]["input_cost_per_pixel"] - * height - * width - * n - ) - elif image_gen_model_name_with_quality in litellm.model_cost: - return ( - litellm.model_cost[image_gen_model_name_with_quality][ - "input_cost_per_pixel" - ] - * height - * width - * n - ) - else: - raise Exception( - f"Model={image_gen_model_name} not found in completion cost model map" - ) - # Calculate cost based on prompt_tokens, completion_tokens - if ( - "togethercomputer" in model - or "together_ai" in model - or custom_llm_provider == "together_ai" - ): - # together ai prices based on size of llm - # get_model_params_and_category takes a model name and returns the category of LLM size it is in model_prices_and_context_window.json - model = get_model_params_and_category(model) - # replicate llms are calculate based on time for request running - # see https://replicate.com/pricing - elif ( - model in litellm.replicate_models or "replicate" in model - ) and model not in litellm.model_cost: - # for unmapped replicate model, default to replicate's time tracking logic - return get_replicate_completion_pricing(completion_response, total_time) - - ( - prompt_tokens_cost_usd_dollar, - completion_tokens_cost_usd_dollar, - ) = cost_per_token( - model=model, - prompt_tokens=prompt_tokens, - completion_tokens=completion_tokens, - custom_llm_provider=custom_llm_provider, - response_time_ms=total_time, - region_name=region_name, - custom_cost_per_second=custom_cost_per_second, - custom_cost_per_token=custom_cost_per_token, - ) - _final_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar - print_verbose( - f"final cost: {_final_cost}; prompt_tokens_cost_usd_dollar: {prompt_tokens_cost_usd_dollar}; completion_tokens_cost_usd_dollar: {completion_tokens_cost_usd_dollar}" - ) - return _final_cost - except Exception as e: - raise e - - def supports_httpx_timeout(custom_llm_provider: str) -> bool: """ Helper function to know if a provider implementation supports httpx timeout @@ -8986,6 +8765,75 @@ def exception_type( response=original_exception.response, litellm_debug_info=extra_information, ) + elif hasattr(original_exception, "status_code"): + if original_exception.status_code == 500: + exception_mapping_worked = True + raise litellm.InternalServerError( + message=f"PredibaseException - {original_exception.message}", + llm_provider="predibase", + model=model, + ) + elif original_exception.status_code == 401: + exception_mapping_worked = True + raise AuthenticationError( + message=f"PredibaseException - {original_exception.message}", + llm_provider="predibase", + model=model, + ) + elif original_exception.status_code == 400: + exception_mapping_worked = True + raise BadRequestError( + message=f"PredibaseException - {original_exception.message}", + llm_provider="predibase", + model=model, + ) + elif original_exception.status_code == 404: + exception_mapping_worked = True + raise NotFoundError( + message=f"PredibaseException - {original_exception.message}", + llm_provider="predibase", + model=model, + ) + elif original_exception.status_code == 408: + exception_mapping_worked = True + raise Timeout( + message=f"PredibaseException - {original_exception.message}", + model=model, + llm_provider=custom_llm_provider, + 
litellm_debug_info=extra_information, + ) + elif original_exception.status_code == 422: + exception_mapping_worked = True + raise BadRequestError( + message=f"PredibaseException - {original_exception.message}", + model=model, + llm_provider=custom_llm_provider, + litellm_debug_info=extra_information, + ) + elif original_exception.status_code == 429: + exception_mapping_worked = True + raise RateLimitError( + message=f"PredibaseException - {original_exception.message}", + model=model, + llm_provider=custom_llm_provider, + litellm_debug_info=extra_information, + ) + elif original_exception.status_code == 503: + exception_mapping_worked = True + raise ServiceUnavailableError( + message=f"PredibaseException - {original_exception.message}", + model=model, + llm_provider=custom_llm_provider, + litellm_debug_info=extra_information, + ) + elif original_exception.status_code == 504: # gateway timeout error + exception_mapping_worked = True + raise Timeout( + message=f"PredibaseException - {original_exception.message}", + model=model, + llm_provider=custom_llm_provider, + litellm_debug_info=extra_information, + ) elif custom_llm_provider == "bedrock": if ( "too many tokens" in error_str diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json index 3fe089a6b..f2b292c92 100644 --- a/model_prices_and_context_window.json +++ b/model_prices_and_context_window.json @@ -3009,32 +3009,37 @@ "litellm_provider": "sagemaker", "mode": "chat" }, - "together-ai-up-to-3b": { + "together-ai-up-to-4b": { "input_cost_per_token": 0.0000001, "output_cost_per_token": 0.0000001, "litellm_provider": "together_ai" }, - "together-ai-3.1b-7b": { + "together-ai-4.1b-8b": { "input_cost_per_token": 0.0000002, "output_cost_per_token": 0.0000002, "litellm_provider": "together_ai" }, - "together-ai-7.1b-20b": { + "together-ai-8.1b-21b": { "max_tokens": 1000, - "input_cost_per_token": 0.0000004, - "output_cost_per_token": 0.0000004, + "input_cost_per_token": 0.0000003, + "output_cost_per_token": 0.0000003, "litellm_provider": "together_ai" }, - "together-ai-20.1b-40b": { + "together-ai-21.1b-41b": { "input_cost_per_token": 0.0000008, "output_cost_per_token": 0.0000008, "litellm_provider": "together_ai" }, - "together-ai-40.1b-70b": { + "together-ai-41.1b-80b": { "input_cost_per_token": 0.0000009, "output_cost_per_token": 0.0000009, "litellm_provider": "together_ai" }, + "together-ai-81.1b-110b": { + "input_cost_per_token": 0.0000018, + "output_cost_per_token": 0.0000018, + "litellm_provider": "together_ai" + }, "together_ai/mistralai/Mixtral-8x7B-Instruct-v0.1": { "input_cost_per_token": 0.0000006, "output_cost_per_token": 0.0000006, diff --git a/pyproject.toml b/pyproject.toml index b6deb0dac..8255ddf79 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "litellm" -version = "1.40.7" +version = "1.40.8" description = "Library to easily interface with LLM API providers" authors = ["BerriAI"] license = "MIT" @@ -84,7 +84,7 @@ requires = ["poetry-core", "wheel"] build-backend = "poetry.core.masonry.api" [tool.commitizen] -version = "1.40.7" +version = "1.40.8" version_files = [ "pyproject.toml:^version" ] diff --git a/ruff.toml b/ruff.toml index dfb323c1b..4894ab3fc 100644 --- a/ruff.toml +++ b/ruff.toml @@ -1,3 +1,3 @@ -ignore = ["F405"] +ignore = ["F405", "E402"] extend-select = ["E501"] line-length = 120 diff --git a/ui/litellm-dashboard/src/components/chat_ui.tsx b/ui/litellm-dashboard/src/components/chat_ui.tsx index 407e33dca..d96db60c4 100644 --- 
a/ui/litellm-dashboard/src/components/chat_ui.tsx +++ b/ui/litellm-dashboard/src/components/chat_ui.tsx @@ -119,9 +119,24 @@ const ChatUI: React.FC = ({ // Now, 'options' contains the list you wanted console.log(options); // You can log it to verify the list - - // setModelInfo(options) should be inside the if block to avoid setting it when no data is available - setModelInfo(options); + + // if options.length > 0, only store unique values + if (options.length > 0) { + const uniqueModels = Array.from(new Set(options)); + + console.log("Unique models:", uniqueModels); + + // sort uniqueModels alphabetically + uniqueModels.sort((a: any, b: any) => a.label.localeCompare(b.label)); + + + console.log("Model info:", modelInfo); + + // setModelInfo(options) should be inside the if block to avoid setting it when no data is available + setModelInfo(uniqueModels); + } + + setSelectedModel(fetchedAvailableModels.data[0].id); } } catch (error) { diff --git a/ui/litellm-dashboard/src/components/model_dashboard.tsx b/ui/litellm-dashboard/src/components/model_dashboard.tsx index 73e5a7a8f..d16d8db13 100644 --- a/ui/litellm-dashboard/src/components/model_dashboard.tsx +++ b/ui/litellm-dashboard/src/components/model_dashboard.tsx @@ -1130,7 +1130,7 @@ const ModelDashboard: React.FC = ({ setSelectedAPIKey(key); }} > - ✨ {key["key_alias"]} (Enterpise only Feature) + ✨ {key["key_alias"]} (Enterprise only Feature) ); } @@ -1165,7 +1165,7 @@ const ModelDashboard: React.FC = ({ setSelectedCustomer(user); }} > - ✨ {user} (Enterpise only Feature) + ✨ {user} (Enterprise only Feature) ); }) diff --git a/ui/litellm-dashboard/src/components/navbar.tsx b/ui/litellm-dashboard/src/components/navbar.tsx index 4f587afe9..6f33d1691 100644 --- a/ui/litellm-dashboard/src/components/navbar.tsx +++ b/ui/litellm-dashboard/src/components/navbar.tsx @@ -114,7 +114,7 @@ const Navbar: React.FC = ({ textDecoration: "underline", }} > - Get enterpise license + Get enterprise license ) : null} diff --git a/ui/litellm-dashboard/src/components/usage.tsx b/ui/litellm-dashboard/src/components/usage.tsx index 732df4524..ad1aa0e57 100644 --- a/ui/litellm-dashboard/src/components/usage.tsx +++ b/ui/litellm-dashboard/src/components/usage.tsx @@ -832,7 +832,7 @@ const UsagePage: React.FC = ({ // @ts-ignore disabled={true} > - ✨ {tag} (Enterpise only Feature) + ✨ {tag} (Enterprise only Feature) ); })}
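The dict-style access methods added to `Function` and `ChatCompletionMessageToolCall` above let callers treat those response objects either as attributes or as dictionary entries, which matches how much downstream code already indexes OpenAI-shaped tool calls. A minimal, self-contained sketch of the same pattern, using a hypothetical `ToolCallLike` stand-in rather than litellm's actual classes:

```python
class ToolCallLike:
    """Illustrative stand-in for an OpenAIObject subclass; not litellm's class."""

    def __init__(self, name: str, arguments: str = "{}"):
        self.name = name
        self.arguments = arguments

    def __contains__(self, key):
        # support `key in obj` by checking attribute existence
        return hasattr(self, key)

    def get(self, key, default=None):
        # dict-like .get() with a fallback when the attribute is missing
        return getattr(self, key, default)

    def __getitem__(self, key):
        # allow obj["name"] as an alias for obj.name
        return getattr(self, key)

    def __setitem__(self, key, value):
        # allow obj["name"] = value as an alias for obj.name = value
        setattr(self, key, value)


call = ToolCallLike(name="get_weather")
assert "name" in call
assert call.get("missing", "fallback") == "fallback"
call["arguments"] = '{"city": "Paris"}'
print(call["arguments"])  # {"city": "Paris"}
```

Either access style reads and writes the same underlying attributes, so existing dict-based consumers keep working without any conversion step.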
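The new Predibase branch in `exception_type` follows the surrounding pattern of translating a provider's HTTP status codes into litellm's typed exceptions. A condensed sketch of that mapping is below; `map_predibase_error` and the lookup table are illustrative only (the diff uses an explicit if/elif chain, and some branches pass `custom_llm_provider` and `litellm_debug_info` instead), but the exception classes are the ones referenced in the added code:

```python
import litellm
from litellm import (
    AuthenticationError,
    BadRequestError,
    NotFoundError,
    RateLimitError,
    ServiceUnavailableError,
    Timeout,
)

# Hypothetical condensed form of the status-code handling added above.
STATUS_TO_EXCEPTION = {
    400: BadRequestError,
    401: AuthenticationError,
    404: NotFoundError,
    408: Timeout,
    422: BadRequestError,
    429: RateLimitError,
    500: litellm.InternalServerError,
    503: ServiceUnavailableError,
    504: Timeout,  # gateway timeout
}


def map_predibase_error(status_code: int, message: str, model: str) -> None:
    """Raise the litellm exception that corresponds to a Predibase status code."""
    exc_cls = STATUS_TO_EXCEPTION.get(status_code)
    if exc_cls is None:
        return  # unknown code: let the generic handling deal with it
    raise exc_cls(
        message=f"PredibaseException - {message}",
        llm_provider="predibase",
        model=model,
    )
```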
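The renamed `together-ai-*` entries in `model_prices_and_context_window.json` re-bucket Together AI models by parameter count (up to 4B, 4.1B to 8B, and so on, now including an 81.1B to 110B tier). The sketch below shows how a model name could be resolved to one of the new bucket keys; `together_ai_category` is a hypothetical helper, and the threshold boundaries are inferred from the bucket names rather than taken from litellm's code:

```python
import re
from typing import Optional


def together_ai_category(model_name: str) -> Optional[str]:
    """Map a Together AI model name to a size-bucket key (illustrative only;
    boundaries are inferred from the bucket names in the pricing JSON)."""
    match = re.search(r"(\d+(?:\.\d+)?)b", model_name.lower())
    if match is None:
        return None
    params_billion = float(match.group(1))
    if params_billion <= 4.0:
        return "together-ai-up-to-4b"
    elif params_billion <= 8.0:
        return "together-ai-4.1b-8b"
    elif params_billion <= 21.0:
        return "together-ai-8.1b-21b"
    elif params_billion <= 41.0:
        return "together-ai-21.1b-41b"
    elif params_billion <= 80.0:
        return "together-ai-41.1b-80b"
    elif params_billion <= 110.0:
        return "together-ai-81.1b-110b"
    return None


print(together_ai_category("meta-llama/Llama-3-70b-chat-hf"))  # together-ai-41.1b-80b
```

Note that the JSON also keeps explicit per-model Together AI entries (for example `together_ai/mistralai/Mixtral-8x7B-Instruct-v0.1`) alongside these size buckets.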