diff --git a/docs/my-website/docs/proxy/configs.md b/docs/my-website/docs/proxy/configs.md index 0a155828b..2b3edfadb 100644 --- a/docs/my-website/docs/proxy/configs.md +++ b/docs/my-website/docs/proxy/configs.md @@ -202,7 +202,7 @@ print(response) -## Save Model-specific params (API Base, API Keys, Temperature, Max Tokens, Seed, Organization, Headers etc.) +## Save Model-specific params (API Base, Keys, Temperature, Max Tokens, Organization, Headers etc.) You can use the config to save model-specific information like api_base, api_key, temperature, max_tokens, etc. [**All input params**](https://docs.litellm.ai/docs/completion/input#input-params-1) @@ -244,6 +244,45 @@ $ litellm --config /path/to/config.yaml ``` +## Load Balancing + +Use this to call multiple instances of the same model and configure things like [routing strategy](../routing.md#advanced). + +```yaml +router_settings: + routing_strategy: "latency-based-routing" # routes to the fastest deployment in the group + +model_list: + - model_name: zephyr-beta + litellm_params: + model: huggingface/HuggingFaceH4/zephyr-7b-beta + api_base: http://0.0.0.0:8001 + - model_name: zephyr-beta + litellm_params: + model: huggingface/HuggingFaceH4/zephyr-7b-beta + api_base: http://0.0.0.0:8002 + - model_name: zephyr-beta + litellm_params: + model: huggingface/HuggingFaceH4/zephyr-7b-beta + api_base: http://0.0.0.0:8003 + - model_name: gpt-3.5-turbo + litellm_params: + model: gpt-3.5-turbo + api_key: + - model_name: gpt-3.5-turbo-16k + litellm_params: + model: gpt-3.5-turbo-16k + api_key: + +litellm_settings: + num_retries: 3 # retry call 3 times on each model_name (e.g. zephyr-beta) + request_timeout: 10 # raise Timeout error if call takes longer than 10s. Sets litellm.request_timeout + fallbacks: [{"zephyr-beta": ["gpt-3.5-turbo"]}] # fallback to gpt-3.5-turbo if call fails num_retries + context_window_fallbacks: [{"zephyr-beta": ["gpt-3.5-turbo-16k"]}, {"gpt-3.5-turbo": ["gpt-3.5-turbo-16k"]}] # fallback to gpt-3.5-turbo-16k if context window error + allowed_fails: 3 # cooldown model if it fails > 1 call in a minute. +``` + + ## Set Azure `base_model` for cost tracking **Problem**: Azure returns `gpt-4` in the response when `azure/gpt-4-1106-preview` is used. This leads to inaccurate cost tracking @@ -512,30 +551,6 @@ curl --location 'http://0.0.0.0:8000/chat/completions' \ ``` -## Router Settings - -Use this to configure things like routing strategy. 
- -```yaml -router_settings: - routing_strategy: "least-busy" - -model_list: # will route requests to the least busy ollama model - - model_name: ollama-models - litellm_params: - model: "ollama/mistral" - api_base: "http://127.0.0.1:8001" - - model_name: ollama-models - litellm_params: - model: "ollama/codellama" - api_base: "http://127.0.0.1:8002" - - model_name: ollama-models - litellm_params: - model: "ollama/llama2" - api_base: "http://127.0.0.1:8003" -``` - - ## Configure DB Pool Limits + Connection Timeouts ```yaml diff --git a/litellm/integrations/custom_logger.py b/litellm/integrations/custom_logger.py index 40242f5c0..0556ceebb 100644 --- a/litellm/integrations/custom_logger.py +++ b/litellm/integrations/custom_logger.py @@ -124,7 +124,6 @@ class CustomLogger: # https://docs.litellm.ai/docs/observability/custom_callbac start_time, end_time, ) - print_verbose(f"Custom Logger - final response object: {response_obj}") except: # traceback.print_exc() print_verbose(f"Custom Logger Error - {traceback.format_exc()}") @@ -142,7 +141,6 @@ class CustomLogger: # https://docs.litellm.ai/docs/observability/custom_callbac start_time, end_time, ) - print_verbose(f"Custom Logger - final response object: {response_obj}") except: # traceback.print_exc() print_verbose(f"Custom Logger Error - {traceback.format_exc()}") diff --git a/litellm/llms/aleph_alpha.py b/litellm/llms/aleph_alpha.py index 7168e7369..3c1bd5dde 100644 --- a/litellm/llms/aleph_alpha.py +++ b/litellm/llms/aleph_alpha.py @@ -77,9 +77,9 @@ class AlephAlphaConfig: - `control_log_additive` (boolean; default value: true): Method of applying control to attention scores. """ - maximum_tokens: Optional[ - int - ] = litellm.max_tokens # aleph alpha requires max tokens + maximum_tokens: Optional[int] = ( + litellm.max_tokens + ) # aleph alpha requires max tokens minimum_tokens: Optional[int] = None echo: Optional[bool] = None temperature: Optional[int] = None @@ -285,7 +285,10 @@ def completion( ## CALCULATING USAGE - baseten charges on time, not tokens - have some mapping of cost here. 
prompt_tokens = len(encoding.encode(prompt)) completion_tokens = len( - encoding.encode(model_response["choices"][0]["message"]["content"]) + encoding.encode( + model_response["choices"][0]["message"]["content"], + disallowed_special=(), + ) ) model_response["created"] = int(time.time()) diff --git a/litellm/llms/prompt_templates/factory.py b/litellm/llms/prompt_templates/factory.py index dec87a61c..c78a71dba 100644 --- a/litellm/llms/prompt_templates/factory.py +++ b/litellm/llms/prompt_templates/factory.py @@ -575,6 +575,7 @@ def anthropic_messages_pt(messages: list): if messages[i]["role"] == "assistant": last_assistant_message_idx = i + new_messages.append(messages[-1]) if last_assistant_message_idx is not None: new_messages[last_assistant_message_idx]["content"] = new_messages[ last_assistant_message_idx diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index 47b8c1535..628f55852 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -1067,20 +1067,22 @@ async def update_database( ) data_list.append(existing_spend_obj) - # Update the cost column for the given user id - if prisma_client is not None: - await prisma_client.update_data( - data_list=data_list, - query_type="update_many", - table_name="user", - ) - elif custom_db_client is not None and user_id is not None: + if custom_db_client is not None and user_id is not None: new_spend = data_list[0].spend await custom_db_client.update_data( key=user_id, value={"spend": new_spend}, table_name="user" ) + # Update the cost column for the given user id + if prisma_client is not None: + await prisma_client.update_data( + data_list=data_list, + query_type="update_many", + table_name="user", + ) except Exception as e: - verbose_proxy_logger.info(f"Update User DB call failed to execute") + verbose_proxy_logger.info( + f"Update User DB call failed to execute {str(e)}" + ) ### UPDATE KEY SPEND ### async def _update_key_db(): @@ -1215,7 +1217,9 @@ async def update_database( await custom_db_client.insert_data(payload, table_name="spend") except Exception as e: - verbose_proxy_logger.info(f"Update Spend Logs DB failed to execute") + verbose_proxy_logger.info( + f"Update Spend Logs DB failed to execute - {str(e)}" + ) ### UPDATE KEY SPEND ### async def _update_team_db(): @@ -1286,7 +1290,9 @@ async def update_database( valid_token.spend = new_spend user_api_key_cache.set_cache(key=token, value=valid_token) except Exception as e: - verbose_proxy_logger.info(f"Update Team DB failed to execute") + verbose_proxy_logger.info( + f"Update Team DB failed to execute - {str(e)}" + ) asyncio.create_task(_update_user_db()) asyncio.create_task(_update_key_db()) diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py index 959f2b714..9e3a374ec 100644 --- a/litellm/proxy/utils.py +++ b/litellm/proxy/utils.py @@ -64,7 +64,7 @@ class ProxyLogging: litellm.callbacks.append(self.max_parallel_request_limiter) litellm.callbacks.append(self.max_budget_limiter) litellm.callbacks.append(self.cache_control_check) - # litellm.callbacks.append(self.response_taking_too_long_callback) + litellm.success_callback.append(self.response_taking_too_long_callback) for callback in litellm.callbacks: if callback not in litellm.input_callback: litellm.input_callback.append(callback) diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index 57b0e436f..f5e145769 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -82,6 +82,23 @@ def test_completion_claude(): # 
test_completion_claude() +def test_completion_claude_3_empty_response(): + messages = [ + { + "role": "system", + "content": "You are 2twNLGfqk4GMOn3ffp4p.", + }, + {"role": "user", "content": "Hi gm!"}, + {"role": "assistant", "content": "Good morning! How are you doing today?"}, + { + "role": "user", + "content": "I was hoping we could chat a bit", + }, + ] + response = litellm.completion(model="claude-3-opus-20240229", messages=messages) + print(response) + + def test_completion_claude_3(): litellm.set_verbose = True messages = [{"role": "user", "content": "Hello, world"}] diff --git a/litellm/utils.py b/litellm/utils.py index d4d85cad1..68dc137af 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -225,9 +225,28 @@ class ChatCompletionDeltaToolCall(OpenAIObject): class ChatCompletionMessageToolCall(OpenAIObject): - id: str - function: Function - type: str + def __init__( + self, + function: Union[Dict, Function], + id: Optional[str] = None, + type: Optional[str] = None, + **params, + ): + super(ChatCompletionMessageToolCall, self).__init__(**params) + if isinstance(function, Dict): + self.function = Function(**function) + else: + self.function = function + + if id is not None: + self.id = id + else: + self.id = f"{uuid.uuid4()}" + + if type is not None: + self.type = type + else: + self.type = "function" class Message(OpenAIObject): @@ -772,10 +791,10 @@ class ImageResponse(OpenAIObject): ############################################################ -def print_verbose(print_statement): +def print_verbose(print_statement, logger_only: bool = False): try: verbose_logger.debug(print_statement) - if litellm.set_verbose: + if litellm.set_verbose == True and logger_only == False: print(print_statement) # noqa except: pass @@ -1738,9 +1757,10 @@ class Logging: end_time=end_time, ) if callable(callback): # custom logger functions - print_verbose( - f"Making async function logging call for {callback}, result={result} - {self.model_call_details}" - ) + # print_verbose( + # f"Making async function logging call for {callback}, result={result} - {self.model_call_details}", + # logger_only=True, + # ) if self.stream: if ( "async_complete_streaming_response" @@ -6231,7 +6251,7 @@ def convert_to_model_response_object( return model_response_object except Exception as e: - raise Exception(f"Invalid response object {e}") + raise Exception(f"Invalid response object {traceback.format_exc()}") def acreate(*args, **kwargs): ## Thin client to handle the acreate langchain call diff --git a/proxy_server_config.yaml b/proxy_server_config.yaml index b65cefe79..64183f216 100644 --- a/proxy_server_config.yaml +++ b/proxy_server_config.yaml @@ -40,8 +40,8 @@ litellm_settings: budget_duration: 30d general_settings: master_key: sk-1234 # [OPTIONAL] Only use this if you to require all calls to contain this key (Authorization: Bearer sk-1234) - proxy_budget_rescheduler_min_time: 3 - proxy_budget_rescheduler_max_time: 6 + proxy_budget_rescheduler_min_time: 10 + proxy_budget_rescheduler_max_time: 12 # database_url: "postgresql://:@:/" # [OPTIONAL] use for token-based auth to proxy environment_variables: diff --git a/pyproject.toml b/pyproject.toml index 8de9a9f10..6d42edd23 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "litellm" -version = "1.29.2" +version = "1.29.4" description = "Library to easily interface with LLM API providers" authors = ["BerriAI"] license = "MIT" @@ -74,7 +74,7 @@ requires = ["poetry-core", "wheel"] build-backend = "poetry.core.masonry.api" 
[tool.commitizen] -version = "1.29.2" +version = "1.29.4" version_files = [ "pyproject.toml:^version" ] diff --git a/tests/test_keys.py b/tests/test_keys.py index db21a2176..5a7b79e1c 100644 --- a/tests/test_keys.py +++ b/tests/test_keys.py @@ -469,7 +469,7 @@ async def test_key_with_budgets(): break except: i + 1 - await asyncio.sleep(5) + await asyncio.sleep(10) assert reset_at_init_value != reset_at_new_value
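
For reference, a minimal client-side sketch (not part of this PR) of how the new "Load Balancing" section added to docs/my-website/docs/proxy/configs.md would be exercised: assuming the proxy has been started with `litellm --config /path/to/config.yaml` and is reachable at `http://0.0.0.0:8000` (the address used elsewhere in that doc), any OpenAI-compatible client can call the `zephyr-beta` model group and the router will spread requests across the three configured deployments. The API key below is a placeholder.

```python
# Sketch only: call the load-balanced "zephyr-beta" group through the LiteLLM proxy
# started from the config shown in the docs hunk above.
# Assumes the proxy is listening on http://0.0.0.0:8000.
import openai

client = openai.OpenAI(
    api_key="sk-1234",               # proxy master_key or a virtual key; placeholder value
    base_url="http://0.0.0.0:8000",  # LiteLLM proxy endpoint (assumed default address)
)

response = client.chat.completions.create(
    model="zephyr-beta",  # router selects a deployment from this group (latency-based routing)
    messages=[{"role": "user", "content": "Hello!"}],
)
print(response)
```

Per the `litellm_settings` added in the same hunk, a failing call is retried `num_retries` times and then falls back to `gpt-3.5-turbo` (or to `gpt-3.5-turbo-16k` on a context-window error) before an error is returned to the client.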