Merge branch 'main' into litellm_fixes_proxy_db

Ishaan Jaff 2024-03-05 18:51:55 -08:00 committed by GitHub
commit 03fa322b38
11 changed files with 117 additions and 57 deletions


@@ -202,7 +202,7 @@ print(response)
</Tabs>
-## Save Model-specific params (API Base, API Keys, Temperature, Max Tokens, Seed, Organization, Headers etc.)
+## Save Model-specific params (API Base, Keys, Temperature, Max Tokens, Organization, Headers etc.)
You can use the config to save model-specific information like api_base, api_key, temperature, max_tokens, etc.
[**All input params**](https://docs.litellm.ai/docs/completion/input#input-params-1)
@@ -244,6 +244,45 @@ $ litellm --config /path/to/config.yaml
```
+## Load Balancing
+Use this to call multiple instances of the same model and configure things like [routing strategy](../routing.md#advanced).
+```yaml
+router_settings:
+  routing_strategy: "latency-based-routing" # routes to the fastest deployment in the group
+
+model_list:
+  - model_name: zephyr-beta
+    litellm_params:
+      model: huggingface/HuggingFaceH4/zephyr-7b-beta
+      api_base: http://0.0.0.0:8001
+  - model_name: zephyr-beta
+    litellm_params:
+      model: huggingface/HuggingFaceH4/zephyr-7b-beta
+      api_base: http://0.0.0.0:8002
+  - model_name: zephyr-beta
+    litellm_params:
+      model: huggingface/HuggingFaceH4/zephyr-7b-beta
+      api_base: http://0.0.0.0:8003
+  - model_name: gpt-3.5-turbo
+    litellm_params:
+      model: gpt-3.5-turbo
+      api_key: <my-openai-key>
+  - model_name: gpt-3.5-turbo-16k
+    litellm_params:
+      model: gpt-3.5-turbo-16k
+      api_key: <my-openai-key>
+
+litellm_settings:
+  num_retries: 3 # retry call 3 times on each model_name (e.g. zephyr-beta)
+  request_timeout: 10 # raise Timeout error if call takes longer than 10s. Sets litellm.request_timeout
+  fallbacks: [{"zephyr-beta": ["gpt-3.5-turbo"]}] # fallback to gpt-3.5-turbo if call fails num_retries
+  context_window_fallbacks: [{"zephyr-beta": ["gpt-3.5-turbo-16k"]}, {"gpt-3.5-turbo": ["gpt-3.5-turbo-16k"]}] # fallback to gpt-3.5-turbo-16k if context window error
+  allowed_fails: 3 # cooldown model if it fails > 1 call in a minute.
+```
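With the config added above, all three `zephyr-beta` deployments sit behind one `model_name`, so clients simply ask for `zephyr-beta` and the router picks a deployment. A minimal client-side sketch, assuming the proxy listens on `http://0.0.0.0:8000` (as in the curl example further down) and the `openai` v1 Python SDK:

```python
import openai

# Point the OpenAI SDK at the LiteLLM proxy. The api_key is whatever key
# the proxy itself expects (e.g. a virtual key), not a provider key.
client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:8000")

# "zephyr-beta" is the shared model_name from the config above; with
# latency-based routing the proxy forwards the request to the fastest of
# the three Hugging Face deployments, retrying and falling back per
# litellm_settings on failure.
response = client.chat.completions.create(
    model="zephyr-beta",
    messages=[{"role": "user", "content": "Hello!"}],
)
print(response.choices[0].message.content)
```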
## Set Azure `base_model` for cost tracking
**Problem**: Azure returns `gpt-4` in the response when `azure/gpt-4-1106-preview` is used. This leads to inaccurate cost tracking
@@ -512,30 +551,6 @@ curl --location 'http://0.0.0.0:8000/chat/completions' \
```
-## Router Settings
-Use this to configure things like routing strategy.
-```yaml
-router_settings:
-  routing_strategy: "least-busy"
-
-model_list: # will route requests to the least busy ollama model
-  - model_name: ollama-models
-    litellm_params:
-      model: "ollama/mistral"
-      api_base: "http://127.0.0.1:8001"
-  - model_name: ollama-models
-    litellm_params:
-      model: "ollama/codellama"
-      api_base: "http://127.0.0.1:8002"
-  - model_name: ollama-models
-    litellm_params:
-      model: "ollama/llama2"
-      api_base: "http://127.0.0.1:8003"
-```
## Configure DB Pool Limits + Connection Timeouts
```yaml


@@ -124,7 +124,6 @@ class CustomLogger: # https://docs.litellm.ai/docs/observability/custom_callbac
                start_time,
                end_time,
            )
-            print_verbose(f"Custom Logger - final response object: {response_obj}")
        except:
            # traceback.print_exc()
            print_verbose(f"Custom Logger Error - {traceback.format_exc()}")
@@ -142,7 +141,6 @@ class CustomLogger: # https://docs.litellm.ai/docs/observability/custom_callbac
                start_time,
                end_time,
            )
-            print_verbose(f"Custom Logger - final response object: {response_obj}")
        except:
            # traceback.print_exc()
            print_verbose(f"Custom Logger Error - {traceback.format_exc()}")


@@ -77,9 +77,9 @@ class AlephAlphaConfig:
    - `control_log_additive` (boolean; default value: true): Method of applying control to attention scores.
    """

-    maximum_tokens: Optional[
-        int
-    ] = litellm.max_tokens  # aleph alpha requires max tokens
+    maximum_tokens: Optional[int] = (
+        litellm.max_tokens
+    )  # aleph alpha requires max tokens
    minimum_tokens: Optional[int] = None
    echo: Optional[bool] = None
    temperature: Optional[int] = None
@@ -285,7 +285,10 @@ def completion(
        ## CALCULATING USAGE - baseten charges on time, not tokens - have some mapping of cost here.
        prompt_tokens = len(encoding.encode(prompt))
        completion_tokens = len(
-            encoding.encode(model_response["choices"][0]["message"]["content"])
+            encoding.encode(
+                model_response["choices"][0]["message"]["content"],
+                disallowed_special=(),
+            )
        )
        model_response["created"] = int(time.time())


@@ -575,6 +575,7 @@ def anthropic_messages_pt(messages: list):
        if messages[i]["role"] == "assistant":
            last_assistant_message_idx = i
+    new_messages.append(messages[-1])
    if last_assistant_message_idx is not None:
        new_messages[last_assistant_message_idx]["content"] = new_messages[
            last_assistant_message_idx


@@ -1067,20 +1067,22 @@ async def update_database(
                    )
                data_list.append(existing_spend_obj)
-                # Update the cost column for the given user id
-                if prisma_client is not None:
-                    await prisma_client.update_data(
-                        data_list=data_list,
-                        query_type="update_many",
-                        table_name="user",
-                    )
-                elif custom_db_client is not None and user_id is not None:
+                if custom_db_client is not None and user_id is not None:
                    new_spend = data_list[0].spend
                    await custom_db_client.update_data(
                        key=user_id, value={"spend": new_spend}, table_name="user"
                    )
+                # Update the cost column for the given user id
+                if prisma_client is not None:
+                    await prisma_client.update_data(
+                        data_list=data_list,
+                        query_type="update_many",
+                        table_name="user",
+                    )
            except Exception as e:
-                verbose_proxy_logger.info(f"Update User DB call failed to execute")
+                verbose_proxy_logger.info(
+                    f"Update User DB call failed to execute {str(e)}"
+                )

        ### UPDATE KEY SPEND ###
        async def _update_key_db():
@@ -1215,7 +1217,9 @@ async def update_database(
                    await custom_db_client.insert_data(payload, table_name="spend")
            except Exception as e:
-                verbose_proxy_logger.info(f"Update Spend Logs DB failed to execute")
+                verbose_proxy_logger.info(
+                    f"Update Spend Logs DB failed to execute - {str(e)}"
+                )
        ### UPDATE KEY SPEND ###
        async def _update_team_db():
@@ -1286,7 +1290,9 @@ async def update_database(
                valid_token.spend = new_spend
                user_api_key_cache.set_cache(key=token, value=valid_token)
            except Exception as e:
-                verbose_proxy_logger.info(f"Update Team DB failed to execute")
+                verbose_proxy_logger.info(
+                    f"Update Team DB failed to execute - {str(e)}"
+                )
        asyncio.create_task(_update_user_db())
        asyncio.create_task(_update_key_db())
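Including `str(e)` in these log lines matters because the `_update_*_db` coroutines run via `asyncio.create_task` with nothing awaiting them, so a swallowed exception is otherwise invisible. A small self-contained sketch of that pattern (the function name is a hypothetical stand-in for the proxy's helpers):

```python
import asyncio

async def _update_user_db() -> None:
    # Hypothetical stand-in for one of the proxy's DB writers.
    try:
        raise RuntimeError("db unreachable")
    except Exception as e:
        # No caller awaits this task, so the log line must carry the
        # detail; a bare "failed to execute" message would hide the cause.
        print(f"Update User DB call failed to execute {str(e)}")

async def main() -> None:
    # Fire-and-forget: the request handler returns without awaiting the write.
    task = asyncio.create_task(_update_user_db())
    await task  # awaited here only so the demo exits after the task runs

asyncio.run(main())
```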


@@ -64,7 +64,7 @@ class ProxyLogging:
        litellm.callbacks.append(self.max_parallel_request_limiter)
        litellm.callbacks.append(self.max_budget_limiter)
        litellm.callbacks.append(self.cache_control_check)
-        # litellm.callbacks.append(self.response_taking_too_long_callback)
+        litellm.success_callback.append(self.response_taking_too_long_callback)
        for callback in litellm.callbacks:
            if callback not in litellm.input_callback:
                litellm.input_callback.append(callback)


@@ -82,6 +82,23 @@ def test_completion_claude():
# test_completion_claude()
+
+def test_completion_claude_3_empty_response():
+    messages = [
+        {
+            "role": "system",
+            "content": "You are 2twNLGfqk4GMOn3ffp4p.",
+        },
+        {"role": "user", "content": "Hi gm!"},
+        {"role": "assistant", "content": "Good morning! How are you doing today?"},
+        {
+            "role": "user",
+            "content": "I was hoping we could chat a bit",
+        },
+    ]
+    response = litellm.completion(model="claude-3-opus-20240229", messages=messages)
+    print(response)
+
def test_completion_claude_3():
    litellm.set_verbose = True
    messages = [{"role": "user", "content": "Hello, world"}]


@@ -225,9 +225,28 @@ class ChatCompletionDeltaToolCall(OpenAIObject):
class ChatCompletionMessageToolCall(OpenAIObject):
-    id: str
-    function: Function
-    type: str
+    def __init__(
+        self,
+        function: Union[Dict, Function],
+        id: Optional[str] = None,
+        type: Optional[str] = None,
+        **params,
+    ):
+        super(ChatCompletionMessageToolCall, self).__init__(**params)
+        if isinstance(function, Dict):
+            self.function = Function(**function)
+        else:
+            self.function = function
+
+        if id is not None:
+            self.id = id
+        else:
+            self.id = f"{uuid.uuid4()}"
+
+        if type is not None:
+            self.type = type
+        else:
+            self.type = "function"

class Message(OpenAIObject):
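The rewritten constructor accepts a plain dict for `function` and backfills `id` and `type`. A usage sketch, assuming the class is importable from `litellm.utils` and using a made-up tool name:

```python
from litellm.utils import ChatCompletionMessageToolCall

# A dict is coerced into a Function object; id and type are defaulted.
tool_call = ChatCompletionMessageToolCall(
    function={"name": "get_weather", "arguments": '{"city": "Paris"}'}
)
print(tool_call.type)           # "function"
print(tool_call.id)             # generated uuid4 string
print(tool_call.function.name)  # "get_weather"
```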
@@ -772,10 +791,10 @@ class ImageResponse(OpenAIObject):
############################################################
-def print_verbose(print_statement):
+def print_verbose(print_statement, logger_only: bool = False):
    try:
        verbose_logger.debug(print_statement)
-        if litellm.set_verbose:
+        if litellm.set_verbose == True and logger_only == False:
            print(print_statement)  # noqa
    except:
        pass
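Hypothetical call sites illustrating the new flag: with `logger_only=True` a message still reaches `verbose_logger.debug` but is never echoed via `print`, even when `litellm.set_verbose` is on.

```python
# Logged, and also printed when litellm.set_verbose is True:
print_verbose("making request")
# Logged only; never printed to stdout:
print_verbose("verbose internals", logger_only=True)
```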
@@ -1738,9 +1757,10 @@ class Logging:
                            end_time=end_time,
                        )
                    if callable(callback):  # custom logger functions
-                        print_verbose(
-                            f"Making async function logging call for {callback}, result={result} - {self.model_call_details}"
-                        )
+                        # print_verbose(
+                        #     f"Making async function logging call for {callback}, result={result} - {self.model_call_details}",
+                        #     logger_only=True,
+                        # )
                        if self.stream:
                            if (
                                "async_complete_streaming_response"
@@ -6231,7 +6251,7 @@ def convert_to_model_response_object(
        return model_response_object
    except Exception as e:
-        raise Exception(f"Invalid response object {e}")
+        raise Exception(f"Invalid response object {traceback.format_exc()}")
def acreate(*args, **kwargs): ## Thin client to handle the acreate langchain call def acreate(*args, **kwargs): ## Thin client to handle the acreate langchain call


@@ -40,8 +40,8 @@ litellm_settings:
  budget_duration: 30d
general_settings:
  master_key: sk-1234 # [OPTIONAL] Only use this if you to require all calls to contain this key (Authorization: Bearer sk-1234)
-  proxy_budget_rescheduler_min_time: 3
-  proxy_budget_rescheduler_max_time: 6
+  proxy_budget_rescheduler_min_time: 10
+  proxy_budget_rescheduler_max_time: 12
  # database_url: "postgresql://<user>:<password>@<host>:<port>/<dbname>" # [OPTIONAL] use for token-based auth to proxy
environment_variables:


@@ -1,6 +1,6 @@
[tool.poetry]
name = "litellm"
-version = "1.29.2"
+version = "1.29.4"
description = "Library to easily interface with LLM API providers"
authors = ["BerriAI"]
license = "MIT"
@@ -74,7 +74,7 @@ requires = ["poetry-core", "wheel"]
build-backend = "poetry.core.masonry.api"
[tool.commitizen]
-version = "1.29.2"
+version = "1.29.4"
version_files = [
    "pyproject.toml:^version"
]


@@ -469,7 +469,7 @@ async def test_key_with_budgets():
            break
        except:
            i + 1
-            await asyncio.sleep(5)
+            await asyncio.sleep(10)
    assert reset_at_init_value != reset_at_new_value