forked from phoenix/litellm-mirror
Merge branch 'main' into litellm_fixes_proxy_db
This commit is contained in:
commit 03fa322b38
11 changed files with 117 additions and 57 deletions

@@ -202,7 +202,7 @@ print(response)
</Tabs>

-## Save Model-specific params (API Base, API Keys, Temperature, Max Tokens, Seed, Organization, Headers etc.)
+## Save Model-specific params (API Base, Keys, Temperature, Max Tokens, Organization, Headers etc.)

You can use the config to save model-specific information like api_base, api_key, temperature, max_tokens, etc.

[**All input params**](https://docs.litellm.ai/docs/completion/input#input-params-1)

@@ -244,6 +244,45 @@ $ litellm --config /path/to/config.yaml
```

+## Load Balancing
+
+Use this to call multiple instances of the same model and configure things like [routing strategy](../routing.md#advanced).
+
+```yaml
+router_settings:
+  routing_strategy: "latency-based-routing" # routes to the fastest deployment in the group
+
+model_list:
+  - model_name: zephyr-beta
+    litellm_params:
+      model: huggingface/HuggingFaceH4/zephyr-7b-beta
+      api_base: http://0.0.0.0:8001
+  - model_name: zephyr-beta
+    litellm_params:
+      model: huggingface/HuggingFaceH4/zephyr-7b-beta
+      api_base: http://0.0.0.0:8002
+  - model_name: zephyr-beta
+    litellm_params:
+      model: huggingface/HuggingFaceH4/zephyr-7b-beta
+      api_base: http://0.0.0.0:8003
+  - model_name: gpt-3.5-turbo
+    litellm_params:
+      model: gpt-3.5-turbo
+      api_key: <my-openai-key>
+  - model_name: gpt-3.5-turbo-16k
+    litellm_params:
+      model: gpt-3.5-turbo-16k
+      api_key: <my-openai-key>
+
+litellm_settings:
+  num_retries: 3 # retry each model_name up to 3 times (e.g. zephyr-beta)
+  request_timeout: 10 # raise a Timeout error if a call takes longer than 10s; sets litellm.request_timeout
+  fallbacks: [{"zephyr-beta": ["gpt-3.5-turbo"]}] # fall back to gpt-3.5-turbo if the call still fails after num_retries
+  context_window_fallbacks: [{"zephyr-beta": ["gpt-3.5-turbo-16k"]}, {"gpt-3.5-turbo": ["gpt-3.5-turbo-16k"]}] # fall back to gpt-3.5-turbo-16k on context window errors
+  allowed_fails: 3 # cooldown a deployment if it fails more than 3 calls in a minute
+```

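For a sense of how the config above behaves from the client side, here is a minimal sketch. It assumes the proxy is running locally on port 8000 (as in the curl examples later in this doc) and uses the OpenAI v1 Python client:

```python
# Minimal sketch, not part of the diff: calling a proxy started with the
# config above. Requests to "zephyr-beta" are load-balanced across the three
# Hugging Face deployments; after num_retries failures the router falls back
# to gpt-3.5-turbo per the `fallbacks` setting.
import openai

client = openai.OpenAI(api_key="anything", base_url="http://0.0.0.0:8000")

response = client.chat.completions.create(
    model="zephyr-beta",
    messages=[{"role": "user", "content": "Hello!"}],
)
print(response.choices[0].message.content)
```
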
## Set Azure `base_model` for cost tracking

**Problem**: Azure returns `gpt-4` in the response when `azure/gpt-4-1106-preview` is used. This leads to inaccurate cost tracking.

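A rough illustration of the discrepancy, sketched with litellm's pricing lookup (assuming `completion_cost` accepts the `model`/`prompt`/`completion` form):

```python
# Sketch only: gpt-4 and gpt-4-1106-preview carry different per-token prices,
# so trusting the model name Azure echoes back would mis-cost the call.
import litellm

cost_reported = litellm.completion_cost(model="gpt-4", prompt="Hey!", completion="Hi!")
cost_actual = litellm.completion_cost(model="gpt-4-1106-preview", prompt="Hey!", completion="Hi!")
print(cost_reported, cost_actual)  # the two values differ
```
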
@@ -512,30 +551,6 @@ curl --location 'http://0.0.0.0:8000/chat/completions' \
```

-## Router Settings
-
-Use this to configure things like routing strategy.
-
-```yaml
-router_settings:
-  routing_strategy: "least-busy"
-
-model_list: # will route requests to the least busy ollama model
-  - model_name: ollama-models
-    litellm_params:
-      model: "ollama/mistral"
-      api_base: "http://127.0.0.1:8001"
-  - model_name: ollama-models
-    litellm_params:
-      model: "ollama/codellama"
-      api_base: "http://127.0.0.1:8002"
-  - model_name: ollama-models
-    litellm_params:
-      model: "ollama/llama2"
-      api_base: "http://127.0.0.1:8003"
-```

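For reference, the least-busy setup removed here maps directly onto litellm's Python `Router`; a sketch, assuming `Router` accepts `routing_strategy="least-busy"`:

```python
# Sketch: programmatic twin of the removed YAML. Requests to "ollama-models"
# go to whichever deployment currently has the fewest in-flight calls.
from litellm import Router

router = Router(
    model_list=[
        {"model_name": "ollama-models",
         "litellm_params": {"model": "ollama/mistral", "api_base": "http://127.0.0.1:8001"}},
        {"model_name": "ollama-models",
         "litellm_params": {"model": "ollama/codellama", "api_base": "http://127.0.0.1:8002"}},
        {"model_name": "ollama-models",
         "litellm_params": {"model": "ollama/llama2", "api_base": "http://127.0.0.1:8003"}},
    ],
    routing_strategy="least-busy",
)

response = router.completion(
    model="ollama-models",
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
```
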
## Configure DB Pool Limits + Connection Timeouts

```yaml

@@ -124,7 +124,6 @@ class CustomLogger: # https://docs.litellm.ai/docs/observability/custom_callbac
                start_time,
                end_time,
            )
            print_verbose(f"Custom Logger - final response object: {response_obj}")
        except:
            # traceback.print_exc()
            print_verbose(f"Custom Logger Error - {traceback.format_exc()}")

@@ -142,7 +141,6 @@ class CustomLogger: # https://docs.litellm.ai/docs/observability/custom_callbac
                start_time,
                end_time,
            )
            print_verbose(f"Custom Logger - final response object: {response_obj}")
        except:
            # traceback.print_exc()
            print_verbose(f"Custom Logger Error - {traceback.format_exc()}")

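The snippets above come from litellm's `CustomLogger` callback interface. A minimal working subclass, sketched from the docs page linked in the hunk header:

```python
# Sketch of a custom callback handler, based on
# https://docs.litellm.ai/docs/observability/custom_callback
import litellm
from litellm.integrations.custom_logger import CustomLogger


class MyHandler(CustomLogger):
    def log_success_event(self, kwargs, response_obj, start_time, end_time):
        print(f"call succeeded in {end_time - start_time}")

    def log_failure_event(self, kwargs, response_obj, start_time, end_time):
        print("call failed")


litellm.callbacks = [MyHandler()]
response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hi"}],
)
```
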
@@ -77,9 +77,9 @@ class AlephAlphaConfig:
    - `control_log_additive` (boolean; default value: true): Method of applying control to attention scores.
    """

-    maximum_tokens: Optional[
-        int
-    ] = litellm.max_tokens  # aleph alpha requires max tokens
+    maximum_tokens: Optional[int] = (
+        litellm.max_tokens
+    )  # aleph alpha requires max tokens
    minimum_tokens: Optional[int] = None
    echo: Optional[bool] = None
    temperature: Optional[int] = None

@@ -285,7 +285,10 @@ def completion(
        ## CALCULATING USAGE - baseten charges on time, not tokens - have some mapping of cost here.
        prompt_tokens = len(encoding.encode(prompt))
        completion_tokens = len(
-            encoding.encode(model_response["choices"][0]["message"]["content"])
+            encoding.encode(
+                model_response["choices"][0]["message"]["content"],
+                disallowed_special=(),
+            )
        )

        model_response["created"] = int(time.time())

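The added `disallowed_special=()` matters when model output happens to contain tiktoken special-token text such as `<|endoftext|>`: by default `encode` raises a `ValueError` on those. A standalone sketch with tiktoken:

```python
import tiktoken

enc = tiktoken.get_encoding("cl100k_base")
text = "model output containing <|endoftext|> mid-string"

# enc.encode(text)  # raises ValueError: special tokens are disallowed by default
tokens = enc.encode(text, disallowed_special=())  # treat them as plain text
print(len(tokens))
```
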
@@ -575,6 +575,7 @@ def anthropic_messages_pt(messages: list):
        if messages[i]["role"] == "assistant":
            last_assistant_message_idx = i

    new_messages.append(messages[-1])
    if last_assistant_message_idx is not None:
        new_messages[last_assistant_message_idx]["content"] = new_messages[
            last_assistant_message_idx

@@ -1067,6 +1067,11 @@ async def update_database(
                    )
                    data_list.append(existing_spend_obj)

+                if custom_db_client is not None and user_id is not None:
+                    new_spend = data_list[0].spend
+                    await custom_db_client.update_data(
+                        key=user_id, value={"spend": new_spend}, table_name="user"
+                    )
                # Update the cost column for the given user id
                if prisma_client is not None:
                    await prisma_client.update_data(

@@ -1074,13 +1079,10 @@
                        query_type="update_many",
                        table_name="user",
                    )
-                elif custom_db_client is not None and user_id is not None:
-                    new_spend = data_list[0].spend
-                    await custom_db_client.update_data(
-                        key=user_id, value={"spend": new_spend}, table_name="user"
-                    )
            except Exception as e:
-                verbose_proxy_logger.info(f"Update User DB call failed to execute")
+                verbose_proxy_logger.info(
+                    f"Update User DB call failed to execute {str(e)}"
+                )

        ### UPDATE KEY SPEND ###
        async def _update_key_db():

@@ -1215,7 +1217,9 @@
                await custom_db_client.insert_data(payload, table_name="spend")

            except Exception as e:
-                verbose_proxy_logger.info(f"Update Spend Logs DB failed to execute")
+                verbose_proxy_logger.info(
+                    f"Update Spend Logs DB failed to execute - {str(e)}"
+                )

        ### UPDATE KEY SPEND ###
        async def _update_team_db():

@@ -1286,7 +1290,9 @@
                    valid_token.spend = new_spend
                    user_api_key_cache.set_cache(key=token, value=valid_token)
            except Exception as e:
-                verbose_proxy_logger.info(f"Update Team DB failed to execute")
+                verbose_proxy_logger.info(
+                    f"Update Team DB failed to execute - {str(e)}"
+                )

        asyncio.create_task(_update_user_db())
        asyncio.create_task(_update_key_db())

@@ -64,7 +64,7 @@ class ProxyLogging:
        litellm.callbacks.append(self.max_parallel_request_limiter)
        litellm.callbacks.append(self.max_budget_limiter)
        litellm.callbacks.append(self.cache_control_check)
-        # litellm.callbacks.append(self.response_taking_too_long_callback)
+        litellm.success_callback.append(self.response_taking_too_long_callback)
        for callback in litellm.callbacks:
            if callback not in litellm.input_callback:
                litellm.input_callback.append(callback)

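Context for the one-line swap above: `litellm.callbacks` holds `CustomLogger` objects, while `litellm.success_callback` also accepts plain functions that run after each successful call. A toy illustration of the latter (the handler name is hypothetical):

```python
import litellm


def track_latency(kwargs, completion_response, start_time, end_time):
    # runs after every successful litellm call
    print("latency:", end_time - start_time)


litellm.success_callback = [track_latency]
```
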
@@ -82,6 +82,23 @@ def test_completion_claude():
# test_completion_claude()


+def test_completion_claude_3_empty_response():
+    messages = [
+        {
+            "role": "system",
+            "content": "You are 2twNLGfqk4GMOn3ffp4p.",
+        },
+        {"role": "user", "content": "Hi gm!"},
+        {"role": "assistant", "content": "Good morning! How are you doing today?"},
+        {
+            "role": "user",
+            "content": "I was hoping we could chat a bit",
+        },
+    ]
+    response = litellm.completion(model="claude-3-opus-20240229", messages=messages)
+    print(response)
+
+
def test_completion_claude_3():
    litellm.set_verbose = True
    messages = [{"role": "user", "content": "Hello, world"}]

@@ -225,9 +225,28 @@ class ChatCompletionDeltaToolCall(OpenAIObject):


class ChatCompletionMessageToolCall(OpenAIObject):
    id: str
    function: Function
    type: str
+
+    def __init__(
+        self,
+        function: Union[Dict, Function],
+        id: Optional[str] = None,
+        type: Optional[str] = None,
+        **params,
+    ):
+        super(ChatCompletionMessageToolCall, self).__init__(**params)
+        if isinstance(function, Dict):
+            self.function = Function(**function)
+        else:
+            self.function = function
+
+        if id is not None:
+            self.id = id
+        else:
+            self.id = f"{uuid.uuid4()}"
+
+        if type is not None:
+            self.type = type
+        else:
+            self.type = "function"


class Message(OpenAIObject):

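A quick sketch of what the constructor added above enables, assuming the class is importable from `litellm.utils`: a tool call can now be built from a plain dict, with `id` and `type` auto-filled:

```python
# Sketch: `function` may be a dict; `id` defaults to a fresh uuid and
# `type` to "function" when omitted.
from litellm.utils import ChatCompletionMessageToolCall

tool_call = ChatCompletionMessageToolCall(
    function={"name": "get_weather", "arguments": '{"city": "SF"}'}
)
print(tool_call.id, tool_call.type)  # e.g. a uuid string and "function"
```
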
@@ -772,10 +791,10 @@ class ImageResponse(OpenAIObject):


############################################################
-def print_verbose(print_statement):
+def print_verbose(print_statement, logger_only: bool = False):
    try:
        verbose_logger.debug(print_statement)
-        if litellm.set_verbose:
+        if litellm.set_verbose == True and logger_only == False:
            print(print_statement)  # noqa
    except:
        pass

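With the new `logger_only` flag, a caller can push a message to the debug logger without echoing it to stdout even when `set_verbose` is on:

```python
print_verbose("noisy per-chunk detail", logger_only=True)  # debug log only, never printed
```
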
@@ -1738,9 +1757,10 @@ class Logging:
                            end_time=end_time,
                        )
                    if callable(callback):  # custom logger functions
-                        print_verbose(
-                            f"Making async function logging call for {callback}, result={result} - {self.model_call_details}"
-                        )
+                        # print_verbose(
+                        #     f"Making async function logging call for {callback}, result={result} - {self.model_call_details}",
+                        #     logger_only=True,
+                        # )
                        if self.stream:
                            if (
                                "async_complete_streaming_response"

@@ -6231,7 +6251,7 @@ def convert_to_model_response_object(

        return model_response_object
    except Exception as e:
-        raise Exception(f"Invalid response object {e}")
+        raise Exception(f"Invalid response object {traceback.format_exc()}")


def acreate(*args, **kwargs):  ## Thin client to handle the acreate langchain call

@@ -40,8 +40,8 @@ litellm_settings:
  budget_duration: 30d
general_settings:
  master_key: sk-1234 # [OPTIONAL] Only use this if you want to require all calls to contain this key (Authorization: Bearer sk-1234)
-  proxy_budget_rescheduler_min_time: 3
-  proxy_budget_rescheduler_max_time: 6
+  proxy_budget_rescheduler_min_time: 10
+  proxy_budget_rescheduler_max_time: 12
  # database_url: "postgresql://<user>:<password>@<host>:<port>/<dbname>" # [OPTIONAL] use for token-based auth to proxy

environment_variables:

@@ -1,6 +1,6 @@
[tool.poetry]
name = "litellm"
-version = "1.29.2"
+version = "1.29.4"
description = "Library to easily interface with LLM API providers"
authors = ["BerriAI"]
license = "MIT"

@@ -74,7 +74,7 @@ requires = ["poetry-core", "wheel"]
build-backend = "poetry.core.masonry.api"

[tool.commitizen]
-version = "1.29.2"
+version = "1.29.4"
version_files = [
    "pyproject.toml:^version"
]

@@ -469,7 +469,7 @@ async def test_key_with_budgets():
            break
        except:
            i + 1
-            await asyncio.sleep(5)
+            await asyncio.sleep(10)
    assert reset_at_init_value != reset_at_new_value