forked from phoenix/litellm-mirror

Merge branch 'main' into litellm_fixes_proxy_db

commit 03fa322b38

11 changed files with 117 additions and 57 deletions
@@ -202,7 +202,7 @@ print(response)
 </Tabs>
 
 
-## Save Model-specific params (API Base, API Keys, Temperature, Max Tokens, Seed, Organization, Headers etc.)
+## Save Model-specific params (API Base, Keys, Temperature, Max Tokens, Organization, Headers etc.)
 
 You can use the config to save model-specific information like api_base, api_key, temperature, max_tokens, etc.
 
 [**All input params**](https://docs.litellm.ai/docs/completion/input#input-params-1)
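
The renamed section above documents per-model settings in the proxy `config.yaml`. For orientation, a minimal sketch of such an entry; the values below are placeholders for illustration, not taken from this commit:

```yaml
model_list:
  - model_name: gpt-4-team1          # alias callers send as "model"
    litellm_params:                  # forwarded to litellm.completion()
      model: azure/gpt-4
      api_base: https://my-endpoint.openai.azure.com/
      api_key: <my-azure-api-key>
      temperature: 0.2
      max_tokens: 512
```
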
@@ -244,6 +244,45 @@ $ litellm --config /path/to/config.yaml
 ```
 
 
+## Load Balancing
+
+Use this to call multiple instances of the same model and configure things like [routing strategy](../routing.md#advanced).
+
+```yaml
+router_settings:
+  routing_strategy: "latency-based-routing" # routes to the fastest deployment in the group
+
+model_list:
+  - model_name: zephyr-beta
+    litellm_params:
+      model: huggingface/HuggingFaceH4/zephyr-7b-beta
+      api_base: http://0.0.0.0:8001
+  - model_name: zephyr-beta
+    litellm_params:
+      model: huggingface/HuggingFaceH4/zephyr-7b-beta
+      api_base: http://0.0.0.0:8002
+  - model_name: zephyr-beta
+    litellm_params:
+      model: huggingface/HuggingFaceH4/zephyr-7b-beta
+      api_base: http://0.0.0.0:8003
+  - model_name: gpt-3.5-turbo
+    litellm_params:
+      model: gpt-3.5-turbo
+      api_key: <my-openai-key>
+  - model_name: gpt-3.5-turbo-16k
+    litellm_params:
+      model: gpt-3.5-turbo-16k
+      api_key: <my-openai-key>
+
+litellm_settings:
+  num_retries: 3 # retry call 3 times on each model_name (e.g. zephyr-beta)
+  request_timeout: 10 # raise Timeout error if call takes longer than 10s. Sets litellm.request_timeout
+  fallbacks: [{"zephyr-beta": ["gpt-3.5-turbo"]}] # fallback to gpt-3.5-turbo if call fails num_retries
+  context_window_fallbacks: [{"zephyr-beta": ["gpt-3.5-turbo-16k"]}, {"gpt-3.5-turbo": ["gpt-3.5-turbo-16k"]}] # fallback to gpt-3.5-turbo-16k if context window error
+  allowed_fails: 3 # cooldown model if it fails > 1 call in a minute.
+```
+
+
 ## Set Azure `base_model` for cost tracking
 
 **Problem**: Azure returns `gpt-4` in the response when `azure/gpt-4-1106-preview` is used. This leads to inaccurate cost tracking
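
With the proxy started from the load-balancing config added above, callers address the whole group by its `model_name`. A minimal sketch using the OpenAI Python client; the proxy URL, port, and key are assumptions, not part of this commit:

```python
# Sketch: call the load-balanced "zephyr-beta" group through the LiteLLM proxy.
# Assumes the proxy runs locally on port 8000 and does not enforce a real key.
import openai

client = openai.OpenAI(api_key="anything", base_url="http://0.0.0.0:8000")

# The router picks a deployment per routing_strategy; if the call still fails
# after num_retries, the request falls back to gpt-3.5-turbo per `fallbacks`.
response = client.chat.completions.create(
    model="zephyr-beta",
    messages=[{"role": "user", "content": "Hello!"}],
)
print(response.choices[0].message.content)
```
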
@@ -512,30 +551,6 @@ curl --location 'http://0.0.0.0:8000/chat/completions' \
 ```
 
 
-## Router Settings
-
-Use this to configure things like routing strategy.
-
-```yaml
-router_settings:
-  routing_strategy: "least-busy"
-
-model_list: # will route requests to the least busy ollama model
-  - model_name: ollama-models
-    litellm_params:
-      model: "ollama/mistral"
-      api_base: "http://127.0.0.1:8001"
-  - model_name: ollama-models
-    litellm_params:
-      model: "ollama/codellama"
-      api_base: "http://127.0.0.1:8002"
-  - model_name: ollama-models
-    litellm_params:
-      model: "ollama/llama2"
-      api_base: "http://127.0.0.1:8003"
-```
-
-
 ## Configure DB Pool Limits + Connection Timeouts
 
 ```yaml
@@ -124,7 +124,6 @@ class CustomLogger: # https://docs.litellm.ai/docs/observability/custom_callbac
                 start_time,
                 end_time,
             )
-            print_verbose(f"Custom Logger - final response object: {response_obj}")
         except:
             # traceback.print_exc()
             print_verbose(f"Custom Logger Error - {traceback.format_exc()}")
@@ -142,7 +141,6 @@ class CustomLogger: # https://docs.litellm.ai/docs/observability/custom_callbac
                 start_time,
                 end_time,
             )
-            print_verbose(f"Custom Logger - final response object: {response_obj}")
         except:
             # traceback.print_exc()
             print_verbose(f"Custom Logger Error - {traceback.format_exc()}")
@@ -77,9 +77,9 @@ class AlephAlphaConfig:
     - `control_log_additive` (boolean; default value: true): Method of applying control to attention scores.
     """
 
-    maximum_tokens: Optional[
-        int
-    ] = litellm.max_tokens # aleph alpha requires max tokens
+    maximum_tokens: Optional[int] = (
+        litellm.max_tokens
+    ) # aleph alpha requires max tokens
     minimum_tokens: Optional[int] = None
     echo: Optional[bool] = None
     temperature: Optional[int] = None
@@ -285,7 +285,10 @@ def completion(
         ## CALCULATING USAGE - baseten charges on time, not tokens - have some mapping of cost here.
         prompt_tokens = len(encoding.encode(prompt))
         completion_tokens = len(
-            encoding.encode(model_response["choices"][0]["message"]["content"])
+            encoding.encode(
+                model_response["choices"][0]["message"]["content"],
+                disallowed_special=(),
+            )
         )
 
         model_response["created"] = int(time.time())
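
The `disallowed_special=()` argument added above matters because tiktoken's `encode` rejects text containing special-token markers by default, which can crash token counting on model output that happens to include them. A standalone sketch of the difference, assuming the `tiktoken` package; the sample text is illustrative:

```python
import tiktoken

encoding = tiktoken.get_encoding("cl100k_base")
text = "model output that happens to contain <|endoftext|> verbatim"

# encoding.encode(text)  # would raise ValueError: special token not allowed
tokens = encoding.encode(text, disallowed_special=())  # treat markers as plain text
print(len(tokens))
```
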
@@ -575,6 +575,7 @@ def anthropic_messages_pt(messages: list):
         if messages[i]["role"] == "assistant":
             last_assistant_message_idx = i
 
+    new_messages.append(messages[-1])
     if last_assistant_message_idx is not None:
         new_messages[last_assistant_message_idx]["content"] = new_messages[
             last_assistant_message_idx
@@ -1067,20 +1067,22 @@ async def update_database(
                     )
                 data_list.append(existing_spend_obj)
 
-            # Update the cost column for the given user id
-            if prisma_client is not None:
-                await prisma_client.update_data(
-                    data_list=data_list,
-                    query_type="update_many",
-                    table_name="user",
-                )
-            elif custom_db_client is not None and user_id is not None:
+            if custom_db_client is not None and user_id is not None:
                 new_spend = data_list[0].spend
                 await custom_db_client.update_data(
                     key=user_id, value={"spend": new_spend}, table_name="user"
                 )
+            # Update the cost column for the given user id
+            if prisma_client is not None:
+                await prisma_client.update_data(
+                    data_list=data_list,
+                    query_type="update_many",
+                    table_name="user",
+                )
         except Exception as e:
-            verbose_proxy_logger.info(f"Update User DB call failed to execute")
+            verbose_proxy_logger.info(
+                f"Update User DB call failed to execute {str(e)}"
+            )
 
     ### UPDATE KEY SPEND ###
     async def _update_key_db():
@@ -1215,7 +1217,9 @@ async def update_database(
                 await custom_db_client.insert_data(payload, table_name="spend")
 
         except Exception as e:
-            verbose_proxy_logger.info(f"Update Spend Logs DB failed to execute")
+            verbose_proxy_logger.info(
+                f"Update Spend Logs DB failed to execute - {str(e)}"
+            )
 
     ### UPDATE KEY SPEND ###
     async def _update_team_db():
@@ -1286,7 +1290,9 @@ async def update_database(
                 valid_token.spend = new_spend
                 user_api_key_cache.set_cache(key=token, value=valid_token)
         except Exception as e:
-            verbose_proxy_logger.info(f"Update Team DB failed to execute")
+            verbose_proxy_logger.info(
+                f"Update Team DB failed to execute - {str(e)}"
+            )
 
     asyncio.create_task(_update_user_db())
     asyncio.create_task(_update_key_db())
@@ -64,7 +64,7 @@ class ProxyLogging:
         litellm.callbacks.append(self.max_parallel_request_limiter)
         litellm.callbacks.append(self.max_budget_limiter)
         litellm.callbacks.append(self.cache_control_check)
-        # litellm.callbacks.append(self.response_taking_too_long_callback)
+        litellm.success_callback.append(self.response_taking_too_long_callback)
         for callback in litellm.callbacks:
             if callback not in litellm.input_callback:
                 litellm.input_callback.append(callback)
@@ -82,6 +82,23 @@ def test_completion_claude():
 # test_completion_claude()
 
 
+def test_completion_claude_3_empty_response():
+    messages = [
+        {
+            "role": "system",
+            "content": "You are 2twNLGfqk4GMOn3ffp4p.",
+        },
+        {"role": "user", "content": "Hi gm!"},
+        {"role": "assistant", "content": "Good morning! How are you doing today?"},
+        {
+            "role": "user",
+            "content": "I was hoping we could chat a bit",
+        },
+    ]
+    response = litellm.completion(model="claude-3-opus-20240229", messages=messages)
+    print(response)
+
+
 def test_completion_claude_3():
     litellm.set_verbose = True
     messages = [{"role": "user", "content": "Hello, world"}]
@@ -225,9 +225,28 @@ class ChatCompletionDeltaToolCall(OpenAIObject):
 
 
 class ChatCompletionMessageToolCall(OpenAIObject):
-    id: str
-    function: Function
-    type: str
+    def __init__(
+        self,
+        function: Union[Dict, Function],
+        id: Optional[str] = None,
+        type: Optional[str] = None,
+        **params,
+    ):
+        super(ChatCompletionMessageToolCall, self).__init__(**params)
+        if isinstance(function, Dict):
+            self.function = Function(**function)
+        else:
+            self.function = function
+
+        if id is not None:
+            self.id = id
+        else:
+            self.id = f"{uuid.uuid4()}"
+
+        if type is not None:
+            self.type = type
+        else:
+            self.type = "function"
 
 
 class Message(OpenAIObject):
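
The constructor added to `ChatCompletionMessageToolCall` accepts a plain dict for `function` and fills in `id`/`type` when they are missing. A rough usage sketch; the import path and field values are assumptions for illustration:

```python
from litellm.utils import ChatCompletionMessageToolCall

# `function` may be a dict; id defaults to a fresh uuid, type to "function".
tool_call = ChatCompletionMessageToolCall(
    function={"name": "get_weather", "arguments": '{"city": "Paris"}'}
)
print(tool_call.id, tool_call.type, tool_call.function.name)
```
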
@@ -772,10 +791,10 @@ class ImageResponse(OpenAIObject):
 
 
 ############################################################
-def print_verbose(print_statement):
+def print_verbose(print_statement, logger_only: bool = False):
     try:
         verbose_logger.debug(print_statement)
-        if litellm.set_verbose:
+        if litellm.set_verbose == True and logger_only == False:
             print(print_statement) # noqa
     except:
         pass
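
The new `logger_only` flag lets a call site keep a message in the debug logger without also printing it when `litellm.set_verbose` is enabled, roughly:

```python
# Sketch of the intended call pattern (messages are illustrative).
print_verbose("router picked deployment 2")                    # logged; printed if set_verbose
print_verbose("full request payload: ...", logger_only=True)   # logged only, never printed
```
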
@@ -1738,9 +1757,10 @@ class Logging:
                             end_time=end_time,
                         )
                     if callable(callback):  # custom logger functions
-                        print_verbose(
-                            f"Making async function logging call for {callback}, result={result} - {self.model_call_details}"
-                        )
+                        # print_verbose(
+                        #     f"Making async function logging call for {callback}, result={result} - {self.model_call_details}",
+                        #     logger_only=True,
+                        # )
                         if self.stream:
                             if (
                                 "async_complete_streaming_response"
@@ -6231,7 +6251,7 @@ def convert_to_model_response_object(
 
         return model_response_object
     except Exception as e:
-        raise Exception(f"Invalid response object {e}")
+        raise Exception(f"Invalid response object {traceback.format_exc()}")
 
 
 def acreate(*args, **kwargs): ## Thin client to handle the acreate langchain call
@@ -40,8 +40,8 @@ litellm_settings:
   budget_duration: 30d
 general_settings:
   master_key: sk-1234 # [OPTIONAL] Only use this if you to require all calls to contain this key (Authorization: Bearer sk-1234)
-  proxy_budget_rescheduler_min_time: 3
-  proxy_budget_rescheduler_max_time: 6
+  proxy_budget_rescheduler_min_time: 10
+  proxy_budget_rescheduler_max_time: 12
   # database_url: "postgresql://<user>:<password>@<host>:<port>/<dbname>" # [OPTIONAL] use for token-based auth to proxy
 
 environment_variables:
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "litellm"
-version = "1.29.2"
+version = "1.29.4"
 description = "Library to easily interface with LLM API providers"
 authors = ["BerriAI"]
 license = "MIT"
@@ -74,7 +74,7 @@ requires = ["poetry-core", "wheel"]
 build-backend = "poetry.core.masonry.api"
 
 [tool.commitizen]
-version = "1.29.2"
+version = "1.29.4"
 version_files = [
     "pyproject.toml:^version"
 ]
@@ -469,7 +469,7 @@ async def test_key_with_budgets():
             break
         except:
             i + 1
-            await asyncio.sleep(5)
+            await asyncio.sleep(10)
     assert reset_at_init_value != reset_at_new_value
 
 