Merge branch 'BerriAI:main' into main
Commit de7fe98556: 27 changed files with 850 additions and 346 deletions

.github/pull_request_template.md (vendored, new file, 47 lines)
@@ -0,0 +1,47 @@
<!-- These are just examples. You can remove all items if you want. -->
<!-- Please remove all comments. -->

## Title

<!-- e.g. "Implement user authentication feature" -->

## Relevant issues

<!-- e.g. "Fixes #000" -->

## Type

<!-- Select the type of Pull Request -->
<!-- Keep only the necessary ones -->

🆕 New Feature
🐛 Bug Fix
🧹 Refactoring
📖 Documentation
💻 Development Environment
🚄 Infrastructure
✅ Test

## Changes

<!-- List of changes -->

## Testing

<!-- Test procedure -->

## Notes

<!-- Test results -->
<!-- Points to note for the reviewer, consultation content, concerns -->

## Pre-Submission Checklist (optional but appreciated):

- [ ] I have included relevant documentation updates (stored in /docs/my-website)

## OS Tests (optional but appreciated):

- [ ] Tested on Windows
- [ ] Tested on MacOS
- [ ] Tested on Linux
@@ -248,7 +248,7 @@ Step 2: Navigate into the project, and install dependencies:

```
cd litellm
-poetry install
+poetry install -E extra_proxy -E proxy
```

Step 3: Test your change:
@@ -1,7 +1,7 @@
# Completion Token Usage & Cost
By default LiteLLM returns token usage in all completion requests ([See here](https://litellm.readthedocs.io/en/latest/output/))

-However, we also expose 5 helper functions + **[NEW]** an API to calculate token usage across providers:
+However, we also expose some helper functions + **[NEW]** an API to calculate token usage across providers:

- `encode`: This encodes the text passed in, using the model-specific tokenizer. [**Jump to code**](#1-encode)

@@ -9,17 +9,19 @@ However, we also expose 5 helper functions + **[NEW]** an API to calculate token

- `token_counter`: This returns the number of tokens for a given input - it uses the tokenizer based on the model, and defaults to tiktoken if no model-specific tokenizer is available. [**Jump to code**](#3-token_counter)

-- `cost_per_token`: This returns the cost (in USD) for prompt (input) and completion (output) tokens. Uses the live list from `api.litellm.ai`. [**Jump to code**](#4-cost_per_token)
+- `create_pretrained_tokenizer` and `create_tokenizer`: LiteLLM provides default tokenizer support for OpenAI, Cohere, Anthropic, Llama2, and Llama3 models. If you are using a different model, you can create a custom tokenizer and pass it as `custom_tokenizer` to the `encode`, `decode`, and `token_counter` methods. [**Jump to code**](#4-create_pretrained_tokenizer-and-create_tokenizer)

-- `completion_cost`: This returns the overall cost (in USD) for a given LLM API Call. It combines `token_counter` and `cost_per_token` to return the cost for that query (counting both cost of input and output). [**Jump to code**](#5-completion_cost)
+- `cost_per_token`: This returns the cost (in USD) for prompt (input) and completion (output) tokens. Uses the live list from `api.litellm.ai`. [**Jump to code**](#5-cost_per_token)

-- `get_max_tokens`: This returns the maximum number of tokens allowed for the given model. [**Jump to code**](#6-get_max_tokens)
+- `completion_cost`: This returns the overall cost (in USD) for a given LLM API Call. It combines `token_counter` and `cost_per_token` to return the cost for that query (counting both cost of input and output). [**Jump to code**](#6-completion_cost)

-- `model_cost`: This returns a dictionary for all models, with their max_tokens, input_cost_per_token and output_cost_per_token. It uses the `api.litellm.ai` call shown below. [**Jump to code**](#7-model_cost)
+- `get_max_tokens`: This returns the maximum number of tokens allowed for the given model. [**Jump to code**](#7-get_max_tokens)

-- `register_model`: This registers new / overrides existing models (and their pricing details) in the model cost dictionary. [**Jump to code**](#8-register_model)
+- `model_cost`: This returns a dictionary for all models, with their max_tokens, input_cost_per_token and output_cost_per_token. It uses the `api.litellm.ai` call shown below. [**Jump to code**](#8-model_cost)

-- `api.litellm.ai`: Live token + price count across [all supported models](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json). [**Jump to code**](#9-apilitellmai)
+- `register_model`: This registers new / overrides existing models (and their pricing details) in the model cost dictionary. [**Jump to code**](#9-register_model)

+- `api.litellm.ai`: Live token + price count across [all supported models](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json). [**Jump to code**](#10-apilitellmai)

📣 This is a community maintained list. Contributions are welcome! ❤️
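The list above starts with `encode`, but this diff only shows the `token_counter` example further down. Here is a minimal sketch of the `encode`/`decode` pair as described in that list (an illustrative example, not part of the commit; the actual token ids depend on the tokenizer LiteLLM selects for the model):

```python
from litellm import encode, decode

# Sketch: tokenize a string with the model-specific tokenizer, then round-trip it.
sample_text = "Hey, how's it going"
tokens = encode(model="gpt-3.5-turbo", text=sample_text)
print(len(tokens))                                   # token count for the text
print(decode(model="gpt-3.5-turbo", tokens=tokens))  # should reproduce sample_text
```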
@@ -60,7 +62,24 @@ messages = [{"user": "role", "content": "Hey, how's it going"}]
print(token_counter(model="gpt-3.5-turbo", messages=messages))
```

-### 4. `cost_per_token`
+### 4. `create_pretrained_tokenizer` and `create_tokenizer`
+
+```python
+import json
+from litellm import create_pretrained_tokenizer, create_tokenizer
+
+# get tokenizer from huggingface repo
+custom_tokenizer_1 = create_pretrained_tokenizer("Xenova/llama-3-tokenizer")
+
+# use tokenizer from json file
+with open("tokenizer.json") as f:
+    json_data = json.load(f)
+    json_str = json.dumps(json_data)
+
+custom_tokenizer_2 = create_tokenizer(json_str)
+```
+
+### 5. `cost_per_token`

```python
from litellm import cost_per_token
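The new section above creates custom tokenizers; the helper list earlier says they can be passed as `custom_tokenizer` to `encode`, `decode`, and `token_counter`. A hedged sketch of that hand-off (not part of the commit; the keyword argument follows the wording of the docs, so treat the exact signature as an assumption):

```python
from litellm import create_pretrained_tokenizer, token_counter

# Sketch: count tokens with a tokenizer pulled from a Hugging Face repo
# instead of the model's built-in default.
custom_tokenizer = create_pretrained_tokenizer("Xenova/llama-3-tokenizer")
messages = [{"role": "user", "content": "Hey, how's it going"}]
print(token_counter(custom_tokenizer=custom_tokenizer, messages=messages))
```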
@ -72,7 +91,7 @@ prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar = cost_per_toke
|
||||||
print(prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar)
|
print(prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar)
|
||||||
```
|
```
|
||||||
|
|
||||||
### 5. `completion_cost`
|
### 6. `completion_cost`
|
||||||
|
|
||||||
* Input: Accepts a `litellm.completion()` response **OR** prompt + completion strings
|
* Input: Accepts a `litellm.completion()` response **OR** prompt + completion strings
|
||||||
* Output: Returns a `float` of cost for the `completion` call
|
* Output: Returns a `float` of cost for the `completion` call
|
||||||
|
@ -99,7 +118,7 @@ cost = completion_cost(model="bedrock/anthropic.claude-v2", prompt="Hey!", compl
|
||||||
formatted_string = f"${float(cost):.10f}"
|
formatted_string = f"${float(cost):.10f}"
|
||||||
print(formatted_string)
|
print(formatted_string)
|
||||||
```
|
```
|
||||||
### 6. `get_max_tokens`
|
### 7. `get_max_tokens`
|
||||||
|
|
||||||
Input: Accepts a model name - e.g., gpt-3.5-turbo (to get a complete list, call litellm.model_list).
|
Input: Accepts a model name - e.g., gpt-3.5-turbo (to get a complete list, call litellm.model_list).
|
||||||
Output: Returns the maximum number of tokens allowed for the given model
|
Output: Returns the maximum number of tokens allowed for the given model
|
||||||
|
@ -112,7 +131,7 @@ model = "gpt-3.5-turbo"
|
||||||
print(get_max_tokens(model)) # Output: 4097
|
print(get_max_tokens(model)) # Output: 4097
|
||||||
```
|
```
|
||||||
|
|
||||||
### 7. `model_cost`
|
### 8. `model_cost`
|
||||||
|
|
||||||
* Output: Returns a dict object containing the max_tokens, input_cost_per_token, output_cost_per_token for all models on [community-maintained list](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json)
|
* Output: Returns a dict object containing the max_tokens, input_cost_per_token, output_cost_per_token for all models on [community-maintained list](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json)
|
||||||
|
|
||||||
|
@ -122,7 +141,7 @@ from litellm import model_cost
|
||||||
print(model_cost) # {'gpt-3.5-turbo': {'max_tokens': 4000, 'input_cost_per_token': 1.5e-06, 'output_cost_per_token': 2e-06}, ...}
|
print(model_cost) # {'gpt-3.5-turbo': {'max_tokens': 4000, 'input_cost_per_token': 1.5e-06, 'output_cost_per_token': 2e-06}, ...}
|
||||||
```
|
```
|
||||||
|
|
||||||
### 8. `register_model`
|
### 9. `register_model`
|
||||||
|
|
||||||
* Input: Provide EITHER a model cost dictionary or a url to a hosted json blob
|
* Input: Provide EITHER a model cost dictionary or a url to a hosted json blob
|
||||||
* Output: Returns updated model_cost dictionary + updates litellm.model_cost with model details.
|
* Output: Returns updated model_cost dictionary + updates litellm.model_cost with model details.
|
||||||
|
@ -157,5 +176,3 @@ export LITELLM_LOCAL_MODEL_COST_MAP="True"
|
||||||
```
|
```
|
||||||
|
|
||||||
Note: this means you will need to upgrade to get updated pricing, and newer models.
|
Note: this means you will need to upgrade to get updated pricing, and newer models.
|
||||||
|
|
||||||
|
|
||||||
|
|
|
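The renumbered sections above end at `register_model` and the `LITELLM_LOCAL_MODEL_COST_MAP` note. As a quick illustration of the `register_model` behaviour described in the list (a sketch, not part of the commit; the model name and per-token prices are made up):

```python
import litellm

# Sketch: register/override one entry in the model cost dictionary.
litellm.register_model(
    {
        "my-custom-gpt": {  # hypothetical model name
            "max_tokens": 8192,
            "input_cost_per_token": 0.00003,
            "output_cost_per_token": 0.00006,
            "litellm_provider": "openai",
            "mode": "chat",
        }
    }
)

# The docs say register_model also updates litellm.model_cost in place.
print(litellm.model_cost["my-custom-gpt"]["input_cost_per_token"])
```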
@@ -1,4 +1,4 @@
-# Greenscale Tutorial
+# Greenscale - Track LLM Spend and Responsible Usage

[Greenscale](https://greenscale.ai/) is a production monitoring platform for your LLM-powered app that provides you granular key insights into your GenAI spending and responsible usage. Greenscale only captures metadata to minimize the exposure risk of personally identifiable information (PII).
@@ -178,6 +178,7 @@ const sidebars = {
        "observability/traceloop_integration",
        "observability/athina_integration",
        "observability/lunary_integration",
+       "observability/greenscale_integration",
        "observability/helicone_integration",
        "observability/supabase_integration",
        `observability/telemetry`,
litellm-js/spend-logs/package-lock.json (generated, 8 changes)

@@ -5,7 +5,7 @@
    "packages": {
        "": {
            "dependencies": {
-               "@hono/node-server": "^1.9.0",
+               "@hono/node-server": "^1.10.1",
                "hono": "^4.2.7"
            },
            "devDependencies": {

@@ -382,9 +382,9 @@
        }
    },
    "node_modules/@hono/node-server": {
-       "version": "1.9.0",
+       "version": "1.10.1",
-       "resolved": "https://registry.npmjs.org/@hono/node-server/-/node-server-1.9.0.tgz",
+       "resolved": "https://registry.npmjs.org/@hono/node-server/-/node-server-1.10.1.tgz",
-       "integrity": "sha512-oJjk7WXBlENeHhWiMqSyxPIZ3Kmf5ZYxqdlcSIXyN8Rn50bNJsPl99G4POBS03Jxh56FdfRJ0SEnC8mAVIiavQ==",
+       "integrity": "sha512-5BKW25JH5PQKPDkTcIgv3yNUPtOAbnnjFFgWvIxxAY/B/ZNeYjjWoAeDmqhIiCgOAJ3Tauuw+0G+VainhuZRYQ==",
        "engines": {
            "node": ">=18.14.1"
        }
@@ -3,7 +3,7 @@
        "dev": "tsx watch src/index.ts"
    },
    "dependencies": {
-       "@hono/node-server": "^1.9.0",
+       "@hono/node-server": "^1.10.1",
        "hono": "^4.2.7"
    },
    "devDependencies": {
@@ -542,7 +542,11 @@ models_by_provider: dict = {
    "together_ai": together_ai_models,
    "baseten": baseten_models,
    "openrouter": openrouter_models,
-   "vertex_ai": vertex_chat_models + vertex_text_models,
+   "vertex_ai": vertex_chat_models
+   + vertex_text_models
+   + vertex_anthropic_models
+   + vertex_vision_models
+   + vertex_language_models,
    "ai21": ai21_models,
    "bedrock": bedrock_models,
    "petals": petals_models,

@@ -609,6 +613,8 @@ from .utils import (
    get_optional_params,
    modify_integration,
    token_counter,
+   create_pretrained_tokenizer,
+   create_tokenizer,
    cost_per_token,
    completion_cost,
    supports_function_calling,
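The first hunk above folds the Vertex AI Anthropic, vision, and language model lists into the single `vertex_ai` entry of `models_by_provider`. A small sketch of what that lookup now returns (illustrative; the exact names and counts depend on the installed litellm version):

```python
import litellm

# Sketch: the provider -> model-name map groups every Vertex AI family
# under the one "vertex_ai" key after this change.
vertex_models = litellm.models_by_provider["vertex_ai"]
print(len(vertex_models))
print([m for m in vertex_models if "claude" in m][:3])  # Anthropic-on-Vertex entries
```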
@@ -38,7 +38,7 @@ class OpenMeterLogger(CustomLogger):
        in the environment
        """
        missing_keys = []
-       if litellm.get_secret("OPENMETER_API_KEY", None) is None:
+       if os.getenv("OPENMETER_API_KEY", None) is None:
            missing_keys.append("OPENMETER_API_KEY")

        if len(missing_keys) > 0:

@@ -71,15 +71,13 @@ class OpenMeterLogger(CustomLogger):
        }

    def log_success_event(self, kwargs, response_obj, start_time, end_time):
-       _url = litellm.get_secret(
-           "OPENMETER_API_ENDPOINT", default_value="https://openmeter.cloud"
-       )
+       _url = os.getenv("OPENMETER_API_ENDPOINT", "https://openmeter.cloud")
        if _url.endswith("/"):
            _url += "api/v1/events"
        else:
            _url += "/api/v1/events"

-       api_key = litellm.get_secret("OPENMETER_API_KEY")
+       api_key = os.getenv("OPENMETER_API_KEY")

        _data = self._common_logic(kwargs=kwargs, response_obj=response_obj)
        self.sync_http_handler.post(

@@ -92,15 +90,13 @@ class OpenMeterLogger(CustomLogger):
        )

    async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
-       _url = litellm.get_secret(
-           "OPENMETER_API_ENDPOINT", default_value="https://openmeter.cloud"
-       )
+       _url = os.getenv("OPENMETER_API_ENDPOINT", "https://openmeter.cloud")
        if _url.endswith("/"):
            _url += "api/v1/events"
        else:
            _url += "/api/v1/events"

-       api_key = litellm.get_secret("OPENMETER_API_KEY")
+       api_key = os.getenv("OPENMETER_API_KEY")

        _data = self._common_logic(kwargs=kwargs, response_obj=response_obj)
        _headers = {

@@ -117,7 +113,6 @@ class OpenMeterLogger(CustomLogger):

            response.raise_for_status()
        except Exception as e:
-           print(f"\nAn Exception Occurred - {str(e)}")
            if hasattr(response, "text"):
-               print(f"\nError Message: {response.text}")
+               litellm.print_verbose(f"\nError Message: {response.text}")
            raise e
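The hunks above switch the OpenMeter logger from `litellm.get_secret` to plain `os.getenv`, so configuration is just environment variables. A minimal sketch of wiring it up from the SDK side (illustrative and not part of the commit; the key value is a placeholder, the callback name comes from the proxy config later in this diff, and an OpenAI key is assumed for the underlying call):

```python
import os
import litellm

# Sketch: configure the OpenMeter integration purely through environment variables.
os.environ["OPENMETER_API_ENDPOINT"] = "https://openmeter.cloud"  # default used above
os.environ["OPENMETER_API_KEY"] = "om_xxx"                        # placeholder key

# "openmeter" is the success_callback name used in the proxy config in this diff.
litellm.success_callback = ["openmeter"]

# Assumes OPENAI_API_KEY is set; usage for this call is then logged to OpenMeter.
response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hi"}],
)
```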
@@ -48,19 +48,6 @@ class SlackAlerting:
        self.internal_usage_cache = DualCache()
        self.async_http_handler = AsyncHTTPHandler()
        self.alert_to_webhook_url = alert_to_webhook_url
-       self.langfuse_logger = None
-
-       try:
-           from litellm.integrations.langfuse import LangFuseLogger
-
-           self.langfuse_logger = LangFuseLogger(
-               os.getenv("LANGFUSE_PUBLIC_KEY"),
-               os.getenv("LANGFUSE_SECRET_KEY"),
-               flush_interval=1,
-           )
-       except:
-           pass
-
        pass

    def update_values(

@@ -110,62 +97,8 @@ class SlackAlerting:
        start_time: Optional[datetime.datetime] = None,
        end_time: Optional[datetime.datetime] = None,
    ):
-       import uuid
-
-       # For now: do nothing as we're debugging why this is not working as expected
-       if request_data is not None:
-           trace_id = request_data.get("metadata", {}).get(
-               "trace_id", None
-           )  # get langfuse trace id
-           if trace_id is None:
-               trace_id = "litellm-alert-trace-" + str(uuid.uuid4())
-               request_data["metadata"]["trace_id"] = trace_id
-       elif kwargs is not None:
-           _litellm_params = kwargs.get("litellm_params", {})
-           trace_id = _litellm_params.get("metadata", {}).get(
-               "trace_id", None
-           )  # get langfuse trace id
-           if trace_id is None:
-               trace_id = "litellm-alert-trace-" + str(uuid.uuid4())
-               _litellm_params["metadata"]["trace_id"] = trace_id
-
-       # Log hanging request as an error on langfuse
-       if type == "hanging_request":
-           if self.langfuse_logger is not None:
-               _logging_kwargs = copy.deepcopy(request_data)
-               if _logging_kwargs is None:
-                   _logging_kwargs = {}
-               _logging_kwargs["litellm_params"] = {}
-               request_data = request_data or {}
-               _logging_kwargs["litellm_params"]["metadata"] = request_data.get(
-                   "metadata", {}
-               )
-               # log to langfuse in a separate thread
-               import threading
-
-               threading.Thread(
-                   target=self.langfuse_logger.log_event,
-                   args=(
-                       _logging_kwargs,
-                       None,
-                       start_time,
-                       end_time,
-                       None,
-                       print,
-                       "ERROR",
-                       "Requests is hanging",
-                   ),
-               ).start()
-
-       _langfuse_host = os.environ.get("LANGFUSE_HOST", "https://cloud.langfuse.com")
-       _langfuse_project_id = os.environ.get("LANGFUSE_PROJECT_ID")
-
-       # langfuse urls look like: https://us.cloud.langfuse.com/project/************/traces/litellm-alert-trace-ididi9dk-09292-************
-
-       _langfuse_url = (
-           f"{_langfuse_host}/project/{_langfuse_project_id}/traces/{trace_id}"
-       )
-       request_info += f"\n🪢 Langfuse Trace: {_langfuse_url}"
+       # do nothing for now
+       pass
        return request_info

    def _response_taking_too_long_callback(

@@ -242,10 +175,6 @@ class SlackAlerting:
            request_info = f"\nRequest Model: `{model}`\nAPI Base: `{api_base}`\nMessages: `{messages}`"
            slow_message = f"`Responses are slow - {round(time_difference_float,2)}s response time > Alerting threshold: {self.alerting_threshold}s`"
            if time_difference_float > self.alerting_threshold:
-               if "langfuse" in litellm.success_callback:
-                   request_info = self._add_langfuse_trace_id_to_alert(
-                       request_info=request_info, kwargs=kwargs, type="slow_response"
-                   )
                # add deployment latencies to alert
                if (
                    kwargs is not None
@@ -34,6 +34,8 @@ from litellm.utils import (
    async_mock_completion_streaming_obj,
    convert_to_model_response_object,
    token_counter,
+   create_pretrained_tokenizer,
+   create_tokenizer,
    Usage,
    get_optional_params_embeddings,
    get_optional_params_image_gen,
@@ -338,6 +338,18 @@
        "output_cost_per_second": 0.0001,
        "litellm_provider": "azure"
    },
+   "azure/gpt-4-turbo-2024-04-09": {
+       "max_tokens": 4096,
+       "max_input_tokens": 128000,
+       "max_output_tokens": 4096,
+       "input_cost_per_token": 0.00001,
+       "output_cost_per_token": 0.00003,
+       "litellm_provider": "azure",
+       "mode": "chat",
+       "supports_function_calling": true,
+       "supports_parallel_function_calling": true,
+       "supports_vision": true
+   },
    "azure/gpt-4-0125-preview": {
        "max_tokens": 4096,
        "max_input_tokens": 128000,

@@ -813,6 +825,7 @@
        "litellm_provider": "anthropic",
        "mode": "chat",
        "supports_function_calling": true,
+       "supports_vision": true,
        "tool_use_system_prompt_tokens": 264
    },
    "claude-3-opus-20240229": {

@@ -824,6 +837,7 @@
        "litellm_provider": "anthropic",
        "mode": "chat",
        "supports_function_calling": true,
+       "supports_vision": true,
        "tool_use_system_prompt_tokens": 395
    },
    "claude-3-sonnet-20240229": {

@@ -835,6 +849,7 @@
        "litellm_provider": "anthropic",
        "mode": "chat",
        "supports_function_calling": true,
+       "supports_vision": true,
        "tool_use_system_prompt_tokens": 159
    },
    "text-bison": {

@@ -1142,7 +1157,8 @@
        "output_cost_per_token": 0.000015,
        "litellm_provider": "vertex_ai-anthropic_models",
        "mode": "chat",
-       "supports_function_calling": true
+       "supports_function_calling": true,
+       "supports_vision": true
    },
    "vertex_ai/claude-3-haiku@20240307": {
        "max_tokens": 4096,

@@ -1152,7 +1168,8 @@
        "output_cost_per_token": 0.00000125,
        "litellm_provider": "vertex_ai-anthropic_models",
        "mode": "chat",
-       "supports_function_calling": true
+       "supports_function_calling": true,
+       "supports_vision": true
    },
    "vertex_ai/claude-3-opus@20240229": {
        "max_tokens": 4096,

@@ -1162,7 +1179,8 @@
        "output_cost_per_token": 0.0000075,
        "litellm_provider": "vertex_ai-anthropic_models",
        "mode": "chat",
-       "supports_function_calling": true
+       "supports_function_calling": true,
+       "supports_vision": true
    },
    "textembedding-gecko": {
        "max_tokens": 3072,

@@ -1581,6 +1599,7 @@
        "litellm_provider": "openrouter",
        "mode": "chat",
        "supports_function_calling": true,
+       "supports_vision": true,
        "tool_use_system_prompt_tokens": 395
    },
    "openrouter/google/palm-2-chat-bison": {

@@ -1929,7 +1948,8 @@
        "output_cost_per_token": 0.000015,
        "litellm_provider": "bedrock",
        "mode": "chat",
-       "supports_function_calling": true
+       "supports_function_calling": true,
+       "supports_vision": true
    },
    "anthropic.claude-3-haiku-20240307-v1:0": {
        "max_tokens": 4096,

@@ -1939,7 +1959,8 @@
        "output_cost_per_token": 0.00000125,
        "litellm_provider": "bedrock",
        "mode": "chat",
-       "supports_function_calling": true
+       "supports_function_calling": true,
+       "supports_vision": true
    },
    "anthropic.claude-3-opus-20240229-v1:0": {
        "max_tokens": 4096,

@@ -1949,7 +1970,8 @@
        "output_cost_per_token": 0.000075,
        "litellm_provider": "bedrock",
        "mode": "chat",
-       "supports_function_calling": true
+       "supports_function_calling": true,
+       "supports_vision": true
    },
    "anthropic.claude-v1": {
        "max_tokens": 8191,
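The JSON hunks above add `azure/gpt-4-turbo-2024-04-09` and set `supports_vision` on several Claude 3 entries. Once a litellm build ships this map, the flags are readable through `litellm.model_cost` (a sketch, not part of the commit; field names follow the JSON shown above):

```python
import litellm

# Sketch: read the capability flags and pricing added above out of the bundled cost map.
for name in ["azure/gpt-4-turbo-2024-04-09", "claude-3-haiku-20240307"]:
    info = litellm.model_cost.get(name, {})
    print(
        name,
        info.get("mode"),
        info.get("supports_vision"),
        info.get("input_cost_per_token"),
    )
```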
@@ -11,5 +11,12 @@ router_settings:
  redis_password: os.environ/REDIS_PASSWORD
  redis_port: os.environ/REDIS_PORT

+router_settings:
+  routing_strategy: "latency-based-routing"
+
litellm_settings:
  success_callback: ["openmeter"]
+
+general_settings:
+  alerting: ["slack"]
+  alert_types: ["llm_exceptions"]
@@ -3446,172 +3446,6 @@ def model_list(
    )


-@router.post(
-    "/v1/completions", dependencies=[Depends(user_api_key_auth)], tags=["completions"]
-)
-@router.post(
-    "/completions", dependencies=[Depends(user_api_key_auth)], tags=["completions"]
-)
-@router.post(
-    "/engines/{model:path}/completions",
-    dependencies=[Depends(user_api_key_auth)],
-    tags=["completions"],
-)
-@router.post(
-    "/openai/deployments/{model:path}/completions",
-    dependencies=[Depends(user_api_key_auth)],
-    tags=["completions"],
-)
-async def completion(
-    request: Request,
-    fastapi_response: Response,
-    model: Optional[str] = None,
-    user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
-):
-    global user_temperature, user_request_timeout, user_max_tokens, user_api_base
-    try:
-        body = await request.body()
-        body_str = body.decode()
-        try:
-            data = ast.literal_eval(body_str)
-        except:
-            data = json.loads(body_str)
-
-        data["user"] = data.get("user", user_api_key_dict.user_id)
-        data["model"] = (
-            general_settings.get("completion_model", None)  # server default
-            or user_model  # model name passed via cli args
-            or model  # for azure deployments
-            or data["model"]  # default passed in http request
-        )
-        if user_model:
-            data["model"] = user_model
-        if "metadata" not in data:
-            data["metadata"] = {}
-        data["metadata"]["user_api_key"] = user_api_key_dict.api_key
-        data["metadata"]["user_api_key_metadata"] = user_api_key_dict.metadata
-        data["metadata"]["user_api_key_alias"] = getattr(
-            user_api_key_dict, "key_alias", None
-        )
-        data["metadata"]["user_api_key_user_id"] = user_api_key_dict.user_id
-        data["metadata"]["user_api_key_team_id"] = getattr(
-            user_api_key_dict, "team_id", None
-        )
-        data["metadata"]["user_api_key_team_alias"] = getattr(
-            user_api_key_dict, "team_alias", None
-        )
-        _headers = dict(request.headers)
-        _headers.pop(
-            "authorization", None
-        )  # do not store the original `sk-..` api key in the db
-        data["metadata"]["headers"] = _headers
-        data["metadata"]["endpoint"] = str(request.url)
-
-        # override with user settings, these are params passed via cli
-        if user_temperature:
-            data["temperature"] = user_temperature
-        if user_request_timeout:
-            data["request_timeout"] = user_request_timeout
-        if user_max_tokens:
-            data["max_tokens"] = user_max_tokens
-        if user_api_base:
-            data["api_base"] = user_api_base
-
-        ### MODEL ALIAS MAPPING ###
-        # check if model name in model alias map
-        # get the actual model name
-        if data["model"] in litellm.model_alias_map:
-            data["model"] = litellm.model_alias_map[data["model"]]
-
-        ### CALL HOOKS ### - modify incoming data before calling the model
-        data = await proxy_logging_obj.pre_call_hook(
-            user_api_key_dict=user_api_key_dict, data=data, call_type="completion"
-        )
-
-        ### ROUTE THE REQUESTs ###
-        router_model_names = llm_router.model_names if llm_router is not None else []
-        # skip router if user passed their key
-        if "api_key" in data:
-            response = await litellm.atext_completion(**data)
-        elif (
-            llm_router is not None and data["model"] in router_model_names
-        ):  # model in router model list
-            response = await llm_router.atext_completion(**data)
-        elif (
-            llm_router is not None
-            and llm_router.model_group_alias is not None
-            and data["model"] in llm_router.model_group_alias
-        ):  # model set in model_group_alias
-            response = await llm_router.atext_completion(**data)
-        elif (
-            llm_router is not None and data["model"] in llm_router.deployment_names
-        ):  # model in router deployments, calling a specific deployment on the router
-            response = await llm_router.atext_completion(
-                **data, specific_deployment=True
-            )
-        elif (
-            llm_router is not None
-            and data["model"] not in router_model_names
-            and llm_router.default_deployment is not None
-        ):  # model in router deployments, calling a specific deployment on the router
-            response = await llm_router.atext_completion(**data)
-        elif user_model is not None:  # `litellm --model <your-model-name>`
-            response = await litellm.atext_completion(**data)
-        else:
-            raise HTTPException(
-                status_code=status.HTTP_400_BAD_REQUEST,
-                detail={
-                    "error": "Invalid model name passed in model="
-                    + data.get("model", "")
-                },
-            )
-
-        if hasattr(response, "_hidden_params"):
-            model_id = response._hidden_params.get("model_id", None) or ""
-            original_response = (
-                response._hidden_params.get("original_response", None) or ""
-            )
-        else:
-            model_id = ""
-            original_response = ""
-
-        verbose_proxy_logger.debug("final response: %s", response)
-        if (
-            "stream" in data and data["stream"] == True
-        ):  # use generate_responses to stream responses
-            custom_headers = {
-                "x-litellm-model-id": model_id,
-            }
-            selected_data_generator = select_data_generator(
-                response=response, user_api_key_dict=user_api_key_dict
-            )
-
-            return StreamingResponse(
-                selected_data_generator,
-                media_type="text/event-stream",
-                headers=custom_headers,
-            )
-
-        fastapi_response.headers["x-litellm-model-id"] = model_id
-        return response
-    except Exception as e:
-        data["litellm_status"] = "fail"  # used for alerting
-        verbose_proxy_logger.debug("EXCEPTION RAISED IN PROXY MAIN.PY")
-        verbose_proxy_logger.debug(
-            "\033[1;31mAn error occurred: %s\n\n Debug this by setting `--debug`, e.g. `litellm --model gpt-3.5-turbo --debug`",
-            e,
-        )
-        traceback.print_exc()
-        error_traceback = traceback.format_exc()
-        error_msg = f"{str(e)}"
-        raise ProxyException(
-            message=getattr(e, "message", error_msg),
-            type=getattr(e, "type", "None"),
-            param=getattr(e, "param", "None"),
-            code=getattr(e, "status_code", 500),
-        )
-
-
@router.post(
    "/v1/chat/completions",
    dependencies=[Depends(user_api_key_auth)],
|
||||||
raise HTTPException(
|
raise HTTPException(
|
||||||
status_code=status.HTTP_400_BAD_REQUEST,
|
status_code=status.HTTP_400_BAD_REQUEST,
|
||||||
detail={
|
detail={
|
||||||
"error": "Invalid model name passed in model="
|
"error": "chat_completion: Invalid model name passed in model="
|
||||||
+ data.get("model", "")
|
+ data.get("model", "")
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
@@ -3884,6 +3718,172 @@
    )


+@router.post(
+    "/v1/completions", dependencies=[Depends(user_api_key_auth)], tags=["completions"]
+)
+@router.post(
+    "/completions", dependencies=[Depends(user_api_key_auth)], tags=["completions"]
+)
+@router.post(
+    "/engines/{model:path}/completions",
+    dependencies=[Depends(user_api_key_auth)],
+    tags=["completions"],
+)
+@router.post(
+    "/openai/deployments/{model:path}/completions",
+    dependencies=[Depends(user_api_key_auth)],
+    tags=["completions"],
+)
+async def completion(
+    request: Request,
+    fastapi_response: Response,
+    model: Optional[str] = None,
+    user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
+):
+    global user_temperature, user_request_timeout, user_max_tokens, user_api_base
+    try:
+        body = await request.body()
+        body_str = body.decode()
+        try:
+            data = ast.literal_eval(body_str)
+        except:
+            data = json.loads(body_str)
+
+        data["user"] = data.get("user", user_api_key_dict.user_id)
+        data["model"] = (
+            general_settings.get("completion_model", None)  # server default
+            or user_model  # model name passed via cli args
+            or model  # for azure deployments
+            or data["model"]  # default passed in http request
+        )
+        if user_model:
+            data["model"] = user_model
+        if "metadata" not in data:
+            data["metadata"] = {}
+        data["metadata"]["user_api_key"] = user_api_key_dict.api_key
+        data["metadata"]["user_api_key_metadata"] = user_api_key_dict.metadata
+        data["metadata"]["user_api_key_alias"] = getattr(
+            user_api_key_dict, "key_alias", None
+        )
+        data["metadata"]["user_api_key_user_id"] = user_api_key_dict.user_id
+        data["metadata"]["user_api_key_team_id"] = getattr(
+            user_api_key_dict, "team_id", None
+        )
+        data["metadata"]["user_api_key_team_alias"] = getattr(
+            user_api_key_dict, "team_alias", None
+        )
+        _headers = dict(request.headers)
+        _headers.pop(
+            "authorization", None
+        )  # do not store the original `sk-..` api key in the db
+        data["metadata"]["headers"] = _headers
+        data["metadata"]["endpoint"] = str(request.url)
+
+        # override with user settings, these are params passed via cli
+        if user_temperature:
+            data["temperature"] = user_temperature
+        if user_request_timeout:
+            data["request_timeout"] = user_request_timeout
+        if user_max_tokens:
+            data["max_tokens"] = user_max_tokens
+        if user_api_base:
+            data["api_base"] = user_api_base
+
+        ### MODEL ALIAS MAPPING ###
+        # check if model name in model alias map
+        # get the actual model name
+        if data["model"] in litellm.model_alias_map:
+            data["model"] = litellm.model_alias_map[data["model"]]
+
+        ### CALL HOOKS ### - modify incoming data before calling the model
+        data = await proxy_logging_obj.pre_call_hook(
+            user_api_key_dict=user_api_key_dict, data=data, call_type="completion"
+        )
+
+        ### ROUTE THE REQUESTs ###
+        router_model_names = llm_router.model_names if llm_router is not None else []
+        # skip router if user passed their key
+        if "api_key" in data:
+            response = await litellm.atext_completion(**data)
+        elif (
+            llm_router is not None and data["model"] in router_model_names
+        ):  # model in router model list
+            response = await llm_router.atext_completion(**data)
+        elif (
+            llm_router is not None
+            and llm_router.model_group_alias is not None
+            and data["model"] in llm_router.model_group_alias
+        ):  # model set in model_group_alias
+            response = await llm_router.atext_completion(**data)
+        elif (
+            llm_router is not None and data["model"] in llm_router.deployment_names
+        ):  # model in router deployments, calling a specific deployment on the router
+            response = await llm_router.atext_completion(
+                **data, specific_deployment=True
+            )
+        elif (
+            llm_router is not None
+            and data["model"] not in router_model_names
+            and llm_router.default_deployment is not None
+        ):  # model in router deployments, calling a specific deployment on the router
+            response = await llm_router.atext_completion(**data)
+        elif user_model is not None:  # `litellm --model <your-model-name>`
+            response = await litellm.atext_completion(**data)
+        else:
+            raise HTTPException(
+                status_code=status.HTTP_400_BAD_REQUEST,
+                detail={
+                    "error": "completion: Invalid model name passed in model="
+                    + data.get("model", "")
+                },
+            )
+
+        if hasattr(response, "_hidden_params"):
+            model_id = response._hidden_params.get("model_id", None) or ""
+            original_response = (
+                response._hidden_params.get("original_response", None) or ""
+            )
+        else:
+            model_id = ""
+            original_response = ""
+
+        verbose_proxy_logger.debug("final response: %s", response)
+        if (
+            "stream" in data and data["stream"] == True
+        ):  # use generate_responses to stream responses
+            custom_headers = {
+                "x-litellm-model-id": model_id,
+            }
+            selected_data_generator = select_data_generator(
+                response=response, user_api_key_dict=user_api_key_dict
+            )
+
+            return StreamingResponse(
+                selected_data_generator,
+                media_type="text/event-stream",
+                headers=custom_headers,
+            )
+
+        fastapi_response.headers["x-litellm-model-id"] = model_id
+        return response
+    except Exception as e:
+        data["litellm_status"] = "fail"  # used for alerting
+        verbose_proxy_logger.debug("EXCEPTION RAISED IN PROXY MAIN.PY")
+        verbose_proxy_logger.debug(
+            "\033[1;31mAn error occurred: %s\n\n Debug this by setting `--debug`, e.g. `litellm --model gpt-3.5-turbo --debug`",
+            e,
+        )
+        traceback.print_exc()
+        error_traceback = traceback.format_exc()
+        error_msg = f"{str(e)}"
+        raise ProxyException(
+            message=getattr(e, "message", error_msg),
+            type=getattr(e, "type", "None"),
+            param=getattr(e, "param", "None"),
+            code=getattr(e, "status_code", 500),
+        )
+
+
@router.post(
    "/v1/embeddings",
    dependencies=[Depends(user_api_key_auth)],
@@ -4041,7 +4041,7 @@ async def embeddings(
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail={
-                   "error": "Invalid model name passed in model="
+                   "error": "embeddings: Invalid model name passed in model="
                    + data.get("model", "")
                },
            )

@@ -4197,7 +4197,7 @@ async def image_generation(
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail={
-                   "error": "Invalid model name passed in model="
+                   "error": "image_generation: Invalid model name passed in model="
                    + data.get("model", "")
                },
            )

@@ -4372,7 +4372,7 @@ async def audio_transcriptions(
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail={
-                   "error": "Invalid model name passed in model="
+                   "error": "audio_transcriptions: Invalid model name passed in model="
                    + data.get("model", "")
                },
            )

@@ -4538,7 +4538,7 @@ async def moderations(
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail={
-                   "error": "Invalid model name passed in model="
+                   "error": "moderations: Invalid model name passed in model="
                    + data.get("model", "")
                },
            )
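Together with the earlier `chat_completion` hunk, the four hunks above prefix each route's invalid-model error with the route name, which the updated test at the end of this diff asserts. A sketch of what a client now sees against a locally running proxy (the base URL and key are placeholders, not values from the commit):

```python
import openai

client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")

try:
    client.chat.completions.create(
        model="Lite-GPT-12",  # not configured on the proxy
        messages=[{"role": "user", "content": "hi"}],
    )
except openai.BadRequestError as e:
    # e.g. "chat_completion: Invalid model name passed in model=Lite-GPT-12"
    print(e.message)
```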
@@ -387,15 +387,21 @@ class ProxyLogging:
        """

        ### ALERTING ###
-       if "llm_exceptions" not in self.alert_types:
-           return
-       asyncio.create_task(
-           self.alerting_handler(
-               message=f"LLM API call failed: {str(original_exception)}",
-               level="High",
-               alert_type="llm_exceptions",
-           )
-       )
+       if "llm_exceptions" in self.alert_types and not isinstance(
+           original_exception, HTTPException
+       ):
+           """
+           Just alert on LLM API exceptions. Do not alert on user errors
+
+           Related issue - https://github.com/BerriAI/litellm/issues/3395
+           """
+           asyncio.create_task(
+               self.alerting_handler(
+                   message=f"LLM API call failed: {str(original_exception)}",
+                   level="High",
+                   alert_type="llm_exceptions",
+               )
+           )

        for callback in litellm.callbacks:
            try:

@@ -679,8 +685,8 @@ class PrismaClient:
    @backoff.on_exception(
        backoff.expo,
        Exception,  # base exception to catch for the backoff
-       max_tries=3,  # maximum number of retries
-       max_time=10,  # maximum total time to retry for
+       max_tries=1,  # maximum number of retries
+       max_time=2,  # maximum total time to retry for
        on_backoff=on_backoff,  # specifying the function to call on backoff
    )
    async def get_generic_data(

@@ -718,7 +724,8 @@ class PrismaClient:
            import traceback

            error_msg = f"LiteLLM Prisma Client Exception get_generic_data: {str(e)}"
-           print_verbose(error_msg)
+           verbose_proxy_logger.error(error_msg)
+           error_msg = error_msg + "\nException Type: {}".format(type(e))
            error_traceback = error_msg + "\n" + traceback.format_exc()
            end_time = time.time()
            _duration = end_time - start_time
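The `@backoff.on_exception` hunk above tightens the Prisma retry policy from three tries over ten seconds to a single try capped at about two seconds. A standalone sketch of that decorator's semantics (illustrative; the wrapped function is a stand-in, not the real `get_generic_data`):

```python
import backoff

@backoff.on_exception(
    backoff.expo,   # exponential backoff between attempts
    Exception,      # retry on any exception
    max_tries=1,    # one attempt total, i.e. effectively no retry
    max_time=2,     # and never spend more than ~2 seconds overall
)
async def get_generic_data_sketch(db_call):
    # Placeholder body: stands in for the DB query the real method performs.
    return await db_call()
```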
@@ -2590,6 +2590,16 @@ class Router:
                return model
        return None

+   def get_model_info(self, id: str) -> Optional[dict]:
+       """
+       For a given model id, return the model info
+       """
+       for model in self.model_list:
+           if "model_info" in model and "id" in model["model_info"]:
+               if id == model["model_info"]["id"]:
+                   return model
+       return None
+
    def get_model_ids(self):
        ids = []
        for model in self.model_list:

@@ -2904,15 +2914,10 @@
            m for m in self.model_list if m["litellm_params"]["model"] == model
        ]

-       verbose_router_logger.debug(
-           f"initial list of deployments: {healthy_deployments}"
-       )
-
-       verbose_router_logger.debug(
-           f"healthy deployments: length {len(healthy_deployments)} {healthy_deployments}"
-       )
+       litellm.print_verbose(f"initial list of deployments: {healthy_deployments}")
        if len(healthy_deployments) == 0:
-           raise ValueError(f"No healthy deployment available, passed model={model}")
+           raise ValueError(f"No healthy deployment available, passed model={model}. ")
        if litellm.model_alias_map and model in litellm.model_alias_map:
            model = litellm.model_alias_map[
                model
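The new `Router.get_model_info` above is what the rate-limit error in the next hunk points users at. A usage sketch (illustrative, not part of the commit; the deployment list and id are made up):

```python
from litellm import Router

# Sketch: look a deployment up by the id stored in its model_info.
router = Router(
    model_list=[
        {
            "model_name": "gpt-3.5-turbo",
            "litellm_params": {"model": "gpt-3.5-turbo"},
            "model_info": {"id": "deployment-1"},  # hypothetical id
        }
    ]
)
print(router.get_model_info(id="deployment-1"))  # full entry, or None if unknown
```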
@@ -79,10 +79,12 @@ class LowestTPMLoggingHandler_v2(CustomLogger):
                    model=deployment.get("litellm_params", {}).get("model"),
                    response=httpx.Response(
                        status_code=429,
-                       content="{} rpm limit={}. current usage={}".format(
+                       content="{} rpm limit={}. current usage={}. id={}, model_group={}. Get the model info by calling 'router.get_model_info(id)".format(
                            RouterErrors.user_defined_ratelimit_error.value,
                            deployment_rpm,
                            local_result,
+                           model_id,
+                           deployment.get("model_name", ""),
                        ),
                        request=httpx.Request(method="tpm_rpm_limits", url="https://github.com/BerriAI/litellm"),  # type: ignore
                    ),
@@ -0,0 +1,88 @@
int() argument must be a string, a bytes-like object or a real number, not 'NoneType'
Traceback (most recent call last):
  File "/opt/homebrew/lib/python3.11/site-packages/langfuse/client.py", line 778, in generation
    "usage": _convert_usage_input(usage) if usage is not None else None,
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/lib/python3.11/site-packages/langfuse/utils.py", line 77, in _convert_usage_input
    "totalCost": extract_by_priority(usage, ["totalCost", "total_cost"]),
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/lib/python3.11/site-packages/langfuse/utils.py", line 32, in extract_by_priority
    return int(usage[key])
           ^^^^^^^^^^^^^^^
TypeError: int() argument must be a string, a bytes-like object or a real number, not 'NoneType'
int() argument must be a string, a bytes-like object or a real number, not 'NoneType'
Traceback (most recent call last):
  File "/opt/homebrew/lib/python3.11/site-packages/langfuse/client.py", line 778, in generation
    "usage": _convert_usage_input(usage) if usage is not None else None,
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/lib/python3.11/site-packages/langfuse/utils.py", line 77, in _convert_usage_input
    "totalCost": extract_by_priority(usage, ["totalCost", "total_cost"]),
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/lib/python3.11/site-packages/langfuse/utils.py", line 32, in extract_by_priority
    return int(usage[key])
           ^^^^^^^^^^^^^^^
TypeError: int() argument must be a string, a bytes-like object or a real number, not 'NoneType'
int() argument must be a string, a bytes-like object or a real number, not 'NoneType'
Traceback (most recent call last):
  File "/opt/homebrew/lib/python3.11/site-packages/langfuse/client.py", line 778, in generation
    "usage": _convert_usage_input(usage) if usage is not None else None,
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/lib/python3.11/site-packages/langfuse/utils.py", line 77, in _convert_usage_input
    "totalCost": extract_by_priority(usage, ["totalCost", "total_cost"]),
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/lib/python3.11/site-packages/langfuse/utils.py", line 32, in extract_by_priority
    return int(usage[key])
           ^^^^^^^^^^^^^^^
TypeError: int() argument must be a string, a bytes-like object or a real number, not 'NoneType'
int() argument must be a string, a bytes-like object or a real number, not 'NoneType'
Traceback (most recent call last):
  File "/opt/homebrew/lib/python3.11/site-packages/langfuse/client.py", line 778, in generation
    "usage": _convert_usage_input(usage) if usage is not None else None,
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/lib/python3.11/site-packages/langfuse/utils.py", line 77, in _convert_usage_input
    "totalCost": extract_by_priority(usage, ["totalCost", "total_cost"]),
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/lib/python3.11/site-packages/langfuse/utils.py", line 32, in extract_by_priority
    return int(usage[key])
           ^^^^^^^^^^^^^^^
TypeError: int() argument must be a string, a bytes-like object or a real number, not 'NoneType'
int() argument must be a string, a bytes-like object or a real number, not 'NoneType'
Traceback (most recent call last):
  File "/opt/homebrew/lib/python3.11/site-packages/langfuse/client.py", line 778, in generation
    "usage": _convert_usage_input(usage) if usage is not None else None,
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/lib/python3.11/site-packages/langfuse/utils.py", line 77, in _convert_usage_input
    "totalCost": extract_by_priority(usage, ["totalCost", "total_cost"]),
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/lib/python3.11/site-packages/langfuse/utils.py", line 32, in extract_by_priority
    return int(usage[key])
           ^^^^^^^^^^^^^^^
TypeError: int() argument must be a string, a bytes-like object or a real number, not 'NoneType'
consumer is running...
Getting observations... None, None, None, None, litellm-test-98e1cc75-bef8-4280-a2b9-e08633b81acd, None, GENERATION
consumer is running...
Getting observations... None, None, None, None, litellm-test-532d2bc8-f8d6-42fd-8f78-416bae79925d, None, GENERATION
joining 1 consumer threads
consumer thread 0 joined
joining 1 consumer threads
consumer thread 0 joined
joining 1 consumer threads
consumer thread 0 joined
joining 1 consumer threads
consumer thread 0 joined
joining 1 consumer threads
consumer thread 0 joined
joining 1 consumer threads
consumer thread 0 joined
joining 1 consumer threads
consumer thread 0 joined
joining 1 consumer threads
consumer thread 0 joined
joining 1 consumer threads
consumer thread 0 joined
joining 1 consumer threads
consumer thread 0 joined
joining 1 consumer threads
consumer thread 0 joined
joining 1 consumer threads
consumer thread 0 joined
@@ -205,8 +205,6 @@ async def test_langfuse_logging_without_request_response(stream):
        assert _trace_data[0].output == {
            "role": "assistant",
            "content": "redacted-by-litellm",
-           "function_call": None,
-           "tool_calls": None,
        }

    except Exception as e:
@@ -3,7 +3,7 @@
import sys
import os
-import io, asyncio
+import io, asyncio, httpx
from datetime import datetime, timedelta

# import logging
@@ -17,6 +17,61 @@ import asyncio
from unittest.mock import patch, MagicMock
from litellm.caching import DualCache
from litellm.integrations.slack_alerting import SlackAlerting
+from litellm.proxy._types import UserAPIKeyAuth
+from litellm.proxy.proxy_server import HTTPException
+
+
+@pytest.mark.parametrize("exception_type", ["llm-exception", "non-llm-exception"])
+@pytest.mark.asyncio
+async def test_slack_alerting_llm_exceptions(exception_type, monkeypatch):
+    """
+    Test if non-llm exception -> No request
+    Test if llm exception -> Request triggered
+    """
+    _pl = ProxyLogging(user_api_key_cache=DualCache())
+    _pl.update_values(
+        alerting=["slack"],
+        alerting_threshold=100,
+        redis_cache=None,
+        alert_types=["llm_exceptions"],
+    )
+
+    async def mock_alerting_handler(message, level, alert_type):
+        global exception_type
+
+        if exception_type == "llm-exception":
+            pass
+        elif exception_type == "non-llm-exception":
+            pytest.fail("Function should not have been called")
+
+    monkeypatch.setattr(_pl, "alerting_handler", mock_alerting_handler)
+
+    if exception_type == "llm-exception":
+        await _pl.post_call_failure_hook(
+            original_exception=litellm.APIError(
+                status_code=500,
+                message="This is a test exception",
+                llm_provider="openai",
+                model="gpt-3.5-turbo",
+                request=httpx.Request(
+                    method="completion", url="https://github.com/BerriAI/litellm"
+                ),
+            ),
+            user_api_key_dict=UserAPIKeyAuth(),
+        )
+
+        await asyncio.sleep(2)
+
+    elif exception_type == "non-llm-exception":
+        await _pl.post_call_failure_hook(
+            original_exception=HTTPException(
+                status_code=400,
+                detail={"error": "this is a test exception"},
+            ),
+            user_api_key_dict=UserAPIKeyAuth(),
+        )
+
+        await asyncio.sleep(2)
+
+
@pytest.mark.asyncio
@@ -169,7 +169,7 @@ def test_chat_completion_exception_any_model(client):
        )
        assert isinstance(openai_exception, openai.BadRequestError)
        _error_message = openai_exception.message
-        assert "Invalid model name passed in model=Lite-GPT-12" in str(_error_message)
+        assert "chat_completion: Invalid model name passed in model=Lite-GPT-12" in str(_error_message)

    except Exception as e:
        pytest.fail(f"LiteLLM Proxy test failed. Exception {str(e)}")
@@ -197,7 +197,7 @@ def test_embedding_exception_any_model(client):
        print("Exception raised=", openai_exception)
        assert isinstance(openai_exception, openai.BadRequestError)
        _error_message = openai_exception.message
-        assert "Invalid model name passed in model=Lite-GPT-12" in str(_error_message)
+        assert "embeddings: Invalid model name passed in model=Lite-GPT-12" in str(_error_message)

    except Exception as e:
        pytest.fail(f"LiteLLM Proxy test failed. Exception {str(e)}")
@@ -1,5 +1,6 @@
import sys, os
import traceback
+from unittest import mock
from dotenv import load_dotenv

load_dotenv()
@@ -35,6 +36,77 @@ token = "sk-1234"

headers = {"Authorization": f"Bearer {token}"}

+example_completion_result = {
+    "choices": [
+        {
+            "message": {
+                "content": "Whispers of the wind carry dreams to me.",
+                "role": "assistant"
+            }
+        }
+    ],
+}
+example_embedding_result = {
+    "object": "list",
+    "data": [
+        {
+            "object": "embedding",
+            "index": 0,
+            "embedding": [
+                -0.006929283495992422,
+                -0.005336422007530928,
+                -4.547132266452536e-05,
+                -0.024047505110502243,
+                -0.006929283495992422,
+                -0.005336422007530928,
+                -4.547132266452536e-05,
+                -0.024047505110502243,
+                -0.006929283495992422,
+                -0.005336422007530928,
+                -4.547132266452536e-05,
+                -0.024047505110502243,
+            ],
+        }
+    ],
+    "model": "text-embedding-3-small",
+    "usage": {
+        "prompt_tokens": 5,
+        "total_tokens": 5
+    }
+}
+example_image_generation_result = {
+    "created": 1589478378,
+    "data": [
+        {
+            "url": "https://..."
+        },
+        {
+            "url": "https://..."
+        }
+    ]
+}
+
+
+def mock_patch_acompletion():
+    return mock.patch(
+        "litellm.proxy.proxy_server.llm_router.acompletion",
+        return_value=example_completion_result,
+    )
+
+
+def mock_patch_aembedding():
+    return mock.patch(
+        "litellm.proxy.proxy_server.llm_router.aembedding",
+        return_value=example_embedding_result,
+    )
+
+
+def mock_patch_aimage_generation():
+    return mock.patch(
+        "litellm.proxy.proxy_server.llm_router.aimage_generation",
+        return_value=example_image_generation_result,
+    )
+
+
@pytest.fixture(scope="function")
def client_no_auth():
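The three `mock_patch_*` helpers above stub out the proxy router's `acompletion` / `aembedding` / `aimage_generation` methods, so the tests that follow can assert on exactly which parameters the proxy forwarded without making a real provider call. A minimal sketch of the same pattern, using a throwaway stand-in object instead of the real proxy router (the `router` and `fake_result` names here are illustrative only, not litellm APIs):

```
from types import SimpleNamespace
from unittest import mock

# Hypothetical stand-ins for the proxy's router and its canned response.
router = SimpleNamespace(acompletion=lambda **kwargs: {"real": "call"})
fake_result = {"choices": [{"message": {"role": "assistant", "content": "hi"}}]}

with mock.patch.object(router, "acompletion", return_value=fake_result) as patched:
    response = router.acompletion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "hi"}],
    )
    # The test can now check what was forwarded; no provider call is made.
    patched.assert_called_once_with(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "hi"}],
    )
    assert response == fake_result
```

The tests below apply the same `assert_called_once_with(...)` check against the patched router method.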
@@ -52,7 +124,8 @@ def client_no_auth():
    return TestClient(app)


-def test_chat_completion(client_no_auth):
+@mock_patch_acompletion()
+def test_chat_completion(mock_acompletion, client_no_auth):
    global headers
    try:
        # Your test data
@@ -66,6 +139,19 @@ def test_chat_completion(client_no_auth):

        print("testing proxy server with chat completions")
        response = client_no_auth.post("/v1/chat/completions", json=test_data)
+        mock_acompletion.assert_called_once_with(
+            model="gpt-3.5-turbo",
+            messages=[
+                {"role": "user", "content": "hi"},
+            ],
+            max_tokens=10,
+            litellm_call_id=mock.ANY,
+            litellm_logging_obj=mock.ANY,
+            request_timeout=mock.ANY,
+            specific_deployment=True,
+            metadata=mock.ANY,
+            proxy_server_request=mock.ANY,
+        )
        print(f"response - {response.text}")
        assert response.status_code == 200
        result = response.json()
@@ -77,7 +163,8 @@ def test_chat_completion(client_no_auth):
# Run the test


-def test_chat_completion_azure(client_no_auth):
+@mock_patch_acompletion()
+def test_chat_completion_azure(mock_acompletion, client_no_auth):
    global headers
    try:
        # Your test data
@@ -92,6 +179,19 @@ def test_chat_completion_azure(client_no_auth):
        print("testing proxy server with Azure Request /chat/completions")
        response = client_no_auth.post("/v1/chat/completions", json=test_data)

+        mock_acompletion.assert_called_once_with(
+            model="azure/chatgpt-v-2",
+            messages=[
+                {"role": "user", "content": "write 1 sentence poem"},
+            ],
+            max_tokens=10,
+            litellm_call_id=mock.ANY,
+            litellm_logging_obj=mock.ANY,
+            request_timeout=mock.ANY,
+            specific_deployment=True,
+            metadata=mock.ANY,
+            proxy_server_request=mock.ANY,
+        )
        assert response.status_code == 200
        result = response.json()
        print(f"Received response: {result}")
@@ -104,8 +204,51 @@ def test_chat_completion_azure(client_no_auth):
# test_chat_completion_azure()


+@mock_patch_acompletion()
+def test_openai_deployments_model_chat_completions_azure(mock_acompletion, client_no_auth):
+    global headers
+    try:
+        # Your test data
+        test_data = {
+            "model": "azure/chatgpt-v-2",
+            "messages": [
+                {"role": "user", "content": "write 1 sentence poem"},
+            ],
+            "max_tokens": 10,
+        }
+
+        url = "/openai/deployments/azure/chatgpt-v-2/chat/completions"
+        print(f"testing proxy server with Azure Request {url}")
+        response = client_no_auth.post(url, json=test_data)
+
+        mock_acompletion.assert_called_once_with(
+            model="azure/chatgpt-v-2",
+            messages=[
+                {"role": "user", "content": "write 1 sentence poem"},
+            ],
+            max_tokens=10,
+            litellm_call_id=mock.ANY,
+            litellm_logging_obj=mock.ANY,
+            request_timeout=mock.ANY,
+            specific_deployment=True,
+            metadata=mock.ANY,
+            proxy_server_request=mock.ANY,
+        )
+        assert response.status_code == 200
+        result = response.json()
+        print(f"Received response: {result}")
+        assert len(result["choices"][0]["message"]["content"]) > 0
+    except Exception as e:
+        pytest.fail(f"LiteLLM Proxy test failed. Exception - {str(e)}")
+
+
+# Run the test
+# test_openai_deployments_model_chat_completions_azure()
+
+
### EMBEDDING
-def test_embedding(client_no_auth):
+@mock_patch_aembedding()
+def test_embedding(mock_aembedding, client_no_auth):
    global headers
    from litellm.proxy.proxy_server import user_custom_auth
@@ -117,6 +260,13 @@ def test_embedding(client_no_auth):

        response = client_no_auth.post("/v1/embeddings", json=test_data)

+        mock_aembedding.assert_called_once_with(
+            model="azure/azure-embedding-model",
+            input=["good morning from litellm"],
+            specific_deployment=True,
+            metadata=mock.ANY,
+            proxy_server_request=mock.ANY,
+        )
        assert response.status_code == 200
        result = response.json()
        print(len(result["data"][0]["embedding"]))
@@ -125,7 +275,8 @@ def test_embedding(client_no_auth):
        pytest.fail(f"LiteLLM Proxy test failed. Exception - {str(e)}")


-def test_bedrock_embedding(client_no_auth):
+@mock_patch_aembedding()
+def test_bedrock_embedding(mock_aembedding, client_no_auth):
    global headers
    from litellm.proxy.proxy_server import user_custom_auth
@@ -137,6 +288,12 @@ def test_bedrock_embedding(client_no_auth):

        response = client_no_auth.post("/v1/embeddings", json=test_data)

+        mock_aembedding.assert_called_once_with(
+            model="amazon-embeddings",
+            input=["good morning from litellm"],
+            metadata=mock.ANY,
+            proxy_server_request=mock.ANY,
+        )
        assert response.status_code == 200
        result = response.json()
        print(len(result["data"][0]["embedding"]))
@@ -171,7 +328,8 @@ def test_sagemaker_embedding(client_no_auth):
#### IMAGE GENERATION


-def test_img_gen(client_no_auth):
+@mock_patch_aimage_generation()
+def test_img_gen(mock_aimage_generation, client_no_auth):
    global headers
    from litellm.proxy.proxy_server import user_custom_auth
@@ -185,6 +343,14 @@ def test_img_gen(client_no_auth):

        response = client_no_auth.post("/v1/images/generations", json=test_data)

+        mock_aimage_generation.assert_called_once_with(
+            model='dall-e-3',
+            prompt='A cute baby sea otter',
+            n=1,
+            size='1024x1024',
+            metadata=mock.ANY,
+            proxy_server_request=mock.ANY,
+        )
        assert response.status_code == 200
        result = response.json()
        print(len(result["data"][0]["url"]))
@@ -249,7 +415,8 @@ class MyCustomHandler(CustomLogger):
customHandler = MyCustomHandler()


-def test_chat_completion_optional_params(client_no_auth):
+@mock_patch_acompletion()
+def test_chat_completion_optional_params(mock_acompletion, client_no_auth):
    # [PROXY: PROD TEST] - DO NOT DELETE
    # This tests if all the /chat/completion params are passed to litellm
    try:
@@ -267,6 +434,20 @@ def test_chat_completion_optional_params(client_no_auth):
        litellm.callbacks = [customHandler]
        print("testing proxy server: optional params")
        response = client_no_auth.post("/v1/chat/completions", json=test_data)
+        mock_acompletion.assert_called_once_with(
+            model="gpt-3.5-turbo",
+            messages=[
+                {"role": "user", "content": "hi"},
+            ],
+            max_tokens=10,
+            user="proxy-user",
+            litellm_call_id=mock.ANY,
+            litellm_logging_obj=mock.ANY,
+            request_timeout=mock.ANY,
+            specific_deployment=True,
+            metadata=mock.ANY,
+            proxy_server_request=mock.ANY,
+        )
        assert response.status_code == 200
        result = response.json()
        print(f"Received response: {result}")
@@ -9,7 +9,7 @@ sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path
import time
-from litellm import token_counter, encode, decode
+from litellm import token_counter, create_pretrained_tokenizer, encode, decode


def test_token_counter_normal_plus_function_calling():
@@ -69,15 +69,23 @@ def test_tokenizers():
            model="meta-llama/Llama-2-7b-chat", text=sample_text
        )

+        # llama3 tokenizer (also testing custom tokenizer)
+        llama3_tokens_1 = token_counter(model="meta-llama/llama-3-70b-instruct", text=sample_text)
+
+        llama3_tokenizer = create_pretrained_tokenizer("Xenova/llama-3-tokenizer")
+        llama3_tokens_2 = token_counter(custom_tokenizer=llama3_tokenizer, text=sample_text)
+
        print(
-            f"openai tokens: {openai_tokens}; claude tokens: {claude_tokens}; cohere tokens: {cohere_tokens}; llama2 tokens: {llama2_tokens}"
+            f"openai tokens: {openai_tokens}; claude tokens: {claude_tokens}; cohere tokens: {cohere_tokens}; llama2 tokens: {llama2_tokens}; llama3 tokens: {llama3_tokens_1}"
        )

        # assert that all token values are different
        assert (
-            openai_tokens != cohere_tokens != llama2_tokens
+            openai_tokens != cohere_tokens != llama2_tokens != llama3_tokens_1
        ), "Token values are not different."

+        assert llama3_tokens_1 == llama3_tokens_2, "Custom tokenizer is not being used! It has been configured to use the same tokenizer as the built in llama3 tokenizer and the results should be the same."
+
        print("test tokenizer: It worked!")
    except Exception as e:
        pytest.fail(f"An exception occured: {e}")
@@ -20,6 +20,8 @@ from litellm.utils import (
    validate_environment,
    function_to_dict,
    token_counter,
+    create_pretrained_tokenizer,
+    create_tokenizer,
)

# Assuming your trim_messages, shorten_message_to_fit_limit, and get_token_count functions are all in a module named 'message_utils'
150 litellm/utils.py
@@ -378,16 +378,13 @@ class Message(OpenAIObject):
        super(Message, self).__init__(**params)
        self.content = content
        self.role = role
-        self.tool_calls = None
-        self.function_call = None

        if function_call is not None:
            self.function_call = FunctionCall(**function_call)

        if tool_calls is not None:
-            self.tool_calls = [
-                ChatCompletionMessageToolCall(**tool_call) for tool_call in tool_calls
-            ]
+            self.tool_calls = []
+            for tool_call in tool_calls:
+                self.tool_calls.append(ChatCompletionMessageToolCall(**tool_call))

        if logprobs is not None:
            self._logprobs = ChoiceLogprobs(**logprobs)
@@ -413,8 +410,6 @@


class Delta(OpenAIObject):
-    tool_calls: Optional[List[ChatCompletionDeltaToolCall]] = None
-
    def __init__(
        self,
        content=None,
@@ -1700,10 +1695,17 @@ class Logging:
                    print_verbose("reaches langfuse for streaming logging!")
                    result = kwargs["complete_streaming_response"]
                if langFuseLogger is None or (
-                    self.langfuse_public_key != langFuseLogger.public_key
-                    and self.langfuse_secret != langFuseLogger.secret_key
+                    (
+                        self.langfuse_public_key is not None
+                        and self.langfuse_public_key
+                        != langFuseLogger.public_key
+                    )
+                    and (
+                        self.langfuse_public_key is not None
+                        and self.langfuse_public_key
+                        != langFuseLogger.public_key
+                    )
                ):
-                    print_verbose("Instantiates langfuse client")
                    langFuseLogger = LangFuseLogger(
                        langfuse_public_key=self.langfuse_public_key,
                        langfuse_secret=self.langfuse_secret,
@@ -3773,29 +3775,34 @@ def _select_tokenizer(model: str):
    elif "llama-2" in model.lower() or "replicate" in model.lower():
        tokenizer = Tokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
        return {"type": "huggingface_tokenizer", "tokenizer": tokenizer}
+    # llama3
+    elif "llama-3" in model.lower():
+        tokenizer = Tokenizer.from_pretrained("Xenova/llama-3-tokenizer")
+        return {"type": "huggingface_tokenizer", "tokenizer": tokenizer}
    # default - tiktoken
    else:
        return {"type": "openai_tokenizer", "tokenizer": encoding}


-def encode(model: str, text: str):
+def encode(model="", text="", custom_tokenizer: Optional[dict] = None):
    """
    Encodes the given text using the specified model.

    Args:
        model (str): The name of the model to use for tokenization.
+        custom_tokenizer (Optional[dict]): A custom tokenizer created with the `create_pretrained_tokenizer` or `create_tokenizer` method. Must be a dictionary with a string value for `type` and Tokenizer for `tokenizer`. Default is None.
        text (str): The text to be encoded.

    Returns:
        enc: The encoded text.
    """
-    tokenizer_json = _select_tokenizer(model=model)
+    tokenizer_json = custom_tokenizer or _select_tokenizer(model=model)
    enc = tokenizer_json["tokenizer"].encode(text)
    return enc


-def decode(model: str, tokens: List[int]):
+def decode(model="", tokens: List[int] = [], custom_tokenizer: Optional[dict] = None):
-    tokenizer_json = _select_tokenizer(model=model)
+    tokenizer_json = custom_tokenizer or _select_tokenizer(model=model)
    dec = tokenizer_json["tokenizer"].decode(tokens)
    return dec
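With this change, `encode` and `decode` accept an optional `custom_tokenizer` dict in place of a model name. A rough usage sketch (assumes the `tokenizers` package is installed and the `Xenova/llama-3-tokenizer` repo is reachable; with a Hugging Face tokenizer, `encode` returns an `Encoding` object, so `.ids` is used before decoding):

```
from tokenizers import Tokenizer
from litellm import encode, decode

# Build the same dict shape that _select_tokenizer() returns.
custom = {
    "type": "huggingface_tokenizer",
    "tokenizer": Tokenizer.from_pretrained("Xenova/llama-3-tokenizer"),
}

enc = encode(text="Hello, world!", custom_tokenizer=custom)
print(decode(tokens=enc.ids, custom_tokenizer=custom))
```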
@@ -3967,10 +3974,47 @@ def calculage_img_tokens(
    tile_tokens = (base_tokens * 2) * tiles_needed_high_res
    total_tokens = base_tokens + tile_tokens
    return total_tokens


+def create_pretrained_tokenizer(
+    identifier: str,
+    revision="main",
+    auth_token: Optional[str] = None
+):
+    """
+    Creates a tokenizer from an existing file on a HuggingFace repository to be used with `token_counter`.
+
+    Args:
+        identifier (str): The identifier of a Model on the Hugging Face Hub, that contains a tokenizer.json file
+        revision (str, defaults to main): A branch or commit id
+        auth_token (str, optional, defaults to None): An optional auth token used to access private repositories on the Hugging Face Hub
+
+    Returns:
+        dict: A dictionary with the tokenizer and its type.
+    """
+
+    tokenizer = Tokenizer.from_pretrained(identifier, revision=revision, auth_token=auth_token)
+    return {"type": "huggingface_tokenizer", "tokenizer": tokenizer}
+
+
+def create_tokenizer(json: str):
+    """
+    Creates a tokenizer from a valid JSON string for use with `token_counter`.
+
+    Args:
+        json (str): A valid JSON string representing a previously serialized tokenizer
+
+    Returns:
+        dict: A dictionary with the tokenizer and its type.
+    """
+
+    tokenizer = Tokenizer.from_str(json)
+    return {"type": "huggingface_tokenizer", "tokenizer": tokenizer}
+
+
def token_counter(
    model="",
+    custom_tokenizer: Optional[dict] = None,
    text: Optional[Union[str, List[str]]] = None,
    messages: Optional[List] = None,
    count_response_tokens: Optional[bool] = False,
@@ -3980,13 +4024,14 @@ def token_counter(

    Args:
        model (str): The name of the model to use for tokenization. Default is an empty string.
+        custom_tokenizer (Optional[dict]): A custom tokenizer created with the `create_pretrained_tokenizer` or `create_tokenizer` method. Must be a dictionary with a string value for `type` and Tokenizer for `tokenizer`. Default is None.
        text (str): The raw text string to be passed to the model. Default is None.
        messages (Optional[List[Dict[str, str]]]): Alternative to passing in text. A list of dictionaries representing messages with "role" and "content" keys. Default is None.

    Returns:
        int: The number of tokens in the text.
    """
-    # use tiktoken, anthropic, cohere or llama2's tokenizer depending on the model
+    # use tiktoken, anthropic, cohere, llama2, or llama3's tokenizer depending on the model
    is_tool_call = False
    num_tokens = 0
    if text == None:
@@ -4028,8 +4073,8 @@ def token_counter(
    elif isinstance(text, str):
        count_response_tokens = True  # user just trying to count tokens for a text. don't add the chat_ml +3 tokens to this

-    if model is not None:
-        tokenizer_json = _select_tokenizer(model=model)
+    if model is not None or custom_tokenizer is not None:
+        tokenizer_json = custom_tokenizer or _select_tokenizer(model=model)
        if tokenizer_json["type"] == "huggingface_tokenizer":
            print_verbose(
                f"Token Counter - using hugging face token counter, for model={model}"
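The two new helpers return the same `{"type": ..., "tokenizer": ...}` dict that `_select_tokenizer` produces, so the result can be passed straight to `token_counter`. A small sketch, assuming network access to the Hugging Face repo and the `tokenizers` package (whose `Tokenizer.to_str()` is used here for the serialization round-trip):

```
from litellm.utils import create_pretrained_tokenizer, create_tokenizer, token_counter

llama3_tokenizer = create_pretrained_tokenizer("Xenova/llama-3-tokenizer")
n = token_counter(custom_tokenizer=llama3_tokenizer, text="hello world")

# create_tokenizer() accepts a previously serialized tokenizer instead
serialized = llama3_tokenizer["tokenizer"].to_str()
same_tokenizer = create_tokenizer(serialized)
assert token_counter(custom_tokenizer=same_tokenizer, text="hello world") == n
```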
@@ -6768,7 +6813,7 @@ def validate_environment(model: Optional[str] = None) -> dict:
            keys_in_environment = True
        else:
            missing_keys.append("NLP_CLOUD_API_KEY")
-    elif custom_llm_provider == "bedrock":
+    elif custom_llm_provider == "bedrock" or custom_llm_provider == "sagemaker":
        if (
            "AWS_ACCESS_KEY_ID" in os.environ
            and "AWS_SECRET_ACCESS_KEY" in os.environ
@@ -6782,11 +6827,72 @@ def validate_environment(model: Optional[str] = None) -> dict:
            keys_in_environment = True
        else:
            missing_keys.append("OLLAMA_API_BASE")
+    elif custom_llm_provider == "anyscale":
+        if "ANYSCALE_API_KEY" in os.environ:
+            keys_in_environment = True
+        else:
+            missing_keys.append("ANYSCALE_API_KEY")
+    elif custom_llm_provider == "deepinfra":
+        if "DEEPINFRA_API_KEY" in os.environ:
+            keys_in_environment = True
+        else:
+            missing_keys.append("DEEPINFRA_API_KEY")
+    elif custom_llm_provider == "gemini":
+        if "GEMINI_API_KEY" in os.environ:
+            keys_in_environment = True
+        else:
+            missing_keys.append("GEMINI_API_KEY")
+    elif custom_llm_provider == "groq":
+        if "GROQ_API_KEY" in os.environ:
+            keys_in_environment = True
+        else:
+            missing_keys.append("GROQ_API_KEY")
+    elif custom_llm_provider == "mistral":
+        if "MISTRAL_API_KEY" in os.environ:
+            keys_in_environment = True
+        else:
+            missing_keys.append("MISTRAL_API_KEY")
+    elif custom_llm_provider == "palm":
+        if "PALM_API_KEY" in os.environ:
+            keys_in_environment = True
+        else:
+            missing_keys.append("PALM_API_KEY")
+    elif custom_llm_provider == "perplexity":
+        if "PERPLEXITYAI_API_KEY" in os.environ:
+            keys_in_environment = True
+        else:
+            missing_keys.append("PERPLEXITYAI_API_KEY")
+    elif custom_llm_provider == "voyage":
+        if "VOYAGE_API_KEY" in os.environ:
+            keys_in_environment = True
+        else:
+            missing_keys.append("VOYAGE_API_KEY")
+    elif custom_llm_provider == "fireworks_ai":
+        if (
+            "FIREWORKS_AI_API_KEY" in os.environ
+            or "FIREWORKS_API_KEY" in os.environ
+            or "FIREWORKSAI_API_KEY" in os.environ
+            or "FIREWORKS_AI_TOKEN" in os.environ
+        ):
+            keys_in_environment = True
+        else:
+            missing_keys.append("FIREWORKS_AI_API_KEY")
+    elif custom_llm_provider == "cloudflare":
+        if "CLOUDFLARE_API_KEY" in os.environ and (
+            "CLOUDFLARE_ACCOUNT_ID" in os.environ
+            or "CLOUDFLARE_API_BASE" in os.environ
+        ):
+            keys_in_environment = True
+        else:
+            missing_keys.append("CLOUDFLARE_API_KEY")
+            missing_keys.append("CLOUDFLARE_API_BASE")
    else:
        ## openai - chatcompletion + text completion
        if (
            model in litellm.open_ai_chat_completion_models
            or model in litellm.open_ai_text_completion_models
+            or model in litellm.open_ai_embedding_models
+            or model in litellm.openai_image_generation_models
        ):
            if "OPENAI_API_KEY" in os.environ:
                keys_in_environment = True
@@ -6817,7 +6923,11 @@ def validate_environment(model: Optional[str] = None) -> dict:
        else:
            missing_keys.append("OPENROUTER_API_KEY")
        ## vertex - text + chat models
-        elif model in litellm.vertex_chat_models or model in litellm.vertex_text_models:
+        elif (
+            model in litellm.vertex_chat_models
+            or model in litellm.vertex_text_models
+            or model in litellm.models_by_provider["vertex_ai"]
+        ):
            if "VERTEXAI_PROJECT" in os.environ and "VERTEXAI_LOCATION" in os.environ:
                keys_in_environment = True
            else:
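`validate_environment` now recognizes these additional providers. A quick sketch of checking one of them (the model name is illustrative; the return shape follows the `keys_in_environment` / `missing_keys` fields used above):

```
import os
from litellm.utils import validate_environment

os.environ.pop("GROQ_API_KEY", None)
result = validate_environment(model="groq/llama2-70b-4096")
print(result)  # e.g. {"keys_in_environment": False, "missing_keys": ["GROQ_API_KEY"]}
```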
@@ -338,6 +338,18 @@
        "output_cost_per_second": 0.0001,
        "litellm_provider": "azure"
    },
+    "azure/gpt-4-turbo-2024-04-09": {
+        "max_tokens": 4096,
+        "max_input_tokens": 128000,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00001,
+        "output_cost_per_token": 0.00003,
+        "litellm_provider": "azure",
+        "mode": "chat",
+        "supports_function_calling": true,
+        "supports_parallel_function_calling": true,
+        "supports_vision": true
+    },
    "azure/gpt-4-0125-preview": {
        "max_tokens": 4096,
        "max_input_tokens": 128000,
@@ -813,6 +825,7 @@
        "litellm_provider": "anthropic",
        "mode": "chat",
        "supports_function_calling": true,
+        "supports_vision": true,
        "tool_use_system_prompt_tokens": 264
    },
    "claude-3-opus-20240229": {
@@ -824,6 +837,7 @@
        "litellm_provider": "anthropic",
        "mode": "chat",
        "supports_function_calling": true,
+        "supports_vision": true,
        "tool_use_system_prompt_tokens": 395
    },
    "claude-3-sonnet-20240229": {
@@ -835,6 +849,7 @@
        "litellm_provider": "anthropic",
        "mode": "chat",
        "supports_function_calling": true,
+        "supports_vision": true,
        "tool_use_system_prompt_tokens": 159
    },
    "text-bison": {
@@ -1142,7 +1157,8 @@
        "output_cost_per_token": 0.000015,
        "litellm_provider": "vertex_ai-anthropic_models",
        "mode": "chat",
-        "supports_function_calling": true
+        "supports_function_calling": true,
+        "supports_vision": true
    },
    "vertex_ai/claude-3-haiku@20240307": {
        "max_tokens": 4096,
@@ -1152,7 +1168,8 @@
        "output_cost_per_token": 0.00000125,
        "litellm_provider": "vertex_ai-anthropic_models",
        "mode": "chat",
-        "supports_function_calling": true
+        "supports_function_calling": true,
+        "supports_vision": true
    },
    "vertex_ai/claude-3-opus@20240229": {
        "max_tokens": 4096,
@@ -1162,7 +1179,8 @@
        "output_cost_per_token": 0.0000075,
        "litellm_provider": "vertex_ai-anthropic_models",
        "mode": "chat",
-        "supports_function_calling": true
+        "supports_function_calling": true,
+        "supports_vision": true
    },
    "textembedding-gecko": {
        "max_tokens": 3072,
@@ -1581,6 +1599,7 @@
        "litellm_provider": "openrouter",
        "mode": "chat",
        "supports_function_calling": true,
+        "supports_vision": true,
        "tool_use_system_prompt_tokens": 395
    },
    "openrouter/google/palm-2-chat-bison": {
@@ -1929,7 +1948,8 @@
        "output_cost_per_token": 0.000015,
        "litellm_provider": "bedrock",
        "mode": "chat",
-        "supports_function_calling": true
+        "supports_function_calling": true,
+        "supports_vision": true
    },
    "anthropic.claude-3-haiku-20240307-v1:0": {
        "max_tokens": 4096,
@@ -1939,7 +1959,8 @@
        "output_cost_per_token": 0.00000125,
        "litellm_provider": "bedrock",
        "mode": "chat",
-        "supports_function_calling": true
+        "supports_function_calling": true,
+        "supports_vision": true
    },
    "anthropic.claude-3-opus-20240229-v1:0": {
        "max_tokens": 4096,
@@ -1949,7 +1970,8 @@
        "output_cost_per_token": 0.000075,
        "litellm_provider": "bedrock",
        "mode": "chat",
-        "supports_function_calling": true
+        "supports_function_calling": true,
+        "supports_vision": true
    },
    "anthropic.claude-v1": {
        "max_tokens": 8191,
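The `supports_vision` flags added above surface through litellm's model cost map once this JSON is loaded. The lookup below is a sketch only; the specific model key is just one of the entries touched in this diff, and the values depend on the cost map version actually installed:

```
import litellm

info = litellm.model_cost.get("claude-3-sonnet-20240229", {})
print(info.get("supports_vision"))            # True once this change ships
print(info.get("supports_function_calling"))  # already True before this change
```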
6 poetry.lock generated
@@ -1153,13 +1153,13 @@ typing = ["types-PyYAML", "types-requests", "types-simplejson", "types-toml", "t

[[package]]
name = "idna"
-version = "3.6"
+version = "3.7"
description = "Internationalized Domain Names in Applications (IDNA)"
optional = false
python-versions = ">=3.5"
files = [
-    {file = "idna-3.6-py3-none-any.whl", hash = "sha256:c05567e9c24a6b9faaa835c4821bad0590fbb9d5779e7caa6e1cc4978e7eb24f"},
-    {file = "idna-3.6.tar.gz", hash = "sha256:9ecdbbd083b06798ae1e86adcbfe8ab1479cf864e4ee30fe4e46a003d12491ca"},
+    {file = "idna-3.7-py3-none-any.whl", hash = "sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0"},
+    {file = "idna-3.7.tar.gz", hash = "sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc"},
]

[[package]]