Mirror of https://github.com/BerriAI/litellm.git, synced 2025-04-27 11:43:54 +00:00

Merge branch 'main' into litellm_azure_ai_openai_support

Commit: bda1ee16a9
34 changed files with 1805 additions and 180 deletions
@@ -62,6 +62,11 @@ COPY --from=builder /wheels/ /wheels/

RUN pip install *.whl /wheels/* --no-index --find-links=/wheels/ && rm -f *.whl && rm -rf /wheels

# Generate prisma client
ENV PRISMA_BINARY_CACHE_DIR=/app/prisma
RUN mkdir -p /.cache
RUN chmod -R 777 /.cache
RUN pip install nodejs-bin
RUN pip install prisma
RUN prisma generate

RUN chmod +x entrypoint.sh

@@ -62,6 +62,11 @@ RUN pip install PyJWT --no-cache-dir

RUN chmod +x build_admin_ui.sh && ./build_admin_ui.sh

# Generate prisma client
ENV PRISMA_BINARY_CACHE_DIR=/app/prisma
RUN mkdir -p /.cache
RUN chmod -R 777 /.cache
RUN pip install nodejs-bin
RUN pip install prisma
RUN prisma generate

RUN chmod +x entrypoint.sh

@@ -225,22 +225,336 @@ print(response)

| claude-instant-1.2 | `completion('claude-instant-1.2', messages)` | `os.environ['ANTHROPIC_API_KEY']` |
| claude-instant-1 | `completion('claude-instant-1', messages)` | `os.environ['ANTHROPIC_API_KEY']` |

## **Prompt Caching**

Use Anthropic Prompt Caching.

[Relevant Anthropic API Docs](https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching)

### Caching - Large Context Caching

This example demonstrates basic Prompt Caching usage, caching the full text of the legal agreement as a prefix while keeping the user instruction uncached.

<Tabs>
<TabItem value="sdk" label="LiteLLM SDK">

```python
import litellm

response = await litellm.acompletion(
    model="anthropic/claude-3-5-sonnet-20240620",
    messages=[
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": "You are an AI assistant tasked with analyzing legal documents.",
                },
                {
                    "type": "text",
                    "text": "Here is the full text of a complex legal agreement",
                    "cache_control": {"type": "ephemeral"},
                },
            ],
        },
        {
            "role": "user",
            "content": "what are the key terms and conditions in this agreement?",
        },
    ],
    extra_headers={
        "anthropic-version": "2023-06-01",
        "anthropic-beta": "prompt-caching-2024-07-31",
    },
)
```

</TabItem>
<TabItem value="proxy" label="LiteLLM Proxy">

:::info

LiteLLM Proxy is OpenAI compatible

This is an example using the OpenAI Python SDK sending a request to LiteLLM Proxy

Assuming you have a model=`anthropic/claude-3-5-sonnet-20240620` on the [litellm proxy config.yaml](#usage-with-litellm-proxy)

:::

```python
import openai

client = openai.AsyncOpenAI(
    api_key="anything",             # litellm proxy api key
    base_url="http://0.0.0.0:4000"  # litellm proxy base url
)

response = await client.chat.completions.create(
    model="anthropic/claude-3-5-sonnet-20240620",
    messages=[
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": "You are an AI assistant tasked with analyzing legal documents.",
                },
                {
                    "type": "text",
                    "text": "Here is the full text of a complex legal agreement",
                    "cache_control": {"type": "ephemeral"},
                },
            ],
        },
        {
            "role": "user",
            "content": "what are the key terms and conditions in this agreement?",
        },
    ],
    extra_headers={
        "anthropic-version": "2023-06-01",
        "anthropic-beta": "prompt-caching-2024-07-31",
    },
)
```

</TabItem>
</Tabs>
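
When prompt caching is active, LiteLLM surfaces Anthropic's cache token counts on the response `usage` object (see the `cache_creation_input_tokens` / `cache_read_input_tokens` handling added in the Anthropic handler further down in this diff, and the assertions in the new test file). A minimal sketch for checking whether a request created or reused a cache entry, assuming `ANTHROPIC_API_KEY` is configured and using the same message layout as above:

```python
import asyncio
import litellm


async def check_cache_usage():
    response = await litellm.acompletion(
        model="anthropic/claude-3-5-sonnet-20240620",
        messages=[
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": "Here is the full text of a complex legal agreement",
                        "cache_control": {"type": "ephemeral"},
                    }
                ],
            },
            {"role": "user", "content": "What are the key terms and conditions?"},
        ],
        extra_headers={
            "anthropic-version": "2023-06-01",
            "anthropic-beta": "prompt-caching-2024-07-31",
        },
    )
    usage = response.usage
    # Either a new cache entry was written, or an existing one was read,
    # depending on the Anthropic-side cache TTL.
    print("cache_creation_input_tokens:", getattr(usage, "cache_creation_input_tokens", 0))
    print("cache_read_input_tokens:", getattr(usage, "cache_read_input_tokens", 0))


asyncio.run(check_cache_usage())
```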

### Caching - Tools definitions

In this example, we demonstrate caching tool definitions.

The `cache_control` parameter is placed on the final tool.

<Tabs>
<TabItem value="sdk" label="LiteLLM SDK">

```python
import litellm

response = await litellm.acompletion(
    model="anthropic/claude-3-5-sonnet-20240620",
    messages=[{"role": "user", "content": "What's the weather like in Boston today?"}],
    tools=[
        {
            "type": "function",
            "function": {
                "name": "get_current_weather",
                "description": "Get the current weather in a given location",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "location": {
                            "type": "string",
                            "description": "The city and state, e.g. San Francisco, CA",
                        },
                        "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
                    },
                    "required": ["location"],
                },
                "cache_control": {"type": "ephemeral"}
            },
        }
    ],
    extra_headers={
        "anthropic-version": "2023-06-01",
        "anthropic-beta": "prompt-caching-2024-07-31",
    },
)
```

</TabItem>
<TabItem value="proxy" label="LiteLLM Proxy">

:::info

LiteLLM Proxy is OpenAI compatible

This is an example using the OpenAI Python SDK sending a request to LiteLLM Proxy

Assuming you have a model=`anthropic/claude-3-5-sonnet-20240620` on the [litellm proxy config.yaml](#usage-with-litellm-proxy)

:::

```python
import openai

client = openai.AsyncOpenAI(
    api_key="anything",             # litellm proxy api key
    base_url="http://0.0.0.0:4000"  # litellm proxy base url
)

response = await client.chat.completions.create(
    model="anthropic/claude-3-5-sonnet-20240620",
    messages=[{"role": "user", "content": "What's the weather like in Boston today?"}],
    tools=[
        {
            "type": "function",
            "function": {
                "name": "get_current_weather",
                "description": "Get the current weather in a given location",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "location": {
                            "type": "string",
                            "description": "The city and state, e.g. San Francisco, CA",
                        },
                        "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
                    },
                    "required": ["location"],
                },
                "cache_control": {"type": "ephemeral"}
            },
        }
    ],
    extra_headers={
        "anthropic-version": "2023-06-01",
        "anthropic-beta": "prompt-caching-2024-07-31",
    },
)
```

</TabItem>
</Tabs>
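
Under the hood, the `AnthropicChatCompletion` change further down in this diff renames the OpenAI-style `parameters` key to Anthropic's `input_schema` and carries the `cache_control` field across. A rough, illustrative sketch of that mapping (not the exact library code path):

```python
from typing import Any, Dict


def to_anthropic_tool(openai_tool: Dict[str, Any]) -> Dict[str, Any]:
    """Illustrative only: mirror how an OpenAI function tool maps to Anthropic's schema."""
    fn = dict(openai_tool["function"])
    anthropic_tool = {
        "name": fn["name"],
        "description": fn.get("description", ""),
        # Anthropic expects the JSON schema under `input_schema`
        "input_schema": fn.pop("parameters"),
    }
    # Preserve the prompt-caching marker whether it was set on the tool or inside "function"
    if "cache_control" in fn:
        anthropic_tool["cache_control"] = fn["cache_control"]
    elif "cache_control" in openai_tool:
        anthropic_tool["cache_control"] = openai_tool["cache_control"]
    return anthropic_tool
```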

### Caching - Continuing Multi-Turn Convo

In this example, we demonstrate how to use Prompt Caching in a multi-turn conversation.

The `cache_control` parameter is placed on the system message to designate it as part of the static prefix.

The conversation history (previous messages) is included in the `messages` array. The final turn is marked with `cache_control` so it can be continued in follow-ups, and the second-to-last user message is also marked with `cache_control` so that this checkpoint can read from the previous cache.

<Tabs>
<TabItem value="sdk" label="LiteLLM SDK">

```python
import litellm

response = await litellm.acompletion(
    model="anthropic/claude-3-5-sonnet-20240620",
    messages=[
        # System Message
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": "Here is the full text of a complex legal agreement"
                    * 400,
                    "cache_control": {"type": "ephemeral"},
                }
            ],
        },
        # marked for caching with the cache_control parameter, so that this checkpoint can read from the previous cache.
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What are the key terms and conditions in this agreement?",
                    "cache_control": {"type": "ephemeral"},
                }
            ],
        },
        {
            "role": "assistant",
            "content": "Certainly! The key terms and conditions are the following: the contract is 1 year long for $10/mo",
        },
        # The final turn is marked with cache-control, for continuing in followups.
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What are the key terms and conditions in this agreement?",
                    "cache_control": {"type": "ephemeral"},
                }
            ],
        },
    ],
    extra_headers={
        "anthropic-version": "2023-06-01",
        "anthropic-beta": "prompt-caching-2024-07-31",
    },
)
```

</TabItem>
<TabItem value="proxy" label="LiteLLM Proxy">

:::info

LiteLLM Proxy is OpenAI compatible

This is an example using the OpenAI Python SDK sending a request to LiteLLM Proxy

Assuming you have a model=`anthropic/claude-3-5-sonnet-20240620` on the [litellm proxy config.yaml](#usage-with-litellm-proxy)

:::

```python
import openai

client = openai.AsyncOpenAI(
    api_key="anything",             # litellm proxy api key
    base_url="http://0.0.0.0:4000"  # litellm proxy base url
)

response = await client.chat.completions.create(
    model="anthropic/claude-3-5-sonnet-20240620",
    messages=[
        # System Message
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": "Here is the full text of a complex legal agreement"
                    * 400,
                    "cache_control": {"type": "ephemeral"},
                }
            ],
        },
        # marked for caching with the cache_control parameter, so that this checkpoint can read from the previous cache.
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What are the key terms and conditions in this agreement?",
                    "cache_control": {"type": "ephemeral"},
                }
            ],
        },
        {
            "role": "assistant",
            "content": "Certainly! The key terms and conditions are the following: the contract is 1 year long for $10/mo",
        },
        # The final turn is marked with cache-control, for continuing in followups.
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What are the key terms and conditions in this agreement?",
                    "cache_control": {"type": "ephemeral"},
                }
            ],
        },
    ],
    extra_headers={
        "anthropic-version": "2023-06-01",
        "anthropic-beta": "prompt-caching-2024-07-31",
    },
)
```

</TabItem>
</Tabs>
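
Following the pattern above, a small helper can keep the cache breakpoint on the latest user turn when you extend the conversation for the next request. This is only a sketch of the message-building logic under the assumptions of the example above (ephemeral markers, text content blocks); it is not part of LiteLLM itself:

```python
from typing import Any, Dict, List


def add_user_turn(messages: List[Dict[str, Any]], question: str) -> List[Dict[str, Any]]:
    """Append a new user turn marked as the cache breakpoint.

    The previously marked user turn keeps its `cache_control` entry, so the
    new request can read from the cache written for the earlier prefix.
    """
    new_turn = {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": question,
                "cache_control": {"type": "ephemeral"},
            }
        ],
    }
    return messages + [new_turn]


# Usage: extend the conversation from the example above before the next
# litellm.acompletion(...) call.
# messages = add_user_turn(messages, "Does the agreement auto-renew?")
```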

## **Function/Tool Calling**

:::info

@@ -429,6 +743,20 @@ resp = litellm.completion(

print(f"\nResponse: {resp}")
```

## **Passing Extra Headers to Anthropic API**

Pass `extra_headers: dict` to `litellm.completion`

```python
from litellm import completion

messages = [{"role": "user", "content": "What is Anthropic?"}]
response = completion(
    model="claude-3-5-sonnet-20240620",
    messages=messages,
    extra_headers={"anthropic-beta": "max-tokens-3-5-sonnet-2024-07-15"}
)
```

## Usage - "Assistant Pre-fill"

You can "put words in Claude's mouth" by including an `assistant` role message as the last item in the `messages` array.
@@ -17,7 +17,7 @@ model_list:

## Get Model Information - `/model/info`

Retrieve detailed information about each model listed in the `/model/info` endpoint, including descriptions from the `config.yaml` file, and additional model info (e.g. max tokens, cost per input token, etc.) pulled from the model_info you set and the [litellm model cost map](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json). Sensitive details like API keys are excluded for security purposes.

<Tabs
  defaultValue="curl"

@@ -35,14 +35,10 @@ curl -X GET "http://0.0.0.0:4000/model/info" \

## Add a New Model

Add a new model to the proxy via the `/model/new` API, to add models without restarting the proxy.

<Tabs>
<TabItem value="API">

```bash
curl -X POST "http://0.0.0.0:4000/model/new" \

@@ -50,6 +46,21 @@ curl -X POST "http://0.0.0.0:4000/model/new" \

     -H "Content-Type: application/json" \
     -d '{ "model_name": "azure-gpt-turbo", "litellm_params": {"model": "azure/gpt-3.5-turbo", "api_key": "os.environ/AZURE_API_KEY", "api_base": "my-azure-api-base"} }'
```

</TabItem>
<TabItem value="Yaml">

```yaml
model_list:
  - model_name: gpt-3.5-turbo              ### RECEIVED MODEL NAME ### `openai.chat.completions.create(model="gpt-3.5-turbo",...)`
    litellm_params:                        # all params accepted by litellm.completion() - https://github.com/BerriAI/litellm/blob/9b46ec05b02d36d6e4fb5c32321e51e7f56e4a6e/litellm/types/router.py#L297
      model: azure/gpt-turbo-small-eu      ### MODEL NAME sent to `litellm.completion()` ###
      api_base: https://my-endpoint-europe-berri-992.openai.azure.com/
      api_key: "os.environ/AZURE_API_KEY_EU" # does os.getenv("AZURE_API_KEY_EU")
      rpm: 6                               # [OPTIONAL] Rate limit for this deployment: in requests per minute (rpm)
    model_info:
      my_custom_key: my_custom_value       # additional model metadata
```

</TabItem>
</Tabs>
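
For programmatic use, the same `/model/new` call can be made from Python. A sketch using `requests`; the key below is a placeholder for your LiteLLM admin/master key, and the payload mirrors the cURL example above:

```python
import requests

LITELLM_PROXY = "http://0.0.0.0:4000"
ADMIN_KEY = "sk-1234"  # placeholder: your LiteLLM master/admin key

resp = requests.post(
    f"{LITELLM_PROXY}/model/new",
    headers={
        "Authorization": f"Bearer {ADMIN_KEY}",
        "Content-Type": "application/json",
    },
    json={
        "model_name": "azure-gpt-turbo",
        "litellm_params": {
            "model": "azure/gpt-3.5-turbo",
            "api_key": "os.environ/AZURE_API_KEY",
            "api_base": "my-azure-api-base",
        },
    },
    timeout=30,
)
resp.raise_for_status()
print(resp.json())
```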
@@ -86,3 +97,82 @@ Keep in mind that as both endpoints are in [BETA], you may need to visit the associated issue

- Add a New Model: [Issue #964](https://github.com/BerriAI/litellm/issues/964)

Feedback on the beta endpoints is valuable and helps improve the API for all users.


## Add Additional Model Information

If you want to attach a display name, description, or labels to a model, use `model_info:`

```yaml
model_list:
  - model_name: "gpt-4"
    litellm_params:
      model: "gpt-4"
      api_key: "os.environ/OPENAI_API_KEY"
    model_info: # 👈 KEY CHANGE
      my_custom_key: "my_custom_value"
```

### Usage

1. Add additional information to the model

```yaml
model_list:
  - model_name: "gpt-4"
    litellm_params:
      model: "gpt-4"
      api_key: "os.environ/OPENAI_API_KEY"
    model_info: # 👈 KEY CHANGE
      my_custom_key: "my_custom_value"
```

2. Call with `/model/info`

Use a key with access to the model `gpt-4`.

```bash
curl -L -X GET 'http://0.0.0.0:4000/v1/model/info' \
-H 'Authorization: Bearer LITELLM_KEY'
```

3. **Expected Response**

Returned `model_info = Your custom model_info + (if exists) LITELLM MODEL INFO`

[**How LiteLLM Model Info is found**](https://github.com/BerriAI/litellm/blob/9b46ec05b02d36d6e4fb5c32321e51e7f56e4a6e/litellm/proxy/proxy_server.py#L7460)

[Tell us how this can be improved!](https://github.com/BerriAI/litellm/issues)

```bash
{
    "data": [
        {
            "model_name": "gpt-4",
            "litellm_params": {
                "model": "gpt-4"
            },
            "model_info": {
                "id": "e889baacd17f591cce4c63639275ba5e8dc60765d6c553e6ee5a504b19e50ddc",
                "db_model": false,
                "my_custom_key": "my_custom_value", # 👈 CUSTOM INFO
                "key": "gpt-4", # 👈 KEY in LiteLLM MODEL INFO/COST MAP - https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json
                "max_tokens": 4096,
                "max_input_tokens": 8192,
                "max_output_tokens": 4096,
                "input_cost_per_token": 3e-05,
                "input_cost_per_character": null,
                "input_cost_per_token_above_128k_tokens": null,
                "output_cost_per_token": 6e-05,
                "output_cost_per_character": null,
                "output_cost_per_token_above_128k_tokens": null,
                "output_cost_per_character_above_128k_tokens": null,
                "output_vector_size": null,
                "litellm_provider": "openai",
                "mode": "chat"
            }
        },
    ]
}
```
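
The same lookup from Python, reading back the custom metadata. A sketch using `requests`; `LITELLM_KEY` is a placeholder for a key with access to `gpt-4`:

```python
import requests

resp = requests.get(
    "http://0.0.0.0:4000/v1/model/info",
    headers={"Authorization": "Bearer LITELLM_KEY"},  # placeholder key
    timeout=30,
)
resp.raise_for_status()

for model in resp.json()["data"]:
    info = model.get("model_info", {})
    # Custom keys from config.yaml are merged with LiteLLM's cost-map info
    print(model["model_name"], "->", info.get("my_custom_key"))
```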
@@ -72,15 +72,15 @@ http://localhost:4000/metrics

| Metric Name | Description |
|----------------------|--------------------------------------|
| `litellm_deployment_state` | The state of the deployment: 0 = healthy, 1 = partial outage, 2 = complete outage. |
| `litellm_remaining_requests_metric` | Track `x-ratelimit-remaining-requests` returned from LLM API Deployment |
| `litellm_remaining_tokens` | Track `x-ratelimit-remaining-tokens` returned from LLM API Deployment |
| `litellm_deployment_success_responses` | Total number of successful LLM API calls for deployment |
| `litellm_deployment_failure_responses` | Total number of failed LLM API calls for deployment |
| `litellm_deployment_total_requests` | Total number of LLM API calls for deployment - success + failure |
| `litellm_deployment_latency_per_output_token` | Latency per output token for deployment |
| `litellm_deployment_successful_fallbacks` | Number of successful fallback requests from primary model -> fallback model |
| `litellm_deployment_failed_fallbacks` | Number of failed fallback requests from primary model -> fallback model |
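
With the renamed metrics exposed at `/metrics`, they can be queried from a Prometheus server in the usual way. A sketch using the standard Prometheus HTTP API; the Prometheus URL is a placeholder, and this assumes the LiteLLM proxy is already being scraped:

```python
import requests

PROMETHEUS_URL = "http://localhost:9090"  # placeholder: your Prometheus server


def query(promql: str):
    r = requests.get(
        f"{PROMETHEUS_URL}/api/v1/query",
        params={"query": promql},
        timeout=10,
    )
    r.raise_for_status()
    return r.json()["data"]["result"]


# Deployments currently reporting a partial or complete outage
print(query("litellm_deployment_state > 0"))

# Successful fallbacks per primary -> fallback model pair
print(query("sum by (primary_model, fallback_model) (litellm_deployment_successful_fallbacks_total)"))
```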
@ -1,5 +1,6 @@
|
|||
import json
|
||||
import os
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
from typing import Any, Dict, List, Optional, TypedDict, Union
|
||||
|
||||
|
@ -29,6 +30,8 @@ class GCSBucketPayload(TypedDict):
|
|||
end_time: str
|
||||
response_cost: Optional[float]
|
||||
spend_log_metadata: str
|
||||
exception: Optional[str]
|
||||
log_event_type: Optional[str]
|
||||
|
||||
|
||||
class GCSBucketLogger(CustomLogger):
|
||||
|
@ -79,6 +82,7 @@ class GCSBucketLogger(CustomLogger):
|
|||
logging_payload: GCSBucketPayload = await self.get_gcs_payload(
|
||||
kwargs, response_obj, start_time_str, end_time_str
|
||||
)
|
||||
logging_payload["log_event_type"] = "successful_api_call"
|
||||
|
||||
json_logged_payload = json.dumps(logging_payload)
|
||||
|
||||
|
@ -103,7 +107,56 @@ class GCSBucketLogger(CustomLogger):
|
|||
verbose_logger.error("GCS Bucket logging error: %s", str(e))
|
||||
|
||||
async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
from litellm.proxy.proxy_server import premium_user
|
||||
|
||||
if premium_user is not True:
|
||||
raise ValueError(
|
||||
f"GCS Bucket logging is a premium feature. Please upgrade to use it. {CommonProxyErrors.not_premium_user.value}"
|
||||
)
|
||||
try:
|
||||
verbose_logger.debug(
|
||||
"GCS Logger: async_log_failure_event logging kwargs: %s, response_obj: %s",
|
||||
kwargs,
|
||||
response_obj,
|
||||
)
|
||||
|
||||
start_time_str = start_time.strftime("%Y-%m-%d %H:%M:%S")
|
||||
end_time_str = end_time.strftime("%Y-%m-%d %H:%M:%S")
|
||||
headers = await self.construct_request_headers()
|
||||
|
||||
logging_payload: GCSBucketPayload = await self.get_gcs_payload(
|
||||
kwargs, response_obj, start_time_str, end_time_str
|
||||
)
|
||||
logging_payload["log_event_type"] = "failed_api_call"
|
||||
|
||||
_litellm_params = kwargs.get("litellm_params") or {}
|
||||
metadata = _litellm_params.get("metadata") or {}
|
||||
|
||||
json_logged_payload = json.dumps(logging_payload)
|
||||
|
||||
# Get the current date
|
||||
current_date = datetime.now().strftime("%Y-%m-%d")
|
||||
|
||||
# Modify the object_name to include the date-based folder
|
||||
object_name = f"{current_date}/failure-{uuid.uuid4().hex}"
|
||||
|
||||
if "gcs_log_id" in metadata:
|
||||
object_name = metadata["gcs_log_id"]
|
||||
|
||||
response = await self.async_httpx_client.post(
|
||||
headers=headers,
|
||||
url=f"https://storage.googleapis.com/upload/storage/v1/b/{self.BUCKET_NAME}/o?uploadType=media&name={object_name}",
|
||||
data=json_logged_payload,
|
||||
)
|
||||
|
||||
if response.status_code != 200:
|
||||
verbose_logger.error("GCS Bucket logging error: %s", str(response.text))
|
||||
|
||||
verbose_logger.debug("GCS Bucket response %s", response)
|
||||
verbose_logger.debug("GCS Bucket status code %s", response.status_code)
|
||||
verbose_logger.debug("GCS Bucket response.text %s", response.text)
|
||||
except Exception as e:
|
||||
verbose_logger.error("GCS Bucket logging error: %s", str(e))
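
The failure handler above names the uploaded object `{date}/failure-{uuid}` unless a `gcs_log_id` is present in the request metadata. A sketch of pinning the object name from the caller side; this assumes, as elsewhere in LiteLLM, that the `metadata` kwarg ends up in `litellm_params["metadata"]`:

```python
import litellm

# Hypothetical call: pin a deterministic GCS object name for this request's failure log
response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hello"}],
    metadata={"gcs_log_id": "2024-08-14/my-trace-id"},  # read by the GCS logger above
)
```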
|
||||
|
||||
async def construct_request_headers(self) -> Dict[str, str]:
|
||||
from litellm import vertex_chat_completion
|
||||
|
@ -139,10 +192,19 @@ class GCSBucketLogger(CustomLogger):
|
|||
optional_params=kwargs.get("optional_params", None),
|
||||
)
|
||||
response_dict = {}
|
||||
if response_obj:
|
||||
response_dict = convert_litellm_response_object_to_dict(
|
||||
response_obj=response_obj
|
||||
)
|
||||
|
||||
exception_str = None
|
||||
|
||||
# Handle logging exception attributes
|
||||
if "exception" in kwargs:
|
||||
exception_str = kwargs.get("exception", "")
|
||||
if not isinstance(exception_str, str):
|
||||
exception_str = str(exception_str)
|
||||
|
||||
_spend_log_payload: SpendLogsPayload = get_logging_payload(
|
||||
kwargs=kwargs,
|
||||
response_obj=response_obj,
|
||||
|
@ -156,8 +218,10 @@ class GCSBucketLogger(CustomLogger):
|
|||
response_obj=response_dict,
|
||||
start_time=start_time,
|
||||
end_time=end_time,
|
||||
spend_log_metadata=_spend_log_payload["metadata"],
|
||||
spend_log_metadata=_spend_log_payload.get("metadata", ""),
|
||||
response_cost=kwargs.get("response_cost", None),
|
||||
exception=exception_str,
|
||||
log_event_type=None,
|
||||
)
|
||||
|
||||
return gcs_payload
|
||||
|
|
|
@ -141,42 +141,42 @@ class PrometheusLogger(CustomLogger):
|
|||
]
|
||||
|
||||
# Metric for deployment state
|
||||
self.deployment_state = Gauge(
|
||||
"deployment_state",
|
||||
self.litellm_deployment_state = Gauge(
|
||||
"litellm_deployment_state",
|
||||
"LLM Deployment Analytics - The state of the deployment: 0 = healthy, 1 = partial outage, 2 = complete outage",
|
||||
labelnames=_logged_llm_labels,
|
||||
)
|
||||
|
||||
self.llm_deployment_success_responses = Counter(
|
||||
name="llm_deployment_success_responses",
|
||||
self.litellm_deployment_success_responses = Counter(
|
||||
name="litellm_deployment_success_responses",
|
||||
documentation="LLM Deployment Analytics - Total number of successful LLM API calls via litellm",
|
||||
labelnames=_logged_llm_labels,
|
||||
)
|
||||
self.llm_deployment_failure_responses = Counter(
|
||||
name="llm_deployment_failure_responses",
|
||||
self.litellm_deployment_failure_responses = Counter(
|
||||
name="litellm_deployment_failure_responses",
|
||||
documentation="LLM Deployment Analytics - Total number of failed LLM API calls via litellm",
|
||||
labelnames=_logged_llm_labels,
|
||||
)
|
||||
self.llm_deployment_total_requests = Counter(
|
||||
name="llm_deployment_total_requests",
|
||||
self.litellm_deployment_total_requests = Counter(
|
||||
name="litellm_deployment_total_requests",
|
||||
documentation="LLM Deployment Analytics - Total number of LLM API calls via litellm - success + failure",
|
||||
labelnames=_logged_llm_labels,
|
||||
)
|
||||
|
||||
# Deployment Latency tracking
|
||||
self.llm_deployment_latency_per_output_token = Histogram(
|
||||
name="llm_deployment_latency_per_output_token",
|
||||
self.litellm_deployment_latency_per_output_token = Histogram(
|
||||
name="litellm_deployment_latency_per_output_token",
|
||||
documentation="LLM Deployment Analytics - Latency per output token",
|
||||
labelnames=_logged_llm_labels,
|
||||
)
|
||||
|
||||
self.llm_deployment_successful_fallbacks = Counter(
|
||||
"llm_deployment_successful_fallbacks",
|
||||
self.litellm_deployment_successful_fallbacks = Counter(
|
||||
"litellm_deployment_successful_fallbacks",
|
||||
"LLM Deployment Analytics - Number of successful fallback requests from primary model -> fallback model",
|
||||
["primary_model", "fallback_model"],
|
||||
)
|
||||
self.llm_deployment_failed_fallbacks = Counter(
|
||||
"llm_deployment_failed_fallbacks",
|
||||
self.litellm_deployment_failed_fallbacks = Counter(
|
||||
"litellm_deployment_failed_fallbacks",
|
||||
"LLM Deployment Analytics - Number of failed fallback requests from primary model -> fallback model",
|
||||
["primary_model", "fallback_model"],
|
||||
)
|
||||
|
@ -358,14 +358,14 @@ class PrometheusLogger(CustomLogger):
|
|||
api_provider=llm_provider,
|
||||
)
|
||||
|
||||
self.llm_deployment_failure_responses.labels(
|
||||
self.litellm_deployment_failure_responses.labels(
|
||||
litellm_model_name=litellm_model_name,
|
||||
model_id=model_id,
|
||||
api_base=api_base,
|
||||
api_provider=llm_provider,
|
||||
).inc()
|
||||
|
||||
self.llm_deployment_total_requests.labels(
|
||||
self.litellm_deployment_total_requests.labels(
|
||||
litellm_model_name=litellm_model_name,
|
||||
model_id=model_id,
|
||||
api_base=api_base,
|
||||
|
@ -438,14 +438,14 @@ class PrometheusLogger(CustomLogger):
|
|||
api_provider=llm_provider,
|
||||
)
|
||||
|
||||
self.llm_deployment_success_responses.labels(
|
||||
self.litellm_deployment_success_responses.labels(
|
||||
litellm_model_name=litellm_model_name,
|
||||
model_id=model_id,
|
||||
api_base=api_base,
|
||||
api_provider=llm_provider,
|
||||
).inc()
|
||||
|
||||
self.llm_deployment_total_requests.labels(
|
||||
self.litellm_deployment_total_requests.labels(
|
||||
litellm_model_name=litellm_model_name,
|
||||
model_id=model_id,
|
||||
api_base=api_base,
|
||||
|
@ -475,7 +475,7 @@ class PrometheusLogger(CustomLogger):
|
|||
latency_per_token = None
|
||||
if output_tokens is not None and output_tokens > 0:
|
||||
latency_per_token = _latency_seconds / output_tokens
|
||||
self.llm_deployment_latency_per_output_token.labels(
|
||||
self.litellm_deployment_latency_per_output_token.labels(
|
||||
litellm_model_name=litellm_model_name,
|
||||
model_id=model_id,
|
||||
api_base=api_base,
|
||||
|
@ -497,7 +497,7 @@ class PrometheusLogger(CustomLogger):
|
|||
kwargs,
|
||||
)
|
||||
_new_model = kwargs.get("model")
|
||||
self.llm_deployment_successful_fallbacks.labels(
|
||||
self.litellm_deployment_successful_fallbacks.labels(
|
||||
primary_model=original_model_group, fallback_model=_new_model
|
||||
).inc()
|
||||
|
||||
|
@ -508,11 +508,11 @@ class PrometheusLogger(CustomLogger):
|
|||
kwargs,
|
||||
)
|
||||
_new_model = kwargs.get("model")
|
||||
self.llm_deployment_failed_fallbacks.labels(
|
||||
self.litellm_deployment_failed_fallbacks.labels(
|
||||
primary_model=original_model_group, fallback_model=_new_model
|
||||
).inc()
|
||||
|
||||
def set_deployment_state(
|
||||
def set_litellm_deployment_state(
|
||||
self,
|
||||
state: int,
|
||||
litellm_model_name: str,
|
||||
|
@ -520,7 +520,7 @@ class PrometheusLogger(CustomLogger):
|
|||
api_base: str,
|
||||
api_provider: str,
|
||||
):
|
||||
self.deployment_state.labels(
|
||||
self.litellm_deployment_state.labels(
|
||||
litellm_model_name, model_id, api_base, api_provider
|
||||
).set(state)
|
||||
|
||||
|
@ -531,7 +531,7 @@ class PrometheusLogger(CustomLogger):
|
|||
api_base: str,
|
||||
api_provider: str,
|
||||
):
|
||||
self.set_deployment_state(
|
||||
self.set_litellm_deployment_state(
|
||||
0, litellm_model_name, model_id, api_base, api_provider
|
||||
)
|
||||
|
||||
|
@ -542,7 +542,7 @@ class PrometheusLogger(CustomLogger):
|
|||
api_base: str,
|
||||
api_provider: str,
|
||||
):
|
||||
self.set_deployment_state(
|
||||
self.set_litellm_deployment_state(
|
||||
1, litellm_model_name, model_id, api_base, api_provider
|
||||
)
|
||||
|
||||
|
@ -553,7 +553,7 @@ class PrometheusLogger(CustomLogger):
|
|||
api_base: str,
|
||||
api_provider: str,
|
||||
):
|
||||
self.set_deployment_state(
|
||||
self.set_litellm_deployment_state(
|
||||
2, litellm_model_name, model_id, api_base, api_provider
|
||||
)
|
||||
|
||||
|
|
|
@ -41,8 +41,8 @@ async def get_fallback_metric_from_prometheus():
|
|||
"""
|
||||
response_message = ""
|
||||
relevant_metrics = [
|
||||
"llm_deployment_successful_fallbacks_total",
|
||||
"llm_deployment_failed_fallbacks_total",
|
||||
"litellm_deployment_successful_fallbacks_total",
|
||||
"litellm_deployment_failed_fallbacks_total",
|
||||
]
|
||||
for metric in relevant_metrics:
|
||||
response_json = await get_metric_from_prometheus(
|
||||
|
|
|
@ -35,6 +35,7 @@ from litellm.types.llms.anthropic import (
|
|||
AnthropicResponseContentBlockText,
|
||||
AnthropicResponseContentBlockToolUse,
|
||||
AnthropicResponseUsageBlock,
|
||||
AnthropicSystemMessageContent,
|
||||
ContentBlockDelta,
|
||||
ContentBlockStart,
|
||||
ContentBlockStop,
|
||||
|
@ -759,6 +760,7 @@ class AnthropicChatCompletion(BaseLLM):
|
|||
## CALCULATING USAGE
|
||||
prompt_tokens = completion_response["usage"]["input_tokens"]
|
||||
completion_tokens = completion_response["usage"]["output_tokens"]
|
||||
_usage = completion_response["usage"]
|
||||
total_tokens = prompt_tokens + completion_tokens
|
||||
|
||||
model_response.created = int(time.time())
|
||||
|
@ -768,6 +770,11 @@ class AnthropicChatCompletion(BaseLLM):
|
|||
completion_tokens=completion_tokens,
|
||||
total_tokens=total_tokens,
|
||||
)
|
||||
|
||||
if "cache_creation_input_tokens" in _usage:
|
||||
usage["cache_creation_input_tokens"] = _usage["cache_creation_input_tokens"]
|
||||
if "cache_read_input_tokens" in _usage:
|
||||
usage["cache_read_input_tokens"] = _usage["cache_read_input_tokens"]
|
||||
setattr(model_response, "usage", usage) # type: ignore
|
||||
return model_response
|
||||
|
||||
|
@ -901,6 +908,7 @@ class AnthropicChatCompletion(BaseLLM):
|
|||
# Separate system prompt from rest of message
|
||||
system_prompt_indices = []
|
||||
system_prompt = ""
|
||||
anthropic_system_message_list = None
|
||||
for idx, message in enumerate(messages):
|
||||
if message["role"] == "system":
|
||||
valid_content: bool = False
|
||||
|
@ -908,8 +916,23 @@ class AnthropicChatCompletion(BaseLLM):
|
|||
system_prompt += message["content"]
|
||||
valid_content = True
|
||||
elif isinstance(message["content"], list):
|
||||
for content in message["content"]:
|
||||
system_prompt += content.get("text", "")
|
||||
for _content in message["content"]:
|
||||
anthropic_system_message_content = (
|
||||
AnthropicSystemMessageContent(
|
||||
type=_content.get("type"),
|
||||
text=_content.get("text"),
|
||||
)
|
||||
)
|
||||
if "cache_control" in _content:
|
||||
anthropic_system_message_content["cache_control"] = (
|
||||
_content["cache_control"]
|
||||
)
|
||||
|
||||
if anthropic_system_message_list is None:
|
||||
anthropic_system_message_list = []
|
||||
anthropic_system_message_list.append(
|
||||
anthropic_system_message_content
|
||||
)
|
||||
valid_content = True
|
||||
|
||||
if valid_content:
|
||||
|
@ -919,6 +942,10 @@ class AnthropicChatCompletion(BaseLLM):
|
|||
messages.pop(idx)
|
||||
if len(system_prompt) > 0:
|
||||
optional_params["system"] = system_prompt
|
||||
|
||||
# Handling anthropic API Prompt Caching
|
||||
if anthropic_system_message_list is not None:
|
||||
optional_params["system"] = anthropic_system_message_list
|
||||
# Format rest of message according to anthropic guidelines
|
||||
try:
|
||||
messages = prompt_factory(
|
||||
|
@ -954,6 +981,8 @@ class AnthropicChatCompletion(BaseLLM):
|
|||
else: # assume openai tool call
|
||||
new_tool = tool["function"]
|
||||
new_tool["input_schema"] = new_tool.pop("parameters") # rename key
|
||||
if "cache_control" in tool:
|
||||
new_tool["cache_control"] = tool["cache_control"]
|
||||
anthropic_tools.append(new_tool)
|
||||
|
||||
optional_params["tools"] = anthropic_tools
|
||||
|
|
|
@ -356,6 +356,7 @@ def ollama_completion_stream(url, api_key, data, logging_obj):
|
|||
"json": data,
|
||||
"method": "POST",
|
||||
"timeout": litellm.request_timeout,
|
||||
"follow_redirects": True
|
||||
}
|
||||
if api_key is not None:
|
||||
_request["headers"] = {"Authorization": "Bearer {}".format(api_key)}
|
||||
|
|
|
@ -1224,6 +1224,19 @@ def convert_to_anthropic_tool_invoke(
|
|||
return anthropic_tool_invoke
|
||||
|
||||
|
||||
def add_cache_control_to_content(
|
||||
anthropic_content_element: Union[
|
||||
dict, AnthropicMessagesImageParam, AnthropicMessagesTextParam
|
||||
],
|
||||
orignal_content_element: dict,
|
||||
):
|
||||
if "cache_control" in orignal_content_element:
|
||||
anthropic_content_element["cache_control"] = orignal_content_element[
|
||||
"cache_control"
|
||||
]
|
||||
return anthropic_content_element
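
A quick illustration of what the helper above does to a single content block. Standalone and illustrative only; it assumes `add_cache_control_to_content` (defined above) is in scope, and the real call sites are in `anthropic_messages_pt` below:

```python
# Illustrative only: copy a cache_control marker from an OpenAI-format content
# element onto the converted Anthropic-format element.
openai_element = {
    "type": "text",
    "text": "Here is the full text of a complex legal agreement",
    "cache_control": {"type": "ephemeral"},
}
anthropic_element = {"type": "text", "text": openai_element["text"]}

anthropic_element = add_cache_control_to_content(
    anthropic_content_element=anthropic_element,
    orignal_content_element=openai_element,
)
assert anthropic_element["cache_control"] == {"type": "ephemeral"}
```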
|
||||
|
||||
|
||||
def anthropic_messages_pt(
|
||||
messages: list,
|
||||
model: str,
|
||||
|
@ -1264,8 +1277,8 @@ def anthropic_messages_pt(
|
|||
image_chunk = convert_to_anthropic_image_obj(
|
||||
m["image_url"]["url"]
|
||||
)
|
||||
user_content.append(
|
||||
AnthropicMessagesImageParam(
|
||||
|
||||
_anthropic_content_element = AnthropicMessagesImageParam(
|
||||
type="image",
|
||||
source=AnthropicImageParamSource(
|
||||
type="base64",
|
||||
|
@ -1273,9 +1286,22 @@ def anthropic_messages_pt(
|
|||
data=image_chunk["data"],
|
||||
),
|
||||
)
|
||||
|
||||
anthropic_content_element = add_cache_control_to_content(
|
||||
anthropic_content_element=_anthropic_content_element,
|
||||
orignal_content_element=m,
|
||||
)
|
||||
user_content.append(anthropic_content_element)
|
||||
elif m.get("type", "") == "text":
|
||||
user_content.append({"type": "text", "text": m["text"]})
|
||||
_anthropic_text_content_element = {
|
||||
"type": "text",
|
||||
"text": m["text"],
|
||||
}
|
||||
anthropic_content_element = add_cache_control_to_content(
|
||||
anthropic_content_element=_anthropic_text_content_element,
|
||||
orignal_content_element=m,
|
||||
)
|
||||
user_content.append(anthropic_content_element)
|
||||
elif (
|
||||
messages[msg_i]["role"] == "tool"
|
||||
or messages[msg_i]["role"] == "function"
|
||||
|
@ -1306,6 +1332,10 @@ def anthropic_messages_pt(
|
|||
anthropic_message = AnthropicMessagesTextParam(
|
||||
type="text", text=m.get("text")
|
||||
)
|
||||
anthropic_message = add_cache_control_to_content(
|
||||
anthropic_content_element=anthropic_message,
|
||||
orignal_content_element=m,
|
||||
)
|
||||
assistant_content.append(anthropic_message)
|
||||
elif (
|
||||
"content" in messages[msg_i]
|
||||
|
@ -1313,9 +1343,17 @@ def anthropic_messages_pt(
|
|||
and len(messages[msg_i]["content"])
|
||||
> 0 # don't pass empty text blocks. anthropic api raises errors.
|
||||
):
|
||||
assistant_content.append(
|
||||
{"type": "text", "text": messages[msg_i]["content"]}
|
||||
|
||||
_anthropic_text_content_element = {
|
||||
"type": "text",
|
||||
"text": messages[msg_i]["content"],
|
||||
}
|
||||
|
||||
anthropic_content_element = add_cache_control_to_content(
|
||||
anthropic_content_element=_anthropic_text_content_element,
|
||||
orignal_content_element=messages[msg_i],
|
||||
)
|
||||
assistant_content.append(anthropic_content_element)
|
||||
|
||||
if messages[msg_i].get(
|
||||
"tool_calls", []
|
||||
|
@ -1701,12 +1739,14 @@ def cohere_messages_pt_v2(
|
|||
assistant_tool_calls: List[ToolCallObject] = []
|
||||
## MERGE CONSECUTIVE ASSISTANT CONTENT ##
|
||||
while msg_i < len(messages) and messages[msg_i]["role"] == "assistant":
|
||||
assistant_text = (
|
||||
messages[msg_i].get("content") or ""
|
||||
) # either string or none
|
||||
if assistant_text:
|
||||
assistant_content += assistant_text
|
||||
|
||||
if isinstance(messages[msg_i]["content"], list):
|
||||
for m in messages[msg_i]["content"]:
|
||||
if m.get("type", "") == "text":
|
||||
assistant_content += m["text"]
|
||||
elif messages[msg_i].get("content") is not None and isinstance(
|
||||
messages[msg_i]["content"], str
|
||||
):
|
||||
assistant_content += messages[msg_i]["content"]
|
||||
if messages[msg_i].get(
|
||||
"tool_calls", []
|
||||
): # support assistant tool invoke conversion
|
||||
|
|
|
@ -2074,7 +2074,8 @@
|
|||
"litellm_provider": "vertex_ai-anthropic_models",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
"supports_vision": true
|
||||
"supports_vision": true,
|
||||
"supports_assistant_prefill": true
|
||||
},
|
||||
"vertex_ai/claude-3-5-sonnet@20240620": {
|
||||
"max_tokens": 4096,
|
||||
|
@ -2085,7 +2086,8 @@
|
|||
"litellm_provider": "vertex_ai-anthropic_models",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
"supports_vision": true
|
||||
"supports_vision": true,
|
||||
"supports_assistant_prefill": true
|
||||
},
|
||||
"vertex_ai/claude-3-haiku@20240307": {
|
||||
"max_tokens": 4096,
|
||||
|
@ -2096,7 +2098,8 @@
|
|||
"litellm_provider": "vertex_ai-anthropic_models",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
"supports_vision": true
|
||||
"supports_vision": true,
|
||||
"supports_assistant_prefill": true
|
||||
},
|
||||
"vertex_ai/claude-3-opus@20240229": {
|
||||
"max_tokens": 4096,
|
||||
|
@ -2107,7 +2110,8 @@
|
|||
"litellm_provider": "vertex_ai-anthropic_models",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
"supports_vision": true
|
||||
"supports_vision": true,
|
||||
"supports_assistant_prefill": true
|
||||
},
|
||||
"vertex_ai/meta/llama3-405b-instruct-maas": {
|
||||
"max_tokens": 32000,
|
||||
|
@ -4531,6 +4535,69 @@
|
|||
"litellm_provider": "perplexity",
|
||||
"mode": "chat"
|
||||
},
|
||||
"perplexity/llama-3.1-70b-instruct": {
|
||||
"max_tokens": 131072,
|
||||
"max_input_tokens": 131072,
|
||||
"max_output_tokens": 131072,
|
||||
"input_cost_per_token": 0.000001,
|
||||
"output_cost_per_token": 0.000001,
|
||||
"litellm_provider": "perplexity",
|
||||
"mode": "chat"
|
||||
},
|
||||
"perplexity/llama-3.1-8b-instruct": {
|
||||
"max_tokens": 131072,
|
||||
"max_input_tokens": 131072,
|
||||
"max_output_tokens": 131072,
|
||||
"input_cost_per_token": 0.0000002,
|
||||
"output_cost_per_token": 0.0000002,
|
||||
"litellm_provider": "perplexity",
|
||||
"mode": "chat"
|
||||
},
|
||||
"perplexity/llama-3.1-sonar-huge-128k-online": {
|
||||
"max_tokens": 127072,
|
||||
"max_input_tokens": 127072,
|
||||
"max_output_tokens": 127072,
|
||||
"input_cost_per_token": 0.000005,
|
||||
"output_cost_per_token": 0.000005,
|
||||
"litellm_provider": "perplexity",
|
||||
"mode": "chat"
|
||||
},
|
||||
"perplexity/llama-3.1-sonar-large-128k-online": {
|
||||
"max_tokens": 127072,
|
||||
"max_input_tokens": 127072,
|
||||
"max_output_tokens": 127072,
|
||||
"input_cost_per_token": 0.000001,
|
||||
"output_cost_per_token": 0.000001,
|
||||
"litellm_provider": "perplexity",
|
||||
"mode": "chat"
|
||||
},
|
||||
"perplexity/llama-3.1-sonar-large-128k-chat": {
|
||||
"max_tokens": 131072,
|
||||
"max_input_tokens": 131072,
|
||||
"max_output_tokens": 131072,
|
||||
"input_cost_per_token": 0.000001,
|
||||
"output_cost_per_token": 0.000001,
|
||||
"litellm_provider": "perplexity",
|
||||
"mode": "chat"
|
||||
},
|
||||
"perplexity/llama-3.1-sonar-small-128k-chat": {
|
||||
"max_tokens": 131072,
|
||||
"max_input_tokens": 131072,
|
||||
"max_output_tokens": 131072,
|
||||
"input_cost_per_token": 0.0000002,
|
||||
"output_cost_per_token": 0.0000002,
|
||||
"litellm_provider": "perplexity",
|
||||
"mode": "chat"
|
||||
},
|
||||
"perplexity/llama-3.1-sonar-small-128k-online": {
|
||||
"max_tokens": 127072,
|
||||
"max_input_tokens": 127072,
|
||||
"max_output_tokens": 127072,
|
||||
"input_cost_per_token": 0.0000002,
|
||||
"output_cost_per_token": 0.0000002,
|
||||
"litellm_provider": "perplexity",
|
||||
"mode": "chat"
|
||||
},
|
||||
"perplexity/pplx-7b-chat": {
|
||||
"max_tokens": 8192,
|
||||
"max_input_tokens": 8192,
|
||||
|
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@ -1,7 +1,6 @@
|
|||
model_list:
|
||||
- model_name: azure-embedding-model
|
||||
- model_name: "gpt-4"
|
||||
litellm_params:
|
||||
model: azure/azure-embedding-model
|
||||
api_base: os.environ/AZURE_API_BASE
|
||||
api_key: os.environ/AZURE_API_KEY
|
||||
api_version: "2023-07-01-preview"
|
||||
model: "gpt-4"
|
||||
model_info:
|
||||
my_custom_key: "my_custom_value"
|
|
@ -85,6 +85,8 @@ def _get_bearer_token(
|
|||
):
|
||||
if api_key.startswith("Bearer "): # ensure Bearer token passed in
|
||||
api_key = api_key.replace("Bearer ", "") # extract the token
|
||||
elif api_key.startswith("Basic "):
|
||||
api_key = api_key.replace("Basic ", "") # handle langfuse input
|
||||
else:
|
||||
api_key = ""
|
||||
return api_key
|
||||
|
@ -138,7 +140,6 @@ async def user_api_key_auth(
|
|||
pass_through_endpoints: Optional[List[dict]] = general_settings.get(
|
||||
"pass_through_endpoints", None
|
||||
)
|
||||
|
||||
if isinstance(api_key, str):
|
||||
passed_in_key = api_key
|
||||
api_key = _get_bearer_token(api_key=api_key)
|
||||
|
@ -367,6 +368,40 @@ async def user_api_key_auth(
|
|||
parent_otel_span=parent_otel_span,
|
||||
)
|
||||
#### ELSE ####
|
||||
|
||||
## CHECK PASS-THROUGH ENDPOINTS ##
|
||||
if pass_through_endpoints is not None:
|
||||
for endpoint in pass_through_endpoints:
|
||||
if endpoint.get("path", "") == route:
|
||||
## IF AUTH DISABLED
|
||||
if endpoint.get("auth") is not True:
|
||||
return UserAPIKeyAuth()
|
||||
## IF AUTH ENABLED
|
||||
### IF CUSTOM PARSER REQUIRED
|
||||
if (
|
||||
endpoint.get("custom_auth_parser") is not None
|
||||
and endpoint.get("custom_auth_parser") == "langfuse"
|
||||
):
|
||||
"""
|
||||
- langfuse returns {'Authorization': 'Basic YW55dGhpbmc6YW55dGhpbmc'}
|
||||
- check the langfuse public key if it contains the litellm api key
|
||||
"""
|
||||
import base64
|
||||
|
||||
api_key = api_key.replace("Basic ", "").strip()
|
||||
decoded_bytes = base64.b64decode(api_key)
|
||||
decoded_str = decoded_bytes.decode("utf-8")
|
||||
api_key = decoded_str.split(":")[0]
|
||||
else:
|
||||
headers = endpoint.get("headers", None)
|
||||
if headers is not None:
|
||||
header_key = headers.get("litellm_user_api_key", "")
|
||||
if (
|
||||
isinstance(request.headers, dict)
|
||||
and request.headers.get(key=header_key) is not None
|
||||
):
|
||||
api_key = request.headers.get(key=header_key)
|
||||
|
||||
if master_key is None:
|
||||
if isinstance(api_key, str):
|
||||
return UserAPIKeyAuth(
|
||||
|
@ -533,7 +568,11 @@ async def user_api_key_auth(
|
|||
if isinstance(
|
||||
api_key, str
|
||||
): # if generated token, make sure it starts with sk-.
|
||||
assert api_key.startswith("sk-") # prevent token hashes from being used
|
||||
assert api_key.startswith(
|
||||
"sk-"
|
||||
), "LiteLLM Virtual Key expected. Received={}, expected to start with 'sk-'.".format(
|
||||
api_key
|
||||
) # prevent token hashes from being used
|
||||
else:
|
||||
verbose_logger.warning(
|
||||
"litellm.proxy.proxy_server.user_api_key_auth(): Warning - Key={} is not a string.".format(
|
||||
|
|
|
@ -5,7 +5,12 @@ from fastapi import Request
|
|||
|
||||
import litellm
|
||||
from litellm._logging import verbose_logger, verbose_proxy_logger
|
||||
from litellm.proxy._types import CommonProxyErrors, TeamCallbackMetadata, UserAPIKeyAuth
|
||||
from litellm.proxy._types import (
|
||||
AddTeamCallback,
|
||||
CommonProxyErrors,
|
||||
TeamCallbackMetadata,
|
||||
UserAPIKeyAuth,
|
||||
)
|
||||
from litellm.types.utils import SupportedCacheControls
|
||||
|
||||
if TYPE_CHECKING:
|
||||
|
@ -59,6 +64,42 @@ def safe_add_api_version_from_query_params(data: dict, request: Request):
|
|||
verbose_logger.error("error checking api version in query params: %s", str(e))
|
||||
|
||||
|
||||
def convert_key_logging_metadata_to_callback(
|
||||
data: AddTeamCallback, team_callback_settings_obj: Optional[TeamCallbackMetadata]
|
||||
) -> TeamCallbackMetadata:
|
||||
if team_callback_settings_obj is None:
|
||||
team_callback_settings_obj = TeamCallbackMetadata()
|
||||
if data.callback_type == "success":
|
||||
if team_callback_settings_obj.success_callback is None:
|
||||
team_callback_settings_obj.success_callback = []
|
||||
|
||||
if data.callback_name not in team_callback_settings_obj.success_callback:
|
||||
team_callback_settings_obj.success_callback.append(data.callback_name)
|
||||
elif data.callback_type == "failure":
|
||||
if team_callback_settings_obj.failure_callback is None:
|
||||
team_callback_settings_obj.failure_callback = []
|
||||
|
||||
if data.callback_name not in team_callback_settings_obj.failure_callback:
|
||||
team_callback_settings_obj.failure_callback.append(data.callback_name)
|
||||
elif data.callback_type == "success_and_failure":
|
||||
if team_callback_settings_obj.success_callback is None:
|
||||
team_callback_settings_obj.success_callback = []
|
||||
if team_callback_settings_obj.failure_callback is None:
|
||||
team_callback_settings_obj.failure_callback = []
|
||||
if data.callback_name not in team_callback_settings_obj.success_callback:
|
||||
team_callback_settings_obj.success_callback.append(data.callback_name)
|
||||
|
||||
if data.callback_name not in team_callback_settings_obj.failure_callback:
|
||||
team_callback_settings_obj.failure_callback.append(data.callback_name)
|
||||
|
||||
for var, value in data.callback_vars.items():
|
||||
if team_callback_settings_obj.callback_vars is None:
|
||||
team_callback_settings_obj.callback_vars = {}
|
||||
team_callback_settings_obj.callback_vars[var] = litellm.get_secret(value)
|
||||
|
||||
return team_callback_settings_obj
|
||||
|
||||
|
||||
async def add_litellm_data_to_request(
|
||||
data: dict,
|
||||
request: Request,
|
||||
|
@ -224,6 +265,7 @@ async def add_litellm_data_to_request(
|
|||
} # add the team-specific configs to the completion call
|
||||
|
||||
# Team Callbacks controls
|
||||
callback_settings_obj: Optional[TeamCallbackMetadata] = None
|
||||
if user_api_key_dict.team_metadata is not None:
|
||||
team_metadata = user_api_key_dict.team_metadata
|
||||
if "callback_settings" in team_metadata:
|
||||
|
@ -241,6 +283,18 @@ async def add_litellm_data_to_request(
|
|||
}
|
||||
}
|
||||
"""
|
||||
elif (
|
||||
user_api_key_dict.metadata is not None
|
||||
and "logging" in user_api_key_dict.metadata
|
||||
):
|
||||
for item in user_api_key_dict.metadata["logging"]:
|
||||
|
||||
callback_settings_obj = convert_key_logging_metadata_to_callback(
|
||||
data=AddTeamCallback(**item),
|
||||
team_callback_settings_obj=callback_settings_obj,
|
||||
)
|
||||
|
||||
if callback_settings_obj is not None:
|
||||
data["success_callback"] = callback_settings_obj.success_callback
|
||||
data["failure_callback"] = callback_settings_obj.failure_callback
|
||||
|
||||
|
|
|
@ -309,7 +309,7 @@ async def pass_through_request(
|
|||
json=_parsed_body,
|
||||
)
|
||||
|
||||
if response.status_code != 200:
|
||||
if response.status_code >= 300:
|
||||
raise HTTPException(status_code=response.status_code, detail=response.text)
|
||||
|
||||
content = await response.aread()
|
||||
|
|
|
@ -39,7 +39,4 @@ general_settings:
|
|||
|
||||
litellm_settings:
|
||||
fallbacks: [{"gemini-1.5-pro-001": ["gpt-4o"]}]
|
||||
success_callback: ["langfuse", "prometheus"]
|
||||
langfuse_default_tags: ["cache_hit", "cache_key", "proxy_base_url", "user_api_key_alias", "user_api_key_user_id", "user_api_key_user_email", "user_api_key_team_alias", "semantic-similarity", "proxy_base_url"]
|
||||
failure_callback: ["prometheus"]
|
||||
cache: True
|
||||
callbacks: ["gcs_bucket"]
|
||||
|
|
|
@ -21,6 +21,8 @@ def get_logging_payload(
|
|||
|
||||
if kwargs is None:
|
||||
kwargs = {}
|
||||
if response_obj is None:
|
||||
response_obj = {}
|
||||
# standardize this function to be used across, s3, dynamoDB, langfuse logging
|
||||
litellm_params = kwargs.get("litellm_params", {})
|
||||
metadata = (
|
||||
|
|
|
@ -190,7 +190,7 @@ def set_client(litellm_router_instance: LitellmRouter, model: dict):
|
|||
if azure_ad_token.startswith("oidc/"):
|
||||
azure_ad_token = get_azure_ad_token_from_oidc(azure_ad_token)
|
||||
if api_version is None:
|
||||
api_version = litellm.AZURE_DEFAULT_API_VERSION
|
||||
api_version = os.getenv("AZURE_API_VERSION", litellm.AZURE_DEFAULT_API_VERSION)
|
||||
|
||||
if "gateway.ai.cloudflare.com" in api_base:
|
||||
if not api_base.endswith("/"):
|
||||
|
|
litellm/tests/test_anthropic_prompt_caching.py (new file, 321 lines)
|
@ -0,0 +1,321 @@
|
|||
import json
|
||||
import os
|
||||
import sys
|
||||
import traceback
|
||||
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
import io
|
||||
import os
|
||||
|
||||
sys.path.insert(
|
||||
0, os.path.abspath("../..")
|
||||
) # Adds the parent directory to the system path
|
||||
|
||||
import os
|
||||
from unittest.mock import AsyncMock, MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
import litellm
|
||||
from litellm import RateLimitError, Timeout, completion, completion_cost, embedding
|
||||
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
|
||||
from litellm.llms.prompt_templates.factory import anthropic_messages_pt
|
||||
|
||||
# litellm.num_retries =3
|
||||
litellm.cache = None
|
||||
litellm.success_callback = []
|
||||
user_message = "Write a short poem about the sky"
|
||||
messages = [{"content": user_message, "role": "user"}]
|
||||
|
||||
|
||||
def logger_fn(user_model_dict):
|
||||
print(f"user_model_dict: {user_model_dict}")
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def reset_callbacks():
|
||||
print("\npytest fixture - resetting callbacks")
|
||||
litellm.success_callback = []
|
||||
litellm._async_success_callback = []
|
||||
litellm.failure_callback = []
|
||||
litellm.callbacks = []
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_litellm_anthropic_prompt_caching_tools():
|
||||
# Arrange: Set up the MagicMock for the httpx.AsyncClient
|
||||
mock_response = AsyncMock()
|
||||
|
||||
def return_val():
|
||||
return {
|
||||
"id": "msg_01XFDUDYJgAACzvnptvVoYEL",
|
||||
"type": "message",
|
||||
"role": "assistant",
|
||||
"content": [{"type": "text", "text": "Hello!"}],
|
||||
"model": "claude-3-5-sonnet-20240620",
|
||||
"stop_reason": "end_turn",
|
||||
"stop_sequence": None,
|
||||
"usage": {"input_tokens": 12, "output_tokens": 6},
|
||||
}
|
||||
|
||||
mock_response.json = return_val
|
||||
|
||||
litellm.set_verbose = True
|
||||
with patch(
|
||||
"litellm.llms.custom_httpx.http_handler.AsyncHTTPHandler.post",
|
||||
return_value=mock_response,
|
||||
) as mock_post:
|
||||
# Act: Call the litellm.acompletion function
|
||||
response = await litellm.acompletion(
|
||||
api_key="mock_api_key",
|
||||
model="anthropic/claude-3-5-sonnet-20240620",
|
||||
messages=[
|
||||
{"role": "user", "content": "What's the weather like in Boston today?"}
|
||||
],
|
||||
tools=[
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_current_weather",
|
||||
"description": "Get the current weather in a given location",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "The city and state, e.g. San Francisco, CA",
|
||||
},
|
||||
"unit": {
|
||||
"type": "string",
|
||||
"enum": ["celsius", "fahrenheit"],
|
||||
},
|
||||
},
|
||||
"required": ["location"],
|
||||
},
|
||||
"cache_control": {"type": "ephemeral"},
|
||||
},
|
||||
}
|
||||
],
|
||||
extra_headers={
|
||||
"anthropic-version": "2023-06-01",
|
||||
"anthropic-beta": "prompt-caching-2024-07-31",
|
||||
},
|
||||
)
|
||||
|
||||
# Print what was called on the mock
|
||||
print("call args=", mock_post.call_args)
|
||||
|
||||
expected_url = "https://api.anthropic.com/v1/messages"
|
||||
expected_headers = {
|
||||
"accept": "application/json",
|
||||
"content-type": "application/json",
|
||||
"anthropic-version": "2023-06-01",
|
||||
"anthropic-beta": "prompt-caching-2024-07-31",
|
||||
"x-api-key": "mock_api_key",
|
||||
}
|
||||
|
||||
expected_json = {
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What's the weather like in Boston today?",
|
||||
}
|
||||
],
|
||||
}
|
||||
],
|
||||
"tools": [
|
||||
{
|
||||
"name": "get_current_weather",
|
||||
"description": "Get the current weather in a given location",
|
||||
"cache_control": {"type": "ephemeral"},
|
||||
"input_schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "The city and state, e.g. San Francisco, CA",
|
||||
},
|
||||
"unit": {
|
||||
"type": "string",
|
||||
"enum": ["celsius", "fahrenheit"],
|
||||
},
|
||||
},
|
||||
"required": ["location"],
|
||||
},
|
||||
}
|
||||
],
|
||||
"max_tokens": 4096,
|
||||
"model": "claude-3-5-sonnet-20240620",
|
||||
}
|
||||
|
||||
mock_post.assert_called_once_with(
|
||||
expected_url, json=expected_json, headers=expected_headers, timeout=600.0
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio()
async def test_anthropic_api_prompt_caching_basic():
    litellm.set_verbose = True
    response = await litellm.acompletion(
        model="anthropic/claude-3-5-sonnet-20240620",
        messages=[
            # System Message
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": "Here is the full text of a complex legal agreement"
                        * 400,
                        "cache_control": {"type": "ephemeral"},
                    }
                ],
            },
            # marked for caching with the cache_control parameter, so that this checkpoint can read from the previous cache.
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "What are the key terms and conditions in this agreement?",
                        "cache_control": {"type": "ephemeral"},
                    }
                ],
            },
            {
                "role": "assistant",
                "content": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo",
            },
            # The final turn is marked with cache-control, for continuing in followups.
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "What are the key terms and conditions in this agreement?",
                        "cache_control": {"type": "ephemeral"},
                    }
                ],
            },
        ],
        temperature=0.2,
        max_tokens=10,
        extra_headers={
            "anthropic-version": "2023-06-01",
            "anthropic-beta": "prompt-caching-2024-07-31",
        },
    )

    print("response=", response)

    assert "cache_read_input_tokens" in response.usage
    assert "cache_creation_input_tokens" in response.usage

    # Assert either a cache entry was created or cache was read - changes depending on the anthropic api ttl
    assert (response.usage.cache_read_input_tokens > 0) or (
        response.usage.cache_creation_input_tokens > 0
    )


@pytest.mark.asyncio
async def test_litellm_anthropic_prompt_caching_system():
    # https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching#prompt-caching-examples
    # Large Context Caching Example
    mock_response = AsyncMock()

    def return_val():
        return {
            "id": "msg_01XFDUDYJgAACzvnptvVoYEL",
            "type": "message",
            "role": "assistant",
            "content": [{"type": "text", "text": "Hello!"}],
            "model": "claude-3-5-sonnet-20240620",
            "stop_reason": "end_turn",
            "stop_sequence": None,
            "usage": {"input_tokens": 12, "output_tokens": 6},
        }

    mock_response.json = return_val

    litellm.set_verbose = True
    with patch(
        "litellm.llms.custom_httpx.http_handler.AsyncHTTPHandler.post",
        return_value=mock_response,
    ) as mock_post:
        # Act: Call the litellm.acompletion function
        response = await litellm.acompletion(
            api_key="mock_api_key",
            model="anthropic/claude-3-5-sonnet-20240620",
            messages=[
                {
                    "role": "system",
                    "content": [
                        {
                            "type": "text",
                            "text": "You are an AI assistant tasked with analyzing legal documents.",
                        },
                        {
                            "type": "text",
                            "text": "Here is the full text of a complex legal agreement",
                            "cache_control": {"type": "ephemeral"},
                        },
                    ],
                },
                {
                    "role": "user",
                    "content": "what are the key terms and conditions in this agreement?",
                },
            ],
            extra_headers={
                "anthropic-version": "2023-06-01",
                "anthropic-beta": "prompt-caching-2024-07-31",
            },
        )

        # Print what was called on the mock
        print("call args=", mock_post.call_args)

        expected_url = "https://api.anthropic.com/v1/messages"
        expected_headers = {
            "accept": "application/json",
            "content-type": "application/json",
            "anthropic-version": "2023-06-01",
            "anthropic-beta": "prompt-caching-2024-07-31",
            "x-api-key": "mock_api_key",
        }

        expected_json = {
            "system": [
                {
                    "type": "text",
                    "text": "You are an AI assistant tasked with analyzing legal documents.",
                },
                {
                    "type": "text",
                    "text": "Here is the full text of a complex legal agreement",
                    "cache_control": {"type": "ephemeral"},
                },
            ],
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "what are the key terms and conditions in this agreement?",
                        }
                    ],
                }
            ],
            "max_tokens": 4096,
            "model": "claude-3-5-sonnet-20240620",
        }

        mock_post.assert_called_once_with(
            expected_url, json=expected_json, headers=expected_headers, timeout=600.0
        )

@@ -14,7 +14,7 @@ sys.path.insert(
) # Adds the parent directory to the system path

import os
from unittest.mock import MagicMock, patch
from unittest.mock import AsyncMock, MagicMock, patch

import pytest


@@ -3474,7 +3474,6 @@ def response_format_tests(response: litellm.ModelResponse):
    assert isinstance(response.usage.total_tokens, int)  # type: ignore


@pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.parametrize(
    "model",
    [

@@ -3488,6 +3487,7 @@ def response_format_tests(response: litellm.ModelResponse):
        "cohere.command-text-v14",
    ],
)
@pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.asyncio
async def test_completion_bedrock_httpx_models(sync_mode, model):
    litellm.set_verbose = True

@@ -3730,19 +3730,21 @@ def test_completion_anyscale_api():
# test_completion_anyscale_api()


@pytest.mark.skip(reason="flaky test, times out frequently")
# @pytest.mark.skip(reason="flaky test, times out frequently")
def test_completion_cohere():
    try:
        # litellm.set_verbose=True
        messages = [
            {"role": "system", "content": "You're a good bot"},
            {"role": "assistant", "content": [{"text": "2", "type": "text"}]},
            {"role": "assistant", "content": [{"text": "3", "type": "text"}]},
            {
                "role": "user",
                "content": "Hey",
            },
        ]
        response = completion(
            model="command-nightly",
            model="command-r",
            messages=messages,
        )
        print(response)

@@ -1,23 +1,27 @@
# What is this?
|
||||
## Test to make sure function call response always works with json.loads() -> no extra parsing required. Relevant issue - https://github.com/BerriAI/litellm/issues/2654
|
||||
import sys, os
|
||||
import os
|
||||
import sys
|
||||
import traceback
|
||||
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
import os, io
|
||||
import io
|
||||
import os
|
||||
|
||||
sys.path.insert(
|
||||
0, os.path.abspath("../..")
|
||||
) # Adds the parent directory to the system path
|
||||
import pytest
|
||||
import litellm
|
||||
import json
|
||||
import warnings
|
||||
|
||||
from litellm import completion
|
||||
from typing import List
|
||||
|
||||
import pytest
|
||||
|
||||
import litellm
|
||||
from litellm import completion
|
||||
|
||||
|
||||
# Just a stub to keep the sample code simple
|
||||
class Trade:
|
||||
|
@@ -78,6 +82,7 @@ def trade(model_name: str) -> List[Trade]:
},
|
||||
}
|
||||
|
||||
try:
|
||||
response = completion(
|
||||
model_name,
|
||||
[
|
||||
|
@@ -129,7 +134,8 @@ def trade(model_name: str) -> List[Trade]:
"function": {"name": tool_spec["function"]["name"]}, # type: ignore
|
||||
},
|
||||
)
|
||||
|
||||
except litellm.InternalServerError:
|
||||
pass
|
||||
calls = response.choices[0].message.tool_calls
|
||||
trades = [trade for call in calls for trade in parse_call(call)]
|
||||
return trades
|
||||
|
|
|
@@ -147,6 +147,117 @@ async def test_basic_gcs_logger():
|
||||
assert gcs_payload["response_cost"] > 0.0
|
||||
|
||||
assert gcs_payload["log_event_type"] == "successful_api_call"
|
||||
gcs_payload["spend_log_metadata"] = json.loads(gcs_payload["spend_log_metadata"])
|
||||
|
||||
assert (
|
||||
gcs_payload["spend_log_metadata"]["user_api_key"]
|
||||
== "88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b"
|
||||
)
|
||||
assert (
|
||||
gcs_payload["spend_log_metadata"]["user_api_key_user_id"]
|
||||
== "116544810872468347480"
|
||||
)
|
||||
|
||||
# Delete Object from GCS
|
||||
print("deleting object from GCS")
|
||||
await gcs_logger.delete_gcs_object(object_name=object_name)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_basic_gcs_logger_failure():
|
||||
load_vertex_ai_credentials()
|
||||
gcs_logger = GCSBucketLogger()
|
||||
print("GCSBucketLogger", gcs_logger)
|
||||
|
||||
gcs_log_id = f"failure-test-{uuid.uuid4().hex}"
|
||||
|
||||
litellm.callbacks = [gcs_logger]
|
||||
|
||||
try:
|
||||
response = await litellm.acompletion(
|
||||
model="gpt-3.5-turbo",
|
||||
temperature=0.7,
|
||||
messages=[{"role": "user", "content": "This is a test"}],
|
||||
max_tokens=10,
|
||||
user="ishaan-2",
|
||||
mock_response=litellm.BadRequestError(
|
||||
model="gpt-3.5-turbo",
|
||||
message="Error: 400: Bad Request: Invalid API key, please check your API key and try again.",
|
||||
llm_provider="openai",
|
||||
),
|
||||
metadata={
|
||||
"gcs_log_id": gcs_log_id,
|
||||
"tags": ["model-anthropic-claude-v2.1", "app-ishaan-prod"],
|
||||
"user_api_key": "88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",
|
||||
"user_api_key_alias": None,
|
||||
"user_api_end_user_max_budget": None,
|
||||
"litellm_api_version": "0.0.0",
|
||||
"global_max_parallel_requests": None,
|
||||
"user_api_key_user_id": "116544810872468347480",
|
||||
"user_api_key_org_id": None,
|
||||
"user_api_key_team_id": None,
|
||||
"user_api_key_team_alias": None,
|
||||
"user_api_key_metadata": {},
|
||||
"requester_ip_address": "127.0.0.1",
|
||||
"spend_logs_metadata": {"hello": "world"},
|
||||
"headers": {
|
||||
"content-type": "application/json",
|
||||
"user-agent": "PostmanRuntime/7.32.3",
|
||||
"accept": "*/*",
|
||||
"postman-token": "92300061-eeaa-423b-a420-0b44896ecdc4",
|
||||
"host": "localhost:4000",
|
||||
"accept-encoding": "gzip, deflate, br",
|
||||
"connection": "keep-alive",
|
||||
"content-length": "163",
|
||||
},
|
||||
"endpoint": "http://localhost:4000/chat/completions",
|
||||
"model_group": "gpt-3.5-turbo",
|
||||
"deployment": "azure/chatgpt-v-2",
|
||||
"model_info": {
|
||||
"id": "4bad40a1eb6bebd1682800f16f44b9f06c52a6703444c99c7f9f32e9de3693b4",
|
||||
"db_model": False,
|
||||
},
|
||||
"api_base": "https://openai-gpt-4-test-v-1.openai.azure.com/",
|
||||
"caching_groups": None,
|
||||
"raw_request": "\n\nPOST Request Sent from LiteLLM:\ncurl -X POST \\\nhttps://openai-gpt-4-test-v-1.openai.azure.com//openai/ \\\n-H 'Authorization: *****' \\\n-d '{'model': 'chatgpt-v-2', 'messages': [{'role': 'system', 'content': 'you are a helpful assistant.\\n'}, {'role': 'user', 'content': 'bom dia'}], 'stream': False, 'max_tokens': 10, 'user': '116544810872468347480', 'extra_body': {}}'\n",
|
||||
},
|
||||
)
|
||||
except:
|
||||
pass
|
||||
|
||||
await asyncio.sleep(5)
|
||||
|
||||
# Get the current date
|
||||
# Get the current date
|
||||
current_date = datetime.now().strftime("%Y-%m-%d")
|
||||
|
||||
# Modify the object_name to include the date-based folder
|
||||
object_name = gcs_log_id
|
||||
|
||||
print("object_name", object_name)
|
||||
|
||||
# Check if object landed on GCS
|
||||
object_from_gcs = await gcs_logger.download_gcs_object(object_name=object_name)
|
||||
print("object from gcs=", object_from_gcs)
|
||||
# convert object_from_gcs from bytes to DICT
|
||||
parsed_data = json.loads(object_from_gcs)
|
||||
print("object_from_gcs as dict", parsed_data)
|
||||
|
||||
print("type of object_from_gcs", type(parsed_data))
|
||||
|
||||
gcs_payload = GCSBucketPayload(**parsed_data)
|
||||
|
||||
print("gcs_payload", gcs_payload)
|
||||
|
||||
assert gcs_payload["request_kwargs"]["model"] == "gpt-3.5-turbo"
|
||||
assert gcs_payload["request_kwargs"]["messages"] == [
|
||||
{"role": "user", "content": "This is a test"}
|
||||
]
|
||||
|
||||
assert gcs_payload["response_cost"] == 0
|
||||
assert gcs_payload["log_event_type"] == "failed_api_call"
|
||||
|
||||
gcs_payload["spend_log_metadata"] = json.loads(gcs_payload["spend_log_metadata"])
|
||||
|
||||
assert (
|
||||
|
|
|
@@ -1,5 +1,6 @@
import os
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
from fastapi import FastAPI
|
||||
|
@@ -30,6 +31,7 @@ def client():
async def test_pass_through_endpoint(client, monkeypatch):
|
||||
# Mock the httpx.AsyncClient.request method
|
||||
monkeypatch.setattr("httpx.AsyncClient.request", mock_request)
|
||||
import litellm
|
||||
|
||||
# Define a pass-through endpoint
|
||||
pass_through_endpoints = [
|
||||
|
@@ -42,6 +44,11 @@ async def test_pass_through_endpoint(client, monkeypatch):
|
||||
# Initialize the pass-through endpoint
|
||||
await initialize_pass_through_endpoints(pass_through_endpoints)
|
||||
general_settings: Optional[dict] = (
|
||||
getattr(litellm.proxy.proxy_server, "general_settings", {}) or {}
|
||||
)
|
||||
general_settings.update({"pass_through_endpoints": pass_through_endpoints})
|
||||
setattr(litellm.proxy.proxy_server, "general_settings", general_settings)
|
||||
|
||||
# Make a request to the pass-through endpoint
|
||||
response = client.post("/test-endpoint", json={"prompt": "Hello, world!"})
|
||||
|
@@ -54,6 +61,7 @@ async def test_pass_through_endpoint(client, monkeypatch):
@pytest.mark.asyncio
|
||||
async def test_pass_through_endpoint_rerank(client):
|
||||
_cohere_api_key = os.environ.get("COHERE_API_KEY")
|
||||
import litellm
|
||||
|
||||
# Define a pass-through endpoint
|
||||
pass_through_endpoints = [
|
||||
|
@@ -66,6 +74,11 @@ async def test_pass_through_endpoint_rerank(client):
|
||||
# Initialize the pass-through endpoint
|
||||
await initialize_pass_through_endpoints(pass_through_endpoints)
|
||||
general_settings: Optional[dict] = (
|
||||
getattr(litellm.proxy.proxy_server, "general_settings", {}) or {}
|
||||
)
|
||||
general_settings.update({"pass_through_endpoints": pass_through_endpoints})
|
||||
setattr(litellm.proxy.proxy_server, "general_settings", general_settings)
|
||||
|
||||
_json_data = {
|
||||
"model": "rerank-english-v3.0",
|
||||
|
@@ -87,7 +100,7 @@ async def test_pass_through_endpoint_rerank(client):
|
||||
@pytest.mark.parametrize(
|
||||
"auth, rpm_limit, expected_error_code",
|
||||
[(True, 0, 429), (True, 1, 200), (False, 0, 401)],
|
||||
[(True, 0, 429), (True, 1, 200), (False, 0, 200)],
|
||||
)
|
||||
@pytest.mark.asyncio
|
||||
async def test_pass_through_endpoint_rpm_limit(auth, expected_error_code, rpm_limit):
|
||||
|
@@ -123,6 +136,11 @@ async def test_pass_through_endpoint_rpm_limit(auth, expected_error_code, rpm_li
|
||||
# Initialize the pass-through endpoint
|
||||
await initialize_pass_through_endpoints(pass_through_endpoints)
|
||||
general_settings: Optional[dict] = (
|
||||
getattr(litellm.proxy.proxy_server, "general_settings", {}) or {}
|
||||
)
|
||||
general_settings.update({"pass_through_endpoints": pass_through_endpoints})
|
||||
setattr(litellm.proxy.proxy_server, "general_settings", general_settings)
|
||||
|
||||
_json_data = {
|
||||
"model": "rerank-english-v3.0",
|
||||
|
@@ -146,6 +164,123 @@ async def test_pass_through_endpoint_rpm_limit(auth, expected_error_code, rpm_li
assert response.status_code == expected_error_code
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"auth, rpm_limit, expected_error_code",
|
||||
[(True, 0, 429), (True, 1, 207), (False, 0, 207)],
|
||||
)
|
||||
@pytest.mark.asyncio
|
||||
async def test_aaapass_through_endpoint_pass_through_keys_langfuse(
|
||||
auth, expected_error_code, rpm_limit
|
||||
):
|
||||
|
||||
client = TestClient(app)
|
||||
import litellm
|
||||
from litellm.proxy._types import UserAPIKeyAuth
|
||||
from litellm.proxy.proxy_server import ProxyLogging, hash_token, user_api_key_cache
|
||||
|
||||
# Store original values
|
||||
original_user_api_key_cache = getattr(
|
||||
litellm.proxy.proxy_server, "user_api_key_cache", None
|
||||
)
|
||||
original_master_key = getattr(litellm.proxy.proxy_server, "master_key", None)
|
||||
original_prisma_client = getattr(litellm.proxy.proxy_server, "prisma_client", None)
|
||||
original_proxy_logging_obj = getattr(
|
||||
litellm.proxy.proxy_server, "proxy_logging_obj", None
|
||||
)
|
||||
|
||||
try:
|
||||
|
||||
mock_api_key = "sk-my-test-key"
|
||||
cache_value = UserAPIKeyAuth(
|
||||
token=hash_token(mock_api_key), rpm_limit=rpm_limit
|
||||
)
|
||||
|
||||
_cohere_api_key = os.environ.get("COHERE_API_KEY")
|
||||
|
||||
user_api_key_cache.set_cache(key=hash_token(mock_api_key), value=cache_value)
|
||||
|
||||
proxy_logging_obj = ProxyLogging(user_api_key_cache=user_api_key_cache)
|
||||
proxy_logging_obj._init_litellm_callbacks()
|
||||
|
||||
setattr(litellm.proxy.proxy_server, "user_api_key_cache", user_api_key_cache)
|
||||
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
|
||||
setattr(litellm.proxy.proxy_server, "prisma_client", "FAKE-VAR")
|
||||
setattr(litellm.proxy.proxy_server, "proxy_logging_obj", proxy_logging_obj)
|
||||
|
||||
# Define a pass-through endpoint
|
||||
pass_through_endpoints = [
|
||||
{
|
||||
"path": "/api/public/ingestion",
|
||||
"target": "https://cloud.langfuse.com/api/public/ingestion",
|
||||
"auth": auth,
|
||||
"custom_auth_parser": "langfuse",
|
||||
"headers": {
|
||||
"LANGFUSE_PUBLIC_KEY": "os.environ/LANGFUSE_PUBLIC_KEY",
|
||||
"LANGFUSE_SECRET_KEY": "os.environ/LANGFUSE_SECRET_KEY",
|
||||
},
|
||||
}
|
||||
]
|
||||
|
||||
# Initialize the pass-through endpoint
|
||||
await initialize_pass_through_endpoints(pass_through_endpoints)
|
||||
general_settings: Optional[dict] = (
|
||||
getattr(litellm.proxy.proxy_server, "general_settings", {}) or {}
|
||||
)
|
||||
old_general_settings = general_settings
|
||||
general_settings.update({"pass_through_endpoints": pass_through_endpoints})
|
||||
setattr(litellm.proxy.proxy_server, "general_settings", general_settings)
|
||||
|
||||
_json_data = {
|
||||
"batch": [
|
||||
{
|
||||
"id": "80e2141f-0ca6-47b7-9c06-dde5e97de690",
|
||||
"type": "trace-create",
|
||||
"body": {
|
||||
"id": "0687af7b-4a75-4de8-a4f6-cba1cdc00865",
|
||||
"timestamp": "2024-08-14T02:38:56.092950Z",
|
||||
"name": "test-trace-litellm-proxy-passthrough",
|
||||
},
|
||||
"timestamp": "2024-08-14T02:38:56.093352Z",
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"batch_size": 1,
|
||||
"sdk_integration": "default",
|
||||
"sdk_name": "python",
|
||||
"sdk_version": "2.27.0",
|
||||
"public_key": "anything",
|
||||
},
|
||||
}
|
||||
|
||||
# Make a request to the pass-through endpoint
|
||||
response = client.post(
|
||||
"/api/public/ingestion",
|
||||
json=_json_data,
|
||||
headers={"Authorization": "Basic c2stbXktdGVzdC1rZXk6YW55dGhpbmc="},
|
||||
)
|
||||
|
||||
print("JSON response: ", _json_data)
|
||||
|
||||
print("RESPONSE RECEIVED - {}".format(response.text))
|
||||
|
||||
# Assert the response
|
||||
assert response.status_code == expected_error_code
|
||||
|
||||
setattr(litellm.proxy.proxy_server, "general_settings", old_general_settings)
|
||||
finally:
|
||||
# Reset to original values
|
||||
setattr(
|
||||
litellm.proxy.proxy_server,
|
||||
"user_api_key_cache",
|
||||
original_user_api_key_cache,
|
||||
)
|
||||
setattr(litellm.proxy.proxy_server, "master_key", original_master_key)
|
||||
setattr(litellm.proxy.proxy_server, "prisma_client", original_prisma_client)
|
||||
setattr(
|
||||
litellm.proxy.proxy_server, "proxy_logging_obj", original_proxy_logging_obj
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_pass_through_endpoint_anthropic(client):
|
||||
import litellm
|
||||
|
@@ -178,6 +313,11 @@ async def test_pass_through_endpoint_anthropic(client):
|
||||
# Initialize the pass-through endpoint
|
||||
await initialize_pass_through_endpoints(pass_through_endpoints)
|
||||
general_settings: Optional[dict] = (
|
||||
getattr(litellm.proxy.proxy_server, "general_settings", {}) or {}
|
||||
)
|
||||
general_settings.update({"pass_through_endpoints": pass_through_endpoints})
|
||||
setattr(litellm.proxy.proxy_server, "general_settings", general_settings)
|
||||
|
||||
_json_data = {
|
||||
"model": "gpt-3.5-turbo",
|
||||
|
|
|
@@ -76,6 +76,6 @@ async def test_async_prometheus_success_logging():
    print("metrics from prometheus", metrics)
    assert metrics["litellm_requests_metric_total"] == 1.0
    assert metrics["litellm_total_tokens_total"] == 30.0
    assert metrics["llm_deployment_success_responses_total"] == 1.0
    assert metrics["llm_deployment_total_requests_total"] == 1.0
    assert metrics["llm_deployment_latency_per_output_token_bucket"] == 1.0
    assert metrics["litellm_deployment_success_responses_total"] == 1.0
    assert metrics["litellm_deployment_total_requests_total"] == 1.0
    assert metrics["litellm_deployment_latency_per_output_token_bucket"] == 1.0

@@ -260,3 +260,56 @@ def test_anthropic_messages_tool_call():
        translated_messages[-1]["content"][0]["tool_use_id"]
        == "bc8cb4b6-88c4-4138-8993-3a9d9cd51656"
    )


def test_anthropic_cache_controls_pt():
    "see anthropic docs for this: https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching#continuing-a-multi-turn-conversation"
    messages = [
        # marked for caching with the cache_control parameter, so that this checkpoint can read from the previous cache.
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What are the key terms and conditions in this agreement?",
                    "cache_control": {"type": "ephemeral"},
                }
            ],
        },
        {
            "role": "assistant",
            "content": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo",
        },
        # The final turn is marked with cache-control, for continuing in followups.
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What are the key terms and conditions in this agreement?",
                    "cache_control": {"type": "ephemeral"},
                }
            ],
        },
        {
            "role": "assistant",
            "content": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo",
            "cache_control": {"type": "ephemeral"},
        },
    ]

    translated_messages = anthropic_messages_pt(
        messages, model="claude-3-5-sonnet-20240620", llm_provider="anthropic"
    )

    for i, msg in enumerate(translated_messages):
        if i == 0:
            assert msg["content"][0]["cache_control"] == {"type": "ephemeral"}
        elif i == 1:
            assert "cache_controls" not in msg["content"][0]
        elif i == 2:
            assert msg["content"][0]["cache_control"] == {"type": "ephemeral"}
        elif i == 3:
            assert msg["content"][0]["cache_control"] == {"type": "ephemeral"}

    print("translated_messages: ", translated_messages)

@@ -966,3 +966,203 @@ async def test_user_info_team_list(prisma_client):
pass
|
||||
|
||||
mock_client.assert_called()
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="Local test")
|
||||
@pytest.mark.asyncio
|
||||
async def test_add_callback_via_key(prisma_client):
|
||||
"""
|
||||
Test if callback specified in key, is used.
|
||||
"""
|
||||
global headers
|
||||
import json
|
||||
|
||||
from fastapi import HTTPException, Request, Response
|
||||
from starlette.datastructures import URL
|
||||
|
||||
from litellm.proxy.proxy_server import chat_completion
|
||||
|
||||
setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
|
||||
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
|
||||
await litellm.proxy.proxy_server.prisma_client.connect()
|
||||
|
||||
litellm.set_verbose = True
|
||||
|
||||
try:
|
||||
# Your test data
|
||||
test_data = {
|
||||
"model": "azure/chatgpt-v-2",
|
||||
"messages": [
|
||||
{"role": "user", "content": "write 1 sentence poem"},
|
||||
],
|
||||
"max_tokens": 10,
|
||||
"mock_response": "Hello world",
|
||||
"api_key": "my-fake-key",
|
||||
}
|
||||
|
||||
request = Request(scope={"type": "http", "method": "POST", "headers": {}})
|
||||
request._url = URL(url="/chat/completions")
|
||||
|
||||
json_bytes = json.dumps(test_data).encode("utf-8")
|
||||
|
||||
request._body = json_bytes
|
||||
|
||||
with patch.object(
|
||||
litellm.litellm_core_utils.litellm_logging,
|
||||
"LangFuseLogger",
|
||||
new=MagicMock(),
|
||||
) as mock_client:
|
||||
resp = await chat_completion(
|
||||
request=request,
|
||||
fastapi_response=Response(),
|
||||
user_api_key_dict=UserAPIKeyAuth(
|
||||
metadata={
|
||||
"logging": [
|
||||
{
|
||||
"callback_name": "langfuse", # 'otel', 'langfuse', 'lunary'
|
||||
"callback_type": "success", # set, if required by integration - future improvement, have logging tools work for success + failure by default
|
||||
"callback_vars": {
|
||||
"langfuse_public_key": "os.environ/LANGFUSE_PUBLIC_KEY",
|
||||
"langfuse_secret_key": "os.environ/LANGFUSE_SECRET_KEY",
|
||||
"langfuse_host": "https://us.cloud.langfuse.com",
|
||||
},
|
||||
}
|
||||
]
|
||||
}
|
||||
),
|
||||
)
|
||||
print(resp)
|
||||
mock_client.assert_called()
|
||||
mock_client.return_value.log_event.assert_called()
|
||||
args, kwargs = mock_client.return_value.log_event.call_args
|
||||
kwargs = kwargs["kwargs"]
|
||||
assert "user_api_key_metadata" in kwargs["litellm_params"]["metadata"]
|
||||
assert (
|
||||
"logging"
|
||||
in kwargs["litellm_params"]["metadata"]["user_api_key_metadata"]
|
||||
)
|
||||
checked_keys = False
|
||||
for item in kwargs["litellm_params"]["metadata"]["user_api_key_metadata"][
|
||||
"logging"
|
||||
]:
|
||||
for k, v in item["callback_vars"].items():
|
||||
print("k={}, v={}".format(k, v))
|
||||
if "key" in k:
|
||||
assert "os.environ" in v
|
||||
checked_keys = True
|
||||
|
||||
assert checked_keys
|
||||
except Exception as e:
|
||||
pytest.fail(f"LiteLLM Proxy test failed. Exception - {str(e)}")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_add_callback_via_key_litellm_pre_call_utils(prisma_client):
|
||||
import json
|
||||
|
||||
from fastapi import HTTPException, Request, Response
|
||||
from starlette.datastructures import URL
|
||||
|
||||
from litellm.proxy.litellm_pre_call_utils import add_litellm_data_to_request
|
||||
|
||||
setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
|
||||
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
|
||||
await litellm.proxy.proxy_server.prisma_client.connect()
|
||||
|
||||
proxy_config = getattr(litellm.proxy.proxy_server, "proxy_config")
|
||||
|
||||
request = Request(scope={"type": "http", "method": "POST", "headers": {}})
|
||||
request._url = URL(url="/chat/completions")
|
||||
|
||||
test_data = {
|
||||
"model": "azure/chatgpt-v-2",
|
||||
"messages": [
|
||||
{"role": "user", "content": "write 1 sentence poem"},
|
||||
],
|
||||
"max_tokens": 10,
|
||||
"mock_response": "Hello world",
|
||||
"api_key": "my-fake-key",
|
||||
}
|
||||
|
||||
json_bytes = json.dumps(test_data).encode("utf-8")
|
||||
|
||||
request._body = json_bytes
|
||||
|
||||
data = {
|
||||
"data": {
|
||||
"model": "azure/chatgpt-v-2",
|
||||
"messages": [{"role": "user", "content": "write 1 sentence poem"}],
|
||||
"max_tokens": 10,
|
||||
"mock_response": "Hello world",
|
||||
"api_key": "my-fake-key",
|
||||
},
|
||||
"request": request,
|
||||
"user_api_key_dict": UserAPIKeyAuth(
|
||||
token=None,
|
||||
key_name=None,
|
||||
key_alias=None,
|
||||
spend=0.0,
|
||||
max_budget=None,
|
||||
expires=None,
|
||||
models=[],
|
||||
aliases={},
|
||||
config={},
|
||||
user_id=None,
|
||||
team_id=None,
|
||||
max_parallel_requests=None,
|
||||
metadata={
|
||||
"logging": [
|
||||
{
|
||||
"callback_name": "langfuse",
|
||||
"callback_type": "success",
|
||||
"callback_vars": {
|
||||
"langfuse_public_key": "os.environ/LANGFUSE_PUBLIC_KEY",
|
||||
"langfuse_secret_key": "os.environ/LANGFUSE_SECRET_KEY",
|
||||
"langfuse_host": "https://us.cloud.langfuse.com",
|
||||
},
|
||||
}
|
||||
]
|
||||
},
|
||||
tpm_limit=None,
|
||||
rpm_limit=None,
|
||||
budget_duration=None,
|
||||
budget_reset_at=None,
|
||||
allowed_cache_controls=[],
|
||||
permissions={},
|
||||
model_spend={},
|
||||
model_max_budget={},
|
||||
soft_budget_cooldown=False,
|
||||
litellm_budget_table=None,
|
||||
org_id=None,
|
||||
team_spend=None,
|
||||
team_alias=None,
|
||||
team_tpm_limit=None,
|
||||
team_rpm_limit=None,
|
||||
team_max_budget=None,
|
||||
team_models=[],
|
||||
team_blocked=False,
|
||||
soft_budget=None,
|
||||
team_model_aliases=None,
|
||||
team_member_spend=None,
|
||||
team_metadata=None,
|
||||
end_user_id=None,
|
||||
end_user_tpm_limit=None,
|
||||
end_user_rpm_limit=None,
|
||||
end_user_max_budget=None,
|
||||
last_refreshed_at=None,
|
||||
api_key=None,
|
||||
user_role=None,
|
||||
allowed_model_region=None,
|
||||
parent_otel_span=None,
|
||||
),
|
||||
"proxy_config": proxy_config,
|
||||
"general_settings": {},
|
||||
"version": "0.0.0",
|
||||
}
|
||||
|
||||
new_data = await add_litellm_data_to_request(**data)
|
||||
|
||||
assert "success_callback" in new_data
|
||||
assert new_data["success_callback"] == ["langfuse"]
|
||||
assert "langfuse_public_key" in new_data
|
||||
assert "langfuse_secret_key" in new_data
|
||||
|
|
|
@@ -15,9 +15,10 @@ class AnthropicMessagesTool(TypedDict, total=False):
    input_schema: Required[dict]


class AnthropicMessagesTextParam(TypedDict):
class AnthropicMessagesTextParam(TypedDict, total=False):
    type: Literal["text"]
    text: str
    cache_control: Optional[dict]


class AnthropicMessagesToolUseParam(TypedDict):

@@ -54,9 +55,10 @@ class AnthropicImageParamSource(TypedDict):
    data: str


class AnthropicMessagesImageParam(TypedDict):
class AnthropicMessagesImageParam(TypedDict, total=False):
    type: Literal["image"]
    source: AnthropicImageParamSource
    cache_control: Optional[dict]


class AnthropicMessagesToolResultContent(TypedDict):

@@ -92,6 +94,12 @@ class AnthropicMetadata(TypedDict, total=False):
    user_id: str


class AnthropicSystemMessageContent(TypedDict, total=False):
    type: str
    text: str
    cache_control: Optional[dict]


class AnthropicMessagesRequest(TypedDict, total=False):
    model: Required[str]
    messages: Required[

@@ -106,7 +114,7 @@ class AnthropicMessagesRequest(TypedDict, total=False):
    metadata: AnthropicMetadata
    stop_sequences: List[str]
    stream: bool
    system: str
    system: Union[str, List]
    temperature: float
    tool_choice: AnthropicMessagesToolChoice
    tools: List[AnthropicMessagesTool]


@@ -361,7 +361,7 @@ class ChatCompletionToolMessage(TypedDict):

class ChatCompletionSystemMessage(TypedDict, total=False):
    role: Required[Literal["system"]]
    content: Required[str]
    content: Required[Union[str, List]]
    name: str

@@ -2074,7 +2074,8 @@
"litellm_provider": "vertex_ai-anthropic_models",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
"supports_vision": true
|
||||
"supports_vision": true,
|
||||
"supports_assistant_prefill": true
|
||||
},
|
||||
"vertex_ai/claude-3-5-sonnet@20240620": {
|
||||
"max_tokens": 4096,
|
||||
|
@@ -2085,7 +2086,8 @@
"litellm_provider": "vertex_ai-anthropic_models",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
"supports_vision": true
|
||||
"supports_vision": true,
|
||||
"supports_assistant_prefill": true
|
||||
},
|
||||
"vertex_ai/claude-3-haiku@20240307": {
|
||||
"max_tokens": 4096,
|
||||
|
@@ -2096,7 +2098,8 @@
"litellm_provider": "vertex_ai-anthropic_models",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
"supports_vision": true
|
||||
"supports_vision": true,
|
||||
"supports_assistant_prefill": true
|
||||
},
|
||||
"vertex_ai/claude-3-opus@20240229": {
|
||||
"max_tokens": 4096,
|
||||
|
@@ -2107,7 +2110,8 @@
"litellm_provider": "vertex_ai-anthropic_models",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
"supports_vision": true
|
||||
"supports_vision": true,
|
||||
"supports_assistant_prefill": true
|
||||
},
|
||||
"vertex_ai/meta/llama3-405b-instruct-maas": {
|
||||
"max_tokens": 32000,
|
||||
|
@@ -4531,6 +4535,69 @@
"litellm_provider": "perplexity",
|
||||
"mode": "chat"
|
||||
},
|
||||
"perplexity/llama-3.1-70b-instruct": {
|
||||
"max_tokens": 131072,
|
||||
"max_input_tokens": 131072,
|
||||
"max_output_tokens": 131072,
|
||||
"input_cost_per_token": 0.000001,
|
||||
"output_cost_per_token": 0.000001,
|
||||
"litellm_provider": "perplexity",
|
||||
"mode": "chat"
|
||||
},
|
||||
"perplexity/llama-3.1-8b-instruct": {
|
||||
"max_tokens": 131072,
|
||||
"max_input_tokens": 131072,
|
||||
"max_output_tokens": 131072,
|
||||
"input_cost_per_token": 0.0000002,
|
||||
"output_cost_per_token": 0.0000002,
|
||||
"litellm_provider": "perplexity",
|
||||
"mode": "chat"
|
||||
},
|
||||
"perplexity/llama-3.1-sonar-huge-128k-online": {
|
||||
"max_tokens": 127072,
|
||||
"max_input_tokens": 127072,
|
||||
"max_output_tokens": 127072,
|
||||
"input_cost_per_token": 0.000005,
|
||||
"output_cost_per_token": 0.000005,
|
||||
"litellm_provider": "perplexity",
|
||||
"mode": "chat"
|
||||
},
|
||||
"perplexity/llama-3.1-sonar-large-128k-online": {
|
||||
"max_tokens": 127072,
|
||||
"max_input_tokens": 127072,
|
||||
"max_output_tokens": 127072,
|
||||
"input_cost_per_token": 0.000001,
|
||||
"output_cost_per_token": 0.000001,
|
||||
"litellm_provider": "perplexity",
|
||||
"mode": "chat"
|
||||
},
|
||||
"perplexity/llama-3.1-sonar-large-128k-chat": {
|
||||
"max_tokens": 131072,
|
||||
"max_input_tokens": 131072,
|
||||
"max_output_tokens": 131072,
|
||||
"input_cost_per_token": 0.000001,
|
||||
"output_cost_per_token": 0.000001,
|
||||
"litellm_provider": "perplexity",
|
||||
"mode": "chat"
|
||||
},
|
||||
"perplexity/llama-3.1-sonar-small-128k-chat": {
|
||||
"max_tokens": 131072,
|
||||
"max_input_tokens": 131072,
|
||||
"max_output_tokens": 131072,
|
||||
"input_cost_per_token": 0.0000002,
|
||||
"output_cost_per_token": 0.0000002,
|
||||
"litellm_provider": "perplexity",
|
||||
"mode": "chat"
|
||||
},
|
||||
"perplexity/llama-3.1-sonar-small-128k-online": {
|
||||
"max_tokens": 127072,
|
||||
"max_input_tokens": 127072,
|
||||
"max_output_tokens": 127072,
|
||||
"input_cost_per_token": 0.0000002,
|
||||
"output_cost_per_token": 0.0000002,
|
||||
"litellm_provider": "perplexity",
|
||||
"mode": "chat"
|
||||
},
|
||||
"perplexity/pplx-7b-chat": {
|
||||
"max_tokens": 8192,
|
||||
"max_input_tokens": 8192,
|
||||
|
|
|
@@ -1,6 +1,6 @@
[tool.poetry]
name = "litellm"
version = "1.43.10"
version = "1.43.13"
description = "Library to easily interface with LLM API providers"
authors = ["BerriAI"]
license = "MIT"

@@ -91,7 +91,7 @@ requires = ["poetry-core", "wheel"]
build-backend = "poetry.core.masonry.api"

[tool.commitizen]
version = "1.43.10"
version = "1.43.13"
version_files = [
    "pyproject.toml:^version"
]