Merge branch 'main' into litellm_azure_ai_openai_support

Krish Dholakia 2024-08-14 17:53:27 -07:00 committed by GitHub
commit bda1ee16a9
34 changed files with 1805 additions and 180 deletions


@ -62,6 +62,11 @@ COPY --from=builder /wheels/ /wheels/
RUN pip install *.whl /wheels/* --no-index --find-links=/wheels/ && rm -f *.whl && rm -rf /wheels
# Generate prisma client
ENV PRISMA_BINARY_CACHE_DIR=/app/prisma
RUN mkdir -p /.cache
RUN chmod -R 777 /.cache
RUN pip install nodejs-bin
RUN pip install prisma
RUN prisma generate
RUN chmod +x entrypoint.sh


@ -62,6 +62,11 @@ RUN pip install PyJWT --no-cache-dir
RUN chmod +x build_admin_ui.sh && ./build_admin_ui.sh
# Generate prisma client
ENV PRISMA_BINARY_CACHE_DIR=/app/prisma
RUN mkdir -p /.cache
RUN chmod -R 777 /.cache
RUN pip install nodejs-bin
RUN pip install prisma
RUN prisma generate
RUN chmod +x entrypoint.sh


@ -225,22 +225,336 @@ print(response)
| claude-instant-1.2 | `completion('claude-instant-1.2', messages)` | `os.environ['ANTHROPIC_API_KEY']` |
| claude-instant-1 | `completion('claude-instant-1', messages)` | `os.environ['ANTHROPIC_API_KEY']` |
## **Prompt Caching**
Use Anthropic Prompt Caching
[Relevant Anthropic API Docs](https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching)
### Caching - Large Context Caching
This example demonstrates basic Prompt Caching usage, caching the full text of the legal agreement as a prefix while keeping the user instruction uncached.
<Tabs>
<TabItem value="sdk" label="LiteLLM SDK">
```python
import litellm
response = await litellm.acompletion(
model="anthropic/claude-3-5-sonnet-20240620",
messages=[
{
"role": "system",
"content": [
{
"type": "text",
"text": "You are an AI assistant tasked with analyzing legal documents.",
},
{
"type": "text",
"text": "Here is the full text of a complex legal agreement",
"cache_control": {"type": "ephemeral"},
},
],
},
{
"role": "user",
"content": "what are the key terms and conditions in this agreement?",
},
],
extra_headers={
"anthropic-version": "2023-06-01",
"anthropic-beta": "prompt-caching-2024-07-31",
},
)
```
</TabItem>
<TabItem value="proxy" label="LiteLLM Proxy">
:::info
LiteLLM Proxy is OpenAI compatible
This is an example using the OpenAI Python SDK to send a request to the LiteLLM Proxy
Assuming you have a model=`anthropic/claude-3-5-sonnet-20240620` on the [litellm proxy config.yaml](#usage-with-litellm-proxy)
:::
```python
import openai
client = openai.AsyncOpenAI(
api_key="anything", # litellm proxy api key
base_url="http://0.0.0.0:4000" # litellm proxy base url
)
response = await client.chat.completions.create(
model="anthropic/claude-3-5-sonnet-20240620",
messages=[
{
"role": "system",
"content": [
{
"type": "text",
"text": "You are an AI assistant tasked with analyzing legal documents.",
},
{
"type": "text",
"text": "Here is the full text of a complex legal agreement",
"cache_control": {"type": "ephemeral"},
},
],
},
{
"role": "user",
"content": "what are the key terms and conditions in this agreement?",
},
],
extra_headers={
"anthropic-version": "2023-06-01",
"anthropic-beta": "prompt-caching-2024-07-31",
},
)
```
</TabItem>
</Tabs>
### Caching - Tools definitions
In this example, we demonstrate caching tool definitions.
The `cache_control` parameter is placed on the final tool.
<Tabs>
<TabItem value="sdk" label="LiteLLM SDK">
```python
import litellm
response = await litellm.acompletion(
model="anthropic/claude-3-5-sonnet-20240620",
messages = [{"role": "user", "content": "What's the weather like in Boston today?"}]
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location"],
},
"cache_control": {"type": "ephemeral"}
},
}
],
extra_headers={
"anthropic-version": "2023-06-01",
"anthropic-beta": "prompt-caching-2024-07-31",
},
)
```
</TabItem>
<TabItem value="proxy" label="LiteLLM Proxy">
:::info
LiteLLM Proxy is OpenAI compatible
This is an example using the OpenAI Python SDK to send a request to the LiteLLM Proxy
Assuming you have a model=`anthropic/claude-3-5-sonnet-20240620` on the [litellm proxy config.yaml](#usage-with-litellm-proxy)
:::
```python
import openai
client = openai.AsyncOpenAI(
api_key="anything", # litellm proxy api key
base_url="http://0.0.0.0:4000" # litellm proxy base url
)
response = await client.chat.completions.create(
model="anthropic/claude-3-5-sonnet-20240620",
messages = [{"role": "user", "content": "What's the weather like in Boston today?"}]
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location"],
},
"cache_control": {"type": "ephemeral"}
},
}
],
extra_headers={
"anthropic-version": "2023-06-01",
"anthropic-beta": "prompt-caching-2024-07-31",
},
)
```
</TabItem>
</Tabs>
### Caching - Continuing Multi-Turn Convo
In this example, we demonstrate how to use Prompt Caching in a multi-turn conversation.
The `cache_control` parameter is placed on the system message to designate it as part of the static prefix.
The conversation history (previous messages) is included in the messages array. The second-to-last user message is marked for caching with the `cache_control` parameter, so that this checkpoint can read from the previous cache. The final turn is also marked with `cache_control`, so follow-up requests can continue from this cache.
<Tabs>
<TabItem value="sdk" label="LiteLLM SDK">
```python
import litellm
response = await litellm.acompletion(
model="anthropic/claude-3-5-sonnet-20240620",
messages=[
# System Message
{
"role": "system",
"content": [
{
"type": "text",
"text": "Here is the full text of a complex legal agreement"
* 400,
"cache_control": {"type": "ephemeral"},
}
],
},
# marked for caching with the cache_control parameter, so that this checkpoint can read from the previous cache.
{
"role": "user",
"content": [
{
"type": "text",
"text": "What are the key terms and conditions in this agreement?",
"cache_control": {"type": "ephemeral"},
}
],
},
{
"role": "assistant",
"content": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo",
},
# The final turn is marked with cache-control, for continuing in followups.
{
"role": "user",
"content": [
{
"type": "text",
"text": "What are the key terms and conditions in this agreement?",
"cache_control": {"type": "ephemeral"},
}
],
},
],
extra_headers={
"anthropic-version": "2023-06-01",
"anthropic-beta": "prompt-caching-2024-07-31",
},
)
```
</TabItem>
<TabItem value="proxy" label="LiteLLM Proxy">
:::info
LiteLLM Proxy is OpenAI compatible
This is an example using the OpenAI Python SDK to send a request to the LiteLLM Proxy
Assuming you have a model=`anthropic/claude-3-5-sonnet-20240620` on the [litellm proxy config.yaml](#usage-with-litellm-proxy)
:::
```python
import openai
client = openai.AsyncOpenAI(
api_key="anything", # litellm proxy api key
base_url="http://0.0.0.0:4000" # litellm proxy base url
)
response = await client.chat.completions.create(
model="anthropic/claude-3-5-sonnet-20240620",
messages=[
# System Message
{
"role": "system",
"content": [
{
"type": "text",
"text": "Here is the full text of a complex legal agreement"
* 400,
"cache_control": {"type": "ephemeral"},
}
],
},
# marked for caching with the cache_control parameter, so that this checkpoint can read from the previous cache.
{
"role": "user",
"content": [
{
"type": "text",
"text": "What are the key terms and conditions in this agreement?",
"cache_control": {"type": "ephemeral"},
}
],
},
{
"role": "assistant",
"content": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo",
},
# The final turn is marked with cache-control, for continuing in followups.
{
"role": "user",
"content": [
{
"type": "text",
"text": "What are the key terms and conditions in this agreement?",
"cache_control": {"type": "ephemeral"},
}
],
},
],
extra_headers={
"anthropic-version": "2023-06-01",
"anthropic-beta": "prompt-caching-2024-07-31",
},
)
```
</TabItem>
</Tabs>
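After a successful prompt-caching call, the cache accounting returned by Anthropic is surfaced on `response.usage`. A minimal sketch, assuming `ANTHROPIC_API_KEY` is set in the environment:

```python
import asyncio

import litellm


async def main():
    # Same shape as the SDK examples above; the long system block is the cached prefix.
    response = await litellm.acompletion(
        model="anthropic/claude-3-5-sonnet-20240620",
        messages=[
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": "Here is the full text of a complex legal agreement" * 400,
                        "cache_control": {"type": "ephemeral"},
                    }
                ],
            },
            {"role": "user", "content": "What are the key terms and conditions in this agreement?"},
        ],
        max_tokens=10,
        extra_headers={
            "anthropic-version": "2023-06-01",
            "anthropic-beta": "prompt-caching-2024-07-31",
        },
    )
    # Populated when Anthropic reports cache activity for this request.
    print("cache_creation_input_tokens:", getattr(response.usage, "cache_creation_input_tokens", None))
    print("cache_read_input_tokens:", getattr(response.usage, "cache_read_input_tokens", None))


asyncio.run(main())
```

Whether the creation or the read counter is non-zero depends on whether the prefix was already cached within Anthropic's cache TTL.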
## **Function/Tool Calling**
:::info
@ -429,6 +743,20 @@ resp = litellm.completion(
print(f"\nResponse: {resp}")
```
## **Passing Extra Headers to Anthropic API**
Pass `extra_headers: dict` to `litellm.completion`
```python
from litellm import completion
messages = [{"role": "user", "content": "What is Anthropic?"}]
response = completion(
model="claude-3-5-sonnet-20240620",
messages=messages,
extra_headers={"anthropic-beta": "max-tokens-3-5-sonnet-2024-07-15"}
)
```
## Usage - "Assistant Pre-fill"
You can "put words in Claude's mouth" by including an `assistant` role message as the last item in the `messages` array.


@ -17,7 +17,7 @@ model_list:
## Get Model Information - `/model/info`
Retrieve detailed information about each model listed in the `/model/info` endpoint, including descriptions from the `config.yaml` file, and additional model info (e.g. max tokens, cost per input token, etc.) pulled the model_info you set and the litellm model cost map. Sensitive details like API keys are excluded for security purposes.
Retrieve detailed information about each model listed in the `/model/info` endpoint, including descriptions from the `config.yaml` file, and additional model info (e.g. max tokens, cost per input token, etc.) pulled from the model_info you set and the [litellm model cost map](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json). Sensitive details like API keys are excluded for security purposes.
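A hypothetical Python equivalent of the cURL call below, assuming the proxy is reachable at `http://0.0.0.0:4000` and `LITELLM_KEY` holds a valid virtual key:

```python
import os

import requests

# Assumed proxy URL and key; adjust for your deployment.
resp = requests.get(
    "http://0.0.0.0:4000/model/info",
    headers={"Authorization": f"Bearer {os.environ['LITELLM_KEY']}"},
)
print(resp.json())  # model list with config.yaml info plus litellm cost map fields
```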
<Tabs
defaultValue="curl"
@ -35,14 +35,10 @@ curl -X GET "http://0.0.0.0:4000/model/info" \
## Add a New Model
Add a new model to the list in the `config.yaml` by providing the model parameters. This allows you to update the model list without restarting the proxy.
Add a new model to the proxy via the `/model/new` API, so you can add models without restarting the proxy.
<Tabs
defaultValue="curl"
values={[
{ label: 'cURL', value: 'curl', },
]}>
<TabItem value="curl">
<Tabs>
<TabItem value="API">
```bash
curl -X POST "http://0.0.0.0:4000/model/new" \
@ -50,6 +46,21 @@ curl -X POST "http://0.0.0.0:4000/model/new" \
-H "Content-Type: application/json" \
-d '{ "model_name": "azure-gpt-turbo", "litellm_params": {"model": "azure/gpt-3.5-turbo", "api_key": "os.environ/AZURE_API_KEY", "api_base": "my-azure-api-base"} }'
```
</TabItem>
<TabItem value="Yaml">
```yaml
model_list:
- model_name: gpt-3.5-turbo ### RECEIVED MODEL NAME ### `openai.chat.completions.create(model="gpt-3.5-turbo",...)`
litellm_params: # all params accepted by litellm.completion() - https://github.com/BerriAI/litellm/blob/9b46ec05b02d36d6e4fb5c32321e51e7f56e4a6e/litellm/types/router.py#L297
model: azure/gpt-turbo-small-eu ### MODEL NAME sent to `litellm.completion()` ###
api_base: https://my-endpoint-europe-berri-992.openai.azure.com/
api_key: "os.environ/AZURE_API_KEY_EU" # does os.getenv("AZURE_API_KEY_EU")
rpm: 6 # [OPTIONAL] Rate limit for this deployment: in requests per minute (rpm)
model_info:
my_custom_key: my_custom_value # additional model metadata
```
</TabItem>
</Tabs>
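For completeness, a hypothetical Python version of the `/model/new` cURL request shown in the API tab above; it assumes the proxy runs at `http://0.0.0.0:4000` and that `sk-1234` is your master key.

```python
import requests

new_model = {
    "model_name": "azure-gpt-turbo",
    "litellm_params": {
        "model": "azure/gpt-3.5-turbo",
        "api_key": "os.environ/AZURE_API_KEY",
        "api_base": "my-azure-api-base",
    },
}

# Assumed proxy URL and master key; adjust for your deployment.
resp = requests.post(
    "http://0.0.0.0:4000/model/new",
    headers={"Authorization": "Bearer sk-1234"},
    json=new_model,
)
print(resp.status_code, resp.text)
```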
@ -86,3 +97,82 @@ Keep in mind that as both endpoints are in [BETA], you may need to visit the ass
- Add a New Model: [Issue #964](https://github.com/BerriAI/litellm/issues/964)
Feedback on the beta endpoints is valuable and helps improve the API for all users.
## Add Additional Model Information
If you want the ability to add a display name, description, and labels for models, just use `model_info:`
```yaml
model_list:
- model_name: "gpt-4"
litellm_params:
model: "gpt-4"
api_key: "os.environ/OPENAI_API_KEY"
model_info: # 👈 KEY CHANGE
my_custom_key: "my_custom_value"
```
### Usage
1. Add additional information to model
```yaml
model_list:
- model_name: "gpt-4"
litellm_params:
model: "gpt-4"
api_key: "os.environ/OPENAI_API_KEY"
model_info: # 👈 KEY CHANGE
my_custom_key: "my_custom_value"
```
2. Call with `/model/info`
Use a key with access to the model `gpt-4`.
```bash
curl -L -X GET 'http://0.0.0.0:4000/v1/model/info' \
-H 'Authorization: Bearer LITELLM_KEY'
```
3. **Expected Response**
Returned `model_info = Your custom model_info + (if exists) LITELLM MODEL INFO`
[**How LiteLLM Model Info is found**](https://github.com/BerriAI/litellm/blob/9b46ec05b02d36d6e4fb5c32321e51e7f56e4a6e/litellm/proxy/proxy_server.py#L7460)
[Tell us how this can be improved!](https://github.com/BerriAI/litellm/issues)
```bash
{
"data": [
{
"model_name": "gpt-4",
"litellm_params": {
"model": "gpt-4"
},
"model_info": {
"id": "e889baacd17f591cce4c63639275ba5e8dc60765d6c553e6ee5a504b19e50ddc",
"db_model": false,
"my_custom_key": "my_custom_value", # 👈 CUSTOM INFO
"key": "gpt-4", # 👈 KEY in LiteLLM MODEL INFO/COST MAP - https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json
"max_tokens": 4096,
"max_input_tokens": 8192,
"max_output_tokens": 4096,
"input_cost_per_token": 3e-05,
"input_cost_per_character": null,
"input_cost_per_token_above_128k_tokens": null,
"output_cost_per_token": 6e-05,
"output_cost_per_character": null,
"output_cost_per_token_above_128k_tokens": null,
"output_cost_per_character_above_128k_tokens": null,
"output_vector_size": null,
"litellm_provider": "openai",
"mode": "chat"
}
},
]
}
```


@ -72,15 +72,15 @@ http://localhost:4000/metrics
| Metric Name | Description |
|----------------------|--------------------------------------|
| `deployment_state` | The state of the deployment: 0 = healthy, 1 = partial outage, 2 = complete outage. |
| `litellm_deployment_state` | The state of the deployment: 0 = healthy, 1 = partial outage, 2 = complete outage. |
| `litellm_remaining_requests_metric` | Track `x-ratelimit-remaining-requests` returned from LLM API Deployment |
| `litellm_remaining_tokens` | Track `x-ratelimit-remaining-tokens` returned from LLM API Deployment |
| `llm_deployment_success_responses` | Total number of successful LLM API calls for deployment |
| `llm_deployment_failure_responses` | Total number of failed LLM API calls for deployment |
| `llm_deployment_total_requests` | Total number of LLM API calls for deployment - success + failure |
| `llm_deployment_latency_per_output_token` | Latency per output token for deployment |
| `llm_deployment_successful_fallbacks` | Number of successful fallback requests from primary model -> fallback model |
| `llm_deployment_failed_fallbacks` | Number of failed fallback requests from primary model -> fallback model |
| `litellm_deployment_success_responses` | Total number of successful LLM API calls for deployment |
| `litellm_deployment_failure_responses` | Total number of failed LLM API calls for deployment |
| `litellm_deployment_total_requests` | Total number of LLM API calls for deployment - success + failure |
| `litellm_deployment_latency_per_output_token` | Latency per output token for deployment |
| `litellm_deployment_successful_fallbacks` | Number of successful fallback requests from primary model -> fallback model |
| `litellm_deployment_failed_fallbacks` | Number of failed fallback requests from primary model -> fallback model |
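As a quick way to verify the renamed metrics after upgrading, a small sketch (assuming the proxy is running locally on port 4000 with the `prometheus` callback enabled) that fetches `/metrics` and prints only the `litellm_deployment_*` series:

```python
import requests

# Assumes a local LiteLLM proxy with the `prometheus` callback enabled.
metrics = requests.get("http://localhost:4000/metrics", timeout=10).text

for line in metrics.splitlines():
    # Skip HELP/TYPE comment lines; keep only the renamed deployment metrics.
    if not line.startswith("#") and line.startswith("litellm_deployment_"):
        print(line)
```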


@ -1,5 +1,6 @@
import json
import os
import uuid
from datetime import datetime
from typing import Any, Dict, List, Optional, TypedDict, Union
@ -29,6 +30,8 @@ class GCSBucketPayload(TypedDict):
end_time: str
response_cost: Optional[float]
spend_log_metadata: str
exception: Optional[str]
log_event_type: Optional[str]
class GCSBucketLogger(CustomLogger):
@ -79,6 +82,7 @@ class GCSBucketLogger(CustomLogger):
logging_payload: GCSBucketPayload = await self.get_gcs_payload(
kwargs, response_obj, start_time_str, end_time_str
)
logging_payload["log_event_type"] = "successful_api_call"
json_logged_payload = json.dumps(logging_payload)
@ -103,7 +107,56 @@ class GCSBucketLogger(CustomLogger):
verbose_logger.error("GCS Bucket logging error: %s", str(e))
async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
pass
from litellm.proxy.proxy_server import premium_user
if premium_user is not True:
raise ValueError(
f"GCS Bucket logging is a premium feature. Please upgrade to use it. {CommonProxyErrors.not_premium_user.value}"
)
try:
verbose_logger.debug(
"GCS Logger: async_log_failure_event logging kwargs: %s, response_obj: %s",
kwargs,
response_obj,
)
start_time_str = start_time.strftime("%Y-%m-%d %H:%M:%S")
end_time_str = end_time.strftime("%Y-%m-%d %H:%M:%S")
headers = await self.construct_request_headers()
logging_payload: GCSBucketPayload = await self.get_gcs_payload(
kwargs, response_obj, start_time_str, end_time_str
)
logging_payload["log_event_type"] = "failed_api_call"
_litellm_params = kwargs.get("litellm_params") or {}
metadata = _litellm_params.get("metadata") or {}
json_logged_payload = json.dumps(logging_payload)
# Get the current date
current_date = datetime.now().strftime("%Y-%m-%d")
# Modify the object_name to include the date-based folder
object_name = f"{current_date}/failure-{uuid.uuid4().hex}"
if "gcs_log_id" in metadata:
object_name = metadata["gcs_log_id"]
response = await self.async_httpx_client.post(
headers=headers,
url=f"https://storage.googleapis.com/upload/storage/v1/b/{self.BUCKET_NAME}/o?uploadType=media&name={object_name}",
data=json_logged_payload,
)
if response.status_code != 200:
verbose_logger.error("GCS Bucket logging error: %s", str(response.text))
verbose_logger.debug("GCS Bucket response %s", response)
verbose_logger.debug("GCS Bucket status code %s", response.status_code)
verbose_logger.debug("GCS Bucket response.text %s", response.text)
except Exception as e:
verbose_logger.error("GCS Bucket logging error: %s", str(e))
async def construct_request_headers(self) -> Dict[str, str]:
from litellm import vertex_chat_completion
@ -139,10 +192,19 @@ class GCSBucketLogger(CustomLogger):
optional_params=kwargs.get("optional_params", None),
)
response_dict = {}
if response_obj:
response_dict = convert_litellm_response_object_to_dict(
response_obj=response_obj
)
exception_str = None
# Handle logging exception attributes
if "exception" in kwargs:
exception_str = kwargs.get("exception", "")
if not isinstance(exception_str, str):
exception_str = str(exception_str)
_spend_log_payload: SpendLogsPayload = get_logging_payload(
kwargs=kwargs,
response_obj=response_obj,
@ -156,8 +218,10 @@ class GCSBucketLogger(CustomLogger):
response_obj=response_dict,
start_time=start_time,
end_time=end_time,
spend_log_metadata=_spend_log_payload["metadata"],
spend_log_metadata=_spend_log_payload.get("metadata", ""),
response_cost=kwargs.get("response_cost", None),
exception=exception_str,
log_event_type=None,
)
return gcs_payload


@ -141,42 +141,42 @@ class PrometheusLogger(CustomLogger):
]
# Metric for deployment state
self.deployment_state = Gauge(
"deployment_state",
self.litellm_deployment_state = Gauge(
"litellm_deployment_state",
"LLM Deployment Analytics - The state of the deployment: 0 = healthy, 1 = partial outage, 2 = complete outage",
labelnames=_logged_llm_labels,
)
self.llm_deployment_success_responses = Counter(
name="llm_deployment_success_responses",
self.litellm_deployment_success_responses = Counter(
name="litellm_deployment_success_responses",
documentation="LLM Deployment Analytics - Total number of successful LLM API calls via litellm",
labelnames=_logged_llm_labels,
)
self.llm_deployment_failure_responses = Counter(
name="llm_deployment_failure_responses",
self.litellm_deployment_failure_responses = Counter(
name="litellm_deployment_failure_responses",
documentation="LLM Deployment Analytics - Total number of failed LLM API calls via litellm",
labelnames=_logged_llm_labels,
)
self.llm_deployment_total_requests = Counter(
name="llm_deployment_total_requests",
self.litellm_deployment_total_requests = Counter(
name="litellm_deployment_total_requests",
documentation="LLM Deployment Analytics - Total number of LLM API calls via litellm - success + failure",
labelnames=_logged_llm_labels,
)
# Deployment Latency tracking
self.llm_deployment_latency_per_output_token = Histogram(
name="llm_deployment_latency_per_output_token",
self.litellm_deployment_latency_per_output_token = Histogram(
name="litellm_deployment_latency_per_output_token",
documentation="LLM Deployment Analytics - Latency per output token",
labelnames=_logged_llm_labels,
)
self.llm_deployment_successful_fallbacks = Counter(
"llm_deployment_successful_fallbacks",
self.litellm_deployment_successful_fallbacks = Counter(
"litellm_deployment_successful_fallbacks",
"LLM Deployment Analytics - Number of successful fallback requests from primary model -> fallback model",
["primary_model", "fallback_model"],
)
self.llm_deployment_failed_fallbacks = Counter(
"llm_deployment_failed_fallbacks",
self.litellm_deployment_failed_fallbacks = Counter(
"litellm_deployment_failed_fallbacks",
"LLM Deployment Analytics - Number of failed fallback requests from primary model -> fallback model",
["primary_model", "fallback_model"],
)
@ -358,14 +358,14 @@ class PrometheusLogger(CustomLogger):
api_provider=llm_provider,
)
self.llm_deployment_failure_responses.labels(
self.litellm_deployment_failure_responses.labels(
litellm_model_name=litellm_model_name,
model_id=model_id,
api_base=api_base,
api_provider=llm_provider,
).inc()
self.llm_deployment_total_requests.labels(
self.litellm_deployment_total_requests.labels(
litellm_model_name=litellm_model_name,
model_id=model_id,
api_base=api_base,
@ -438,14 +438,14 @@ class PrometheusLogger(CustomLogger):
api_provider=llm_provider,
)
self.llm_deployment_success_responses.labels(
self.litellm_deployment_success_responses.labels(
litellm_model_name=litellm_model_name,
model_id=model_id,
api_base=api_base,
api_provider=llm_provider,
).inc()
self.llm_deployment_total_requests.labels(
self.litellm_deployment_total_requests.labels(
litellm_model_name=litellm_model_name,
model_id=model_id,
api_base=api_base,
@ -475,7 +475,7 @@ class PrometheusLogger(CustomLogger):
latency_per_token = None
if output_tokens is not None and output_tokens > 0:
latency_per_token = _latency_seconds / output_tokens
self.llm_deployment_latency_per_output_token.labels(
self.litellm_deployment_latency_per_output_token.labels(
litellm_model_name=litellm_model_name,
model_id=model_id,
api_base=api_base,
@ -497,7 +497,7 @@ class PrometheusLogger(CustomLogger):
kwargs,
)
_new_model = kwargs.get("model")
self.llm_deployment_successful_fallbacks.labels(
self.litellm_deployment_successful_fallbacks.labels(
primary_model=original_model_group, fallback_model=_new_model
).inc()
@ -508,11 +508,11 @@ class PrometheusLogger(CustomLogger):
kwargs,
)
_new_model = kwargs.get("model")
self.llm_deployment_failed_fallbacks.labels(
self.litellm_deployment_failed_fallbacks.labels(
primary_model=original_model_group, fallback_model=_new_model
).inc()
def set_deployment_state(
def set_litellm_deployment_state(
self,
state: int,
litellm_model_name: str,
@ -520,7 +520,7 @@ class PrometheusLogger(CustomLogger):
api_base: str,
api_provider: str,
):
self.deployment_state.labels(
self.litellm_deployment_state.labels(
litellm_model_name, model_id, api_base, api_provider
).set(state)
@ -531,7 +531,7 @@ class PrometheusLogger(CustomLogger):
api_base: str,
api_provider: str,
):
self.set_deployment_state(
self.set_litellm_deployment_state(
0, litellm_model_name, model_id, api_base, api_provider
)
@ -542,7 +542,7 @@ class PrometheusLogger(CustomLogger):
api_base: str,
api_provider: str,
):
self.set_deployment_state(
self.set_litellm_deployment_state(
1, litellm_model_name, model_id, api_base, api_provider
)
@ -553,7 +553,7 @@ class PrometheusLogger(CustomLogger):
api_base: str,
api_provider: str,
):
self.set_deployment_state(
self.set_litellm_deployment_state(
2, litellm_model_name, model_id, api_base, api_provider
)


@ -41,8 +41,8 @@ async def get_fallback_metric_from_prometheus():
"""
response_message = ""
relevant_metrics = [
"llm_deployment_successful_fallbacks_total",
"llm_deployment_failed_fallbacks_total",
"litellm_deployment_successful_fallbacks_total",
"litellm_deployment_failed_fallbacks_total",
]
for metric in relevant_metrics:
response_json = await get_metric_from_prometheus(


@ -35,6 +35,7 @@ from litellm.types.llms.anthropic import (
AnthropicResponseContentBlockText,
AnthropicResponseContentBlockToolUse,
AnthropicResponseUsageBlock,
AnthropicSystemMessageContent,
ContentBlockDelta,
ContentBlockStart,
ContentBlockStop,
@ -759,6 +760,7 @@ class AnthropicChatCompletion(BaseLLM):
## CALCULATING USAGE
prompt_tokens = completion_response["usage"]["input_tokens"]
completion_tokens = completion_response["usage"]["output_tokens"]
_usage = completion_response["usage"]
total_tokens = prompt_tokens + completion_tokens
model_response.created = int(time.time())
@ -768,6 +770,11 @@ class AnthropicChatCompletion(BaseLLM):
completion_tokens=completion_tokens,
total_tokens=total_tokens,
)
if "cache_creation_input_tokens" in _usage:
usage["cache_creation_input_tokens"] = _usage["cache_creation_input_tokens"]
if "cache_read_input_tokens" in _usage:
usage["cache_read_input_tokens"] = _usage["cache_read_input_tokens"]
setattr(model_response, "usage", usage) # type: ignore
return model_response
@ -901,6 +908,7 @@ class AnthropicChatCompletion(BaseLLM):
# Separate system prompt from rest of message
system_prompt_indices = []
system_prompt = ""
anthropic_system_message_list = None
for idx, message in enumerate(messages):
if message["role"] == "system":
valid_content: bool = False
@ -908,8 +916,23 @@ class AnthropicChatCompletion(BaseLLM):
system_prompt += message["content"]
valid_content = True
elif isinstance(message["content"], list):
for content in message["content"]:
system_prompt += content.get("text", "")
for _content in message["content"]:
anthropic_system_message_content = (
AnthropicSystemMessageContent(
type=_content.get("type"),
text=_content.get("text"),
)
)
if "cache_control" in _content:
anthropic_system_message_content["cache_control"] = (
_content["cache_control"]
)
if anthropic_system_message_list is None:
anthropic_system_message_list = []
anthropic_system_message_list.append(
anthropic_system_message_content
)
valid_content = True
if valid_content:
@ -919,6 +942,10 @@ class AnthropicChatCompletion(BaseLLM):
messages.pop(idx)
if len(system_prompt) > 0:
optional_params["system"] = system_prompt
# Handling anthropic API Prompt Caching
if anthropic_system_message_list is not None:
optional_params["system"] = anthropic_system_message_list
# Format rest of message according to anthropic guidelines
try:
messages = prompt_factory(
@ -954,6 +981,8 @@ class AnthropicChatCompletion(BaseLLM):
else: # assume openai tool call
new_tool = tool["function"]
new_tool["input_schema"] = new_tool.pop("parameters") # rename key
if "cache_control" in tool:
new_tool["cache_control"] = tool["cache_control"]
anthropic_tools.append(new_tool)
optional_params["tools"] = anthropic_tools


@ -356,6 +356,7 @@ def ollama_completion_stream(url, api_key, data, logging_obj):
"json": data,
"method": "POST",
"timeout": litellm.request_timeout,
"follow_redirects": True
}
if api_key is not None:
_request["headers"] = {"Authorization": "Bearer {}".format(api_key)}


@ -1224,6 +1224,19 @@ def convert_to_anthropic_tool_invoke(
return anthropic_tool_invoke
def add_cache_control_to_content(
anthropic_content_element: Union[
dict, AnthropicMessagesImageParam, AnthropicMessagesTextParam
],
orignal_content_element: dict,
):
if "cache_control" in orignal_content_element:
anthropic_content_element["cache_control"] = orignal_content_element[
"cache_control"
]
return anthropic_content_element
def anthropic_messages_pt(
messages: list,
model: str,
@ -1264,8 +1277,8 @@ def anthropic_messages_pt(
image_chunk = convert_to_anthropic_image_obj(
m["image_url"]["url"]
)
user_content.append(
AnthropicMessagesImageParam(
_anthropic_content_element = AnthropicMessagesImageParam(
type="image",
source=AnthropicImageParamSource(
type="base64",
@ -1273,9 +1286,22 @@ def anthropic_messages_pt(
data=image_chunk["data"],
),
)
anthropic_content_element = add_cache_control_to_content(
anthropic_content_element=_anthropic_content_element,
orignal_content_element=m,
)
user_content.append(anthropic_content_element)
elif m.get("type", "") == "text":
user_content.append({"type": "text", "text": m["text"]})
_anthropic_text_content_element = {
"type": "text",
"text": m["text"],
}
anthropic_content_element = add_cache_control_to_content(
anthropic_content_element=_anthropic_text_content_element,
orignal_content_element=m,
)
user_content.append(anthropic_content_element)
elif (
messages[msg_i]["role"] == "tool"
or messages[msg_i]["role"] == "function"
@ -1306,6 +1332,10 @@ def anthropic_messages_pt(
anthropic_message = AnthropicMessagesTextParam(
type="text", text=m.get("text")
)
anthropic_message = add_cache_control_to_content(
anthropic_content_element=anthropic_message,
orignal_content_element=m,
)
assistant_content.append(anthropic_message)
elif (
"content" in messages[msg_i]
@ -1313,9 +1343,17 @@ def anthropic_messages_pt(
and len(messages[msg_i]["content"])
> 0 # don't pass empty text blocks. anthropic api raises errors.
):
assistant_content.append(
{"type": "text", "text": messages[msg_i]["content"]}
_anthropic_text_content_element = {
"type": "text",
"text": messages[msg_i]["content"],
}
anthropic_content_element = add_cache_control_to_content(
anthropic_content_element=_anthropic_text_content_element,
orignal_content_element=messages[msg_i],
)
assistant_content.append(anthropic_content_element)
if messages[msg_i].get(
"tool_calls", []
@ -1701,12 +1739,14 @@ def cohere_messages_pt_v2(
assistant_tool_calls: List[ToolCallObject] = []
## MERGE CONSECUTIVE ASSISTANT CONTENT ##
while msg_i < len(messages) and messages[msg_i]["role"] == "assistant":
assistant_text = (
messages[msg_i].get("content") or ""
) # either string or none
if assistant_text:
assistant_content += assistant_text
if isinstance(messages[msg_i]["content"], list):
for m in messages[msg_i]["content"]:
if m.get("type", "") == "text":
assistant_content += m["text"]
elif messages[msg_i].get("content") is not None and isinstance(
messages[msg_i]["content"], str
):
assistant_content += messages[msg_i]["content"]
if messages[msg_i].get(
"tool_calls", []
): # support assistant tool invoke conversion


@ -2074,7 +2074,8 @@
"litellm_provider": "vertex_ai-anthropic_models",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true
"supports_vision": true,
"supports_assistant_prefill": true
},
"vertex_ai/claude-3-5-sonnet@20240620": {
"max_tokens": 4096,
@ -2085,7 +2086,8 @@
"litellm_provider": "vertex_ai-anthropic_models",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true
"supports_vision": true,
"supports_assistant_prefill": true
},
"vertex_ai/claude-3-haiku@20240307": {
"max_tokens": 4096,
@ -2096,7 +2098,8 @@
"litellm_provider": "vertex_ai-anthropic_models",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true
"supports_vision": true,
"supports_assistant_prefill": true
},
"vertex_ai/claude-3-opus@20240229": {
"max_tokens": 4096,
@ -2107,7 +2110,8 @@
"litellm_provider": "vertex_ai-anthropic_models",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true
"supports_vision": true,
"supports_assistant_prefill": true
},
"vertex_ai/meta/llama3-405b-instruct-maas": {
"max_tokens": 32000,
@ -4531,6 +4535,69 @@
"litellm_provider": "perplexity",
"mode": "chat"
},
"perplexity/llama-3.1-70b-instruct": {
"max_tokens": 131072,
"max_input_tokens": 131072,
"max_output_tokens": 131072,
"input_cost_per_token": 0.000001,
"output_cost_per_token": 0.000001,
"litellm_provider": "perplexity",
"mode": "chat"
},
"perplexity/llama-3.1-8b-instruct": {
"max_tokens": 131072,
"max_input_tokens": 131072,
"max_output_tokens": 131072,
"input_cost_per_token": 0.0000002,
"output_cost_per_token": 0.0000002,
"litellm_provider": "perplexity",
"mode": "chat"
},
"perplexity/llama-3.1-sonar-huge-128k-online": {
"max_tokens": 127072,
"max_input_tokens": 127072,
"max_output_tokens": 127072,
"input_cost_per_token": 0.000005,
"output_cost_per_token": 0.000005,
"litellm_provider": "perplexity",
"mode": "chat"
},
"perplexity/llama-3.1-sonar-large-128k-online": {
"max_tokens": 127072,
"max_input_tokens": 127072,
"max_output_tokens": 127072,
"input_cost_per_token": 0.000001,
"output_cost_per_token": 0.000001,
"litellm_provider": "perplexity",
"mode": "chat"
},
"perplexity/llama-3.1-sonar-large-128k-chat": {
"max_tokens": 131072,
"max_input_tokens": 131072,
"max_output_tokens": 131072,
"input_cost_per_token": 0.000001,
"output_cost_per_token": 0.000001,
"litellm_provider": "perplexity",
"mode": "chat"
},
"perplexity/llama-3.1-sonar-small-128k-chat": {
"max_tokens": 131072,
"max_input_tokens": 131072,
"max_output_tokens": 131072,
"input_cost_per_token": 0.0000002,
"output_cost_per_token": 0.0000002,
"litellm_provider": "perplexity",
"mode": "chat"
},
"perplexity/llama-3.1-sonar-small-128k-online": {
"max_tokens": 127072,
"max_input_tokens": 127072,
"max_output_tokens": 127072,
"input_cost_per_token": 0.0000002,
"output_cost_per_token": 0.0000002,
"litellm_provider": "perplexity",
"mode": "chat"
},
"perplexity/pplx-7b-chat": {
"max_tokens": 8192,
"max_input_tokens": 8192,

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long


@ -1,7 +1,6 @@
model_list:
- model_name: azure-embedding-model
- model_name: "gpt-4"
litellm_params:
model: azure/azure-embedding-model
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
api_version: "2023-07-01-preview"
model: "gpt-4"
model_info:
my_custom_key: "my_custom_value"


@ -85,6 +85,8 @@ def _get_bearer_token(
):
if api_key.startswith("Bearer "): # ensure Bearer token passed in
api_key = api_key.replace("Bearer ", "") # extract the token
elif api_key.startswith("Basic "):
api_key = api_key.replace("Basic ", "") # handle langfuse input
else:
api_key = ""
return api_key
@ -138,7 +140,6 @@ async def user_api_key_auth(
pass_through_endpoints: Optional[List[dict]] = general_settings.get(
"pass_through_endpoints", None
)
if isinstance(api_key, str):
passed_in_key = api_key
api_key = _get_bearer_token(api_key=api_key)
@ -367,6 +368,40 @@ async def user_api_key_auth(
parent_otel_span=parent_otel_span,
)
#### ELSE ####
## CHECK PASS-THROUGH ENDPOINTS ##
if pass_through_endpoints is not None:
for endpoint in pass_through_endpoints:
if endpoint.get("path", "") == route:
## IF AUTH DISABLED
if endpoint.get("auth") is not True:
return UserAPIKeyAuth()
## IF AUTH ENABLED
### IF CUSTOM PARSER REQUIRED
if (
endpoint.get("custom_auth_parser") is not None
and endpoint.get("custom_auth_parser") == "langfuse"
):
"""
- langfuse returns {'Authorization': 'Basic YW55dGhpbmc6YW55dGhpbmc'}
- check the langfuse public key if it contains the litellm api key
"""
import base64
api_key = api_key.replace("Basic ", "").strip()
decoded_bytes = base64.b64decode(api_key)
decoded_str = decoded_bytes.decode("utf-8")
api_key = decoded_str.split(":")[0]
else:
headers = endpoint.get("headers", None)
if headers is not None:
header_key = headers.get("litellm_user_api_key", "")
if (
isinstance(request.headers, dict)
and request.headers.get(key=header_key) is not None
):
api_key = request.headers.get(key=header_key)
if master_key is None:
if isinstance(api_key, str):
return UserAPIKeyAuth(
@ -533,7 +568,11 @@ async def user_api_key_auth(
if isinstance(
api_key, str
): # if generated token, make sure it starts with sk-.
assert api_key.startswith("sk-") # prevent token hashes from being used
assert api_key.startswith(
"sk-"
), "LiteLLM Virtual Key expected. Received={}, expected to start with 'sk-'.".format(
api_key
) # prevent token hashes from being used
else:
verbose_logger.warning(
"litellm.proxy.proxy_server.user_api_key_auth(): Warning - Key={} is not a string.".format(


@ -5,7 +5,12 @@ from fastapi import Request
import litellm
from litellm._logging import verbose_logger, verbose_proxy_logger
from litellm.proxy._types import CommonProxyErrors, TeamCallbackMetadata, UserAPIKeyAuth
from litellm.proxy._types import (
AddTeamCallback,
CommonProxyErrors,
TeamCallbackMetadata,
UserAPIKeyAuth,
)
from litellm.types.utils import SupportedCacheControls
if TYPE_CHECKING:
@ -59,6 +64,42 @@ def safe_add_api_version_from_query_params(data: dict, request: Request):
verbose_logger.error("error checking api version in query params: %s", str(e))
def convert_key_logging_metadata_to_callback(
data: AddTeamCallback, team_callback_settings_obj: Optional[TeamCallbackMetadata]
) -> TeamCallbackMetadata:
if team_callback_settings_obj is None:
team_callback_settings_obj = TeamCallbackMetadata()
if data.callback_type == "success":
if team_callback_settings_obj.success_callback is None:
team_callback_settings_obj.success_callback = []
if data.callback_name not in team_callback_settings_obj.success_callback:
team_callback_settings_obj.success_callback.append(data.callback_name)
elif data.callback_type == "failure":
if team_callback_settings_obj.failure_callback is None:
team_callback_settings_obj.failure_callback = []
if data.callback_name not in team_callback_settings_obj.failure_callback:
team_callback_settings_obj.failure_callback.append(data.callback_name)
elif data.callback_type == "success_and_failure":
if team_callback_settings_obj.success_callback is None:
team_callback_settings_obj.success_callback = []
if team_callback_settings_obj.failure_callback is None:
team_callback_settings_obj.failure_callback = []
if data.callback_name not in team_callback_settings_obj.success_callback:
team_callback_settings_obj.success_callback.append(data.callback_name)
if data.callback_name not in team_callback_settings_obj.failure_callback:
team_callback_settings_obj.failure_callback.append(data.callback_name)
for var, value in data.callback_vars.items():
if team_callback_settings_obj.callback_vars is None:
team_callback_settings_obj.callback_vars = {}
team_callback_settings_obj.callback_vars[var] = litellm.get_secret(value)
return team_callback_settings_obj
async def add_litellm_data_to_request(
data: dict,
request: Request,
@ -224,6 +265,7 @@ async def add_litellm_data_to_request(
} # add the team-specific configs to the completion call
# Team Callbacks controls
callback_settings_obj: Optional[TeamCallbackMetadata] = None
if user_api_key_dict.team_metadata is not None:
team_metadata = user_api_key_dict.team_metadata
if "callback_settings" in team_metadata:
@ -241,6 +283,18 @@ async def add_litellm_data_to_request(
}
}
"""
elif (
user_api_key_dict.metadata is not None
and "logging" in user_api_key_dict.metadata
):
for item in user_api_key_dict.metadata["logging"]:
callback_settings_obj = convert_key_logging_metadata_to_callback(
data=AddTeamCallback(**item),
team_callback_settings_obj=callback_settings_obj,
)
if callback_settings_obj is not None:
data["success_callback"] = callback_settings_obj.success_callback
data["failure_callback"] = callback_settings_obj.failure_callback


@ -309,7 +309,7 @@ async def pass_through_request(
json=_parsed_body,
)
if response.status_code != 200:
if response.status_code >= 300:
raise HTTPException(status_code=response.status_code, detail=response.text)
content = await response.aread()


@ -39,7 +39,4 @@ general_settings:
litellm_settings:
fallbacks: [{"gemini-1.5-pro-001": ["gpt-4o"]}]
success_callback: ["langfuse", "prometheus"]
langfuse_default_tags: ["cache_hit", "cache_key", "proxy_base_url", "user_api_key_alias", "user_api_key_user_id", "user_api_key_user_email", "user_api_key_team_alias", "semantic-similarity", "proxy_base_url"]
failure_callback: ["prometheus"]
cache: True
callbacks: ["gcs_bucket"]


@ -21,6 +21,8 @@ def get_logging_payload(
if kwargs is None:
kwargs = {}
if response_obj is None:
response_obj = {}
# standardize this function to be used across, s3, dynamoDB, langfuse logging
litellm_params = kwargs.get("litellm_params", {})
metadata = (


@ -190,7 +190,7 @@ def set_client(litellm_router_instance: LitellmRouter, model: dict):
if azure_ad_token.startswith("oidc/"):
azure_ad_token = get_azure_ad_token_from_oidc(azure_ad_token)
if api_version is None:
api_version = litellm.AZURE_DEFAULT_API_VERSION
api_version = os.getenv("AZURE_API_VERSION", litellm.AZURE_DEFAULT_API_VERSION)
if "gateway.ai.cloudflare.com" in api_base:
if not api_base.endswith("/"):


@ -0,0 +1,321 @@
import json
import os
import sys
import traceback
from dotenv import load_dotenv
load_dotenv()
import io
import os
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import os
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
import litellm
from litellm import RateLimitError, Timeout, completion, completion_cost, embedding
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.llms.prompt_templates.factory import anthropic_messages_pt
# litellm.num_retries =3
litellm.cache = None
litellm.success_callback = []
user_message = "Write a short poem about the sky"
messages = [{"content": user_message, "role": "user"}]
def logger_fn(user_model_dict):
print(f"user_model_dict: {user_model_dict}")
@pytest.fixture(autouse=True)
def reset_callbacks():
print("\npytest fixture - resetting callbacks")
litellm.success_callback = []
litellm._async_success_callback = []
litellm.failure_callback = []
litellm.callbacks = []
@pytest.mark.asyncio
async def test_litellm_anthropic_prompt_caching_tools():
# Arrange: Set up the MagicMock for the httpx.AsyncClient
mock_response = AsyncMock()
def return_val():
return {
"id": "msg_01XFDUDYJgAACzvnptvVoYEL",
"type": "message",
"role": "assistant",
"content": [{"type": "text", "text": "Hello!"}],
"model": "claude-3-5-sonnet-20240620",
"stop_reason": "end_turn",
"stop_sequence": None,
"usage": {"input_tokens": 12, "output_tokens": 6},
}
mock_response.json = return_val
litellm.set_verbose = True
with patch(
"litellm.llms.custom_httpx.http_handler.AsyncHTTPHandler.post",
return_value=mock_response,
) as mock_post:
# Act: Call the litellm.acompletion function
response = await litellm.acompletion(
api_key="mock_api_key",
model="anthropic/claude-3-5-sonnet-20240620",
messages=[
{"role": "user", "content": "What's the weather like in Boston today?"}
],
tools=[
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"unit": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
},
},
"required": ["location"],
},
"cache_control": {"type": "ephemeral"},
},
}
],
extra_headers={
"anthropic-version": "2023-06-01",
"anthropic-beta": "prompt-caching-2024-07-31",
},
)
# Print what was called on the mock
print("call args=", mock_post.call_args)
expected_url = "https://api.anthropic.com/v1/messages"
expected_headers = {
"accept": "application/json",
"content-type": "application/json",
"anthropic-version": "2023-06-01",
"anthropic-beta": "prompt-caching-2024-07-31",
"x-api-key": "mock_api_key",
}
expected_json = {
"messages": [
{
"role": "user",
"content": [
{
"type": "text",
"text": "What's the weather like in Boston today?",
}
],
}
],
"tools": [
{
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"cache_control": {"type": "ephemeral"},
"input_schema": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"unit": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
},
},
"required": ["location"],
},
}
],
"max_tokens": 4096,
"model": "claude-3-5-sonnet-20240620",
}
mock_post.assert_called_once_with(
expected_url, json=expected_json, headers=expected_headers, timeout=600.0
)
@pytest.mark.asyncio()
async def test_anthropic_api_prompt_caching_basic():
litellm.set_verbose = True
response = await litellm.acompletion(
model="anthropic/claude-3-5-sonnet-20240620",
messages=[
# System Message
{
"role": "system",
"content": [
{
"type": "text",
"text": "Here is the full text of a complex legal agreement"
* 400,
"cache_control": {"type": "ephemeral"},
}
],
},
# marked for caching with the cache_control parameter, so that this checkpoint can read from the previous cache.
{
"role": "user",
"content": [
{
"type": "text",
"text": "What are the key terms and conditions in this agreement?",
"cache_control": {"type": "ephemeral"},
}
],
},
{
"role": "assistant",
"content": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo",
},
# The final turn is marked with cache-control, for continuing in followups.
{
"role": "user",
"content": [
{
"type": "text",
"text": "What are the key terms and conditions in this agreement?",
"cache_control": {"type": "ephemeral"},
}
],
},
],
temperature=0.2,
max_tokens=10,
extra_headers={
"anthropic-version": "2023-06-01",
"anthropic-beta": "prompt-caching-2024-07-31",
},
)
print("response=", response)
assert "cache_read_input_tokens" in response.usage
assert "cache_creation_input_tokens" in response.usage
# Assert either a cache entry was created or cache was read - changes depending on the anthropic api ttl
assert (response.usage.cache_read_input_tokens > 0) or (
response.usage.cache_creation_input_tokens > 0
)
@pytest.mark.asyncio
async def test_litellm_anthropic_prompt_caching_system():
# https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching#prompt-caching-examples
# Large Context Caching Example
mock_response = AsyncMock()
def return_val():
return {
"id": "msg_01XFDUDYJgAACzvnptvVoYEL",
"type": "message",
"role": "assistant",
"content": [{"type": "text", "text": "Hello!"}],
"model": "claude-3-5-sonnet-20240620",
"stop_reason": "end_turn",
"stop_sequence": None,
"usage": {"input_tokens": 12, "output_tokens": 6},
}
mock_response.json = return_val
litellm.set_verbose = True
with patch(
"litellm.llms.custom_httpx.http_handler.AsyncHTTPHandler.post",
return_value=mock_response,
) as mock_post:
# Act: Call the litellm.acompletion function
response = await litellm.acompletion(
api_key="mock_api_key",
model="anthropic/claude-3-5-sonnet-20240620",
messages=[
{
"role": "system",
"content": [
{
"type": "text",
"text": "You are an AI assistant tasked with analyzing legal documents.",
},
{
"type": "text",
"text": "Here is the full text of a complex legal agreement",
"cache_control": {"type": "ephemeral"},
},
],
},
{
"role": "user",
"content": "what are the key terms and conditions in this agreement?",
},
],
extra_headers={
"anthropic-version": "2023-06-01",
"anthropic-beta": "prompt-caching-2024-07-31",
},
)
# Print what was called on the mock
print("call args=", mock_post.call_args)
expected_url = "https://api.anthropic.com/v1/messages"
expected_headers = {
"accept": "application/json",
"content-type": "application/json",
"anthropic-version": "2023-06-01",
"anthropic-beta": "prompt-caching-2024-07-31",
"x-api-key": "mock_api_key",
}
expected_json = {
"system": [
{
"type": "text",
"text": "You are an AI assistant tasked with analyzing legal documents.",
},
{
"type": "text",
"text": "Here is the full text of a complex legal agreement",
"cache_control": {"type": "ephemeral"},
},
],
"messages": [
{
"role": "user",
"content": [
{
"type": "text",
"text": "what are the key terms and conditions in this agreement?",
}
],
}
],
"max_tokens": 4096,
"model": "claude-3-5-sonnet-20240620",
}
mock_post.assert_called_once_with(
expected_url, json=expected_json, headers=expected_headers, timeout=600.0
)


@ -14,7 +14,7 @@ sys.path.insert(
) # Adds the parent directory to the system path
import os
from unittest.mock import MagicMock, patch
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
@ -3474,7 +3474,6 @@ def response_format_tests(response: litellm.ModelResponse):
assert isinstance(response.usage.total_tokens, int) # type: ignore
@pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.parametrize(
"model",
[
@ -3488,6 +3487,7 @@ def response_format_tests(response: litellm.ModelResponse):
"cohere.command-text-v14",
],
)
@pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.asyncio
async def test_completion_bedrock_httpx_models(sync_mode, model):
litellm.set_verbose = True
@ -3730,19 +3730,21 @@ def test_completion_anyscale_api():
# test_completion_anyscale_api()
@pytest.mark.skip(reason="flaky test, times out frequently")
# @pytest.mark.skip(reason="flaky test, times out frequently")
def test_completion_cohere():
try:
# litellm.set_verbose=True
messages = [
{"role": "system", "content": "You're a good bot"},
{"role": "assistant", "content": [{"text": "2", "type": "text"}]},
{"role": "assistant", "content": [{"text": "3", "type": "text"}]},
{
"role": "user",
"content": "Hey",
},
]
response = completion(
model="command-nightly",
model="command-r",
messages=messages,
)
print(response)


@ -1,23 +1,27 @@
# What is this?
## Test to make sure function call response always works with json.loads() -> no extra parsing required. Relevant issue - https://github.com/BerriAI/litellm/issues/2654
import sys, os
import os
import sys
import traceback
from dotenv import load_dotenv
load_dotenv()
import os, io
import io
import os
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import pytest
import litellm
import json
import warnings
from litellm import completion
from typing import List
import pytest
import litellm
from litellm import completion
# Just a stub to keep the sample code simple
class Trade:
@ -78,6 +82,7 @@ def trade(model_name: str) -> List[Trade]:
},
}
try:
response = completion(
model_name,
[
@ -129,7 +134,8 @@ def trade(model_name: str) -> List[Trade]:
"function": {"name": tool_spec["function"]["name"]}, # type: ignore
},
)
except litellm.InternalServerError:
pass
calls = response.choices[0].message.tool_calls
trades = [trade for call in calls for trade in parse_call(call)]
return trades


@ -147,6 +147,117 @@ async def test_basic_gcs_logger():
assert gcs_payload["response_cost"] > 0.0
assert gcs_payload["log_event_type"] == "successful_api_call"
gcs_payload["spend_log_metadata"] = json.loads(gcs_payload["spend_log_metadata"])
assert (
gcs_payload["spend_log_metadata"]["user_api_key"]
== "88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b"
)
assert (
gcs_payload["spend_log_metadata"]["user_api_key_user_id"]
== "116544810872468347480"
)
# Delete Object from GCS
print("deleting object from GCS")
await gcs_logger.delete_gcs_object(object_name=object_name)
@pytest.mark.asyncio
async def test_basic_gcs_logger_failure():
load_vertex_ai_credentials()
gcs_logger = GCSBucketLogger()
print("GCSBucketLogger", gcs_logger)
gcs_log_id = f"failure-test-{uuid.uuid4().hex}"
litellm.callbacks = [gcs_logger]
try:
response = await litellm.acompletion(
model="gpt-3.5-turbo",
temperature=0.7,
messages=[{"role": "user", "content": "This is a test"}],
max_tokens=10,
user="ishaan-2",
mock_response=litellm.BadRequestError(
model="gpt-3.5-turbo",
message="Error: 400: Bad Request: Invalid API key, please check your API key and try again.",
llm_provider="openai",
),
metadata={
"gcs_log_id": gcs_log_id,
"tags": ["model-anthropic-claude-v2.1", "app-ishaan-prod"],
"user_api_key": "88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",
"user_api_key_alias": None,
"user_api_end_user_max_budget": None,
"litellm_api_version": "0.0.0",
"global_max_parallel_requests": None,
"user_api_key_user_id": "116544810872468347480",
"user_api_key_org_id": None,
"user_api_key_team_id": None,
"user_api_key_team_alias": None,
"user_api_key_metadata": {},
"requester_ip_address": "127.0.0.1",
"spend_logs_metadata": {"hello": "world"},
"headers": {
"content-type": "application/json",
"user-agent": "PostmanRuntime/7.32.3",
"accept": "*/*",
"postman-token": "92300061-eeaa-423b-a420-0b44896ecdc4",
"host": "localhost:4000",
"accept-encoding": "gzip, deflate, br",
"connection": "keep-alive",
"content-length": "163",
},
"endpoint": "http://localhost:4000/chat/completions",
"model_group": "gpt-3.5-turbo",
"deployment": "azure/chatgpt-v-2",
"model_info": {
"id": "4bad40a1eb6bebd1682800f16f44b9f06c52a6703444c99c7f9f32e9de3693b4",
"db_model": False,
},
"api_base": "https://openai-gpt-4-test-v-1.openai.azure.com/",
"caching_groups": None,
"raw_request": "\n\nPOST Request Sent from LiteLLM:\ncurl -X POST \\\nhttps://openai-gpt-4-test-v-1.openai.azure.com//openai/ \\\n-H 'Authorization: *****' \\\n-d '{'model': 'chatgpt-v-2', 'messages': [{'role': 'system', 'content': 'you are a helpful assistant.\\n'}, {'role': 'user', 'content': 'bom dia'}], 'stream': False, 'max_tokens': 10, 'user': '116544810872468347480', 'extra_body': {}}'\n",
},
)
except:
pass
await asyncio.sleep(5)
# Get the current date
current_date = datetime.now().strftime("%Y-%m-%d")
# Modify the object_name to include the date-based folder
object_name = gcs_log_id
print("object_name", object_name)
# Check if object landed on GCS
object_from_gcs = await gcs_logger.download_gcs_object(object_name=object_name)
print("object from gcs=", object_from_gcs)
# convert object_from_gcs from bytes to DICT
parsed_data = json.loads(object_from_gcs)
print("object_from_gcs as dict", parsed_data)
print("type of object_from_gcs", type(parsed_data))
gcs_payload = GCSBucketPayload(**parsed_data)
print("gcs_payload", gcs_payload)
assert gcs_payload["request_kwargs"]["model"] == "gpt-3.5-turbo"
assert gcs_payload["request_kwargs"]["messages"] == [
{"role": "user", "content": "This is a test"}
]
assert gcs_payload["response_cost"] == 0
assert gcs_payload["log_event_type"] == "failed_api_call"
gcs_payload["spend_log_metadata"] = json.loads(gcs_payload["spend_log_metadata"])
assert (


@ -1,5 +1,6 @@
import os
import sys
from typing import Optional
import pytest
from fastapi import FastAPI
@ -30,6 +31,7 @@ def client():
async def test_pass_through_endpoint(client, monkeypatch):
# Mock the httpx.AsyncClient.request method
monkeypatch.setattr("httpx.AsyncClient.request", mock_request)
import litellm
# Define a pass-through endpoint
pass_through_endpoints = [
@ -42,6 +44,11 @@ async def test_pass_through_endpoint(client, monkeypatch):
# Initialize the pass-through endpoint
await initialize_pass_through_endpoints(pass_through_endpoints)
general_settings: Optional[dict] = (
getattr(litellm.proxy.proxy_server, "general_settings", {}) or {}
)
general_settings.update({"pass_through_endpoints": pass_through_endpoints})
setattr(litellm.proxy.proxy_server, "general_settings", general_settings)
# Make a request to the pass-through endpoint
response = client.post("/test-endpoint", json={"prompt": "Hello, world!"})
@ -54,6 +61,7 @@ async def test_pass_through_endpoint(client, monkeypatch):
@pytest.mark.asyncio
async def test_pass_through_endpoint_rerank(client):
_cohere_api_key = os.environ.get("COHERE_API_KEY")
import litellm
# Define a pass-through endpoint
pass_through_endpoints = [
@ -66,6 +74,11 @@ async def test_pass_through_endpoint_rerank(client):
# Initialize the pass-through endpoint
await initialize_pass_through_endpoints(pass_through_endpoints)
general_settings: Optional[dict] = (
getattr(litellm.proxy.proxy_server, "general_settings", {}) or {}
)
general_settings.update({"pass_through_endpoints": pass_through_endpoints})
setattr(litellm.proxy.proxy_server, "general_settings", general_settings)
_json_data = {
"model": "rerank-english-v3.0",
@ -87,7 +100,7 @@ async def test_pass_through_endpoint_rerank(client):
@pytest.mark.parametrize(
"auth, rpm_limit, expected_error_code",
[(True, 0, 429), (True, 1, 200), (False, 0, 401)],
[(True, 0, 429), (True, 1, 200), (False, 0, 200)],
)
@pytest.mark.asyncio
async def test_pass_through_endpoint_rpm_limit(auth, expected_error_code, rpm_limit):
@ -123,6 +136,11 @@ async def test_pass_through_endpoint_rpm_limit(auth, expected_error_code, rpm_li
# Initialize the pass-through endpoint
await initialize_pass_through_endpoints(pass_through_endpoints)
general_settings: Optional[dict] = (
getattr(litellm.proxy.proxy_server, "general_settings", {}) or {}
)
general_settings.update({"pass_through_endpoints": pass_through_endpoints})
setattr(litellm.proxy.proxy_server, "general_settings", general_settings)
_json_data = {
"model": "rerank-english-v3.0",
@ -146,6 +164,123 @@ async def test_pass_through_endpoint_rpm_limit(auth, expected_error_code, rpm_li
assert response.status_code == expected_error_code
@pytest.mark.parametrize(
"auth, rpm_limit, expected_error_code",
[(True, 0, 429), (True, 1, 207), (False, 0, 207)],
)
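# Langfuse's public ingestion endpoint returns 207 (Multi-Status) for accepted batches,
# so the "allowed" cases here expect 207 rather than 200.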
@pytest.mark.asyncio
async def test_aaapass_through_endpoint_pass_through_keys_langfuse(
auth, expected_error_code, rpm_limit
):
client = TestClient(app)
import litellm
from litellm.proxy._types import UserAPIKeyAuth
from litellm.proxy.proxy_server import ProxyLogging, hash_token, user_api_key_cache
# Store original values
original_user_api_key_cache = getattr(
litellm.proxy.proxy_server, "user_api_key_cache", None
)
original_master_key = getattr(litellm.proxy.proxy_server, "master_key", None)
original_prisma_client = getattr(litellm.proxy.proxy_server, "prisma_client", None)
original_proxy_logging_obj = getattr(
litellm.proxy.proxy_server, "proxy_logging_obj", None
)
try:
mock_api_key = "sk-my-test-key"
cache_value = UserAPIKeyAuth(
token=hash_token(mock_api_key), rpm_limit=rpm_limit
)
_cohere_api_key = os.environ.get("COHERE_API_KEY")
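# Simulate a valid virtual key: cache the hashed token with the parametrized rpm_limit,
# so auth and rate limiting run against this in-memory entry rather than a real database.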
user_api_key_cache.set_cache(key=hash_token(mock_api_key), value=cache_value)
proxy_logging_obj = ProxyLogging(user_api_key_cache=user_api_key_cache)
proxy_logging_obj._init_litellm_callbacks()
setattr(litellm.proxy.proxy_server, "user_api_key_cache", user_api_key_cache)
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
setattr(litellm.proxy.proxy_server, "prisma_client", "FAKE-VAR")
setattr(litellm.proxy.proxy_server, "proxy_logging_obj", proxy_logging_obj)
# Define a pass-through endpoint
pass_through_endpoints = [
{
"path": "/api/public/ingestion",
"target": "https://cloud.langfuse.com/api/public/ingestion",
"auth": auth,
"custom_auth_parser": "langfuse",
"headers": {
"LANGFUSE_PUBLIC_KEY": "os.environ/LANGFUSE_PUBLIC_KEY",
"LANGFUSE_SECRET_KEY": "os.environ/LANGFUSE_SECRET_KEY",
},
}
]
# Initialize the pass-through endpoint
await initialize_pass_through_endpoints(pass_through_endpoints)
general_settings: Optional[dict] = (
getattr(litellm.proxy.proxy_server, "general_settings", {}) or {}
)
old_general_settings = general_settings
general_settings.update({"pass_through_endpoints": pass_through_endpoints})
setattr(litellm.proxy.proxy_server, "general_settings", general_settings)
_json_data = {
"batch": [
{
"id": "80e2141f-0ca6-47b7-9c06-dde5e97de690",
"type": "trace-create",
"body": {
"id": "0687af7b-4a75-4de8-a4f6-cba1cdc00865",
"timestamp": "2024-08-14T02:38:56.092950Z",
"name": "test-trace-litellm-proxy-passthrough",
},
"timestamp": "2024-08-14T02:38:56.093352Z",
}
],
"metadata": {
"batch_size": 1,
"sdk_integration": "default",
"sdk_name": "python",
"sdk_version": "2.27.0",
"public_key": "anything",
},
}
# Make a request to the pass-through endpoint
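# The Authorization header below is HTTP Basic auth encoding "sk-my-test-key:anything",
# so the langfuse custom_auth_parser should resolve it to the mocked virtual key above.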
response = client.post(
"/api/public/ingestion",
json=_json_data,
headers={"Authorization": "Basic c2stbXktdGVzdC1rZXk6YW55dGhpbmc="},
)
print("JSON response: ", _json_data)
print("RESPONSE RECEIVED - {}".format(response.text))
# Assert the response
assert response.status_code == expected_error_code
setattr(litellm.proxy.proxy_server, "general_settings", old_general_settings)
finally:
# Reset to original values
setattr(
litellm.proxy.proxy_server,
"user_api_key_cache",
original_user_api_key_cache,
)
setattr(litellm.proxy.proxy_server, "master_key", original_master_key)
setattr(litellm.proxy.proxy_server, "prisma_client", original_prisma_client)
setattr(
litellm.proxy.proxy_server, "proxy_logging_obj", original_proxy_logging_obj
)
@pytest.mark.asyncio
async def test_pass_through_endpoint_anthropic(client):
import litellm
@ -178,6 +313,11 @@ async def test_pass_through_endpoint_anthropic(client):
# Initialize the pass-through endpoint
await initialize_pass_through_endpoints(pass_through_endpoints)
general_settings: Optional[dict] = (
getattr(litellm.proxy.proxy_server, "general_settings", {}) or {}
)
general_settings.update({"pass_through_endpoints": pass_through_endpoints})
setattr(litellm.proxy.proxy_server, "general_settings", general_settings)
_json_data = {
"model": "gpt-3.5-turbo",

View file

@ -76,6 +76,6 @@ async def test_async_prometheus_success_logging():
print("metrics from prometheus", metrics)
assert metrics["litellm_requests_metric_total"] == 1.0
assert metrics["litellm_total_tokens_total"] == 30.0
assert metrics["llm_deployment_success_responses_total"] == 1.0
assert metrics["llm_deployment_total_requests_total"] == 1.0
assert metrics["llm_deployment_latency_per_output_token_bucket"] == 1.0
assert metrics["litellm_deployment_success_responses_total"] == 1.0
assert metrics["litellm_deployment_total_requests_total"] == 1.0
assert metrics["litellm_deployment_latency_per_output_token_bucket"] == 1.0

View file

@ -260,3 +260,56 @@ def test_anthropic_messages_tool_call():
translated_messages[-1]["content"][0]["tool_use_id"]
== "bc8cb4b6-88c4-4138-8993-3a9d9cd51656"
)
def test_anthropic_cache_controls_pt():
"see anthropic docs for this: https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching#continuing-a-multi-turn-conversation"
messages = [
# marked for caching with the cache_control parameter, so that this checkpoint can read from the previous cache.
{
"role": "user",
"content": [
{
"type": "text",
"text": "What are the key terms and conditions in this agreement?",
"cache_control": {"type": "ephemeral"},
}
],
},
{
"role": "assistant",
"content": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo",
},
# The final turn is marked with cache_control so the conversation can continue from this cache in follow-ups.
{
"role": "user",
"content": [
{
"type": "text",
"text": "What are the key terms and conditions in this agreement?",
"cache_control": {"type": "ephemeral"},
}
],
},
{
"role": "assistant",
"content": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo",
"cache_control": {"type": "ephemeral"},
},
]
translated_messages = anthropic_messages_pt(
messages, model="claude-3-5-sonnet-20240620", llm_provider="anthropic"
)
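# Only the turns explicitly marked above should carry cache_control after translation.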
for i, msg in enumerate(translated_messages):
if i == 0:
assert msg["content"][0]["cache_control"] == {"type": "ephemeral"}
elif i == 1:
assert "cache_controls" not in msg["content"][0]
elif i == 2:
assert msg["content"][0]["cache_control"] == {"type": "ephemeral"}
elif i == 3:
assert msg["content"][0]["cache_control"] == {"type": "ephemeral"}
print("translated_messages: ", translated_messages)

View file

@ -966,3 +966,203 @@ async def test_user_info_team_list(prisma_client):
pass
mock_client.assert_called()
@pytest.mark.skip(reason="Local test")
@pytest.mark.asyncio
async def test_add_callback_via_key(prisma_client):
"""
Test that the callback specified on the key is used.
"""
global headers
import json
from fastapi import HTTPException, Request, Response
from starlette.datastructures import URL
from litellm.proxy.proxy_server import chat_completion
setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
await litellm.proxy.proxy_server.prisma_client.connect()
litellm.set_verbose = True
try:
# Your test data
test_data = {
"model": "azure/chatgpt-v-2",
"messages": [
{"role": "user", "content": "write 1 sentence poem"},
],
"max_tokens": 10,
"mock_response": "Hello world",
"api_key": "my-fake-key",
}
request = Request(scope={"type": "http", "method": "POST", "headers": {}})
request._url = URL(url="/chat/completions")
json_bytes = json.dumps(test_data).encode("utf-8")
request._body = json_bytes
with patch.object(
litellm.litellm_core_utils.litellm_logging,
"LangFuseLogger",
new=MagicMock(),
) as mock_client:
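# The proxy should pick up the key's "logging" metadata and initialize a LangFuseLogger;
# patching it lets us assert the callback fires without real Langfuse credentials.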
resp = await chat_completion(
request=request,
fastapi_response=Response(),
user_api_key_dict=UserAPIKeyAuth(
metadata={
"logging": [
{
"callback_name": "langfuse", # 'otel', 'langfuse', 'lunary'
"callback_type": "success", # set, if required by integration - future improvement, have logging tools work for success + failure by default
"callback_vars": {
"langfuse_public_key": "os.environ/LANGFUSE_PUBLIC_KEY",
"langfuse_secret_key": "os.environ/LANGFUSE_SECRET_KEY",
"langfuse_host": "https://us.cloud.langfuse.com",
},
}
]
}
),
)
print(resp)
mock_client.assert_called()
mock_client.return_value.log_event.assert_called()
args, kwargs = mock_client.return_value.log_event.call_args
kwargs = kwargs["kwargs"]
assert "user_api_key_metadata" in kwargs["litellm_params"]["metadata"]
assert (
"logging"
in kwargs["litellm_params"]["metadata"]["user_api_key_metadata"]
)
checked_keys = False
for item in kwargs["litellm_params"]["metadata"]["user_api_key_metadata"][
"logging"
]:
for k, v in item["callback_vars"].items():
print("k={}, v={}".format(k, v))
if "key" in k:
assert "os.environ" in v
checked_keys = True
assert checked_keys
except Exception as e:
pytest.fail(f"LiteLLM Proxy test failed. Exception - {str(e)}")
@pytest.mark.asyncio
async def test_add_callback_via_key_litellm_pre_call_utils(prisma_client):
import json
from fastapi import HTTPException, Request, Response
from starlette.datastructures import URL
from litellm.proxy.litellm_pre_call_utils import add_litellm_data_to_request
setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
await litellm.proxy.proxy_server.prisma_client.connect()
proxy_config = getattr(litellm.proxy.proxy_server, "proxy_config")
request = Request(scope={"type": "http", "method": "POST", "headers": {}})
request._url = URL(url="/chat/completions")
test_data = {
"model": "azure/chatgpt-v-2",
"messages": [
{"role": "user", "content": "write 1 sentence poem"},
],
"max_tokens": 10,
"mock_response": "Hello world",
"api_key": "my-fake-key",
}
json_bytes = json.dumps(test_data).encode("utf-8")
request._body = json_bytes
data = {
"data": {
"model": "azure/chatgpt-v-2",
"messages": [{"role": "user", "content": "write 1 sentence poem"}],
"max_tokens": 10,
"mock_response": "Hello world",
"api_key": "my-fake-key",
},
"request": request,
"user_api_key_dict": UserAPIKeyAuth(
token=None,
key_name=None,
key_alias=None,
spend=0.0,
max_budget=None,
expires=None,
models=[],
aliases={},
config={},
user_id=None,
team_id=None,
max_parallel_requests=None,
metadata={
"logging": [
{
"callback_name": "langfuse",
"callback_type": "success",
"callback_vars": {
"langfuse_public_key": "os.environ/LANGFUSE_PUBLIC_KEY",
"langfuse_secret_key": "os.environ/LANGFUSE_SECRET_KEY",
"langfuse_host": "https://us.cloud.langfuse.com",
},
}
]
},
tpm_limit=None,
rpm_limit=None,
budget_duration=None,
budget_reset_at=None,
allowed_cache_controls=[],
permissions={},
model_spend={},
model_max_budget={},
soft_budget_cooldown=False,
litellm_budget_table=None,
org_id=None,
team_spend=None,
team_alias=None,
team_tpm_limit=None,
team_rpm_limit=None,
team_max_budget=None,
team_models=[],
team_blocked=False,
soft_budget=None,
team_model_aliases=None,
team_member_spend=None,
team_metadata=None,
end_user_id=None,
end_user_tpm_limit=None,
end_user_rpm_limit=None,
end_user_max_budget=None,
last_refreshed_at=None,
api_key=None,
user_role=None,
allowed_model_region=None,
parent_otel_span=None,
),
"proxy_config": proxy_config,
"general_settings": {},
"version": "0.0.0",
}
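# add_litellm_data_to_request should translate the key's "logging" metadata into
# request-level callback settings (success_callback + langfuse_* vars), asserted below.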
new_data = await add_litellm_data_to_request(**data)
assert "success_callback" in new_data
assert new_data["success_callback"] == ["langfuse"]
assert "langfuse_public_key" in new_data
assert "langfuse_secret_key" in new_data

View file

@ -15,9 +15,10 @@ class AnthropicMessagesTool(TypedDict, total=False):
input_schema: Required[dict]
class AnthropicMessagesTextParam(TypedDict):
class AnthropicMessagesTextParam(TypedDict, total=False):
type: Literal["text"]
text: str
cache_control: Optional[dict]
class AnthropicMessagesToolUseParam(TypedDict):
@ -54,9 +55,10 @@ class AnthropicImageParamSource(TypedDict):
data: str
class AnthropicMessagesImageParam(TypedDict):
class AnthropicMessagesImageParam(TypedDict, total=False):
type: Literal["image"]
source: AnthropicImageParamSource
cache_control: Optional[dict]
class AnthropicMessagesToolResultContent(TypedDict):
@ -92,6 +94,12 @@ class AnthropicMetadata(TypedDict, total=False):
user_id: str
class AnthropicSystemMessageContent(TypedDict, total=False):
type: str
text: str
cache_control: Optional[dict]
class AnthropicMessagesRequest(TypedDict, total=False):
model: Required[str]
messages: Required[
@ -106,7 +114,7 @@ class AnthropicMessagesRequest(TypedDict, total=False):
metadata: AnthropicMetadata
stop_sequences: List[str]
stream: bool
system: str
system: Union[str, List]
temperature: float
tool_choice: AnthropicMessagesToolChoice
tools: List[AnthropicMessagesTool]

View file

@ -361,7 +361,7 @@ class ChatCompletionToolMessage(TypedDict):
class ChatCompletionSystemMessage(TypedDict, total=False):
role: Required[Literal["system"]]
content: Required[str]
content: Required[Union[str, List]]
name: str

View file

@ -2074,7 +2074,8 @@
"litellm_provider": "vertex_ai-anthropic_models",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true
"supports_vision": true,
"supports_assistant_prefill": true
},
"vertex_ai/claude-3-5-sonnet@20240620": {
"max_tokens": 4096,
@ -2085,7 +2086,8 @@
"litellm_provider": "vertex_ai-anthropic_models",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true
"supports_vision": true,
"supports_assistant_prefill": true
},
"vertex_ai/claude-3-haiku@20240307": {
"max_tokens": 4096,
@ -2096,7 +2098,8 @@
"litellm_provider": "vertex_ai-anthropic_models",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true
"supports_vision": true,
"supports_assistant_prefill": true
},
"vertex_ai/claude-3-opus@20240229": {
"max_tokens": 4096,
@ -2107,7 +2110,8 @@
"litellm_provider": "vertex_ai-anthropic_models",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true
"supports_vision": true,
"supports_assistant_prefill": true
},
"vertex_ai/meta/llama3-405b-instruct-maas": {
"max_tokens": 32000,
@ -4531,6 +4535,69 @@
"litellm_provider": "perplexity",
"mode": "chat"
},
"perplexity/llama-3.1-70b-instruct": {
"max_tokens": 131072,
"max_input_tokens": 131072,
"max_output_tokens": 131072,
"input_cost_per_token": 0.000001,
"output_cost_per_token": 0.000001,
"litellm_provider": "perplexity",
"mode": "chat"
},
"perplexity/llama-3.1-8b-instruct": {
"max_tokens": 131072,
"max_input_tokens": 131072,
"max_output_tokens": 131072,
"input_cost_per_token": 0.0000002,
"output_cost_per_token": 0.0000002,
"litellm_provider": "perplexity",
"mode": "chat"
},
"perplexity/llama-3.1-sonar-huge-128k-online": {
"max_tokens": 127072,
"max_input_tokens": 127072,
"max_output_tokens": 127072,
"input_cost_per_token": 0.000005,
"output_cost_per_token": 0.000005,
"litellm_provider": "perplexity",
"mode": "chat"
},
"perplexity/llama-3.1-sonar-large-128k-online": {
"max_tokens": 127072,
"max_input_tokens": 127072,
"max_output_tokens": 127072,
"input_cost_per_token": 0.000001,
"output_cost_per_token": 0.000001,
"litellm_provider": "perplexity",
"mode": "chat"
},
"perplexity/llama-3.1-sonar-large-128k-chat": {
"max_tokens": 131072,
"max_input_tokens": 131072,
"max_output_tokens": 131072,
"input_cost_per_token": 0.000001,
"output_cost_per_token": 0.000001,
"litellm_provider": "perplexity",
"mode": "chat"
},
"perplexity/llama-3.1-sonar-small-128k-chat": {
"max_tokens": 131072,
"max_input_tokens": 131072,
"max_output_tokens": 131072,
"input_cost_per_token": 0.0000002,
"output_cost_per_token": 0.0000002,
"litellm_provider": "perplexity",
"mode": "chat"
},
"perplexity/llama-3.1-sonar-small-128k-online": {
"max_tokens": 127072,
"max_input_tokens": 127072,
"max_output_tokens": 127072,
"input_cost_per_token": 0.0000002,
"output_cost_per_token": 0.0000002,
"litellm_provider": "perplexity",
"mode": "chat"
},
"perplexity/pplx-7b-chat": {
"max_tokens": 8192,
"max_input_tokens": 8192,

View file

@ -1,6 +1,6 @@
[tool.poetry]
name = "litellm"
version = "1.43.10"
version = "1.43.13"
description = "Library to easily interface with LLM API providers"
authors = ["BerriAI"]
license = "MIT"
@ -91,7 +91,7 @@ requires = ["poetry-core", "wheel"]
build-backend = "poetry.core.masonry.api"
[tool.commitizen]
version = "1.43.10"
version = "1.43.13"
version_files = [
"pyproject.toml:^version"
]