litellm/tests/local_testing/test_anthropic_prompt_caching.py

import json
import os
import sys
import traceback

from dotenv import load_dotenv

load_dotenv()
import io
import os

from test_streaming import streaming_format_tests

sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path
import os
from unittest.mock import AsyncMock, MagicMock, patch

import pytest

import litellm
from litellm import RateLimitError, Timeout, completion, completion_cost, embedding
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.llms.prompt_templates.factory import anthropic_messages_pt

# litellm.num_retries = 3
litellm.cache = None
litellm.success_callback = []
user_message = "Write a short poem about the sky"
messages = [{"content": user_message, "role": "user"}]


def logger_fn(user_model_dict):
    print(f"user_model_dict: {user_model_dict}")


@pytest.fixture(autouse=True)
def reset_callbacks():
    print("\npytest fixture - resetting callbacks")
    litellm.success_callback = []
    litellm._async_success_callback = []
    litellm.failure_callback = []
    litellm.callbacks = []


@pytest.mark.asyncio
async def test_litellm_anthropic_prompt_caching_tools():
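    """
    Mock the Anthropic /v1/messages endpoint and assert that a tool definition marked
    with `cache_control: {"type": "ephemeral"}` is forwarded to the API unchanged,
    together with the `anthropic-beta: prompt-caching-2024-07-31` header.
    """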
# Arrange: Set up the MagicMock for the httpx.AsyncClient
mock_response = AsyncMock()
def return_val():
return {
"id": "msg_01XFDUDYJgAACzvnptvVoYEL",
"type": "message",
"role": "assistant",
"content": [{"type": "text", "text": "Hello!"}],
"model": "claude-3-5-sonnet-20240620",
"stop_reason": "end_turn",
"stop_sequence": None,
"usage": {"input_tokens": 12, "output_tokens": 6},
}
mock_response.json = return_val
mock_response.headers = {"key": "value"}
litellm.set_verbose = True
with patch(
"litellm.llms.custom_httpx.http_handler.AsyncHTTPHandler.post",
return_value=mock_response,
) as mock_post:
# Act: Call the litellm.acompletion function
response = await litellm.acompletion(
api_key="mock_api_key",
model="anthropic/claude-3-5-sonnet-20240620",
messages=[
{"role": "user", "content": "What's the weather like in Boston today?"}
],
tools=[
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"unit": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
},
},
"required": ["location"],
},
"cache_control": {"type": "ephemeral"},
},
}
],
extra_headers={
"anthropic-version": "2023-06-01",
"anthropic-beta": "prompt-caching-2024-07-31",
},
)
# Print what was called on the mock
print("call args=", mock_post.call_args)
expected_url = "https://api.anthropic.com/v1/messages"
expected_headers = {
"accept": "application/json",
"content-type": "application/json",
"anthropic-version": "2023-06-01",
"anthropic-beta": "prompt-caching-2024-07-31",
"x-api-key": "mock_api_key",
}
expected_json = {
"messages": [
{
"role": "user",
"content": [
{
"type": "text",
"text": "What's the weather like in Boston today?",
}
],
}
],
"tools": [
{
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"cache_control": {"type": "ephemeral"},
"input_schema": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"unit": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
},
},
"required": ["location"],
},
}
],
"max_tokens": 4096,
"model": "claude-3-5-sonnet-20240620",
}
mock_post.assert_called_once_with(
expected_url, json=expected_json, headers=expected_headers, timeout=600.0
        )


@pytest.mark.asyncio()
async def test_anthropic_api_prompt_caching_basic():
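    """
    Live API call: a long, `cache_control`-marked system prompt should surface
    `cache_creation_input_tokens` and `cache_read_input_tokens` in the usage block,
    with at least one of them non-zero depending on whether the cache entry is still warm.
    """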
litellm.set_verbose = True
response = await litellm.acompletion(
model="anthropic/claude-3-5-sonnet-20240620",
messages=[
# System Message
{
"role": "system",
"content": [
{
"type": "text",
"text": "Here is the full text of a complex legal agreement"
* 400,
"cache_control": {"type": "ephemeral"},
}
],
},
# marked for caching with the cache_control parameter, so that this checkpoint can read from the previous cache.
{
"role": "user",
"content": [
{
"type": "text",
"text": "What are the key terms and conditions in this agreement?",
"cache_control": {"type": "ephemeral"},
}
],
},
{
"role": "assistant",
"content": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo",
},
# The final turn is marked with cache-control, for continuing in followups.
{
"role": "user",
"content": [
{
"type": "text",
"text": "What are the key terms and conditions in this agreement?",
"cache_control": {"type": "ephemeral"},
}
],
},
],
temperature=0.2,
max_tokens=10,
extra_headers={
"anthropic-version": "2023-06-01",
"anthropic-beta": "prompt-caching-2024-07-31",
},
)
print("response=", response)
assert "cache_read_input_tokens" in response.usage
assert "cache_creation_input_tokens" in response.usage
# Assert either a cache entry was created or cache was read - changes depending on the anthropic api ttl
assert (response.usage.cache_read_input_tokens > 0) or (
response.usage.cache_creation_input_tokens > 0
    )


@pytest.mark.asyncio()
async def test_anthropic_api_prompt_caching_with_content_str():
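    """
    Verify that string-content messages carrying `cache_control` are translated into the
    content-block (list) format that the Anthropic API expects.
    """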
from litellm.llms.prompt_templates.factory import anthropic_messages_pt
system_message = [
{
"role": "system",
"content": "Here is the full text of a complex legal agreement",
"cache_control": {"type": "ephemeral"},
},
]
translated_system_message = litellm.AnthropicConfig().translate_system_message(
messages=system_message
)
assert translated_system_message == [
# System Message
{
"type": "text",
"text": "Here is the full text of a complex legal agreement",
"cache_control": {"type": "ephemeral"},
}
]
user_messages = [
# marked for caching with the cache_control parameter, so that this checkpoint can read from the previous cache.
{
"role": "user",
"content": "What are the key terms and conditions in this agreement?",
"cache_control": {"type": "ephemeral"},
},
{
"role": "assistant",
"content": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo",
},
# The final turn is marked with cache-control, for continuing in followups.
{
"role": "user",
"content": "What are the key terms and conditions in this agreement?",
"cache_control": {"type": "ephemeral"},
},
]
translated_messages = anthropic_messages_pt(
messages=user_messages,
model="claude-3-5-sonnet-20240620",
llm_provider="anthropic",
)
expected_messages = [
{
"role": "user",
"content": [
{
"type": "text",
"text": "What are the key terms and conditions in this agreement?",
"cache_control": {"type": "ephemeral"},
}
],
},
{
"role": "assistant",
"content": [
{
"type": "text",
"text": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo",
}
],
},
# The final turn is marked with cache-control, for continuing in followups.
{
"role": "user",
"content": [
{
"type": "text",
"text": "What are the key terms and conditions in this agreement?",
"cache_control": {"type": "ephemeral"},
}
],
},
]
assert len(translated_messages) == len(expected_messages)
for idx, i in enumerate(translated_messages):
assert (
i == expected_messages[idx]
), "Error on idx={}. Got={}, Expected={}".format(idx, i, expected_messages[idx])
@pytest.mark.asyncio()
async def test_anthropic_api_prompt_caching_no_headers():
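    """
    Same as the basic prompt-caching test, but without passing the explicit
    `anthropic-version` / `anthropic-beta` headers.
    """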
litellm.set_verbose = True
response = await litellm.acompletion(
model="anthropic/claude-3-5-sonnet-20240620",
messages=[
# System Message
{
"role": "system",
"content": [
{
"type": "text",
"text": "Here is the full text of a complex legal agreement"
* 400,
"cache_control": {"type": "ephemeral"},
}
],
},
# marked for caching with the cache_control parameter, so that this checkpoint can read from the previous cache.
{
"role": "user",
"content": [
{
"type": "text",
"text": "What are the key terms and conditions in this agreement?",
"cache_control": {"type": "ephemeral"},
}
],
},
{
"role": "assistant",
"content": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo",
},
# The final turn is marked with cache-control, for continuing in followups.
{
"role": "user",
"content": [
{
"type": "text",
"text": "What are the key terms and conditions in this agreement?",
"cache_control": {"type": "ephemeral"},
}
],
},
],
temperature=0.2,
max_tokens=10,
)
print("response=", response)
assert "cache_read_input_tokens" in response.usage
assert "cache_creation_input_tokens" in response.usage
# Assert either a cache entry was created or cache was read - changes depending on the anthropic api ttl
assert (response.usage.cache_read_input_tokens > 0) or (
response.usage.cache_creation_input_tokens > 0
    )


@pytest.mark.asyncio()
@pytest.mark.flaky(retries=3, delay=1)
async def test_anthropic_api_prompt_caching_streaming():
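    """
    Streaming variant: with `stream_options={"include_usage": True}`, the final chunk's
    usage object should expose both `cache_read_input_tokens` and
    `cache_creation_input_tokens`.
    """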
response = await litellm.acompletion(
model="anthropic/claude-3-5-sonnet-20240620",
messages=[
# System Message
{
"role": "system",
"content": [
{
"type": "text",
"text": "Here is the full text of a complex legal agreement"
* 400,
"cache_control": {"type": "ephemeral"},
}
],
},
# marked for caching with the cache_control parameter, so that this checkpoint can read from the previous cache.
{
"role": "user",
"content": [
{
"type": "text",
"text": "What are the key terms and conditions in this agreement?",
"cache_control": {"type": "ephemeral"},
}
],
},
{
"role": "assistant",
"content": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo",
},
# The final turn is marked with cache-control, for continuing in followups.
{
"role": "user",
"content": [
{
"type": "text",
"text": "What are the key terms and conditions in this agreement?",
"cache_control": {"type": "ephemeral"},
}
],
},
],
temperature=0.2,
max_tokens=10,
stream=True,
stream_options={"include_usage": True},
)
idx = 0
is_cache_read_input_tokens_in_usage = False
is_cache_creation_input_tokens_in_usage = False
async for chunk in response:
streaming_format_tests(idx=idx, chunk=chunk)
# Assert either a cache entry was created or cache was read - changes depending on the anthropic api ttl
if hasattr(chunk, "usage"):
print("Received final usage - {}".format(chunk.usage))
if hasattr(chunk, "usage") and hasattr(chunk.usage, "cache_read_input_tokens"):
is_cache_read_input_tokens_in_usage = True
if hasattr(chunk, "usage") and hasattr(
chunk.usage, "cache_creation_input_tokens"
):
is_cache_creation_input_tokens_in_usage = True
idx += 1
print("response=", response)
assert (
is_cache_read_input_tokens_in_usage and is_cache_creation_input_tokens_in_usage
    )


@pytest.mark.asyncio
async def test_litellm_anthropic_prompt_caching_system():
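    """
    Mock test for system-prompt caching: the `cache_control`-marked system block should be
    sent in the top-level `system` array of the Anthropic request body.
    """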
# https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching#prompt-caching-examples
    # Large Context Caching Example
mock_response = AsyncMock()
def return_val():
return {
"id": "msg_01XFDUDYJgAACzvnptvVoYEL",
"type": "message",
"role": "assistant",
"content": [{"type": "text", "text": "Hello!"}],
"model": "claude-3-5-sonnet-20240620",
"stop_reason": "end_turn",
"stop_sequence": None,
"usage": {"input_tokens": 12, "output_tokens": 6},
}
mock_response.json = return_val
mock_response.headers = {"key": "value"}
litellm.set_verbose = True
with patch(
"litellm.llms.custom_httpx.http_handler.AsyncHTTPHandler.post",
return_value=mock_response,
) as mock_post:
# Act: Call the litellm.acompletion function
response = await litellm.acompletion(
api_key="mock_api_key",
model="anthropic/claude-3-5-sonnet-20240620",
messages=[
{
"role": "system",
"content": [
{
"type": "text",
"text": "You are an AI assistant tasked with analyzing legal documents.",
},
{
"type": "text",
"text": "Here is the full text of a complex legal agreement",
"cache_control": {"type": "ephemeral"},
},
],
},
{
"role": "user",
"content": "what are the key terms and conditions in this agreement?",
},
],
extra_headers={
"anthropic-version": "2023-06-01",
"anthropic-beta": "prompt-caching-2024-07-31",
},
)
# Print what was called on the mock
print("call args=", mock_post.call_args)
expected_url = "https://api.anthropic.com/v1/messages"
expected_headers = {
"accept": "application/json",
"content-type": "application/json",
"anthropic-version": "2023-06-01",
"anthropic-beta": "prompt-caching-2024-07-31",
"x-api-key": "mock_api_key",
}
expected_json = {
"system": [
{
"type": "text",
"text": "You are an AI assistant tasked with analyzing legal documents.",
},
{
"type": "text",
"text": "Here is the full text of a complex legal agreement",
"cache_control": {"type": "ephemeral"},
},
],
"messages": [
{
"role": "user",
"content": [
{
"type": "text",
"text": "what are the key terms and conditions in this agreement?",
}
],
}
],
"max_tokens": 4096,
"model": "claude-3-5-sonnet-20240620",
}
mock_post.assert_called_once_with(
expected_url, json=expected_json, headers=expected_headers, timeout=600.0
)