diff --git a/docs/my-website/docs/anthropic_unified.md b/docs/my-website/docs/anthropic_unified.md index cf6ba798d5..485571aa28 100644 --- a/docs/my-website/docs/anthropic_unified.md +++ b/docs/my-website/docs/anthropic_unified.md @@ -3,9 +3,10 @@ import TabItem from '@theme/TabItem'; # /v1/messages [BETA] -LiteLLM provides a BETA endpoint in the spec of Anthropic's `/v1/messages` endpoint. +Use LiteLLM to call all your LLM APIs in the Anthropic `v1/messages` format. -This currently just supports the Anthropic API. + +## Overview | Feature | Supported | Notes | |-------|-------|-------| @@ -21,9 +22,61 @@ Planned improvement: - Bedrock Anthropic support ## Usage +--- + +### LiteLLM Python SDK + +#### Non-streaming example +```python showLineNumbers title="Example using LiteLLM Python SDK" +import litellm +response = await litellm.anthropic.messages.acreate( + messages=[{"role": "user", "content": "Hello, can you tell me a short joke?"}], + api_key=api_key, + model="anthropic/claude-3-haiku-20240307", + max_tokens=100, +) +``` + +Example response: +```json +{ + "content": [ + { + "text": "Hi! this is a very short joke", + "type": "text" + } + ], + "id": "msg_013Zva2CMHLNnXjNJJKqJ2EF", + "model": "claude-3-7-sonnet-20250219", + "role": "assistant", + "stop_reason": "end_turn", + "stop_sequence": null, + "type": "message", + "usage": { + "input_tokens": 2095, + "output_tokens": 503, + "cache_creation_input_tokens": 2095, + "cache_read_input_tokens": 0 + } +} +``` + +#### Streaming example +```python showLineNumbers title="Example using LiteLLM Python SDK" +import litellm +response = await litellm.anthropic.messages.acreate( + messages=[{"role": "user", "content": "Hello, can you tell me a short joke?"}], + api_key=api_key, + model="anthropic/claude-3-haiku-20240307", + max_tokens=100, + stream=True, +) +async for chunk in response: + print(chunk) +``` + +### LiteLLM Proxy Server - - 1. Setup config.yaml @@ -42,7 +95,28 @@ litellm --config /path/to/config.yaml 3. Test it! -```bash + + + +```python showLineNumbers title="Example using LiteLLM Proxy Server" +import anthropic + +# point anthropic sdk to litellm proxy +client = anthropic.Anthropic( + base_url="http://0.0.0.0:4000", + api_key="sk-1234", +) + +response = client.messages.create( + messages=[{"role": "user", "content": "Hello, can you tell me a short joke?"}], + model="anthropic/claude-3-haiku-20240307", + max_tokens=100, +) +``` + + + +```bash showLineNumbers title="Example using LiteLLM Proxy Server" curl -L -X POST 'http://0.0.0.0:4000/v1/messages' \ -H 'content-type: application/json' \ -H 'x-api-key: $LITELLM_API_KEY' \ @@ -52,41 +126,176 @@ curl -L -X POST 'http://0.0.0.0:4000/v1/messages' \ "messages": [ { "role": "user", - "content": [ - { - "type": "text", - "text": "List 5 important events in the XIX century" - } - ] + "content": "Hello, can you tell me a short joke?" } ], - "max_tokens": 4096 + "max_tokens": 100 }' ``` + - + -```python -from litellm.llms.anthropic.experimental_pass_through.messages.handler import anthropic_messages -import asyncio -import os -# set env -os.environ["ANTHROPIC_API_KEY"] = "my-api-key" +## Request Format +--- -messages = [{"role": "user", "content": "Hello, can you tell me a short joke?"}] +Request body will be in the Anthropic messages API format. 
**litellm follows the Anthropic messages specification for this endpoint.** -# Call the handler -async def call(): - response = await anthropic_messages( - messages=messages, - api_key=api_key, - model="claude-3-haiku-20240307", - max_tokens=100, - ) +#### Example request body -asyncio.run(call()) +```json +{ + "model": "claude-3-7-sonnet-20250219", + "max_tokens": 1024, + "messages": [ + { + "role": "user", + "content": "Hello, world" + } + ] +} ``` - - \ No newline at end of file +#### Required Fields +- **model** (string): + The model identifier (e.g., `"claude-3-7-sonnet-20250219"`). +- **max_tokens** (integer): + The maximum number of tokens to generate before stopping. + _Note: The model may stop before reaching this limit; value must be greater than 1._ +- **messages** (array of objects): + An ordered list of conversational turns. + Each message object must include: + - **role** (enum: `"user"` or `"assistant"`): + Specifies the speaker of the message. + - **content** (string or array of content blocks): + The text or content blocks (e.g., an array containing objects with a `type` such as `"text"`) that form the message. + _Example equivalence:_ + ```json + {"role": "user", "content": "Hello, Claude"} + ``` + is equivalent to: + ```json + {"role": "user", "content": [{"type": "text", "text": "Hello, Claude"}]} + ``` + +#### Optional Fields +- **metadata** (object): + Contains additional metadata about the request (e.g., `user_id` as an opaque identifier). +- **stop_sequences** (array of strings): + Custom sequences that, when encountered in the generated text, cause the model to stop. +- **stream** (boolean): + Indicates whether to stream the response using server-sent events. +- **system** (string or array): + A system prompt providing context or specific instructions to the model. +- **temperature** (number): + Controls randomness in the model’s responses. Valid range: `0 < temperature < 1`. +- **thinking** (object): + Configuration for enabling extended thinking. If enabled, it includes: + - **budget_tokens** (integer): + Minimum of 1024 tokens (and less than `max_tokens`). + - **type** (enum): + E.g., `"enabled"`. +- **tool_choice** (object): + Instructs how the model should utilize any provided tools. +- **tools** (array of objects): + Definitions for tools available to the model. Each tool includes: + - **name** (string): + The tool’s name. + - **description** (string): + A detailed description of the tool. + - **input_schema** (object): + A JSON schema describing the expected input format for the tool. +- **top_k** (integer): + Limits sampling to the top K options. +- **top_p** (number): + Enables nucleus sampling with a cumulative probability cutoff. Valid range: `0 < top_p < 1`. + + +## Response Format +--- + +Responses will be in the Anthropic messages API format. + +#### Example Response + +```json +{ + "content": [ + { + "text": "Hi! My name is Claude.", + "type": "text" + } + ], + "id": "msg_013Zva2CMHLNnXjNJJKqJ2EF", + "model": "claude-3-7-sonnet-20250219", + "role": "assistant", + "stop_reason": "end_turn", + "stop_sequence": null, + "type": "message", + "usage": { + "input_tokens": 2095, + "output_tokens": 503, + "cache_creation_input_tokens": 2095, + "cache_read_input_tokens": 0 + } +} +``` + +#### Response fields + +- **content** (array of objects): + Contains the generated content blocks from the model. Each block includes: + - **type** (string): + Indicates the type of content (e.g., `"text"`, `"tool_use"`, `"thinking"`, or `"redacted_thinking"`). 
+ - **text** (string): + The generated text from the model. + _Note: Maximum length is 5,000,000 characters._ + - **citations** (array of objects or `null`): + Optional field providing citation details. Each citation includes: + - **cited_text** (string): + The excerpt being cited. + - **document_index** (integer): + An index referencing the cited document. + - **document_title** (string or `null`): + The title of the cited document. + - **start_char_index** (integer): + The starting character index for the citation. + - **end_char_index** (integer): + The ending character index for the citation. + - **type** (string): + Typically `"char_location"`. + +- **id** (string): + A unique identifier for the response message. + _Note: The format and length of IDs may change over time._ + +- **model** (string): + Specifies the model that generated the response. + +- **role** (string): + Indicates the role of the generated message. For responses, this is always `"assistant"`. + +- **stop_reason** (string): + Explains why the model stopped generating text. Possible values include: + - `"end_turn"`: The model reached a natural stopping point. + - `"max_tokens"`: The generation stopped because the maximum token limit was reached. + - `"stop_sequence"`: A custom stop sequence was encountered. + - `"tool_use"`: The model invoked one or more tools. + +- **stop_sequence** (string or `null`): + Contains the specific stop sequence that caused the generation to halt, if applicable; otherwise, it is `null`. + +- **type** (string): + Denotes the type of response object, which is always `"message"`. + +- **usage** (object): + Provides details on token usage for billing and rate limiting. This includes: + - **input_tokens** (integer): + Total number of input tokens processed. + - **output_tokens** (integer): + Total number of output tokens generated. + - **cache_creation_input_tokens** (integer or `null`): + Number of tokens used to create a cache entry. + - **cache_read_input_tokens** (integer or `null`): + Number of tokens read from the cache. 
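
To tie the request and response formats above together, here is a hedged sketch of calling the new interface with a tool definition and reading a `tool_use` block back. The weather-tool name and JSON schema are illustrative assumptions (not part of this PR), and the sketch assumes the non-streaming call returns a plain dict in the Anthropic format shown above, as the interface docstring suggests.

```python showLineNumbers title="Sketch: tool use via the /v1/messages interface"
import asyncio
import os

import litellm

# Illustrative tool definition; the name and schema are assumptions for this sketch,
# not something defined by LiteLLM or this PR.
weather_tool = {
    "name": "get_weather",
    "description": "Get the current weather for a given city",
    "input_schema": {
        "type": "object",
        "properties": {"city": {"type": "string"}},
        "required": ["city"],
    },
}


async def main():
    # Same interface as the non-streaming example above, with the optional
    # `tools` / `tool_choice` fields from the request format section filled in.
    response = await litellm.anthropic.messages.acreate(
        model="anthropic/claude-3-haiku-20240307",
        api_key=os.getenv("ANTHROPIC_API_KEY"),
        max_tokens=1024,
        messages=[{"role": "user", "content": "What is the weather in Paris?"}],
        tools=[weather_tool],
        tool_choice={"type": "auto"},
    )

    # Assumes the response is a dict in the Anthropic format shown above:
    # a tool call surfaces as a "tool_use" content block, plain text as a "text" block.
    for block in response.get("content") or []:
        if block.get("type") == "tool_use":
            print("tool call:", block.get("name"), block.get("input"))
        elif block.get("type") == "text":
            print("text:", block.get("text"))


asyncio.run(main())
```

With `stream=True` the same call instead yields chunks asynchronously, as in the streaming example above.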
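Because the LiteLLM proxy exposes the same `/v1/messages` spec, the official Anthropic SDK's streaming helper can also be pointed at it. A hedged sketch follows: the `base_url`, `api_key`, and `anthropic-claude` model alias reuse the proxy example values above, and `client.messages.stream` / `text_stream` are standard Anthropic SDK usage rather than anything added by this PR.

```python showLineNumbers title="Sketch: streaming through the LiteLLM proxy with the Anthropic SDK"
import anthropic

# Point the official Anthropic SDK at the LiteLLM proxy (same values as the proxy example above)
client = anthropic.Anthropic(
    base_url="http://0.0.0.0:4000",
    api_key="sk-1234",
)

# The SDK's streaming helper consumes the server-sent events enabled by the `stream` field above
with client.messages.stream(
    model="anthropic-claude",
    max_tokens=100,
    messages=[{"role": "user", "content": "Hello, can you tell me a short joke?"}],
) as stream:
    for text in stream.text_stream:
        print(text, end="", flush=True)
    final_message = stream.get_final_message()

print()
print(final_message.stop_reason)  # e.g. "end_turn", per the response fields above
```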
diff --git a/litellm/__init__.py b/litellm/__init__.py index c2e366e2b1..9997b9a8ac 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -1038,6 +1038,7 @@ from .cost_calculator import response_cost_calculator, cost_per_token ### ADAPTERS ### from .types.adapter import AdapterItem +import litellm.anthropic_interface as anthropic adapters: List[AdapterItem] = [] diff --git a/litellm/anthropic_interface/__init__.py b/litellm/anthropic_interface/__init__.py new file mode 100644 index 0000000000..9902fdc553 --- /dev/null +++ b/litellm/anthropic_interface/__init__.py @@ -0,0 +1,6 @@ +""" +Anthropic module for LiteLLM +""" +from .messages import acreate, create + +__all__ = ["acreate", "create"] diff --git a/litellm/anthropic_interface/messages/__init__.py b/litellm/anthropic_interface/messages/__init__.py new file mode 100644 index 0000000000..f3249f981b --- /dev/null +++ b/litellm/anthropic_interface/messages/__init__.py @@ -0,0 +1,117 @@ +""" +Interface for Anthropic's messages API + +Use this to call LLMs in Anthropic /messages Request/Response format + +This is an __init__.py file to allow the following interface + +- litellm.messages.acreate +- litellm.messages.create + +""" + +from typing import AsyncIterator, Dict, Iterator, List, Optional, Union + +from litellm.llms.anthropic.experimental_pass_through.messages.handler import ( + anthropic_messages as _async_anthropic_messages, +) +from litellm.types.llms.anthropic_messages.anthropic_response import ( + AnthropicMessagesResponse, +) + + +async def acreate( + max_tokens: int, + messages: List[Dict], + model: str, + metadata: Optional[Dict] = None, + stop_sequences: Optional[List[str]] = None, + stream: Optional[bool] = False, + system: Optional[str] = None, + temperature: Optional[float] = 1.0, + thinking: Optional[Dict] = None, + tool_choice: Optional[Dict] = None, + tools: Optional[List[Dict]] = None, + top_k: Optional[int] = None, + top_p: Optional[float] = None, + **kwargs +) -> Union[AnthropicMessagesResponse, AsyncIterator]: + """ + Async wrapper for Anthropic's messages API + + Args: + max_tokens (int): Maximum tokens to generate (required) + messages (List[Dict]): List of message objects with role and content (required) + model (str): Model name to use (required) + metadata (Dict, optional): Request metadata + stop_sequences (List[str], optional): Custom stop sequences + stream (bool, optional): Whether to stream the response + system (str, optional): System prompt + temperature (float, optional): Sampling temperature (0.0 to 1.0) + thinking (Dict, optional): Extended thinking configuration + tool_choice (Dict, optional): Tool choice configuration + tools (List[Dict], optional): List of tool definitions + top_k (int, optional): Top K sampling parameter + top_p (float, optional): Nucleus sampling parameter + **kwargs: Additional arguments + + Returns: + Dict: Response from the API + """ + return await _async_anthropic_messages( + max_tokens=max_tokens, + messages=messages, + model=model, + metadata=metadata, + stop_sequences=stop_sequences, + stream=stream, + system=system, + temperature=temperature, + thinking=thinking, + tool_choice=tool_choice, + tools=tools, + top_k=top_k, + top_p=top_p, + **kwargs, + ) + + +async def create( + max_tokens: int, + messages: List[Dict], + model: str, + metadata: Optional[Dict] = None, + stop_sequences: Optional[List[str]] = None, + stream: Optional[bool] = False, + system: Optional[str] = None, + temperature: Optional[float] = 1.0, + thinking: Optional[Dict] = None, + tool_choice: 
Optional[Dict] = None, + tools: Optional[List[Dict]] = None, + top_k: Optional[int] = None, + top_p: Optional[float] = None, + **kwargs +) -> Union[AnthropicMessagesResponse, Iterator]: + """ + Async wrapper for Anthropic's messages API + + Args: + max_tokens (int): Maximum tokens to generate (required) + messages (List[Dict]): List of message objects with role and content (required) + model (str): Model name to use (required) + metadata (Dict, optional): Request metadata + stop_sequences (List[str], optional): Custom stop sequences + stream (bool, optional): Whether to stream the response + system (str, optional): System prompt + temperature (float, optional): Sampling temperature (0.0 to 1.0) + thinking (Dict, optional): Extended thinking configuration + tool_choice (Dict, optional): Tool choice configuration + tools (List[Dict], optional): List of tool definitions + top_k (int, optional): Top K sampling parameter + top_p (float, optional): Nucleus sampling parameter + **kwargs: Additional arguments + + Returns: + Dict: Response from the API + """ + raise NotImplementedError("This function is not implemented") diff --git a/litellm/anthropic_interface/readme.md b/litellm/anthropic_interface/readme.md new file mode 100644 index 0000000000..01c5f1b7c3 --- /dev/null +++ b/litellm/anthropic_interface/readme.md @@ -0,0 +1,116 @@ +## Use LLM API endpoints in Anthropic Interface + +Note: This is called `anthropic_interface` because `anthropic` is a known python package and was failing mypy type checking. + + +## Usage +--- + +### LiteLLM Python SDK + +#### Non-streaming example +```python showLineNumbers title="Example using LiteLLM Python SDK" +import litellm +response = await litellm.anthropic.messages.acreate( + messages=[{"role": "user", "content": "Hello, can you tell me a short joke?"}], + api_key=api_key, + model="anthropic/claude-3-haiku-20240307", + max_tokens=100, +) +``` + +Example response: +```json +{ + "content": [ + { + "text": "Hi! this is a very short joke", + "type": "text" + } + ], + "id": "msg_013Zva2CMHLNnXjNJJKqJ2EF", + "model": "claude-3-7-sonnet-20250219", + "role": "assistant", + "stop_reason": "end_turn", + "stop_sequence": null, + "type": "message", + "usage": { + "input_tokens": 2095, + "output_tokens": 503, + "cache_creation_input_tokens": 2095, + "cache_read_input_tokens": 0 + } +} +``` + +#### Streaming example +```python showLineNumbers title="Example using LiteLLM Python SDK" +import litellm +response = await litellm.anthropic.messages.acreate( + messages=[{"role": "user", "content": "Hello, can you tell me a short joke?"}], + api_key=api_key, + model="anthropic/claude-3-haiku-20240307", + max_tokens=100, + stream=True, +) +async for chunk in response: + print(chunk) +``` + +### LiteLLM Proxy Server + + +1. Setup config.yaml + +```yaml +model_list: + - model_name: anthropic-claude + litellm_params: + model: claude-3-7-sonnet-latest +``` + +2. Start proxy + +```bash +litellm --config /path/to/config.yaml +``` + +3. Test it! 
+ + + + +```python showLineNumbers title="Example using LiteLLM Proxy Server" +import anthropic + +# point anthropic sdk to litellm proxy +client = anthropic.Anthropic( + base_url="http://0.0.0.0:4000", + api_key="sk-1234", +) + +response = client.messages.create( + messages=[{"role": "user", "content": "Hello, can you tell me a short joke?"}], + model="anthropic/claude-3-haiku-20240307", + max_tokens=100, +) +``` + + + +```bash showLineNumbers title="Example using LiteLLM Proxy Server" +curl -L -X POST 'http://0.0.0.0:4000/v1/messages' \ +-H 'content-type: application/json' \ +-H 'x-api-key: $LITELLM_API_KEY' \ +-H 'anthropic-version: 2023-06-01' \ +-d '{ + "model": "anthropic-claude", + "messages": [ + { + "role": "user", + "content": "Hello, can you tell me a short joke?" + } + ], + "max_tokens": 100 +}' +``` \ No newline at end of file diff --git a/litellm/llms/anthropic/experimental_pass_through/messages/handler.py b/litellm/llms/anthropic/experimental_pass_through/messages/handler.py index 099a2acdae..a37d816770 100644 --- a/litellm/llms/anthropic/experimental_pass_through/messages/handler.py +++ b/litellm/llms/anthropic/experimental_pass_through/messages/handler.py @@ -6,7 +6,7 @@ """ import json -from typing import Any, AsyncIterator, Dict, Optional, Union, cast +from typing import AsyncIterator, Dict, List, Optional, Union, cast import httpx @@ -19,6 +19,9 @@ from litellm.llms.custom_httpx.http_handler import ( AsyncHTTPHandler, get_async_httpx_client, ) +from litellm.types.llms.anthropic_messages.anthropic_response import ( + AnthropicMessagesResponse, +) from litellm.types.router import GenericLiteLLMParams from litellm.types.utils import ProviderSpecificHeader from litellm.utils import ProviderConfigManager, client @@ -60,14 +63,25 @@ class AnthropicMessagesHandler: @client async def anthropic_messages( - api_key: str, + max_tokens: int, + messages: List[Dict], model: str, - stream: bool = False, + metadata: Optional[Dict] = None, + stop_sequences: Optional[List[str]] = None, + stream: Optional[bool] = False, + system: Optional[str] = None, + temperature: Optional[float] = None, + thinking: Optional[Dict] = None, + tool_choice: Optional[Dict] = None, + tools: Optional[List[Dict]] = None, + top_k: Optional[int] = None, + top_p: Optional[float] = None, + api_key: Optional[str] = None, api_base: Optional[str] = None, client: Optional[AsyncHTTPHandler] = None, custom_llm_provider: Optional[str] = None, **kwargs, -) -> Union[Dict[str, Any], AsyncIterator]: +) -> Union[AnthropicMessagesResponse, AsyncIterator]: """ Makes Anthropic `/v1/messages` API calls In the Anthropic API Spec """ @@ -129,10 +143,8 @@ async def anthropic_messages( }, custom_llm_provider=_custom_llm_provider, ) - litellm_logging_obj.model_call_details.update(kwargs) - # Prepare request body - request_body = kwargs.copy() + request_body = locals().copy() request_body = { k: v for k, v in request_body.items() @@ -140,10 +152,12 @@ async def anthropic_messages( in anthropic_messages_provider_config.get_supported_anthropic_messages_params( model=model ) + and v is not None } request_body["stream"] = stream request_body["model"] = model litellm_logging_obj.stream = stream + litellm_logging_obj.model_call_details.update(request_body) # Make the request request_url = anthropic_messages_provider_config.get_complete_url( @@ -164,7 +178,7 @@ async def anthropic_messages( url=request_url, headers=headers, data=json.dumps(request_body), - stream=stream, + stream=stream or False, ) response.raise_for_status() diff --git 
a/litellm/types/llms/anthropic_messages/anthropic_response.py b/litellm/types/llms/anthropic_messages/anthropic_response.py new file mode 100644 index 0000000000..270807fc8f --- /dev/null +++ b/litellm/types/llms/anthropic_messages/anthropic_response.py @@ -0,0 +1,83 @@ +from typing import Any, Dict, List, Literal, Optional, TypedDict, Union + +from typing_extensions import TypeAlias + + +class AnthropicResponseTextBlock(TypedDict, total=False): + """ + Anthropic Response Text Block: https://docs.anthropic.com/en/api/messages + """ + + citations: Optional[List[Dict[str, Any]]] + text: str + type: Literal["text"] + + +class AnthropicResponseToolUseBlock(TypedDict, total=False): + """ + Anthropic Response Tool Use Block: https://docs.anthropic.com/en/api/messages + """ + + id: Optional[str] + input: Optional[str] + name: Optional[str] + type: Literal["tool_use"] + + +class AnthropicResponseThinkingBlock(TypedDict, total=False): + """ + Anthropic Response Thinking Block: https://docs.anthropic.com/en/api/messages + """ + + signature: Optional[str] + thinking: Optional[str] + type: Literal["thinking"] + + +class AnthropicResponseRedactedThinkingBlock(TypedDict, total=False): + """ + Anthropic Response Redacted Thinking Block: https://docs.anthropic.com/en/api/messages + """ + + data: Optional[str] + type: Literal["redacted_thinking"] + + +AnthropicResponseContentBlock: TypeAlias = Union[ + AnthropicResponseTextBlock, + AnthropicResponseToolUseBlock, + AnthropicResponseThinkingBlock, + AnthropicResponseRedactedThinkingBlock, +] + + +class AnthropicUsage(TypedDict, total=False): + """ + Input and output tokens used in the request + """ + + input_tokens: int + output_tokens: int + + """ + Cache Tokens Used + """ + cache_creation_input_tokens: int + cache_read_input_tokens: int + + +class AnthropicMessagesResponse(TypedDict, total=False): + """ + Anthropic Messages API Response: https://docs.anthropic.com/en/api/messages + """ + + content: Optional[List[AnthropicResponseContentBlock]] + id: str + model: Optional[str] # This represents the Model type from Anthropic + role: Optional[Literal["assistant"]] + stop_reason: Optional[ + Literal["end_turn", "max_tokens", "stop_sequence", "tool_use"] + ] + stop_sequence: Optional[str] + type: Optional[Literal["message"]] + usage: Optional[AnthropicUsage] diff --git a/mypy.ini b/mypy.ini index 19ead3ba7d..3ce8c5fcc0 100644 --- a/mypy.ini +++ b/mypy.ini @@ -2,6 +2,7 @@ warn_return_any = False ignore_missing_imports = True mypy_path = litellm/stubs +namespace_packages = True [mypy-google.*] ignore_missing_imports = True diff --git a/tests/pass_through_unit_tests/test_anthropic_messages_passthrough.py b/tests/pass_through_unit_tests/test_anthropic_messages_passthrough.py index b5b3302acc..ec268b1a24 100644 --- a/tests/pass_through_unit_tests/test_anthropic_messages_passthrough.py +++ b/tests/pass_through_unit_tests/test_anthropic_messages_passthrough.py @@ -8,7 +8,7 @@ import unittest.mock from unittest.mock import AsyncMock, MagicMock sys.path.insert( - 0, os.path.abspath("../..") + 0, os.path.abspath("../../..") ) # Adds the parent directory to the system path import litellm import pytest @@ -16,6 +16,7 @@ from dotenv import load_dotenv from litellm.llms.anthropic.experimental_pass_through.messages.handler import ( anthropic_messages, ) + from typing import Optional from litellm.types.utils import StandardLoggingPayload from litellm.integrations.custom_logger import CustomLogger @@ -73,6 +74,7 @@ async def test_anthropic_messages_non_streaming(): """ Test the 
anthropic_messages with non-streaming request """ + litellm._turn_on_debug() # Get API key from environment api_key = os.getenv("ANTHROPIC_API_KEY") if not api_key: @@ -82,7 +84,7 @@ async def test_anthropic_messages_non_streaming(): messages = [{"role": "user", "content": "Hello, can you tell me a short joke?"}] # Call the handler - response = await anthropic_messages( + response = await litellm.anthropic.messages.acreate( messages=messages, api_key=api_key, model="claude-3-haiku-20240307", @@ -114,7 +116,7 @@ async def test_anthropic_messages_streaming(): # Call the handler async_httpx_client = AsyncHTTPHandler() - response = await anthropic_messages( + response = await litellm.anthropic.messages.acreate( messages=messages, api_key=api_key, model="claude-3-haiku-20240307", @@ -134,7 +136,7 @@ async def test_anthropic_messages_streaming_with_bad_request(): Test the anthropic_messages with streaming request """ try: - response = await anthropic_messages( + response = await litellm.anthropic.messages.acreate( messages=["hi"], api_key=os.getenv("ANTHROPIC_API_KEY"), model="claude-3-haiku-20240307", @@ -458,7 +460,7 @@ async def test_anthropic_messages_with_extra_headers(): mock_client.post = AsyncMock(return_value=mock_response) # Call the handler with extra_headers and our mocked client - response = await anthropic_messages( + response = await litellm.anthropic.messages.acreate( messages=messages, api_key=api_key, model="claude-3-haiku-20240307",