feat(proxy_server.py): working /v1/messages endpoint

Works with Claude Engineer
Krrish Dholakia 2024-07-10 18:15:38 -07:00
parent 5d6e172d5c
commit 2f8dbbeb97
9 changed files with 272 additions and 152 deletions

View file

@@ -8,11 +8,12 @@ from typing import Literal, Optional

 import dotenv
 import httpx
+from pydantic import BaseModel

 import litellm
 from litellm import ChatCompletionRequest, verbose_logger
 from litellm.integrations.custom_logger import CustomLogger
-from litellm.types.llms.anthropic import AnthropicMessagesRequest
+from litellm.types.llms.anthropic import AnthropicMessagesRequest, AnthropicResponse


 class AnthropicAdapter(CustomLogger):
@@ -31,12 +32,18 @@ class AnthropicAdapter(CustomLogger):
         translated_body = litellm.AnthropicConfig().translate_anthropic_to_openai(
             anthropic_message_request=request_body
         )

         return translated_body

-    def translate_completion_output_params(self, response: litellm.ModelResponse):
-        return super().translate_completion_output_params(response)
+    def translate_completion_output_params(
+        self, response: litellm.ModelResponse
+    ) -> Optional[AnthropicResponse]:
+        return litellm.AnthropicConfig().translate_openai_response_to_anthropic(
+            response=response
+        )

-    def translate_completion_output_params_streaming(self):
+    def translate_completion_output_params_streaming(self) -> Optional[BaseModel]:
         return super().translate_completion_output_params_streaming()
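As a rough usage sketch (not part of the diff itself; the class and hook names come from this commit, everything else — the model name, the mock_response flag — is illustrative), the two hooks compose like this: the adapter translates an Anthropic-format request into OpenAI chat-completion kwargs, litellm runs the completion, and the result is translated back into an AnthropicResponse.

import litellm
from litellm.adapters.anthropic_adapter import AnthropicAdapter

adapter = AnthropicAdapter()
# Anthropic /v1/messages-style request -> OpenAI chat-completion kwargs
openai_request = adapter.translate_completion_input_params(
    kwargs={
        "model": "gpt-3.5-turbo",
        "max_tokens": 256,
        "messages": [{"role": "user", "content": "Hi"}],
    }
)
assert openai_request is not None
# mock_response avoids a real provider call
openai_response = litellm.completion(**openai_request, mock_response="Hello!")
# OpenAI ModelResponse -> AnthropicResponse (the new code path in this commit)
anthropic_response = adapter.translate_completion_output_params(response=openai_response)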

View file

@@ -5,6 +5,7 @@ import traceback
 from typing import Literal, Optional, Union

 import dotenv
+from pydantic import BaseModel

 from litellm.caching import DualCache
 from litellm.proxy._types import UserAPIKeyAuth
@@ -67,13 +68,15 @@ class CustomLogger:  # https://docs.litellm.ai/docs/observability/custom_callbac
         """
         pass

-    def translate_completion_output_params(self, response: ModelResponse):
+    def translate_completion_output_params(
+        self, response: ModelResponse
+    ) -> Optional[BaseModel]:
         """
         Translates the output params, from the OpenAI format to the custom format.
         """
         pass

-    def translate_completion_output_params_streaming(self):
+    def translate_completion_output_params_streaming(self) -> Optional[BaseModel]:
         """
         Translates the streaming chunk, from the OpenAI format to the custom format.
         """

View file

@@ -21,10 +21,15 @@ from litellm.llms.custom_httpx.http_handler import (
 )
 from litellm.types.llms.anthropic import (
     AnthopicMessagesAssistantMessageParam,
+    AnthropicFinishReason,
     AnthropicMessagesRequest,
     AnthropicMessagesTool,
     AnthropicMessagesToolChoice,
     AnthropicMessagesUserMessageParam,
+    AnthropicResponse,
+    AnthropicResponseContentBlockText,
+    AnthropicResponseContentBlockToolUse,
+    AnthropicResponseUsageBlock,
     ContentBlockDelta,
     ContentBlockStart,
     MessageBlockDelta,
@@ -51,7 +56,7 @@ from litellm.types.llms.openai import (
     ChatCompletionUsageBlock,
     ChatCompletionUserMessage,
 )
-from litellm.types.utils import GenericStreamingChunk
+from litellm.types.utils import Choices, GenericStreamingChunk
 from litellm.utils import CustomStreamWrapper, ModelResponse, Usage

 from .base import BaseLLM
@@ -187,6 +192,8 @@ class AnthropicConfig:
             optional_params["top_p"] = value
         return optional_params

+    ### FOR [BETA] `/v1/messages` endpoint support
+
     def translatable_anthropic_params(self) -> List:
         """
         Which anthropic params, we need to translate to the openai format.
@@ -300,10 +307,14 @@ class AnthropicConfig:
                     )
                 )

-            assistant_message = ChatCompletionAssistantMessage(
-                role="assistant", content=assistant_message_str, tool_calls=tool_calls
-            )
-            new_messages.append(assistant_message)
+            if assistant_message_str is not None or len(tool_calls) > 0:
+                assistant_message = ChatCompletionAssistantMessage(
+                    role="assistant",
+                    content=assistant_message_str,
+                )
+                if len(tool_calls) > 0:
+                    assistant_message["tool_calls"] = tool_calls
+                new_messages.append(assistant_message)

         return new_messages
@@ -391,6 +402,77 @@ class AnthropicConfig:

         return new_kwargs

+    def _translate_openai_content_to_anthropic(
+        self, choices: List[Choices]
+    ) -> List[
+        Union[AnthropicResponseContentBlockText, AnthropicResponseContentBlockToolUse]
+    ]:
+        new_content: List[
+            Union[
+                AnthropicResponseContentBlockText, AnthropicResponseContentBlockToolUse
+            ]
+        ] = []
+        for choice in choices:
+            if (
+                choice.message.tool_calls is not None
+                and len(choice.message.tool_calls) > 0
+            ):
+                for tool_call in choice.message.tool_calls:
+                    new_content.append(
+                        AnthropicResponseContentBlockToolUse(
+                            type="tool_use",
+                            id=tool_call.id,
+                            name=tool_call.function.name,
+                            input=tool_call.function.arguments,
+                        )
+                    )
+            elif choice.message.content is not None:
+                new_content.append(
+                    AnthropicResponseContentBlockText(
+                        type="text", text=choice.message.content
+                    )
+                )
+
+        return new_content
+
+    def _translate_openai_finish_reason_to_anthropic(
+        self, openai_finish_reason: str
+    ) -> AnthropicFinishReason:
+        if openai_finish_reason == "stop":
+            return "end_turn"
+        elif openai_finish_reason == "length":
+            return "max_tokens"
+        elif openai_finish_reason == "tool_calls":
+            return "tool_use"
+
+        return "end_turn"
+
+    def translate_openai_response_to_anthropic(
+        self, response: litellm.ModelResponse
+    ) -> AnthropicResponse:
+        ## translate content block
+        anthropic_content = self._translate_openai_content_to_anthropic(choices=response.choices)  # type: ignore
+        ## extract finish reason
+        anthropic_finish_reason = self._translate_openai_finish_reason_to_anthropic(
+            openai_finish_reason=response.choices[0].finish_reason  # type: ignore
+        )
+        # extract usage
+        usage: litellm.Usage = getattr(response, "usage")
+        anthropic_usage = AnthropicResponseUsageBlock(
+            input_tokens=usage.prompt_tokens, output_tokens=usage.completion_tokens
+        )
+        translated_obj = AnthropicResponse(
+            id=response.id,
+            type="message",
+            role="assistant",
+            model=response.model or "unknown-model",
+            stop_sequence=None,
+            usage=anthropic_usage,
+            content=anthropic_content,
+            stop_reason=anthropic_finish_reason,
+        )
+
+        return translated_obj
+

 # makes headers for API call
 def validate_environment(api_key, user_headers):
@@ -454,121 +536,6 @@ class AnthropicChatCompletion(BaseLLM):
     def __init__(self) -> None:
         super().__init__()

-    # def process_streaming_response(
-    #     self,
-    #     model: str,
-    #     response: Union[requests.Response, httpx.Response],
-    #     model_response: ModelResponse,
-    #     stream: bool,
-    #     logging_obj: litellm.litellm_core_utils.litellm_logging.Logging,
-    #     optional_params: dict,
-    #     api_key: str,
-    #     data: Union[dict, str],
-    #     messages: List,
-    #     print_verbose,
-    #     encoding,
-    # ) -> CustomStreamWrapper:
-    #     """
-    #     Return stream object for tool-calling + streaming
-    #     """
-    #     ## LOGGING
-    #     logging_obj.post_call(
-    #         input=messages,
-    #         api_key=api_key,
-    #         original_response=response.text,
-    #         additional_args={"complete_input_dict": data},
-    #     )
-    #     print_verbose(f"raw model_response: {response.text}")
-    #     ## RESPONSE OBJECT
-    #     try:
-    #         completion_response = response.json()
-    #     except:
-    #         raise AnthropicError(
-    #             message=response.text, status_code=response.status_code
-    #         )
-    #     text_content = ""
-    #     tool_calls = []
-    #     for content in completion_response["content"]:
-    #         if content["type"] == "text":
-    #             text_content += content["text"]
-    #         ## TOOL CALLING
-    #         elif content["type"] == "tool_use":
-    #             tool_calls.append(
-    #                 {
-    #                     "id": content["id"],
-    #                     "type": "function",
-    #                     "function": {
-    #                         "name": content["name"],
-    #                         "arguments": json.dumps(content["input"]),
-    #                     },
-    #                 }
-    #             )
-    #     if "error" in completion_response:
-    #         raise AnthropicError(
-    #             message=str(completion_response["error"]),
-    #             status_code=response.status_code,
-    #         )
-    #     _message = litellm.Message(
-    #         tool_calls=tool_calls,
-    #         content=text_content or None,
-    #     )
-    #     model_response.choices[0].message = _message  # type: ignore
-    #     model_response._hidden_params["original_response"] = completion_response[
-    #         "content"
-    #     ]  # allow user to access raw anthropic tool calling response
-    #     model_response.choices[0].finish_reason = map_finish_reason(
-    #         completion_response["stop_reason"]
-    #     )
-    #     print_verbose("INSIDE ANTHROPIC STREAMING TOOL CALLING CONDITION BLOCK")
-    #     # return an iterator
-    #     streaming_model_response = ModelResponse(stream=True)
-    #     streaming_model_response.choices[0].finish_reason = model_response.choices[  # type: ignore
-    #         0
-    #     ].finish_reason
-    #     # streaming_model_response.choices = [litellm.utils.StreamingChoices()]
-    #     streaming_choice = litellm.utils.StreamingChoices()
-    #     streaming_choice.index = model_response.choices[0].index
-    #     _tool_calls = []
-    #     print_verbose(
-    #         f"type of model_response.choices[0]: {type(model_response.choices[0])}"
-    #     )
-    #     print_verbose(f"type of streaming_choice: {type(streaming_choice)}")
-    #     if isinstance(model_response.choices[0], litellm.Choices):
-    #         if getattr(
-    #             model_response.choices[0].message, "tool_calls", None
-    #         ) is not None and isinstance(
-    #             model_response.choices[0].message.tool_calls, list
-    #         ):
-    #             for tool_call in model_response.choices[0].message.tool_calls:
-    #                 _tool_call = {**tool_call.dict(), "index": 0}
-    #                 _tool_calls.append(_tool_call)
-    #         delta_obj = litellm.utils.Delta(
-    #             content=getattr(model_response.choices[0].message, "content", None),
-    #             role=model_response.choices[0].message.role,
-    #             tool_calls=_tool_calls,
-    #         )
-    #         streaming_choice.delta = delta_obj
-    #         streaming_model_response.choices = [streaming_choice]
-    #         completion_stream = ModelResponseIterator(
-    #             model_response=streaming_model_response
-    #         )
-    #         print_verbose(
-    #             "Returns anthropic CustomStreamWrapper with 'cached_response' streaming object"
-    #         )
-    #         return CustomStreamWrapper(
-    #             completion_stream=completion_stream,
-    #             model=model,
-    #             custom_llm_provider="cached_response",
-    #             logging_obj=logging_obj,
-    #         )
-    #     else:
-    #         raise AnthropicError(
-    #             status_code=422,
-    #             message="Unprocessable response object - {}".format(response.text),
-    #         )

     def process_response(
         self,
         model: str,
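A short sketch of what the new translation methods produce (not from the diff; the mock_response flag keeps the call offline). The finish-reason mapping added above is stop→end_turn, length→max_tokens, tool_calls→tool_use, with end_turn as the fallback:

import litellm

openai_response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hi"}],
    mock_response="Hello there!",  # avoids a real provider call
)
anthropic_response = litellm.AnthropicConfig().translate_openai_response_to_anthropic(
    response=openai_response
)
print(anthropic_response.stop_reason)  # "end_turn" (a mocked call finishes with "stop")
print(anthropic_response.usage)        # input_tokens/output_tokens from the usage block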

View file

@@ -38,6 +38,7 @@ import dotenv
 import httpx
 import openai
 import tiktoken
+from pydantic import BaseModel
 from typing_extensions import overload

 import litellm
@@ -3947,7 +3948,7 @@ def text_completion(
 ###### Adapter Completion ################


-def adapter_completion(*, adapter_id: str, **kwargs) -> Any:
+def adapter_completion(*, adapter_id: str, **kwargs) -> Optional[BaseModel]:
     translation_obj: Optional[CustomLogger] = None
     for item in litellm.adapters:
         if item["id"] == adapter_id:

View file

@@ -71,6 +71,11 @@ azure_api_key_header = APIKeyHeader(
     auto_error=False,
     description="Some older versions of the openai Python package will send an API-Key header with just the API key ",
 )
+anthropic_api_key_header = APIKeyHeader(
+    name="x-api-key",
+    auto_error=False,
+    description="If anthropic client used.",
+)


 def _get_bearer_token(
@@ -87,6 +92,9 @@ async def user_api_key_auth(
     request: Request,
     api_key: str = fastapi.Security(api_key_header),
     azure_api_key_header: str = fastapi.Security(azure_api_key_header),
+    anthropic_api_key_header: Optional[str] = fastapi.Security(
+        anthropic_api_key_header
+    ),
 ) -> UserAPIKeyAuth:

     from litellm.proxy.proxy_server import (
@@ -114,6 +122,9 @@ async def user_api_key_auth(
         elif isinstance(azure_api_key_header, str):
             api_key = azure_api_key_header

+        elif isinstance(anthropic_api_key_header, str):
+            api_key = anthropic_api_key_header
+
         parent_otel_span: Optional[Span] = None
         if open_telemetry_logger is not None:
             parent_otel_span = open_telemetry_logger.tracer.start_span(
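The point of the new header dependency: Anthropic clients send credentials as x-api-key rather than Authorization: Bearer. An illustration against a locally running proxy (URL and key are placeholder assumptions, not from the diff):

import httpx

response = httpx.post(
    "http://localhost:4000/v1/messages",  # assumed local proxy address
    headers={"x-api-key": "sk-1234"},     # proxy virtual key, sent the Anthropic way
    json={
        "model": "gpt-3.5-turbo",
        "max_tokens": 256,
        "messages": [{"role": "user", "content": "Hey, how's it going?"}],
    },
)
print(response.json())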

View file

@@ -210,6 +210,12 @@ from litellm.router import (
 from litellm.router import ModelInfo as RouterModelInfo
 from litellm.router import updateDeployment
 from litellm.scheduler import DefaultPriorities, FlowItem, Scheduler
+from litellm.types.llms.anthropic import (
+    AnthropicMessagesRequest,
+    AnthropicResponse,
+    AnthropicResponseContentBlockText,
+    AnthropicResponseUsageBlock,
+)
 from litellm.types.llms.openai import HttpxBinaryResponseContent
 from litellm.types.router import RouterGeneralSettings
@@ -5030,6 +5036,34 @@ async def moderations(
     )


+#### ANTHROPIC ENDPOINTS ####
+@router.post(
+    "/v1/messages",
+    tags=["[beta] Anthropic `/v1/messages`"],
+    dependencies=[Depends(user_api_key_auth)],
+    response_model=AnthropicResponse,
+)
+async def anthropic_response(data: AnthropicMessagesRequest):
+    from litellm import adapter_completion
+    from litellm.adapters.anthropic_adapter import anthropic_adapter
+
+    litellm.adapters = [{"id": "anthropic", "adapter": anthropic_adapter}]
+
+    response: Optional[BaseModel] = adapter_completion(adapter_id="anthropic", **data)
+
+    if response is None:
+        raise Exception("Response is None.")
+    elif not isinstance(response, AnthropicResponse):
+        raise Exception(
+            "Invalid model response={}. Not in 'AnthropicResponse' format".format(
+                response
+            )
+        )
+
+    return response
+
+
 #### DEV UTILS ####

 # @router.get(
@@ -7546,7 +7580,7 @@ async def login(request: Request):
         litellm_dashboard_ui += "/ui/"
         import jwt

-        jwt_token = jwt.encode(
+        jwt_token = jwt.encode(  # type: ignore
             {
                 "user_id": user_id,
                 "key": key,
@@ -7610,7 +7644,7 @@ async def login(request: Request):
         litellm_dashboard_ui += "/ui/"
         import jwt

-        jwt_token = jwt.encode(
+        jwt_token = jwt.encode(  # type: ignore
             {
                 "user_id": user_id,
                 "key": key,
@@ -7745,7 +7779,7 @@ async def onboarding(invite_link: str):
         litellm_dashboard_ui += "/ui/onboarding"
         import jwt

-        jwt_token = jwt.encode(
+        jwt_token = jwt.encode(  # type: ignore
             {
                 "user_id": user_obj.user_id,
                 "key": key,
@@ -8162,7 +8196,7 @@ async def auth_callback(request: Request):
         import jwt

-        jwt_token = jwt.encode(
+        jwt_token = jwt.encode(  # type: ignore
             {
                 "user_id": user_id,
                 "key": key,

View file

@@ -20,16 +20,51 @@ from unittest.mock import MagicMock, patch
 import pytest

 import litellm
-from litellm import adapter_completion
+from litellm import AnthropicConfig, adapter_completion
 from litellm.adapters.anthropic_adapter import anthropic_adapter
+from litellm.types.llms.anthropic import AnthropicResponse


-def test_anthropic_completion():
+def test_anthropic_completion_messages_translation():
+    messages = [{"role": "user", "content": "Hey, how's it going?"}]
+    translated_messages = AnthropicConfig().translate_anthropic_messages_to_openai(messages=messages)  # type: ignore
+
+    assert translated_messages == [{"role": "user", "content": "Hey, how's it going?"}]
+
+
+def test_anthropic_completion_input_translation():
+    data = {
+        "model": "gpt-3.5-turbo",
+        "messages": [{"role": "user", "content": "Hey, how's it going?"}],
+    }
+    translated_input = anthropic_adapter.translate_completion_input_params(kwargs=data)
+
+    assert translated_input is not None
+    assert translated_input["model"] == "gpt-3.5-turbo"
+    assert translated_input["messages"] == [
+        {"role": "user", "content": "Hey, how's it going?"}
+    ]
+
+
+def test_anthropic_completion_e2e():
+    litellm.set_verbose = True
+
     litellm.adapters = [{"id": "anthropic", "adapter": anthropic_adapter}]

     messages = [{"role": "user", "content": "Hey, how's it going?"}]
     response = adapter_completion(
-        model="gpt-3.5-turbo", messages=messages, adapter_id="anthropic"
+        model="gpt-3.5-turbo",
+        messages=messages,
+        adapter_id="anthropic",
+        mock_response="This is a fake call",
     )
-    print(response)
+    print("Response: {}".format(response))
+
+    assert response is not None
+
+    assert isinstance(response, AnthropicResponse)
+
+    assert False

View file

@@ -223,3 +223,51 @@ class MessageStartBlock(TypedDict):
     type: Literal["message_start"]
     message: MessageChunk
+
+
+class AnthropicResponseContentBlockText(BaseModel):
+    type: Literal["text"]
+    text: str
+
+
+class AnthropicResponseContentBlockToolUse(BaseModel):
+    type: Literal["tool_use"]
+    id: str
+    name: str
+    input: str
+
+
+class AnthropicResponseUsageBlock(BaseModel):
+    input_tokens: int
+    output_tokens: int
+
+
+AnthropicFinishReason = Literal["end_turn", "max_tokens", "stop_sequence", "tool_use"]
+
+
+class AnthropicResponse(BaseModel):
+    id: str
+    """Unique object identifier."""
+
+    type: Literal["message"]
+    """For Messages, this is always "message"."""
+
+    role: Literal["assistant"]
+    """Conversational role of the generated message. This will always be "assistant"."""
+
+    content: List[
+        Union[AnthropicResponseContentBlockText, AnthropicResponseContentBlockToolUse]
+    ]
+    """Content generated by the model."""
+
+    model: str
+    """The model that handled the request."""
+
+    stop_reason: Optional[AnthropicFinishReason]
+    """The reason that we stopped."""
+
+    stop_sequence: Optional[str]
+    """Which custom stop sequence was generated, if any."""
+
+    usage: AnthropicResponseUsageBlock
+    """Billing and rate-limit usage."""

View file

@@ -166,7 +166,7 @@ class FunctionCall(OpenAIObject):

 class Function(OpenAIObject):
     arguments: str
-    name: Optional[str] = None
+    name: str

     def __init__(
         self,
@@ -280,29 +280,43 @@ class ChatCompletionMessageToolCall(OpenAIObject):
             setattr(self, key, value)


+"""
+Reference:
+ChatCompletionMessage(content='This is a test', role='assistant', function_call=None, tool_calls=None))
+"""
+
+
 class Message(OpenAIObject):
+    content: Optional[str]
+    role: Literal["assistant"]
+    tool_calls: Optional[List[ChatCompletionMessageToolCall]]
+    function_call: Optional[FunctionCall]
+
     def __init__(
         self,
-        content: Optional[str] = "default",
-        role="assistant",
-        logprobs=None,
+        content: Optional[str] = None,
+        role: Literal["assistant"] = "assistant",
         function_call=None,
         tool_calls=None,
         **params,
     ):
-        super(Message, self).__init__(**params)
-        self.content = content
-        self.role = role
-        if function_call is not None:
-            self.function_call = FunctionCall(**function_call)
-
-        if tool_calls is not None:
-            self.tool_calls = []
-            for tool_call in tool_calls:
-                self.tool_calls.append(ChatCompletionMessageToolCall(**tool_call))
-
-        if logprobs is not None:
-            self._logprobs = ChoiceLogprobs(**logprobs)
+        init_values = {
+            "content": content,
+            "role": role,
+            "function_call": (
+                FunctionCall(**function_call) if function_call is not None else None
+            ),
+            "tool_calls": (
+                [ChatCompletionMessageToolCall(**tool_call) for tool_call in tool_calls]
+                if tool_calls is not None
+                else None
+            ),
+        }
+        super(Message, self).__init__(
+            **init_values,
+            **params,
+        )

     def get(self, key, default=None):
         # Custom .get() method to access attributes with a default value if the attribute doesn't exist
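The practical effect of the Message rewrite, sketched under the assumption that Message lives in litellm.types.utils (the module this file's diff belongs to, from which the anthropic.py diff imports Choices): content now defaults to None rather than the placeholder string "default", and tool_calls/function_call are declared pydantic fields, so they exist (as None) even when not supplied.

from litellm.types.utils import Message  # assumed import path

m = Message()
print(m.content)        # None (was the string "default" before this change)
print(m.tool_calls)     # None, instead of the attribute being absent entirely
print(m.function_call)  # None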