fix(anthropic_text.py): add support for async text completion calls

2024-04-15 08:15:00 -07:00 · 2024-04-15 08:15:00 -07:00 · 26286a54b8
commit 26286a54b8
parent bdf7f6d13c
6 changed files with 324 additions and 98 deletions
--- a/litellm/llms/anthropic_text.py
+++ b/litellm/llms/anthropic_text.py
@ -8,6 +8,8 @@ from litellm.utils import ModelResponse, Usage, CustomStreamWrapper
 import litellm
 from .prompt_templates.factory import prompt_factory, custom_prompt
 import httpx
 from .base import BaseLLM
 from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
 class AnthropicConstants(Enum):
@ -94,98 +96,13 @@ def validate_environment(api_key, user_headers):
    return headers
-def completion(
+class AnthropicTextCompletion(BaseLLM):
-    model: str,
+    def __init__(self) -> None:
-    messages: list,
+        super().__init__()
    api_base: str,
    custom_prompt_dict: dict,
    model_response: ModelResponse,
    print_verbose: Callable,
    encoding,
    api_key,
    logging_obj,
    optional_params=None,
    litellm_params=None,
    logger_fn=None,
    headers={},
 ):
    headers = validate_environment(api_key, headers)
    if model in custom_prompt_dict:
        # check if the model has a registered custom prompt
        model_prompt_details = custom_prompt_dict[model]
        prompt = custom_prompt(
            role_dict=model_prompt_details["roles"],
            initial_prompt_value=model_prompt_details["initial_prompt_value"],
            final_prompt_value=model_prompt_details["final_prompt_value"],
            messages=messages,
        )
    else:
        prompt = prompt_factory(
            model=model, messages=messages, custom_llm_provider="anthropic"
        )
-    ## Load Config
+    def process_response(
-    config = litellm.AnthropicTextConfig.get_config()
+        self, model_response: ModelResponse, response, encoding, prompt: str, model: str
-    for k, v in config.items():
+    ):
        if (
            k not in optional_params
        ):  # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in
            optional_params[k] = v
    data = {
        "model": model,
        "prompt": prompt,
        **optional_params,
    }
    ## LOGGING
    logging_obj.pre_call(
        input=prompt,
        api_key=api_key,
        additional_args={
            "complete_input_dict": data,
            "api_base": api_base,
            "headers": headers,
        },
    )
    ## COMPLETION CALL
    if "stream" in optional_params and optional_params["stream"] == True:
        response = requests.post(
            api_base,
            headers=headers,
            data=json.dumps(data),
            stream=optional_params["stream"],
        )
        if response.status_code != 200:
            raise AnthropicError(
                status_code=response.status_code, message=response.text
            )
        completion_stream = response.iter_lines()
        stream_response = CustomStreamWrapper(
            completion_stream=completion_stream,
            model=model,
            custom_llm_provider="anthropic",
            logging_obj=logging_obj,
        )
        return stream_response
    else:
        response = requests.post(api_base, headers=headers, data=json.dumps(data))
        if response.status_code != 200:
            raise AnthropicError(
                status_code=response.status_code, message=response.text
            )
        ## LOGGING
        logging_obj.post_call(
            input=prompt,
            api_key=api_key,
            original_response=response.text,
            additional_args={"complete_input_dict": data},
        )
        print_verbose(f"raw model_response: {response.text}")
        ## RESPONSE OBJECT
        try:
            completion_response = response.json()
@ -221,9 +138,204 @@ def completion(
            total_tokens=prompt_tokens + completion_tokens,
        )
        model_response.usage = usage
        return model_response
    async def async_completion(
        self,
        model: str,
        model_response: ModelResponse,
        api_base: str,
        logging_obj,
        encoding,
        headers: dict,
        data: dict,
        client=None,
    ):
        if client is None:
            client = AsyncHTTPHandler(timeout=httpx.Timeout(timeout=600.0, connect=5.0))
-def embedding():
+        response = await client.post(api_base, headers=headers, data=json.dumps(data))
-    # logic for parsing in - calling - parsing out model embedding calls
+
-    pass
+        if response.status_code != 200:
            raise AnthropicError(
                status_code=response.status_code, message=response.text
            )
        ## LOGGING
        logging_obj.post_call(
            input=data["prompt"],
            api_key=headers.get("x-api-key"),
            original_response=response.text,
            additional_args={"complete_input_dict": data},
        )
        response = self.process_response(
            model_response=model_response,
            response=response,
            encoding=encoding,
            prompt=data["prompt"],
            model=model,
        )
        return response
    async def async_streaming(
        self,
        model: str,
        api_base: str,
        logging_obj,
        headers: dict,
        data: Optional[dict],
        client=None,
    ):
        """POST ``data`` to ``api_base`` and wrap the response lines as a stream.

        Args:
            model: Model name handed to the stream wrapper.
            api_base: URL the request is posted to.
            logging_obj: Logging object handed to the stream wrapper.
            headers: HTTP headers sent with the request.
            data: Request payload; serialized with ``json.dumps`` before sending.
            client: Object exposing an awaitable ``post``. When ``None``, an
                ``AsyncHTTPHandler`` with a 600s timeout (5s connect) is created.

        Returns:
            A ``CustomStreamWrapper`` over ``response.aiter_lines()`` tagged with
            the ``"anthropic_text"`` provider.

        Raises:
            AnthropicError: If the response status code is not 200.
        """
        http_client = client
        if http_client is None:
            request_timeout = httpx.Timeout(timeout=600.0, connect=5.0)
            http_client = AsyncHTTPHandler(timeout=request_timeout)

        payload = json.dumps(data)
        raw_response = await http_client.post(
            api_base, headers=headers, data=payload
        )

        status = raw_response.status_code
        if status != 200:
            raise AnthropicError(status_code=status, message=raw_response.text)

        return CustomStreamWrapper(
            completion_stream=raw_response.aiter_lines(),
            model=model,
            custom_llm_provider="anthropic_text",
            logging_obj=logging_obj,
        )
    def completion(
        self,
        model: str,
        messages: list,
        api_base: str,
        acompletion: bool,
        custom_prompt_dict: dict,
        model_response: ModelResponse,
        print_verbose: Callable,
        encoding,
        api_key,
        logging_obj,
        optional_params=None,
        litellm_params=None,
        logger_fn=None,
        headers=None,
        client=None,
    ):
        """Build an Anthropic text-completion request and dispatch it.

        The prompt is built with ``custom_prompt`` when ``model`` has an entry in
        ``custom_prompt_dict`` and with ``prompt_factory`` otherwise. Values from
        ``litellm.AnthropicTextConfig.get_config()`` are merged into
        ``optional_params`` for keys the caller did not set.

        Args:
            model: Model name; sent in the payload and used for prompt lookup.
            messages: Chat messages that are turned into the prompt.
            api_base: URL the request is posted to.
            acompletion: When equal to ``True`` the async helpers are used and
                their un-awaited coroutine is returned.
            custom_prompt_dict: Per-model prompt settings with the keys
                ``"roles"``, ``"initial_prompt_value"`` and ``"final_prompt_value"``.
            model_response: Response object forwarded to ``process_response``.
            print_verbose: Callable that receives the raw response text.
            encoding: Forwarded to ``process_response``.
            api_key: Passed to ``validate_environment`` and to the logging calls.
            logging_obj: Object whose ``pre_call`` / ``post_call`` are invoked.
            optional_params: Extra request parameters (updated in place with the
                config defaults). ``None`` is treated as an empty dict.
            litellm_params: Not used in this method.
            logger_fn: Not used in this method.
            headers: Caller-supplied headers handed to ``validate_environment``.
                ``None`` is treated as an empty dict.
            client: HTTP client to use. When ``None``, an ``HTTPHandler`` with a
                600s timeout (5s connect) is created for the sync paths.

        Returns:
            * async + stream: the coroutine returned by ``self.async_streaming``.
            * async, no stream: the coroutine returned by ``self.async_completion``.
            * sync + stream: a ``CustomStreamWrapper`` over ``response.iter_lines()``.
            * sync, no stream: the result of ``self.process_response``.

        Raises:
            AnthropicError: On the sync paths, if the response status is not 200.
        """
        # Build a fresh dict per call instead of sharing one mutable default.
        headers = validate_environment(api_key, {} if headers is None else headers)
        # The config defaults below are merged into this dict, so it must exist.
        if optional_params is None:
            optional_params = {}

        if model in custom_prompt_dict:
            # check if the model has a registered custom prompt
            model_prompt_details = custom_prompt_dict[model]
            prompt = custom_prompt(
                role_dict=model_prompt_details["roles"],
                initial_prompt_value=model_prompt_details["initial_prompt_value"],
                final_prompt_value=model_prompt_details["final_prompt_value"],
                messages=messages,
            )
        else:
            prompt = prompt_factory(
                model=model, messages=messages, custom_llm_provider="anthropic"
            )

        ## Load Config
        for k, v in litellm.AnthropicTextConfig.get_config().items():
            # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in
            optional_params.setdefault(k, v)

        data = {
            "model": model,
            "prompt": prompt,
            **optional_params,
        }

        ## LOGGING
        logging_obj.pre_call(
            input=prompt,
            api_key=api_key,
            additional_args={
                "complete_input_dict": data,
                "api_base": api_base,
                "headers": headers,
            },
        )

        # Equality (not identity) is kept so values that compare equal to True
        # are treated exactly as before.
        streaming = optional_params.get("stream") == True  # noqa: E712
        is_async = acompletion == True  # noqa: E712

        ## COMPLETION CALL
        if streaming and is_async:
            # NOTE(review): the caller-supplied ``client`` is not forwarded here,
            # unlike the non-streaming async path below - confirm this is intended.
            return self.async_streaming(
                model=model,
                api_base=api_base,
                logging_obj=logging_obj,
                headers=headers,
                data=data,
                client=None,
            )
        if is_async:
            return self.async_completion(
                model=model,
                model_response=model_response,
                api_base=api_base,
                logging_obj=logging_obj,
                encoding=encoding,
                headers=headers,
                data=data,
                client=client,
            )

        # Both sync paths share the same request and status check.
        if client is None:
            client = HTTPHandler(timeout=httpx.Timeout(timeout=600.0, connect=5.0))
        # NOTE(review): no stream flag is passed to ``client.post``; whether the
        # body is delivered incrementally depends on ``HTTPHandler.post`` - confirm.
        response = client.post(api_base, headers=headers, data=json.dumps(data))
        if response.status_code != 200:
            raise AnthropicError(
                status_code=response.status_code, message=response.text
            )

        if streaming:
            return CustomStreamWrapper(
                completion_stream=response.iter_lines(),
                model=model,
                custom_llm_provider="anthropic_text",
                logging_obj=logging_obj,
            )

        ## LOGGING
        logging_obj.post_call(
            input=prompt,
            api_key=api_key,
            original_response=response.text,
            additional_args={"complete_input_dict": data},
        )
        print_verbose(f"raw model_response: {response.text}")
        return self.process_response(
            model_response=model_response,
            response=response,
            encoding=encoding,
            prompt=data["prompt"],
            model=model,
        )
    def embedding(self):
        """Placeholder for embedding calls; does nothing and returns ``None``."""
        # Intended home for: parse inputs -> call the model -> parse outputs.
        return None
--- a/litellm/llms/custom_httpx/http_handler.py
+++ b/litellm/llms/custom_httpx/http_handler.py
@ -58,13 +58,16 @@ class AsyncHTTPHandler:
 class HTTPHandler:
-    def __init__(self, concurrent_limit=1000):
+    def __init__(
        self, timeout: httpx.Timeout = _DEFAULT_TIMEOUT, concurrent_limit=1000
    ):
        # Create a client with a connection pool
        self.client = httpx.Client(
            timeout=timeout,
            limits=httpx.Limits(
                max_connections=concurrent_limit,
                max_keepalive_connections=concurrent_limit,
-            )
+            ),
        )
    def close(self):
--- a/litellm/main.py
+++ b/litellm/main.py
@ -67,6 +67,7 @@ from .llms.openai import OpenAIChatCompletion, OpenAITextCompletion
 from .llms.azure import AzureChatCompletion
 from .llms.azure_text import AzureTextCompletion
 from .llms.anthropic import AnthropicChatCompletion
 from .llms.anthropic_text import AnthropicTextCompletion
 from .llms.huggingface_restapi import Huggingface
 from .llms.prompt_templates.factory import (
    prompt_factory,
@ -99,6 +100,7 @@ dotenv.load_dotenv()  # Loading env variables using dotenv
 openai_chat_completions = OpenAIChatCompletion()
 openai_text_completions = OpenAITextCompletion()
 anthropic_chat_completions = AnthropicChatCompletion()
 anthropic_text_completions = AnthropicTextCompletion()
 azure_chat_completions = AzureChatCompletion()
 azure_text_completions = AzureTextCompletion()
 huggingface = Huggingface()
@ -1165,10 +1167,11 @@ def completion(
                    or get_secret("ANTHROPIC_API_BASE")
                    or "https://api.anthropic.com/v1/complete"
                )
-                response = anthropic_text.completion(
+                response = anthropic_text_completions.completion(
                    model=model,
                    messages=messages,
                    api_base=api_base,
                    acompletion=acompletion,
                    custom_prompt_dict=litellm.custom_prompt_dict,
                    model_response=model_response,
                    print_verbose=print_verbose,
--- a/litellm/tests/test_completion.py
+++ b/litellm/tests/test_completion.py
@ -492,6 +492,31 @@ def test_completion_claude2_1():
 # test_completion_claude2_1()
@pytest.mark.asyncio
 async def test_acompletion_claude2_1():
    """Call ``litellm.acompletion`` with claude-2.1 and print the response usage.

    Any exception raised along the way is reported through ``pytest.fail``.
    """
    joke_request = [
        {
            "role": "system",
            "content": "Your goal is generate a joke on the topic user gives.",
        },
        {"role": "user", "content": "Generate a 3 liner joke for me"},
    ]
    try:
        litellm.set_verbose = True
        print("claude2.1 test request")
        # max tokens is deliberately left unset for this request
        result = await litellm.acompletion(model="claude-2.1", messages=joke_request)
        # No assertions yet: the test only checks that these lookups succeed.
        print(result)
        usage = result.usage
        print(usage)
        print(usage.completion_tokens)
        print(result["usage"]["completion_tokens"])
    except Exception as err:
        pytest.fail(f"Error occurred: {err}")
 # def test_completion_oobabooga():
 #     try:
 #         response = completion(
--- a/litellm/tests/test_streaming.py
+++ b/litellm/tests/test_streaming.py
@ -380,6 +380,51 @@ def test_completion_claude_stream():
 # test_completion_claude_stream()
 def test_completion_claude_2_stream():
    """Stream a claude-2 completion and fail if no text is collected.

    Each chunk goes through ``streaming_format_tests`` together with its
    position; iteration stops at the first chunk reported as finished.
    """
    litellm.set_verbose = True
    stream = completion(
        model="claude-2",
        messages=[{"role": "user", "content": "hello from litellm"}],
        stream=True,
    )

    collected = ""
    for position, raw_chunk in enumerate(stream):
        print(raw_chunk)
        piece, done = streaming_format_tests(position, raw_chunk)
        if done:
            break
        collected += piece

    if not collected.strip():
        raise Exception("Empty response received")
    print(f"completion_response: {collected}")
@pytest.mark.asyncio
 async def test_acompletion_claude_2_stream():
    litellm.set_verbose = True
    response = await litellm.acompletion(
        model="claude-2",
        messages=[{"role": "user", "content": "hello from litellm"}],
        stream=True,
    )
    complete_response = ""
    # Add any assertions here to check the response
    idx = 0
    async for chunk in response:
        print(chunk)
        # print(chunk.choices[0].delta)
        chunk, finished = streaming_format_tests(idx, chunk)
        if finished:
            break
        complete_response += chunk
        idx += 1
    if complete_response.strip() == "":
        raise Exception("Empty response received")
    print(f"completion_response: {complete_response}")
 def test_completion_palm_stream():
--- a/litellm/utils.py
+++ b/litellm/utils.py
@ -8810,6 +8810,35 @@ class CustomStreamWrapper:
            self.holding_chunk = ""
        return hold, curr_chunk
    def handle_anthropic_text_chunk(self, chunk):
        """Parse one line of an Anthropic text-completion stream.

        Args:
            chunk: A single stream line, as ``str`` or UTF-8 encoded ``bytes``.

        Returns:
            A dict with the keys ``"text"``, ``"is_finished"`` and
            ``"finish_reason"``. Only ``data:`` lines whose JSON ``"type"`` is
            ``"completion"`` fill these in; every other accepted line yields
            ``""``, ``False`` and ``None``.

        Raises:
            ValueError: If a line that does not start with ``data:`` contains
                ``"error"``.
            json.JSONDecodeError: If the payload of a ``data:`` line is not
                valid JSON.
        """
        str_line = chunk.decode("utf-8") if isinstance(chunk, bytes) else chunk

        text = ""
        is_finished = False
        finish_reason = None

        if str_line.startswith("data:"):
            data_json = json.loads(str_line[5:])
            if data_json.get("type", None) == "completion":
                # A missing or null "completion" becomes "" so that "text" is
                # always a string, matching the default above.
                text = data_json.get("completion") or ""
                finish_reason = data_json.get("stop_reason")
                is_finished = finish_reason is not None
        elif "error" in str_line:
            raise ValueError(f"Unable to parse response. Original response: {str_line}")

        return {
            "text": text,
            "is_finished": is_finished,
            "finish_reason": finish_reason,
        }
    def handle_anthropic_chunk(self, chunk):
        str_line = chunk
        if isinstance(chunk, bytes):  # Handle binary data
@ -9497,6 +9526,14 @@ class CustomStreamWrapper:
                completion_obj["content"] = response_obj["text"]
                if response_obj["is_finished"]:
                    self.received_finish_reason = response_obj["finish_reason"]
            elif (
                self.custom_llm_provider
                and self.custom_llm_provider == "anthropic_text"
            ):
                response_obj = self.handle_anthropic_text_chunk(chunk)
                completion_obj["content"] = response_obj["text"]
                if response_obj["is_finished"]:
                    self.received_finish_reason = response_obj["finish_reason"]
            elif self.model == "replicate" or self.custom_llm_provider == "replicate":
                response_obj = self.handle_replicate_chunk(chunk)
                completion_obj["content"] = response_obj["text"]
@ -10074,6 +10111,7 @@ class CustomStreamWrapper:
                or self.custom_llm_provider == "text-completion-openai"
                or self.custom_llm_provider == "azure_text"
                or self.custom_llm_provider == "anthropic"
                or self.custom_llm_provider == "anthropic_text"
                or self.custom_llm_provider == "huggingface"
                or self.custom_llm_provider == "ollama"
                or self.custom_llm_provider == "ollama_chat"