Add support for Triton streaming & Triton async completions

Sophia Loris 2024-07-19 09:35:27 -05:00
parent 1b3050477a
commit d5c65c6be2
3 changed files with 199 additions and 33 deletions
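
For context, the new code paths can be exercised through litellm's public completion APIs roughly as sketched below. This is a minimal sketch assuming a local Triton deployment; the "triton/" model prefix, model name, and endpoint URL are placeholders, not values taken from this commit.

import asyncio
import litellm

# Placeholder assumptions: adjust the model name and generate endpoint
# to match your own Triton Inference Server deployment.
API_BASE = "http://localhost:8000/v2/models/ensemble/generate"
MODEL = "triton/ensemble"
MESSAGES = [{"role": "user", "content": "What is Triton Inference Server?"}]


def stream_sync() -> None:
    # stream=True now routes through TritonChatCompletion._handle_stream.
    response = litellm.completion(
        model=MODEL, messages=MESSAGES, api_base=API_BASE, stream=True
    )
    for chunk in response:
        print(chunk.choices[0].delta.content or "", end="", flush=True)


async def complete_async() -> None:
    # acompletion now routes through the new async TritonChatCompletion.acompletion.
    response = await litellm.acompletion(
        model=MODEL, messages=MESSAGES, api_base=API_BASE
    )
    print(response.choices[0].message.content)


if __name__ == "__main__":
    stream_sync()
    asyncio.run(complete_async())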


@@ -4,15 +4,23 @@ from enum import Enum
 import requests  # type: ignore
 import time
 from typing import Callable, Optional, List, Sequence, Any, Union, Dict
-from litellm.utils import ModelResponse, Choices, Usage, map_finish_reason, CustomStreamWrapper, Message, EmbeddingResponse
+from litellm.utils import (
+    ModelResponse,
+    Choices,
+    Delta,
+    Usage,
+    map_finish_reason,
+    CustomStreamWrapper,
+    Message,
+    EmbeddingResponse,
+)
 import litellm
 from .prompt_templates.factory import prompt_factory, custom_prompt
-from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
+from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
 from .base import BaseLLM
 import httpx  # type: ignore


 class TritonError(Exception):
     def __init__(self, status_code: int, message: str) -> None:
         self.status_code = status_code
@@ -26,6 +34,7 @@ class TritonError(Exception):
             self.message
         )  # Call the base class constructor with the parameters it needs

+
 class TritonChatCompletion(BaseLLM):
     def __init__(self) -> None:
         super().__init__()
@@ -127,41 +136,68 @@ class TritonChatCompletion(BaseLLM):
         optional_params=None,
         client=None,
         stream: bool = False,
+        acompletion: bool = False,
     ) -> ModelResponse:
         type_of_model = ""
+        optional_params.pop("stream", False)
         if api_base.endswith("generate"):  ### This is a trtllm model
             text_input = messages[0]["content"]
             data_for_triton: Dict[str, Any] = {
-                "text_input": str(text_input),
+                "text_input": prompt_factory(model=model, messages=messages),
                 "parameters": {
-                    "max_tokens": int(optional_params.get("max_tokens", 20)),
+                    "max_tokens": int(optional_params.get("max_tokens", 2000)),
                     "bad_words": [""],
-                    "stop_words": [""]
-                }
+                    "stop_words": [""],
+                },
+                "stream": bool(stream),
             }
             data_for_triton["parameters"].update(optional_params)
             type_of_model = "trtllm"
-        elif api_base.endswith("infer"):  ### This is an infer model with a custom model on triton
+        elif api_base.endswith(
+            "infer"
+        ):  ### This is an infer model with a custom model on triton
             text_input = messages[0]["content"]
             data_for_triton = {
-                "inputs": [{"name": "text_input", "shape": [1], "datatype": "BYTES", "data": [text_input]}]
+                "inputs": [
+                    {
+                        "name": "text_input",
+                        "shape": [1],
+                        "datatype": "BYTES",
+                        "data": [text_input],
+                    }
+                ]
             }
             for k, v in optional_params.items():
                 if not (k == "stream" or k == "max_retries"):
                     datatype = "INT32" if isinstance(v, int) else "BYTES"
                     datatype = "FP32" if isinstance(v, float) else datatype
-                    data_for_triton['inputs'].append({"name": k, "shape": [1], "datatype": datatype, "data": [v]})
+                    data_for_triton["inputs"].append(
+                        {"name": k, "shape": [1], "datatype": datatype, "data": [v]}
+                    )
             if "max_tokens" not in optional_params:
-                data_for_triton['inputs'].append({"name": "max_tokens", "shape": [1], "datatype": "INT32", "data": [20]})
+                data_for_triton["inputs"].append(
+                    {
+                        "name": "max_tokens",
+                        "shape": [1],
+                        "datatype": "INT32",
+                        "data": [20],
+                    }
+                )
             type_of_model = "infer"
         else:  ## Unknown model type passthrough
             data_for_triton = {
-                "inputs": [{"name": "text_input", "shape": [1], "datatype": "BYTES", "data": [messages[0]["content"]]}]
+                "inputs": [
+                    {
+                        "name": "text_input",
+                        "shape": [1],
+                        "datatype": "BYTES",
+                        "data": [messages[0]["content"]],
+                    }
+                ]
             }

         if logging_obj:
@@ -174,24 +210,108 @@ class TritonChatCompletion(BaseLLM):
                     "http_client": client,
                 },
             )
-        handler = requests.Session()
-        handler.timeout = (600.0, 5.0)
-        response = handler.post(url=api_base, json=data_for_triton)
+        headers = {"Content-Type": "application/json"}
+        data_for_triton = json.dumps(data_for_triton)
+
+        if acompletion:
+            return self.acompletion(
+                model,
+                data_for_triton,
+                headers=headers,
+                logging_obj=logging_obj,
+                api_base=api_base,
+                stream=stream,
+                model_response=model_response,
+                type_of_model=type_of_model,
+            )
+        else:
+            handler = HTTPHandler()
+            if stream:
+                return self._handle_stream(
+                    handler, api_base, data_for_triton, model, logging_obj
+                )
+            else:
+                response = handler.post(url=api_base, data=data_for_triton, headers=headers)
+                return self._handle_response(
+                    response, model_response, logging_obj, type_of_model=type_of_model
+                )
+
+    async def acompletion(
+        self,
+        model: str,
+        data_for_triton,
+        api_base,
+        stream,
+        logging_obj,
+        headers,
+        model_response,
+        type_of_model,
+    ) -> ModelResponse:
+        handler = AsyncHTTPHandler()
+        if stream:
+            return self._ahandle_stream(
+                handler, api_base, data_for_triton, model, logging_obj
+            )
+        else:
+            response = await handler.post(
+                url=api_base, data=data_for_triton, headers=headers
+            )
+            return self._handle_response(
+                response, model_response, logging_obj, type_of_model=type_of_model
+            )
+
+    def _handle_stream(self, handler, api_base, data_for_triton, model, logging_obj):
+        response = handler.post(
+            url=api_base + "_stream", data=data_for_triton, stream=True
+        )
+        streamwrapper = litellm.CustomStreamWrapper(
+            response.iter_lines(),
+            model=model,
+            custom_llm_provider="triton",
+            logging_obj=logging_obj,
+        )
+        for chunk in streamwrapper:
+            yield (chunk)
+
+    async def _ahandle_stream(
+        self, handler, api_base, data_for_triton, model, logging_obj
+    ):
+        response = await handler.post(
+            url=api_base + "_stream", data=data_for_triton, stream=True
+        )
+        streamwrapper = litellm.CustomStreamWrapper(
+            response.aiter_lines(),
+            model=model,
+            custom_llm_provider="triton",
+            logging_obj=logging_obj,
+        )
+        async for chunk in streamwrapper:
+            yield (chunk)
+
+    def _handle_response(self, response, model_response, logging_obj, type_of_model):
         if logging_obj:
             logging_obj.post_call(original_response=response)

         if response.status_code != 200:
             raise TritonError(status_code=response.status_code, message=response.text)

         _json_response = response.json()
         model_response.model = _json_response.get("model_name", "None")
         if type_of_model == "trtllm":
-            model_response.choices = [Choices(index=0, message=Message(content=_json_response['text_output']))]
+            model_response.choices = [
+                Choices(index=0, message=Message(content=_json_response["text_output"]))
+            ]
         elif type_of_model == "infer":
-            model_response.choices = [Choices(index=0, message=Message(content=_json_response['outputs'][0]['data']))]
+            model_response.choices = [
+                Choices(
+                    index=0,
+                    message=Message(content=_json_response["outputs"][0]["data"]),
+                )
+            ]
         else:
-            model_response.choices = [Choices(index=0, message=Message(content=_json_response['outputs']))]
+            model_response.choices = [
+                Choices(index=0, message=Message(content=_json_response["outputs"]))
+            ]
         return model_response
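
For reference, the trtllm branch above now serializes a request body of roughly the following shape before POSTing it to the generate endpoint (or to its "_stream" variant when streaming). This is an illustrative sketch: the prompt text and parameter values are examples, and the final dict also absorbs whatever optional_params the caller passed.

# Illustrative payload produced by the trtllm branch; "text_input" comes from
# prompt_factory() and extra keys are merged in via parameters.update(optional_params).
data_for_triton = {
    "text_input": "What is Triton Inference Server?",
    "parameters": {
        "max_tokens": 2000,
        "bad_words": [""],
        "stop_words": [""],
    },
    "stream": True,
}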


@@ -333,6 +333,7 @@ async def acompletion(
             or custom_llm_provider == "predibase"
             or custom_llm_provider == "bedrock"
             or custom_llm_provider == "databricks"
+            or custom_llm_provider == "triton"
             or custom_llm_provider in litellm.openai_compatible_providers
         ):  # currently implemented aiohttp calls for just azure, openai, hf, ollama, vertex ai soon all.
             init_response = await loop.run_in_executor(None, func_with_context)
@@ -2267,6 +2268,8 @@ def completion(
                 model_response=model_response,
                 optional_params=optional_params,
                 logging_obj=logging,
+                stream=stream,
+                acompletion=acompletion
             )

             ## RESPONSE OBJECT


@@ -11013,6 +11013,42 @@ class CustomStreamWrapper:
         except Exception as e:
             raise e

+    def handle_triton_stream(self, chunk):
+        try:
+            if isinstance(chunk, dict):
+                parsed_response = chunk
+            elif isinstance(chunk, (str, bytes)):
+                if isinstance(chunk, bytes):
+                    chunk = chunk.decode("utf-8")
+                if "text_output" in chunk:
+                    response = chunk.replace("data: ", "").strip()
+                    parsed_response = json.loads(response)
+                else:
+                    return {
+                        "text": "",
+                        "is_finished": False,
+                        "prompt_tokens": 0,
+                        "completion_tokens": 0,
+                    }
+            else:
+                print_verbose(f"chunk: {chunk} (Type: {type(chunk)})")
+                raise ValueError(
+                    f"Unable to parse response. Original response: {chunk}"
+                )
+            text = parsed_response.get("text_output", "")
+            finish_reason = parsed_response.get("stop_reason")
+            is_finished = parsed_response.get("is_finished", False)
+            return {
+                "text": text,
+                "is_finished": is_finished,
+                "finish_reason": finish_reason,
+                "prompt_tokens": parsed_response.get("input_token_count", 0),
+                "completion_tokens": parsed_response.get("generated_token_count", 0),
+            }
+            return {"text": "", "is_finished": False}
+        except Exception as e:
+            raise e
+
     def handle_clarifai_completion_chunk(self, chunk):
         try:
             if isinstance(chunk, dict):
@@ -11337,6 +11373,12 @@ class CustomStreamWrapper:
                 completion_obj["content"] = response_obj["text"]
                 if response_obj["is_finished"]:
                     self.received_finish_reason = response_obj["finish_reason"]
+            elif self.custom_llm_provider == "triton":
+                response_obj = self.handle_triton_stream(chunk)
+                completion_obj["content"] = response_obj["text"]
+                print_verbose(f"completion obj content: {completion_obj['content']}")
+                if response_obj["is_finished"]:
+                    self.received_finish_reason = response_obj["finish_reason"]
             elif self.custom_llm_provider == "text-completion-openai":
                 response_obj = self.handle_openai_text_completion_chunk(chunk)
                 completion_obj["content"] = response_obj["text"]
@@ -11773,6 +11815,7 @@ class CustomStreamWrapper:
                 or self.custom_llm_provider == "predibase"
                 or self.custom_llm_provider == "databricks"
                 or self.custom_llm_provider == "bedrock"
+                or self.custom_llm_provider == "triton"
                 or self.custom_llm_provider in litellm.openai_compatible_endpoints
             ):
                 async for chunk in self.completion_stream:
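
Finally, with "triton" added to the async iteration branch above, an async streaming call can be consumed directly. Again a sketch under the same assumptions as earlier (placeholder model name and api_base); each yielded chunk is built from a "data: {...}" line parsed by handle_triton_stream.

import asyncio
import litellm


async def stream_async() -> None:
    response = await litellm.acompletion(
        model="triton/ensemble",  # placeholder model name
        messages=[{"role": "user", "content": "Hello from Triton"}],
        api_base="http://localhost:8000/v2/models/ensemble/generate",  # placeholder
        stream=True,
    )
    async for chunk in response:
        print(chunk.choices[0].delta.content or "", end="", flush=True)


asyncio.run(stream_async())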