Merge pull request #3577 from BerriAI/litellm_add_triton_server

[Feat] Add Triton Embeddings to LiteLLM
Ishaan Jaff 2024-05-10 19:20:23 -07:00 committed by GitHub
commit b09075da53
8 changed files with 265 additions and 0 deletions


@ -0,0 +1,95 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Triton Inference Server
LiteLLM supports calling Embedding Models hosted on a Triton Inference Server.
## Usage
<Tabs>
<TabItem value="sdk" label="SDK">
### Example Call
Use the `triton/` prefix to route requests to your Triton Inference Server.
```python
import litellm

# litellm.aembedding is async - run this inside an async function / running event loop
response = await litellm.aembedding(
    model="triton/<your-triton-model>",
    api_base="https://your-triton-api-base/triton/embeddings",  # /embeddings endpoint you want litellm to call on your server
    input=["good morning from litellm"],
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Add models to your config.yaml
```yaml
model_list:
  - model_name: my-triton-model
    litellm_params:
      model: triton/<your-triton-model>
      api_base: https://your-triton-api-base/triton/embeddings
```
2. Start the proxy
```bash
$ litellm --config /path/to/config.yaml --detailed_debug
```
3. Send Request to LiteLLM Proxy Server
<Tabs>
<TabItem value="openai" label="OpenAI Python v1.0.0+">
```python
from openai import OpenAI

# point base_url at your LiteLLM proxy server
# api_key is the key you use to authenticate with the proxy
client = OpenAI(api_key="<proxy-api-key>", base_url="http://0.0.0.0:4000")

response = client.embeddings.create(
    input=["hello from litellm"],
    model="my-triton-model"
)
print(response)
```
</TabItem>
<TabItem value="curl" label="curl">
The `Authorization` header is optional; it is only required if you're using the LiteLLM proxy with Virtual Keys.
```shell
curl --location 'http://0.0.0.0:4000/embeddings' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer sk-1234' \
--data '{
  "model": "my-triton-model",
  "input": ["write a litellm poem"]
}'
```
</TabItem>
</Tabs>
</TabItem>
</Tabs>
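
For reference, the handler added in this PR (`litellm/llms/triton.py`) builds the request body and reads back the response as sketched below. This is a minimal sketch based on that handler's code; the actual endpoint path is whatever you pass as `api_base`, and the embedding values shown are hypothetical.

```python
# Request body LiteLLM POSTs to your `api_base` (per the handler in this PR):
request_body = {
    "inputs": [
        {
            "name": "input_text",
            "shape": [1],
            "datatype": "BYTES",
            "data": ["good morning from litellm"],  # your `input` list
        }
    ]
}

# Response shape the handler expects back from the server (values are hypothetical):
server_response = {
    "model_name": "<your-triton-model>",
    "outputs": [{"data": [0.1, 0.2, 0.3]}],
}
# LiteLLM maps outputs[0]["data"] to response.data[0]["embedding"]
```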


@ -134,6 +134,7 @@ const sidebars = {
"providers/huggingface",
"providers/watsonx",
"providers/predibase",
"providers/triton-inference-server",
"providers/ollama",
"providers/perplexity",
"providers/groq",


@ -1,5 +1,6 @@
### Hide pydantic namespace conflict warnings globally ###
import warnings
warnings.filterwarnings("ignore", message=".*conflict with protected namespace.*")
### INIT VARIABLES ###
import threading, requests, os
@ -537,6 +538,7 @@ provider_list: List = [
"xinference",
"fireworks_ai",
"watsonx",
"triton",
"predibase",
"custom", # custom apis
]

litellm/llms/triton.py (new file, 119 lines)

@ -0,0 +1,119 @@
import os, types
import json
from enum import Enum
import requests, copy # type: ignore
import time
from typing import Callable, Optional, List
from litellm.utils import ModelResponse, Usage, map_finish_reason, CustomStreamWrapper
import litellm
from .prompt_templates.factory import prompt_factory, custom_prompt
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
from .base import BaseLLM
import httpx # type: ignore
class TritonError(Exception):
def __init__(self, status_code, message):
self.status_code = status_code
self.message = message
self.request = httpx.Request(
method="POST",
url="https://api.anthropic.com/v1/messages", # using anthropic api base since httpx requires a url
)
self.response = httpx.Response(status_code=status_code, request=self.request)
super().__init__(
self.message
) # Call the base class constructor with the parameters it needs
class TritonChatCompletion(BaseLLM):
def __init__(self) -> None:
super().__init__()
async def aembedding(
self,
data: dict,
model_response: litellm.utils.EmbeddingResponse,
api_base: str,
logging_obj=None,
api_key: Optional[str] = None,
):
async_handler = AsyncHTTPHandler(
timeout=httpx.Timeout(timeout=600.0, connect=5.0)
)
response = await async_handler.post(url=api_base, data=json.dumps(data))
if response.status_code != 200:
raise TritonError(status_code=response.status_code, message=response.text)
_text_response = response.text
logging_obj.post_call(original_response=_text_response)
_json_response = response.json()
_outputs = _json_response["outputs"]
_output_data = _outputs[0]["data"]
_embedding_output = {
"object": "embedding",
"index": 0,
"embedding": _output_data,
}
model_response.model = _json_response.get("model_name", "None")
model_response.data = [_embedding_output]
return model_response
def embedding(
self,
model: str,
input: list,
timeout: float,
api_base: str,
model_response: litellm.utils.EmbeddingResponse,
api_key: Optional[str] = None,
logging_obj=None,
optional_params=None,
client=None,
aembedding=None,
):
data_for_triton = {
"inputs": [
{
"name": "input_text",
"shape": [1],
"datatype": "BYTES",
"data": input,
}
]
}
## LOGGING
curl_string = f"curl {api_base} -X POST -H 'Content-Type: application/json' -d '{data_for_triton}'"
logging_obj.pre_call(
input="",
api_key=None,
additional_args={
"complete_input_dict": optional_params,
"request_str": curl_string,
},
)
if aembedding == True:
response = self.aembedding(
data=data_for_triton,
model_response=model_response,
logging_obj=logging_obj,
api_base=api_base,
api_key=api_key,
)
return response
else:
raise Exception(
"Only async embedding supported for triton, please use litellm.aembedding() for now"
)
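
Since the handler raises unless the call comes through the async path, synchronous callers would need to drive `litellm.aembedding` themselves. A minimal sketch, assuming a standard asyncio entrypoint (the model name and `api_base` are placeholders):

```python
import asyncio

import litellm

async def get_embedding():
    # triton embeddings are async-only for now, so wrap the call in a coroutine
    return await litellm.aembedding(
        model="triton/<your-triton-model>",
        api_base="https://your-triton-api-base/triton/embeddings",
        input=["good morning from litellm"],
    )

response = asyncio.run(get_embedding())
print(response.data[0]["embedding"])
```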


@ -47,6 +47,7 @@ from .llms import (
ai21,
sagemaker,
bedrock,
triton,
huggingface_restapi,
replicate,
aleph_alpha,
@ -75,6 +76,7 @@ from .llms.anthropic import AnthropicChatCompletion
from .llms.anthropic_text import AnthropicTextCompletion
from .llms.huggingface_restapi import Huggingface
from .llms.predibase import PredibaseChatCompletion
from .llms.triton import TritonChatCompletion
from .llms.prompt_templates.factory import (
prompt_factory,
custom_prompt,
@ -112,6 +114,7 @@ azure_chat_completions = AzureChatCompletion()
azure_text_completions = AzureTextCompletion()
huggingface = Huggingface()
predibase_chat_completions = PredibaseChatCompletion()
triton_chat_completions = TritonChatCompletion()
####### COMPLETION ENDPOINTS ################
@ -2622,6 +2625,7 @@ async def aembedding(*args, **kwargs):
or custom_llm_provider == "voyage"
or custom_llm_provider == "mistral"
or custom_llm_provider == "custom_openai"
or custom_llm_provider == "triton"
or custom_llm_provider == "anyscale"
or custom_llm_provider == "openrouter"
or custom_llm_provider == "deepinfra"
@ -2955,6 +2959,23 @@ def embedding(
optional_params=optional_params,
model_response=EmbeddingResponse(),
)
elif custom_llm_provider == "triton":
if api_base is None:
raise ValueError(
"api_base is required for triton. Please pass `api_base`"
)
response = triton_chat_completions.embedding(
model=model,
input=input,
api_base=api_base,
api_key=api_key,
logging_obj=logging,
timeout=timeout,
model_response=EmbeddingResponse(),
optional_params=optional_params,
client=client,
aembedding=aembedding,
)
elif custom_llm_provider == "vertex_ai":
vertex_ai_project = (
optional_params.pop("vertex_project", None)


@ -8,6 +8,10 @@ model_list:
litellm_params:
model: openai/*
api_key: os.environ/OPENAI_API_KEY
- model_name: my-triton-model
litellm_params:
model: triton/any
api_base: https://exampleopenaiendpoint-production.up.railway.app/triton/embeddings
general_settings:
store_model_in_db: true


@ -516,6 +516,23 @@ def test_voyage_embeddings():
pytest.fail(f"Error occurred: {e}")
@pytest.mark.asyncio
async def test_triton_embeddings():
try:
litellm.set_verbose = True
response = await litellm.aembedding(
model="triton/my-triton-model",
api_base="https://exampleopenaiendpoint-production.up.railway.app/triton/embeddings",
input=["good morning from litellm"],
)
print(f"response: {response}")
# stubbed endpoint is setup to return this
assert response.data[0]["embedding"] == [0.1, 0.2, 0.3]
except Exception as e:
pytest.fail(f"Error occurred: {e}")
# test_voyage_embeddings()
# def test_xinference_embeddings():
# try:


@ -4814,6 +4814,12 @@ def get_optional_params_embeddings(
status_code=500,
message=f"Setting dimensions is not supported for OpenAI `text-embedding-3` and later models. To drop it from the call, set `litellm.drop_params = True`.",
)
if custom_llm_provider == "triton":
keys = list(non_default_params.keys())
for k in keys:
non_default_params.pop(k, None)
final_params = {**non_default_params, **kwargs}
return final_params
if custom_llm_provider == "vertex_ai":
if len(non_default_params.keys()) > 0:
if litellm.drop_params is True: # drop the unsupported non-default values
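
As a hedged illustration of the `triton` branch above: every caller-supplied optional embedding parameter is dropped before the request is built, since the Triton payload only carries the raw input tensor (the parameter values below are hypothetical).

```python
# minimal sketch of the triton branch in get_optional_params_embeddings
non_default_params = {"encoding_format": "float", "user": "example-user"}  # hypothetical extras
kwargs = {}

for k in list(non_default_params.keys()):
    non_default_params.pop(k, None)  # nothing extra is forwarded to Triton

final_params = {**non_default_params, **kwargs}
assert final_params == {}
```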