[Bug fix]: Triton /infer handler incompatible with batch responses (#7337)

* migrate triton to base llm http handler

* clean up triton handler.py

* use transform functions for triton

* add TritonConfig

* get openai params for triton

* use triton embedding config

* test_completion_triton_generate_api

* test_completion_triton_infer_api

* fix TritonConfig doc string

* use TritonResponseIterator

* fix triton embeddings

* docs triton chat usage (see the usage sketch below)
Ishaan Jaff 2024-12-20 20:59:40 -08:00 committed by GitHub
parent e6bdec4eed
commit 1b2ed0c344
11 changed files with 814 additions and 450 deletions
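
A minimal sketch of the usage described by the "docs triton chat usage" and triton embedding items above, assuming a Triton Inference Server reachable over HTTP. The model names and api_base URLs are illustrative placeholders, not values taken from this PR.

# Illustrative sketch only: model names and api_base URLs are placeholders.
import litellm

# Chat/completion against a Triton-hosted model via its generate route
response = litellm.completion(
    model="triton/llama-3-8b",
    messages=[{"role": "user", "content": "good morning from litellm"}],
    api_base="http://localhost:8000/v2/models/llama-3-8b/generate",
)
print(response.choices[0].message.content)

# Embeddings against a Triton /infer route; batched input (a list of strings)
# exercises the batch-response handling this PR fixes
embedding_response = litellm.embedding(
    model="triton/my-embedding-model",
    input=["good morning from litellm"],
    api_base="http://localhost:8000/v2/models/my-embedding-model/infer",
)
print(embedding_response.data[0]["embedding"])
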


@@ -888,23 +888,6 @@ def test_voyage_embeddings():
         pytest.fail(f"Error occurred: {e}")
 
 
-@pytest.mark.asyncio
-async def test_triton_embeddings():
-    try:
-        litellm.set_verbose = True
-        response = await litellm.aembedding(
-            model="triton/my-triton-model",
-            api_base="https://exampleopenaiendpoint-production.up.railway.app/triton/embeddings",
-            input=["good morning from litellm"],
-        )
-        print(f"response: {response}")
-
-        # stubbed endpoint is setup to return this
-        assert response.data[0]["embedding"] == [0.1, 0.2]
-    except Exception as e:
-        pytest.fail(f"Error occurred: {e}")
-
-
 @pytest.mark.parametrize("sync_mode", [True, False])
 @pytest.mark.parametrize(
     "input", ["good morning from litellm", ["good morning from litellm"]] #