[Bug fix]: Triton /infer handler incompatible with batch responses (#7337)

* migrate triton to base llm http handler

* clean up triton handler.py

* use transform functions for triton

* add TritonConfig

* get openai params for triton

* use triton embedding config

* test_completion_triton_generate_api

* test_completion_triton_infer_api

* fix TritonConfig doc string

* use TritonResponseIterator

* fix triton embeddings

* docs triton chat usage (see the usage sketch below)
Ishaan Jaff 2024-12-20 20:59:40 -08:00 committed by GitHub
parent e6bdec4eed
commit 1b2ed0c344
11 changed files with 814 additions and 450 deletions
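
A minimal sketch of the usage described by the "docs triton chat usage" and triton embedding items above, assuming a Triton Inference Server reachable over HTTP. The model names and api_base URLs are illustrative placeholders, not values taken from this PR.

# Illustrative sketch only: model names and api_base URLs are placeholders.
import litellm

# Chat/completion against a Triton-hosted model via its generate route
response = litellm.completion(
    model="triton/llama-3-8b",
    messages=[{"role": "user", "content": "good morning from litellm"}],
    api_base="http://localhost:8000/v2/models/llama-3-8b/generate",
)
print(response.choices[0].message.content)

# Embeddings against a Triton /infer route; batched input (a list of strings)
# exercises the batch-response handling this PR fixes
embedding_response = litellm.embedding(
    model="triton/my-embedding-model",
    input=["good morning from litellm"],
    api_base="http://localhost:8000/v2/models/my-embedding-model/infer",
)
print(embedding_response.data[0]["embedding"])
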


@@ -888,23 +888,6 @@ def test_voyage_embeddings():
         pytest.fail(f"Error occurred: {e}")
 
 
-@pytest.mark.asyncio
-async def test_triton_embeddings():
-    try:
-        litellm.set_verbose = True
-        response = await litellm.aembedding(
-            model="triton/my-triton-model",
-            api_base="https://exampleopenaiendpoint-production.up.railway.app/triton/embeddings",
-            input=["good morning from litellm"],
-        )
-        print(f"response: {response}")
-
-        # stubbed endpoint is setup to return this
-        assert response.data[0]["embedding"] == [0.1, 0.2]
-    except Exception as e:
-        pytest.fail(f"Error occurred: {e}")
-
-
 @pytest.mark.parametrize("sync_mode", [True, False])
 @pytest.mark.parametrize(
     "input", ["good morning from litellm", ["good morning from litellm"]] #