forked from phoenix/litellm-mirror
(feat) add nvidia nim embeddings (#6032)
* nvidia nim support embedding config
* add nvidia config in init
* nvidia nim embeddings
* docs nvidia nim embeddings
* docs embeddings on nvidia nim
* fix llm translation test
This commit is contained in:
parent 05df9cc6d0
commit d92696a303
8 changed files with 238 additions and 9 deletions
@@ -1,3 +1,6 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';

# Nvidia NIM

https://docs.api.nvidia.com/nim/reference/

@@ -65,6 +68,96 @@ for chunk in response:
```

## Usage - embedding

```python
import litellm
import os

response = litellm.embedding(
    model="nvidia_nim/nvidia/nv-embedqa-e5-v5",  # add `nvidia_nim/` prefix to model so litellm knows to route to Nvidia NIM
    input=["good morning from litellm"],
    encoding_format="float",
    user_id="user-1234",
    # Nvidia NIM Specific Parameters
    input_type="passage",  # Optional
    truncate="NONE",  # Optional
)
print(response)
```

## **Usage - LiteLLM Proxy Server**

Here's how to call an Nvidia NIM endpoint with the LiteLLM Proxy Server

1. Modify the config.yaml

```yaml
model_list:
  - model_name: my-model
    litellm_params:
      model: nvidia_nim/<your-model-name>  # add nvidia_nim/ prefix to route as Nvidia NIM provider
      api_key: api-key  # api key for your model
```

2. Start the proxy

```bash
$ litellm --config /path/to/config.yaml
```

3. Send Request to LiteLLM Proxy Server

<Tabs>

<TabItem value="openai" label="OpenAI Python v1.0.0+">

```python
import openai

client = openai.OpenAI(
    api_key="sk-1234",             # pass litellm proxy key, if you're using virtual keys
    base_url="http://0.0.0.0:4000" # litellm-proxy-base url
)

response = client.chat.completions.create(
    model="my-model",
    messages=[
        {
            "role": "user",
            "content": "what llm are you"
        }
    ],
)

print(response)
```
</TabItem>

<TabItem value="curl" label="curl">

```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
    "model": "my-model",
    "messages": [
        {
            "role": "user",
            "content": "what llm are you"
        }
    ]
}'
```
</TabItem>

</Tabs>
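
Embedding requests work the same way through the proxy. A minimal sketch with the OpenAI client, assuming the `my-model` entry in config.yaml points at an `nvidia_nim/` embedding model:

```python
import openai

client = openai.OpenAI(
    api_key="sk-1234",             # litellm proxy key, if you're using virtual keys
    base_url="http://0.0.0.0:4000" # litellm-proxy-base url
)

# hits the proxy's OpenAI-compatible /embeddings route
response = client.embeddings.create(
    model="my-model",
    input=["good morning from litellm"],
    encoding_format="float",
)

print(response.data[0].embedding[:5])
```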

## Supported Models - 💥 ALL Nvidia NIM Models Supported!

We support ALL `nvidia_nim` models; just set `nvidia_nim/` as the prefix when sending completion requests. For illustration, see the sketch below.
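
A minimal completion sketch (the env var follows litellm's Nvidia NIM provider docs, and the model id below is only an example):

```python
import os
import litellm

os.environ["NVIDIA_NIM_API_KEY"] = "nvidia-api-key"  # assumed env var per litellm's NIM provider docs

response = litellm.completion(
    model="nvidia_nim/meta/llama3-70b-instruct",  # example model id; any NIM model works with the prefix
    messages=[{"role": "user", "content": "hello from litellm"}],
)
print(response.choices[0].message.content)
```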
@@ -981,7 +981,13 @@ from .llms.OpenAI.chat.o1_transformation import (
 from .llms.OpenAI.chat.gpt_transformation import (
     OpenAIGPTConfig,
 )
-from .llms.nvidia_nim import NvidiaNimConfig
+from .llms.nvidia_nim.chat import NvidiaNimConfig
+from .llms.nvidia_nim.embed import NvidiaNimEmbeddingConfig
+
+nvidiaNimConfig = NvidiaNimConfig()
+nvidiaNimEmbeddingConfig = NvidiaNimEmbeddingConfig()
+
 from .llms.cerebras.chat import CerebrasConfig
 from .llms.sambanova.chat import SambanovaConfig
 from .llms.AI21.chat import AI21ChatConfig

litellm/llms/nvidia_nim/embed.py (new file, 80 lines)

@@ -0,0 +1,80 @@
"""
Nvidia NIM embeddings endpoint: https://docs.api.nvidia.com/nim/reference/nvidia-nv-embedqa-e5-v5-infer

This is OpenAI compatible.

This file only contains param mapping logic.

API calling is done using the OpenAI SDK with an api_base.
"""

import types
from typing import Optional, Union


class NvidiaNimEmbeddingConfig:
    """
    Reference: https://docs.api.nvidia.com/nim/reference/nvidia-nv-embedqa-e5-v5-infer
    """

    # OpenAI params
    encoding_format: Optional[str] = None
    user: Optional[str] = None

    # Nvidia NIM params
    input_type: Optional[str] = None
    truncate: Optional[str] = None

    def __init__(
        self,
        encoding_format: Optional[str] = None,
        user: Optional[str] = None,
        input_type: Optional[str] = None,
        truncate: Optional[str] = None,
    ) -> None:
        locals_ = locals()
        for key, value in locals_.items():
            if key != "self" and value is not None:
                setattr(self.__class__, key, value)

    @classmethod
    def get_config(cls):
        return {
            k: v
            for k, v in cls.__dict__.items()
            if not k.startswith("__")
            and not isinstance(
                v,
                (
                    types.FunctionType,
                    types.BuiltinFunctionType,
                    classmethod,
                    staticmethod,
                ),
            )
            and v is not None
        }

    def get_supported_openai_params(
        self,
    ):
        return ["encoding_format", "user"]

    def map_openai_params(
        self,
        non_default_params: dict,
        optional_params: dict,
        kwargs: Optional[dict] = None,
    ):
        if "extra_body" not in optional_params:
            optional_params["extra_body"] = {}
        for k, v in non_default_params.items():
            if k == "input_type":
                optional_params["extra_body"].update({"input_type": v})
            elif k == "truncate":
                optional_params["extra_body"].update({"truncate": v})

        if kwargs is not None:
            # pass kwargs in extra_body
            optional_params["extra_body"].update(kwargs)
        return optional_params
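
A quick sketch of how this mapping behaves; the param values below are made up for illustration:

```python
from litellm.llms.nvidia_nim.embed import NvidiaNimEmbeddingConfig

config = NvidiaNimEmbeddingConfig()

# NIM-specific params are routed into extra_body; extra kwargs would be merged in too
mapped = config.map_openai_params(
    non_default_params={"input_type": "passage", "truncate": "NONE"},
    optional_params={},
    kwargs=None,
)
print(mapped)  # {'extra_body': {'input_type': 'passage', 'truncate': 'NONE'}}
```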

@@ -3316,6 +3316,7 @@ def embedding(
     input=[],
     # Optional params
     dimensions: Optional[int] = None,
+    encoding_format: Optional[str] = None,
     timeout=600,  # default to 10 minutes
     # set api_base, api_version, api_key
     api_base: Optional[str] = None,
@@ -3336,6 +3337,7 @@ def embedding(
     Parameters:
     - model: The embedding model to use.
     - input: The input for which embeddings are to be generated.
+    - encoding_format: Optional[str] The format to return the embeddings in. Can be either `float` or `base64`
     - dimensions: The number of dimensions the resulting output embeddings should have. Only supported in text-embedding-3 and later models.
     - timeout: The timeout value for the API call, default 10 mins
     - litellm_call_id: The call ID for litellm logging.
@@ -3362,7 +3364,6 @@ def embedding(
     max_parallel_requests = kwargs.pop("max_parallel_requests", None)
     model_info = kwargs.get("model_info", None)
     metadata = kwargs.get("metadata", None)
-    encoding_format = kwargs.get("encoding_format", None)
     proxy_server_request = kwargs.get("proxy_server_request", None)
     aembedding = kwargs.get("aembedding", None)
     extra_headers = kwargs.get("extra_headers", None)
@@ -3556,6 +3557,7 @@ def embedding(
         model in litellm.open_ai_embedding_models
         or custom_llm_provider == "openai"
         or custom_llm_provider == "together_ai"
+        or custom_llm_provider == "nvidia_nim"
     ):
         api_base = (
             api_base
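
Since `nvidia_nim` now takes the OpenAI-compatible embedding path, the async variant should behave the same; a minimal sketch, assuming `NVIDIA_NIM_API_KEY` is set in the environment:

```python
import asyncio
import litellm

async def main():
    # same routing as the sync call, via litellm's async embedding entry point
    response = await litellm.aembedding(
        model="nvidia_nim/nvidia/nv-embedqa-e5-v5",
        input=["good morning from litellm"],
    )
    print(response.usage)

asyncio.run(main())
```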

@@ -2552,9 +2552,9 @@ def get_optional_params_image_gen(
 def get_optional_params_embeddings(
     # 2 optional params
     model: str,
-    user=None,
-    encoding_format=None,
-    dimensions=None,
+    user: Optional[str] = None,
+    encoding_format: Optional[str] = None,
+    dimensions: Optional[int] = None,
     custom_llm_provider="",
     drop_params: Optional[bool] = None,
     additional_drop_params: Optional[bool] = None,
@@ -2595,7 +2595,6 @@ def get_optional_params_embeddings(
        default_params=default_params,
        additional_drop_params=additional_drop_params,
    )

    ## raise exception if non-default value passed for non-openai/azure embedding calls
    if custom_llm_provider == "openai":
        # 'dimensions` is only supported in `text-embedding-3` and later models
@@ -2627,6 +2626,17 @@ def get_optional_params_embeddings(
         )
         final_params = {**optional_params, **kwargs}
         return final_params
+    elif custom_llm_provider == "nvidia_nim":
+        supported_params = get_supported_openai_params(
+            model=model or "",
+            custom_llm_provider="nvidia_nim",
+            request_type="embeddings",
+        )
+        _check_valid_arg(supported_params=supported_params)
+        optional_params = litellm.nvidiaNimEmbeddingConfig.map_openai_params(
+            non_default_params=non_default_params, optional_params={}, kwargs=kwargs
+        )
+        return optional_params
     elif custom_llm_provider == "vertex_ai":
         supported_params = get_supported_openai_params(
             model=model,
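
Downstream, the OpenAI SDK merges `extra_body` into the JSON payload. Conceptually, the resulting request is roughly this sketch (a simplification; the real call goes through litellm's OpenAI handler, and the base_url matches the endpoint mocked in the test below):

```python
from openai import OpenAI

client = OpenAI(
    base_url="https://integrate.api.nvidia.com/v1",  # endpoint used in the mocked test below
    api_key="nvidia-api-key",                        # placeholder
)

client.embeddings.create(
    model="nvidia/nv-embedqa-e5-v5",
    input=["good morning from litellm"],
    extra_body={"input_type": "passage", "truncate": "NONE"},  # merged into the request body by the SDK
)
```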
@@ -4308,7 +4318,10 @@ def get_supported_openai_params(
         else:
             return litellm.FireworksAIConfig().get_supported_openai_params()
     elif custom_llm_provider == "nvidia_nim":
-        return litellm.NvidiaNimConfig().get_supported_openai_params(model=model)
+        if request_type == "chat_completion":
+            return litellm.nvidiaNimConfig.get_supported_openai_params(model=model)
+        elif request_type == "embeddings":
+            return litellm.nvidiaNimEmbeddingConfig.get_supported_openai_params()
     elif custom_llm_provider == "cerebras":
         return litellm.CerebrasConfig().get_supported_openai_params(model=model)
     elif custom_llm_provider == "ai21_chat":
@@ -165,7 +165,7 @@ def test_all_model_configs():
         "max_new_tokens": 10
     }

-    from litellm.llms.nvidia_nim import NvidiaNimConfig
+    from litellm.llms.nvidia_nim.chat import NvidiaNimConfig

     assert "max_completion_tokens" in NvidiaNimConfig().get_supported_openai_params(
         model="llama3"

@@ -14,7 +14,7 @@ import pytest
 from respx import MockRouter

 import litellm
-from litellm import Choices, Message, ModelResponse
+from litellm import Choices, Message, ModelResponse, EmbeddingResponse, Usage
 from litellm import completion

@@ -69,3 +69,38 @@ def test_completion_nvidia_nim(respx_mock: MockRouter):
        pass
    except Exception as e:
        pytest.fail(f"Error occurred: {e}")


def test_embedding_nvidia_nim(respx_mock: MockRouter):
    litellm.set_verbose = True
    mock_response = EmbeddingResponse(
        model="nvidia_nim/databricks/dbrx-instruct",
        data=[
            {
                "embedding": [0.1, 0.2, 0.3],
                "index": 0,
            }
        ],
        usage=Usage(
            prompt_tokens=10,
            completion_tokens=0,
            total_tokens=10,
        ),
    )
    mock_request = respx_mock.post(
        "https://integrate.api.nvidia.com/v1/embeddings"
    ).mock(return_value=httpx.Response(200, json=mock_response.dict()))
    response = litellm.embedding(
        model="nvidia_nim/nvidia/nv-embedqa-e5-v5",
        input="What is the meaning of life?",
        input_type="passage",
    )
    assert mock_request.called
    request_body = json.loads(mock_request.calls[0].request.content)
    print("request_body: ", request_body)
    assert request_body == {
        "input": "What is the meaning of life?",
        "model": "nvidia/nv-embedqa-e5-v5",
        "input_type": "passage",
        "encoding_format": "base64",
    }
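
To run just the new test locally, something like this should work (the test file's path isn't shown in the diff, so it is omitted here):

```python
import pytest

# select only the new embedding test by name
pytest.main(["-q", "-k", "test_embedding_nvidia_nim"])
```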