diff --git a/docs/my-website/docs/providers/vllm.md b/docs/my-website/docs/providers/vllm.md
index 5388a0bb7d..9cc0ad487e 100644
--- a/docs/my-website/docs/providers/vllm.md
+++ b/docs/my-website/docs/providers/vllm.md
@@ -5,9 +5,17 @@ import TabItem from '@theme/TabItem';
 
 LiteLLM supports all models on VLLM.
 
+| Property | Details |
+|-------|-------|
+| Description | vLLM is a fast and easy-to-use library for LLM inference and serving. [Docs](https://docs.vllm.ai/en/latest/index.html) |
+| Provider Route on LiteLLM | `hosted_vllm/` (for OpenAI-compatible server), `vllm/` (for vLLM SDK usage) |
+| Provider Doc | [vLLM ↗](https://docs.vllm.ai/en/latest/index.html) |
+| Supported Endpoints | `/chat/completions`, `/embeddings`, `/completions` |
+
+
 # Quick Start
 
-## Usage - litellm.completion (calling vLLM endpoint)
+## Usage - litellm.completion (calling OpenAI-compatible endpoint)
 
 vLLM provides an OpenAI-compatible endpoint - here's how to call it with LiteLLM.
 In order to use litellm to call a hosted vLLM server, add the following to your completion call
@@ -29,7 +37,7 @@ print(response)
 ```
 
 
-## Usage - LiteLLM Proxy Server (calling vLLM endpoint)
+## Usage - LiteLLM Proxy Server (calling OpenAI-compatible endpoint)
 
 Here's how to call an OpenAI-Compatible Endpoint with the LiteLLM Proxy Server
 
@@ -97,7 +105,59 @@ Here's how to call an OpenAI-Compatible Endpoint with the LiteLLM Proxy Server
 
 
 
-## Extras - for `vllm pip package`
+## Embeddings
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+```python
+from litellm import embedding
+import os
+
+os.environ["HOSTED_VLLM_API_BASE"] = "http://localhost:8000"
+
+
+response = embedding(model="hosted_vllm/facebook/opt-125m", input=["Hello world"])
+
+print(response)
+```
+
+</TabItem>
+<TabItem value="proxy" label="PROXY">
+
+1. Setup config.yaml
+
+```yaml
+model_list:
+  - model_name: my-model
+    litellm_params:
+      model: hosted_vllm/facebook/opt-125m # add hosted_vllm/ prefix to route as OpenAI-compatible provider
+      api_base: https://hosted-vllm-api.co # add api base for OpenAI-compatible provider
+```
+
+2. Start the proxy
+
+```bash
+$ litellm --config /path/to/config.yaml
+
+# RUNNING on http://0.0.0.0:4000
+```
+
+3. Test it!
+
+```bash
+curl -L -X POST 'http://0.0.0.0:4000/embeddings' \
+-H 'Authorization: Bearer sk-1234' \
+-H 'Content-Type: application/json' \
+-d '{"input": ["hello world"], "model": "my-model"}'
+```
+
+[See OpenAI SDK/Langchain/etc. examples](../proxy/user_keys.md#embeddings)
+
+</TabItem>
+</Tabs>
+
+## (Deprecated) for `vllm pip package`
 ### Using - `litellm.completion`
 
 ```
diff --git a/litellm/__init__.py b/litellm/__init__.py
index 576239b946..7ec48da073 100644
--- a/litellm/__init__.py
+++ b/litellm/__init__.py
@@ -470,6 +470,7 @@ azure_models: List = []
 anyscale_models: List = []
 cerebras_models: List = []
 galadriel_models: List = []
+sambanova_models: List = []
 
 
 def add_known_models():
@@ -578,6 +579,8 @@ def add_known_models():
             cerebras_models.append(key)
         elif value.get("litellm_provider") == "galadriel":
             galadriel_models.append(key)
+        elif value.get("litellm_provider") == "sambanova":
+            sambanova_models.append(key)
 
 
 add_known_models()
@@ -841,6 +844,7 @@ model_list = (
     + anyscale_models
     + cerebras_models
     + galadriel_models
+    + sambanova_models
 )
 
 
@@ -891,6 +895,7 @@ models_by_provider: dict = {
     "anyscale": anyscale_models,
     "cerebras": cerebras_models,
     "galadriel": galadriel_models,
+    "sambanova": sambanova_models,
 }
 
 # mapping for those models which have larger equivalents
diff --git a/litellm/llms/hosted_vllm/chat/transformation.py b/litellm/llms/hosted_vllm/chat/transformation.py
index a48a845a25..37425929ed 100644
--- a/litellm/llms/hosted_vllm/chat/transformation.py
+++ b/litellm/llms/hosted_vllm/chat/transformation.py
@@ -40,6 +40,6 @@ class HostedVLLMChatConfig(OpenAIGPTConfig):
     ) -> Tuple[Optional[str], Optional[str]]:
         api_base = api_base or get_secret_str("HOSTED_VLLM_API_BASE")  # type: ignore
         dynamic_api_key = (
-            api_key or get_secret_str("HOSTED_VLLM_API_KEY") or ""
+            api_key or get_secret_str("HOSTED_VLLM_API_KEY") or "fake-api-key"
         )  # vllm does not require an api key
         return api_base, dynamic_api_key
diff --git a/litellm/llms/hosted_vllm/embedding/README.md b/litellm/llms/hosted_vllm/embedding/README.md
new file mode 100644
index 0000000000..f82b3c77a6
--- /dev/null
+++ b/litellm/llms/hosted_vllm/embedding/README.md
@@ -0,0 +1,5 @@
+No transformation is required for hosted_vllm embedding.
+
+vLLM's `embedding` endpoint is a superset of OpenAI's `embedding` endpoint.
+
+To pass provider-specific parameters, see [this](https://docs.litellm.ai/docs/completion/provider_specific_params)
\ No newline at end of file
diff --git a/litellm/main.py b/litellm/main.py
index ba7e1303ac..0b3288accd 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -3362,7 +3362,11 @@ def embedding(  # noqa: PLR0915
             client=client,
             aembedding=aembedding,
         )
-    elif custom_llm_provider == "openai_like" or custom_llm_provider == "jina_ai":
+    elif (
+        custom_llm_provider == "openai_like"
+        or custom_llm_provider == "jina_ai"
+        or custom_llm_provider == "hosted_vllm"
+    ):
         api_base = (
             api_base or litellm.api_base or get_secret_str("OPENAI_LIKE_API_BASE")
         )
diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json
index a607bfbc6f..10b1b2782b 100644
--- a/litellm/model_prices_and_context_window_backup.json
+++ b/litellm/model_prices_and_context_window_backup.json
@@ -15,6 +15,76 @@
         "supports_prompt_caching": true,
         "supports_response_schema": true
     },
+    "sambanova/Meta-Llama-3.1-8B-Instruct": {
+        "max_tokens": 16000,
+        "max_input_tokens": 16000,
+        "max_output_tokens": 16000,
+        "input_cost_per_token": 0.0000001,
+        "output_cost_per_token": 0.0000002,
+        "litellm_provider": "sambanova",
+        "supports_function_calling": true,
+        "mode": "chat"
+    },
+    "sambanova/Meta-Llama-3.1-70B-Instruct": {
+        "max_tokens": 128000,
+        "max_input_tokens": 128000,
+        "max_output_tokens": 128000,
+        "input_cost_per_token": 0.0000006,
+        "output_cost_per_token": 0.0000012,
+        "litellm_provider": "sambanova",
+        "supports_function_calling": true,
+        "mode": "chat"
+    },
+    "sambanova/Meta-Llama-3.1-405B-Instruct": {
+        "max_tokens": 16000,
+        "max_input_tokens": 16000,
+        "max_output_tokens": 16000,
+        "input_cost_per_token": 0.000005,
+        "output_cost_per_token": 0.000010,
+        "litellm_provider": "sambanova",
+        "supports_function_calling": true,
+        "mode": "chat"
+    },
+    "sambanova/Meta-Llama-3.2-1B-Instruct": {
+        "max_tokens": 16000,
+        "max_input_tokens": 16000,
+        "max_output_tokens": 16000,
+        "input_cost_per_token": 0.0000004,
+        "output_cost_per_token": 0.0000008,
+        "litellm_provider": "sambanova",
+        "supports_function_calling": true,
+        "mode": "chat"
+    },
+    "sambanova/Meta-Llama-3.2-3B-Instruct": {
+        "max_tokens": 4000,
+        "max_input_tokens": 4000,
+        "max_output_tokens": 4000,
+        "input_cost_per_token": 0.0000008,
+        "output_cost_per_token": 0.0000016,
+        "litellm_provider": "sambanova",
+        "supports_function_calling": true,
+        "mode": "chat"
+    },
+    "sambanova/Qwen2.5-Coder-32B-Instruct": {
+        "max_tokens": 8000,
+        "max_input_tokens": 8000,
+        "max_output_tokens": 8000,
+        "input_cost_per_token": 0.0000015,
+        "output_cost_per_token": 0.000003,
+        "litellm_provider": "sambanova",
+        "supports_function_calling": true,
+        "mode": "chat"
+    },
+    "sambanova/Qwen2.5-72B-Instruct": {
+        "max_tokens": 8000,
+        "max_input_tokens": 8000,
+        "max_output_tokens": 8000,
+        "input_cost_per_token": 0.000002,
+        "output_cost_per_token": 0.000004,
+        "litellm_provider": "sambanova",
+        "supports_function_calling": true,
+        "mode": "chat"
+    },
     "gpt-4": {
         "max_tokens": 4096,
         "max_input_tokens": 8192,
diff --git a/tests/llm_translation/base_llm_unit_tests.py b/tests/llm_translation/base_llm_unit_tests.py
index ab42bc5fe9..968655f433 100644
--- a/tests/llm_translation/base_llm_unit_tests.py
+++ b/tests/llm_translation/base_llm_unit_tests.py
@@ -219,6 +219,7 @@ class BaseLLMChatTest(ABC):
                     },
                 ],
                 response_format=TestModel,
+                timeout=5,
             )
 
             assert res is not None
@@ -226,6 +227,8 @@ class BaseLLMChatTest(ABC):
 
             assert res.choices[0].message.content is not None
             assert res.choices[0].message.tool_calls is None
+        except litellm.Timeout:
+            pytest.skip("Model took too long to respond")
         except litellm.InternalServerError:
             pytest.skip("Model is overloaded")
 
diff --git a/tests/local_testing/test_embedding.py b/tests/local_testing/test_embedding.py
index 5930e16d11..8f1490314c 100644
--- a/tests/local_testing/test_embedding.py
+++ b/tests/local_testing/test_embedding.py
@@ -1004,6 +1004,28 @@ async def test_hf_embedddings_with_optional_params(sync_mode):
         assert json_data["parameters"]["top_k"] == 10
 
 
+def test_hosted_vllm_embedding(monkeypatch):
+    monkeypatch.setenv("HOSTED_VLLM_API_BASE", "http://localhost:8000")
+    from litellm.llms.custom_httpx.http_handler import HTTPHandler
+
+    client = HTTPHandler()
+    with patch.object(client, "post") as mock_post:
+        try:
+            embedding(
+                model="hosted_vllm/jina-embeddings-v3",
+                input=["Hello world"],
+                client=client,
+            )
+        except Exception as e:
+            print(e)
+
+        mock_post.assert_called_once()
+
+        json_data = json.loads(mock_post.call_args.kwargs["data"])
+        assert json_data["input"] == ["Hello world"]
+        assert json_data["model"] == "jina-embeddings-v3"
+
+
 @pytest.mark.parametrize(
     "model",
     [
diff --git a/tests/local_testing/test_get_llm_provider.py b/tests/local_testing/test_get_llm_provider.py
index 423ffe2fd7..4a79d7b1c0 100644
--- a/tests/local_testing/test_get_llm_provider.py
+++ b/tests/local_testing/test_get_llm_provider.py
@@ -153,6 +153,20 @@ def test_default_api_base():
     assert other_provider.value not in api_base.replace("/openai", "")
 
 
+def test_hosted_vllm_default_api_key():
+    from litellm.litellm_core_utils.get_llm_provider_logic import (
+        _get_openai_compatible_provider_info,
+    )
+
+    _, _, dynamic_api_key, _ = _get_openai_compatible_provider_info(
+        model="hosted_vllm/llama-3.1-70b-instruct",
+        api_base=None,
+        api_key=None,
+        dynamic_api_key=None,
+    )
+    assert dynamic_api_key == "fake-api-key"
+
+
 def test_get_llm_provider_jina_ai():
     model, custom_llm_provider, dynamic_api_key, api_base = litellm.get_llm_provider(
         model="jina_ai/jina-embeddings-v3",
@@ -168,7 +182,7 @@ def test_get_llm_provider_hosted_vllm():
     )
     assert custom_llm_provider == "hosted_vllm"
     assert model == "llama-3.1-70b-instruct"
-    assert dynamic_api_key == ""
+    assert dynamic_api_key == "fake-api-key"
 
 
 def test_get_llm_provider_watson_text():
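
A minimal usage sketch for the changes above, assuming a vLLM OpenAI-compatible server is reachable at `http://localhost:8000` and serves `facebook/opt-125m` (neither is guaranteed by this diff):

```python
# Sketch only: exercises the hosted_vllm embedding route and default-key behavior.
import os
import litellm

os.environ["HOSTED_VLLM_API_BASE"] = "http://localhost:8000"
# HOSTED_VLLM_API_KEY is intentionally left unset: with the transformation.py
# change above, the dynamic api key falls back to the "fake-api-key" placeholder.

response = litellm.embedding(
    model="hosted_vllm/facebook/opt-125m",  # hosted_vllm/ prefix routes through the openai_like embedding branch in main.py
    input=["Hello world"],
)
print(response)
```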
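
A minimal cost-lookup sketch, assuming `litellm.cost_per_token` reads the `sambanova/*` entries added to the model map above (the token counts are illustrative only):

```python
# Sketch only: prices a sambanova model from the newly added map entries.
import litellm

prompt_cost, completion_cost = litellm.cost_per_token(
    model="sambanova/Meta-Llama-3.1-8B-Instruct",
    prompt_tokens=1000,
    completion_tokens=1000,
)

# With input_cost_per_token=0.0000001 and output_cost_per_token=0.0000002,
# 1000 tokens each way works out to roughly $0.0001 prompt + $0.0002 completion.
print(prompt_cost, completion_cost)
```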