From d5cd0eea14a3e061bc9a6e48bd606190ebaf907b Mon Sep 17 00:00:00 2001
From: Charlie Doern
Date: Wed, 19 Nov 2025 11:44:28 -0500
Subject: [PATCH] feat!: standardize base_url for inference (#4177)

# What does this PR do?

Completes #3732 by removing runtime URL transformations and requiring
users to provide full URLs in configuration. All providers now use
'base_url' consistently and respect the exact URL provided, without
appending paths like /v1 or /openai/v1 at runtime.

BREAKING CHANGE: Users must update configs to include full URL paths
(e.g., http://localhost:11434/v1 instead of http://localhost:11434).

Closes #3732

## Test Plan

Existing tests should pass even with the URL changes, because the
default URLs have been updated to include the full paths. Adds a unit
test enforcing URL standardization across remote inference providers
(it verifies that all of them use a 'base_url' field typed
HttpUrl | None).
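The migration is mechanical: rename the key and append the path the
provider used to add for you. A minimal sketch for an Ollama config
(illustrative values; the same rename applies to every provider below):

```yaml
# before: the stack appended /v1 at runtime
inference:
- provider_id: ollama
  provider_type: remote::ollama
  config:
    url: http://localhost:11434

# after: the URL is used exactly as written
inference:
- provider_id: ollama
  provider_type: remote::ollama
  config:
    base_url: http://localhost:11434/v1
```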
Signed-off-by: Charlie Doern
---
 .../docs/providers/inference/remote_azure.mdx |  4 +-
 .../providers/inference/remote_cerebras.mdx   |  4 +-
 .../providers/inference/remote_databricks.mdx |  4 +-
 .../providers/inference/remote_fireworks.mdx  |  4 +-
 docs/docs/providers/inference/remote_groq.mdx |  4 +-
 .../inference/remote_llama-openai-compat.mdx  |  4 +-
 .../providers/inference/remote_nvidia.mdx     |  6 +-
 .../providers/inference/remote_ollama.mdx     |  4 +-
 .../providers/inference/remote_openai.mdx     |  2 +-
 .../inference/remote_passthrough.mdx          |  4 +-
 .../providers/inference/remote_runpod.mdx     |  4 +-
 .../providers/inference/remote_sambanova.mdx  |  4 +-
 docs/docs/providers/inference/remote_tgi.mdx  |  4 +-
 .../providers/inference/remote_together.mdx   |  4 +-
 docs/docs/providers/inference/remote_vllm.mdx |  4 +-
 .../providers/inference/remote_watsonx.mdx    |  4 +-
 scripts/docker.sh                             |  4 +-
 scripts/install.sh                            |  2 +-
 .../ci-tests/run-with-postgres-store.yaml     | 21 ++++---
 .../distributions/ci-tests/run.yaml           | 21 ++++---
 .../distributions/nvidia/run-with-safety.yaml |  3 +-
 src/llama_stack/distributions/nvidia/run.yaml |  3 +-
 .../distributions/open-benchmark/run.yaml     |  4 +-
 .../distributions/postgres-demo/run.yaml      |  2 +-
 .../starter-gpu/run-with-postgres-store.yaml  | 21 ++++---
 .../distributions/starter-gpu/run.yaml        | 21 ++++---
 .../starter/run-with-postgres-store.yaml      | 21 ++++---
 .../distributions/starter/run.yaml            | 21 ++++---
 .../distributions/watsonx/run.yaml            |  2 +-
 .../providers/remote/inference/azure/azure.py |  4 +-
 .../remote/inference/azure/config.py          |  9 +--
 .../remote/inference/cerebras/cerebras.py     |  4 +-
 .../remote/inference/cerebras/config.py       |  8 +--
 .../remote/inference/databricks/config.py     | 10 ++--
 .../remote/inference/databricks/databricks.py | 10 +++-
 .../remote/inference/fireworks/config.py      |  8 +--
 .../remote/inference/fireworks/fireworks.py   |  2 +-
 .../providers/remote/inference/groq/config.py |  8 +--
 .../providers/remote/inference/groq/groq.py   |  2 +-
 .../inference/llama_openai_compat/config.py   |  8 +--
 .../inference/llama_openai_compat/llama.py    |  2 +-
 .../remote/inference/nvidia/config.py         | 16 ++----
 .../remote/inference/nvidia/nvidia.py         |  4 +-
 .../remote/inference/nvidia/utils.py          |  2 +-
 .../remote/inference/ollama/config.py         | 12 ++--
 .../remote/inference/ollama/ollama.py         | 12 +++-
 .../remote/inference/openai/config.py         |  6 +-
 .../remote/inference/openai/openai.py         |  2 +-
 .../remote/inference/passthrough/config.py    |  8 +--
 .../inference/passthrough/passthrough.py      |  4 +-
 .../remote/inference/runpod/config.py         |  6 +-
 .../remote/inference/runpod/runpod.py         |  2 +-
 .../remote/inference/sambanova/config.py      |  8 +--
 .../remote/inference/sambanova/sambanova.py   |  2 +-
 .../providers/remote/inference/tgi/config.py  | 11 ++--
 .../providers/remote/inference/tgi/tgi.py     | 20 ++++---
 .../remote/inference/together/config.py       |  8 +--
 .../remote/inference/together/together.py     |  3 +-
 .../providers/remote/inference/vllm/config.py |  8 +--
 .../providers/remote/inference/vllm/vllm.py   |  6 +-
 .../remote/inference/watsonx/config.py        |  6 +-
 .../remote/inference/watsonx/watsonx.py       |  4 +-
 tests/integration/suites.py                   |  6 +-
 .../test_inference_client_caching.py          |  4 +-
 .../providers/inference/test_remote_vllm.py   |  6 +-
 .../providers/nvidia/test_rerank_inference.py |  2 +-
 tests/unit/providers/test_configs.py          | 56 ++++++++++++++++++-
 67 files changed, 282 insertions(+), 227 deletions(-)

diff --git a/docs/docs/providers/inference/remote_azure.mdx b/docs/docs/providers/inference/remote_azure.mdx
index fd22b157e..0382b42d7 100644
--- a/docs/docs/providers/inference/remote_azure.mdx
+++ b/docs/docs/providers/inference/remote_azure.mdx
@@ -24,7 +24,7 @@ https://learn.microsoft.com/en-us/azure/ai-foundry/openai/overview
 | `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
 | `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
 | `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
-| `api_base` | `HttpUrl` | No | | Azure API base for Azure (e.g., https://your-resource-name.openai.azure.com) |
+| `base_url` | `HttpUrl \| None` | No | | Azure API base for Azure (e.g., https://your-resource-name.openai.azure.com/openai/v1) |
 | `api_version` | `str \| None` | No | | Azure API version for Azure (e.g., 2024-12-01-preview) |
 | `api_type` | `str \| None` | No | azure | Azure API type for Azure (e.g., azure) |
 
@@ -32,7 +32,7 @@ https://learn.microsoft.com/en-us/azure/ai-foundry/openai/overview
 ```yaml
 api_key: ${env.AZURE_API_KEY:=}
-api_base: ${env.AZURE_API_BASE:=}
+base_url: ${env.AZURE_API_BASE:=}
 api_version: ${env.AZURE_API_VERSION:=}
 api_type: ${env.AZURE_API_TYPE:=}
 ```
diff --git a/docs/docs/providers/inference/remote_cerebras.mdx b/docs/docs/providers/inference/remote_cerebras.mdx
index 1fb9530bb..9fd390a29 100644
--- a/docs/docs/providers/inference/remote_cerebras.mdx
+++ b/docs/docs/providers/inference/remote_cerebras.mdx
@@ -17,11 +17,11 @@ Cerebras inference provider for running models on Cerebras Cloud platform.
 | `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
 | `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
 | `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
-| `base_url` | `str` | No | https://api.cerebras.ai | Base URL for the Cerebras API |
+| `base_url` | `HttpUrl \| None` | No | https://api.cerebras.ai/v1 | Base URL for the Cerebras API |
 
 ## Sample Configuration
 
 ```yaml
-base_url: https://api.cerebras.ai
+base_url: https://api.cerebras.ai/v1
 api_key: ${env.CEREBRAS_API_KEY:=}
 ```
diff --git a/docs/docs/providers/inference/remote_databricks.mdx b/docs/docs/providers/inference/remote_databricks.mdx
index 7a926baf4..d50c52958 100644
--- a/docs/docs/providers/inference/remote_databricks.mdx
+++ b/docs/docs/providers/inference/remote_databricks.mdx
@@ -17,11 +17,11 @@ Databricks inference provider for running models on Databricks' unified analytic
 | `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
 | `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
 | `api_token` | `SecretStr \| None` | No | | The Databricks API token |
-| `url` | `str \| None` | No | | The URL for the Databricks model serving endpoint |
+| `base_url` | `HttpUrl \| None` | No | | The URL for the Databricks model serving endpoint (should include /serving-endpoints path) |
 
 ## Sample Configuration
 
 ```yaml
-url: ${env.DATABRICKS_HOST:=}
+base_url: ${env.DATABRICKS_HOST:=}
 api_token: ${env.DATABRICKS_TOKEN:=}
 ```
diff --git a/docs/docs/providers/inference/remote_fireworks.mdx b/docs/docs/providers/inference/remote_fireworks.mdx
index 7db74efc4..a67403a9b 100644
--- a/docs/docs/providers/inference/remote_fireworks.mdx
+++ b/docs/docs/providers/inference/remote_fireworks.mdx
@@ -17,11 +17,11 @@ Fireworks AI inference provider for Llama models and other AI models on the Fire
 | `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
 | `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
 | `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
-| `url` | `str` | No | https://api.fireworks.ai/inference/v1 | The URL for the Fireworks server |
+| `base_url` | `HttpUrl \| None` | No | https://api.fireworks.ai/inference/v1 | The URL for the Fireworks server |
 
 ## Sample Configuration
 
 ```yaml
-url: https://api.fireworks.ai/inference/v1
+base_url: https://api.fireworks.ai/inference/v1
 api_key: ${env.FIREWORKS_API_KEY:=}
 ```
diff --git a/docs/docs/providers/inference/remote_groq.mdx b/docs/docs/providers/inference/remote_groq.mdx
index 3ebd6f907..17acd3140 100644
--- a/docs/docs/providers/inference/remote_groq.mdx
+++ b/docs/docs/providers/inference/remote_groq.mdx
@@ -17,11 +17,11 @@ Groq inference provider for ultra-fast inference using Groq's LPU technology.
 | `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
 | `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
 | `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
-| `url` | `str` | No | https://api.groq.com | The URL for the Groq AI server |
+| `base_url` | `HttpUrl \| None` | No | https://api.groq.com/openai/v1 | The URL for the Groq AI server |
 
 ## Sample Configuration
 
 ```yaml
-url: https://api.groq.com
+base_url: https://api.groq.com/openai/v1
 api_key: ${env.GROQ_API_KEY:=}
 ```
diff --git a/docs/docs/providers/inference/remote_llama-openai-compat.mdx b/docs/docs/providers/inference/remote_llama-openai-compat.mdx
index f67f40909..69e90b2ac 100644
--- a/docs/docs/providers/inference/remote_llama-openai-compat.mdx
+++ b/docs/docs/providers/inference/remote_llama-openai-compat.mdx
@@ -17,11 +17,11 @@ Llama OpenAI-compatible provider for using Llama models with OpenAI API format.
 | `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
 | `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
 | `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
-| `openai_compat_api_base` | `str` | No | https://api.llama.com/compat/v1/ | The URL for the Llama API server |
+| `base_url` | `HttpUrl \| None` | No | https://api.llama.com/compat/v1/ | The URL for the Llama API server |
 
 ## Sample Configuration
 
 ```yaml
-openai_compat_api_base: https://api.llama.com/compat/v1/
+base_url: https://api.llama.com/compat/v1/
 api_key: ${env.LLAMA_API_KEY}
 ```
diff --git a/docs/docs/providers/inference/remote_nvidia.mdx b/docs/docs/providers/inference/remote_nvidia.mdx
index 6646d8b00..a890bc57f 100644
--- a/docs/docs/providers/inference/remote_nvidia.mdx
+++ b/docs/docs/providers/inference/remote_nvidia.mdx
@@ -17,15 +17,13 @@ NVIDIA inference provider for accessing NVIDIA NIM models and AI services.
 | `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
 | `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
 | `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
-| `url` | `str` | No | https://integrate.api.nvidia.com | A base url for accessing the NVIDIA NIM |
+| `base_url` | `HttpUrl \| None` | No | https://integrate.api.nvidia.com/v1 | A base url for accessing the NVIDIA NIM |
 | `timeout` | `int` | No | 60 | Timeout for the HTTP requests |
-| `append_api_version` | `bool` | No | True | When set to false, the API version will not be appended to the base_url. By default, it is true. |
 | `rerank_model_to_url` | `dict[str, str]` | No | `{'nv-rerank-qa-mistral-4b:1': 'https://ai.api.nvidia.com/v1/retrieval/nvidia/reranking', 'nvidia/nv-rerankqa-mistral-4b-v3': 'https://ai.api.nvidia.com/v1/retrieval/nvidia/nv-rerankqa-mistral-4b-v3/reranking', 'nvidia/llama-3.2-nv-rerankqa-1b-v2': 'https://ai.api.nvidia.com/v1/retrieval/nvidia/llama-3_2-nv-rerankqa-1b-v2/reranking'}` | Mapping of rerank model identifiers to their API endpoints. |
 
 ## Sample Configuration
 
 ```yaml
-url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}
+base_url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com/v1}
 api_key: ${env.NVIDIA_API_KEY:=}
-append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
 ```
diff --git a/docs/docs/providers/inference/remote_ollama.mdx b/docs/docs/providers/inference/remote_ollama.mdx
index 497bfed52..f9be84add 100644
--- a/docs/docs/providers/inference/remote_ollama.mdx
+++ b/docs/docs/providers/inference/remote_ollama.mdx
@@ -16,10 +16,10 @@ Ollama inference provider for running local models through the Ollama runtime.
 |-------|------|----------|---------|-------------|
 | `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
 | `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
-| `url` | `str` | No | http://localhost:11434 | |
+| `base_url` | `HttpUrl \| None` | No | http://localhost:11434/v1 | |
 
 ## Sample Configuration
 
 ```yaml
-url: ${env.OLLAMA_URL:=http://localhost:11434}
+base_url: ${env.OLLAMA_URL:=http://localhost:11434/v1}
 ```
diff --git a/docs/docs/providers/inference/remote_openai.mdx b/docs/docs/providers/inference/remote_openai.mdx
index 4931118fd..3ac3a21ad 100644
--- a/docs/docs/providers/inference/remote_openai.mdx
+++ b/docs/docs/providers/inference/remote_openai.mdx
@@ -17,7 +17,7 @@ OpenAI inference provider for accessing GPT models and other OpenAI services.
 | `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
 | `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
 | `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
-| `base_url` | `str` | No | https://api.openai.com/v1 | Base URL for OpenAI API |
+| `base_url` | `HttpUrl \| None` | No | https://api.openai.com/v1 | Base URL for OpenAI API |
 
 ## Sample Configuration
 
diff --git a/docs/docs/providers/inference/remote_passthrough.mdx b/docs/docs/providers/inference/remote_passthrough.mdx
index 009961d49..325ecc352 100644
--- a/docs/docs/providers/inference/remote_passthrough.mdx
+++ b/docs/docs/providers/inference/remote_passthrough.mdx
@@ -17,11 +17,11 @@ Passthrough inference provider for connecting to any external inference service
 | `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
 | `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
 | `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
-| `url` | `str` | No | | The URL for the passthrough endpoint |
+| `base_url` | `HttpUrl \| None` | No | | The URL for the passthrough endpoint |
 
 ## Sample Configuration
 
 ```yaml
-url: ${env.PASSTHROUGH_URL}
+base_url: ${env.PASSTHROUGH_URL}
 api_key: ${env.PASSTHROUGH_API_KEY}
 ```
diff --git a/docs/docs/providers/inference/remote_runpod.mdx b/docs/docs/providers/inference/remote_runpod.mdx
index 3b67e157d..6cdcdd3b5 100644
--- a/docs/docs/providers/inference/remote_runpod.mdx
+++ b/docs/docs/providers/inference/remote_runpod.mdx
@@ -17,11 +17,11 @@ RunPod inference provider for running models on RunPod's cloud GPU platform.
 | `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
 | `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
 | `api_token` | `SecretStr \| None` | No | | The API token |
-| `url` | `str \| None` | No | | The URL for the Runpod model serving endpoint |
+| `base_url` | `HttpUrl \| None` | No | | The URL for the Runpod model serving endpoint |
 
 ## Sample Configuration
 
 ```yaml
-url: ${env.RUNPOD_URL:=}
+base_url: ${env.RUNPOD_URL:=}
 api_token: ${env.RUNPOD_API_TOKEN}
 ```
diff --git a/docs/docs/providers/inference/remote_sambanova.mdx b/docs/docs/providers/inference/remote_sambanova.mdx
index 6f4c5d7f6..bbefdb0f0 100644
--- a/docs/docs/providers/inference/remote_sambanova.mdx
+++ b/docs/docs/providers/inference/remote_sambanova.mdx
@@ -17,11 +17,11 @@ SambaNova inference provider for running models on SambaNova's dataflow architec
 | `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
 | `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
 | `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
-| `url` | `str` | No | https://api.sambanova.ai/v1 | The URL for the SambaNova AI server |
+| `base_url` | `HttpUrl \| None` | No | https://api.sambanova.ai/v1 | The URL for the SambaNova AI server |
 
 ## Sample Configuration
 
 ```yaml
-url: https://api.sambanova.ai/v1
+base_url: https://api.sambanova.ai/v1
 api_key: ${env.SAMBANOVA_API_KEY:=}
 ```
diff --git a/docs/docs/providers/inference/remote_tgi.mdx b/docs/docs/providers/inference/remote_tgi.mdx
index cd5ea7661..3790acdd4 100644
--- a/docs/docs/providers/inference/remote_tgi.mdx
+++ b/docs/docs/providers/inference/remote_tgi.mdx
@@ -16,10 +16,10 @@ Text Generation Inference (TGI) provider for HuggingFace model serving.
 |-------|------|----------|---------|-------------|
 | `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
 | `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
-| `url` | `str` | No | | The URL for the TGI serving endpoint |
+| `base_url` | `HttpUrl \| None` | No | | The URL for the TGI serving endpoint (should include /v1 path) |
 
 ## Sample Configuration
 
 ```yaml
-url: ${env.TGI_URL:=}
+base_url: ${env.TGI_URL:=}
 ```
diff --git a/docs/docs/providers/inference/remote_together.mdx b/docs/docs/providers/inference/remote_together.mdx
index 43192cc9e..dc025b5ac 100644
--- a/docs/docs/providers/inference/remote_together.mdx
+++ b/docs/docs/providers/inference/remote_together.mdx
@@ -17,11 +17,11 @@ Together AI inference provider for open-source models and collaborative AI devel
 | `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
 | `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
 | `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
-| `url` | `str` | No | https://api.together.xyz/v1 | The URL for the Together AI server |
+| `base_url` | `HttpUrl \| None` | No | https://api.together.xyz/v1 | The URL for the Together AI server |
 
 ## Sample Configuration
 
 ```yaml
-url: https://api.together.xyz/v1
+base_url: https://api.together.xyz/v1
 api_key: ${env.TOGETHER_API_KEY:=}
 ```
diff --git a/docs/docs/providers/inference/remote_vllm.mdx b/docs/docs/providers/inference/remote_vllm.mdx
index 81620dbca..a52c24adb 100644
--- a/docs/docs/providers/inference/remote_vllm.mdx
+++ b/docs/docs/providers/inference/remote_vllm.mdx
@@ -17,14 +17,14 @@ Remote vLLM inference provider for connecting to vLLM servers.
 | `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
 | `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
 | `api_token` | `SecretStr \| None` | No | | The API token |
-| `url` | `str \| None` | No | | The URL for the vLLM model serving endpoint |
+| `base_url` | `HttpUrl \| None` | No | | The URL for the vLLM model serving endpoint |
 | `max_tokens` | `int` | No | 4096 | Maximum number of tokens to generate. |
 | `tls_verify` | `bool \| str` | No | True | Whether to verify TLS certificates. Can be a boolean or a path to a CA certificate file. |
 
 ## Sample Configuration
 
 ```yaml
-url: ${env.VLLM_URL:=}
+base_url: ${env.VLLM_URL:=}
 max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
 api_token: ${env.VLLM_API_TOKEN:=fake}
 tls_verify: ${env.VLLM_TLS_VERIFY:=true}
diff --git a/docs/docs/providers/inference/remote_watsonx.mdx b/docs/docs/providers/inference/remote_watsonx.mdx
index 3a1dba3b4..47d543e3a 100644
--- a/docs/docs/providers/inference/remote_watsonx.mdx
+++ b/docs/docs/providers/inference/remote_watsonx.mdx
@@ -17,14 +17,14 @@ IBM WatsonX inference provider for accessing AI models on IBM's WatsonX platform
 | `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
 | `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
 | `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
-| `url` | `str` | No | https://us-south.ml.cloud.ibm.com | A base url for accessing the watsonx.ai |
+| `base_url` | `HttpUrl \| None` | No | https://us-south.ml.cloud.ibm.com | A base url for accessing the watsonx.ai |
 | `project_id` | `str \| None` | No | | The watsonx.ai project ID |
 | `timeout` | `int` | No | 60 | Timeout for the HTTP requests |
 
 ## Sample Configuration
 
 ```yaml
-url: ${env.WATSONX_BASE_URL:=https://us-south.ml.cloud.ibm.com}
+base_url: ${env.WATSONX_BASE_URL:=https://us-south.ml.cloud.ibm.com}
 api_key: ${env.WATSONX_API_KEY:=}
 project_id: ${env.WATSONX_PROJECT_ID:=}
 ```
diff --git a/scripts/docker.sh b/scripts/docker.sh
index b56df8c03..3b2db5ca7 100755
--- a/scripts/docker.sh
+++ b/scripts/docker.sh
@@ -287,9 +287,9 @@ start_container() {
     # On macOS/Windows, use host.docker.internal to reach host from container
     # On Linux with --network host, use localhost
     if [[ "$(uname)" == "Darwin" ]] || [[ "$(uname)" == *"MINGW"* ]]; then
-        OLLAMA_URL="${OLLAMA_URL:-http://host.docker.internal:11434}"
+        OLLAMA_URL="${OLLAMA_URL:-http://host.docker.internal:11434/v1}"
     else
-        OLLAMA_URL="${OLLAMA_URL:-http://localhost:11434}"
+        OLLAMA_URL="${OLLAMA_URL:-http://localhost:11434/v1}"
     fi
 
     DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e OLLAMA_URL=$OLLAMA_URL"
diff --git a/scripts/install.sh b/scripts/install.sh
index 5e4939767..7fe1d3243 100755
--- a/scripts/install.sh
+++ b/scripts/install.sh
@@ -640,7 +640,7 @@
 cmd=( run -d "${PLATFORM_OPTS[@]}" --name llama-stack \
   --network llama-net \
   -p "${PORT}:${PORT}" \
   "${server_env_opts[@]}" \
-  -e OLLAMA_URL="http://ollama-server:${OLLAMA_PORT}" \
+  -e OLLAMA_URL="http://ollama-server:${OLLAMA_PORT}/v1" \
   "${SERVER_IMAGE}" --port "${PORT}")
 log "🦙 Starting Llama Stack..."
diff --git a/src/llama_stack/distributions/ci-tests/run-with-postgres-store.yaml b/src/llama_stack/distributions/ci-tests/run-with-postgres-store.yaml
index 5384b58fe..d942c23a4 100644
--- a/src/llama_stack/distributions/ci-tests/run-with-postgres-store.yaml
+++ b/src/llama_stack/distributions/ci-tests/run-with-postgres-store.yaml
@@ -17,32 +17,32 @@ providers:
   - provider_id: ${env.CEREBRAS_API_KEY:+cerebras}
     provider_type: remote::cerebras
     config:
-      base_url: https://api.cerebras.ai
+      base_url: https://api.cerebras.ai/v1
       api_key: ${env.CEREBRAS_API_KEY:=}
   - provider_id: ${env.OLLAMA_URL:+ollama}
     provider_type: remote::ollama
     config:
-      url: ${env.OLLAMA_URL:=http://localhost:11434}
+      base_url: ${env.OLLAMA_URL:=http://localhost:11434/v1}
   - provider_id: ${env.VLLM_URL:+vllm}
     provider_type: remote::vllm
     config:
-      url: ${env.VLLM_URL:=}
+      base_url: ${env.VLLM_URL:=}
       max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
       api_token: ${env.VLLM_API_TOKEN:=fake}
       tls_verify: ${env.VLLM_TLS_VERIFY:=true}
   - provider_id: ${env.TGI_URL:+tgi}
     provider_type: remote::tgi
     config:
-      url: ${env.TGI_URL:=}
+      base_url: ${env.TGI_URL:=}
   - provider_id: fireworks
     provider_type: remote::fireworks
    config:
-      url: https://api.fireworks.ai/inference/v1
+      base_url: https://api.fireworks.ai/inference/v1
       api_key: ${env.FIREWORKS_API_KEY:=}
   - provider_id: together
     provider_type: remote::together
     config:
-      url: https://api.together.xyz/v1
+      base_url: https://api.together.xyz/v1
       api_key: ${env.TOGETHER_API_KEY:=}
   - provider_id: bedrock
     provider_type: remote::bedrock
@@ -52,9 +52,8 @@ providers:
   - provider_id: ${env.NVIDIA_API_KEY:+nvidia}
     provider_type: remote::nvidia
     config:
-      url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}
+      base_url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com/v1}
       api_key: ${env.NVIDIA_API_KEY:=}
-      append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
   - provider_id: openai
     provider_type: remote::openai
     config:
@@ -76,18 +75,18 @@ providers:
   - provider_id: groq
     provider_type: remote::groq
     config:
-      url: https://api.groq.com
+      base_url: https://api.groq.com/openai/v1
       api_key: ${env.GROQ_API_KEY:=}
   - provider_id: sambanova
     provider_type: remote::sambanova
     config:
-      url: https://api.sambanova.ai/v1
+      base_url: https://api.sambanova.ai/v1
       api_key: ${env.SAMBANOVA_API_KEY:=}
   - provider_id: ${env.AZURE_API_KEY:+azure}
     provider_type: remote::azure
     config:
       api_key: ${env.AZURE_API_KEY:=}
-      api_base: ${env.AZURE_API_BASE:=}
+      base_url: ${env.AZURE_API_BASE:=}
       api_version: ${env.AZURE_API_VERSION:=}
       api_type: ${env.AZURE_API_TYPE:=}
   - provider_id: sentence-transformers
diff --git a/src/llama_stack/distributions/ci-tests/run.yaml b/src/llama_stack/distributions/ci-tests/run.yaml
index 1118d2ad1..8b1cd2bb2 100644
--- a/src/llama_stack/distributions/ci-tests/run.yaml
+++ b/src/llama_stack/distributions/ci-tests/run.yaml
@@ -17,32 +17,32 @@ providers:
   - provider_id: ${env.CEREBRAS_API_KEY:+cerebras}
     provider_type: remote::cerebras
     config:
-      base_url: https://api.cerebras.ai
+      base_url: https://api.cerebras.ai/v1
       api_key: ${env.CEREBRAS_API_KEY:=}
   - provider_id: ${env.OLLAMA_URL:+ollama}
     provider_type: remote::ollama
     config:
-      url: ${env.OLLAMA_URL:=http://localhost:11434}
+      base_url: ${env.OLLAMA_URL:=http://localhost:11434/v1}
   - provider_id: ${env.VLLM_URL:+vllm}
     provider_type: remote::vllm
     config:
-      url: ${env.VLLM_URL:=}
+      base_url: ${env.VLLM_URL:=}
       max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
       api_token: ${env.VLLM_API_TOKEN:=fake}
       tls_verify: ${env.VLLM_TLS_VERIFY:=true}
   - provider_id: ${env.TGI_URL:+tgi}
     provider_type: remote::tgi
     config:
-      url: ${env.TGI_URL:=}
+      base_url: ${env.TGI_URL:=}
   - provider_id: fireworks
     provider_type: remote::fireworks
     config:
-      url: https://api.fireworks.ai/inference/v1
+      base_url: https://api.fireworks.ai/inference/v1
       api_key: ${env.FIREWORKS_API_KEY:=}
   - provider_id: together
     provider_type: remote::together
     config:
-      url: https://api.together.xyz/v1
+      base_url: https://api.together.xyz/v1
       api_key: ${env.TOGETHER_API_KEY:=}
   - provider_id: bedrock
     provider_type: remote::bedrock
@@ -52,9 +52,8 @@ providers:
   - provider_id: ${env.NVIDIA_API_KEY:+nvidia}
     provider_type: remote::nvidia
     config:
-      url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}
+      base_url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com/v1}
       api_key: ${env.NVIDIA_API_KEY:=}
-      append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
   - provider_id: openai
     provider_type: remote::openai
     config:
@@ -76,18 +75,18 @@ providers:
   - provider_id: groq
     provider_type: remote::groq
     config:
-      url: https://api.groq.com
+      base_url: https://api.groq.com/openai/v1
       api_key: ${env.GROQ_API_KEY:=}
   - provider_id: sambanova
     provider_type: remote::sambanova
     config:
-      url: https://api.sambanova.ai/v1
+      base_url: https://api.sambanova.ai/v1
       api_key: ${env.SAMBANOVA_API_KEY:=}
   - provider_id: ${env.AZURE_API_KEY:+azure}
     provider_type: remote::azure
     config:
       api_key: ${env.AZURE_API_KEY:=}
-      api_base: ${env.AZURE_API_BASE:=}
+      base_url: ${env.AZURE_API_BASE:=}
       api_version: ${env.AZURE_API_VERSION:=}
       api_type: ${env.AZURE_API_TYPE:=}
   - provider_id: sentence-transformers
diff --git a/src/llama_stack/distributions/nvidia/run-with-safety.yaml b/src/llama_stack/distributions/nvidia/run-with-safety.yaml
index 1d57ad17a..d2c7dd090 100644
--- a/src/llama_stack/distributions/nvidia/run-with-safety.yaml
+++ b/src/llama_stack/distributions/nvidia/run-with-safety.yaml
@@ -16,9 +16,8 @@ providers:
   - provider_id: nvidia
     provider_type: remote::nvidia
     config:
-      url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}
+      base_url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com/v1}
       api_key: ${env.NVIDIA_API_KEY:=}
-      append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
   - provider_id: nvidia
     provider_type: remote::nvidia
     config:
diff --git a/src/llama_stack/distributions/nvidia/run.yaml b/src/llama_stack/distributions/nvidia/run.yaml
index 8c50b8bfb..c267587c7 100644
--- a/src/llama_stack/distributions/nvidia/run.yaml
+++ b/src/llama_stack/distributions/nvidia/run.yaml
@@ -16,9 +16,8 @@ providers:
   - provider_id: nvidia
     provider_type: remote::nvidia
     config:
-      url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}
+      base_url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com/v1}
       api_key: ${env.NVIDIA_API_KEY:=}
-      append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
   vector_io:
   - provider_id: faiss
     provider_type: inline::faiss
diff --git a/src/llama_stack/distributions/open-benchmark/run.yaml b/src/llama_stack/distributions/open-benchmark/run.yaml
index 912e48dd3..7ebc58841 100644
--- a/src/llama_stack/distributions/open-benchmark/run.yaml
+++ b/src/llama_stack/distributions/open-benchmark/run.yaml
@@ -27,12 +27,12 @@ providers:
   - provider_id: groq
     provider_type: remote::groq
     config:
-      url: https://api.groq.com
+      base_url: https://api.groq.com/openai/v1
       api_key: ${env.GROQ_API_KEY:=}
   - provider_id: together
     provider_type: remote::together
     config:
-      url: https://api.together.xyz/v1
+      base_url: https://api.together.xyz/v1
       api_key: ${env.TOGETHER_API_KEY:=}
   vector_io:
   - provider_id: sqlite-vec
diff --git a/src/llama_stack/distributions/postgres-demo/run.yaml b/src/llama_stack/distributions/postgres-demo/run.yaml
index dd1c2bc7f..049f519cd 100644
--- a/src/llama_stack/distributions/postgres-demo/run.yaml
+++ b/src/llama_stack/distributions/postgres-demo/run.yaml
@@ -11,7 +11,7 @@ providers:
   - provider_id: vllm-inference
     provider_type: remote::vllm
     config:
-      url: ${env.VLLM_URL:=http://localhost:8000/v1}
+      base_url: ${env.VLLM_URL:=}
       max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
       api_token: ${env.VLLM_API_TOKEN:=fake}
       tls_verify: ${env.VLLM_TLS_VERIFY:=true}
diff --git a/src/llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml b/src/llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml
index e29ada6f4..75cc9d188 100644
--- a/src/llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml
+++ b/src/llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml
@@ -17,32 +17,32 @@ providers:
   - provider_id: ${env.CEREBRAS_API_KEY:+cerebras}
     provider_type: remote::cerebras
     config:
-      base_url: https://api.cerebras.ai
+      base_url: https://api.cerebras.ai/v1
       api_key: ${env.CEREBRAS_API_KEY:=}
   - provider_id: ${env.OLLAMA_URL:+ollama}
     provider_type: remote::ollama
     config:
-      url: ${env.OLLAMA_URL:=http://localhost:11434}
+      base_url: ${env.OLLAMA_URL:=http://localhost:11434/v1}
   - provider_id: ${env.VLLM_URL:+vllm}
     provider_type: remote::vllm
     config:
-      url: ${env.VLLM_URL:=}
+      base_url: ${env.VLLM_URL:=}
       max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
       api_token: ${env.VLLM_API_TOKEN:=fake}
       tls_verify: ${env.VLLM_TLS_VERIFY:=true}
   - provider_id: ${env.TGI_URL:+tgi}
     provider_type: remote::tgi
     config:
-      url: ${env.TGI_URL:=}
+      base_url: ${env.TGI_URL:=}
   - provider_id: fireworks
     provider_type: remote::fireworks
     config:
-      url: https://api.fireworks.ai/inference/v1
+      base_url: https://api.fireworks.ai/inference/v1
       api_key: ${env.FIREWORKS_API_KEY:=}
   - provider_id: together
     provider_type: remote::together
     config:
-      url: https://api.together.xyz/v1
+      base_url: https://api.together.xyz/v1
       api_key: ${env.TOGETHER_API_KEY:=}
   - provider_id: bedrock
     provider_type: remote::bedrock
@@ -52,9 +52,8 @@ providers:
   - provider_id: ${env.NVIDIA_API_KEY:+nvidia}
     provider_type: remote::nvidia
     config:
-      url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}
+      base_url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com/v1}
       api_key: ${env.NVIDIA_API_KEY:=}
-      append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
   - provider_id: openai
     provider_type: remote::openai
     config:
@@ -76,18 +75,18 @@ providers:
   - provider_id: groq
     provider_type: remote::groq
     config:
-      url: https://api.groq.com
+      base_url: https://api.groq.com/openai/v1
       api_key: ${env.GROQ_API_KEY:=}
   - provider_id: sambanova
     provider_type: remote::sambanova
     config:
-      url: https://api.sambanova.ai/v1
+      base_url: https://api.sambanova.ai/v1
       api_key: ${env.SAMBANOVA_API_KEY:=}
   - provider_id: ${env.AZURE_API_KEY:+azure}
     provider_type: remote::azure
     config:
       api_key: ${env.AZURE_API_KEY:=}
-      api_base: ${env.AZURE_API_BASE:=}
+      base_url: ${env.AZURE_API_BASE:=}
       api_version: ${env.AZURE_API_VERSION:=}
       api_type: ${env.AZURE_API_TYPE:=}
   - provider_id: sentence-transformers
diff --git a/src/llama_stack/distributions/starter-gpu/run.yaml b/src/llama_stack/distributions/starter-gpu/run.yaml
index 7149b8659..09c7be5a1 100644
--- a/src/llama_stack/distributions/starter-gpu/run.yaml
+++ b/src/llama_stack/distributions/starter-gpu/run.yaml
@@ -17,32 +17,32 @@ providers:
   - provider_id: ${env.CEREBRAS_API_KEY:+cerebras}
     provider_type: remote::cerebras
     config:
-      base_url: https://api.cerebras.ai
+      base_url: https://api.cerebras.ai/v1
       api_key: ${env.CEREBRAS_API_KEY:=}
   - provider_id: ${env.OLLAMA_URL:+ollama}
     provider_type: remote::ollama
     config:
-      url: ${env.OLLAMA_URL:=http://localhost:11434}
+      base_url: ${env.OLLAMA_URL:=http://localhost:11434/v1}
   - provider_id: ${env.VLLM_URL:+vllm}
     provider_type: remote::vllm
     config:
-      url: ${env.VLLM_URL:=}
+      base_url: ${env.VLLM_URL:=}
       max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
       api_token: ${env.VLLM_API_TOKEN:=fake}
       tls_verify: ${env.VLLM_TLS_VERIFY:=true}
   - provider_id: ${env.TGI_URL:+tgi}
     provider_type: remote::tgi
     config:
-      url: ${env.TGI_URL:=}
+      base_url: ${env.TGI_URL:=}
   - provider_id: fireworks
     provider_type: remote::fireworks
     config:
-      url: https://api.fireworks.ai/inference/v1
+      base_url: https://api.fireworks.ai/inference/v1
       api_key: ${env.FIREWORKS_API_KEY:=}
   - provider_id: together
     provider_type: remote::together
     config:
-      url: https://api.together.xyz/v1
+      base_url: https://api.together.xyz/v1
       api_key: ${env.TOGETHER_API_KEY:=}
   - provider_id: bedrock
     provider_type: remote::bedrock
@@ -52,9 +52,8 @@ providers:
   - provider_id: ${env.NVIDIA_API_KEY:+nvidia}
     provider_type: remote::nvidia
     config:
-      url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}
+      base_url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com/v1}
       api_key: ${env.NVIDIA_API_KEY:=}
-      append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
   - provider_id: openai
     provider_type: remote::openai
     config:
@@ -76,18 +75,18 @@ providers:
   - provider_id: groq
     provider_type: remote::groq
     config:
-      url: https://api.groq.com
+      base_url: https://api.groq.com/openai/v1
       api_key: ${env.GROQ_API_KEY:=}
   - provider_id: sambanova
     provider_type: remote::sambanova
     config:
-      url: https://api.sambanova.ai/v1
+      base_url: https://api.sambanova.ai/v1
       api_key: ${env.SAMBANOVA_API_KEY:=}
   - provider_id: ${env.AZURE_API_KEY:+azure}
     provider_type: remote::azure
     config:
       api_key: ${env.AZURE_API_KEY:=}
-      api_base: ${env.AZURE_API_BASE:=}
+      base_url: ${env.AZURE_API_BASE:=}
       api_version: ${env.AZURE_API_VERSION:=}
       api_type: ${env.AZURE_API_TYPE:=}
   - provider_id: sentence-transformers
diff --git a/src/llama_stack/distributions/starter/run-with-postgres-store.yaml b/src/llama_stack/distributions/starter/run-with-postgres-store.yaml
index 437674bf9..f59c809d2 100644
--- a/src/llama_stack/distributions/starter/run-with-postgres-store.yaml
+++ b/src/llama_stack/distributions/starter/run-with-postgres-store.yaml
@@ -17,32 +17,32 @@ providers:
   - provider_id: ${env.CEREBRAS_API_KEY:+cerebras}
     provider_type: remote::cerebras
     config:
-      base_url: https://api.cerebras.ai
+      base_url: https://api.cerebras.ai/v1
       api_key: ${env.CEREBRAS_API_KEY:=}
   - provider_id: ${env.OLLAMA_URL:+ollama}
     provider_type: remote::ollama
     config:
-      url: ${env.OLLAMA_URL:=http://localhost:11434}
+      base_url: ${env.OLLAMA_URL:=http://localhost:11434/v1}
   - provider_id: ${env.VLLM_URL:+vllm}
     provider_type: remote::vllm
     config:
-      url: ${env.VLLM_URL:=}
+      base_url: ${env.VLLM_URL:=}
       max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
       api_token: ${env.VLLM_API_TOKEN:=fake}
       tls_verify: ${env.VLLM_TLS_VERIFY:=true}
   - provider_id: ${env.TGI_URL:+tgi}
     provider_type: remote::tgi
     config:
-      url: ${env.TGI_URL:=}
+      base_url: ${env.TGI_URL:=}
   - provider_id: fireworks
     provider_type: remote::fireworks
     config:
-      url: https://api.fireworks.ai/inference/v1
+      base_url: https://api.fireworks.ai/inference/v1
       api_key: ${env.FIREWORKS_API_KEY:=}
   - provider_id: together
     provider_type: remote::together
     config:
-      url: https://api.together.xyz/v1
+      base_url: https://api.together.xyz/v1
       api_key: ${env.TOGETHER_API_KEY:=}
   - provider_id: bedrock
     provider_type: remote::bedrock
@@ -52,9 +52,8 @@ providers:
   - provider_id: ${env.NVIDIA_API_KEY:+nvidia}
     provider_type: remote::nvidia
     config:
-      url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}
+      base_url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com/v1}
       api_key: ${env.NVIDIA_API_KEY:=}
-      append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
   - provider_id: openai
     provider_type: remote::openai
     config:
@@ -76,18 +75,18 @@ providers:
   - provider_id: groq
     provider_type: remote::groq
     config:
-      url: https://api.groq.com
+      base_url: https://api.groq.com/openai/v1
       api_key: ${env.GROQ_API_KEY:=}
   - provider_id: sambanova
     provider_type: remote::sambanova
     config:
-      url: https://api.sambanova.ai/v1
+      base_url: https://api.sambanova.ai/v1
       api_key: ${env.SAMBANOVA_API_KEY:=}
   - provider_id: ${env.AZURE_API_KEY:+azure}
     provider_type: remote::azure
     config:
       api_key: ${env.AZURE_API_KEY:=}
-      api_base: ${env.AZURE_API_BASE:=}
+      base_url: ${env.AZURE_API_BASE:=}
       api_version: ${env.AZURE_API_VERSION:=}
       api_type: ${env.AZURE_API_TYPE:=}
   - provider_id: sentence-transformers
diff --git a/src/llama_stack/distributions/starter/run.yaml b/src/llama_stack/distributions/starter/run.yaml
index 0ce392810..435bb22a7 100644
--- a/src/llama_stack/distributions/starter/run.yaml
+++ b/src/llama_stack/distributions/starter/run.yaml
@@ -17,32 +17,32 @@ providers:
   - provider_id: ${env.CEREBRAS_API_KEY:+cerebras}
     provider_type: remote::cerebras
     config:
-      base_url: https://api.cerebras.ai
+      base_url: https://api.cerebras.ai/v1
       api_key: ${env.CEREBRAS_API_KEY:=}
   - provider_id: ${env.OLLAMA_URL:+ollama}
     provider_type: remote::ollama
     config:
-      url: ${env.OLLAMA_URL:=http://localhost:11434}
+      base_url: ${env.OLLAMA_URL:=http://localhost:11434/v1}
   - provider_id: ${env.VLLM_URL:+vllm}
     provider_type: remote::vllm
     config:
-      url: ${env.VLLM_URL:=}
+      base_url: ${env.VLLM_URL:=}
       max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
       api_token: ${env.VLLM_API_TOKEN:=fake}
       tls_verify: ${env.VLLM_TLS_VERIFY:=true}
   - provider_id: ${env.TGI_URL:+tgi}
     provider_type: remote::tgi
     config:
-      url: ${env.TGI_URL:=}
+      base_url: ${env.TGI_URL:=}
   - provider_id: fireworks
     provider_type: remote::fireworks
     config:
-      url: https://api.fireworks.ai/inference/v1
+      base_url: https://api.fireworks.ai/inference/v1
       api_key: ${env.FIREWORKS_API_KEY:=}
   - provider_id: together
     provider_type: remote::together
     config:
-      url: https://api.together.xyz/v1
+      base_url: https://api.together.xyz/v1
       api_key: ${env.TOGETHER_API_KEY:=}
   - provider_id: bedrock
     provider_type: remote::bedrock
@@ -52,9 +52,8 @@ providers:
   - provider_id: ${env.NVIDIA_API_KEY:+nvidia}
     provider_type: remote::nvidia
    config:
-      url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}
+      base_url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com/v1}
       api_key: ${env.NVIDIA_API_KEY:=}
-      append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
   - provider_id: openai
     provider_type: remote::openai
     config:
@@ -76,18 +75,18 @@ providers:
   - provider_id: groq
     provider_type: remote::groq
     config:
-      url: https://api.groq.com
+      base_url: https://api.groq.com/openai/v1
       api_key: ${env.GROQ_API_KEY:=}
   - provider_id: sambanova
     provider_type: remote::sambanova
     config:
-      url: https://api.sambanova.ai/v1
+      base_url: https://api.sambanova.ai/v1
       api_key: ${env.SAMBANOVA_API_KEY:=}
   - provider_id: ${env.AZURE_API_KEY:+azure}
     provider_type: remote::azure
     config:
       api_key: ${env.AZURE_API_KEY:=}
-      api_base: ${env.AZURE_API_BASE:=}
+      base_url: ${env.AZURE_API_BASE:=}
       api_version: ${env.AZURE_API_VERSION:=}
       api_type: ${env.AZURE_API_TYPE:=}
   - provider_id: sentence-transformers
diff --git a/src/llama_stack/distributions/watsonx/run.yaml b/src/llama_stack/distributions/watsonx/run.yaml
index 8456115d2..f8c489fe3 100644
--- a/src/llama_stack/distributions/watsonx/run.yaml
+++ b/src/llama_stack/distributions/watsonx/run.yaml
@@ -15,7 +15,7 @@ providers:
   - provider_id: watsonx
     provider_type: remote::watsonx
     config:
-      url: ${env.WATSONX_BASE_URL:=https://us-south.ml.cloud.ibm.com}
+      base_url: ${env.WATSONX_BASE_URL:=https://us-south.ml.cloud.ibm.com}
       api_key: ${env.WATSONX_API_KEY:=}
       project_id: ${env.WATSONX_PROJECT_ID:=}
   vector_io:
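The distribution files above lean on two environment-substitution
forms, ${env.VAR:=default} and ${env.VAR:+value}. A toy
re-implementation for intuition only, not the stack's actual resolver:

```python
import os
import re

_PATTERN = re.compile(r"\$\{env\.(\w+):([=+])([^}]*)\}")


def substitute(value: str) -> str:
    """Toy sketch of the two forms used in the configs above:
    ${env.VAR:=default} -> VAR if set, else the default;
    ${env.VAR:+value}   -> value if VAR is set, else empty."""
    def repl(m: re.Match) -> str:
        var, op, text = m.group(1), m.group(2), m.group(3)
        is_set = os.environ.get(var) not in (None, "")
        if op == "=":
            return os.environ[var] if is_set else text
        return text if is_set else ""
    return _PATTERN.sub(repl, value)


os.environ.pop("OLLAMA_URL", None)
os.environ.pop("CEREBRAS_API_KEY", None)
assert substitute("${env.OLLAMA_URL:=http://localhost:11434/v1}") == "http://localhost:11434/v1"
assert substitute("${env.CEREBRAS_API_KEY:+cerebras}") == ""
```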
diff --git a/src/llama_stack/providers/remote/inference/azure/azure.py b/src/llama_stack/providers/remote/inference/azure/azure.py
index 134d01b15..c977d75d5 100644
--- a/src/llama_stack/providers/remote/inference/azure/azure.py
+++ b/src/llama_stack/providers/remote/inference/azure/azure.py
@@ -4,8 +4,6 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from urllib.parse import urljoin
-
 from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
 
 from .config import AzureConfig
@@ -22,4 +20,4 @@
 
         Returns the Azure API base URL from the configuration.
         """
-        return urljoin(str(self.config.api_base), "/openai/v1")
+        return str(self.config.base_url)
diff --git a/src/llama_stack/providers/remote/inference/azure/config.py b/src/llama_stack/providers/remote/inference/azure/config.py
index b801b91b2..f6407a183 100644
--- a/src/llama_stack/providers/remote/inference/azure/config.py
+++ b/src/llama_stack/providers/remote/inference/azure/config.py
@@ -32,8 +32,9 @@ class AzureProviderDataValidator(BaseModel):
 
 @json_schema_type
 class AzureConfig(RemoteInferenceProviderConfig):
-    api_base: HttpUrl = Field(
-        description="Azure API base for Azure (e.g., https://your-resource-name.openai.azure.com)",
+    base_url: HttpUrl | None = Field(
+        default=None,
+        description="Azure API base for Azure (e.g., https://your-resource-name.openai.azure.com/openai/v1)",
     )
     api_version: str | None = Field(
         default_factory=lambda: os.getenv("AZURE_API_VERSION"),
@@ -48,14 +49,14 @@ class AzureConfig(RemoteInferenceProviderConfig):
     def sample_run_config(
         cls,
         api_key: str = "${env.AZURE_API_KEY:=}",
-        api_base: str = "${env.AZURE_API_BASE:=}",
+        base_url: str = "${env.AZURE_API_BASE:=}",
         api_version: str = "${env.AZURE_API_VERSION:=}",
         api_type: str = "${env.AZURE_API_TYPE:=}",
         **kwargs,
     ) -> dict[str, Any]:
         return {
             "api_key": api_key,
-            "api_base": api_base,
+            "base_url": base_url,
             "api_version": api_version,
             "api_type": api_type,
         }
diff --git a/src/llama_stack/providers/remote/inference/cerebras/cerebras.py b/src/llama_stack/providers/remote/inference/cerebras/cerebras.py
index 680431e22..23c27df1e 100644
--- a/src/llama_stack/providers/remote/inference/cerebras/cerebras.py
+++ b/src/llama_stack/providers/remote/inference/cerebras/cerebras.py
@@ -4,8 +4,6 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from urllib.parse import urljoin
-
 from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
 from llama_stack_api import (
     OpenAIEmbeddingsRequestWithExtraBody,
@@ -21,7 +19,7 @@ class CerebrasInferenceAdapter(OpenAIMixin):
     provider_data_api_key_field: str = "cerebras_api_key"
 
     def get_base_url(self) -> str:
-        return urljoin(self.config.base_url, "v1")
+        return str(self.config.base_url)
 
     async def openai_embeddings(
         self,
diff --git a/src/llama_stack/providers/remote/inference/cerebras/config.py b/src/llama_stack/providers/remote/inference/cerebras/config.py
index db357fd1c..ea88abbea 100644
--- a/src/llama_stack/providers/remote/inference/cerebras/config.py
+++ b/src/llama_stack/providers/remote/inference/cerebras/config.py
@@ -7,12 +7,12 @@
 import os
 from typing import Any
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, HttpUrl
 
 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
 from llama_stack_api import json_schema_type
 
-DEFAULT_BASE_URL = "https://api.cerebras.ai"
+DEFAULT_BASE_URL = "https://api.cerebras.ai/v1"
 
 
 class CerebrasProviderDataValidator(BaseModel):
@@ -24,8 +24,8 @@ class CerebrasProviderDataValidator(BaseModel):
 
 @json_schema_type
 class CerebrasImplConfig(RemoteInferenceProviderConfig):
-    base_url: str = Field(
-        default=os.environ.get("CEREBRAS_BASE_URL", DEFAULT_BASE_URL),
+    base_url: HttpUrl | None = Field(
+        default=HttpUrl(os.environ.get("CEREBRAS_BASE_URL", DEFAULT_BASE_URL)),
         description="Base URL for the Cerebras API",
     )
 
diff --git a/src/llama_stack/providers/remote/inference/databricks/config.py b/src/llama_stack/providers/remote/inference/databricks/config.py
index bd409fa13..44cb862f9 100644
--- a/src/llama_stack/providers/remote/inference/databricks/config.py
+++ b/src/llama_stack/providers/remote/inference/databricks/config.py
@@ -6,7 +6,7 @@
 
 from typing import Any
 
-from pydantic import BaseModel, Field, SecretStr
+from pydantic import BaseModel, Field, HttpUrl, SecretStr
 
 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
 from llama_stack_api import json_schema_type
@@ -21,9 +21,9 @@ class DatabricksProviderDataValidator(BaseModel):
 
 @json_schema_type
 class DatabricksImplConfig(RemoteInferenceProviderConfig):
-    url: str | None = Field(
+    base_url: HttpUrl | None = Field(
         default=None,
-        description="The URL for the Databricks model serving endpoint",
+        description="The URL for the Databricks model serving endpoint (should include /serving-endpoints path)",
     )
     auth_credential: SecretStr | None = Field(
         default=None,
@@ -34,11 +34,11 @@ class DatabricksImplConfig(RemoteInferenceProviderConfig):
     @classmethod
     def sample_run_config(
         cls,
-        url: str = "${env.DATABRICKS_HOST:=}",
+        base_url: str = "${env.DATABRICKS_HOST:=}",
         api_token: str = "${env.DATABRICKS_TOKEN:=}",
         **kwargs: Any,
     ) -> dict[str, Any]:
         return {
-            "url": url,
+            "base_url": base_url,
             "api_token": api_token,
         }
diff --git a/src/llama_stack/providers/remote/inference/databricks/databricks.py b/src/llama_stack/providers/remote/inference/databricks/databricks.py
index c07d97b67..f2f8832f6 100644
--- a/src/llama_stack/providers/remote/inference/databricks/databricks.py
+++ b/src/llama_stack/providers/remote/inference/databricks/databricks.py
@@ -29,15 +29,21 @@ class DatabricksInferenceAdapter(OpenAIMixin):
     }
 
     def get_base_url(self) -> str:
-        return f"{self.config.url}/serving-endpoints"
+        return str(self.config.base_url)
 
     async def list_provider_model_ids(self) -> Iterable[str]:
         # Filter out None values from endpoint names
         api_token = self._get_api_key_from_config_or_provider_data()
+        # WorkspaceClient expects base host without /serving-endpoints suffix
+        base_url_str = str(self.config.base_url)
+        if base_url_str.endswith("/serving-endpoints"):
+            host = base_url_str[:-18]  # Remove '/serving-endpoints'
+        else:
+            host = base_url_str
         return [
             endpoint.name  # type: ignore[misc]
             for endpoint in WorkspaceClient(
-                host=self.config.url, token=api_token
+                host=host, token=api_token
             ).serving_endpoints.list()  # TODO: this is not async
         ]
 
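The endswith/slice dance above relies on len("/serving-endpoints")
being 18. str.removesuffix (Python 3.9+) expresses the same intent
without the magic number; an equivalent sketch, not what the patch
ships:

```python
def host_for_workspace_client(base_url: str) -> str:
    # Equivalent to the endswith/slice logic above (sketch only).
    return base_url.removesuffix("/serving-endpoints")


assert host_for_workspace_client("https://x.databricks.com/serving-endpoints") == "https://x.databricks.com"
assert host_for_workspace_client("https://x.databricks.com") == "https://x.databricks.com"
```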
diff --git a/src/llama_stack/providers/remote/inference/fireworks/config.py b/src/llama_stack/providers/remote/inference/fireworks/config.py
index e36c76054..c59b5f270 100644
--- a/src/llama_stack/providers/remote/inference/fireworks/config.py
+++ b/src/llama_stack/providers/remote/inference/fireworks/config.py
@@ -6,7 +6,7 @@
 
 from typing import Any
 
-from pydantic import Field
+from pydantic import Field, HttpUrl
 
 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
 from llama_stack_api import json_schema_type
@@ -14,14 +14,14 @@ from llama_stack_api import json_schema_type
 
 @json_schema_type
 class FireworksImplConfig(RemoteInferenceProviderConfig):
-    url: str = Field(
-        default="https://api.fireworks.ai/inference/v1",
+    base_url: HttpUrl | None = Field(
+        default=HttpUrl("https://api.fireworks.ai/inference/v1"),
         description="The URL for the Fireworks server",
     )
 
     @classmethod
     def sample_run_config(cls, api_key: str = "${env.FIREWORKS_API_KEY:=}", **kwargs) -> dict[str, Any]:
         return {
-            "url": "https://api.fireworks.ai/inference/v1",
+            "base_url": "https://api.fireworks.ai/inference/v1",
             "api_key": api_key,
         }
diff --git a/src/llama_stack/providers/remote/inference/fireworks/fireworks.py b/src/llama_stack/providers/remote/inference/fireworks/fireworks.py
index 7e2b73546..61ea0b1f6 100644
--- a/src/llama_stack/providers/remote/inference/fireworks/fireworks.py
+++ b/src/llama_stack/providers/remote/inference/fireworks/fireworks.py
@@ -24,4 +24,4 @@ class FireworksInferenceAdapter(OpenAIMixin):
     provider_data_api_key_field: str = "fireworks_api_key"
 
     def get_base_url(self) -> str:
-        return "https://api.fireworks.ai/inference/v1"
+        return str(self.config.base_url)
diff --git a/src/llama_stack/providers/remote/inference/groq/config.py b/src/llama_stack/providers/remote/inference/groq/config.py
index cca53a4e8..e5c29c271 100644
--- a/src/llama_stack/providers/remote/inference/groq/config.py
+++ b/src/llama_stack/providers/remote/inference/groq/config.py
@@ -6,7 +6,7 @@
 
 from typing import Any
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, HttpUrl
 
 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
 from llama_stack_api import json_schema_type
@@ -21,14 +21,14 @@ class GroqProviderDataValidator(BaseModel):
 
 @json_schema_type
 class GroqConfig(RemoteInferenceProviderConfig):
-    url: str = Field(
-        default="https://api.groq.com",
+    base_url: HttpUrl | None = Field(
+        default=HttpUrl("https://api.groq.com/openai/v1"),
         description="The URL for the Groq AI server",
     )
 
     @classmethod
     def sample_run_config(cls, api_key: str = "${env.GROQ_API_KEY:=}", **kwargs) -> dict[str, Any]:
         return {
-            "url": "https://api.groq.com",
+            "base_url": "https://api.groq.com/openai/v1",
             "api_key": api_key,
         }
diff --git a/src/llama_stack/providers/remote/inference/groq/groq.py b/src/llama_stack/providers/remote/inference/groq/groq.py
index 3a4f2626d..f99de91ca 100644
--- a/src/llama_stack/providers/remote/inference/groq/groq.py
+++ b/src/llama_stack/providers/remote/inference/groq/groq.py
@@ -15,4 +15,4 @@ class GroqInferenceAdapter(OpenAIMixin):
     provider_data_api_key_field: str = "groq_api_key"
 
     def get_base_url(self) -> str:
-        return f"{self.config.url}/openai/v1"
+        return str(self.config.base_url)
diff --git a/src/llama_stack/providers/remote/inference/llama_openai_compat/config.py b/src/llama_stack/providers/remote/inference/llama_openai_compat/config.py
index ded210d89..a0f80d969 100644
--- a/src/llama_stack/providers/remote/inference/llama_openai_compat/config.py
+++ b/src/llama_stack/providers/remote/inference/llama_openai_compat/config.py
@@ -6,7 +6,7 @@
 
 from typing import Any
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, HttpUrl
 
 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
 from llama_stack_api import json_schema_type
@@ -21,14 +21,14 @@ class LlamaProviderDataValidator(BaseModel):
 
 @json_schema_type
 class LlamaCompatConfig(RemoteInferenceProviderConfig):
-    openai_compat_api_base: str = Field(
-        default="https://api.llama.com/compat/v1/",
+    base_url: HttpUrl | None = Field(
+        default=HttpUrl("https://api.llama.com/compat/v1/"),
         description="The URL for the Llama API server",
     )
 
     @classmethod
     def sample_run_config(cls, api_key: str = "${env.LLAMA_API_KEY}", **kwargs) -> dict[str, Any]:
         return {
-            "openai_compat_api_base": "https://api.llama.com/compat/v1/",
+            "base_url": "https://api.llama.com/compat/v1/",
             "api_key": api_key,
         }
diff --git a/src/llama_stack/providers/remote/inference/llama_openai_compat/llama.py b/src/llama_stack/providers/remote/inference/llama_openai_compat/llama.py
index a5f67ecd1..f29aebf36 100644
--- a/src/llama_stack/providers/remote/inference/llama_openai_compat/llama.py
+++ b/src/llama_stack/providers/remote/inference/llama_openai_compat/llama.py
@@ -31,7 +31,7 @@ class LlamaCompatInferenceAdapter(OpenAIMixin):
 
         :return: The Llama API base URL
         """
-        return self.config.openai_compat_api_base
+        return str(self.config.base_url)
 
     async def openai_completion(
         self,
diff --git a/src/llama_stack/providers/remote/inference/nvidia/config.py b/src/llama_stack/providers/remote/inference/nvidia/config.py
index e5b0c6b73..e1e9a0ea9 100644
--- a/src/llama_stack/providers/remote/inference/nvidia/config.py
+++ b/src/llama_stack/providers/remote/inference/nvidia/config.py
@@ -7,7 +7,7 @@
 import os
 from typing import Any
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, HttpUrl
 
 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
 from llama_stack_api import json_schema_type
@@ -44,18 +44,14 @@ class NVIDIAConfig(RemoteInferenceProviderConfig):
     URL of your running NVIDIA NIM and do not need to set the api_key.
     """
 
-    url: str = Field(
-        default_factory=lambda: os.getenv("NVIDIA_BASE_URL", "https://integrate.api.nvidia.com"),
+    base_url: HttpUrl | None = Field(
+        default_factory=lambda: os.getenv("NVIDIA_BASE_URL", "https://integrate.api.nvidia.com/v1"),
         description="A base url for accessing the NVIDIA NIM",
     )
     timeout: int = Field(
         default=60,
         description="Timeout for the HTTP requests",
     )
-    append_api_version: bool = Field(
-        default_factory=lambda: os.getenv("NVIDIA_APPEND_API_VERSION", "True").lower() != "false",
-        description="When set to false, the API version will not be appended to the base_url. By default, it is true.",
-    )
     rerank_model_to_url: dict[str, str] = Field(
         default_factory=lambda: {
             "nv-rerank-qa-mistral-4b:1": "https://ai.api.nvidia.com/v1/retrieval/nvidia/reranking",
@@ -68,13 +64,11 @@ class NVIDIAConfig(RemoteInferenceProviderConfig):
     @classmethod
     def sample_run_config(
         cls,
-        url: str = "${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}",
+        base_url: HttpUrl | None = "${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com/v1}",
         api_key: str = "${env.NVIDIA_API_KEY:=}",
-        append_api_version: bool = "${env.NVIDIA_APPEND_API_VERSION:=True}",
         **kwargs,
     ) -> dict[str, Any]:
         return {
-            "url": url,
+            "base_url": base_url,
             "api_key": api_key,
-            "append_api_version": append_api_version,
         }
diff --git a/src/llama_stack/providers/remote/inference/nvidia/nvidia.py b/src/llama_stack/providers/remote/inference/nvidia/nvidia.py
index 17f8775bf..5d0d52d6a 100644
--- a/src/llama_stack/providers/remote/inference/nvidia/nvidia.py
+++ b/src/llama_stack/providers/remote/inference/nvidia/nvidia.py
@@ -44,7 +44,7 @@ class NVIDIAInferenceAdapter(OpenAIMixin):
     }
 
     async def initialize(self) -> None:
-        logger.info(f"Initializing NVIDIAInferenceAdapter({self.config.url})...")
+        logger.info(f"Initializing NVIDIAInferenceAdapter({self.config.base_url})...")
 
         if _is_nvidia_hosted(self.config):
             if not self.config.auth_credential:
@@ -72,7 +72,7 @@
 
         :return: The NVIDIA API base URL
         """
-        return f"{self.config.url}/v1" if self.config.append_api_version else self.config.url
+        return str(self.config.base_url)
 
     async def list_provider_model_ids(self) -> Iterable[str]:
         """
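After this change every adapter's get_base_url collapses to the same
one-liner, which makes the contract easy to check. A minimal sketch of
such a check with stand-in names (the PR's real test lives in
tests/unit/providers/test_configs.py):

```python
from pydantic import BaseModel, HttpUrl


class _Cfg(BaseModel):
    # Stand-in for any RemoteInferenceProviderConfig subclass in this patch.
    base_url: HttpUrl | None = None


def get_base_url(cfg: _Cfg) -> str:
    # The shared adapter behavior after this patch: echo the config verbatim.
    return str(cfg.base_url)


cfg = _Cfg(base_url="https://integrate.api.nvidia.com/v1")
assert get_base_url(cfg) == "https://integrate.api.nvidia.com/v1"
```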
import NVIDIAConfig def _is_nvidia_hosted(config: NVIDIAConfig) -> bool: - return "integrate.api.nvidia.com" in config.url + return "integrate.api.nvidia.com" in str(config.base_url) diff --git a/src/llama_stack/providers/remote/inference/ollama/config.py b/src/llama_stack/providers/remote/inference/ollama/config.py index 416b847a0..60dd34fa8 100644 --- a/src/llama_stack/providers/remote/inference/ollama/config.py +++ b/src/llama_stack/providers/remote/inference/ollama/config.py @@ -6,20 +6,22 @@ from typing import Any -from pydantic import Field, SecretStr +from pydantic import Field, HttpUrl, SecretStr from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig -DEFAULT_OLLAMA_URL = "http://localhost:11434" +DEFAULT_OLLAMA_URL = "http://localhost:11434/v1" class OllamaImplConfig(RemoteInferenceProviderConfig): auth_credential: SecretStr | None = Field(default=None, exclude=True) - url: str = DEFAULT_OLLAMA_URL + base_url: HttpUrl | None = Field(default=HttpUrl(DEFAULT_OLLAMA_URL)) @classmethod - def sample_run_config(cls, url: str = "${env.OLLAMA_URL:=http://localhost:11434}", **kwargs) -> dict[str, Any]: + def sample_run_config( + cls, base_url: str = "${env.OLLAMA_URL:=http://localhost:11434/v1}", **kwargs + ) -> dict[str, Any]: return { - "url": url, + "base_url": base_url, } diff --git a/src/llama_stack/providers/remote/inference/ollama/ollama.py b/src/llama_stack/providers/remote/inference/ollama/ollama.py index d1bf85361..e8b872384 100644 --- a/src/llama_stack/providers/remote/inference/ollama/ollama.py +++ b/src/llama_stack/providers/remote/inference/ollama/ollama.py @@ -55,17 +55,23 @@ class OllamaInferenceAdapter(OpenAIMixin): # ollama client attaches itself to the current event loop (sadly?) loop = asyncio.get_running_loop() if loop not in self._clients: - self._clients[loop] = AsyncOllamaClient(host=self.config.url) + # Ollama client expects base URL without /v1 suffix + base_url_str = str(self.config.base_url) + if base_url_str.endswith("/v1"): + host = base_url_str[:-3] + else: + host = base_url_str + self._clients[loop] = AsyncOllamaClient(host=host) return self._clients[loop] def get_api_key(self): return "NO KEY REQUIRED" def get_base_url(self): - return self.config.url.rstrip("/") + "/v1" + return str(self.config.base_url) async def initialize(self) -> None: - logger.info(f"checking connectivity to Ollama at `{self.config.url}`...") + logger.info(f"checking connectivity to Ollama at `{self.config.base_url}`...") r = await self.health() if r["status"] == HealthStatus.ERROR: logger.warning( diff --git a/src/llama_stack/providers/remote/inference/openai/config.py b/src/llama_stack/providers/remote/inference/openai/config.py index ab28e571f..2057cd0d6 100644 --- a/src/llama_stack/providers/remote/inference/openai/config.py +++ b/src/llama_stack/providers/remote/inference/openai/config.py @@ -6,7 +6,7 @@ from typing import Any -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, HttpUrl from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig from llama_stack_api import json_schema_type @@ -21,8 +21,8 @@ class OpenAIProviderDataValidator(BaseModel): @json_schema_type class OpenAIConfig(RemoteInferenceProviderConfig): - base_url: str = Field( - default="https://api.openai.com/v1", + base_url: HttpUrl | None = Field( + default=HttpUrl("https://api.openai.com/v1"), description="Base URL for OpenAI API", ) diff --git a/src/llama_stack/providers/remote/inference/openai/openai.py 
diff --git a/src/llama_stack/providers/remote/inference/openai/openai.py b/src/llama_stack/providers/remote/inference/openai/openai.py
index 52bc48f1a..2d465546a 100644
--- a/src/llama_stack/providers/remote/inference/openai/openai.py
+++ b/src/llama_stack/providers/remote/inference/openai/openai.py
@@ -35,4 +35,4 @@ class OpenAIInferenceAdapter(OpenAIMixin):
 
         Returns the OpenAI API base URL from the configuration.
         """
-        return self.config.base_url
+        return str(self.config.base_url)
diff --git a/src/llama_stack/providers/remote/inference/passthrough/config.py b/src/llama_stack/providers/remote/inference/passthrough/config.py
index 54508b6fb..f45806e79 100644
--- a/src/llama_stack/providers/remote/inference/passthrough/config.py
+++ b/src/llama_stack/providers/remote/inference/passthrough/config.py
@@ -6,7 +6,7 @@
 
 from typing import Any
 
-from pydantic import Field
+from pydantic import Field, HttpUrl
 
 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
 from llama_stack_api import json_schema_type
@@ -14,16 +14,16 @@ from llama_stack_api import json_schema_type
 
 @json_schema_type
 class PassthroughImplConfig(RemoteInferenceProviderConfig):
-    url: str = Field(
+    base_url: HttpUrl | None = Field(
         default=None,
         description="The URL for the passthrough endpoint",
     )
 
     @classmethod
     def sample_run_config(
-        cls, url: str = "${env.PASSTHROUGH_URL}", api_key: str = "${env.PASSTHROUGH_API_KEY}", **kwargs
+        cls, base_url: str = "${env.PASSTHROUGH_URL}", api_key: str = "${env.PASSTHROUGH_API_KEY}", **kwargs
     ) -> dict[str, Any]:
         return {
-            "url": url,
+            "base_url": base_url,
             "api_key": api_key,
         }
diff --git a/src/llama_stack/providers/remote/inference/passthrough/passthrough.py b/src/llama_stack/providers/remote/inference/passthrough/passthrough.py
index 75eedf026..b0e2e74ad 100644
--- a/src/llama_stack/providers/remote/inference/passthrough/passthrough.py
+++ b/src/llama_stack/providers/remote/inference/passthrough/passthrough.py
@@ -82,8 +82,8 @@ class PassthroughInferenceAdapter(NeedsRequestProviderData, Inference):
 
     def _get_passthrough_url(self) -> str:
         """Get the passthrough URL from config or provider data."""
-        if self.config.url is not None:
-            return self.config.url
+        if self.config.base_url is not None:
+            return str(self.config.base_url)
 
         provider_data = self.get_request_provider_data()
         if provider_data is None:
diff --git a/src/llama_stack/providers/remote/inference/runpod/config.py b/src/llama_stack/providers/remote/inference/runpod/config.py
index 2ee56ca94..8d06f5263 100644
--- a/src/llama_stack/providers/remote/inference/runpod/config.py
+++ b/src/llama_stack/providers/remote/inference/runpod/config.py
@@ -6,7 +6,7 @@
 
 from typing import Any
 
-from pydantic import BaseModel, Field, SecretStr
+from pydantic import BaseModel, Field, HttpUrl, SecretStr
 
 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
 from llama_stack_api import json_schema_type
@@ -21,7 +21,7 @@ class RunpodProviderDataValidator(BaseModel):
 
 @json_schema_type
 class RunpodImplConfig(RemoteInferenceProviderConfig):
-    url: str | None = Field(
+    base_url: HttpUrl | None = Field(
         default=None,
         description="The URL for the Runpod model serving endpoint",
     )
@@ -34,6 +34,6 @@ class RunpodImplConfig(RemoteInferenceProviderConfig):
     @classmethod
     def sample_run_config(cls, **kwargs: Any) -> dict[str, Any]:
         return {
-            "url": "${env.RUNPOD_URL:=}",
+            "base_url": "${env.RUNPOD_URL:=}",
            "api_token": "${env.RUNPOD_API_TOKEN}",
         }
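The passthrough hunk above resolves the URL with a fixed precedence: static config first, per-request provider data as the fallback. A minimal sketch of that precedence; the `ProviderData` type and `passthrough_url` field name are illustrative assumptions, not the real provider-data schema:

```python
# Config-first URL resolution with a per-request fallback, mirroring the
# precedence _get_passthrough_url implements in the hunk above.
from dataclasses import dataclass

from pydantic import HttpUrl


@dataclass
class ProviderData:  # hypothetical stand-in for the request provider data
    passthrough_url: str | None = None


def resolve_passthrough_url(base_url: HttpUrl | None, provider_data: ProviderData | None) -> str:
    if base_url is not None:
        return str(base_url)
    if provider_data is not None and provider_data.passthrough_url:
        return provider_data.passthrough_url
    raise ValueError("Passthrough URL is not configured")


assert resolve_passthrough_url(HttpUrl("https://example.com/v1"), None) == "https://example.com/v1"
assert resolve_passthrough_url(None, ProviderData("https://other.example")) == "https://other.example"
```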
diff --git a/src/llama_stack/providers/remote/inference/runpod/runpod.py b/src/llama_stack/providers/remote/inference/runpod/runpod.py
index 9c770cc24..04ad12851 100644
--- a/src/llama_stack/providers/remote/inference/runpod/runpod.py
+++ b/src/llama_stack/providers/remote/inference/runpod/runpod.py
@@ -28,7 +28,7 @@ class RunpodInferenceAdapter(OpenAIMixin):
 
     def get_base_url(self) -> str:
         """Get base URL for OpenAI client."""
-        return self.config.url
+        return str(self.config.base_url)
 
     async def openai_chat_completion(
         self,
diff --git a/src/llama_stack/providers/remote/inference/sambanova/config.py b/src/llama_stack/providers/remote/inference/sambanova/config.py
index 93679ba99..79cda75a0 100644
--- a/src/llama_stack/providers/remote/inference/sambanova/config.py
+++ b/src/llama_stack/providers/remote/inference/sambanova/config.py
@@ -6,7 +6,7 @@
 
 from typing import Any
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, HttpUrl
 
 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
 from llama_stack_api import json_schema_type
@@ -21,14 +21,14 @@ class SambaNovaProviderDataValidator(BaseModel):
 
 @json_schema_type
 class SambaNovaImplConfig(RemoteInferenceProviderConfig):
-    url: str = Field(
-        default="https://api.sambanova.ai/v1",
+    base_url: HttpUrl | None = Field(
+        default=HttpUrl("https://api.sambanova.ai/v1"),
         description="The URL for the SambaNova AI server",
     )
 
     @classmethod
     def sample_run_config(cls, api_key: str = "${env.SAMBANOVA_API_KEY:=}", **kwargs) -> dict[str, Any]:
         return {
-            "url": "https://api.sambanova.ai/v1",
+            "base_url": "https://api.sambanova.ai/v1",
             "api_key": api_key,
         }
diff --git a/src/llama_stack/providers/remote/inference/sambanova/sambanova.py b/src/llama_stack/providers/remote/inference/sambanova/sambanova.py
index daa4b1670..cb01e3a90 100644
--- a/src/llama_stack/providers/remote/inference/sambanova/sambanova.py
+++ b/src/llama_stack/providers/remote/inference/sambanova/sambanova.py
@@ -25,4 +25,4 @@ class SambaNovaInferenceAdapter(OpenAIMixin):
 
         :return: The SambaNova base URL
         """
-        return self.config.url
+        return str(self.config.base_url)
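The RunPod and SambaNova hunks above (and the TGI hunks that follow) converge on one config shape: an `HttpUrl | None` field named `base_url`, with `sample_run_config` emitting an env-placeholder string that the stack resolves before validation. A minimal sketch of that shared shape; `DemoRemoteConfig` and `DEMO_URL` are hypothetical:

```python
# The shared provider-config convention: HttpUrl | None field, string
# placeholder in sample_run_config, pydantic coercion at validation time.
from typing import Any

from pydantic import BaseModel, Field, HttpUrl


class DemoRemoteConfig(BaseModel):  # hypothetical, mirrors the shared shape
    base_url: HttpUrl | None = Field(
        default=None,
        description="Full endpoint URL, including any /v1 path",
    )

    @classmethod
    def sample_run_config(cls, base_url: str = "${env.DEMO_URL:=}", **kwargs: Any) -> dict[str, Any]:
        return {"base_url": base_url}


# Plain strings are coerced to HttpUrl when the config is validated:
cfg = DemoRemoteConfig(base_url="https://api.example.com/v1")
assert str(cfg.base_url) == "https://api.example.com/v1"
```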
diff --git a/src/llama_stack/providers/remote/inference/tgi/config.py b/src/llama_stack/providers/remote/inference/tgi/config.py
index 74edc8523..44cb4b812 100644
--- a/src/llama_stack/providers/remote/inference/tgi/config.py
+++ b/src/llama_stack/providers/remote/inference/tgi/config.py
@@ -5,7 +5,7 @@
 # the root directory of this source tree.
 
-from pydantic import BaseModel, Field, SecretStr
+from pydantic import BaseModel, Field, HttpUrl, SecretStr
 
 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
 from llama_stack_api import json_schema_type
@@ -15,18 +15,19 @@ from llama_stack_api import json_schema_type
 class TGIImplConfig(RemoteInferenceProviderConfig):
     auth_credential: SecretStr | None = Field(default=None, exclude=True)
-    url: str = Field(
-        description="The URL for the TGI serving endpoint",
+    base_url: HttpUrl | None = Field(
+        default=None,
+        description="The URL for the TGI serving endpoint (should include the /v1 path)",
     )
 
     @classmethod
     def sample_run_config(
         cls,
-        url: str = "${env.TGI_URL:=}",
+        base_url: str = "${env.TGI_URL:=}",
         **kwargs,
     ):
         return {
-            "url": url,
+            "base_url": base_url,
         }
diff --git a/src/llama_stack/providers/remote/inference/tgi/tgi.py b/src/llama_stack/providers/remote/inference/tgi/tgi.py
index dd47ccc62..5dc8c33f7 100644
--- a/src/llama_stack/providers/remote/inference/tgi/tgi.py
+++ b/src/llama_stack/providers/remote/inference/tgi/tgi.py
@@ -8,7 +8,7 @@ from collections.abc import Iterable
 
 from huggingface_hub import AsyncInferenceClient, HfApi
-from pydantic import SecretStr
+from pydantic import HttpUrl, SecretStr
 
 from llama_stack.log import get_logger
 from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
@@ -23,7 +23,7 @@ log = get_logger(name=__name__, category="inference::tgi")
 
 class _HfAdapter(OpenAIMixin):
-    url: str
+    base_url: HttpUrl
     api_key: SecretStr
 
     hf_client: AsyncInferenceClient
@@ -36,7 +36,7 @@ class _HfAdapter(OpenAIMixin):
         return "NO KEY REQUIRED"
 
     def get_base_url(self):
-        return self.url
+        return str(self.base_url)
 
     async def list_provider_model_ids(self) -> Iterable[str]:
         return [self.model_id]
@@ -50,14 +50,20 @@ class _HfAdapter(OpenAIMixin):
 
 class TGIAdapter(_HfAdapter):
     async def initialize(self, config: TGIImplConfig) -> None:
-        if not config.url:
+        if not config.base_url:
             raise ValueError("You must provide a URL in run.yaml (or via the TGI_URL environment variable) to use TGI.")
-        log.info(f"Initializing TGI client with url={config.url}")
-        self.hf_client = AsyncInferenceClient(model=config.url, provider="hf-inference")
+        log.info(f"Initializing TGI client with url={config.base_url}")
+        # Extract base URL without /v1 for HF client initialization
+        base_url_str = str(config.base_url).rstrip("/")
+        if base_url_str.endswith("/v1"):
+            base_url_for_client = base_url_str[:-3]
+        else:
+            base_url_for_client = base_url_str
+        self.hf_client = AsyncInferenceClient(model=base_url_for_client, provider="hf-inference")
         endpoint_info = await self.hf_client.get_endpoint_info()
         self.max_tokens = endpoint_info["max_total_tokens"]
         self.model_id = endpoint_info["model_id"]
-        self.url = f"{config.url.rstrip('/')}/v1"
+        self.base_url = config.base_url
         self.api_key = SecretStr("NO_KEY")
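The TGI hunks above round-trip one configured URL to two clients: the Hugging Face client gets the bare endpoint (per the comment in the hunk), while `get_base_url()` serves the full `/v1` URL to the OpenAI-compatible path. A minimal sketch of that round trip, with `tgi_urls` as an illustrative helper name:

```python
# One configured TGI base_url, two consumers: a bare endpoint for
# AsyncInferenceClient and the full /v1 URL for the OpenAI-compatible client.
from pydantic import HttpUrl


def tgi_urls(base_url: HttpUrl) -> tuple[str, str]:
    normalized = str(base_url).rstrip("/")
    hf_endpoint = normalized.removesuffix("/v1")  # what the HF client receives
    return hf_endpoint, normalized                # normalized feeds get_base_url()


hf_endpoint, openai_url = tgi_urls(HttpUrl("http://my-tgi:8080/v1"))
assert hf_endpoint == "http://my-tgi:8080"
assert openai_url == "http://my-tgi:8080/v1"
```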
diff --git a/src/llama_stack/providers/remote/inference/together/config.py b/src/llama_stack/providers/remote/inference/together/config.py
index c1b3c4a55..16f0686ba 100644
--- a/src/llama_stack/providers/remote/inference/together/config.py
+++ b/src/llama_stack/providers/remote/inference/together/config.py
@@ -6,7 +6,7 @@
 
 from typing import Any
 
-from pydantic import Field
+from pydantic import Field, HttpUrl
 
 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
 from llama_stack_api import json_schema_type
@@ -14,14 +14,14 @@ from llama_stack_api import json_schema_type
 
 @json_schema_type
 class TogetherImplConfig(RemoteInferenceProviderConfig):
-    url: str = Field(
-        default="https://api.together.xyz/v1",
+    base_url: HttpUrl | None = Field(
+        default=HttpUrl("https://api.together.xyz/v1"),
         description="The URL for the Together AI server",
     )
 
     @classmethod
     def sample_run_config(cls, **kwargs) -> dict[str, Any]:
         return {
-            "url": "https://api.together.xyz/v1",
+            "base_url": "https://api.together.xyz/v1",
             "api_key": "${env.TOGETHER_API_KEY:=}",
         }
diff --git a/src/llama_stack/providers/remote/inference/together/together.py b/src/llama_stack/providers/remote/inference/together/together.py
index cd34aec5e..0826dbcd2 100644
--- a/src/llama_stack/providers/remote/inference/together/together.py
+++ b/src/llama_stack/providers/remote/inference/together/together.py
@@ -9,7 +9,6 @@ from collections.abc import Iterable
 from typing import Any, cast
 
 from together import AsyncTogether  # type: ignore[import-untyped]
-from together.constants import BASE_URL  # type: ignore[import-untyped]
 
 from llama_stack.core.request_headers import NeedsRequestProviderData
 from llama_stack.log import get_logger
@@ -42,7 +41,7 @@ class TogetherInferenceAdapter(OpenAIMixin, NeedsRequestProviderData):
     provider_data_api_key_field: str = "together_api_key"
 
     def get_base_url(self):
-        return BASE_URL
+        return str(self.config.base_url)
 
     def _get_client(self) -> AsyncTogether:
         together_api_key = None
diff --git a/src/llama_stack/providers/remote/inference/vllm/config.py b/src/llama_stack/providers/remote/inference/vllm/config.py
index c43533ee4..db6c74431 100644
--- a/src/llama_stack/providers/remote/inference/vllm/config.py
+++ b/src/llama_stack/providers/remote/inference/vllm/config.py
@@ -6,7 +6,7 @@
 
 from pathlib import Path
 
-from pydantic import Field, SecretStr, field_validator
+from pydantic import Field, HttpUrl, SecretStr, field_validator
 
 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
 from llama_stack_api import json_schema_type
@@ -14,7 +14,7 @@ from llama_stack_api import json_schema_type
 
 @json_schema_type
 class VLLMInferenceAdapterConfig(RemoteInferenceProviderConfig):
-    url: str | None = Field(
+    base_url: HttpUrl | None = Field(
         default=None,
         description="The URL for the vLLM model serving endpoint",
     )
@@ -48,11 +48,11 @@ class VLLMInferenceAdapterConfig(RemoteInferenceProviderConfig):
     @classmethod
     def sample_run_config(
         cls,
-        url: str = "${env.VLLM_URL:=}",
+        base_url: str = "${env.VLLM_URL:=}",
         **kwargs,
     ):
         return {
-            "url": url,
+            "base_url": base_url,
             "max_tokens": "${env.VLLM_MAX_TOKENS:=4096}",
             "api_token": "${env.VLLM_API_TOKEN:=fake}",
             "tls_verify": "${env.VLLM_TLS_VERIFY:=true}",
diff --git a/src/llama_stack/providers/remote/inference/vllm/vllm.py b/src/llama_stack/providers/remote/inference/vllm/vllm.py
index 1510e9384..6664ca36b 100644
--- a/src/llama_stack/providers/remote/inference/vllm/vllm.py
+++ b/src/llama_stack/providers/remote/inference/vllm/vllm.py
@@ -39,12 +39,12 @@ class VLLMInferenceAdapter(OpenAIMixin):
 
     def get_base_url(self) -> str:
         """Get the base URL from config."""
-        if not self.config.url:
+        if not self.config.base_url:
             raise ValueError("No base URL configured")
-        return self.config.url
+        return str(self.config.base_url)
 
     async def initialize(self) -> None:
-        if not self.config.url:
+        if not self.config.base_url:
             raise ValueError(
                 "You must provide a URL in run.yaml (or via the VLLM_URL environment variable) to use vLLM."
             )
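The vLLM hunks above add the same fail-fast guard in both `get_base_url()` and `initialize()`, since this provider ships no default URL. A compressed sketch of that guard as a standalone function (`require_base_url` is an illustrative name):

```python
# Fail fast when no base_url is configured, then hand back the exact string.
from pydantic import HttpUrl


def require_base_url(base_url: HttpUrl | None) -> str:
    if not base_url:
        raise ValueError(
            "You must provide a URL in run.yaml (or via the VLLM_URL environment variable) to use vLLM."
        )
    return str(base_url)


assert require_base_url(HttpUrl("http://localhost:8000/v1")) == "http://localhost:8000/v1"
```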
diff --git a/src/llama_stack/providers/remote/inference/watsonx/config.py b/src/llama_stack/providers/remote/inference/watsonx/config.py
index 914f80820..be2b2c0ab 100644
--- a/src/llama_stack/providers/remote/inference/watsonx/config.py
+++ b/src/llama_stack/providers/remote/inference/watsonx/config.py
@@ -7,7 +7,7 @@
 import os
 from typing import Any
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, HttpUrl
 
 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
 from llama_stack_api import json_schema_type
@@ -23,7 +23,7 @@ class WatsonXProviderDataValidator(BaseModel):
 
 @json_schema_type
 class WatsonXConfig(RemoteInferenceProviderConfig):
-    url: str = Field(
+    base_url: HttpUrl | None = Field(
         default_factory=lambda: os.getenv("WATSONX_BASE_URL", "https://us-south.ml.cloud.ibm.com"),
         description="A base url for accessing the watsonx.ai",
     )
@@ -39,7 +39,7 @@ class WatsonXConfig(RemoteInferenceProviderConfig):
     @classmethod
     def sample_run_config(cls, **kwargs) -> dict[str, Any]:
         return {
-            "url": "${env.WATSONX_BASE_URL:=https://us-south.ml.cloud.ibm.com}",
+            "base_url": "${env.WATSONX_BASE_URL:=https://us-south.ml.cloud.ibm.com}",
             "api_key": "${env.WATSONX_API_KEY:=}",
             "project_id": "${env.WATSONX_PROJECT_ID:=}",
         }
diff --git a/src/llama_stack/providers/remote/inference/watsonx/watsonx.py b/src/llama_stack/providers/remote/inference/watsonx/watsonx.py
index aab9e2dca..5684f6c17 100644
--- a/src/llama_stack/providers/remote/inference/watsonx/watsonx.py
+++ b/src/llama_stack/providers/remote/inference/watsonx/watsonx.py
@@ -255,7 +255,7 @@ class WatsonXInferenceAdapter(LiteLLMOpenAIMixin):
         )
 
     def get_base_url(self) -> str:
-        return self.config.url
+        return str(self.config.base_url)
 
     # Copied from OpenAIMixin
     async def check_model_availability(self, model: str) -> bool:
@@ -316,7 +316,7 @@ class WatsonXInferenceAdapter(LiteLLMOpenAIMixin):
         """
         Retrieves foundation model specifications from the watsonx.ai API.
         """
-        url = f"{self.config.url}/ml/v1/foundation_model_specs?version=2023-10-25"
+        url = f"{self.config.base_url}/ml/v1/foundation_model_specs?version=2023-10-25"
         headers = {
             # Note that there is no authorization header. Listing models does not require authentication.
             "Content-Type": "application/json",
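The watsonx hunk above builds the model-specs request by suffixing a non-OpenAI path onto `base_url`. A minimal sketch of that request; the patch does not show which HTTP client watsonx.py uses, so `httpx` here is purely an assumption for illustration:

```python
# Sketch only: build the model-specs URL from base_url and fetch it.
# httpx is an assumed stand-in for whatever client watsonx.py actually uses.
import httpx
from pydantic import HttpUrl


async def fetch_model_specs(base_url: HttpUrl) -> dict:
    url = f"{base_url}/ml/v1/foundation_model_specs?version=2023-10-25"
    # No Authorization header: listing models does not require authentication.
    headers = {"Content-Type": "application/json"}
    async with httpx.AsyncClient() as client:
        response = await client.get(url, headers=headers)
        response.raise_for_status()
        return response.json()
```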
"Content-Type": "application/json", diff --git a/tests/integration/suites.py b/tests/integration/suites.py index 7689657b4..10c872705 100644 --- a/tests/integration/suites.py +++ b/tests/integration/suites.py @@ -50,7 +50,7 @@ SETUP_DEFINITIONS: dict[str, Setup] = { name="ollama", description="Local Ollama provider with text + safety models", env={ - "OLLAMA_URL": "http://0.0.0.0:11434", + "OLLAMA_URL": "http://0.0.0.0:11434/v1", "SAFETY_MODEL": "ollama/llama-guard3:1b", }, defaults={ @@ -64,7 +64,7 @@ SETUP_DEFINITIONS: dict[str, Setup] = { name="ollama", description="Local Ollama provider with a vision model", env={ - "OLLAMA_URL": "http://0.0.0.0:11434", + "OLLAMA_URL": "http://0.0.0.0:11434/v1", }, defaults={ "vision_model": "ollama/llama3.2-vision:11b", @@ -75,7 +75,7 @@ SETUP_DEFINITIONS: dict[str, Setup] = { name="ollama-postgres", description="Server-mode tests with Postgres-backed persistence", env={ - "OLLAMA_URL": "http://0.0.0.0:11434", + "OLLAMA_URL": "http://0.0.0.0:11434/v1", "SAFETY_MODEL": "ollama/llama-guard3:1b", "POSTGRES_HOST": "127.0.0.1", "POSTGRES_PORT": "5432", diff --git a/tests/unit/providers/inference/test_inference_client_caching.py b/tests/unit/providers/inference/test_inference_client_caching.py index aa3a2c77a..6ddf790af 100644 --- a/tests/unit/providers/inference/test_inference_client_caching.py +++ b/tests/unit/providers/inference/test_inference_client_caching.py @@ -120,7 +120,7 @@ from llama_stack.providers.remote.inference.watsonx.watsonx import WatsonXInfere VLLMInferenceAdapter, "llama_stack.providers.remote.inference.vllm.VLLMProviderDataValidator", { - "url": "http://fake", + "base_url": "http://fake", }, ), ], @@ -153,7 +153,7 @@ def test_litellm_provider_data_used(config_cls, adapter_cls, provider_data_valid """Validate data for LiteLLM-based providers. Similar to test_openai_provider_data_used, but without the assumption that there is an OpenAI-compatible client object.""" - inference_adapter = adapter_cls(config=config_cls()) + inference_adapter = adapter_cls(config=config_cls(base_url="http://fake")) inference_adapter.__provider_spec__ = MagicMock() inference_adapter.__provider_spec__.provider_data_validator = provider_data_validator diff --git a/tests/unit/providers/inference/test_remote_vllm.py b/tests/unit/providers/inference/test_remote_vllm.py index 958895cc4..0cf8ed306 100644 --- a/tests/unit/providers/inference/test_remote_vllm.py +++ b/tests/unit/providers/inference/test_remote_vllm.py @@ -40,7 +40,7 @@ from llama_stack_api import ( @pytest.fixture(scope="function") async def vllm_inference_adapter(): - config = VLLMInferenceAdapterConfig(url="http://mocked.localhost:12345") + config = VLLMInferenceAdapterConfig(base_url="http://mocked.localhost:12345") inference_adapter = VLLMInferenceAdapter(config=config) inference_adapter.model_store = AsyncMock() await inference_adapter.initialize() @@ -204,7 +204,7 @@ async def test_vllm_completion_extra_body(): via extra_body to the underlying OpenAI client through the InferenceRouter. """ # Set up the vLLM adapter - config = VLLMInferenceAdapterConfig(url="http://mocked.localhost:12345") + config = VLLMInferenceAdapterConfig(base_url="http://mocked.localhost:12345") vllm_adapter = VLLMInferenceAdapter(config=config) vllm_adapter.__provider_id__ = "vllm" await vllm_adapter.initialize() @@ -277,7 +277,7 @@ async def test_vllm_chat_completion_extra_body(): via extra_body to the underlying OpenAI client through the InferenceRouter for chat completion. 
""" # Set up the vLLM adapter - config = VLLMInferenceAdapterConfig(url="http://mocked.localhost:12345") + config = VLLMInferenceAdapterConfig(base_url="http://mocked.localhost:12345") vllm_adapter = VLLMInferenceAdapter(config=config) vllm_adapter.__provider_id__ = "vllm" await vllm_adapter.initialize() diff --git a/tests/unit/providers/nvidia/test_rerank_inference.py b/tests/unit/providers/nvidia/test_rerank_inference.py index ee62910b8..4ad9dc766 100644 --- a/tests/unit/providers/nvidia/test_rerank_inference.py +++ b/tests/unit/providers/nvidia/test_rerank_inference.py @@ -146,7 +146,7 @@ async def test_hosted_model_not_in_endpoint_mapping(): async def test_self_hosted_ignores_endpoint(): adapter = create_adapter( - config=NVIDIAConfig(url="http://localhost:8000", api_key=None), + config=NVIDIAConfig(base_url="http://localhost:8000", api_key=None), rerank_endpoints={"test-model": "https://model.endpoint/rerank"}, # This should be ignored for self-hosted. ) mock_session = MockSession(MockResponse()) diff --git a/tests/unit/providers/test_configs.py b/tests/unit/providers/test_configs.py index 867cfffbc..b4ba78394 100644 --- a/tests/unit/providers/test_configs.py +++ b/tests/unit/providers/test_configs.py @@ -4,8 +4,10 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +from typing import get_args, get_origin + import pytest -from pydantic import BaseModel +from pydantic import BaseModel, HttpUrl from llama_stack.core.distribution import get_provider_registry, providable_apis from llama_stack.core.utils.dynamic import instantiate_class_type @@ -41,3 +43,55 @@ class TestProviderConfigurations: sample_config = config_type.sample_run_config(__distro_dir__="foobarbaz") assert isinstance(sample_config, dict), f"{config_class_name}.sample_run_config() did not return a dict" + + def test_remote_inference_url_standardization(self): + """Verify all remote inference providers use standardized base_url configuration.""" + provider_registry = get_provider_registry() + inference_providers = provider_registry.get("inference", {}) + + # Filter for remote providers only + remote_providers = {k: v for k, v in inference_providers.items() if k.startswith("remote::")} + + failures = [] + for provider_type, provider_spec in remote_providers.items(): + try: + config_class_name = provider_spec.config_class + config_type = instantiate_class_type(config_class_name) + + # Check that config has base_url field (not url) + if hasattr(config_type, "model_fields"): + fields = config_type.model_fields + + # Should NOT have 'url' field (old pattern) + if "url" in fields: + failures.append( + f"{provider_type}: Uses deprecated 'url' field instead of 'base_url'. " + f"Please rename to 'base_url' for consistency." + ) + + # Should have 'base_url' field with HttpUrl | None type + if "base_url" in fields: + field_info = fields["base_url"] + annotation = field_info.annotation + + # Check if it's HttpUrl or HttpUrl | None + # get_origin() returns Union for (X | Y), None for plain types + # get_args() returns the types inside Union, e.g. (HttpUrl, NoneType) + is_valid = False + if get_origin(annotation) is not None: # It's a Union/Optional + if HttpUrl in get_args(annotation): + is_valid = True + elif annotation == HttpUrl: # Plain HttpUrl without | None + is_valid = True + + if not is_valid: + failures.append( + f"{provider_type}: base_url field has incorrect type annotation. 
" + f"Expected 'HttpUrl | None', got '{annotation}'" + ) + + except Exception as e: + failures.append(f"{provider_type}: Error checking URL standardization: {str(e)}") + + if failures: + pytest.fail("URL standardization violations found:\n" + "\n".join(f" - {f}" for f in failures))