diff --git a/docs/docs/providers/inference/remote_azure.mdx b/docs/docs/providers/inference/remote_azure.mdx
index fd22b157e..0382b42d7 100644
--- a/docs/docs/providers/inference/remote_azure.mdx
+++ b/docs/docs/providers/inference/remote_azure.mdx
@@ -24,7 +24,7 @@ https://learn.microsoft.com/en-us/azure/ai-foundry/openai/overview
 | `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
 | `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
 | `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
-| `api_base` | `HttpUrl` | No | | Azure API base for Azure (e.g., https://your-resource-name.openai.azure.com) |
+| `base_url` | `HttpUrl \| None` | No | | Azure API base for Azure (e.g., https://your-resource-name.openai.azure.com/openai/v1) |
 | `api_version` | `str \| None` | No | | Azure API version for Azure (e.g., 2024-12-01-preview) |
 | `api_type` | `str \| None` | No | azure | Azure API type for Azure (e.g., azure) |
 
@@ -32,7 +32,7 @@ https://learn.microsoft.com/en-us/azure/ai-foundry/openai/overview
 
 ```yaml
 api_key: ${env.AZURE_API_KEY:=}
-api_base: ${env.AZURE_API_BASE:=}
+base_url: ${env.AZURE_API_BASE:=}
 api_version: ${env.AZURE_API_VERSION:=}
 api_type: ${env.AZURE_API_TYPE:=}
 ```
diff --git a/docs/docs/providers/inference/remote_cerebras.mdx b/docs/docs/providers/inference/remote_cerebras.mdx
index 1fb9530bb..9fd390a29 100644
--- a/docs/docs/providers/inference/remote_cerebras.mdx
+++ b/docs/docs/providers/inference/remote_cerebras.mdx
@@ -17,11 +17,11 @@ Cerebras inference provider for running models on Cerebras Cloud platform.
 | `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
 | `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
 | `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
-| `base_url` | `str` | No | https://api.cerebras.ai | Base URL for the Cerebras API |
+| `base_url` | `HttpUrl \| None` | No | https://api.cerebras.ai/v1 | Base URL for the Cerebras API |
 
 ## Sample Configuration
 
 ```yaml
-base_url: https://api.cerebras.ai
+base_url: https://api.cerebras.ai/v1
 api_key: ${env.CEREBRAS_API_KEY:=}
 ```
diff --git a/docs/docs/providers/inference/remote_databricks.mdx b/docs/docs/providers/inference/remote_databricks.mdx
index 7a926baf4..d50c52958 100644
--- a/docs/docs/providers/inference/remote_databricks.mdx
+++ b/docs/docs/providers/inference/remote_databricks.mdx
@@ -17,11 +17,11 @@ Databricks inference provider for running models on Databricks' unified analytic
 | `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
 | `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
 | `api_token` | `SecretStr \| None` | No | | The Databricks API token |
-| `url` | `str \| None` | No | | The URL for the Databricks model serving endpoint |
+| `base_url` | `HttpUrl \| None` | No | | The URL for the Databricks model serving endpoint (should include /serving-endpoints path) |
 
 ## Sample Configuration
 
 ```yaml
-url: ${env.DATABRICKS_HOST:=}
+base_url: ${env.DATABRICKS_HOST:=}
 api_token: ${env.DATABRICKS_TOKEN:=}
 ```
diff --git a/docs/docs/providers/inference/remote_fireworks.mdx b/docs/docs/providers/inference/remote_fireworks.mdx
index 7db74efc4..a67403a9b 100644
--- a/docs/docs/providers/inference/remote_fireworks.mdx
+++ b/docs/docs/providers/inference/remote_fireworks.mdx
@@ -17,11 +17,11 @@ Fireworks AI inference provider for Llama models and other AI models on the Fire
 | `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
 | `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
 | `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
-| `url` | `str` | No | https://api.fireworks.ai/inference/v1 | The URL for the Fireworks server |
+| `base_url` | `HttpUrl \| None` | No | https://api.fireworks.ai/inference/v1 | The URL for the Fireworks server |
 
 ## Sample Configuration
 
 ```yaml
-url: https://api.fireworks.ai/inference/v1
+base_url: https://api.fireworks.ai/inference/v1
 api_key: ${env.FIREWORKS_API_KEY:=}
 ```
diff --git a/docs/docs/providers/inference/remote_groq.mdx b/docs/docs/providers/inference/remote_groq.mdx
index 3ebd6f907..17acd3140 100644
--- a/docs/docs/providers/inference/remote_groq.mdx
+++ b/docs/docs/providers/inference/remote_groq.mdx
@@ -17,11 +17,11 @@ Groq inference provider for ultra-fast inference using Groq's LPU technology.
 | `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
 | `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
 | `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
-| `url` | `str` | No | https://api.groq.com | The URL for the Groq AI server |
+| `base_url` | `HttpUrl \| None` | No | https://api.groq.com/openai/v1 | The URL for the Groq AI server |
 
 ## Sample Configuration
 
 ```yaml
-url: https://api.groq.com
+base_url: https://api.groq.com/openai/v1
 api_key: ${env.GROQ_API_KEY:=}
 ```
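Note: the doc changes above and below all bake the API version path (`/v1`, `/openai/v1`, `/inference/v1`, ...) into `base_url` instead of having adapter code append it. As an illustrative sketch — not part of the patch, with a placeholder model id — this is how a fully versioned `base_url` is consumed by any OpenAI-compatible client, with no further path manipulation:

```python
# Illustrative only: the versioned base_url goes straight into the client.
import os

from openai import OpenAI

client = OpenAI(
    base_url="https://api.groq.com/openai/v1",  # version path included in base_url
    api_key=os.environ["GROQ_API_KEY"],
)
resp = client.chat.completions.create(
    model="llama-3.1-8b-instant",  # placeholder model id
    messages=[{"role": "user", "content": "ping"}],
)
print(resp.choices[0].message.content)
```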
diff --git a/docs/docs/providers/inference/remote_llama-openai-compat.mdx b/docs/docs/providers/inference/remote_llama-openai-compat.mdx
index f67f40909..69e90b2ac 100644
--- a/docs/docs/providers/inference/remote_llama-openai-compat.mdx
+++ b/docs/docs/providers/inference/remote_llama-openai-compat.mdx
@@ -17,11 +17,11 @@ Llama OpenAI-compatible provider for using Llama models with OpenAI API format.
 | `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
 | `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
 | `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
-| `openai_compat_api_base` | `str` | No | https://api.llama.com/compat/v1/ | The URL for the Llama API server |
+| `base_url` | `HttpUrl \| None` | No | https://api.llama.com/compat/v1/ | The URL for the Llama API server |
 
 ## Sample Configuration
 
 ```yaml
-openai_compat_api_base: https://api.llama.com/compat/v1/
+base_url: https://api.llama.com/compat/v1/
 api_key: ${env.LLAMA_API_KEY}
 ```
diff --git a/docs/docs/providers/inference/remote_nvidia.mdx b/docs/docs/providers/inference/remote_nvidia.mdx
index 6646d8b00..a890bc57f 100644
--- a/docs/docs/providers/inference/remote_nvidia.mdx
+++ b/docs/docs/providers/inference/remote_nvidia.mdx
@@ -17,15 +17,13 @@ NVIDIA inference provider for accessing NVIDIA NIM models and AI services.
 | `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
 | `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
 | `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
-| `url` | `str` | No | https://integrate.api.nvidia.com | A base url for accessing the NVIDIA NIM |
+| `base_url` | `HttpUrl \| None` | No | https://integrate.api.nvidia.com/v1 | A base url for accessing the NVIDIA NIM |
 | `timeout` | `int` | No | 60 | Timeout for the HTTP requests |
-| `append_api_version` | `bool` | No | True | When set to false, the API version will not be appended to the base_url. By default, it is true. |
 | `rerank_model_to_url` | `dict[str, str]` | No | `{'nv-rerank-qa-mistral-4b:1': 'https://ai.api.nvidia.com/v1/retrieval/nvidia/reranking', 'nvidia/nv-rerankqa-mistral-4b-v3': 'https://ai.api.nvidia.com/v1/retrieval/nvidia/nv-rerankqa-mistral-4b-v3/reranking', 'nvidia/llama-3.2-nv-rerankqa-1b-v2': 'https://ai.api.nvidia.com/v1/retrieval/nvidia/llama-3_2-nv-rerankqa-1b-v2/reranking'}` | Mapping of rerank model identifiers to their API endpoints. |
 
 ## Sample Configuration
 
 ```yaml
-url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}
+base_url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com/v1}
 api_key: ${env.NVIDIA_API_KEY:=}
-append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
 ```
diff --git a/docs/docs/providers/inference/remote_ollama.mdx b/docs/docs/providers/inference/remote_ollama.mdx
index 497bfed52..f9be84add 100644
--- a/docs/docs/providers/inference/remote_ollama.mdx
+++ b/docs/docs/providers/inference/remote_ollama.mdx
@@ -16,10 +16,10 @@ Ollama inference provider for running local models through the Ollama runtime.
 |-------|------|----------|---------|-------------|
 | `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
 | `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
-| `url` | `str` | No | http://localhost:11434 | |
+| `base_url` | `HttpUrl \| None` | No | http://localhost:11434/v1 | |
 
 ## Sample Configuration
 
 ```yaml
-url: ${env.OLLAMA_URL:=http://localhost:11434}
+base_url: ${env.OLLAMA_URL:=http://localhost:11434/v1}
 ```
diff --git a/docs/docs/providers/inference/remote_openai.mdx b/docs/docs/providers/inference/remote_openai.mdx
index 4931118fd..3ac3a21ad 100644
--- a/docs/docs/providers/inference/remote_openai.mdx
+++ b/docs/docs/providers/inference/remote_openai.mdx
@@ -17,7 +17,7 @@ OpenAI inference provider for accessing GPT models and other OpenAI services.
 | `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
 | `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
 | `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
-| `base_url` | `str` | No | https://api.openai.com/v1 | Base URL for OpenAI API |
+| `base_url` | `HttpUrl \| None` | No | https://api.openai.com/v1 | Base URL for OpenAI API |
 
 ## Sample Configuration
 
diff --git a/docs/docs/providers/inference/remote_passthrough.mdx b/docs/docs/providers/inference/remote_passthrough.mdx
index 009961d49..325ecc352 100644
--- a/docs/docs/providers/inference/remote_passthrough.mdx
+++ b/docs/docs/providers/inference/remote_passthrough.mdx
@@ -17,11 +17,11 @@ Passthrough inference provider for connecting to any external inference service
 | `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
 | `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
 | `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
-| `url` | `str` | No | | The URL for the passthrough endpoint |
+| `base_url` | `HttpUrl \| None` | No | | The URL for the passthrough endpoint |
 
 ## Sample Configuration
 
 ```yaml
-url: ${env.PASSTHROUGH_URL}
+base_url: ${env.PASSTHROUGH_URL}
 api_key: ${env.PASSTHROUGH_API_KEY}
 ```
diff --git a/docs/docs/providers/inference/remote_runpod.mdx b/docs/docs/providers/inference/remote_runpod.mdx
index 3b67e157d..6cdcdd3b5 100644
--- a/docs/docs/providers/inference/remote_runpod.mdx
+++ b/docs/docs/providers/inference/remote_runpod.mdx
@@ -17,11 +17,11 @@ RunPod inference provider for running models on RunPod's cloud GPU platform.
 | `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
 | `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
 | `api_token` | `SecretStr \| None` | No | | The API token |
-| `url` | `str \| None` | No | | The URL for the Runpod model serving endpoint |
+| `base_url` | `HttpUrl \| None` | No | | The URL for the Runpod model serving endpoint |
 
 ## Sample Configuration
 
 ```yaml
-url: ${env.RUNPOD_URL:=}
+base_url: ${env.RUNPOD_URL:=}
 api_token: ${env.RUNPOD_API_TOKEN}
 ```
diff --git a/docs/docs/providers/inference/remote_sambanova.mdx b/docs/docs/providers/inference/remote_sambanova.mdx
index 6f4c5d7f6..bbefdb0f0 100644
--- a/docs/docs/providers/inference/remote_sambanova.mdx
+++ b/docs/docs/providers/inference/remote_sambanova.mdx
@@ -17,11 +17,11 @@ SambaNova inference provider for running models on SambaNova's dataflow architec
 | `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
 | `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
 | `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
-| `url` | `str` | No | https://api.sambanova.ai/v1 | The URL for the SambaNova AI server |
+| `base_url` | `HttpUrl \| None` | No | https://api.sambanova.ai/v1 | The URL for the SambaNova AI server |
 
 ## Sample Configuration
 
 ```yaml
-url: https://api.sambanova.ai/v1
+base_url: https://api.sambanova.ai/v1
 api_key: ${env.SAMBANOVA_API_KEY:=}
 ```
diff --git a/docs/docs/providers/inference/remote_tgi.mdx b/docs/docs/providers/inference/remote_tgi.mdx
index cd5ea7661..3790acdd4 100644
--- a/docs/docs/providers/inference/remote_tgi.mdx
+++ b/docs/docs/providers/inference/remote_tgi.mdx
@@ -16,10 +16,10 @@ Text Generation Inference (TGI) provider for HuggingFace model serving.
 |-------|------|----------|---------|-------------|
 | `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
 | `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
-| `url` | `str` | No | | The URL for the TGI serving endpoint |
+| `base_url` | `HttpUrl \| None` | No | | The URL for the TGI serving endpoint (should include /v1 path) |
 
 ## Sample Configuration
 
 ```yaml
-url: ${env.TGI_URL:=}
+base_url: ${env.TGI_URL:=}
 ```
diff --git a/docs/docs/providers/inference/remote_together.mdx b/docs/docs/providers/inference/remote_together.mdx
index 43192cc9e..dc025b5ac 100644
--- a/docs/docs/providers/inference/remote_together.mdx
+++ b/docs/docs/providers/inference/remote_together.mdx
@@ -17,11 +17,11 @@ Together AI inference provider for open-source models and collaborative AI devel
 | `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
 | `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
 | `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
-| `url` | `str` | No | https://api.together.xyz/v1 | The URL for the Together AI server |
+| `base_url` | `HttpUrl \| None` | No | https://api.together.xyz/v1 | The URL for the Together AI server |
 
 ## Sample Configuration
 
 ```yaml
-url: https://api.together.xyz/v1
+base_url: https://api.together.xyz/v1
 api_key: ${env.TOGETHER_API_KEY:=}
 ```
diff --git a/docs/docs/providers/inference/remote_vllm.mdx b/docs/docs/providers/inference/remote_vllm.mdx
index 81620dbca..a52c24adb 100644
--- a/docs/docs/providers/inference/remote_vllm.mdx
+++ b/docs/docs/providers/inference/remote_vllm.mdx
@@ -17,14 +17,14 @@ Remote vLLM inference provider for connecting to vLLM servers.
 | `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
 | `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
 | `api_token` | `SecretStr \| None` | No | | The API token |
-| `url` | `str \| None` | No | | The URL for the vLLM model serving endpoint |
+| `base_url` | `HttpUrl \| None` | No | | The URL for the vLLM model serving endpoint |
 | `max_tokens` | `int` | No | 4096 | Maximum number of tokens to generate. |
 | `tls_verify` | `bool \| str` | No | True | Whether to verify TLS certificates. Can be a boolean or a path to a CA certificate file. |
 
 ## Sample Configuration
 
 ```yaml
-url: ${env.VLLM_URL:=}
+base_url: ${env.VLLM_URL:=}
 max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
 api_token: ${env.VLLM_API_TOKEN:=fake}
 tls_verify: ${env.VLLM_TLS_VERIFY:=true}
diff --git a/docs/docs/providers/inference/remote_watsonx.mdx b/docs/docs/providers/inference/remote_watsonx.mdx
index 3a1dba3b4..47d543e3a 100644
--- a/docs/docs/providers/inference/remote_watsonx.mdx
+++ b/docs/docs/providers/inference/remote_watsonx.mdx
@@ -17,14 +17,14 @@ IBM WatsonX inference provider for accessing AI models on IBM's WatsonX platform
 | `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
 | `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
 | `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
-| `url` | `str` | No | https://us-south.ml.cloud.ibm.com | A base url for accessing the watsonx.ai |
+| `base_url` | `HttpUrl \| None` | No | https://us-south.ml.cloud.ibm.com | A base url for accessing the watsonx.ai |
 | `project_id` | `str \| None` | No | | The watsonx.ai project ID |
 | `timeout` | `int` | No | 60 | Timeout for the HTTP requests |
 
 ## Sample Configuration
 
 ```yaml
-url: ${env.WATSONX_BASE_URL:=https://us-south.ml.cloud.ibm.com}
+base_url: ${env.WATSONX_BASE_URL:=https://us-south.ml.cloud.ibm.com}
 api_key: ${env.WATSONX_API_KEY:=}
 project_id: ${env.WATSONX_PROJECT_ID:=}
 ```
diff --git a/scripts/docker.sh b/scripts/docker.sh
index b56df8c03..3b2db5ca7 100755
--- a/scripts/docker.sh
+++ b/scripts/docker.sh
@@ -287,9 +287,9 @@ start_container() {
         # On macOS/Windows, use host.docker.internal to reach host from container
         # On Linux with --network host, use localhost
         if [[ "$(uname)" == "Darwin" ]] || [[ "$(uname)" == *"MINGW"* ]]; then
-            OLLAMA_URL="${OLLAMA_URL:-http://host.docker.internal:11434}"
+            OLLAMA_URL="${OLLAMA_URL:-http://host.docker.internal:11434/v1}"
         else
-            OLLAMA_URL="${OLLAMA_URL:-http://localhost:11434}"
+            OLLAMA_URL="${OLLAMA_URL:-http://localhost:11434/v1}"
         fi
 
         DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e OLLAMA_URL=$OLLAMA_URL"
diff --git a/scripts/install.sh b/scripts/install.sh
index 5e4939767..7fe1d3243 100755
--- a/scripts/install.sh
+++ b/scripts/install.sh
@@ -640,7 +640,7 @@ cmd=( run -d "${PLATFORM_OPTS[@]}" --name llama-stack \
   --network llama-net \
   -p "${PORT}:${PORT}" \
   "${server_env_opts[@]}" \
-  -e OLLAMA_URL="http://ollama-server:${OLLAMA_PORT}" \
+  -e OLLAMA_URL="http://ollama-server:${OLLAMA_PORT}/v1" \
   "${SERVER_IMAGE}" --port "${PORT}")
 
 log "🦙 Starting Llama Stack..."
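The `${env.VAR:=default}` / `${env.VAR:+value}` placeholders that appear throughout the configs below follow shell-style expansion: `:=` substitutes a default when the variable is unset, while `:+` emits a value only when the variable is set — which is how entire providers are toggled on via `provider_id: ${env.X:+name}`. A minimal sketch of the assumed semantics, not the stack's actual resolver:

```python
# Illustrative resolver for ${env.VAR:=default} and ${env.VAR:+value}.
import os
import re

_PLACEHOLDER = re.compile(r"\$\{env\.(\w+):([=+])([^}]*)\}")

def resolve(template: str) -> str:
    def substitute(match: re.Match) -> str:
        name, op, arg = match.groups()
        value = os.environ.get(name, "")
        if op == "=":
            return value or arg  # ':=' falls back to the default when unset/empty
        return arg if value else ""  # ':+' emits the value only when the var is set

    return _PLACEHOLDER.sub(substitute, template)

os.environ.pop("OLLAMA_URL", None)
assert resolve("${env.OLLAMA_URL:=http://localhost:11434/v1}") == "http://localhost:11434/v1"
assert resolve("${env.OLLAMA_URL:+ollama}") == ""  # provider stays disabled

os.environ["OLLAMA_URL"] = "http://host.docker.internal:11434/v1"
assert resolve("${env.OLLAMA_URL:+ollama}") == "ollama"  # provider enabled
```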
diff --git a/src/llama_stack/distributions/ci-tests/run-with-postgres-store.yaml b/src/llama_stack/distributions/ci-tests/run-with-postgres-store.yaml
index 5384b58fe..d942c23a4 100644
--- a/src/llama_stack/distributions/ci-tests/run-with-postgres-store.yaml
+++ b/src/llama_stack/distributions/ci-tests/run-with-postgres-store.yaml
@@ -17,32 +17,32 @@ providers:
   - provider_id: ${env.CEREBRAS_API_KEY:+cerebras}
     provider_type: remote::cerebras
     config:
-      base_url: https://api.cerebras.ai
+      base_url: https://api.cerebras.ai/v1
       api_key: ${env.CEREBRAS_API_KEY:=}
   - provider_id: ${env.OLLAMA_URL:+ollama}
     provider_type: remote::ollama
     config:
-      url: ${env.OLLAMA_URL:=http://localhost:11434}
+      base_url: ${env.OLLAMA_URL:=http://localhost:11434/v1}
   - provider_id: ${env.VLLM_URL:+vllm}
     provider_type: remote::vllm
     config:
-      url: ${env.VLLM_URL:=}
+      base_url: ${env.VLLM_URL:=}
       max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
       api_token: ${env.VLLM_API_TOKEN:=fake}
       tls_verify: ${env.VLLM_TLS_VERIFY:=true}
   - provider_id: ${env.TGI_URL:+tgi}
     provider_type: remote::tgi
     config:
-      url: ${env.TGI_URL:=}
+      base_url: ${env.TGI_URL:=}
   - provider_id: fireworks
     provider_type: remote::fireworks
    config:
-      url: https://api.fireworks.ai/inference/v1
+      base_url: https://api.fireworks.ai/inference/v1
       api_key: ${env.FIREWORKS_API_KEY:=}
   - provider_id: together
     provider_type: remote::together
     config:
-      url: https://api.together.xyz/v1
+      base_url: https://api.together.xyz/v1
       api_key: ${env.TOGETHER_API_KEY:=}
   - provider_id: bedrock
     provider_type: remote::bedrock
@@ -52,9 +52,8 @@ providers:
   - provider_id: ${env.NVIDIA_API_KEY:+nvidia}
     provider_type: remote::nvidia
     config:
-      url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}
+      base_url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com/v1}
       api_key: ${env.NVIDIA_API_KEY:=}
-      append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
   - provider_id: openai
     provider_type: remote::openai
     config:
@@ -76,18 +75,18 @@ providers:
   - provider_id: groq
     provider_type: remote::groq
     config:
-      url: https://api.groq.com
+      base_url: https://api.groq.com/openai/v1
       api_key: ${env.GROQ_API_KEY:=}
   - provider_id: sambanova
     provider_type: remote::sambanova
     config:
-      url: https://api.sambanova.ai/v1
+      base_url: https://api.sambanova.ai/v1
       api_key: ${env.SAMBANOVA_API_KEY:=}
   - provider_id: ${env.AZURE_API_KEY:+azure}
     provider_type: remote::azure
     config:
       api_key: ${env.AZURE_API_KEY:=}
-      api_base: ${env.AZURE_API_BASE:=}
+      base_url: ${env.AZURE_API_BASE:=}
       api_version: ${env.AZURE_API_VERSION:=}
       api_type: ${env.AZURE_API_TYPE:=}
   - provider_id: sentence-transformers
diff --git a/src/llama_stack/distributions/ci-tests/run.yaml b/src/llama_stack/distributions/ci-tests/run.yaml
index 1118d2ad1..8b1cd2bb2 100644
--- a/src/llama_stack/distributions/ci-tests/run.yaml
+++ b/src/llama_stack/distributions/ci-tests/run.yaml
@@ -17,32 +17,32 @@ providers:
   - provider_id: ${env.CEREBRAS_API_KEY:+cerebras}
     provider_type: remote::cerebras
     config:
-      base_url: https://api.cerebras.ai
+      base_url: https://api.cerebras.ai/v1
       api_key: ${env.CEREBRAS_API_KEY:=}
   - provider_id: ${env.OLLAMA_URL:+ollama}
     provider_type: remote::ollama
     config:
-      url: ${env.OLLAMA_URL:=http://localhost:11434}
+      base_url: ${env.OLLAMA_URL:=http://localhost:11434/v1}
   - provider_id: ${env.VLLM_URL:+vllm}
     provider_type: remote::vllm
     config:
-      url: ${env.VLLM_URL:=}
+      base_url: ${env.VLLM_URL:=}
       max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
       api_token: ${env.VLLM_API_TOKEN:=fake}
       tls_verify: ${env.VLLM_TLS_VERIFY:=true}
   - provider_id: ${env.TGI_URL:+tgi}
     provider_type: remote::tgi
     config:
-      url: ${env.TGI_URL:=}
+      base_url: ${env.TGI_URL:=}
   - provider_id: fireworks
     provider_type: remote::fireworks
     config:
-      url: https://api.fireworks.ai/inference/v1
+      base_url: https://api.fireworks.ai/inference/v1
       api_key: ${env.FIREWORKS_API_KEY:=}
   - provider_id: together
     provider_type: remote::together
     config:
-      url: https://api.together.xyz/v1
+      base_url: https://api.together.xyz/v1
       api_key: ${env.TOGETHER_API_KEY:=}
   - provider_id: bedrock
     provider_type: remote::bedrock
@@ -52,9 +52,8 @@ providers:
   - provider_id: ${env.NVIDIA_API_KEY:+nvidia}
     provider_type: remote::nvidia
     config:
-      url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}
+      base_url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com/v1}
       api_key: ${env.NVIDIA_API_KEY:=}
-      append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
   - provider_id: openai
     provider_type: remote::openai
     config:
@@ -76,18 +75,18 @@ providers:
   - provider_id: groq
     provider_type: remote::groq
     config:
-      url: https://api.groq.com
+      base_url: https://api.groq.com/openai/v1
       api_key: ${env.GROQ_API_KEY:=}
   - provider_id: sambanova
     provider_type: remote::sambanova
     config:
-      url: https://api.sambanova.ai/v1
+      base_url: https://api.sambanova.ai/v1
       api_key: ${env.SAMBANOVA_API_KEY:=}
   - provider_id: ${env.AZURE_API_KEY:+azure}
     provider_type: remote::azure
     config:
       api_key: ${env.AZURE_API_KEY:=}
-      api_base: ${env.AZURE_API_BASE:=}
+      base_url: ${env.AZURE_API_BASE:=}
       api_version: ${env.AZURE_API_VERSION:=}
       api_type: ${env.AZURE_API_TYPE:=}
   - provider_id: sentence-transformers
diff --git a/src/llama_stack/distributions/nvidia/run-with-safety.yaml b/src/llama_stack/distributions/nvidia/run-with-safety.yaml
index 1d57ad17a..d2c7dd090 100644
--- a/src/llama_stack/distributions/nvidia/run-with-safety.yaml
+++ b/src/llama_stack/distributions/nvidia/run-with-safety.yaml
@@ -16,9 +16,8 @@ providers:
   - provider_id: nvidia
     provider_type: remote::nvidia
     config:
-      url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}
+      base_url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com/v1}
       api_key: ${env.NVIDIA_API_KEY:=}
-      append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
   - provider_id: nvidia
     provider_type: remote::nvidia
     config:
diff --git a/src/llama_stack/distributions/nvidia/run.yaml b/src/llama_stack/distributions/nvidia/run.yaml
index 8c50b8bfb..c267587c7 100644
--- a/src/llama_stack/distributions/nvidia/run.yaml
+++ b/src/llama_stack/distributions/nvidia/run.yaml
@@ -16,9 +16,8 @@ providers:
   - provider_id: nvidia
     provider_type: remote::nvidia
     config:
-      url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}
+      base_url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com/v1}
       api_key: ${env.NVIDIA_API_KEY:=}
-      append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
 vector_io:
   - provider_id: faiss
     provider_type: inline::faiss
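The NVIDIA entries above drop `append_api_version` because the version segment now travels inside `base_url` itself. A before/after sketch of the equivalence (function names are illustrative only):

```python
# Illustrative equivalence check for the NVIDIA change.
def old_base_url(url: str, append_api_version: bool = True) -> str:
    # Pre-change behavior: the adapter appended /v1 unless told not to.
    return f"{url}/v1" if append_api_version else url

def new_base_url(base_url: str) -> str:
    # Post-change behavior: the configured URL is used verbatim.
    return base_url

assert old_base_url("https://integrate.api.nvidia.com") == new_base_url(
    "https://integrate.api.nvidia.com/v1"
)
# Self-hosted NIMs that previously set append_api_version: false now simply
# configure base_url without the /v1 suffix.
assert old_base_url("http://nim.local:8000", append_api_version=False) == new_base_url(
    "http://nim.local:8000"
)
```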
diff --git a/src/llama_stack/distributions/open-benchmark/run.yaml b/src/llama_stack/distributions/open-benchmark/run.yaml
index 912e48dd3..7ebc58841 100644
--- a/src/llama_stack/distributions/open-benchmark/run.yaml
+++ b/src/llama_stack/distributions/open-benchmark/run.yaml
@@ -27,12 +27,12 @@ providers:
   - provider_id: groq
     provider_type: remote::groq
     config:
-      url: https://api.groq.com
+      base_url: https://api.groq.com/openai/v1
       api_key: ${env.GROQ_API_KEY:=}
   - provider_id: together
     provider_type: remote::together
     config:
-      url: https://api.together.xyz/v1
+      base_url: https://api.together.xyz/v1
       api_key: ${env.TOGETHER_API_KEY:=}
 vector_io:
   - provider_id: sqlite-vec
diff --git a/src/llama_stack/distributions/postgres-demo/run.yaml b/src/llama_stack/distributions/postgres-demo/run.yaml
index dd1c2bc7f..049f519cd 100644
--- a/src/llama_stack/distributions/postgres-demo/run.yaml
+++ b/src/llama_stack/distributions/postgres-demo/run.yaml
@@ -11,7 +11,7 @@ providers:
   - provider_id: vllm-inference
     provider_type: remote::vllm
     config:
-      url: ${env.VLLM_URL:=http://localhost:8000/v1}
+      base_url: ${env.VLLM_URL:=}
       max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
       api_token: ${env.VLLM_API_TOKEN:=fake}
       tls_verify: ${env.VLLM_TLS_VERIFY:=true}
diff --git a/src/llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml b/src/llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml
index e29ada6f4..75cc9d188 100644
--- a/src/llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml
+++ b/src/llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml
@@ -17,32 +17,32 @@ providers:
   - provider_id: ${env.CEREBRAS_API_KEY:+cerebras}
     provider_type: remote::cerebras
     config:
-      base_url: https://api.cerebras.ai
+      base_url: https://api.cerebras.ai/v1
       api_key: ${env.CEREBRAS_API_KEY:=}
   - provider_id: ${env.OLLAMA_URL:+ollama}
     provider_type: remote::ollama
     config:
-      url: ${env.OLLAMA_URL:=http://localhost:11434}
+      base_url: ${env.OLLAMA_URL:=http://localhost:11434/v1}
   - provider_id: ${env.VLLM_URL:+vllm}
     provider_type: remote::vllm
     config:
-      url: ${env.VLLM_URL:=}
+      base_url: ${env.VLLM_URL:=}
       max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
       api_token: ${env.VLLM_API_TOKEN:=fake}
       tls_verify: ${env.VLLM_TLS_VERIFY:=true}
   - provider_id: ${env.TGI_URL:+tgi}
     provider_type: remote::tgi
     config:
-      url: ${env.TGI_URL:=}
+      base_url: ${env.TGI_URL:=}
   - provider_id: fireworks
     provider_type: remote::fireworks
     config:
-      url: https://api.fireworks.ai/inference/v1
+      base_url: https://api.fireworks.ai/inference/v1
       api_key: ${env.FIREWORKS_API_KEY:=}
   - provider_id: together
     provider_type: remote::together
     config:
-      url: https://api.together.xyz/v1
+      base_url: https://api.together.xyz/v1
       api_key: ${env.TOGETHER_API_KEY:=}
   - provider_id: bedrock
     provider_type: remote::bedrock
@@ -52,9 +52,8 @@ providers:
   - provider_id: ${env.NVIDIA_API_KEY:+nvidia}
     provider_type: remote::nvidia
     config:
-      url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}
+      base_url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com/v1}
       api_key: ${env.NVIDIA_API_KEY:=}
-      append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
   - provider_id: openai
     provider_type: remote::openai
     config:
@@ -76,18 +75,18 @@ providers:
   - provider_id: groq
     provider_type: remote::groq
     config:
-      url: https://api.groq.com
+      base_url: https://api.groq.com/openai/v1
       api_key: ${env.GROQ_API_KEY:=}
   - provider_id: sambanova
     provider_type: remote::sambanova
     config:
-      url: https://api.sambanova.ai/v1
+      base_url: https://api.sambanova.ai/v1
       api_key: ${env.SAMBANOVA_API_KEY:=}
   - provider_id: ${env.AZURE_API_KEY:+azure}
     provider_type: remote::azure
     config:
       api_key: ${env.AZURE_API_KEY:=}
-      api_base: ${env.AZURE_API_BASE:=}
+      base_url: ${env.AZURE_API_BASE:=}
       api_version: ${env.AZURE_API_VERSION:=}
       api_type: ${env.AZURE_API_TYPE:=}
   - provider_id: sentence-transformers
diff --git a/src/llama_stack/distributions/starter-gpu/run.yaml b/src/llama_stack/distributions/starter-gpu/run.yaml
index 7149b8659..09c7be5a1 100644
--- a/src/llama_stack/distributions/starter-gpu/run.yaml
+++ b/src/llama_stack/distributions/starter-gpu/run.yaml
@@ -17,32 +17,32 @@ providers:
   - provider_id: ${env.CEREBRAS_API_KEY:+cerebras}
     provider_type: remote::cerebras
     config:
-      base_url: https://api.cerebras.ai
+      base_url: https://api.cerebras.ai/v1
       api_key: ${env.CEREBRAS_API_KEY:=}
   - provider_id: ${env.OLLAMA_URL:+ollama}
     provider_type: remote::ollama
     config:
-      url: ${env.OLLAMA_URL:=http://localhost:11434}
+      base_url: ${env.OLLAMA_URL:=http://localhost:11434/v1}
   - provider_id: ${env.VLLM_URL:+vllm}
     provider_type: remote::vllm
     config:
-      url: ${env.VLLM_URL:=}
+      base_url: ${env.VLLM_URL:=}
       max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
       api_token: ${env.VLLM_API_TOKEN:=fake}
       tls_verify: ${env.VLLM_TLS_VERIFY:=true}
   - provider_id: ${env.TGI_URL:+tgi}
     provider_type: remote::tgi
     config:
-      url: ${env.TGI_URL:=}
+      base_url: ${env.TGI_URL:=}
   - provider_id: fireworks
     provider_type: remote::fireworks
     config:
-      url: https://api.fireworks.ai/inference/v1
+      base_url: https://api.fireworks.ai/inference/v1
       api_key: ${env.FIREWORKS_API_KEY:=}
   - provider_id: together
     provider_type: remote::together
     config:
-      url: https://api.together.xyz/v1
+      base_url: https://api.together.xyz/v1
       api_key: ${env.TOGETHER_API_KEY:=}
   - provider_id: bedrock
     provider_type: remote::bedrock
@@ -52,9 +52,8 @@ providers:
   - provider_id: ${env.NVIDIA_API_KEY:+nvidia}
     provider_type: remote::nvidia
     config:
-      url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}
+      base_url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com/v1}
       api_key: ${env.NVIDIA_API_KEY:=}
-      append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
   - provider_id: openai
     provider_type: remote::openai
     config:
@@ -76,18 +75,18 @@ providers:
   - provider_id: groq
     provider_type: remote::groq
     config:
-      url: https://api.groq.com
+      base_url: https://api.groq.com/openai/v1
       api_key: ${env.GROQ_API_KEY:=}
   - provider_id: sambanova
     provider_type: remote::sambanova
     config:
-      url: https://api.sambanova.ai/v1
+      base_url: https://api.sambanova.ai/v1
       api_key: ${env.SAMBANOVA_API_KEY:=}
   - provider_id: ${env.AZURE_API_KEY:+azure}
     provider_type: remote::azure
     config:
       api_key: ${env.AZURE_API_KEY:=}
-      api_base: ${env.AZURE_API_BASE:=}
+      base_url: ${env.AZURE_API_BASE:=}
       api_version: ${env.AZURE_API_VERSION:=}
       api_type: ${env.AZURE_API_TYPE:=}
   - provider_id: sentence-transformers
diff --git a/src/llama_stack/distributions/starter/run-with-postgres-store.yaml b/src/llama_stack/distributions/starter/run-with-postgres-store.yaml
index 437674bf9..f59c809d2 100644
--- a/src/llama_stack/distributions/starter/run-with-postgres-store.yaml
+++ b/src/llama_stack/distributions/starter/run-with-postgres-store.yaml
@@ -17,32 +17,32 @@ providers:
   - provider_id: ${env.CEREBRAS_API_KEY:+cerebras}
     provider_type: remote::cerebras
     config:
-      base_url: https://api.cerebras.ai
+      base_url: https://api.cerebras.ai/v1
       api_key: ${env.CEREBRAS_API_KEY:=}
   - provider_id: ${env.OLLAMA_URL:+ollama}
     provider_type: remote::ollama
     config:
-      url: ${env.OLLAMA_URL:=http://localhost:11434}
+      base_url: ${env.OLLAMA_URL:=http://localhost:11434/v1}
   - provider_id: ${env.VLLM_URL:+vllm}
     provider_type: remote::vllm
     config:
-      url: ${env.VLLM_URL:=}
+      base_url: ${env.VLLM_URL:=}
       max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
       api_token: ${env.VLLM_API_TOKEN:=fake}
       tls_verify: ${env.VLLM_TLS_VERIFY:=true}
   - provider_id: ${env.TGI_URL:+tgi}
     provider_type: remote::tgi
     config:
-      url: ${env.TGI_URL:=}
+      base_url: ${env.TGI_URL:=}
   - provider_id: fireworks
     provider_type: remote::fireworks
     config:
-      url: https://api.fireworks.ai/inference/v1
+      base_url: https://api.fireworks.ai/inference/v1
       api_key: ${env.FIREWORKS_API_KEY:=}
   - provider_id: together
     provider_type: remote::together
     config:
-      url: https://api.together.xyz/v1
+      base_url: https://api.together.xyz/v1
       api_key: ${env.TOGETHER_API_KEY:=}
   - provider_id: bedrock
     provider_type: remote::bedrock
@@ -52,9 +52,8 @@ providers:
   - provider_id: ${env.NVIDIA_API_KEY:+nvidia}
     provider_type: remote::nvidia
     config:
-      url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}
+      base_url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com/v1}
       api_key: ${env.NVIDIA_API_KEY:=}
-      append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
   - provider_id: openai
     provider_type: remote::openai
     config:
@@ -76,18 +75,18 @@ providers:
   - provider_id: groq
     provider_type: remote::groq
     config:
-      url: https://api.groq.com
+      base_url: https://api.groq.com/openai/v1
       api_key: ${env.GROQ_API_KEY:=}
   - provider_id: sambanova
     provider_type: remote::sambanova
     config:
-      url: https://api.sambanova.ai/v1
+      base_url: https://api.sambanova.ai/v1
       api_key: ${env.SAMBANOVA_API_KEY:=}
   - provider_id: ${env.AZURE_API_KEY:+azure}
     provider_type: remote::azure
     config:
       api_key: ${env.AZURE_API_KEY:=}
-      api_base: ${env.AZURE_API_BASE:=}
+      base_url: ${env.AZURE_API_BASE:=}
       api_version: ${env.AZURE_API_VERSION:=}
       api_type: ${env.AZURE_API_TYPE:=}
   - provider_id: sentence-transformers
diff --git a/src/llama_stack/distributions/starter/run.yaml b/src/llama_stack/distributions/starter/run.yaml
index 0ce392810..435bb22a7 100644
--- a/src/llama_stack/distributions/starter/run.yaml
+++ b/src/llama_stack/distributions/starter/run.yaml
@@ -17,32 +17,32 @@ providers:
   - provider_id: ${env.CEREBRAS_API_KEY:+cerebras}
     provider_type: remote::cerebras
     config:
-      base_url: https://api.cerebras.ai
+      base_url: https://api.cerebras.ai/v1
       api_key: ${env.CEREBRAS_API_KEY:=}
   - provider_id: ${env.OLLAMA_URL:+ollama}
     provider_type: remote::ollama
     config:
-      url: ${env.OLLAMA_URL:=http://localhost:11434}
+      base_url: ${env.OLLAMA_URL:=http://localhost:11434/v1}
   - provider_id: ${env.VLLM_URL:+vllm}
     provider_type: remote::vllm
     config:
-      url: ${env.VLLM_URL:=}
+      base_url: ${env.VLLM_URL:=}
       max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
       api_token: ${env.VLLM_API_TOKEN:=fake}
       tls_verify: ${env.VLLM_TLS_VERIFY:=true}
   - provider_id: ${env.TGI_URL:+tgi}
     provider_type: remote::tgi
     config:
-      url: ${env.TGI_URL:=}
+      base_url: ${env.TGI_URL:=}
   - provider_id: fireworks
     provider_type: remote::fireworks
     config:
-      url: https://api.fireworks.ai/inference/v1
+      base_url: https://api.fireworks.ai/inference/v1
       api_key: ${env.FIREWORKS_API_KEY:=}
   - provider_id: together
     provider_type: remote::together
     config:
-      url: https://api.together.xyz/v1
+      base_url: https://api.together.xyz/v1
       api_key: ${env.TOGETHER_API_KEY:=}
   - provider_id: bedrock
     provider_type: remote::bedrock
@@ -52,9 +52,8 @@ providers:
   - provider_id: ${env.NVIDIA_API_KEY:+nvidia}
     provider_type: remote::nvidia
     config:
-      url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}
+      base_url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com/v1}
       api_key: ${env.NVIDIA_API_KEY:=}
-      append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
   - provider_id: openai
     provider_type: remote::openai
     config:
@@ -76,18 +75,18 @@ providers:
   - provider_id: groq
     provider_type: remote::groq
     config:
-      url: https://api.groq.com
+      base_url: https://api.groq.com/openai/v1
       api_key: ${env.GROQ_API_KEY:=}
   - provider_id: sambanova
     provider_type: remote::sambanova
     config:
-      url: https://api.sambanova.ai/v1
+      base_url: https://api.sambanova.ai/v1
       api_key: ${env.SAMBANOVA_API_KEY:=}
   - provider_id: ${env.AZURE_API_KEY:+azure}
     provider_type: remote::azure
     config:
       api_key: ${env.AZURE_API_KEY:=}
-      api_base: ${env.AZURE_API_BASE:=}
+      base_url: ${env.AZURE_API_BASE:=}
       api_version: ${env.AZURE_API_VERSION:=}
       api_type: ${env.AZURE_API_TYPE:=}
   - provider_id: sentence-transformers
diff --git a/src/llama_stack/distributions/watsonx/run.yaml b/src/llama_stack/distributions/watsonx/run.yaml
index 8456115d2..f8c489fe3 100644
--- a/src/llama_stack/distributions/watsonx/run.yaml
+++ b/src/llama_stack/distributions/watsonx/run.yaml
@@ -15,7 +15,7 @@ providers:
   - provider_id: watsonx
     provider_type: remote::watsonx
     config:
-      url: ${env.WATSONX_BASE_URL:=https://us-south.ml.cloud.ibm.com}
+      base_url: ${env.WATSONX_BASE_URL:=https://us-south.ml.cloud.ibm.com}
       api_key: ${env.WATSONX_API_KEY:=}
       project_id: ${env.WATSONX_PROJECT_ID:=}
 vector_io:
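The provider code below switches these fields from `str` to pydantic's `HttpUrl`, so values are validated when the config loads and adapters must stringify them before handing them to an HTTP client. A minimal sketch of that behavior (field and class names are illustrative):

```python
# Minimal sketch of the str -> HttpUrl switch made in the configs below.
from pydantic import BaseModel, HttpUrl, ValidationError

class Cfg(BaseModel):
    base_url: HttpUrl | None = None

cfg = Cfg(base_url="https://api.fireworks.ai/inference/v1")
assert not isinstance(cfg.base_url, str)  # a URL object, not a plain string
assert str(cfg.base_url) == "https://api.fireworks.ai/inference/v1"

try:
    Cfg(base_url="not a url")  # malformed URLs fail at load time, not request time
except ValidationError as exc:
    print(exc.error_count(), "validation error")
```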
diff --git a/src/llama_stack/providers/remote/inference/azure/azure.py b/src/llama_stack/providers/remote/inference/azure/azure.py
index 134d01b15..c977d75d5 100644
--- a/src/llama_stack/providers/remote/inference/azure/azure.py
+++ b/src/llama_stack/providers/remote/inference/azure/azure.py
@@ -4,8 +4,6 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from urllib.parse import urljoin
-
 from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
 
 from .config import AzureConfig
@@ -22,4 +20,4 @@ class AzureInferenceAdapter(OpenAIMixin):
 
         Returns the Azure API base URL from the configuration.
         """
-        return urljoin(str(self.config.api_base), "/openai/v1")
+        return str(self.config.base_url)
diff --git a/src/llama_stack/providers/remote/inference/azure/config.py b/src/llama_stack/providers/remote/inference/azure/config.py
index b801b91b2..f6407a183 100644
--- a/src/llama_stack/providers/remote/inference/azure/config.py
+++ b/src/llama_stack/providers/remote/inference/azure/config.py
@@ -32,8 +32,9 @@ class AzureProviderDataValidator(BaseModel):
 
 @json_schema_type
 class AzureConfig(RemoteInferenceProviderConfig):
-    api_base: HttpUrl = Field(
-        description="Azure API base for Azure (e.g., https://your-resource-name.openai.azure.com)",
+    base_url: HttpUrl | None = Field(
+        default=None,
+        description="Azure API base for Azure (e.g., https://your-resource-name.openai.azure.com/openai/v1)",
     )
     api_version: str | None = Field(
         default_factory=lambda: os.getenv("AZURE_API_VERSION"),
@@ -48,14 +49,14 @@ class AzureConfig(RemoteInferenceProviderConfig):
     def sample_run_config(
         cls,
         api_key: str = "${env.AZURE_API_KEY:=}",
-        api_base: str = "${env.AZURE_API_BASE:=}",
+        base_url: str = "${env.AZURE_API_BASE:=}",
         api_version: str = "${env.AZURE_API_VERSION:=}",
         api_type: str = "${env.AZURE_API_TYPE:=}",
         **kwargs,
     ) -> dict[str, Any]:
         return {
             "api_key": api_key,
-            "api_base": api_base,
+            "base_url": base_url,
             "api_version": api_version,
             "api_type": api_type,
         }
diff --git a/src/llama_stack/providers/remote/inference/cerebras/cerebras.py b/src/llama_stack/providers/remote/inference/cerebras/cerebras.py
index 680431e22..23c27df1e 100644
--- a/src/llama_stack/providers/remote/inference/cerebras/cerebras.py
+++ b/src/llama_stack/providers/remote/inference/cerebras/cerebras.py
@@ -4,8 +4,6 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from urllib.parse import urljoin
-
 from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
 from llama_stack_api import (
     OpenAIEmbeddingsRequestWithExtraBody,
@@ -21,7 +19,7 @@ class CerebrasInferenceAdapter(OpenAIMixin):
     provider_data_api_key_field: str = "cerebras_api_key"
 
     def get_base_url(self) -> str:
-        return urljoin(self.config.base_url, "v1")
+        return str(self.config.base_url)
 
     async def openai_embeddings(
         self,
diff --git a/src/llama_stack/providers/remote/inference/cerebras/config.py b/src/llama_stack/providers/remote/inference/cerebras/config.py
index db357fd1c..ea88abbea 100644
--- a/src/llama_stack/providers/remote/inference/cerebras/config.py
+++ b/src/llama_stack/providers/remote/inference/cerebras/config.py
@@ -7,12 +7,12 @@
 import os
 from typing import Any
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, HttpUrl
 
 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
 from llama_stack_api import json_schema_type
 
-DEFAULT_BASE_URL = "https://api.cerebras.ai"
+DEFAULT_BASE_URL = "https://api.cerebras.ai/v1"
 
 
 class CerebrasProviderDataValidator(BaseModel):
@@ -24,8 +24,8 @@ class CerebrasProviderDataValidator(BaseModel):
 
 @json_schema_type
 class CerebrasImplConfig(RemoteInferenceProviderConfig):
-    base_url: str = Field(
-        default=os.environ.get("CEREBRAS_BASE_URL", DEFAULT_BASE_URL),
+    base_url: HttpUrl | None = Field(
+        default=HttpUrl(os.environ.get("CEREBRAS_BASE_URL", DEFAULT_BASE_URL)),
         description="Base URL for the Cerebras API",
     )
diff --git a/src/llama_stack/providers/remote/inference/databricks/config.py b/src/llama_stack/providers/remote/inference/databricks/config.py
index bd409fa13..44cb862f9 100644
--- a/src/llama_stack/providers/remote/inference/databricks/config.py
+++ b/src/llama_stack/providers/remote/inference/databricks/config.py
@@ -6,7 +6,7 @@
 
 from typing import Any
 
-from pydantic import BaseModel, Field, SecretStr
+from pydantic import BaseModel, Field, HttpUrl, SecretStr
 
 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
 from llama_stack_api import json_schema_type
@@ -21,9 +21,9 @@ class DatabricksProviderDataValidator(BaseModel):
 
 @json_schema_type
 class DatabricksImplConfig(RemoteInferenceProviderConfig):
-    url: str | None = Field(
+    base_url: HttpUrl | None = Field(
         default=None,
-        description="The URL for the Databricks model serving endpoint",
+        description="The URL for the Databricks model serving endpoint (should include /serving-endpoints path)",
     )
     auth_credential: SecretStr | None = Field(
         default=None,
@@ -34,11 +34,11 @@ class DatabricksImplConfig(RemoteInferenceProviderConfig):
     @classmethod
     def sample_run_config(
         cls,
-        url: str = "${env.DATABRICKS_HOST:=}",
+        base_url: str = "${env.DATABRICKS_HOST:=}",
         api_token: str = "${env.DATABRICKS_TOKEN:=}",
         **kwargs: Any,
     ) -> dict[str, Any]:
         return {
-            "url": url,
+            "base_url": base_url,
             "api_token": api_token,
         }
diff --git a/src/llama_stack/providers/remote/inference/databricks/databricks.py b/src/llama_stack/providers/remote/inference/databricks/databricks.py
index c07d97b67..f2f8832f6 100644
--- a/src/llama_stack/providers/remote/inference/databricks/databricks.py
+++ b/src/llama_stack/providers/remote/inference/databricks/databricks.py
@@ -29,15 +29,17 @@
     }
 
     def get_base_url(self) -> str:
-        return f"{self.config.url}/serving-endpoints"
+        return str(self.config.base_url)
 
     async def list_provider_model_ids(self) -> Iterable[str]:
         # Filter out None values from endpoint names
        api_token = self._get_api_key_from_config_or_provider_data()
+        # WorkspaceClient expects the workspace host without the /serving-endpoints suffix
+        host = str(self.config.base_url).removesuffix("/serving-endpoints")
         return [
             endpoint.name  # type: ignore[misc]
             for endpoint in WorkspaceClient(
-                host=self.config.url, token=api_token
+                host=host, token=api_token
             ).serving_endpoints.list()  # TODO: this is not async
         ]
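The Fireworks and Groq adapters below previously hard-coded the upstream URL or appended `/openai/v1` in code, silently ignoring overrides; with a fully versioned `base_url` they now just stringify the config value. A small illustrative check (class names are assumed, not the stack's real ones):

```python
# Sketch of why the hard-coded URLs had to go: the adapter now returns
# whatever base_url the config carries.
from pydantic import BaseModel, HttpUrl

class FireworksCfg(BaseModel):
    base_url: HttpUrl | None = HttpUrl("https://api.fireworks.ai/inference/v1")

class Adapter:
    def __init__(self, config: FireworksCfg) -> None:
        self.config = config

    def get_base_url(self) -> str:
        return str(self.config.base_url)

# Pointing base_url at a local proxy is honored instead of being overridden.
adapter = Adapter(FireworksCfg(base_url="http://localhost:8080/v1"))
assert adapter.get_base_url() == "http://localhost:8080/v1"
```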
diff --git a/src/llama_stack/providers/remote/inference/fireworks/config.py b/src/llama_stack/providers/remote/inference/fireworks/config.py
index e36c76054..c59b5f270 100644
--- a/src/llama_stack/providers/remote/inference/fireworks/config.py
+++ b/src/llama_stack/providers/remote/inference/fireworks/config.py
@@ -6,7 +6,7 @@
 
 from typing import Any
 
-from pydantic import Field
+from pydantic import Field, HttpUrl
 
 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
 from llama_stack_api import json_schema_type
@@ -14,14 +14,14 @@ from llama_stack_api import json_schema_type
 
 @json_schema_type
 class FireworksImplConfig(RemoteInferenceProviderConfig):
-    url: str = Field(
-        default="https://api.fireworks.ai/inference/v1",
+    base_url: HttpUrl | None = Field(
+        default=HttpUrl("https://api.fireworks.ai/inference/v1"),
         description="The URL for the Fireworks server",
     )
 
     @classmethod
     def sample_run_config(cls, api_key: str = "${env.FIREWORKS_API_KEY:=}", **kwargs) -> dict[str, Any]:
         return {
-            "url": "https://api.fireworks.ai/inference/v1",
+            "base_url": "https://api.fireworks.ai/inference/v1",
             "api_key": api_key,
         }
diff --git a/src/llama_stack/providers/remote/inference/fireworks/fireworks.py b/src/llama_stack/providers/remote/inference/fireworks/fireworks.py
index 7e2b73546..61ea0b1f6 100644
--- a/src/llama_stack/providers/remote/inference/fireworks/fireworks.py
+++ b/src/llama_stack/providers/remote/inference/fireworks/fireworks.py
@@ -24,4 +24,4 @@ class FireworksInferenceAdapter(OpenAIMixin):
     provider_data_api_key_field: str = "fireworks_api_key"
 
     def get_base_url(self) -> str:
-        return "https://api.fireworks.ai/inference/v1"
+        return str(self.config.base_url)
diff --git a/src/llama_stack/providers/remote/inference/groq/config.py b/src/llama_stack/providers/remote/inference/groq/config.py
index cca53a4e8..e5c29c271 100644
--- a/src/llama_stack/providers/remote/inference/groq/config.py
+++ b/src/llama_stack/providers/remote/inference/groq/config.py
@@ -6,7 +6,7 @@
 
 from typing import Any
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, HttpUrl
 
 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
 from llama_stack_api import json_schema_type
@@ -21,14 +21,14 @@ class GroqProviderDataValidator(BaseModel):
 
 @json_schema_type
 class GroqConfig(RemoteInferenceProviderConfig):
-    url: str = Field(
-        default="https://api.groq.com",
+    base_url: HttpUrl | None = Field(
+        default=HttpUrl("https://api.groq.com/openai/v1"),
         description="The URL for the Groq AI server",
     )
 
     @classmethod
     def sample_run_config(cls, api_key: str = "${env.GROQ_API_KEY:=}", **kwargs) -> dict[str, Any]:
         return {
-            "url": "https://api.groq.com",
+            "base_url": "https://api.groq.com/openai/v1",
             "api_key": api_key,
         }
diff --git a/src/llama_stack/providers/remote/inference/groq/groq.py b/src/llama_stack/providers/remote/inference/groq/groq.py
index 3a4f2626d..f99de91ca 100644
--- a/src/llama_stack/providers/remote/inference/groq/groq.py
+++ b/src/llama_stack/providers/remote/inference/groq/groq.py
@@ -15,4 +15,4 @@ class GroqInferenceAdapter(OpenAIMixin):
     provider_data_api_key_field: str = "groq_api_key"
 
     def get_base_url(self) -> str:
-        return f"{self.config.url}/openai/v1"
+        return str(self.config.base_url)
diff --git a/src/llama_stack/providers/remote/inference/llama_openai_compat/config.py b/src/llama_stack/providers/remote/inference/llama_openai_compat/config.py
index ded210d89..a0f80d969 100644
--- a/src/llama_stack/providers/remote/inference/llama_openai_compat/config.py
+++ b/src/llama_stack/providers/remote/inference/llama_openai_compat/config.py
@@ -6,7 +6,7 @@
 
 from typing import Any
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, HttpUrl
 
 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
 from llama_stack_api import json_schema_type
@@ -21,14 +21,14 @@ class LlamaProviderDataValidator(BaseModel):
 
 @json_schema_type
 class LlamaCompatConfig(RemoteInferenceProviderConfig):
-    openai_compat_api_base: str = Field(
-        default="https://api.llama.com/compat/v1/",
+    base_url: HttpUrl | None = Field(
+        default=HttpUrl("https://api.llama.com/compat/v1/"),
         description="The URL for the Llama API server",
     )
 
     @classmethod
     def sample_run_config(cls, api_key: str = "${env.LLAMA_API_KEY}", **kwargs) -> dict[str, Any]:
         return {
-            "openai_compat_api_base": "https://api.llama.com/compat/v1/",
+            "base_url": "https://api.llama.com/compat/v1/",
             "api_key": api_key,
         }
diff --git a/src/llama_stack/providers/remote/inference/llama_openai_compat/llama.py b/src/llama_stack/providers/remote/inference/llama_openai_compat/llama.py
index a5f67ecd1..f29aebf36 100644
--- a/src/llama_stack/providers/remote/inference/llama_openai_compat/llama.py
+++ b/src/llama_stack/providers/remote/inference/llama_openai_compat/llama.py
@@ -31,7 +31,7 @@ class LlamaCompatInferenceAdapter(OpenAIMixin):
 
         :return: The Llama API base URL
         """
-        return self.config.openai_compat_api_base
+        return str(self.config.base_url)
 
     async def openai_completion(
         self,
diff --git a/src/llama_stack/providers/remote/inference/nvidia/config.py b/src/llama_stack/providers/remote/inference/nvidia/config.py
index e5b0c6b73..e1e9a0ea9 100644
--- a/src/llama_stack/providers/remote/inference/nvidia/config.py
+++ b/src/llama_stack/providers/remote/inference/nvidia/config.py
@@ -7,7 +7,7 @@
 import os
 from typing import Any
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, HttpUrl
 
 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
 from llama_stack_api import json_schema_type
@@ -44,18 +44,14 @@ class NVIDIAConfig(RemoteInferenceProviderConfig):
     URL of your running NVIDIA NIM and do not need to set the api_key.
     """
 
-    url: str = Field(
-        default_factory=lambda: os.getenv("NVIDIA_BASE_URL", "https://integrate.api.nvidia.com"),
+    base_url: HttpUrl | None = Field(
+        default_factory=lambda: HttpUrl(os.getenv("NVIDIA_BASE_URL", "https://integrate.api.nvidia.com/v1")),
         description="A base url for accessing the NVIDIA NIM",
     )
     timeout: int = Field(
         default=60,
         description="Timeout for the HTTP requests",
     )
-    append_api_version: bool = Field(
-        default_factory=lambda: os.getenv("NVIDIA_APPEND_API_VERSION", "True").lower() != "false",
-        description="When set to false, the API version will not be appended to the base_url. By default, it is true.",
-    )
     rerank_model_to_url: dict[str, str] = Field(
         default_factory=lambda: {
             "nv-rerank-qa-mistral-4b:1": "https://ai.api.nvidia.com/v1/retrieval/nvidia/reranking",
@@ -68,13 +64,11 @@ class NVIDIAConfig(RemoteInferenceProviderConfig):
     @classmethod
     def sample_run_config(
         cls,
-        url: str = "${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}",
+        base_url: str = "${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com/v1}",
         api_key: str = "${env.NVIDIA_API_KEY:=}",
-        append_api_version: bool = "${env.NVIDIA_APPEND_API_VERSION:=True}",
         **kwargs,
     ) -> dict[str, Any]:
         return {
-            "url": url,
+            "base_url": base_url,
             "api_key": api_key,
-            "append_api_version": append_api_version,
         }
diff --git a/src/llama_stack/providers/remote/inference/nvidia/nvidia.py b/src/llama_stack/providers/remote/inference/nvidia/nvidia.py
index 17f8775bf..5d0d52d6a 100644
--- a/src/llama_stack/providers/remote/inference/nvidia/nvidia.py
+++ b/src/llama_stack/providers/remote/inference/nvidia/nvidia.py
@@ -44,7 +44,7 @@ class NVIDIAInferenceAdapter(OpenAIMixin):
     }
 
     async def initialize(self) -> None:
-        logger.info(f"Initializing NVIDIAInferenceAdapter({self.config.url})...")
+        logger.info(f"Initializing NVIDIAInferenceAdapter({self.config.base_url})...")
 
         if _is_nvidia_hosted(self.config):
             if not self.config.auth_credential:
@@ -72,7 +72,7 @@ class NVIDIAInferenceAdapter(OpenAIMixin):
 
         :return: The NVIDIA API base URL
         """
-        return f"{self.config.url}/v1" if self.config.append_api_version else self.config.url
+        return str(self.config.base_url)
 
     async def list_provider_model_ids(self) -> Iterable[str]:
         """
diff --git a/src/llama_stack/providers/remote/inference/nvidia/utils.py b/src/llama_stack/providers/remote/inference/nvidia/utils.py
index 46ee939d9..c138d1fc5 100644
--- a/src/llama_stack/providers/remote/inference/nvidia/utils.py
+++ b/src/llama_stack/providers/remote/inference/nvidia/utils.py
@@ -8,4 +8,4 @@ from . import NVIDIAConfig
 
 
 def _is_nvidia_hosted(config: NVIDIAConfig) -> bool:
-    return "integrate.api.nvidia.com" in config.url
+    return "integrate.api.nvidia.com" in str(config.base_url)
diff --git a/src/llama_stack/providers/remote/inference/ollama/config.py b/src/llama_stack/providers/remote/inference/ollama/config.py
index 416b847a0..60dd34fa8 100644
--- a/src/llama_stack/providers/remote/inference/ollama/config.py
+++ b/src/llama_stack/providers/remote/inference/ollama/config.py
@@ -6,20 +6,22 @@
 
 from typing import Any
 
-from pydantic import Field, SecretStr
+from pydantic import Field, HttpUrl, SecretStr
 
 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
 
-DEFAULT_OLLAMA_URL = "http://localhost:11434"
+DEFAULT_OLLAMA_URL = "http://localhost:11434/v1"
 
 
 class OllamaImplConfig(RemoteInferenceProviderConfig):
     auth_credential: SecretStr | None = Field(default=None, exclude=True)
-    url: str = DEFAULT_OLLAMA_URL
+    base_url: HttpUrl | None = Field(default=HttpUrl(DEFAULT_OLLAMA_URL))
 
     @classmethod
-    def sample_run_config(cls, url: str = "${env.OLLAMA_URL:=http://localhost:11434}", **kwargs) -> dict[str, Any]:
+    def sample_run_config(
+        cls, base_url: str = "${env.OLLAMA_URL:=http://localhost:11434/v1}", **kwargs
+    ) -> dict[str, Any]:
         return {
-            "url": url,
+            "base_url": base_url,
         }
diff --git a/src/llama_stack/providers/remote/inference/ollama/ollama.py b/src/llama_stack/providers/remote/inference/ollama/ollama.py
index d1bf85361..e8b872384 100644
--- a/src/llama_stack/providers/remote/inference/ollama/ollama.py
+++ b/src/llama_stack/providers/remote/inference/ollama/ollama.py
@@ -55,17 +55,19 @@ class OllamaInferenceAdapter(OpenAIMixin):
         # ollama client attaches itself to the current event loop (sadly?)
         loop = asyncio.get_running_loop()
         if loop not in self._clients:
-            self._clients[loop] = AsyncOllamaClient(host=self.config.url)
+            # The native Ollama client expects the bare host, without the /v1 suffix
+            host = str(self.config.base_url).removesuffix("/v1")
+            self._clients[loop] = AsyncOllamaClient(host=host)
         return self._clients[loop]
 
     def get_api_key(self):
         return "NO KEY REQUIRED"
 
     def get_base_url(self):
-        return self.config.url.rstrip("/") + "/v1"
+        return str(self.config.base_url)
 
     async def initialize(self) -> None:
-        logger.info(f"checking connectivity to Ollama at `{self.config.url}`...")
+        logger.info(f"checking connectivity to Ollama at `{self.config.base_url}`...")
         r = await self.health()
         if r["status"] == HealthStatus.ERROR:
             logger.warning(
diff --git a/src/llama_stack/providers/remote/inference/openai/config.py b/src/llama_stack/providers/remote/inference/openai/config.py
index ab28e571f..2057cd0d6 100644
--- a/src/llama_stack/providers/remote/inference/openai/config.py
+++ b/src/llama_stack/providers/remote/inference/openai/config.py
@@ -6,7 +6,7 @@
 
 from typing import Any
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, HttpUrl
 
 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
 from llama_stack_api import json_schema_type
@@ -21,8 +21,8 @@ class OpenAIProviderDataValidator(BaseModel):
 
 @json_schema_type
 class OpenAIConfig(RemoteInferenceProviderConfig):
-    base_url: str = Field(
-        default="https://api.openai.com/v1",
+    base_url: HttpUrl | None = Field(
+        default=HttpUrl("https://api.openai.com/v1"),
         description="Base URL for OpenAI API",
     )
diff --git a/src/llama_stack/providers/remote/inference/openai/openai.py b/src/llama_stack/providers/remote/inference/openai/openai.py
index 52bc48f1a..2d465546a 100644
--- a/src/llama_stack/providers/remote/inference/openai/openai.py
+++ b/src/llama_stack/providers/remote/inference/openai/openai.py
@@ -35,4 +35,4 @@ class OpenAIInferenceAdapter(OpenAIMixin):
 
         Returns the OpenAI API base URL from the configuration.
         """
-        return self.config.base_url
+        return str(self.config.base_url)
diff --git a/src/llama_stack/providers/remote/inference/passthrough/config.py b/src/llama_stack/providers/remote/inference/passthrough/config.py
index 54508b6fb..f45806e79 100644
--- a/src/llama_stack/providers/remote/inference/passthrough/config.py
+++ b/src/llama_stack/providers/remote/inference/passthrough/config.py
@@ -6,7 +6,7 @@
 
 from typing import Any
 
-from pydantic import Field
+from pydantic import Field, HttpUrl
 
 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
 from llama_stack_api import json_schema_type
@@ -14,16 +14,16 @@ from llama_stack_api import json_schema_type
 
 @json_schema_type
 class PassthroughImplConfig(RemoteInferenceProviderConfig):
-    url: str = Field(
+    base_url: HttpUrl | None = Field(
         default=None,
         description="The URL for the passthrough endpoint",
     )
 
     @classmethod
     def sample_run_config(
-        cls, url: str = "${env.PASSTHROUGH_URL}", api_key: str = "${env.PASSTHROUGH_API_KEY}", **kwargs
+        cls, base_url: str = "${env.PASSTHROUGH_URL}", api_key: str = "${env.PASSTHROUGH_API_KEY}", **kwargs
     ) -> dict[str, Any]:
         return {
-            "url": url,
+            "base_url": base_url,
             "api_key": api_key,
         }
diff --git a/src/llama_stack/providers/remote/inference/passthrough/passthrough.py b/src/llama_stack/providers/remote/inference/passthrough/passthrough.py
index 75eedf026..b0e2e74ad 100644
--- a/src/llama_stack/providers/remote/inference/passthrough/passthrough.py
+++ b/src/llama_stack/providers/remote/inference/passthrough/passthrough.py
@@ -82,8 +82,8 @@ class PassthroughInferenceAdapter(NeedsRequestProviderData, Inference):
 
     def _get_passthrough_url(self) -> str:
         """Get the passthrough URL from config or provider data."""
-        if self.config.url is not None:
-            return self.config.url
+        if self.config.base_url is not None:
+            return str(self.config.base_url)
 
         provider_data = self.get_request_provider_data()
         if provider_data is None:
diff --git a/src/llama_stack/providers/remote/inference/runpod/config.py b/src/llama_stack/providers/remote/inference/runpod/config.py
index 2ee56ca94..8d06f5263 100644
--- a/src/llama_stack/providers/remote/inference/runpod/config.py
+++ b/src/llama_stack/providers/remote/inference/runpod/config.py
@@ -6,7 +6,7 @@
 
 from typing import Any
 
-from pydantic import BaseModel, Field, SecretStr
+from pydantic import BaseModel, Field, HttpUrl, SecretStr
 
 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
 from llama_stack_api import json_schema_type
@@ -21,7 +21,7 @@ class RunpodProviderDataValidator(BaseModel):
 
 @json_schema_type
 class RunpodImplConfig(RemoteInferenceProviderConfig):
-    url: str | None = Field(
+    base_url: HttpUrl | None = Field(
         default=None,
         description="The URL for the Runpod model serving endpoint",
     )
@@ -34,6 +34,6 @@ class RunpodImplConfig(RemoteInferenceProviderConfig):
     @classmethod
     def sample_run_config(cls, **kwargs: Any) -> dict[str, Any]:
         return {
-            "url": "${env.RUNPOD_URL:=}",
+            "base_url": "${env.RUNPOD_URL:=}",
             "api_token": "${env.RUNPOD_API_TOKEN}",
         }
diff --git a/src/llama_stack/providers/remote/inference/runpod/runpod.py b/src/llama_stack/providers/remote/inference/runpod/runpod.py
index 9c770cc24..04ad12851 100644
--- a/src/llama_stack/providers/remote/inference/runpod/runpod.py
+++ b/src/llama_stack/providers/remote/inference/runpod/runpod.py
@@ -28,7 +28,7 @@ class RunpodInferenceAdapter(OpenAIMixin):
 
     def get_base_url(self) -> str:
         """Get base URL for OpenAI client."""
-        return self.config.url
+        return str(self.config.base_url)
 
     async def openai_chat_completion(
         self,
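The passthrough adapter above keeps its fallback order: an explicit `base_url` in config wins, otherwise per-request provider data supplies the endpoint. A condensed sketch of that logic (the provider-data key name here is an assumption for illustration):

```python
# Illustrative fallback order for the passthrough endpoint.
from typing import Any

def passthrough_url(config_base_url: str | None, provider_data: dict[str, Any] | None) -> str:
    if config_base_url is not None:
        return config_base_url  # explicit config wins
    if provider_data and provider_data.get("passthrough_url"):
        return provider_data["passthrough_url"]  # per-request override
    raise ValueError("No passthrough endpoint configured")

assert passthrough_url("http://stack-a:8321", None) == "http://stack-a:8321"
assert passthrough_url(None, {"passthrough_url": "http://stack-b:8321"}) == "http://stack-b:8321"
```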
diff --git a/src/llama_stack/providers/remote/inference/runpod/runpod.py b/src/llama_stack/providers/remote/inference/runpod/runpod.py
index 9c770cc24..04ad12851 100644
--- a/src/llama_stack/providers/remote/inference/runpod/runpod.py
+++ b/src/llama_stack/providers/remote/inference/runpod/runpod.py
@@ -28,7 +28,7 @@ class RunpodInferenceAdapter(OpenAIMixin):

     def get_base_url(self) -> str:
         """Get base URL for OpenAI client."""
-        return self.config.url
+        return str(self.config.base_url)

     async def openai_chat_completion(
         self,
diff --git a/src/llama_stack/providers/remote/inference/sambanova/config.py b/src/llama_stack/providers/remote/inference/sambanova/config.py
index 93679ba99..79cda75a0 100644
--- a/src/llama_stack/providers/remote/inference/sambanova/config.py
+++ b/src/llama_stack/providers/remote/inference/sambanova/config.py
@@ -6,7 +6,7 @@
 from typing import Any

-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, HttpUrl

 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
 from llama_stack_api import json_schema_type
@@ -21,14 +21,14 @@ class SambaNovaProviderDataValidator(BaseModel):

 @json_schema_type
 class SambaNovaImplConfig(RemoteInferenceProviderConfig):
-    url: str = Field(
-        default="https://api.sambanova.ai/v1",
+    base_url: HttpUrl | None = Field(
+        default=HttpUrl("https://api.sambanova.ai/v1"),
         description="The URL for the SambaNova AI server",
     )

     @classmethod
     def sample_run_config(cls, api_key: str = "${env.SAMBANOVA_API_KEY:=}", **kwargs) -> dict[str, Any]:
         return {
-            "url": "https://api.sambanova.ai/v1",
+            "base_url": "https://api.sambanova.ai/v1",
             "api_key": api_key,
         }
diff --git a/src/llama_stack/providers/remote/inference/sambanova/sambanova.py b/src/llama_stack/providers/remote/inference/sambanova/sambanova.py
index daa4b1670..cb01e3a90 100644
--- a/src/llama_stack/providers/remote/inference/sambanova/sambanova.py
+++ b/src/llama_stack/providers/remote/inference/sambanova/sambanova.py
@@ -25,4 +25,4 @@ class SambaNovaInferenceAdapter(OpenAIMixin):

         :return: The SambaNova base URL
         """
-        return self.config.url
+        return str(self.config.base_url)
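One practical effect of tightening `url: str` to `base_url: HttpUrl | None`, shown here with a throwaway pydantic model rather than a llama-stack class: malformed endpoints now fail at config-validation time instead of on the first request.

```python
from pydantic import BaseModel, HttpUrl, ValidationError


class Cfg(BaseModel):  # stand-in for a provider config
    base_url: HttpUrl | None = None


Cfg(base_url="https://api.sambanova.ai/v1")  # accepted
try:
    Cfg(base_url="api.sambanova.ai")  # missing scheme
except ValidationError as e:
    print(e.errors()[0]["type"])  # "url_parsing" in pydantic v2
```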
diff --git a/src/llama_stack/providers/remote/inference/tgi/config.py b/src/llama_stack/providers/remote/inference/tgi/config.py
index 74edc8523..44cb4b812 100644
--- a/src/llama_stack/providers/remote/inference/tgi/config.py
+++ b/src/llama_stack/providers/remote/inference/tgi/config.py
@@ -5,7 +5,7 @@
 # the root directory of this source tree.

-from pydantic import BaseModel, Field, SecretStr
+from pydantic import BaseModel, Field, HttpUrl, SecretStr

 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
 from llama_stack_api import json_schema_type
@@ -15,18 +15,19 @@ from llama_stack_api import json_schema_type

 class TGIImplConfig(RemoteInferenceProviderConfig):
     auth_credential: SecretStr | None = Field(default=None, exclude=True)
-    url: str = Field(
-        description="The URL for the TGI serving endpoint",
+    base_url: HttpUrl | None = Field(
+        default=None,
+        description="The URL for the TGI serving endpoint (should include /v1 path)",
     )

     @classmethod
     def sample_run_config(
         cls,
-        url: str = "${env.TGI_URL:=}",
+        base_url: str = "${env.TGI_URL:=}",
         **kwargs,
     ):
         return {
-            "url": url,
+            "base_url": base_url,
         }
diff --git a/src/llama_stack/providers/remote/inference/tgi/tgi.py b/src/llama_stack/providers/remote/inference/tgi/tgi.py
index dd47ccc62..5dc8c33f7 100644
--- a/src/llama_stack/providers/remote/inference/tgi/tgi.py
+++ b/src/llama_stack/providers/remote/inference/tgi/tgi.py
@@ -8,7 +8,7 @@
 from collections.abc import Iterable

 from huggingface_hub import AsyncInferenceClient, HfApi
-from pydantic import SecretStr
+from pydantic import HttpUrl, SecretStr

 from llama_stack.log import get_logger
 from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
@@ -23,7 +23,7 @@ log = get_logger(name=__name__, category="inference::tgi")

 class _HfAdapter(OpenAIMixin):
-    url: str
+    base_url: HttpUrl
     api_key: SecretStr

     hf_client: AsyncInferenceClient
@@ -36,7 +36,7 @@ class _HfAdapter(OpenAIMixin):
         return "NO KEY REQUIRED"

     def get_base_url(self):
-        return self.url
+        return str(self.base_url)

     async def list_provider_model_ids(self) -> Iterable[str]:
         return [self.model_id]
@@ -50,14 +50,20 @@ class _HfAdapter(OpenAIMixin):

 class TGIAdapter(_HfAdapter):
     async def initialize(self, config: TGIImplConfig) -> None:
-        if not config.url:
+        if not config.base_url:
             raise ValueError("You must provide a URL in run.yaml (or via the TGI_URL environment variable) to use TGI.")

-        log.info(f"Initializing TGI client with url={config.url}")
-        self.hf_client = AsyncInferenceClient(model=config.url, provider="hf-inference")
+        log.info(f"Initializing TGI client with url={config.base_url}")
+        # Extract base URL without /v1 for HF client initialization
+        base_url_str = str(config.base_url).rstrip("/")
+        if base_url_str.endswith("/v1"):
+            base_url_for_client = base_url_str[:-3]
+        else:
+            base_url_for_client = base_url_str
+        self.hf_client = AsyncInferenceClient(model=base_url_for_client, provider="hf-inference")
         endpoint_info = await self.hf_client.get_endpoint_info()
         self.max_tokens = endpoint_info["max_total_tokens"]
         self.model_id = endpoint_info["model_id"]
-        self.url = f"{config.url.rstrip('/')}/v1"
+        self.base_url = config.base_url
         self.api_key = SecretStr("NO_KEY")
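The TGI hunk strips `/v1` only after `rstrip("/")`, so a trailing slash does not defeat the suffix check. A few REPL-style checks of that normalization (illustrative endpoint values):

```python
for raw in ("http://tgi:8080/v1", "http://tgi:8080/v1/", "http://tgi:8080"):
    s = raw.rstrip("/")
    host = s[: -len("/v1")] if s.endswith("/v1") else s
    print(f"{raw} -> {host}")
# http://tgi:8080/v1  -> http://tgi:8080
# http://tgi:8080/v1/ -> http://tgi:8080
# http://tgi:8080     -> http://tgi:8080
```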
diff --git a/src/llama_stack/providers/remote/inference/together/config.py b/src/llama_stack/providers/remote/inference/together/config.py
index c1b3c4a55..16f0686ba 100644
--- a/src/llama_stack/providers/remote/inference/together/config.py
+++ b/src/llama_stack/providers/remote/inference/together/config.py
@@ -6,7 +6,7 @@
 from typing import Any

-from pydantic import Field
+from pydantic import Field, HttpUrl

 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
 from llama_stack_api import json_schema_type
@@ -14,14 +14,14 @@ from llama_stack_api import json_schema_type

 @json_schema_type
 class TogetherImplConfig(RemoteInferenceProviderConfig):
-    url: str = Field(
-        default="https://api.together.xyz/v1",
+    base_url: HttpUrl | None = Field(
+        default=HttpUrl("https://api.together.xyz/v1"),
         description="The URL for the Together AI server",
     )

     @classmethod
     def sample_run_config(cls, **kwargs) -> dict[str, Any]:
         return {
-            "url": "https://api.together.xyz/v1",
+            "base_url": "https://api.together.xyz/v1",
             "api_key": "${env.TOGETHER_API_KEY:=}",
         }
diff --git a/src/llama_stack/providers/remote/inference/together/together.py b/src/llama_stack/providers/remote/inference/together/together.py
index cd34aec5e..0826dbcd2 100644
--- a/src/llama_stack/providers/remote/inference/together/together.py
+++ b/src/llama_stack/providers/remote/inference/together/together.py
@@ -9,7 +9,6 @@ from collections.abc import Iterable
 from typing import Any, cast

 from together import AsyncTogether  # type: ignore[import-untyped]
-from together.constants import BASE_URL  # type: ignore[import-untyped]

 from llama_stack.core.request_headers import NeedsRequestProviderData
 from llama_stack.log import get_logger
@@ -42,7 +41,7 @@ class TogetherInferenceAdapter(OpenAIMixin, NeedsRequestProviderData):
     provider_data_api_key_field: str = "together_api_key"

     def get_base_url(self):
-        return BASE_URL
+        return str(self.config.base_url)

     def _get_client(self) -> AsyncTogether:
         together_api_key = None
diff --git a/src/llama_stack/providers/remote/inference/vllm/config.py b/src/llama_stack/providers/remote/inference/vllm/config.py
index c43533ee4..db6c74431 100644
--- a/src/llama_stack/providers/remote/inference/vllm/config.py
+++ b/src/llama_stack/providers/remote/inference/vllm/config.py
@@ -6,7 +6,7 @@
 from pathlib import Path

-from pydantic import Field, SecretStr, field_validator
+from pydantic import Field, HttpUrl, SecretStr, field_validator

 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
 from llama_stack_api import json_schema_type
@@ -14,7 +14,7 @@ from llama_stack_api import json_schema_type

 @json_schema_type
 class VLLMInferenceAdapterConfig(RemoteInferenceProviderConfig):
-    url: str | None = Field(
+    base_url: HttpUrl | None = Field(
         default=None,
         description="The URL for the vLLM model serving endpoint",
     )
@@ -48,11 +48,11 @@ class VLLMInferenceAdapterConfig(RemoteInferenceProviderConfig):
     @classmethod
     def sample_run_config(
         cls,
-        url: str = "${env.VLLM_URL:=}",
+        base_url: str = "${env.VLLM_URL:=}",
         **kwargs,
     ):
         return {
-            "url": url,
+            "base_url": base_url,
             "max_tokens": "${env.VLLM_MAX_TOKENS:=4096}",
             "api_token": "${env.VLLM_API_TOKEN:=fake}",
             "tls_verify": "${env.VLLM_TLS_VERIFY:=true}",
diff --git a/src/llama_stack/providers/remote/inference/vllm/vllm.py b/src/llama_stack/providers/remote/inference/vllm/vllm.py
index 1510e9384..6664ca36b 100644
--- a/src/llama_stack/providers/remote/inference/vllm/vllm.py
+++ b/src/llama_stack/providers/remote/inference/vllm/vllm.py
@@ -39,12 +39,12 @@ class VLLMInferenceAdapter(OpenAIMixin):

     def get_base_url(self) -> str:
         """Get the base URL from config."""
-        if not self.config.url:
+        if not self.config.base_url:
             raise ValueError("No base URL configured")
-        return self.config.url
+        return str(self.config.base_url)

     async def initialize(self) -> None:
-        if not self.config.url:
+        if not self.config.base_url:
             raise ValueError(
                 "You must provide a URL in run.yaml (or via the VLLM_URL environment variable) to use vLLM."
             )
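The sample configs above keep `${env.VAR:=default}` placeholders as plain strings, which the stack resolves when the run config is loaded. A hedged sketch of those semantics (the real resolver lives elsewhere in llama-stack and is more featureful; this only illustrates the empty-default case):

```python
import os
import re


def resolve(value: str) -> str | None:
    """Resolve a '${env.VAR:=default}' placeholder; None when both are empty."""
    m = re.fullmatch(r"\$\{env\.(\w+):=(.*)\}", value)
    if not m:
        return value
    return os.getenv(m.group(1)) or m.group(2) or None


print(resolve("${env.VLLM_URL:=}"))  # None when VLLM_URL is unset -> base_url stays None
```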
diff --git a/src/llama_stack/providers/remote/inference/watsonx/config.py b/src/llama_stack/providers/remote/inference/watsonx/config.py
index 914f80820..be2b2c0ab 100644
--- a/src/llama_stack/providers/remote/inference/watsonx/config.py
+++ b/src/llama_stack/providers/remote/inference/watsonx/config.py
@@ -7,7 +7,7 @@
 import os
 from typing import Any

-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, HttpUrl

 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
 from llama_stack_api import json_schema_type
@@ -23,7 +23,7 @@ class WatsonXProviderDataValidator(BaseModel):

 @json_schema_type
 class WatsonXConfig(RemoteInferenceProviderConfig):
-    url: str = Field(
+    base_url: HttpUrl | None = Field(
         default_factory=lambda: os.getenv("WATSONX_BASE_URL", "https://us-south.ml.cloud.ibm.com"),
         description="A base url for accessing the watsonx.ai",
     )
@@ -39,7 +39,7 @@ class WatsonXConfig(RemoteInferenceProviderConfig):
     @classmethod
     def sample_run_config(cls, **kwargs) -> dict[str, Any]:
         return {
-            "url": "${env.WATSONX_BASE_URL:=https://us-south.ml.cloud.ibm.com}",
+            "base_url": "${env.WATSONX_BASE_URL:=https://us-south.ml.cloud.ibm.com}",
             "api_key": "${env.WATSONX_API_KEY:=}",
             "project_id": "${env.WATSONX_PROJECT_ID:=}",
         }
diff --git a/src/llama_stack/providers/remote/inference/watsonx/watsonx.py b/src/llama_stack/providers/remote/inference/watsonx/watsonx.py
index aab9e2dca..5684f6c17 100644
--- a/src/llama_stack/providers/remote/inference/watsonx/watsonx.py
+++ b/src/llama_stack/providers/remote/inference/watsonx/watsonx.py
@@ -255,7 +255,7 @@ class WatsonXInferenceAdapter(LiteLLMOpenAIMixin):
         )

     def get_base_url(self) -> str:
-        return self.config.url
+        return str(self.config.base_url)

     # Copied from OpenAIMixin
     async def check_model_availability(self, model: str) -> bool:
@@ -316,7 +316,7 @@ class WatsonXInferenceAdapter(LiteLLMOpenAIMixin):
         """
         Retrieves foundation model specifications from the watsonx.ai API.
         """
-        url = f"{self.config.url}/ml/v1/foundation_model_specs?version=2023-10-25"
+        url = f"{str(self.config.base_url).rstrip('/')}/ml/v1/foundation_model_specs?version=2023-10-25"
         headers = {
             # Note that there is no authorization header. Listing models does not require authentication.
             "Content-Type": "application/json",
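The `rstrip('/')` in the watsonx hunk matters because pydantic v2 normalizes a bare host to carry a trailing slash; composing the path naively would yield a double slash:

```python
from pydantic import HttpUrl

base = HttpUrl("https://us-south.ml.cloud.ibm.com")
print(str(base))  # 'https://us-south.ml.cloud.ibm.com/' -- note the trailing slash
print(f"{str(base).rstrip('/')}/ml/v1/foundation_model_specs?version=2023-10-25")
```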
"Content-Type": "application/json", diff --git a/tests/integration/suites.py b/tests/integration/suites.py index 7689657b4..10c872705 100644 --- a/tests/integration/suites.py +++ b/tests/integration/suites.py @@ -50,7 +50,7 @@ SETUP_DEFINITIONS: dict[str, Setup] = { name="ollama", description="Local Ollama provider with text + safety models", env={ - "OLLAMA_URL": "http://0.0.0.0:11434", + "OLLAMA_URL": "http://0.0.0.0:11434/v1", "SAFETY_MODEL": "ollama/llama-guard3:1b", }, defaults={ @@ -64,7 +64,7 @@ SETUP_DEFINITIONS: dict[str, Setup] = { name="ollama", description="Local Ollama provider with a vision model", env={ - "OLLAMA_URL": "http://0.0.0.0:11434", + "OLLAMA_URL": "http://0.0.0.0:11434/v1", }, defaults={ "vision_model": "ollama/llama3.2-vision:11b", @@ -75,7 +75,7 @@ SETUP_DEFINITIONS: dict[str, Setup] = { name="ollama-postgres", description="Server-mode tests with Postgres-backed persistence", env={ - "OLLAMA_URL": "http://0.0.0.0:11434", + "OLLAMA_URL": "http://0.0.0.0:11434/v1", "SAFETY_MODEL": "ollama/llama-guard3:1b", "POSTGRES_HOST": "127.0.0.1", "POSTGRES_PORT": "5432", diff --git a/tests/unit/providers/inference/test_inference_client_caching.py b/tests/unit/providers/inference/test_inference_client_caching.py index aa3a2c77a..6ddf790af 100644 --- a/tests/unit/providers/inference/test_inference_client_caching.py +++ b/tests/unit/providers/inference/test_inference_client_caching.py @@ -120,7 +120,7 @@ from llama_stack.providers.remote.inference.watsonx.watsonx import WatsonXInfere VLLMInferenceAdapter, "llama_stack.providers.remote.inference.vllm.VLLMProviderDataValidator", { - "url": "http://fake", + "base_url": "http://fake", }, ), ], @@ -153,7 +153,7 @@ def test_litellm_provider_data_used(config_cls, adapter_cls, provider_data_valid """Validate data for LiteLLM-based providers. Similar to test_openai_provider_data_used, but without the assumption that there is an OpenAI-compatible client object.""" - inference_adapter = adapter_cls(config=config_cls()) + inference_adapter = adapter_cls(config=config_cls(base_url="http://fake")) inference_adapter.__provider_spec__ = MagicMock() inference_adapter.__provider_spec__.provider_data_validator = provider_data_validator diff --git a/tests/unit/providers/inference/test_remote_vllm.py b/tests/unit/providers/inference/test_remote_vllm.py index 958895cc4..0cf8ed306 100644 --- a/tests/unit/providers/inference/test_remote_vllm.py +++ b/tests/unit/providers/inference/test_remote_vllm.py @@ -40,7 +40,7 @@ from llama_stack_api import ( @pytest.fixture(scope="function") async def vllm_inference_adapter(): - config = VLLMInferenceAdapterConfig(url="http://mocked.localhost:12345") + config = VLLMInferenceAdapterConfig(base_url="http://mocked.localhost:12345") inference_adapter = VLLMInferenceAdapter(config=config) inference_adapter.model_store = AsyncMock() await inference_adapter.initialize() @@ -204,7 +204,7 @@ async def test_vllm_completion_extra_body(): via extra_body to the underlying OpenAI client through the InferenceRouter. """ # Set up the vLLM adapter - config = VLLMInferenceAdapterConfig(url="http://mocked.localhost:12345") + config = VLLMInferenceAdapterConfig(base_url="http://mocked.localhost:12345") vllm_adapter = VLLMInferenceAdapter(config=config) vllm_adapter.__provider_id__ = "vllm" await vllm_adapter.initialize() @@ -277,7 +277,7 @@ async def test_vllm_chat_completion_extra_body(): via extra_body to the underlying OpenAI client through the InferenceRouter for chat completion. 
""" # Set up the vLLM adapter - config = VLLMInferenceAdapterConfig(url="http://mocked.localhost:12345") + config = VLLMInferenceAdapterConfig(base_url="http://mocked.localhost:12345") vllm_adapter = VLLMInferenceAdapter(config=config) vllm_adapter.__provider_id__ = "vllm" await vllm_adapter.initialize() diff --git a/tests/unit/providers/nvidia/test_rerank_inference.py b/tests/unit/providers/nvidia/test_rerank_inference.py index ee62910b8..4ad9dc766 100644 --- a/tests/unit/providers/nvidia/test_rerank_inference.py +++ b/tests/unit/providers/nvidia/test_rerank_inference.py @@ -146,7 +146,7 @@ async def test_hosted_model_not_in_endpoint_mapping(): async def test_self_hosted_ignores_endpoint(): adapter = create_adapter( - config=NVIDIAConfig(url="http://localhost:8000", api_key=None), + config=NVIDIAConfig(base_url="http://localhost:8000", api_key=None), rerank_endpoints={"test-model": "https://model.endpoint/rerank"}, # This should be ignored for self-hosted. ) mock_session = MockSession(MockResponse()) diff --git a/tests/unit/providers/test_configs.py b/tests/unit/providers/test_configs.py index 867cfffbc..b4ba78394 100644 --- a/tests/unit/providers/test_configs.py +++ b/tests/unit/providers/test_configs.py @@ -4,8 +4,10 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +from typing import get_args, get_origin + import pytest -from pydantic import BaseModel +from pydantic import BaseModel, HttpUrl from llama_stack.core.distribution import get_provider_registry, providable_apis from llama_stack.core.utils.dynamic import instantiate_class_type @@ -41,3 +43,55 @@ class TestProviderConfigurations: sample_config = config_type.sample_run_config(__distro_dir__="foobarbaz") assert isinstance(sample_config, dict), f"{config_class_name}.sample_run_config() did not return a dict" + + def test_remote_inference_url_standardization(self): + """Verify all remote inference providers use standardized base_url configuration.""" + provider_registry = get_provider_registry() + inference_providers = provider_registry.get("inference", {}) + + # Filter for remote providers only + remote_providers = {k: v for k, v in inference_providers.items() if k.startswith("remote::")} + + failures = [] + for provider_type, provider_spec in remote_providers.items(): + try: + config_class_name = provider_spec.config_class + config_type = instantiate_class_type(config_class_name) + + # Check that config has base_url field (not url) + if hasattr(config_type, "model_fields"): + fields = config_type.model_fields + + # Should NOT have 'url' field (old pattern) + if "url" in fields: + failures.append( + f"{provider_type}: Uses deprecated 'url' field instead of 'base_url'. " + f"Please rename to 'base_url' for consistency." + ) + + # Should have 'base_url' field with HttpUrl | None type + if "base_url" in fields: + field_info = fields["base_url"] + annotation = field_info.annotation + + # Check if it's HttpUrl or HttpUrl | None + # get_origin() returns Union for (X | Y), None for plain types + # get_args() returns the types inside Union, e.g. (HttpUrl, NoneType) + is_valid = False + if get_origin(annotation) is not None: # It's a Union/Optional + if HttpUrl in get_args(annotation): + is_valid = True + elif annotation == HttpUrl: # Plain HttpUrl without | None + is_valid = True + + if not is_valid: + failures.append( + f"{provider_type}: base_url field has incorrect type annotation. 
" + f"Expected 'HttpUrl | None', got '{annotation}'" + ) + + except Exception as e: + failures.append(f"{provider_type}: Error checking URL standardization: {str(e)}") + + if failures: + pytest.fail("URL standardization violations found:\n" + "\n".join(f" - {f}" for f in failures))