From 1d941b6aa0201c0158e59a2f99c9e8aae8051b4c Mon Sep 17 00:00:00 2001
From: Bill Murdock
Date: Mon, 6 Oct 2025 15:45:24 -0400
Subject: [PATCH] Address review comments

Signed-off-by: Bill Murdock
---
 llama_stack/distributions/watsonx/__init__.py |   2 +
 llama_stack/distributions/watsonx/build.yaml  |  41 +++----
 llama_stack/distributions/watsonx/run.yaml    | 103 +-----------------
 llama_stack/distributions/watsonx/watsonx.py  |  36 +-----
 llama_stack/providers/registry/inference.py   |   2 +-
 .../remote/inference/watsonx/config.py        |   8 +-
 .../remote/inference/watsonx/watsonx.py       |  11 --
 .../utils/inference/litellm_openai_mixin.py   |  30 ++++-
 .../utils/inference/openai_compat.py          |  28 -----
 9 files changed, 61 insertions(+), 200 deletions(-)

diff --git a/llama_stack/distributions/watsonx/__init__.py b/llama_stack/distributions/watsonx/__init__.py
index 756f351d8..078d86144 100644
--- a/llama_stack/distributions/watsonx/__init__.py
+++ b/llama_stack/distributions/watsonx/__init__.py
@@ -3,3 +3,5 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
+
+from .watsonx import get_distribution_template  # noqa: F401
diff --git a/llama_stack/distributions/watsonx/build.yaml b/llama_stack/distributions/watsonx/build.yaml
index bf4be7eaf..06349a741 100644
--- a/llama_stack/distributions/watsonx/build.yaml
+++ b/llama_stack/distributions/watsonx/build.yaml
@@ -3,44 +3,33 @@ distribution_spec:
   description: Use watsonx for running LLM inference
   providers:
     inference:
-    - provider_id: watsonx
-      provider_type: remote::watsonx
-    - provider_id: sentence-transformers
-      provider_type: inline::sentence-transformers
+    - provider_type: remote::watsonx
+    - provider_type: inline::sentence-transformers
     vector_io:
-    - provider_id: faiss
-      provider_type: inline::faiss
+    - provider_type: inline::faiss
     safety:
-    - provider_id: llama-guard
-      provider_type: inline::llama-guard
+    - provider_type: inline::llama-guard
     agents:
-    - provider_id: meta-reference
-      provider_type: inline::meta-reference
+    - provider_type: inline::meta-reference
     telemetry:
-    - provider_id: meta-reference
-      provider_type: inline::meta-reference
+    - provider_type: inline::meta-reference
     eval:
-    - provider_id: meta-reference
-      provider_type: inline::meta-reference
+    - provider_type: inline::meta-reference
     datasetio:
-    - provider_id: huggingface
-      provider_type: remote::huggingface
-    - provider_id: localfs
-      provider_type: inline::localfs
+    - provider_type: remote::huggingface
+    - provider_type: inline::localfs
     scoring:
-    - provider_id: basic
-      provider_type: inline::basic
-    - provider_id: llm-as-judge
-      provider_type: inline::llm-as-judge
-    - provider_id: braintrust
-      provider_type: inline::braintrust
+    - provider_type: inline::basic
+    - provider_type: inline::llm-as-judge
+    - provider_type: inline::braintrust
     tool_runtime:
     - provider_type: remote::brave-search
    - provider_type: remote::tavily-search
     - provider_type: inline::rag-runtime
     - provider_type: remote::model-context-protocol
+    files:
+    - provider_type: inline::localfs
 image_type: venv
 additional_pip_packages:
+- aiosqlite
 - sqlalchemy[asyncio]
-- aiosqlite
-- aiosqlite
diff --git a/llama_stack/distributions/watsonx/run.yaml b/llama_stack/distributions/watsonx/run.yaml
index 92f367910..e0c337f9d 100644
--- a/llama_stack/distributions/watsonx/run.yaml
+++ b/llama_stack/distributions/watsonx/run.yaml
@@ -4,13 +4,13 @@ apis:
 - agents
 - datasetio
 - eval
+- files
 - inference
 - safety
 - scoring
 - telemetry
 - tool_runtime
 - vector_io
-- files
 providers:
   inference:
   - provider_id: watsonx
@@ -19,8 +19,6 @@ providers:
       url: ${env.WATSONX_BASE_URL:=https://us-south.ml.cloud.ibm.com}
       api_key: ${env.WATSONX_API_KEY:=}
       project_id: ${env.WATSONX_PROJECT_ID:=}
-  - provider_id: sentence-transformers
-    provider_type: inline::sentence-transformers
   vector_io:
   - provider_id: faiss
     provider_type: inline::faiss
@@ -48,7 +46,7 @@ providers:
     provider_type: inline::meta-reference
     config:
       service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
-      sinks: ${env.TELEMETRY_SINKS:=console,sqlite}
+      sinks: ${env.TELEMETRY_SINKS:=sqlite}
       sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/watsonx}/trace_store.db
       otel_exporter_otlp_endpoint: ${env.OTEL_EXPORTER_OTLP_ENDPOINT:=}
   eval:
@@ -109,102 +107,7 @@ metadata_store:
 inference_store:
   type: sqlite
   db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/watsonx}/inference_store.db
-models:
-- metadata: {}
-  model_id: meta-llama/llama-3-3-70b-instruct
-  provider_id: watsonx
-  provider_model_id: meta-llama/llama-3-3-70b-instruct
-  model_type: llm
-- metadata: {}
-  model_id: meta-llama/Llama-3.3-70B-Instruct
-  provider_id: watsonx
-  provider_model_id: meta-llama/llama-3-3-70b-instruct
-  model_type: llm
-- metadata: {}
-  model_id: meta-llama/llama-2-13b-chat
-  provider_id: watsonx
-  provider_model_id: meta-llama/llama-2-13b-chat
-  model_type: llm
-- metadata: {}
-  model_id: meta-llama/Llama-2-13b
-  provider_id: watsonx
-  provider_model_id: meta-llama/llama-2-13b-chat
-  model_type: llm
-- metadata: {}
-  model_id: meta-llama/llama-3-1-70b-instruct
-  provider_id: watsonx
-  provider_model_id: meta-llama/llama-3-1-70b-instruct
-  model_type: llm
-- metadata: {}
-  model_id: meta-llama/Llama-3.1-70B-Instruct
-  provider_id: watsonx
-  provider_model_id: meta-llama/llama-3-1-70b-instruct
-  model_type: llm
-- metadata: {}
-  model_id: meta-llama/llama-3-1-8b-instruct
-  provider_id: watsonx
-  provider_model_id: meta-llama/llama-3-1-8b-instruct
-  model_type: llm
-- metadata: {}
-  model_id: meta-llama/Llama-3.1-8B-Instruct
-  provider_id: watsonx
-  provider_model_id: meta-llama/llama-3-1-8b-instruct
-  model_type: llm
-- metadata: {}
-  model_id: meta-llama/llama-3-2-11b-vision-instruct
-  provider_id: watsonx
-  provider_model_id: meta-llama/llama-3-2-11b-vision-instruct
-  model_type: llm
-- metadata: {}
-  model_id: meta-llama/Llama-3.2-11B-Vision-Instruct
-  provider_id: watsonx
-  provider_model_id: meta-llama/llama-3-2-11b-vision-instruct
-  model_type: llm
-- metadata: {}
-  model_id: meta-llama/llama-3-2-1b-instruct
-  provider_id: watsonx
-  provider_model_id: meta-llama/llama-3-2-1b-instruct
-  model_type: llm
-- metadata: {}
-  model_id: meta-llama/Llama-3.2-1B-Instruct
-  provider_id: watsonx
-  provider_model_id: meta-llama/llama-3-2-1b-instruct
-  model_type: llm
-- metadata: {}
-  model_id: meta-llama/llama-3-2-3b-instruct
-  provider_id: watsonx
-  provider_model_id: meta-llama/llama-3-2-3b-instruct
-  model_type: llm
-- metadata: {}
-  model_id: meta-llama/Llama-3.2-3B-Instruct
-  provider_id: watsonx
-  provider_model_id: meta-llama/llama-3-2-3b-instruct
-  model_type: llm
-- metadata: {}
-  model_id: meta-llama/llama-3-2-90b-vision-instruct
-  provider_id: watsonx
-  provider_model_id: meta-llama/llama-3-2-90b-vision-instruct
-  model_type: llm
-- metadata: {}
-  model_id: meta-llama/Llama-3.2-90B-Vision-Instruct
-  provider_id: watsonx
-  provider_model_id: meta-llama/llama-3-2-90b-vision-instruct
-  model_type: llm
-- metadata: {}
-  model_id: meta-llama/llama-guard-3-11b-vision
-  provider_id: watsonx
-  provider_model_id: meta-llama/llama-guard-3-11b-vision
-  model_type: llm
-- metadata: {}
-  model_id: meta-llama/Llama-Guard-3-11B-Vision
-  provider_id: watsonx
-  provider_model_id: meta-llama/llama-guard-3-11b-vision
-  model_type: llm
-- metadata:
-    embedding_dimension: 384
-  model_id: all-MiniLM-L6-v2
-  provider_id: sentence-transformers
-  model_type: embedding
+models: []
 shields: []
 vector_dbs: []
 datasets: []
diff --git a/llama_stack/distributions/watsonx/watsonx.py b/llama_stack/distributions/watsonx/watsonx.py
index c3cab5d1b..645770612 100644
--- a/llama_stack/distributions/watsonx/watsonx.py
+++ b/llama_stack/distributions/watsonx/watsonx.py
@@ -4,17 +4,11 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from pathlib import Path
 
-from llama_stack.apis.models import ModelType
-from llama_stack.core.datatypes import BuildProvider, ModelInput, Provider, ToolGroupInput
-from llama_stack.distributions.template import DistributionTemplate, RunConfigSettings, get_model_registry
+from llama_stack.core.datatypes import BuildProvider, Provider, ToolGroupInput
+from llama_stack.distributions.template import DistributionTemplate, RunConfigSettings
 from llama_stack.providers.inline.files.localfs.config import LocalfsFilesImplConfig
-from llama_stack.providers.inline.inference.sentence_transformers import (
-    SentenceTransformersInferenceConfig,
-)
 from llama_stack.providers.remote.inference.watsonx import WatsonXConfig
-from llama_stack.providers.remote.inference.watsonx.models import MODEL_ENTRIES
 
 
 def get_distribution_template(name: str = "watsonx") -> DistributionTemplate:
@@ -52,15 +46,6 @@ def get_distribution_template(name: str = "watsonx") -> DistributionTemplate:
         config=WatsonXConfig.sample_run_config(),
     )
 
-    embedding_provider = Provider(
-        provider_id="sentence-transformers",
-        provider_type="inline::sentence-transformers",
-        config=SentenceTransformersInferenceConfig.sample_run_config(),
-    )
-
-    available_models = {
-        "watsonx": MODEL_ENTRIES,
-    }
     default_tool_groups = [
         ToolGroupInput(
             toolgroup_id="builtin::websearch",
@@ -72,36 +57,25 @@ def get_distribution_template(name: str = "watsonx") -> DistributionTemplate:
         ),
     ]
 
-    embedding_model = ModelInput(
-        model_id="all-MiniLM-L6-v2",
-        provider_id="sentence-transformers",
-        model_type=ModelType.embedding,
-        metadata={
-            "embedding_dimension": 384,
-        },
-    )
-
     files_provider = Provider(
         provider_id="meta-reference-files",
         provider_type="inline::localfs",
         config=LocalfsFilesImplConfig.sample_run_config(f"~/.llama/distributions/{name}"),
     )
-    default_models, _ = get_model_registry(available_models)
     return DistributionTemplate(
         name=name,
         distro_type="remote_hosted",
         description="Use watsonx for running LLM inference",
         container_image=None,
-        template_path=Path(__file__).parent / "doc_template.md",
+        template_path=None,
         providers=providers,
-        available_models_by_provider=available_models,
         run_configs={
             "run.yaml": RunConfigSettings(
                 provider_overrides={
-                    "inference": [inference_provider, embedding_provider],
+                    "inference": [inference_provider],
                     "files": [files_provider],
                 },
-                default_models=default_models + [embedding_model],
+                default_models=[],
                 default_tool_groups=default_tool_groups,
             ),
         },
diff --git a/llama_stack/providers/registry/inference.py b/llama_stack/providers/registry/inference.py
index 89d7f55e8..68b0a2db3 100644
--- a/llama_stack/providers/registry/inference.py
+++ b/llama_stack/providers/registry/inference.py
@@ -277,7 +277,7 @@ Available Models:
         api=Api.inference,
         adapter_type="watsonx",
         provider_type="remote::watsonx",
-        pip_packages=["ibm_watsonx_ai"],
+        pip_packages=["litellm"],
         module="llama_stack.providers.remote.inference.watsonx",
         config_class="llama_stack.providers.remote.inference.watsonx.WatsonXConfig",
         provider_data_validator="llama_stack.providers.remote.inference.watsonx.WatsonXProviderDataValidator",
diff --git a/llama_stack/providers/remote/inference/watsonx/config.py b/llama_stack/providers/remote/inference/watsonx/config.py
index 8417288f7..2a77f9edb 100644
--- a/llama_stack/providers/remote/inference/watsonx/config.py
+++ b/llama_stack/providers/remote/inference/watsonx/config.py
@@ -25,12 +25,16 @@ class WatsonXConfig(RemoteInferenceProviderConfig):
         default_factory=lambda: os.getenv("WATSONX_BASE_URL", "https://us-south.ml.cloud.ibm.com"),
         description="A base url for accessing the watsonx.ai",
     )
+    # This seems like it should be required, but none of the other remote inference
+    # providers require it, so this is optional here too for consistency.
+    # The OpenAIConfig uses default=None instead, so this is following that precedent.
     api_key: SecretStr | None = Field(
-        default_factory=lambda: os.getenv("WATSONX_API_KEY"),
+        default=None,
         description="The watsonx.ai API key",
     )
+    # As above, this is optional here too for consistency.
     project_id: str | None = Field(
-        default_factory=lambda: os.getenv("WATSONX_PROJECT_ID"),
+        default=None,
         description="The watsonx.ai project ID",
     )
     timeout: int = Field(
diff --git a/llama_stack/providers/remote/inference/watsonx/watsonx.py b/llama_stack/providers/remote/inference/watsonx/watsonx.py
index d7337d085..0d3af70f6 100644
--- a/llama_stack/providers/remote/inference/watsonx/watsonx.py
+++ b/llama_stack/providers/remote/inference/watsonx/watsonx.py
@@ -16,9 +16,6 @@ from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOp
 
 
 class WatsonXInferenceAdapter(LiteLLMOpenAIMixin):
-    _config: WatsonXConfig
-    __provider_id__: str = "watsonx"
-
     def __init__(self, config: WatsonXConfig):
         LiteLLMOpenAIMixin.__init__(
             self,
@@ -29,17 +26,9 @@ class WatsonXInferenceAdapter(LiteLLMOpenAIMixin):
         self.available_models = None
         self.config = config
 
-    # get_api_key = LiteLLMOpenAIMixin.get_api_key
-
     def get_base_url(self) -> str:
         return self.config.url
 
-    async def initialize(self):
-        await super().initialize()
-
-    async def shutdown(self):
-        await super().shutdown()
-
     async def _get_params(self, request: ChatCompletionRequest) -> dict[str, Any]:
         # Get base parameters from parent
         params = await super()._get_params(request)
diff --git a/llama_stack/providers/utils/inference/litellm_openai_mixin.py b/llama_stack/providers/utils/inference/litellm_openai_mixin.py
index 6c8f61c3b..444a6a715 100644
--- a/llama_stack/providers/utils/inference/litellm_openai_mixin.py
+++ b/llama_stack/providers/utils/inference/litellm_openai_mixin.py
@@ -4,6 +4,8 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
+import base64
+import struct
 from collections.abc import AsyncIterator
 from typing import Any
 
@@ -16,6 +18,7 @@ from llama_stack.apis.inference import (
     OpenAIChatCompletion,
     OpenAIChatCompletionChunk,
     OpenAICompletion,
+    OpenAIEmbeddingData,
     OpenAIEmbeddingsResponse,
     OpenAIEmbeddingUsage,
     OpenAIMessageParam,
@@ -26,7 +29,6 @@ from llama_stack.core.request_headers import NeedsRequestProviderData
 from llama_stack.log import get_logger
 from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper, ProviderModelEntry
 from llama_stack.providers.utils.inference.openai_compat import (
-    b64_encode_openai_embeddings_response,
     convert_message_to_openai_dict_new,
     convert_tooldef_to_openai_tool,
     get_sampling_options,
@@ -334,6 +336,7 @@
             api_key=self.get_api_key(),
             api_base=self.api_base,
         )
+        logger.info(f"params to litellm (openai compat): {params}")
         return await litellm.acompletion(**params)
 
     async def check_model_availability(self, model: str) -> bool:
@@ -349,3 +352,28 @@
             return False
 
         return model in litellm.models_by_provider[self.litellm_provider_name]
+
+
+def b64_encode_openai_embeddings_response(
+    response_data: list[dict], encoding_format: str | None = "float"
+) -> list[OpenAIEmbeddingData]:
+    """
+    Process the OpenAI embeddings response to encode the embeddings in base64 format if specified.
+    """
+    data = []
+    for i, embedding_data in enumerate(response_data):
+        if encoding_format == "base64":
+            byte_array = bytearray()
+            for embedding_value in embedding_data["embedding"]:
+                byte_array.extend(struct.pack("f", float(embedding_value)))
+
+            response_embedding = base64.b64encode(byte_array).decode("utf-8")
+        else:
+            response_embedding = embedding_data["embedding"]
+        data.append(
+            OpenAIEmbeddingData(
+                embedding=response_embedding,
+                index=i,
+            )
+        )
+    return data
diff --git a/llama_stack/providers/utils/inference/openai_compat.py b/llama_stack/providers/utils/inference/openai_compat.py
index 4070f7a5a..7e465a14c 100644
--- a/llama_stack/providers/utils/inference/openai_compat.py
+++ b/llama_stack/providers/utils/inference/openai_compat.py
@@ -3,9 +3,7 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-import base64
 import json
-import struct
 import time
 import uuid
 import warnings
@@ -103,7 +101,6 @@ from llama_stack.apis.inference import (
     JsonSchemaResponseFormat,
     Message,
     OpenAIChatCompletion,
-    OpenAIEmbeddingData,
     OpenAIMessageParam,
     OpenAIResponseFormatParam,
     SamplingParams,
@@ -1402,28 +1399,3 @@ def prepare_openai_embeddings_params(
         params["user"] = user
 
     return params
-
-
-def b64_encode_openai_embeddings_response(
-    response_data: list[dict], encoding_format: str | None = "float"
-) -> list[OpenAIEmbeddingData]:
-    """
-    Process the OpenAI embeddings response to encode the embeddings in base64 format if specified.
-    """
-    data = []
-    for i, embedding_data in enumerate(response_data):
-        if encoding_format == "base64":
-            byte_array = bytearray()
-            for embedding_value in embedding_data["embedding"]:
-                byte_array.extend(struct.pack("f", float(embedding_value)))
-
-            response_embedding = base64.b64encode(byte_array).decode("utf-8")
-        else:
-            response_embedding = embedding_data["embedding"]
-        data.append(
-            OpenAIEmbeddingData(
-                embedding=response_embedding,
-                index=i,
-            )
-        )
-    return data
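
A note on the relocated helper: b64_encode_openai_embeddings_response packs
each embedding value as a C float via struct.pack("f", ...) and base64-encodes
the resulting buffer. A minimal round-trip sketch, not part of the patch and
using hypothetical embedding values:

    import base64
    import struct

    # Shaped like the response_data the helper receives.
    response_data = [{"embedding": [0.1, -0.25, 0.5]}]

    # Encode the way the helper does when encoding_format == "base64".
    byte_array = bytearray()
    for value in response_data[0]["embedding"]:
        byte_array.extend(struct.pack("f", float(value)))
    encoded = base64.b64encode(byte_array).decode("utf-8")

    # A client reverses both steps to recover the float32 vector.
    raw = base64.b64decode(encoded)
    decoded = struct.unpack(f"{len(raw) // 4}f", raw)
    print(decoded)  # roughly (0.1, -0.25, 0.5), up to float32 precision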
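
With the config change, api_key and project_id are plain default=None fields;
in a running stack they are populated via run.yaml's ${env.WATSONX_API_KEY:=}
and ${env.WATSONX_PROJECT_ID:=} substitutions rather than by os.getenv calls
inside the config class. A construction sketch, with placeholder values and
the import paths shown in the diff:

    from pydantic import SecretStr

    from llama_stack.providers.remote.inference.watsonx import WatsonXConfig
    from llama_stack.providers.remote.inference.watsonx.watsonx import WatsonXInferenceAdapter

    config = WatsonXConfig(
        url="https://us-south.ml.cloud.ibm.com",
        api_key=SecretStr("placeholder-api-key"),  # optional (default=None)
        project_id="placeholder-project-id",       # optional, for consistency
    )
    adapter = WatsonXInferenceAdapter(config)
    print(adapter.get_base_url())  # https://us-south.ml.cloud.ibm.com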
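
Finally, check_model_availability (context lines above) resolves against
litellm's models_by_provider mapping, which motivates the pip_packages swap
from ibm_watsonx_ai to litellm. Whether a provider key such as "watsonx" is
present depends on the installed litellm version, so a quick sanity check
looks like:

    import litellm

    # litellm maintains a dict of provider name -> known model IDs.
    known = litellm.models_by_provider.get("watsonx", [])
    print(f"litellm knows {len(known)} watsonx models")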