diff --git a/README.md b/README.md
index 8c201e43d..c2e688763 100644
--- a/README.md
+++ b/README.md
@@ -119,6 +119,7 @@ Here is a list of the various API providers and available distributions that can
 | OpenAI | Hosted | | ✅ | | | |
 | Anthropic | Hosted | | ✅ | | | |
 | Gemini | Hosted | | ✅ | | | |
+| watsonx | Hosted | | ✅ | | | |
 
 ### Distributions
 
diff --git a/docs/source/distributions/remote_hosted_distro/watsonx.md b/docs/source/distributions/remote_hosted_distro/watsonx.md
new file mode 100644
index 000000000..018dc2a3c
--- /dev/null
+++ b/docs/source/distributions/remote_hosted_distro/watsonx.md
@@ -0,0 +1,88 @@
+---
+orphan: true
+---
+
+# watsonx Distribution
+
+```{toctree}
+:maxdepth: 2
+:hidden:
+
+self
+```
+
+The `llamastack/distribution-watsonx` distribution consists of the following provider configurations.
+
+| API | Provider(s) |
+|-----|-------------|
+| agents | `inline::meta-reference` |
+| datasetio | `remote::huggingface`, `inline::localfs` |
+| eval | `inline::meta-reference` |
+| inference | `remote::watsonx` |
+| safety | `inline::llama-guard` |
+| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
+| telemetry | `inline::meta-reference` |
+| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` |
+| vector_io | `inline::faiss` |
+
+
+### Environment Variables
+
+The following environment variables can be configured:
+
+- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
+- `WATSONX_API_KEY`: watsonx API Key (default: ``)
+- `WATSONX_PROJECT_ID`: watsonx Project ID (default: ``)
+
+### Models
+
+The following models are available by default:
+
+- `meta-llama/llama-3-3-70b-instruct (aliases: meta-llama/Llama-3.3-70B-Instruct)`
+- `meta-llama/llama-2-13b-chat (aliases: meta-llama/Llama-2-13b)`
+- `meta-llama/llama-3-1-70b-instruct (aliases: meta-llama/Llama-3.1-70B-Instruct)`
+- `meta-llama/llama-3-1-8b-instruct (aliases: meta-llama/Llama-3.1-8B-Instruct)`
+- `meta-llama/llama-3-2-11b-vision-instruct (aliases: meta-llama/Llama-3.2-11B-Vision-Instruct)`
+- `meta-llama/llama-3-2-1b-instruct (aliases: meta-llama/Llama-3.2-1B-Instruct)`
+- `meta-llama/llama-3-2-3b-instruct (aliases: meta-llama/Llama-3.2-3B-Instruct)`
+- `meta-llama/llama-3-2-90b-vision-instruct (aliases: meta-llama/Llama-3.2-90B-Vision-Instruct)`
+- `meta-llama/llama-guard-3-11b-vision (aliases: meta-llama/Llama-Guard-3-11B-Vision)`
+
+
+### Prerequisite: API Keys
+
+Make sure you have access to a watsonx API key. You can obtain one by following the instructions at [watsonx.ai](https://www.ibm.com/docs/en/masv-and-l/maximo-manage/continuous-delivery?topic=setup-create-watsonx-api-key).
+
+
+## Running Llama Stack with watsonx
+
+You can do this via Conda (build the code yourself), venv, or Docker, which has a pre-built image.
+
+### Via Docker
+
+This method allows you to get started quickly without having to build the distribution code.
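+
+Before running the container, export your watsonx credentials so they can be passed through with `--env`. The values below are placeholders; the base URL shown is the default `us-south` endpoint from the provider config and may need to be changed for your region:
+
+```bash
+export WATSONX_API_KEY=<your watsonx API key>
+export WATSONX_PROJECT_ID=<your watsonx project ID>
+export WATSONX_BASE_URL=https://us-south.ml.cloud.ibm.com
+```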
+ +```bash +LLAMA_STACK_PORT=5001 +docker run \ + -it \ + -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ + -v ./run.yaml:/root/my-run.yaml \ + llamastack/distribution-watsonx \ + --yaml-config /root/my-run.yaml \ + --port $LLAMA_STACK_PORT \ + --env WATSONX_API_KEY=$WATSONX_API_KEY \ + --env WATSONX_PROJECT_ID=$WATSONX_PROJECT_ID \ + --env WATSONX_BASE_URL=$WATSONX_BASE_URL +``` + +### Via Conda + +```bash +llama stack build --template watsonx --image-type conda +llama stack run ./run.yaml \ + --port $LLAMA_STACK_PORT \ + --env WATSONX_API_KEY=$WATSONX_API_KEY \ + --env WATSONX_PROJECT_ID=$WATSONX_PROJECT_ID +``` diff --git a/llama_stack/providers/registry/inference.py b/llama_stack/providers/registry/inference.py index 3c54cabcf..4040f0d80 100644 --- a/llama_stack/providers/registry/inference.py +++ b/llama_stack/providers/registry/inference.py @@ -288,4 +288,14 @@ def available_providers() -> List[ProviderSpec]: provider_data_validator="llama_stack.providers.remote.inference.passthrough.PassthroughProviderDataValidator", ), ), + remote_provider_spec( + api=Api.inference, + adapter=AdapterSpec( + adapter_type="watsonx", + pip_packages=["ibm_watson_machine_learning"], + module="llama_stack.providers.remote.inference.watsonx", + config_class="llama_stack.providers.remote.inference.watsonx.WatsonXConfig", + provider_data_validator="llama_stack.providers.remote.inference.watsonx.WatsonXProviderDataValidator", + ), + ), ] diff --git a/llama_stack/providers/remote/inference/watsonx/__init__.py b/llama_stack/providers/remote/inference/watsonx/__init__.py new file mode 100644 index 000000000..e59e873b6 --- /dev/null +++ b/llama_stack/providers/remote/inference/watsonx/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from llama_stack.apis.inference import Inference + +from .config import WatsonXConfig + + +async def get_adapter_impl(config: WatsonXConfig, _deps) -> Inference: + # import dynamically so `llama stack build` does not fail due to missing dependencies + from .watsonx import WatsonXInferenceAdapter + + if not isinstance(config, WatsonXConfig): + raise RuntimeError(f"Unexpected config type: {type(config)}") + adapter = WatsonXInferenceAdapter(config) + return adapter + + +__all__ = ["get_adapter_impl", "WatsonXConfig"] diff --git a/llama_stack/providers/remote/inference/watsonx/config.py b/llama_stack/providers/remote/inference/watsonx/config.py new file mode 100644 index 000000000..7ee99b7e0 --- /dev/null +++ b/llama_stack/providers/remote/inference/watsonx/config.py @@ -0,0 +1,46 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+
+import os
+from typing import Any, Dict, Optional
+
+from pydantic import BaseModel, Field, SecretStr
+
+from llama_stack.schema_utils import json_schema_type
+
+
+class WatsonXProviderDataValidator(BaseModel):
+    url: str
+    api_key: str
+    project_id: str
+
+
+@json_schema_type
+class WatsonXConfig(BaseModel):
+    url: str = Field(
+        default_factory=lambda: os.getenv("WATSONX_BASE_URL", "https://us-south.ml.cloud.ibm.com"),
+        description="The base URL for accessing the watsonx.ai service",
+    )
+    api_key: Optional[SecretStr] = Field(
+        default_factory=lambda: os.getenv("WATSONX_API_KEY"),
+        description="The watsonx API key, only needed if using the hosted service",
+    )
+    project_id: Optional[str] = Field(
+        default_factory=lambda: os.getenv("WATSONX_PROJECT_ID"),
+        description="The watsonx project ID, only needed if using the hosted service",
+    )
+    timeout: int = Field(
+        default=60,
+        description="Timeout for the HTTP requests",
+    )
+
+    @classmethod
+    def sample_run_config(cls, **kwargs) -> Dict[str, Any]:
+        return {
+            "url": "${env.WATSONX_BASE_URL:https://us-south.ml.cloud.ibm.com}",
+            "api_key": "${env.WATSONX_API_KEY:}",
+            "project_id": "${env.WATSONX_PROJECT_ID:}",
+        }
diff --git a/llama_stack/providers/remote/inference/watsonx/models.py b/llama_stack/providers/remote/inference/watsonx/models.py
new file mode 100644
index 000000000..d98f0510a
--- /dev/null
+++ b/llama_stack/providers/remote/inference/watsonx/models.py
@@ -0,0 +1,47 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.models.llama.sku_types import CoreModelId
+from llama_stack.providers.utils.inference.model_registry import build_hf_repo_model_entry
+
+MODEL_ENTRIES = [
+    build_hf_repo_model_entry(
+        "meta-llama/llama-3-3-70b-instruct",
+        CoreModelId.llama3_3_70b_instruct.value,
+    ),
+    build_hf_repo_model_entry(
+        "meta-llama/llama-2-13b-chat",
+        CoreModelId.llama2_13b.value,
+    ),
+    build_hf_repo_model_entry(
+        "meta-llama/llama-3-1-70b-instruct",
+        CoreModelId.llama3_1_70b_instruct.value,
+    ),
+    build_hf_repo_model_entry(
+        "meta-llama/llama-3-1-8b-instruct",
+        CoreModelId.llama3_1_8b_instruct.value,
+    ),
+    build_hf_repo_model_entry(
+        "meta-llama/llama-3-2-11b-vision-instruct",
+        CoreModelId.llama3_2_11b_vision_instruct.value,
+    ),
+    build_hf_repo_model_entry(
+        "meta-llama/llama-3-2-1b-instruct",
+        CoreModelId.llama3_2_1b_instruct.value,
+    ),
+    build_hf_repo_model_entry(
+        "meta-llama/llama-3-2-3b-instruct",
+        CoreModelId.llama3_2_3b_instruct.value,
+    ),
+    build_hf_repo_model_entry(
+        "meta-llama/llama-3-2-90b-vision-instruct",
+        CoreModelId.llama3_2_90b_vision_instruct.value,
+    ),
+    build_hf_repo_model_entry(
+        "meta-llama/llama-guard-3-11b-vision",
+        CoreModelId.llama_guard_3_11b_vision.value,
+    ),
+]
diff --git a/llama_stack/providers/remote/inference/watsonx/watsonx.py b/llama_stack/providers/remote/inference/watsonx/watsonx.py
new file mode 100644
index 000000000..d5d87ec01
--- /dev/null
+++ b/llama_stack/providers/remote/inference/watsonx/watsonx.py
@@ -0,0 +1,260 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+ +from typing import AsyncGenerator, List, Optional, Union + +from ibm_watson_machine_learning.foundation_models import Model +from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams + +from llama_stack.apis.common.content_types import InterleavedContent, InterleavedContentItem +from llama_stack.apis.inference import ( + ChatCompletionRequest, + ChatCompletionResponse, + CompletionRequest, + EmbeddingsResponse, + EmbeddingTaskType, + Inference, + LogProbConfig, + Message, + ResponseFormat, + SamplingParams, + TextTruncation, + ToolChoice, + ToolConfig, + ToolDefinition, + ToolPromptFormat, +) +from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper +from llama_stack.providers.utils.inference.openai_compat import ( + OpenAICompatCompletionChoice, + OpenAICompatCompletionResponse, + process_chat_completion_response, + process_chat_completion_stream_response, + process_completion_response, + process_completion_stream_response, +) +from llama_stack.providers.utils.inference.prompt_adapter import ( + chat_completion_request_to_prompt, + completion_request_to_prompt, + request_has_media, +) + +from . import WatsonXConfig +from .models import MODEL_ENTRIES + + +class WatsonXInferenceAdapter(Inference, ModelRegistryHelper): + def __init__(self, config: WatsonXConfig) -> None: + ModelRegistryHelper.__init__(self, MODEL_ENTRIES) + + print(f"Initializing watsonx InferenceAdapter({config.url})...") + + self._config = config + + self._project_id = self._config.project_id + + async def initialize(self) -> None: + pass + + async def shutdown(self) -> None: + pass + + async def completion( + self, + model_id: str, + content: InterleavedContent, + sampling_params: Optional[SamplingParams] = None, + response_format: Optional[ResponseFormat] = None, + stream: Optional[bool] = False, + logprobs: Optional[LogProbConfig] = None, + ) -> AsyncGenerator: + if sampling_params is None: + sampling_params = SamplingParams() + model = await self.model_store.get_model(model_id) + request = CompletionRequest( + model=model.provider_resource_id, + content=content, + sampling_params=sampling_params, + response_format=response_format, + stream=stream, + logprobs=logprobs, + ) + if stream: + return self._stream_completion(request) + else: + return await self._nonstream_completion(request) + + def _get_client(self, model_id) -> Model: + config_api_key = self._config.api_key.get_secret_value() if self._config.api_key else None + config_url = self._config.url + project_id = self._config.project_id + credentials = {"url": config_url, "apikey": config_api_key} + + return Model(model_id=model_id, credentials=credentials, project_id=project_id) + + async def _nonstream_completion(self, request: CompletionRequest) -> ChatCompletionResponse: + params = await self._get_params(request) + r = self._get_client(request.model).generate(**params) + choices = [] + if "results" in r: + for result in r["results"]: + choice = OpenAICompatCompletionChoice( + finish_reason=result["stop_reason"] if result["stop_reason"] else None, + text=result["generated_text"], + ) + choices.append(choice) + response = OpenAICompatCompletionResponse( + choices=choices, + ) + return process_completion_response(response) + + async def _stream_completion(self, request: CompletionRequest) -> AsyncGenerator: + params = await self._get_params(request) + + async def _generate_and_convert_to_openai_compat(): + s = self._get_client(request.model).generate_text_stream(**params) + for chunk in s: + choice = 
OpenAICompatCompletionChoice( + finish_reason=None, + text=chunk, + ) + yield OpenAICompatCompletionResponse( + choices=[choice], + ) + + stream = _generate_and_convert_to_openai_compat() + async for chunk in process_completion_stream_response(stream): + yield chunk + + async def chat_completion( + self, + model_id: str, + messages: List[Message], + sampling_params: Optional[SamplingParams] = None, + tools: Optional[List[ToolDefinition]] = None, + tool_choice: Optional[ToolChoice] = ToolChoice.auto, + tool_prompt_format: Optional[ToolPromptFormat] = None, + response_format: Optional[ResponseFormat] = None, + stream: Optional[bool] = False, + logprobs: Optional[LogProbConfig] = None, + tool_config: Optional[ToolConfig] = None, + ) -> AsyncGenerator: + if sampling_params is None: + sampling_params = SamplingParams() + model = await self.model_store.get_model(model_id) + request = ChatCompletionRequest( + model=model.provider_resource_id, + messages=messages, + sampling_params=sampling_params, + tools=tools or [], + response_format=response_format, + stream=stream, + logprobs=logprobs, + tool_config=tool_config, + ) + + if stream: + return self._stream_chat_completion(request) + else: + return await self._nonstream_chat_completion(request) + + async def _nonstream_chat_completion(self, request: ChatCompletionRequest) -> ChatCompletionResponse: + params = await self._get_params(request) + r = self._get_client(request.model).generate(**params) + choices = [] + if "results" in r: + for result in r["results"]: + choice = OpenAICompatCompletionChoice( + finish_reason=result["stop_reason"] if result["stop_reason"] else None, + text=result["generated_text"], + ) + choices.append(choice) + response = OpenAICompatCompletionResponse( + choices=choices, + ) + return process_chat_completion_response(response, request) + + async def _stream_chat_completion(self, request: ChatCompletionRequest) -> AsyncGenerator: + params = await self._get_params(request) + model_id = request.model + + # if we shift to TogetherAsyncClient, we won't need this wrapper + async def _to_async_generator(): + s = self._get_client(model_id).generate_text_stream(**params) + for chunk in s: + choice = OpenAICompatCompletionChoice( + finish_reason=None, + text=chunk, + ) + yield OpenAICompatCompletionResponse( + choices=[choice], + ) + + stream = _to_async_generator() + async for chunk in process_chat_completion_stream_response(stream, request): + yield chunk + + async def _get_params(self, request: Union[ChatCompletionRequest, CompletionRequest]) -> dict: + input_dict = {"params": {}} + media_present = request_has_media(request) + llama_model = self.get_llama_model(request.model) + if isinstance(request, ChatCompletionRequest): + input_dict["prompt"] = await chat_completion_request_to_prompt(request, llama_model) + else: + assert not media_present, "Together does not support media for Completion requests" + input_dict["prompt"] = await completion_request_to_prompt(request) + if request.sampling_params: + if request.sampling_params.strategy: + input_dict["params"][GenParams.DECODING_METHOD] = request.sampling_params.strategy.type + if request.sampling_params.max_tokens: + input_dict["params"][GenParams.MAX_NEW_TOKENS] = request.sampling_params.max_tokens + if request.sampling_params.repetition_penalty: + input_dict["params"][GenParams.REPETITION_PENALTY] = request.sampling_params.repetition_penalty + if request.sampling_params.additional_params.get("top_p"): + input_dict["params"][GenParams.TOP_P] = 
request.sampling_params.additional_params["top_p"] + if request.sampling_params.additional_params.get("top_k"): + input_dict["params"][GenParams.TOP_K] = request.sampling_params.additional_params["top_k"] + if request.sampling_params.additional_params.get("temperature"): + input_dict["params"][GenParams.TEMPERATURE] = request.sampling_params.additional_params["temperature"] + if request.sampling_params.additional_params.get("length_penalty"): + input_dict["params"][GenParams.LENGTH_PENALTY] = request.sampling_params.additional_params[ + "length_penalty" + ] + if request.sampling_params.additional_params.get("random_seed"): + input_dict["params"][GenParams.RANDOM_SEED] = request.sampling_params.additional_params["random_seed"] + if request.sampling_params.additional_params.get("min_new_tokens"): + input_dict["params"][GenParams.MIN_NEW_TOKENS] = request.sampling_params.additional_params[ + "min_new_tokens" + ] + if request.sampling_params.additional_params.get("stop_sequences"): + input_dict["params"][GenParams.STOP_SEQUENCES] = request.sampling_params.additional_params[ + "stop_sequences" + ] + if request.sampling_params.additional_params.get("time_limit"): + input_dict["params"][GenParams.TIME_LIMIT] = request.sampling_params.additional_params["time_limit"] + if request.sampling_params.additional_params.get("truncate_input_tokens"): + input_dict["params"][GenParams.TRUNCATE_INPUT_TOKENS] = request.sampling_params.additional_params[ + "truncate_input_tokens" + ] + if request.sampling_params.additional_params.get("return_options"): + input_dict["params"][GenParams.RETURN_OPTIONS] = request.sampling_params.additional_params[ + "return_options" + ] + + params = { + **input_dict, + } + return params + + async def embeddings( + self, + model_id: str, + contents: List[str] | List[InterleavedContentItem], + text_truncation: Optional[TextTruncation] = TextTruncation.none, + output_dimension: Optional[int] = None, + task_type: Optional[EmbeddingTaskType] = None, + ) -> EmbeddingsResponse: + pass diff --git a/llama_stack/templates/dependencies.json b/llama_stack/templates/dependencies.json index 63c4ecfa5..4c16411f0 100644 --- a/llama_stack/templates/dependencies.json +++ b/llama_stack/templates/dependencies.json @@ -755,5 +755,41 @@ "vllm", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" + ], + "watsonx": [ + "aiosqlite", + "autoevals", + "blobfile", + "chardet", + "datasets", + "emoji", + "faiss-cpu", + "fastapi", + "fire", + "httpx", + "ibm_watson_machine_learning", + "langdetect", + "matplotlib", + "mcp", + "nltk", + "numpy", + "openai", + "opentelemetry-exporter-otlp-proto-http", + "opentelemetry-sdk", + "pandas", + "pillow", + "psycopg2-binary", + "pymongo", + "pypdf", + "pythainlp", + "redis", + "requests", + "scikit-learn", + "scipy", + "sentencepiece", + "tqdm", + "transformers", + "tree_sitter", + "uvicorn" ] } diff --git a/llama_stack/templates/watsonx/__init__.py b/llama_stack/templates/watsonx/__init__.py new file mode 100644 index 000000000..078d86144 --- /dev/null +++ b/llama_stack/templates/watsonx/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+
+from .watsonx import get_distribution_template  # noqa: F401
diff --git a/llama_stack/templates/watsonx/build.yaml b/llama_stack/templates/watsonx/build.yaml
new file mode 100644
index 000000000..badd643ad
--- /dev/null
+++ b/llama_stack/templates/watsonx/build.yaml
@@ -0,0 +1,30 @@
+version: '2'
+distribution_spec:
+  description: Use watsonx for running LLM inference
+  providers:
+    inference:
+    - remote::watsonx
+    vector_io:
+    - inline::faiss
+    safety:
+    - inline::llama-guard
+    agents:
+    - inline::meta-reference
+    telemetry:
+    - inline::meta-reference
+    eval:
+    - inline::meta-reference
+    datasetio:
+    - remote::huggingface
+    - inline::localfs
+    scoring:
+    - inline::basic
+    - inline::llm-as-judge
+    - inline::braintrust
+    tool_runtime:
+    - remote::brave-search
+    - remote::tavily-search
+    - inline::code-interpreter
+    - inline::rag-runtime
+    - remote::model-context-protocol
+image_type: conda
diff --git a/llama_stack/templates/watsonx/doc_template.md b/llama_stack/templates/watsonx/doc_template.md
new file mode 100644
index 000000000..af0ae15a8
--- /dev/null
+++ b/llama_stack/templates/watsonx/doc_template.md
@@ -0,0 +1,74 @@
+---
+orphan: true
+---
+# watsonx Distribution
+
+```{toctree}
+:maxdepth: 2
+:hidden:
+
+self
+```
+
+The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations.
+
+{{ providers_table }}
+
+{% if run_config_env_vars %}
+
+### Environment Variables
+
+The following environment variables can be configured:
+
+{% for var, (default_value, description) in run_config_env_vars.items() %}
+- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
+{% endfor %}
+{% endif %}
+
+{% if default_models %}
+### Models
+
+The following models are available by default:
+
+{% for model in default_models %}
+- `{{ model.model_id }} {{ model.doc_string }}`
+{% endfor %}
+{% endif %}
+
+
+### Prerequisite: API Keys
+
+Make sure you have access to a watsonx API key. You can obtain one by following the instructions at [watsonx.ai](https://www.ibm.com/docs/en/masv-and-l/maximo-manage/continuous-delivery?topic=setup-create-watsonx-api-key).
+
+
+## Running Llama Stack with watsonx
+
+You can do this via Conda (build the code yourself), venv, or Docker, which has a pre-built image.
+
+### Via Docker
+
+This method allows you to get started quickly without having to build the distribution code.
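+
+Before running the container, export your watsonx credentials so they can be passed through with `--env`. The values below are placeholders; the base URL shown is the default `us-south` endpoint from the provider config and may need to be changed for your region:
+
+```bash
+export WATSONX_API_KEY=<your watsonx API key>
+export WATSONX_PROJECT_ID=<your watsonx project ID>
+export WATSONX_BASE_URL=https://us-south.ml.cloud.ibm.com
+```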
+ +```bash +LLAMA_STACK_PORT=5001 +docker run \ + -it \ + -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ + -v ./run.yaml:/root/my-run.yaml \ + llamastack/distribution-{{ name }} \ + --yaml-config /root/my-run.yaml \ + --port $LLAMA_STACK_PORT \ + --env WATSONX_API_KEY=$WATSONX_API_KEY \ + --env WATSONX_PROJECT_ID=$WATSONX_PROJECT_ID \ + --env WATSONX_BASE_URL=$WATSONX_BASE_URL +``` + +### Via Conda + +```bash +llama stack build --template watsonx --image-type conda +llama stack run ./run.yaml \ + --port $LLAMA_STACK_PORT \ + --env WATSONX_API_KEY=$WATSONX_API_KEY \ + --env WATSONX_PROJECT_ID=$WATSONX_PROJECT_ID +``` diff --git a/llama_stack/templates/watsonx/run.yaml b/llama_stack/templates/watsonx/run.yaml new file mode 100644 index 000000000..1048f7192 --- /dev/null +++ b/llama_stack/templates/watsonx/run.yaml @@ -0,0 +1,210 @@ +version: '2' +image_name: watsonx +apis: +- agents +- datasetio +- eval +- inference +- safety +- scoring +- telemetry +- tool_runtime +- vector_io +providers: + inference: + - provider_id: watsonx + provider_type: remote::watsonx + config: + url: ${env.WATSONX_BASE_URL:https://us-south.ml.cloud.ibm.com} + api_key: ${env.WATSONX_API_KEY:} + project_id: ${env.WATSONX_PROJECT_ID:} + vector_io: + - provider_id: faiss + provider_type: inline::faiss + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/watsonx}/faiss_store.db + safety: + - provider_id: llama-guard + provider_type: inline::llama-guard + config: + excluded_categories: [] + agents: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + persistence_store: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/watsonx}/agents_store.db + telemetry: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + service_name: "${env.OTEL_SERVICE_NAME:\u200B}" + sinks: ${env.TELEMETRY_SINKS:console,sqlite} + sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/watsonx/trace_store.db} + eval: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/watsonx}/meta_reference_eval.db + datasetio: + - provider_id: huggingface + provider_type: remote::huggingface + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/watsonx}/huggingface_datasetio.db + - provider_id: localfs + provider_type: inline::localfs + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/watsonx}/localfs_datasetio.db + scoring: + - provider_id: basic + provider_type: inline::basic + config: {} + - provider_id: llm-as-judge + provider_type: inline::llm-as-judge + config: {} + - provider_id: braintrust + provider_type: inline::braintrust + config: + openai_api_key: ${env.OPENAI_API_KEY:} + tool_runtime: + - provider_id: brave-search + provider_type: remote::brave-search + config: + api_key: ${env.BRAVE_SEARCH_API_KEY:} + max_results: 3 + - provider_id: tavily-search + provider_type: remote::tavily-search + config: + api_key: ${env.TAVILY_SEARCH_API_KEY:} + max_results: 3 + - provider_id: code-interpreter + provider_type: inline::code-interpreter + config: {} + - provider_id: rag-runtime + provider_type: inline::rag-runtime + config: {} + - provider_id: model-context-protocol + provider_type: remote::model-context-protocol + config: {} +metadata_store: + type: sqlite + 
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/watsonx}/registry.db +models: +- metadata: {} + model_id: meta-llama/llama-3-3-70b-instruct + provider_id: watsonx + provider_model_id: meta-llama/llama-3-3-70b-instruct + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.3-70B-Instruct + provider_id: watsonx + provider_model_id: meta-llama/llama-3-3-70b-instruct + model_type: llm +- metadata: {} + model_id: meta-llama/llama-2-13b-chat + provider_id: watsonx + provider_model_id: meta-llama/llama-2-13b-chat + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-2-13b + provider_id: watsonx + provider_model_id: meta-llama/llama-2-13b-chat + model_type: llm +- metadata: {} + model_id: meta-llama/llama-3-1-70b-instruct + provider_id: watsonx + provider_model_id: meta-llama/llama-3-1-70b-instruct + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.1-70B-Instruct + provider_id: watsonx + provider_model_id: meta-llama/llama-3-1-70b-instruct + model_type: llm +- metadata: {} + model_id: meta-llama/llama-3-1-8b-instruct + provider_id: watsonx + provider_model_id: meta-llama/llama-3-1-8b-instruct + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.1-8B-Instruct + provider_id: watsonx + provider_model_id: meta-llama/llama-3-1-8b-instruct + model_type: llm +- metadata: {} + model_id: meta-llama/llama-3-2-11b-vision-instruct + provider_id: watsonx + provider_model_id: meta-llama/llama-3-2-11b-vision-instruct + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.2-11B-Vision-Instruct + provider_id: watsonx + provider_model_id: meta-llama/llama-3-2-11b-vision-instruct + model_type: llm +- metadata: {} + model_id: meta-llama/llama-3-2-1b-instruct + provider_id: watsonx + provider_model_id: meta-llama/llama-3-2-1b-instruct + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.2-1B-Instruct + provider_id: watsonx + provider_model_id: meta-llama/llama-3-2-1b-instruct + model_type: llm +- metadata: {} + model_id: meta-llama/llama-3-2-3b-instruct + provider_id: watsonx + provider_model_id: meta-llama/llama-3-2-3b-instruct + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.2-3B-Instruct + provider_id: watsonx + provider_model_id: meta-llama/llama-3-2-3b-instruct + model_type: llm +- metadata: {} + model_id: meta-llama/llama-3-2-90b-vision-instruct + provider_id: watsonx + provider_model_id: meta-llama/llama-3-2-90b-vision-instruct + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.2-90B-Vision-Instruct + provider_id: watsonx + provider_model_id: meta-llama/llama-3-2-90b-vision-instruct + model_type: llm +- metadata: {} + model_id: meta-llama/llama-guard-3-11b-vision + provider_id: watsonx + provider_model_id: meta-llama/llama-guard-3-11b-vision + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-Guard-3-11B-Vision + provider_id: watsonx + provider_model_id: meta-llama/llama-guard-3-11b-vision + model_type: llm +shields: [] +vector_dbs: [] +datasets: [] +scoring_fns: [] +benchmarks: [] +tool_groups: +- toolgroup_id: builtin::websearch + provider_id: tavily-search +- toolgroup_id: builtin::rag + provider_id: rag-runtime +- toolgroup_id: builtin::code_interpreter + provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/watsonx/watsonx.py b/llama_stack/templates/watsonx/watsonx.py new file mode 100644 index 000000000..d59bb6f20 --- /dev/null +++ b/llama_stack/templates/watsonx/watsonx.py @@ -0,0 +1,90 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. 
+# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from pathlib import Path + +from llama_stack.distribution.datatypes import Provider, ToolGroupInput +from llama_stack.providers.remote.inference.watsonx import WatsonXConfig +from llama_stack.providers.remote.inference.watsonx.models import MODEL_ENTRIES +from llama_stack.templates.template import DistributionTemplate, RunConfigSettings, get_model_registry + + +def get_distribution_template() -> DistributionTemplate: + providers = { + "inference": ["remote::watsonx"], + "vector_io": ["inline::faiss"], + "safety": ["inline::llama-guard"], + "agents": ["inline::meta-reference"], + "telemetry": ["inline::meta-reference"], + "eval": ["inline::meta-reference"], + "datasetio": ["remote::huggingface", "inline::localfs"], + "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], + "tool_runtime": [ + "remote::brave-search", + "remote::tavily-search", + "inline::code-interpreter", + "inline::rag-runtime", + "remote::model-context-protocol", + ], + } + + inference_provider = Provider( + provider_id="watsonx", + provider_type="remote::watsonx", + config=WatsonXConfig.sample_run_config(), + ) + + available_models = { + "watsonx": MODEL_ENTRIES, + } + default_tool_groups = [ + ToolGroupInput( + toolgroup_id="builtin::websearch", + provider_id="tavily-search", + ), + ToolGroupInput( + toolgroup_id="builtin::rag", + provider_id="rag-runtime", + ), + ToolGroupInput( + toolgroup_id="builtin::code_interpreter", + provider_id="code-interpreter", + ), + ] + + default_models = get_model_registry(available_models) + return DistributionTemplate( + name="watsonx", + distro_type="remote_hosted", + description="Use watsonx for running LLM inference", + container_image=None, + template_path=Path(__file__).parent / "doc_template.md", + providers=providers, + available_models_by_provider=available_models, + run_configs={ + "run.yaml": RunConfigSettings( + provider_overrides={ + "inference": [inference_provider], + }, + default_models=default_models, + default_tool_groups=default_tool_groups, + ), + }, + run_config_env_vars={ + "LLAMASTACK_PORT": ( + "5001", + "Port for the Llama Stack distribution server", + ), + "WATSONX_API_KEY": ( + "", + "watsonx API Key", + ), + "WATSONX_PROJECT_ID": ( + "", + "watsonx Project ID", + ), + }, + ) diff --git a/pyproject.toml b/pyproject.toml index 209367c4b..d661f45fb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -274,6 +274,7 @@ exclude = [ "^llama_stack/providers/remote/inference/sample/", "^llama_stack/providers/remote/inference/tgi/", "^llama_stack/providers/remote/inference/together/", + "^llama_stack/providers/remote/inference/watsonx/", "^llama_stack/providers/remote/safety/bedrock/", "^llama_stack/providers/remote/safety/nvidia/", "^llama_stack/providers/remote/safety/sample/",
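For reviewers who want to smoke-test the new provider once the `watsonx` distribution is running, a minimal client-side sketch follows. It is not part of the patch: the `llama-stack-client` package, the local `base_url`/port, and the model alias are assumptions taken from the run.yaml above and may need adjusting.

```python
# Minimal smoke test for the watsonx provider (a sketch, not part of this patch).
# Assumes the distribution is running locally on port 5001 and that the
# `llama-stack-client` package is installed; adjust if the client API differs.
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:5001")

# "meta-llama/Llama-3.3-70B-Instruct" is one of the aliases registered in run.yaml.
response = client.inference.chat_completion(
    model_id="meta-llama/Llama-3.3-70B-Instruct",
    messages=[{"role": "user", "content": "Say hello from watsonx."}],
)
print(response.completion_message.content)
```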