From dd86427ce3ef182c57292819a56a46e97a7da624 Mon Sep 17 00:00:00 2001
From: Jeff MAURY
Date: Thu, 20 Mar 2025 16:09:15 +0100
Subject: [PATCH] feat: Podman AI Lab provider and distribution

Signed-off-by: Jeff MAURY
---
 .../self_hosted_distro/podman-ai-lab.md       | 141 +++++++++
 llama_stack/providers/registry/inference.py   |  10 +
 .../inference/podman_ai_lab/__init__.py       |  18 ++
 .../remote/inference/podman_ai_lab/config.py  |  21 ++
 .../inference/podman_ai_lab/podman_ai_lab.py  | 294 ++++++++++++++++++
 llama_stack/templates/dependencies.json       |  38 +++
 .../templates/podman-ai-lab/__init__.py       |   7 +
 .../templates/podman-ai-lab/build.yaml        |  33 ++
 .../templates/podman-ai-lab/doc_template.md   | 131 ++++++++
 .../templates/podman-ai-lab/podman_ai_lab.py  | 137 ++++++++
 llama_stack/templates/podman-ai-lab/report.md |  44 +++
 .../podman-ai-lab/run-with-safety.yaml        | 133 ++++++++
 llama_stack/templates/podman-ai-lab/run.yaml  | 123 ++++++++
 pyproject.toml                                |   1 +
 14 files changed, 1131 insertions(+)
 create mode 100644 docs/source/distributions/self_hosted_distro/podman-ai-lab.md
 create mode 100644 llama_stack/providers/remote/inference/podman_ai_lab/__init__.py
 create mode 100644 llama_stack/providers/remote/inference/podman_ai_lab/config.py
 create mode 100644 llama_stack/providers/remote/inference/podman_ai_lab/podman_ai_lab.py
 create mode 100644 llama_stack/templates/podman-ai-lab/__init__.py
 create mode 100644 llama_stack/templates/podman-ai-lab/build.yaml
 create mode 100644 llama_stack/templates/podman-ai-lab/doc_template.md
 create mode 100644 llama_stack/templates/podman-ai-lab/podman_ai_lab.py
 create mode 100644 llama_stack/templates/podman-ai-lab/report.md
 create mode 100644 llama_stack/templates/podman-ai-lab/run-with-safety.yaml
 create mode 100644 llama_stack/templates/podman-ai-lab/run.yaml

diff --git a/docs/source/distributions/self_hosted_distro/podman-ai-lab.md b/docs/source/distributions/self_hosted_distro/podman-ai-lab.md
new file mode 100644
index 000000000..ec4371154
--- /dev/null
+++ b/docs/source/distributions/self_hosted_distro/podman-ai-lab.md
@@ -0,0 +1,141 @@
+---
+orphan: true
+---
+
+# Podman AI Lab Distribution
+
+```{toctree}
+:maxdepth: 2
+:hidden:
+
+self
+```
+
+The `llamastack/distribution-podman-ai-lab` distribution consists of the following provider configurations.
+
+| API | Provider(s) |
+|-----|-------------|
+| agents | `inline::meta-reference` |
+| datasetio | `remote::huggingface`, `inline::localfs` |
+| eval | `inline::meta-reference` |
+| inference | `remote::podman-ai-lab` |
+| safety | `inline::llama-guard` |
+| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
+| telemetry | `inline::meta-reference` |
+| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol`, `remote::wolfram-alpha` |
+| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
+
+
+You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you do have powerful GPUs, you can still use this distribution, since Podman AI Lab supports GPU acceleration.
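+
+Once the stack is running (see the setup steps below), you can exercise the inference provider with the `llama-stack-client` Python SDK. The snippet below is a minimal sketch; the base URL and model identifier are assumptions, so adjust the port to your `LLAMA_STACK_PORT` and use a model identifier actually served by Podman AI Lab (for example, one shown by `llama-stack-client models list`).
+
+```python
+from llama_stack_client import LlamaStackClient
+
+# Point the client at the running Llama Stack server (LLAMA_STACK_PORT, 5001 in the examples below).
+client = LlamaStackClient(base_url="http://localhost:5001")
+
+# Use an identifier reported by `llama-stack-client models list` (illustrative value).
+model_id = "ibm-research/granite-3.2-8b-instruct-GGUF"
+
+response = client.inference.chat_completion(
+    model_id=model_id,
+    messages=[{"role": "user", "content": "Say hello from Podman AI Lab."}],
+)
+print(response.completion_message.content)
+```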
+
+### Environment Variables
+
+The following environment variables can be configured:
+
+- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
+- `PODMAN_AI_LAB_URL`: URL of the Podman AI Lab server (default: `http://127.0.0.1:10434`)
+- `SAFETY_MODEL`: Safety model loaded into the Podman AI Lab server (default: `meta-llama/Llama-Guard-3-1B`)
+
+
+## Setting up Podman AI Lab server
+
+Please check the [Podman AI Lab Documentation](https://github.com/containers/podman-desktop-extension-ai-lab) on how to install and run Podman AI Lab. After installing the extension, download a model and start an inference server for it.
+
+
+If you are using Llama Stack Safety / Shield APIs, you will also need to pull and run the safety model.
+
+```bash
+export SAFETY_MODEL="meta-llama/Llama-Guard-3-1B"
+
+# Podman AI Lab may expose this model under a different name; use that name when loading the model
+export PODMAN_AI_LAB_SAFETY_MODEL="llama-guard3:1b"
+```
+
+## Running Llama Stack
+
+Now you are ready to run Llama Stack with Podman AI Lab as the inference provider. You can do this via Conda (build the code yourself) or Docker, which has a pre-built image.
+
+### Via Docker
+
+This method allows you to get started quickly without having to build the distribution code.
+
+```bash
+export LLAMA_STACK_PORT=5001
+docker run \
+  -it \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v ~/.llama:/root/.llama \
+  llamastack/distribution-podman-ai-lab \
+  --port $LLAMA_STACK_PORT \
+  --env PODMAN_AI_LAB_URL=http://host.docker.internal:10434
+```
+
+If you are using Llama Stack Safety / Shield APIs, use:
+
+```bash
+# You need a local checkout of llama-stack to run this, get it using
+# git clone https://github.com/meta-llama/llama-stack.git
+cd /path/to/llama-stack
+
+docker run \
+  -it \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v ~/.llama:/root/.llama \
+  -v ./llama_stack/templates/podman-ai-lab/run-with-safety.yaml:/root/my-run.yaml \
+  llamastack/distribution-podman-ai-lab \
+  --yaml-config /root/my-run.yaml \
+  --port $LLAMA_STACK_PORT \
+  --env SAFETY_MODEL=$SAFETY_MODEL \
+  --env PODMAN_AI_LAB_URL=http://host.docker.internal:10434
+```
+
+### Via Conda
+
+Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available.
+
+```bash
+export LLAMA_STACK_PORT=5001
+
+llama stack build --template podman-ai-lab --image-type conda
+llama stack run ./run.yaml \
+  --port $LLAMA_STACK_PORT \
+  --env PODMAN_AI_LAB_URL=http://localhost:10434
+```
+
+If you are using Llama Stack Safety / Shield APIs, use:
+
+```bash
+llama stack run ./run-with-safety.yaml \
+  --port $LLAMA_STACK_PORT \
+  --env SAFETY_MODEL=$SAFETY_MODEL \
+  --env PODMAN_AI_LAB_URL=http://localhost:10434
+```
+
+
+### (Optional) Update Model Serving Configuration
+
+To serve a new model with `Podman AI Lab`:
+- launch Podman Desktop with the Podman AI Lab extension installed
+- download the model
+- start an inference server for the model
+
+To make sure that the model is being served correctly, run `curl localhost:10434/api/tags` to get a list of models being served by Podman AI Lab.
+``` +$ curl localhost:10434/api/tags +{"models":[{"model":"hf.ibm-research.granite-3.2-8b-instruct-GGUF","name":"ibm-research/granite-3.2-8b-instruct-GGUF","digest":"363f0bbc3200b9c9b0ab87efe237d77b1e05bb929d5d7e4b57c1447c911223e8","size":4942859552,"modified_at":"2025-03-17T14:48:32.417Z","details":{}}]} +``` + +To verify that the model served by Podman AI Lab is correctly connected to Llama Stack server +```bash +$ llama-stack-client models list + +Available Models + +┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┓ +┃ model_type ┃ identifier ┃ provider_resource_id ┃ metadata ┃ provider_id ┃ +┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━┩ +│ llm │ ibm-research/granite-3.2-8b-instruct-GGUF │ ibm-research/granite-3.2-8b-instruct-GGUF │ │ podman-ai-lab │ +└──────────────┴────────────────────────────────────────────────┴───────────────────────────────────────────────┴───────────┴────────────────┘ + +Total models: 1 +``` diff --git a/llama_stack/providers/registry/inference.py b/llama_stack/providers/registry/inference.py index 3c54cabcf..c540ade43 100644 --- a/llama_stack/providers/registry/inference.py +++ b/llama_stack/providers/registry/inference.py @@ -77,6 +77,16 @@ def available_providers() -> List[ProviderSpec]: module="llama_stack.providers.remote.inference.ollama", ), ), + remote_provider_spec( + api=Api.inference, + api_dependencies=[Api.models], + adapter=AdapterSpec( + adapter_type="podman-ai-lab", + pip_packages=["ollama", "aiohttp"], + config_class="llama_stack.providers.remote.inference.podman_ai_lab.PodmanAILabImplConfig", + module="llama_stack.providers.remote.inference.podman_ai_lab", + ), + ), remote_provider_spec( api=Api.inference, adapter=AdapterSpec( diff --git a/llama_stack/providers/remote/inference/podman_ai_lab/__init__.py b/llama_stack/providers/remote/inference/podman_ai_lab/__init__.py new file mode 100644 index 000000000..073b6f71d --- /dev/null +++ b/llama_stack/providers/remote/inference/podman_ai_lab/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. +from typing import Any, Dict + +from llama_stack.apis.datatypes import Api + +from .config import PodmanAILabImplConfig + + +async def get_adapter_impl(config: PodmanAILabImplConfig, deps: Dict[Api, Any]): + from .podman_ai_lab import PodmanAILabInferenceAdapter + + impl = PodmanAILabInferenceAdapter(config.url, deps[Api.models]) + await impl.initialize() + return impl diff --git a/llama_stack/providers/remote/inference/podman_ai_lab/config.py b/llama_stack/providers/remote/inference/podman_ai_lab/config.py new file mode 100644 index 000000000..4ac20ddc7 --- /dev/null +++ b/llama_stack/providers/remote/inference/podman_ai_lab/config.py @@ -0,0 +1,21 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +from typing import Any, Dict + +from pydantic import BaseModel + +DEFAULT_PODMAN_AI_LAB_URL = "http://localhost:10434" + + +class PodmanAILabImplConfig(BaseModel): + url: str = DEFAULT_PODMAN_AI_LAB_URL + + @classmethod + def sample_run_config( + cls, url: str = "${env.PODMAN_AI_LAB_URL:http://localhost:10434}", **kwargsi + ) -> Dict[str, Any]: + return {"url": url} diff --git a/llama_stack/providers/remote/inference/podman_ai_lab/podman_ai_lab.py b/llama_stack/providers/remote/inference/podman_ai_lab/podman_ai_lab.py new file mode 100644 index 000000000..61fbddfa9 --- /dev/null +++ b/llama_stack/providers/remote/inference/podman_ai_lab/podman_ai_lab.py @@ -0,0 +1,294 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + + +from typing import AsyncGenerator, List, Optional, Union + +from ollama import AsyncClient + +from llama_stack.apis.common.content_types import ( + ImageContentItem, + InterleavedContent, + InterleavedContentItem, + TextContentItem, +) +from llama_stack.apis.inference import ( + ChatCompletionRequest, + ChatCompletionResponse, + CompletionRequest, + EmbeddingsResponse, + EmbeddingTaskType, + Inference, + LogProbConfig, + Message, + ResponseFormat, + SamplingParams, + TextTruncation, + ToolChoice, + ToolConfig, + ToolDefinition, + ToolPromptFormat, +) +from llama_stack.apis.models import Model, Models +from llama_stack.log import get_logger +from llama_stack.providers.datatypes import ModelsProtocolPrivate +from llama_stack.providers.utils.inference.openai_compat import ( + OpenAICompatCompletionChoice, + OpenAICompatCompletionResponse, + get_sampling_options, + process_chat_completion_response, + process_chat_completion_stream_response, + process_completion_response, + process_completion_stream_response, +) +from llama_stack.providers.utils.inference.prompt_adapter import ( + chat_completion_request_to_prompt, + completion_request_to_prompt, + convert_image_content_to_url, + request_has_media, +) + +logger = get_logger(name=__name__, category="inference") + + +class PodmanAILabInferenceAdapter(Inference, ModelsProtocolPrivate): + def __init__(self, url: str, models: Models) -> None: + self.url = url + self.models = models + + @property + def client(self) -> AsyncClient: + return AsyncClient(host=self.url) + + async def initialize(self) -> None: + logger.info(f"checking connectivity to Podman AI Lab at `{self.url}`...") + try: + await self.client.list() + # for model in response["models"]: + # await self.models.register_model(model.model, model.model, 'podman-ai-lab') + except ConnectionError as e: + raise RuntimeError("Podman AI Lab Server is not running, start it using Podman Desktop") from e + + async def shutdown(self) -> None: + pass + + async def unregister_model(self, model_id: str) -> None: + pass + + async def completion( + self, + model_id: str, + content: InterleavedContent, + sampling_params: Optional[SamplingParams] = None, + response_format: Optional[ResponseFormat] = None, + stream: Optional[bool] = False, + logprobs: Optional[LogProbConfig] = None, + ) -> AsyncGenerator: + if sampling_params is None: + sampling_params = SamplingParams() + model = await self.model_store.get_model(model_id) + request = CompletionRequest( + model=model.provider_resource_id, + content=content, + sampling_params=sampling_params, + response_format=response_format, + stream=stream, + logprobs=logprobs, + ) + if stream: + 
return self._stream_completion(request) + else: + return await self._nonstream_completion(request) + + async def _stream_completion(self, request: CompletionRequest) -> AsyncGenerator: + params = await self._get_params(request) + + async def _generate_and_convert_to_openai_compat(): + s = await self.client.generate(**params) + async for chunk in s: + choice = OpenAICompatCompletionChoice( + finish_reason=chunk["done_reason"] if chunk["done"] else None, + text=chunk["response"], + ) + yield OpenAICompatCompletionResponse( + choices=[choice], + ) + + stream = _generate_and_convert_to_openai_compat() + async for chunk in process_completion_stream_response(stream): + yield chunk + + async def _nonstream_completion(self, request: CompletionRequest) -> AsyncGenerator: + params = await self._get_params(request) + r = await self.client.generate(**params) + + choice = OpenAICompatCompletionChoice( + finish_reason=r["done_reason"] if r["done"] else None, + text=r["response"], + ) + response = OpenAICompatCompletionResponse( + choices=[choice], + ) + + return process_completion_response(response) + + async def chat_completion( + self, + model_id: str, + messages: List[Message], + sampling_params: Optional[SamplingParams] = None, + response_format: Optional[ResponseFormat] = None, + tools: Optional[List[ToolDefinition]] = None, + tool_choice: Optional[ToolChoice] = ToolChoice.auto, + tool_prompt_format: Optional[ToolPromptFormat] = None, + stream: Optional[bool] = False, + logprobs: Optional[LogProbConfig] = None, + tool_config: Optional[ToolConfig] = None, + ) -> AsyncGenerator: + if sampling_params is None: + sampling_params = SamplingParams() + model = await self.model_store.get_model(model_id) + request = ChatCompletionRequest( + model=model.provider_resource_id, + messages=messages, + sampling_params=sampling_params, + tools=tools or [], + stream=stream, + logprobs=logprobs, + response_format=response_format, + tool_config=tool_config, + ) + if stream: + return self._stream_chat_completion(request) + else: + return await self._nonstream_chat_completion(request) + + async def _get_params(self, request: Union[ChatCompletionRequest, CompletionRequest]) -> dict: + sampling_options = get_sampling_options(request.sampling_params) + # This is needed since the Ollama API expects num_predict to be set + # for early truncation instead of max_tokens. 
+ if sampling_options.get("max_tokens") is not None: + sampling_options["num_predict"] = sampling_options["max_tokens"] + + input_dict = {} + media_present = request_has_media(request) + llama_model = self.register_helper.get_llama_model(request.model) + if isinstance(request, ChatCompletionRequest): + if media_present or not llama_model: + contents = [await convert_message_to_openai_dict_for_podman_ai_lab(m) for m in request.messages] + # flatten the list of lists + input_dict["messages"] = [item for sublist in contents for item in sublist] + else: + input_dict["raw"] = True + input_dict["prompt"] = await chat_completion_request_to_prompt( + request, + llama_model, + ) + else: + assert not media_present, "Ollama does not support media for Completion requests" + input_dict["prompt"] = await completion_request_to_prompt(request) + input_dict["raw"] = True + + if fmt := request.response_format: + if fmt.type == "json_schema": + input_dict["format"] = fmt.json_schema + elif fmt.type == "grammar": + raise NotImplementedError("Grammar response format is not supported") + else: + raise ValueError(f"Unknown response format type: {fmt.type}") + + params = { + "model": request.model, + **input_dict, + "options": sampling_options, + "stream": request.stream, + } + logger.debug(f"params to Podman AI Lab: {params}") + + return params + + async def _nonstream_chat_completion(self, request: ChatCompletionRequest) -> ChatCompletionResponse: + params = await self._get_params(request) + if "messages" in params: + r = await self.client.chat(**params) + else: + r = await self.client.generate(**params) + + if "message" in r: + choice = OpenAICompatCompletionChoice( + finish_reason=r["done_reason"] if r["done"] else None, + text=r["message"]["content"], + ) + else: + choice = OpenAICompatCompletionChoice( + finish_reason=r["done_reason"] if r["done"] else None, + text=r["response"], + ) + response = OpenAICompatCompletionResponse( + choices=[choice], + ) + return process_chat_completion_response(response, request) + + async def _stream_chat_completion(self, request: ChatCompletionRequest) -> AsyncGenerator: + params = await self._get_params(request) + + async def _generate_and_convert_to_openai_compat(): + if "messages" in params: + s = await self.client.chat(**params) + else: + s = await self.client.generate(**params) + async for chunk in s: + if "message" in chunk: + choice = OpenAICompatCompletionChoice( + finish_reason=chunk["done_reason"] if chunk["done"] else None, + text=chunk["message"]["content"], + ) + else: + choice = OpenAICompatCompletionChoice( + finish_reason=chunk["done_reason"] if chunk["done"] else None, + text=chunk["response"], + ) + yield OpenAICompatCompletionResponse( + choices=[choice], + ) + + stream = _generate_and_convert_to_openai_compat() + async for chunk in process_chat_completion_stream_response(stream, request): + yield chunk + + async def embeddings( + self, + model_id: str, + contents: List[str] | List[InterleavedContentItem], + text_truncation: Optional[TextTruncation] = TextTruncation.none, + output_dimension: Optional[int] = None, + task_type: Optional[EmbeddingTaskType] = None, + ) -> EmbeddingsResponse: + raise NotImplementedError("embeddings endpoint is not implemented") + + async def register_model(self, model: Model) -> Model: + return model + + +async def convert_message_to_openai_dict_for_podman_ai_lab(message: Message) -> List[dict]: + async def _convert_content(content) -> dict: + if isinstance(content, ImageContentItem): + return { + "role": message.role, + 
"images": [await convert_image_content_to_url(content, download=True, include_format=False)], + } + else: + text = content.text if isinstance(content, TextContentItem) else content + assert isinstance(text, str) + return { + "role": message.role, + "content": text, + } + + if isinstance(message.content, list): + return [await _convert_content(c) for c in message.content] + else: + return [await _convert_content(message.content)] diff --git a/llama_stack/templates/dependencies.json b/llama_stack/templates/dependencies.json index b96191752..78de764ec 100644 --- a/llama_stack/templates/dependencies.json +++ b/llama_stack/templates/dependencies.json @@ -536,6 +536,44 @@ "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" ], + "podman-ai-lab": [ + "aiohttp", + "aiosqlite", + "autoevals", + "blobfile", + "chardet", + "chromadb-client", + "datasets", + "emoji", + "faiss-cpu", + "fastapi", + "fire", + "httpx", + "langdetect", + "matplotlib", + "mcp", + "nltk", + "numpy", + "ollama", + "openai", + "opentelemetry-exporter-otlp-proto-http", + "opentelemetry-sdk", + "pandas", + "pillow", + "psycopg2-binary", + "pymongo", + "pypdf", + "pythainlp", + "redis", + "requests", + "scikit-learn", + "scipy", + "sentencepiece", + "tqdm", + "transformers", + "tree_sitter", + "uvicorn" + ], "remote-vllm": [ "aiosqlite", "autoevals", diff --git a/llama_stack/templates/podman-ai-lab/__init__.py b/llama_stack/templates/podman-ai-lab/__init__.py new file mode 100644 index 000000000..be35a74ac --- /dev/null +++ b/llama_stack/templates/podman-ai-lab/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from .podman_ai_lab import get_distribution_template # noqa: F401 diff --git a/llama_stack/templates/podman-ai-lab/build.yaml b/llama_stack/templates/podman-ai-lab/build.yaml new file mode 100644 index 000000000..6d2f28279 --- /dev/null +++ b/llama_stack/templates/podman-ai-lab/build.yaml @@ -0,0 +1,33 @@ +version: '2' +distribution_spec: + description: Use (an external) Podman AI Lab server for running LLM inference + providers: + inference: + - remote::podman-ai-lab + vector_io: + - inline::faiss + - remote::chromadb + - remote::pgvector + safety: + - inline::llama-guard + agents: + - inline::meta-reference + telemetry: + - inline::meta-reference + eval: + - inline::meta-reference + datasetio: + - remote::huggingface + - inline::localfs + scoring: + - inline::basic + - inline::llm-as-judge + - inline::braintrust + tool_runtime: + - remote::brave-search + - remote::tavily-search + - inline::code-interpreter + - inline::rag-runtime + - remote::model-context-protocol + - remote::wolfram-alpha +image_type: conda diff --git a/llama_stack/templates/podman-ai-lab/doc_template.md b/llama_stack/templates/podman-ai-lab/doc_template.md new file mode 100644 index 000000000..a09aa857a --- /dev/null +++ b/llama_stack/templates/podman-ai-lab/doc_template.md @@ -0,0 +1,131 @@ +--- +orphan: true +--- +# Podman AI Lab Distribution + +```{toctree} +:maxdepth: 2 +:hidden: + +self +``` + +The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations. + +{{ providers_table }} + +You should use this distribution if you have a regular desktop machine without very powerful GPUs. 
Of course, if you do have powerful GPUs, you can still use this distribution, since Podman AI Lab supports GPU acceleration.
+
+{% if run_config_env_vars %}
+### Environment Variables
+
+The following environment variables can be configured:
+
+{% for var, (default_value, description) in run_config_env_vars.items() %}
+- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
+{% endfor %}
+{% endif %}
+
+
+## Setting up Podman AI Lab server
+
+Please check the [Podman AI Lab Documentation](https://github.com/containers/podman-desktop-extension-ai-lab) on how to install and run Podman AI Lab. After installing the extension, download a model and start an inference server for it.
+
+
+If you are using Llama Stack Safety / Shield APIs, you will also need to pull and run the safety model.
+
+```bash
+export SAFETY_MODEL="meta-llama/Llama-Guard-3-1B"
+
+# Podman AI Lab may expose this model under a different name; use that name when loading the model
+export PODMAN_AI_LAB_SAFETY_MODEL="llama-guard3:1b"
+```
+
+## Running Llama Stack
+
+Now you are ready to run Llama Stack with Podman AI Lab as the inference provider. You can do this via Conda (build the code yourself) or Docker, which has a pre-built image.
+
+### Via Docker
+
+This method allows you to get started quickly without having to build the distribution code.
+
+```bash
+export LLAMA_STACK_PORT=5001
+docker run \
+  -it \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v ~/.llama:/root/.llama \
+  llamastack/distribution-{{ name }} \
+  --port $LLAMA_STACK_PORT \
+  --env PODMAN_AI_LAB_URL=http://host.docker.internal:10434
+```
+
+If you are using Llama Stack Safety / Shield APIs, use:
+
+```bash
+# You need a local checkout of llama-stack to run this, get it using
+# git clone https://github.com/meta-llama/llama-stack.git
+cd /path/to/llama-stack
+
+docker run \
+  -it \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v ~/.llama:/root/.llama \
+  -v ./llama_stack/templates/podman-ai-lab/run-with-safety.yaml:/root/my-run.yaml \
+  llamastack/distribution-{{ name }} \
+  --yaml-config /root/my-run.yaml \
+  --port $LLAMA_STACK_PORT \
+  --env SAFETY_MODEL=$SAFETY_MODEL \
+  --env PODMAN_AI_LAB_URL=http://host.docker.internal:10434
+```
+
+### Via Conda
+
+Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available.
+
+```bash
+export LLAMA_STACK_PORT=5001
+
+llama stack build --template {{ name }} --image-type conda
+llama stack run ./run.yaml \
+  --port $LLAMA_STACK_PORT \
+  --env PODMAN_AI_LAB_URL=http://localhost:10434
+```
+
+If you are using Llama Stack Safety / Shield APIs, use:
+
+```bash
+llama stack run ./run-with-safety.yaml \
+  --port $LLAMA_STACK_PORT \
+  --env SAFETY_MODEL=$SAFETY_MODEL \
+  --env PODMAN_AI_LAB_URL=http://localhost:10434
+```
+
+
+### (Optional) Update Model Serving Configuration
+
+To serve a new model with `Podman AI Lab`:
+- launch Podman Desktop with the Podman AI Lab extension installed
+- download the model
+- start an inference server for the model
+
+To make sure that the model is being served correctly, run `curl localhost:10434/api/tags` to get a list of models being served by Podman AI Lab.
+``` +$ curl localhost:10434/api/tags +{"models":[{"model":"hf.ibm-research.granite-3.2-8b-instruct-GGUF","name":"ibm-research/granite-3.2-8b-instruct-GGUF","digest":"363f0bbc3200b9c9b0ab87efe237d77b1e05bb929d5d7e4b57c1447c911223e8","size":4942859552,"modified_at":"2025-03-17T14:48:32.417Z","details":{}}]} +``` + +To verify that the model served by Podman AI Lab is correctly connected to Llama Stack server +```bash +$ llama-stack-client models list + +Available Models + +┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┓ +┃ model_type ┃ identifier ┃ provider_resource_id ┃ metadata ┃ provider_id ┃ +┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━┩ +│ llm │ ibm-research/granite-3.2-8b-instruct-GGUF │ ibm-research/granite-3.2-8b-instruct-GGUF │ │ podman-ai-lab │ +└──────────────┴────────────────────────────────────────────────┴───────────────────────────────────────────────┴───────────┴────────────────┘ + +Total models: 1 +``` diff --git a/llama_stack/templates/podman-ai-lab/podman_ai_lab.py b/llama_stack/templates/podman-ai-lab/podman_ai_lab.py new file mode 100644 index 000000000..b3b12e03c --- /dev/null +++ b/llama_stack/templates/podman-ai-lab/podman_ai_lab.py @@ -0,0 +1,137 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from pathlib import Path + +from llama_stack.distribution.datatypes import ( + ModelInput, + Provider, + ShieldInput, + ToolGroupInput, +) +from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig +from llama_stack.providers.remote.inference.podman_ai_lab import PodmanAILabImplConfig +from llama_stack.templates.template import DistributionTemplate, RunConfigSettings + + +def get_distribution_template() -> DistributionTemplate: + providers = { + "inference": ["remote::podman-ai-lab"], + "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"], + "safety": ["inline::llama-guard"], + "agents": ["inline::meta-reference"], + "telemetry": ["inline::meta-reference"], + "eval": ["inline::meta-reference"], + "datasetio": ["remote::huggingface", "inline::localfs"], + "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], + "tool_runtime": [ + "remote::brave-search", + "remote::tavily-search", + "inline::code-interpreter", + "inline::rag-runtime", + "remote::model-context-protocol", + "remote::wolfram-alpha", + ], + } + name = "podman-ai-lab" + inference_provider = Provider( + provider_id="podman-ai-lab", + provider_type="remote::podman-ai-lab", + config=PodmanAILabImplConfig.sample_run_config(), + ) + vector_io_provider_faiss = Provider( + provider_id="faiss", + provider_type="inline::faiss", + config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"), + ) + + safety_model = ModelInput( + model_id="${env.SAFETY_MODEL}", + provider_id="ollama", + ) + default_tool_groups = [ + ToolGroupInput( + toolgroup_id="builtin::websearch", + provider_id="tavily-search", + ), + ToolGroupInput( + toolgroup_id="builtin::rag", + provider_id="rag-runtime", + ), + ToolGroupInput( + toolgroup_id="builtin::code_interpreter", + provider_id="code-interpreter", + ), + ToolGroupInput( + toolgroup_id="builtin::wolfram_alpha", + provider_id="wolfram-alpha", + ), + ] + + return 
DistributionTemplate( + name=name, + distro_type="self_hosted", + description="Use (an external) Podman AI Lab server for running LLM inference", + container_image=None, + template_path=Path(__file__).parent / "doc_template.md", + providers=providers, + run_configs={ + "run.yaml": RunConfigSettings( + provider_overrides={ + "inference": [inference_provider], + "vector_io": [vector_io_provider_faiss], + }, + default_models=[], + default_tool_groups=default_tool_groups, + ), + "run-with-safety.yaml": RunConfigSettings( + provider_overrides={ + "inference": [inference_provider], + "vector_io": [vector_io_provider_faiss], + "safety": [ + Provider( + provider_id="llama-guard", + provider_type="inline::llama-guard", + config={}, + ), + Provider( + provider_id="code-scanner", + provider_type="inline::code-scanner", + config={}, + ), + ], + }, + default_models=[ + safety_model, + ], + default_shields=[ + ShieldInput( + shield_id="${env.SAFETY_MODEL}", + provider_id="llama-guard", + ), + ShieldInput( + shield_id="CodeScanner", + provider_id="code-scanner", + ), + ], + default_tool_groups=default_tool_groups, + ), + }, + run_config_env_vars={ + "LLAMA_STACK_PORT": ( + "5001", + "Port for the Llama Stack distribution server", + ), + "PODMAN_AI_LAB_URL": ( + "http://127.0.0.1:10434", + "URL of the Podman AI Lab server", + ), + "SAFETY_MODEL": ( + "meta-llama/Llama-Guard-3-1B", + "Safety model loaded into the Ollama server", + ), + }, + ) diff --git a/llama_stack/templates/podman-ai-lab/report.md b/llama_stack/templates/podman-ai-lab/report.md new file mode 100644 index 000000000..5223f1bf8 --- /dev/null +++ b/llama_stack/templates/podman-ai-lab/report.md @@ -0,0 +1,44 @@ +# Report for Podman AI Lab distribution + +## Supported Models +| Model Descriptor | ollama | +|:---|:---| +| Llama-3-8B-Instruct | ❌ | +| Llama-3-70B-Instruct | ❌ | +| Llama3.1-8B-Instruct | ✅ | +| Llama3.1-70B-Instruct | ✅ | +| Llama3.1-405B-Instruct | ✅ | +| Llama3.2-1B-Instruct | ✅ | +| Llama3.2-3B-Instruct | ✅ | +| Llama3.2-11B-Vision-Instruct | ✅ | +| Llama3.2-90B-Vision-Instruct | ✅ | +| Llama3.3-70B-Instruct | ✅ | +| Llama-Guard-3-11B-Vision | ❌ | +| Llama-Guard-3-1B | ✅ | +| Llama-Guard-3-8B | ✅ | +| Llama-Guard-2-8B | ❌ | + +## Inference +| Model | API | Capability | Test | Status | +|:----- |:-----|:-----|:-----|:-----| +| Llama-3.1-8B-Instruct | /chat_completion | streaming | test_text_chat_completion_streaming | ✅ | +| Llama-3.2-11B-Vision-Instruct | /chat_completion | streaming | test_image_chat_completion_streaming | ❌ | +| Llama-3.2-11B-Vision-Instruct | /chat_completion | non_streaming | test_image_chat_completion_non_streaming | ❌ | +| Llama-3.1-8B-Instruct | /chat_completion | non_streaming | test_text_chat_completion_non_streaming | ✅ | +| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_streaming | ✅ | +| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_non_streaming | ✅ | +| Llama-3.1-8B-Instruct | /completion | streaming | test_text_completion_streaming | ✅ | +| Llama-3.1-8B-Instruct | /completion | non_streaming | test_text_completion_non_streaming | ✅ | +| Llama-3.1-8B-Instruct | /completion | structured_output | test_text_completion_structured_output | ✅ | + +## Vector IO +| API | Capability | Test | Status | +|:-----|:-----|:-----|:-----| +| /retrieve | | test_vector_db_retrieve | ✅ | + +## Agents +| API | Capability | Test | Status | +|:-----|:-----|:-----|:-----| +| /create_agent_turn | rag | 
test_rag_agent | ✅ | +| /create_agent_turn | custom_tool | test_custom_tool | ✅ | +| /create_agent_turn | code_execution | test_code_interpreter_for_attachments | ✅ | diff --git a/llama_stack/templates/podman-ai-lab/run-with-safety.yaml b/llama_stack/templates/podman-ai-lab/run-with-safety.yaml new file mode 100644 index 000000000..1fd68d322 --- /dev/null +++ b/llama_stack/templates/podman-ai-lab/run-with-safety.yaml @@ -0,0 +1,133 @@ +version: '2' +image_name: podman-ai-lab +apis: +- agents +- datasetio +- eval +- inference +- safety +- scoring +- telemetry +- tool_runtime +- vector_io +providers: + inference: + - provider_id: podman-ai-lab + provider_type: remote::podman-ai-lab + config: + url: ${env.PODMAN_AI_LAB_URL:http://localhost:10434} + vector_io: + - provider_id: faiss + provider_type: inline::faiss + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/podman-ai-lab}/faiss_store.db + safety: + - provider_id: llama-guard + provider_type: inline::llama-guard + config: {} + - provider_id: code-scanner + provider_type: inline::code-scanner + config: {} + agents: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + persistence_store: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/podman-ai-lab}/agents_store.db + telemetry: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + sinks: ${env.TELEMETRY_SINKS:console,sqlite} + sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/podman-ai-lab/trace_store.db} + eval: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/podman-ai-lab}/meta_reference_eval.db + datasetio: + - provider_id: huggingface + provider_type: remote::huggingface + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/podman-ai-lab}/huggingface_datasetio.db + - provider_id: localfs + provider_type: inline::localfs + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/podman-ai-lab}/localfs_datasetio.db + scoring: + - provider_id: basic + provider_type: inline::basic + config: {} + - provider_id: llm-as-judge + provider_type: inline::llm-as-judge + config: {} + - provider_id: braintrust + provider_type: inline::braintrust + config: + openai_api_key: ${env.OPENAI_API_KEY:} + tool_runtime: + - provider_id: brave-search + provider_type: remote::brave-search + config: + api_key: ${env.BRAVE_SEARCH_API_KEY:} + max_results: 3 + - provider_id: tavily-search + provider_type: remote::tavily-search + config: + api_key: ${env.TAVILY_SEARCH_API_KEY:} + max_results: 3 + - provider_id: code-interpreter + provider_type: inline::code-interpreter + config: {} + - provider_id: rag-runtime + provider_type: inline::rag-runtime + config: {} + - provider_id: model-context-protocol + provider_type: remote::model-context-protocol + config: {} + - provider_id: wolfram-alpha + provider_type: remote::wolfram-alpha + config: + api_key: ${env.WOLFRAM_ALPHA_API_KEY:} +metadata_store: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/podman-ai-lab}/registry.db +models: +- metadata: {} + model_id: ${env.SAFETY_MODEL} + provider_id: ollama + model_type: llm +shields: +- shield_id: ${env.SAFETY_MODEL} + provider_id: llama-guard +- shield_id: CodeScanner + provider_id: code-scanner 
+vector_dbs: [] +datasets: [] +scoring_fns: [] +benchmarks: [] +tool_groups: +- toolgroup_id: builtin::websearch + provider_id: tavily-search +- toolgroup_id: builtin::rag + provider_id: rag-runtime +- toolgroup_id: builtin::code_interpreter + provider_id: code-interpreter +- toolgroup_id: builtin::wolfram_alpha + provider_id: wolfram-alpha +server: + port: 8321 diff --git a/llama_stack/templates/podman-ai-lab/run.yaml b/llama_stack/templates/podman-ai-lab/run.yaml new file mode 100644 index 000000000..cf24b70bf --- /dev/null +++ b/llama_stack/templates/podman-ai-lab/run.yaml @@ -0,0 +1,123 @@ +version: '2' +image_name: podman-ai-lab +apis: +- agents +- datasetio +- eval +- inference +- safety +- scoring +- telemetry +- tool_runtime +- vector_io +providers: + inference: + - provider_id: podman-ai-lab + provider_type: remote::podman-ai-lab + config: + url: ${env.PODMAN_AI_LAB_URL:http://localhost:10434} + vector_io: + - provider_id: faiss + provider_type: inline::faiss + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/podman-ai-lab}/faiss_store.db + safety: + - provider_id: llama-guard + provider_type: inline::llama-guard + config: + excluded_categories: [] + agents: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + persistence_store: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/podman-ai-lab}/agents_store.db + telemetry: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + sinks: ${env.TELEMETRY_SINKS:console,sqlite} + sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/podman-ai-lab/trace_store.db} + eval: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/podman-ai-lab}/meta_reference_eval.db + datasetio: + - provider_id: huggingface + provider_type: remote::huggingface + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/podman-ai-lab}/huggingface_datasetio.db + - provider_id: localfs + provider_type: inline::localfs + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/podman-ai-lab}/localfs_datasetio.db + scoring: + - provider_id: basic + provider_type: inline::basic + config: {} + - provider_id: llm-as-judge + provider_type: inline::llm-as-judge + config: {} + - provider_id: braintrust + provider_type: inline::braintrust + config: + openai_api_key: ${env.OPENAI_API_KEY:} + tool_runtime: + - provider_id: brave-search + provider_type: remote::brave-search + config: + api_key: ${env.BRAVE_SEARCH_API_KEY:} + max_results: 3 + - provider_id: tavily-search + provider_type: remote::tavily-search + config: + api_key: ${env.TAVILY_SEARCH_API_KEY:} + max_results: 3 + - provider_id: code-interpreter + provider_type: inline::code-interpreter + config: {} + - provider_id: rag-runtime + provider_type: inline::rag-runtime + config: {} + - provider_id: model-context-protocol + provider_type: remote::model-context-protocol + config: {} + - provider_id: wolfram-alpha + provider_type: remote::wolfram-alpha + config: + api_key: ${env.WOLFRAM_ALPHA_API_KEY:} +metadata_store: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/podman-ai-lab}/registry.db +models: [] +shields: [] +vector_dbs: [] +datasets: [] +scoring_fns: [] +benchmarks: [] +tool_groups: +- 
toolgroup_id: builtin::websearch + provider_id: tavily-search +- toolgroup_id: builtin::rag + provider_id: rag-runtime +- toolgroup_id: builtin::code_interpreter + provider_id: code-interpreter +- toolgroup_id: builtin::wolfram_alpha + provider_id: wolfram-alpha +server: + port: 8321 diff --git a/pyproject.toml b/pyproject.toml index 7e910f673..50f4e9e03 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -259,6 +259,7 @@ exclude = [ "^llama_stack/providers/remote/inference/nvidia/", "^llama_stack/providers/remote/inference/openai/", "^llama_stack/providers/remote/inference/passthrough/", + "^llama_stack/providers/remote/inference/podman_ai_lab/", "^llama_stack/providers/remote/inference/runpod/", "^llama_stack/providers/remote/inference/sambanova/", "^llama_stack/providers/remote/inference/sample/",
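
A quick way to smoke-test the new provider config once the patch is applied is sketched below; the test module name and placement are hypothetical, and the expected values simply mirror the defaults defined in `llama_stack/providers/remote/inference/podman_ai_lab/config.py` above.

```python
# test_podman_ai_lab_config.py (hypothetical location, e.g. under tests/)
from llama_stack.providers.remote.inference.podman_ai_lab.config import (
    DEFAULT_PODMAN_AI_LAB_URL,
    PodmanAILabImplConfig,
)


def test_defaults_match_the_documented_url():
    # A bare config falls back to the default URL used throughout the docs
    assert PodmanAILabImplConfig().url == DEFAULT_PODMAN_AI_LAB_URL == "http://localhost:10434"


def test_sample_run_config_embeds_the_env_var_reference():
    # sample_run_config() produces the value substituted into the run.yaml templates
    assert PodmanAILabImplConfig.sample_run_config() == {
        "url": "${env.PODMAN_AI_LAB_URL:http://localhost:10434}"
    }
```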