feat: split API and provider specs into separate llama-stack-api pkg (#3895)

# What does this PR do?

Extract API definitions and provider specifications into a standalone
llama-stack-api package that can be published to PyPI independently of
the main llama-stack server.


See: https://github.com/llamastack/llama-stack/pull/2978 and
https://github.com/llamastack/llama-stack/pull/2978#issuecomment-3145115942

## Motivation

External providers currently import from llama-stack, which overrides
the installed version and causes dependency conflicts. This separation
allows external providers to:

- Install only the type definitions they need without server
dependencies
- Avoid version conflicts with the installed llama-stack package
- Be versioned and released independently

This enables us to re-enable external provider module tests that were
previously blocked by these import conflicts.
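
As a rough sketch of what this enables, a hypothetical external provider module can depend on the type definitions alone. The imports below mirror the ones updated in this commit; the module itself, the helper, and the specific enum members (`Api.files`, `Api.safety`, `ViolationLevel.ERROR`) are illustrative assumptions rather than code from this PR:

```python
# Hypothetical external provider helper: it needs only the API type
# definitions from llama-stack-api, not the llama-stack server package.
from llama_stack_api import Api, OpenAIFilePurpose, ViolationLevel

# APIs this (made-up) provider implements, expressed with the shared enum.
# Api.files / Api.safety are assumed member names of the Api enum.
SUPPORTED_APIS = {Api.files, Api.safety}

# Default file purpose, using the same enum value the integration tests use.
DEFAULT_PURPOSE = OpenAIFilePurpose.ASSISTANTS


def is_blocking(level: ViolationLevel) -> bool:
    """Decide whether a safety violation should block a response."""
    # ViolationLevel.ERROR is assumed here; the updated tests only show
    # that the enum itself is importable from llama_stack_api.
    return level == ViolationLevel.ERROR
```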

## Changes

- Created llama-stack-api package with minimal dependencies (pydantic,
jsonschema)
- Moved the APIs, provider datatypes, strong_typing, and schema_utils modules
- Updated all imports from llama_stack.* to llama_stack_api.* (see the sketch after this list)
- Configured local editable install for development workflow
- Updated linting and type-checking configuration for both packages
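
Concretely, a typical import update in this commit looks like the following (the symbols are taken from the test diffs further down; grouping them onto single lines is just for brevity):

```python
# Before (removed in this commit): types came from the server package,
# which drags in llama-stack's full dependency tree.
#   from llama_stack.apis.files import ExpiresAfter, OpenAIFilePurpose
#   from llama_stack.apis.safety import ViolationLevel
#   from llama_stack.apis.vector_io import Chunk

# After: the same types are imported from the standalone package root.
from llama_stack_api import Chunk, ExpiresAfter, OpenAIFilePurpose, ViolationLevel
```

Recorded test fixtures change in the same way: the serialized type path llama_stack.apis.tools.tools.ToolInvocationResult becomes llama_stack_api.tools.ToolInvocationResult, as shown in the recordings below.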

## Next Steps

- Publish llama-stack-api to PyPI
- Update external provider dependencies
- Re-enable external provider module tests


Precursor PRs to this one:

- #4093 
- #3954 
- #4064 

These PRs moved key pieces _out_ of the API package, limiting the scope of the
changes here.


Relates to #3237.

## Test Plan

The package builds successfully and can be imported independently. All
pre-commit hooks pass, with the expected exclusions maintained.
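
A minimal import smoke check along those lines might look like this (the symbols are ones the updated tests import from the package root; the check itself is just a sketch, not the actual test plan script):

```python
# Verify the standalone package imports without the llama-stack server
# being installed, and that key symbols are re-exported at the root.
import llama_stack_api
from llama_stack_api import Chunk, OpenAIFilePurpose, ToolGroupNotFoundError

print(llama_stack_api.__name__)
print(Chunk.__name__, ToolGroupNotFoundError.__name__)
print(OpenAIFilePurpose.ASSISTANTS)
```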

---------

Signed-off-by: Charlie Doern <cdoern@redhat.com>
Charlie Doern, 2025-11-13 14:51:17 -05:00, committed by GitHub
commit 840ad75fe9 (parent ceb716b9a0)
358 changed files with 2337 additions and 1424 deletions

@@ -13,8 +13,7 @@ from contextlib import contextmanager
from io import BytesIO
import pytest
from llama_stack.apis.files import OpenAIFilePurpose
from llama_stack_api import OpenAIFilePurpose
class BatchHelper:

@@ -9,8 +9,8 @@ from unittest.mock import patch
import pytest
import requests
from llama_stack_api import OpenAIFilePurpose
from llama_stack.apis.files import OpenAIFilePurpose
from llama_stack.core.datatypes import User
purpose = OpenAIFilePurpose.ASSISTANTS

@@ -15,14 +15,14 @@ that enables routing based on provider_data alone.
from unittest.mock import AsyncMock, patch
import pytest
from llama_stack.apis.datatypes import Api
from llama_stack.apis.inference.inference import (
from llama_stack_api import (
Api,
OpenAIAssistantMessageParam,
OpenAIChatCompletion,
OpenAIChatCompletionUsage,
OpenAIChoice,
)
from llama_stack.core.library_client import LlamaStackAsLibraryClient
from llama_stack.core.telemetry.telemetry import MetricEvent

@@ -9,8 +9,7 @@ import time
import uuid
import pytest
from llama_stack.apis.post_training import (
from llama_stack_api import (
DataConfig,
DatasetFormat,
DPOAlignmentConfig,
@@ -18,6 +17,7 @@ from llama_stack.apis.post_training import (
LoraFinetuningConfig,
TrainingConfig,
)
from llama_stack.log import get_logger
# Configure logging

@@ -10,7 +10,7 @@
},
"response": {
"body": {
"__type__": "llama_stack.apis.tools.tools.ToolInvocationResult",
"__type__": "llama_stack_api.tools.ToolInvocationResult",
"__data__": {
"content": "{\"query\": \"Llama 4 Maverick model experts\", \"top_k\": [{\"url\": \"https://console.groq.com/docs/model/meta-llama/llama-4-maverick-17b-128e-instruct\", \"title\": \"Llama 4 Maverick 17B 128E\", \"content\": \"Llama 4 Maverick is Meta's natively multimodal model that enables text and image understanding. With a 17 billion parameter mixture-of-experts architecture (128 experts), this model offers industry-leading performance for multimodal tasks like natural assistant-like chat, image recognition, and coding tasks. Llama 4 Maverick features an auto-regressive language model that uses a mixture-of-experts (MoE) architecture with 17B activated parameters (400B total) and incorporates early fusion for native multimodality. The model uses 128 experts to efficiently handle both text and image inputs while maintaining high performance across chat, knowledge, and code generation tasks, with a knowledge cutoff of August 2024. * For multimodal applications, this model supports up to 5 image inputs create( model =\\\"meta-llama/llama-4-maverick-17b-128e-instruct\\\", messages =[ { \\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"Explain why fast inference is critical for reasoning models\\\" } ] ) print(completion.\", \"score\": 0.9170729, \"raw_content\": null}, {\"url\": \"https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E\", \"title\": \"meta-llama/Llama-4-Maverick-17B-128E - Hugging Face\", \"content\": \"Model Architecture: The Llama 4 models are auto-regressive language models that use a mixture-of-experts (MoE) architecture and incorporate\", \"score\": 0.8021998, \"raw_content\": null}, {\"url\": \"https://www.ibm.com/new/announcements/meta-llama-4-maverick-and-llama-4-scout-now-available-in-watsonx-ai\", \"title\": \"Meta Llama 4 Maverick and Llama 4 Scout now available in watsonx ...\", \"content\": \"# Meta Llama 4 Maverick and Llama 4 Scout now available in watsonx.ai **IBM is excited to announce the addition of Meta\\u2019s latest generation of open models, Llama 4, to** **watsonx.ai****.** Llama 4 Scout and Llama 4 Maverick, the first mixture of experts (MoE) models released by Meta, provide frontier multimodal performance, high speeds, low cost, and industry leading context length. With the introduction of these latest offerings from Meta, IBM now supports a total of 13 Meta models in the expansive library of \\u00a0foundation models available in watsonx.ai. Trained on 40 trillion tokens of data, Llama 4 Scout offers performance rivalling or exceeding that of models with significantly larger active parameter counts while keeping costs and latency low. 
## Llama 4 models on IBM watsonx\", \"score\": 0.78194773, \"raw_content\": null}, {\"url\": \"https://medium.com/@divyanshbhatiajm19/metas-llama-4-family-the-complete-guide-to-scout-maverick-and-behemoth-ai-models-in-2025-21a90c882e8a\", \"title\": \"Meta's Llama 4 Family: The Complete Guide to Scout, Maverick, and ...\", \"content\": \"# Meta\\u2019s Llama 4 Family: The Complete Guide to Scout, Maverick, and Behemoth AI Models in 2025 Feature Llama 4 Scout Llama 4 Maverick Llama 4 Behemoth **Total Parameters** 109B 400B ~2T **Active Parameters** 17B 17B 288B **Expert Count** 16 128 16 **Context Window** 10M tokens 1M tokens Not specified **Hardware Requirements** Single H100 GPU Single H100 DGX host Multiple GPUs **Inference Cost** Not specified $0.19-$0.49 per 1M tokens Not specified **Release Status** Available now Available now In training **Primary Use Cases** Long-context analysis, code processing High-performance multimodal applications Research, STEM reasoning The Llama 4 family represents Meta\\u2019s most significant AI development to date, with each model offering distinct advantages for different use cases:\", \"score\": 0.69672287, \"raw_content\": null}, {\"url\": \"https://www.llama.com/models/llama-4/\", \"title\": \"Unmatched Performance and Efficiency | Llama 4\", \"content\": \"# Llama 4 # Llama 4 Llama 4 Scout Class-leading natively multimodal model that offers superior text and visual intelligence, single H100 GPU efficiency, and a 10M context window for seamless long document analysis. Llama 4 MaverickIndustry-leading natively multimodal model for image and text understanding with groundbreaking intelligence and fast responses at a low cost. We evaluated model performance on a suite of common benchmarks across a wide range of languages, testing for coding, reasoning, knowledge, vision understanding, multilinguality, and long context. 4. Specialized long context evals are not traditionally reported for generalist models, so we share internal runs to showcase llama's frontier performance. 4. Specialized long context evals are not traditionally reported for generalist models, so we share internal runs to showcase llama's frontier performance.\", \"score\": 0.629889, \"raw_content\": null}]}",
"error_message": null,

@@ -10,7 +10,7 @@
},
"response": {
"body": {
"__type__": "llama_stack.apis.tools.tools.ToolInvocationResult",
"__type__": "llama_stack_api.tools.ToolInvocationResult",
"__data__": {
"content": "{\"query\": \"Llama 4 Maverick model number of experts\", \"top_k\": [{\"url\": \"https://console.groq.com/docs/model/meta-llama/llama-4-maverick-17b-128e-instruct\", \"title\": \"Llama 4 Maverick 17B 128E\", \"content\": \"Llama 4 Maverick is Meta's natively multimodal model that enables text and image understanding. With a 17 billion parameter mixture-of-experts architecture (128 experts), this model offers industry-leading performance for multimodal tasks like natural assistant-like chat, image recognition, and coding tasks. Llama 4 Maverick features an auto-regressive language model that uses a mixture-of-experts (MoE) architecture with 17B activated parameters (400B total) and incorporates early fusion for native multimodality. The model uses 128 experts to efficiently handle both text and image inputs while maintaining high performance across chat, knowledge, and code generation tasks, with a knowledge cutoff of August 2024. * For multimodal applications, this model supports up to 5 image inputs create( model =\\\"meta-llama/llama-4-maverick-17b-128e-instruct\\\", messages =[ { \\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"Explain why fast inference is critical for reasoning models\\\" } ] ) print(completion.\", \"score\": 0.9287263, \"raw_content\": null}, {\"url\": \"https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E\", \"title\": \"meta-llama/Llama-4-Maverick-17B-128E\", \"content\": \"... model with 16 experts, and Llama 4 Maverick, a 17 billion parameter model with 128 experts. Model developer: Meta. Model Architecture: The\", \"score\": 0.9183121, \"raw_content\": null}, {\"url\": \"https://build.nvidia.com/meta/llama-4-maverick-17b-128e-instruct/modelcard\", \"title\": \"llama-4-maverick-17b-128e-instruct Model by Meta\", \"content\": \"... model with 16 experts, and Llama 4 Maverick, a 17 billion parameter model with 128 experts. Third-Party Community Consideration. This model\", \"score\": 0.91399205, \"raw_content\": null}, {\"url\": \"https://replicate.com/meta/llama-4-maverick-instruct\", \"title\": \"meta/llama-4-maverick-instruct | Run with an API on ...\", \"content\": \"... model with 16 experts, and Llama 4 Maverick, a 17 billion parameter model with 128 experts. All services are online \\u00b7 Home \\u00b7 About \\u00b7 Changelog\", \"score\": 0.9073207, \"raw_content\": null}, {\"url\": \"https://openrouter.ai/meta-llama/llama-4-maverick\", \"title\": \"Llama 4 Maverick - API, Providers, Stats\", \"content\": \"# Meta: Llama 4 Maverick ### meta-llama/llama-4-maverick Llama 4 Maverick 17B Instruct (128E) is a high-capacity multimodal language model from Meta, built on a mixture-of-experts (MoE) architecture with 128 experts and 17 billion active parameters per forward pass (400B total). Released on April 5, 2025 under the Llama 4 Community License, Maverick is suited for research and commercial applications requiring advanced multimodal understanding and high model throughput. Llama 4 Maverick - API, Providers, Stats | OpenRouter ## Providers for Llama 4 Maverick ## Performance for Llama 4 Maverick ## Apps using Llama 4 Maverick ## Recent activity on Llama 4 Maverick ## Uptime stats for Llama 4 Maverick ## Sample code and API for Llama 4 Maverick\", \"score\": 0.8958969, \"raw_content\": null}]}",
"error_message": null,

@@ -10,7 +10,7 @@
},
"response": {
"body": {
"__type__": "llama_stack.apis.tools.tools.ToolInvocationResult",
"__type__": "llama_stack_api.tools.ToolInvocationResult",
"__data__": {
"content": "{\"query\": \"latest version of Python\", \"top_k\": [{\"url\": \"https://www.liquidweb.com/blog/latest-python-version/\", \"title\": \"The latest Python version: Python 3.14 - Liquid Web\", \"content\": \"The latest major version, Python 3.14 was officially released on October 7, 2025. Let's explore the key features of Python's current version, how to download\", \"score\": 0.890761, \"raw_content\": null}, {\"url\": \"https://docs.python.org/3/whatsnew/3.14.html\", \"title\": \"What's new in Python 3.14 \\u2014 Python 3.14.0 documentation\", \"content\": \"Python 3.14 is the latest stable release of the Python programming language, with a mix of changes to the language, the implementation, and the standard\", \"score\": 0.8124067, \"raw_content\": null}, {\"url\": \"https://devguide.python.org/versions/\", \"title\": \"Status of Python versions - Python Developer's Guide\", \"content\": \"The main branch is currently the future Python 3.15, and is the only branch that accepts new features. The latest release for each Python version can be found\", \"score\": 0.80089486, \"raw_content\": null}, {\"url\": \"https://www.python.org/doc/versions/\", \"title\": \"Python documentation by version\", \"content\": \"Python 3.12.4, documentation released on 6 June 2024. Python 3.12.3, documentation released on 9 April 2024. Python 3.12.2, documentation released on 6 February\", \"score\": 0.74563974, \"raw_content\": null}, {\"url\": \"https://www.python.org/downloads/\", \"title\": \"Download Python | Python.org\", \"content\": \"Active Python Releases \\u00b7 3.15 pre-release 2026-10-07 (planned) 2031-10 PEP 790 \\u00b7 3.14 bugfix 2025-10-07 2030-10 PEP 745 \\u00b7 3.13 bugfix 2024-10-07 2029-10 PEP 719\", \"score\": 0.6551821, \"raw_content\": null}]}",
"error_message": null,

@@ -12,8 +12,8 @@ import warnings
from collections.abc import Generator
import pytest
from llama_stack_api import ViolationLevel
from llama_stack.apis.safety import ViolationLevel
from llama_stack.models.llama.sku_types import CoreModelId
# Llama Guard models available for text and vision shields

@@ -7,8 +7,7 @@ import base64
import mimetypes
import pytest
from llama_stack.apis.safety import ViolationLevel
from llama_stack_api import ViolationLevel
CODE_SCANNER_ENABLED_PROVIDERS = {"ollama", "together", "fireworks"}

@@ -9,8 +9,7 @@ import mimetypes
import os
import pytest
from llama_stack.apis.safety import ViolationLevel
from llama_stack_api import ViolationLevel
VISION_SHIELD_ENABLED_PROVIDERS = {"together"}

@@ -7,8 +7,8 @@
import re
import pytest
from llama_stack_api import ToolGroupNotFoundError
from llama_stack.apis.common.errors import ToolGroupNotFoundError
from llama_stack.core.library_client import LlamaStackAsLibraryClient
from tests.common.mcp import MCP_TOOLGROUP_ID, make_mcp_server

@@ -8,11 +8,10 @@ import time
from io import BytesIO
import pytest
from llama_stack_api import Chunk, ExpiresAfter
from llama_stack_client import BadRequestError
from openai import BadRequestError as OpenAIBadRequestError
from llama_stack.apis.files import ExpiresAfter
from llama_stack.apis.vector_io import Chunk
from llama_stack.core.library_client import LlamaStackAsLibraryClient
from llama_stack.log import get_logger
@@ -646,7 +645,7 @@ def test_openai_vector_store_attach_file(
):
"""Test OpenAI vector store attach file."""
skip_if_provider_doesnt_support_openai_vector_stores(client_with_models)
from llama_stack.apis.files import ExpiresAfter
from llama_stack_api import ExpiresAfter
compat_client = compat_client_with_empty_stores
@@ -710,7 +709,7 @@ def test_openai_vector_store_attach_files_on_creation(
skip_if_provider_doesnt_support_openai_vector_stores(client_with_models)
compat_client = compat_client_with_empty_stores
from llama_stack.apis.files import ExpiresAfter
from llama_stack_api import ExpiresAfter
# Create some files and attach them to the vector store
valid_file_ids = []
@@ -775,7 +774,7 @@ def test_openai_vector_store_list_files(
skip_if_provider_doesnt_support_openai_vector_stores(client_with_models)
compat_client = compat_client_with_empty_stores
from llama_stack.apis.files import ExpiresAfter
from llama_stack_api import ExpiresAfter
# Create a vector store
vector_store = compat_client.vector_stores.create(
@@ -867,7 +866,7 @@ def test_openai_vector_store_retrieve_file_contents(
skip_if_provider_doesnt_support_openai_vector_stores(client_with_models)
compat_client = compat_client_with_empty_stores
from llama_stack.apis.files import ExpiresAfter
from llama_stack_api import ExpiresAfter
# Create a vector store
vector_store = compat_client.vector_stores.create(
@@ -928,7 +927,7 @@ def test_openai_vector_store_delete_file(
skip_if_provider_doesnt_support_openai_vector_stores(client_with_models)
compat_client = compat_client_with_empty_stores
from llama_stack.apis.files import ExpiresAfter
from llama_stack_api import ExpiresAfter
# Create a vector store
vector_store = compat_client.vector_stores.create(
@@ -994,7 +993,7 @@ def test_openai_vector_store_delete_file_removes_from_vector_store(
skip_if_provider_doesnt_support_openai_vector_stores(client_with_models)
compat_client = compat_client_with_empty_stores
from llama_stack.apis.files import ExpiresAfter
from llama_stack_api import ExpiresAfter
# Create a vector store
vector_store = compat_client.vector_stores.create(
@@ -1046,7 +1045,7 @@ def test_openai_vector_store_update_file(
skip_if_provider_doesnt_support_openai_vector_stores(client_with_models)
compat_client = compat_client_with_empty_stores
from llama_stack.apis.files import ExpiresAfter
from llama_stack_api import ExpiresAfter
# Create a vector store
vector_store = compat_client.vector_stores.create(
@@ -1103,7 +1102,7 @@ def test_create_vector_store_files_duplicate_vector_store_name(
This test confirms that client.vector_stores.create() creates a unique ID
"""
skip_if_provider_doesnt_support_openai_vector_stores(client_with_models)
from llama_stack.apis.files import ExpiresAfter
from llama_stack_api import ExpiresAfter
compat_client = compat_client_with_empty_stores

@@ -5,8 +5,7 @@
# the root directory of this source tree.
import pytest
from llama_stack.apis.vector_io import Chunk
from llama_stack_api import Chunk
from ..conftest import vector_provider_wrapper