Merge branch 'llamastack:main' into restore_responses_unit_tests

Mike Sager 2025-11-14 11:23:26 -05:00 committed by GitHub
commit 73a114f585
404 changed files with 19023 additions and 1727 deletions


@ -53,7 +53,7 @@ jobs:
working-directory: src/llama_stack_ui
- name: Install pre-commit
run: python -m pip install pre-commit
run: python -m pip install 'pre-commit>=4.4.0'
- name: Cache pre-commit
uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4


@ -30,13 +30,16 @@ jobs:
activate-environment: true
version: 0.7.6
- name: Build Llama Stack package
run: |
uv build
- name: Build Llama Stack API package
working-directory: src/llama_stack_api
run: uv build
- name: Install Llama Stack package
- name: Build Llama Stack package
run: uv build
- name: Install Llama Stack package (with api stubs from local build)
run: |
uv pip install dist/*.whl
uv pip install --find-links src/llama_stack_api/dist dist/*.whl
- name: Verify Llama Stack package
run: |


@ -1,5 +1,5 @@
exclude: 'build/'
minimum_pre_commit_version: 4.4.0
default_language_version:
python: python3.12
node: "22"
@ -42,7 +42,7 @@ repos:
hooks:
- id: ruff
args: [ --fix ]
exclude: ^src/llama_stack/strong_typing/.*$
exclude: ^(src/llama_stack_api/strong_typing/.*)$
- id: ruff-format
- repo: https://github.com/adamchainz/blacken-docs


@ -998,6 +998,39 @@ paths:
description: List models using the OpenAI API.
parameters: []
deprecated: false
post:
responses:
'200':
description: A Model.
content:
application/json:
schema:
$ref: '#/components/schemas/Model'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Models
summary: Register model.
description: >-
Register model.
Register a model.
parameters: []
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/RegisterModelRequest'
required: true
deprecated: true
/v1/models/{model_id}:
get:
responses:
@ -1032,6 +1065,36 @@ paths:
schema:
type: string
deprecated: false
delete:
responses:
'200':
description: OK
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Models
summary: Unregister model.
description: >-
Unregister model.
Unregister a model.
parameters:
- name: model_id
in: path
description: >-
The identifier of the model to unregister.
required: true
schema:
type: string
deprecated: true
/v1/moderations:
post:
responses:
@ -1662,6 +1725,32 @@ paths:
description: List all scoring functions.
parameters: []
deprecated: false
post:
responses:
'200':
description: OK
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- ScoringFunctions
summary: Register a scoring function.
description: Register a scoring function.
parameters: []
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/RegisterScoringFunctionRequest'
required: true
deprecated: true
/v1/scoring-functions/{scoring_fn_id}:
get:
responses:
@ -1693,6 +1782,33 @@ paths:
schema:
type: string
deprecated: false
delete:
responses:
'200':
description: OK
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- ScoringFunctions
summary: Unregister a scoring function.
description: Unregister a scoring function.
parameters:
- name: scoring_fn_id
in: path
description: >-
The ID of the scoring function to unregister.
required: true
schema:
type: string
deprecated: true
/v1/scoring/score:
post:
responses:
@ -1781,6 +1897,36 @@ paths:
description: List all shields.
parameters: []
deprecated: false
post:
responses:
'200':
description: A Shield.
content:
application/json:
schema:
$ref: '#/components/schemas/Shield'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Shields
summary: Register a shield.
description: Register a shield.
parameters: []
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/RegisterShieldRequest'
required: true
deprecated: true
/v1/shields/{identifier}:
get:
responses:
@ -1812,6 +1958,33 @@ paths:
schema:
type: string
deprecated: false
delete:
responses:
'200':
description: OK
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Shields
summary: Unregister a shield.
description: Unregister a shield.
parameters:
- name: identifier
in: path
description: >-
The identifier of the shield to unregister.
required: true
schema:
type: string
deprecated: true
/v1/tool-runtime/invoke:
post:
responses:
@ -1907,6 +2080,32 @@ paths:
description: List tool groups with optional provider.
parameters: []
deprecated: false
post:
responses:
'200':
description: OK
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- ToolGroups
summary: Register a tool group.
description: Register a tool group.
parameters: []
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/RegisterToolGroupRequest'
required: true
deprecated: true
/v1/toolgroups/{toolgroup_id}:
get:
responses:
@ -1938,6 +2137,32 @@ paths:
schema:
type: string
deprecated: false
delete:
responses:
'200':
description: OK
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- ToolGroups
summary: Unregister a tool group.
description: Unregister a tool group.
parameters:
- name: toolgroup_id
in: path
description: The ID of the tool group to unregister.
required: true
schema:
type: string
deprecated: true
/v1/tools:
get:
responses:
@ -11420,6 +11645,152 @@ components:
- hyperparam_search_config
- logger_config
title: SupervisedFineTuneRequest
RegisterModelRequest:
type: object
properties:
model_id:
type: string
description: The identifier of the model to register.
provider_model_id:
type: string
description: >-
The identifier of the model in the provider.
provider_id:
type: string
description: The identifier of the provider.
metadata:
type: object
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
description: Any additional metadata for this model.
model_type:
$ref: '#/components/schemas/ModelType'
description: The type of model to register.
additionalProperties: false
required:
- model_id
title: RegisterModelRequest
ParamType:
oneOf:
- $ref: '#/components/schemas/StringType'
- $ref: '#/components/schemas/NumberType'
- $ref: '#/components/schemas/BooleanType'
- $ref: '#/components/schemas/ArrayType'
- $ref: '#/components/schemas/ObjectType'
- $ref: '#/components/schemas/JsonType'
- $ref: '#/components/schemas/UnionType'
- $ref: '#/components/schemas/ChatCompletionInputType'
- $ref: '#/components/schemas/CompletionInputType'
discriminator:
propertyName: type
mapping:
string: '#/components/schemas/StringType'
number: '#/components/schemas/NumberType'
boolean: '#/components/schemas/BooleanType'
array: '#/components/schemas/ArrayType'
object: '#/components/schemas/ObjectType'
json: '#/components/schemas/JsonType'
union: '#/components/schemas/UnionType'
chat_completion_input: '#/components/schemas/ChatCompletionInputType'
completion_input: '#/components/schemas/CompletionInputType'
RegisterScoringFunctionRequest:
type: object
properties:
scoring_fn_id:
type: string
description: >-
The ID of the scoring function to register.
description:
type: string
description: The description of the scoring function.
return_type:
$ref: '#/components/schemas/ParamType'
description: The return type of the scoring function.
provider_scoring_fn_id:
type: string
description: >-
The ID of the provider scoring function to use for the scoring function.
provider_id:
type: string
description: >-
The ID of the provider to use for the scoring function.
params:
$ref: '#/components/schemas/ScoringFnParams'
description: >-
The parameters for the scoring function for benchmark eval, these can
be overridden for app eval.
additionalProperties: false
required:
- scoring_fn_id
- description
- return_type
title: RegisterScoringFunctionRequest
RegisterShieldRequest:
type: object
properties:
shield_id:
type: string
description: >-
The identifier of the shield to register.
provider_shield_id:
type: string
description: >-
The identifier of the shield in the provider.
provider_id:
type: string
description: The identifier of the provider.
params:
type: object
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
description: The parameters of the shield.
additionalProperties: false
required:
- shield_id
title: RegisterShieldRequest
RegisterToolGroupRequest:
type: object
properties:
toolgroup_id:
type: string
description: The ID of the tool group to register.
provider_id:
type: string
description: >-
The ID of the provider to use for the tool group.
mcp_endpoint:
$ref: '#/components/schemas/URL'
description: >-
The MCP endpoint to use for the tool group.
args:
type: object
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
description: >-
A dictionary of arguments to pass to the tool group.
additionalProperties: false
required:
- toolgroup_id
- provider_id
title: RegisterToolGroupRequest
DataSource:
oneOf:
- $ref: '#/components/schemas/URIDataSource'


@ -0,0 +1,62 @@
---
title: Deprecated APIs
description: Legacy APIs that are being phased out
sidebar_label: Deprecated
sidebar_position: 1
---
# Deprecated APIs
This section contains APIs that are being phased out in favor of newer, more standardized implementations. These APIs are maintained for backward compatibility but are not recommended for new projects.
:::warning Deprecation Notice
These APIs are deprecated and will be removed in future versions. Please migrate to the recommended alternatives listed below.
:::
## Migration Guide
When using deprecated APIs, please refer to the migration guides provided for each API to understand how to transition to the supported alternatives.
## Deprecated API List
### Legacy Inference APIs
Some older inference endpoints that have been superseded by the standardized Inference API.
**Migration Path:** Use the [Inference API](../api/) instead.
### Legacy Vector Operations
Older vector database operations that have been replaced by the Vector IO API.
**Migration Path:** Use the [Vector IO API](../api/) instead.
### Legacy File Operations
Older file management endpoints that have been replaced by the Files API.
**Migration Path:** Use the [Files API](../api/) instead.
## Support Timeline
Deprecated APIs will be supported according to the following timeline:
- **Current Version**: Full support with deprecation warnings
- **Next Major Version**: Limited support with migration notices
- **Following Major Version**: Removal of deprecated APIs
## Getting Help
If you need assistance migrating from deprecated APIs:
1. Check the specific migration guides for each API
2. Review the [API Reference](../api/) for current alternatives
3. Consult the [Community Forums](https://github.com/llamastack/llama-stack/discussions) for migration support
4. Open an issue on GitHub for specific migration questions
## Contributing
If you find issues with deprecated APIs or have suggestions for improving the migration process, please contribute by:
1. Opening an issue describing the problem
2. Submitting a pull request with improvements
3. Updating migration documentation
For more information on contributing, see our [Contributing Guide](../contributing/).


@ -0,0 +1,128 @@
---
title: Experimental APIs
description: APIs in development with limited support
sidebar_label: Experimental
sidebar_position: 1
---
# Experimental APIs
This section contains APIs that are currently in development and may have limited support or stability. These APIs are available for testing and feedback but should not be used in production environments.
:::warning Experimental Notice
These APIs are experimental and may change without notice. Use with caution and provide feedback to help improve them.
:::
## Current Experimental APIs
### Batch Inference API
Run inference on a dataset of inputs in batch mode for improved efficiency.
**Status:** In Development
**Provider Support:** Limited
**Use Case:** Large-scale inference operations
**Features:**
- Batch processing of multiple inputs
- Optimized resource utilization
- Progress tracking and monitoring
### Batch Agents API
Run agentic workflows on a dataset of inputs in batch mode.
**Status:** In Development
**Provider Support:** Limited
**Use Case:** Large-scale agent operations
**Features:**
- Batch agent execution
- Parallel processing capabilities
- Result aggregation and analysis
### Synthetic Data Generation API
Generate synthetic data for model development and testing.
**Status:** Early Development
**Provider Support:** Very Limited
**Use Case:** Training data augmentation
**Features:**
- Automated data generation
- Quality control mechanisms
- Customizable generation parameters
### Batches API (OpenAI-compatible)
OpenAI-compatible batch management for inference operations.
**Status:** In Development
**Provider Support:** Limited
**Use Case:** OpenAI batch processing compatibility
**Features:**
- OpenAI batch API compatibility
- Job scheduling and management
- Status tracking and monitoring
## Getting Started with Experimental APIs
### Prerequisites
- Llama Stack server running with experimental features enabled
- Appropriate provider configurations
- Understanding of API limitations
### Configuration
Experimental APIs may require special configuration flags or provider settings. Check the specific API documentation for setup requirements.
### Usage Guidelines
1. **Testing Only**: Use experimental APIs for testing and development only
2. **Monitor Changes**: Watch for updates and breaking changes
3. **Provide Feedback**: Report issues and suggest improvements
4. **Backup Data**: Always backup important data when using experimental features
## Feedback and Contribution
We encourage feedback on experimental APIs to help improve them:
### Reporting Issues
- Use GitHub issues with the "experimental" label
- Include detailed error messages and reproduction steps
- Specify the API version and provider being used
### Feature Requests
- Submit feature requests through GitHub discussions
- Provide use cases and expected behavior
- Consider contributing implementations
### Testing
- Test experimental APIs in your environment
- Report performance issues and optimization opportunities
- Share success stories and use cases
## Migration to Stable APIs
As experimental APIs mature, they will be moved to the stable API section. When this happens:
1. **Announcement**: We'll announce the promotion in release notes
2. **Migration Guide**: Detailed migration instructions will be provided
3. **Deprecation Timeline**: Experimental versions will be deprecated with notice
4. **Support**: Full support will be available for stable versions
## Provider Support
Experimental APIs may have limited provider support. Check the specific API documentation for:
- Supported providers
- Configuration requirements
- Known limitations
- Performance characteristics
## Roadmap
Experimental APIs are part of our ongoing development roadmap:
- **Q1 2024**: Batch Inference API stabilization
- **Q2 2024**: Batch Agents API improvements
- **Q3 2024**: Synthetic Data Generation API expansion
- **Q4 2024**: Batches API full OpenAI compatibility
For the latest updates, follow our [GitHub releases](https://github.com/llamastack/llama-stack/releases) and [roadmap discussions](https://github.com/llamastack/llama-stack/discussions).


@ -0,0 +1,287 @@
---
title: OpenAI API Compatibility
description: OpenAI-compatible APIs and features in Llama Stack
sidebar_label: OpenAI Compatibility
sidebar_position: 1
---
# OpenAI API Compatibility
Llama Stack provides comprehensive OpenAI API compatibility, allowing you to use existing OpenAI API clients and tools with Llama Stack providers. This compatibility layer ensures seamless migration and interoperability.
## Overview
OpenAI API compatibility in Llama Stack includes:
- **OpenAI-compatible endpoints** for all major APIs
- **Request/response format compatibility** with OpenAI standards
- **Authentication and authorization** using OpenAI-style API keys
- **Error handling** with OpenAI-compatible error codes and messages
- **Rate limiting** and usage tracking compatible with OpenAI patterns
## Supported OpenAI APIs
### Chat Completions API
OpenAI-compatible chat completions for conversational AI applications.
**Endpoint:** `/v1/chat/completions`
**Compatibility:** Full OpenAI API compatibility
**Providers:** All inference providers
**Features:**
- Message-based conversations
- System prompts and user messages
- Function calling support
- Streaming responses
- Temperature and other parameter controls
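To illustrate the streaming feature listed above, here is a minimal sketch using the standard OpenAI client against a local Llama Stack server; the host, API key, and `llama-3.1-8b` model name are assumptions that match the examples later in this guide:
```python
import openai

client = openai.OpenAI(api_key="your-api-key", base_url="http://localhost:8000/v1")

# Stream tokens as they are generated instead of waiting for the full response
stream = client.chat.completions.create(
    model="llama-3.1-8b",
    messages=[{"role": "user", "content": "Write a haiku about llamas."}],
    stream=True,
)
for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)
```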
### Completions API
OpenAI-compatible text completions for general text generation.
**Endpoint:** `/v1/completions`
**Compatibility:** Full OpenAI API compatibility
**Providers:** All inference providers
**Features:**
- Text completion generation
- Prompt engineering support
- Customizable parameters
- Batch processing capabilities
### Embeddings API
OpenAI-compatible embeddings for vector operations.
**Endpoint:** `/v1/embeddings`
**Compatibility:** Full OpenAI API compatibility
**Providers:** All embedding providers
**Features:**
- Text embedding generation
- Multiple embedding models
- Batch embedding processing
- Vector similarity operations
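As a brief sketch of the embeddings endpoint (the `all-MiniLM-L6-v2` model name is borrowed from elsewhere in these docs; use whichever embedding model your providers expose):
```python
import openai

client = openai.OpenAI(api_key="your-api-key", base_url="http://localhost:8000/v1")

# Generate an embedding vector for a piece of text
result = client.embeddings.create(
    model="all-MiniLM-L6-v2",
    input="Llama Stack provides OpenAI-compatible embeddings.",
)
print(len(result.data[0].embedding))  # dimensionality of the returned vector
```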
### Files API
OpenAI-compatible file management for document processing.
**Endpoint:** `/v1/files`
**Compatibility:** Full OpenAI API compatibility
**Providers:** Local Filesystem, S3
**Features:**
- File upload and management
- Document processing
- File metadata tracking
- Secure file access
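As a quick sketch, uploading a file through the OpenAI client might look like this; the `purpose` value depends on your providers, and `"batch"` is only an example:
```python
import openai

client = openai.OpenAI(api_key="your-api-key", base_url="http://localhost:8000/v1")

# Upload a local file through the OpenAI-compatible Files endpoint
with open("data.jsonl", "rb") as f:
    uploaded = client.files.create(file=f, purpose="batch")
print(uploaded.id)
```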
### Vector Store Files API
OpenAI-compatible vector store file operations for RAG applications.
**Endpoint:** `/v1/vector_stores/{vector_store_id}/files`
**Compatibility:** Full OpenAI API compatibility
**Providers:** FAISS, SQLite-vec, Milvus, ChromaDB, Qdrant, Weaviate, Postgres (PGVector)
**Features:**
- Automatic document processing
- Vector store integration
- File chunking and indexing
- Search and retrieval operations
### Batches API
OpenAI-compatible batch processing for large-scale operations.
**Endpoint:** `/v1/batches`
**Compatibility:** OpenAI API compatibility (experimental)
**Providers:** Limited support
**Features:**
- Batch job creation and management
- Progress tracking
- Result retrieval
- Error handling
## Migration from OpenAI
### Step 1: Update API Endpoint
Change your API endpoint from OpenAI to your Llama Stack server:
```python
# Before (OpenAI)
import openai
client = openai.OpenAI(api_key="your-openai-key")
# After (Llama Stack)
import openai
client = openai.OpenAI(
api_key="your-llama-stack-key",
base_url="http://localhost:8000/v1" # Your Llama Stack server
)
```
### Step 2: Configure Providers
Set up your preferred providers in the Llama Stack configuration:
```yaml
# stack-config.yaml
inference:
providers:
- name: "meta-reference"
type: "inline"
model: "llama-3.1-8b"
```
### Step 3: Test Compatibility
Verify that your existing code works with Llama Stack:
```python
# Test chat completions
response = client.chat.completions.create(
model="llama-3.1-8b",
messages=[
{"role": "user", "content": "Hello, world!"}
]
)
print(response.choices[0].message.content)
```
## Provider-Specific Features
### Meta Reference Provider
- Full OpenAI API compatibility
- Local model execution
- Custom model support
### Remote Providers
- OpenAI API compatibility
- Cloud-based execution
- Scalable infrastructure
### Vector Store Providers
- OpenAI vector store API compatibility
- Automatic document processing
- Advanced search capabilities
## Authentication
Llama Stack supports OpenAI-style authentication:
### API Key Authentication
```python
client = openai.OpenAI(
api_key="your-api-key",
base_url="http://localhost:8000/v1"
)
```
### Environment Variables
```bash
export OPENAI_API_KEY="your-api-key"
export OPENAI_BASE_URL="http://localhost:8000/v1"
```
## Error Handling
Llama Stack provides OpenAI-compatible error responses:
```python
try:
response = client.chat.completions.create(...)
except openai.APIError as e:
print(f"API Error: {e}")
except openai.RateLimitError as e:
print(f"Rate Limit Error: {e}")
except openai.APIConnectionError as e:
print(f"Connection Error: {e}")
```
## Rate Limiting
OpenAI-compatible rate limiting is supported:
- **Requests per minute** limits
- **Tokens per minute** limits
- **Concurrent request** limits
- **Usage tracking** and monitoring
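When a limit is hit, the client raises `openai.RateLimitError` (see the error handling section above). A minimal retry-with-backoff sketch might look like this:
```python
import time

import openai

client = openai.OpenAI(api_key="your-api-key", base_url="http://localhost:8000/v1")


def chat_with_retry(messages, model="llama-3.1-8b", max_retries=3):
    # Retry rate-limited requests with exponential backoff (1s, 2s, 4s, ...)
    for attempt in range(max_retries):
        try:
            return client.chat.completions.create(model=model, messages=messages)
        except openai.RateLimitError:
            if attempt == max_retries - 1:
                raise
            time.sleep(2**attempt)
```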
## Monitoring and Observability
Track your API usage with OpenAI-compatible monitoring:
- **Request/response logging**
- **Usage metrics** and analytics
- **Performance monitoring**
- **Error tracking** and alerting
## Best Practices
### 1. Provider Selection
Choose providers based on your requirements:
- **Local development**: Meta Reference, Ollama
- **Production**: Cloud providers (Fireworks, Together, NVIDIA)
- **Specialized use cases**: Custom providers
### 2. Model Configuration
Configure models for optimal performance:
- **Model selection** based on task requirements
- **Parameter tuning** for specific use cases
- **Resource allocation** for performance
### 3. Error Handling
Implement robust error handling:
- **Retry logic** for transient failures
- **Fallback providers** for high availability
- **Monitoring** and alerting for issues
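To make the fallback-provider point above concrete, here is a minimal sketch; the two server URLs are hypothetical, and any second Llama Stack endpoint or OpenAI-compatible provider would work the same way:
```python
import openai

primary = openai.OpenAI(api_key="key-a", base_url="http://primary-stack:8000/v1")
fallback = openai.OpenAI(api_key="key-b", base_url="http://fallback-stack:8000/v1")


def chat(messages, model="llama-3.1-8b"):
    # Try the primary server first; fall back on connection failures
    try:
        return primary.chat.completions.create(model=model, messages=messages)
    except openai.APIConnectionError:
        return fallback.chat.completions.create(model=model, messages=messages)
```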
### 4. Security
Follow security best practices:
- **API key management** and rotation
- **Access control** and authorization
- **Data privacy** and compliance
## Implementation Examples
For detailed code examples and implementation guides, see our [OpenAI Implementation Guide](../providers/openai.mdx).
## Known Limitations
### Responses API Limitations
The Responses API is still in active development. For detailed information about current limitations and implementation status, see our [OpenAI Responses API Limitations](../providers/openai_responses_limitations.mdx).
## Troubleshooting
### Common Issues
**Connection Errors**
- Verify server is running
- Check network connectivity
- Validate API endpoint URL
**Authentication Errors**
- Verify API key is correct
- Check key permissions
- Ensure proper authentication headers
**Model Errors**
- Verify model is available
- Check provider configuration
- Validate model parameters
### Getting Help
For OpenAI compatibility issues:
1. **Check Documentation**: Review provider-specific documentation
2. **Community Support**: Ask questions in GitHub discussions
3. **Issue Reporting**: Open GitHub issues for bugs
4. **Professional Support**: Contact support for enterprise issues
## Roadmap
Upcoming OpenAI compatibility features:
- **Enhanced batch processing** support
- **Advanced function calling** capabilities
- **Improved error handling** and diagnostics
- **Performance optimizations** for large-scale deployments
For the latest updates, follow our [GitHub releases](https://github.com/llamastack/llama-stack/releases) and [roadmap discussions](https://github.com/llamastack/llama-stack/discussions).

docs/docs/api/index.mdx

@ -0,0 +1,144 @@
---
title: API Reference
description: Complete reference for Llama Stack APIs
sidebar_label: Overview
sidebar_position: 1
---
# API Reference
Llama Stack provides a comprehensive set of APIs for building generative AI applications. All APIs follow OpenAI-compatible standards and can be used interchangeably across different providers.
## Core APIs
### Inference API
Run inference with Large Language Models (LLMs) and embedding models.
**Supported Providers:**
- Meta Reference (Single Node)
- Ollama (Single Node)
- Fireworks (Hosted)
- Together (Hosted)
- NVIDIA NIM (Hosted and Single Node)
- vLLM (Hosted and Single Node)
- TGI (Hosted and Single Node)
- AWS Bedrock (Hosted)
- Cerebras (Hosted)
- Groq (Hosted)
- SambaNova (Hosted)
- PyTorch ExecuTorch (On-device iOS, Android)
- OpenAI (Hosted)
- Anthropic (Hosted)
- Gemini (Hosted)
- WatsonX (Hosted)
### Agents API
Run multi-step agentic workflows with LLMs, including tool usage, memory (RAG), and complex reasoning.
**Supported Providers:**
- Meta Reference (Single Node)
- Fireworks (Hosted)
- Together (Hosted)
- PyTorch ExecuTorch (On-device iOS)
### Vector IO API
Perform operations on vector stores, including adding documents, searching, and deleting documents.
**Supported Providers:**
- FAISS (Single Node)
- SQLite-Vec (Single Node)
- Chroma (Hosted and Single Node)
- Milvus (Hosted and Single Node)
- Postgres (PGVector) (Hosted and Single Node)
- Weaviate (Hosted)
- Qdrant (Hosted and Single Node)
### Files API (OpenAI-compatible)
Manage file uploads, storage, and retrieval with OpenAI-compatible endpoints.
**Supported Providers:**
- Local Filesystem (Single Node)
- S3 (Hosted)
### Vector Store Files API (OpenAI-compatible)
Integrate file operations with vector stores for automatic document processing and search.
**Supported Providers:**
- FAISS (Single Node)
- SQLite-vec (Single Node)
- Milvus (Single Node)
- ChromaDB (Hosted and Single Node)
- Qdrant (Hosted and Single Node)
- Weaviate (Hosted)
- Postgres (PGVector) (Hosted and Single Node)
### Safety API
Apply safety policies to outputs at a systems level, not just model level.
**Supported Providers:**
- Llama Guard (Depends on Inference Provider)
- Prompt Guard (Single Node)
- Code Scanner (Single Node)
- AWS Bedrock (Hosted)
### Post Training API
Fine-tune models for specific use cases and domains.
**Supported Providers:**
- Meta Reference (Single Node)
- HuggingFace (Single Node)
- TorchTune (Single Node)
- NVIDIA NEMO (Hosted)
### Eval API
Generate outputs and perform scoring to evaluate system performance.
**Supported Providers:**
- Meta Reference (Single Node)
- NVIDIA NEMO (Hosted)
### Telemetry API
Collect telemetry data from the system for monitoring and observability.
**Supported Providers:**
- Meta Reference (Single Node)
### Tool Runtime API
Interact with various tools and protocols to extend LLM capabilities.
**Supported Providers:**
- Brave Search (Hosted)
- RAG Runtime (Single Node)
## API Compatibility
All Llama Stack APIs are designed to be OpenAI-compatible, allowing you to:
- Use existing OpenAI API clients and tools
- Migrate from OpenAI to other providers seamlessly
- Maintain consistent API contracts across different environments
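For example, an existing OpenAI client can be pointed at a Llama Stack server by changing only the base URL (a minimal sketch; the port and model name depend on your configuration):
```python
import openai

client = openai.OpenAI(api_key="your-api-key", base_url="http://localhost:8000/v1")

response = client.chat.completions.create(
    model="llama-3.1-8b",
    messages=[{"role": "user", "content": "Hello from Llama Stack!"}],
)
print(response.choices[0].message.content)
```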
## Getting Started
To get started with Llama Stack APIs:
1. **Choose a Distribution**: Select a pre-configured distribution that matches your environment
2. **Configure Providers**: Set up the providers you want to use for each API
3. **Start the Server**: Launch the Llama Stack server with your configuration
4. **Use the APIs**: Make requests to the API endpoints using your preferred client
For detailed setup instructions, see our [Getting Started Guide](../getting_started/quickstart).
## Provider Details
For complete provider compatibility and setup instructions, see our [Providers Documentation](../providers/).
## API Stability
Llama Stack APIs are organized by stability level:
- **[Stable APIs](./index.mdx)** - Production-ready APIs with full support
- **[Experimental APIs](../api-experimental/)** - APIs in development with limited support
- **[Deprecated APIs](../api-deprecated/)** - Legacy APIs being phased out
## OpenAI Integration
For specific OpenAI API compatibility features, see our [OpenAI Compatibility Guide](../api-openai/).


@ -0,0 +1,87 @@
---
title: Admin UI & Chat Playground
description: Web-based admin interface and chat playground for Llama Stack
sidebar_label: Playground
sidebar_position: 10
---
# Admin UI & Chat Playground
The Llama Stack UI provides a comprehensive web-based admin interface for managing your Llama Stack server, with an integrated chat playground for interactive testing. This admin interface is the primary way to monitor, manage, and debug your Llama Stack applications.
## Quick Start
Launch the admin UI with:
```bash
npx llama-stack-ui
```
Then visit `http://localhost:8322` to access the interface.
## Admin Interface Features
The Llama Stack UI is organized into three main sections:
### 🎯 Create
**Chat Playground** - Interactive testing environment
- Real-time chat interface for testing agents and models
- Multi-turn conversations with tool calling support
- Agent SDK integration (will be migrated to Responses API)
- Custom system prompts and model parameter adjustment
### 📊 Manage
**Logs & Resource Management** - Monitor and manage your stack
- **Responses Logs**: View and analyze agent responses and interactions
- **Chat Completions Logs**: Monitor chat completion requests and responses
- **Vector Stores**: Create, manage, and monitor vector databases for RAG workflows
- **Prompts**: Full CRUD operations for prompt templates and management
- **Files**: Forthcoming file management capabilities
## Key Capabilities for Application Development
### Real-time Monitoring
- **Response Tracking**: Monitor all agent responses and tool calls
- **Completion Analysis**: View chat completion performance and patterns
- **Vector Store Activity**: Track RAG operations and document processing
- **Prompt Usage**: Analyze prompt template performance
### Resource Management
- **Vector Store CRUD**: Create, update, and delete vector databases
- **Prompt Library**: Organize and version control your prompts
- **File Operations**: Manage documents and assets (forthcoming)
### Interactive Testing
- **Chat Playground**: Test conversational flows before production deployment
- **Agent Prototyping**: Validate agent behaviors and tool integrations
## Development Workflow Integration
The admin UI supports your development lifecycle:
1. **Development**: Use chat playground to prototype and test features
2. **Monitoring**: Track system performance through logs and metrics
3. **Management**: Organize prompts, vector stores, and other resources
4. **Debugging**: Analyze logs to identify and resolve issues
## Architecture Notes
- **Current**: Chat playground uses Agents SDK
- **Future**: Migration to Responses API for improved performance and consistency
- **Admin Focus**: Primary emphasis on monitoring, logging, and resource management
## Getting Started
1. **Launch the UI**: Run `npx llama-stack-ui`
2. **Explore Logs**: Start with Responses and Chat Completions logs to understand your system activity
3. **Test in Playground**: Use the chat interface to validate your agent configurations
4. **Manage Resources**: Create vector stores and organize prompts through the UI
For detailed setup and configuration, see the [Llama Stack UI documentation](/docs/distributions/llama_stack_ui).
## Next Steps
- Set up your [first agent](/docs/building_applications/agent)
- Implement [RAG functionality](/docs/building_applications/rag)
- Add [evaluation metrics](/docs/building_applications/evals)
- Configure [safety measures](/docs/building_applications/safety)


@ -58,7 +58,7 @@ External APIs must expose a `available_providers()` function in their module tha
```python
# llama_stack_api_weather/api.py
from llama_stack.providers.datatypes import Api, InlineProviderSpec, ProviderSpec
from llama_stack_api import Api, InlineProviderSpec, ProviderSpec
def available_providers() -> list[ProviderSpec]:
@ -79,7 +79,7 @@ A Protocol class like so:
# llama_stack_api_weather/api.py
from typing import Protocol
from llama_stack.schema_utils import webmethod
from llama_stack_api import webmethod
class WeatherAPI(Protocol):
@ -151,13 +151,12 @@ __all__ = ["WeatherAPI", "available_providers"]
# llama-stack-api-weather/src/llama_stack_api_weather/weather.py
from typing import Protocol
from llama_stack.providers.datatypes import (
from llama_stack_api import (
Api,
ProviderSpec,
RemoteProviderSpec,
webmethod,
)
from llama_stack.schema_utils import webmethod
def available_providers() -> list[ProviderSpec]:
return [


@ -7,7 +7,7 @@ sidebar_position: 1
# APIs
A Llama Stack API is described as a collection of REST endpoints. We currently support the following APIs:
A Llama Stack API is described as a collection of REST endpoints following OpenAI API standards. We currently support the following APIs:
- **Inference**: run inference with a LLM
- **Safety**: apply safety policies to the output at a Systems (not only model) level
@ -16,11 +16,26 @@ A Llama Stack API is described as a collection of REST endpoints. We currently s
- **Scoring**: evaluate outputs of the system
- **Eval**: generate outputs (via Inference or Agents) and perform scoring
- **VectorIO**: perform operations on vector stores, such as adding documents, searching, and deleting documents
- **Files**: manage file uploads, storage, and retrieval
- **Telemetry**: collect telemetry data from the system
- **Post Training**: fine-tune a model
- **Tool Runtime**: interact with various tools and protocols
- **Responses**: generate responses from an LLM using this OpenAI compatible API.
- **Responses**: generate responses from an LLM
We are working on adding a few more APIs to complete the application lifecycle. These will include:
- **Batch Inference**: run inference on a dataset of inputs
- **Batch Agents**: run agents on a dataset of inputs
- **Batches**: OpenAI-compatible batch management for inference
## OpenAI API Compatibility
We are working on adding OpenAI API compatibility to Llama Stack. This will allow you to use Llama Stack with OpenAI API clients and tools.
### File Operations and Vector Store Integration
The Files API and Vector Store APIs work together through file operations, enabling automatic document processing and search. This integration implements the [OpenAI Vector Store Files API specification](https://platform.openai.com/docs/api-reference/vector-stores-files) and allows you to:
- Upload documents through the Files API
- Automatically process and chunk documents into searchable vectors
- Store processed content in vector databases based on the availability of [our providers](../../providers/index.mdx)
- Search through documents using natural language queries
For detailed information about this integration, see [File Operations and Vector Store Integration](../file_operations_vector_stores.md).


@ -0,0 +1,420 @@
# File Operations and Vector Store Integration
## Overview
Llama Stack provides seamless integration between the Files API and Vector Store APIs, enabling you to upload documents and automatically process them into searchable vector embeddings. This integration implements file operations following the [OpenAI Vector Store Files API specification](https://platform.openai.com/docs/api-reference/vector-stores-files).
## Enhanced Capabilities Beyond OpenAI
While Llama Stack maintains full compatibility with OpenAI's Vector Store API, it provides several additional capabilities that enhance functionality and flexibility:
### **Embedding Model Specification**
Unlike OpenAI's vector stores which use a fixed embedding model, Llama Stack allows you to specify which embedding model to use when creating a vector store:
```python
# Create vector store with specific embedding model
vector_store = client.vector_stores.create(
name="my_documents",
embedding_model="all-MiniLM-L6-v2", # Specify your preferred model
embedding_dimension=384,
)
```
### **Advanced Search Modes**
Llama Stack supports multiple search modes beyond basic vector similarity:
- **Vector Search**: Pure semantic similarity search using embeddings
- **Keyword Search**: Traditional keyword-based search for exact matches
- **Hybrid Search**: Combines both vector and keyword search for optimal results
```python
# Different search modes
results = await client.vector_stores.search(
vector_store_id=vector_store.id,
query="machine learning algorithms",
search_mode="hybrid", # or "vector", "keyword"
max_num_results=5,
)
```
### **Flexible Ranking Options**
For hybrid search, Llama Stack offers configurable ranking strategies:
- **RRF (Reciprocal Rank Fusion)**: Combines rankings with configurable impact factor
- **Weighted Ranker**: Linear combination of vector and keyword scores with adjustable weights
```python
# Custom ranking configuration
results = await client.vector_stores.search(
vector_store_id=vector_store.id,
query="neural networks",
search_mode="hybrid",
ranking_options={
"ranker": {"type": "weighted", "alpha": 0.7} # 70% vector, 30% keyword
},
)
```
### **Provider Selection**
Choose from multiple vector store providers based on your specific needs:
- **Inline Providers**: FAISS (fast in-memory), SQLite-vec (disk-based), Milvus (high-performance)
- **Remote Providers**: ChromaDB, Qdrant, Weaviate, Postgres (PGVector), Milvus
```python
# Specify provider when creating vector store
vector_store = client.vector_stores.create(
name="my_documents", provider_id="sqlite-vec" # Choose your preferred provider
)
```
## How It Works
The file operations work through several key components:
1. **File Upload**: Documents are uploaded through the Files API
2. **Automatic Processing**: Files are automatically chunked and converted to embeddings
3. **Vector Storage**: Chunks are stored in vector databases with metadata
4. **Search & Retrieval**: Users can search through processed documents using natural language
## Supported Vector Store Providers
The following vector store providers support file operations:
### Inline Providers (Single Node)
- **FAISS**: Fast in-memory vector similarity search
- **SQLite-vec**: Disk-based storage with hybrid search capabilities
### Remote Providers (Hosted)
- **ChromaDB**: Vector database with metadata filtering
- **Weaviate**: Vector database with GraphQL interface
- **Postgres (PGVector)**: Vector extensions for PostgreSQL
### Both Inline & Remote Providers
- **Milvus**: High-performance vector database with advanced indexing
- **Qdrant**: Vector similarity search with payload filtering
## File Processing Pipeline
### 1. File Upload
```python
from llama_stack_client import LlamaStackClient
client = LlamaStackClient(base_url="http://localhost:8000")
# Upload a document
with open("document.pdf", "rb") as f:
file_info = await client.files.upload(file=f, purpose="assistants")
```
### 2. Attach to Vector Store
```python
# Create a vector store
vector_store = client.vector_stores.create(name="my_documents")
# Attach the file to the vector store
file_attach_response = await client.vector_stores.files.create(
vector_store_id=vector_store.id, file_id=file_info.id
)
```
### 3. Automatic Processing
The system automatically:
- Detects the file type and extracts text content
- Splits content into chunks (default: 800 tokens with 400 token overlap)
- Generates embeddings for each chunk
- Stores chunks with metadata in the vector store
- Updates file status to "completed"
### 4. Search and Retrieval
```python
# Search through processed documents
search_results = await client.vector_stores.search(
vector_store_id=vector_store.id,
query="What is the main topic discussed?",
max_num_results=5,
)
# Process results
for result in search_results.data:
print(f"Score: {result.score}")
for content in result.content:
print(f"Content: {content.text}")
```
## Supported File Types
The file processing pipeline supports various document formats:
- **Text Files**: `.txt`, `.md`, `.rst`
- **Documents**: `.pdf`, `.docx`, `.doc`
- **Code**: `.py`, `.js`, `.java`, `.cpp`, etc.
- **Data**: `.json`, `.csv`, `.xml`
- **Web Content**: HTML files
## Chunking Strategies
### Default Strategy
The default chunking strategy uses:
- **Max Chunk Size**: 800 tokens
- **Overlap**: 400 tokens
- **Method**: Semantic boundary detection
### Custom Chunking
You can customize chunking when attaching files:
```python
# Define a custom chunking strategy (the dict shape follows the OpenAI-compatible
# static chunking format; the 600/200 values are illustrative, and a typed
# VectorStoreChunkingStrategy object can be used instead where available)
chunking_strategy = {
    "type": "static",
    "static": {"max_chunk_size_tokens": 600, "chunk_overlap_tokens": 200},
}

# Attach file with custom chunking
file_attach_response = await client.vector_stores.files.create(
    vector_store_id=vector_store.id,
    file_id=file_info.id,
    chunking_strategy=chunking_strategy,
)
```
**Note**: While Llama Stack is OpenAI-compatible, it also supports additional options beyond the standard OpenAI API. When creating vector stores, you can specify custom embedding models and embedding dimensions that will be used when processing chunks from attached files.
## File Management
### List Files in Vector Store
```python
# List all files in a vector store
files = await client.vector_stores.files.list(vector_store_id=vector_store.id)
for file in files:
print(f"File: {file.filename}, Status: {file.status}")
```
### File Status Tracking
Files go through several statuses:
- **in_progress**: File is being processed
- **completed**: File successfully processed and searchable
- **failed**: Processing failed (check `last_error` for details)
- **cancelled**: Processing was cancelled
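A simple way to wait for processing is to poll the file's status. This is a minimal sketch that assumes a `retrieve` method on `client.vector_stores.files` mirroring the OpenAI client:
```python
import asyncio


async def wait_for_processing(vector_store_id, file_id, poll_seconds=2):
    # `client` is the LlamaStackClient created earlier in this guide.
    # Poll until the file leaves the "in_progress" state.
    while True:
        vs_file = await client.vector_stores.files.retrieve(
            vector_store_id=vector_store_id, file_id=file_id
        )
        if vs_file.status != "in_progress":
            return vs_file  # "completed", "failed", or "cancelled"
        await asyncio.sleep(poll_seconds)
```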
### Retrieve File Content
```python
# Get chunked content from vector store
content_response = await client.vector_stores.files.retrieve_content(
vector_store_id=vector_store.id, file_id=file_info.id
)
for chunk in content_response.content:
print(f"Chunk {chunk.metadata.get('chunk_index', 0)}: {chunk.text}")
```
## Vector Store Management
### List Vector Stores
Retrieve a paginated list of all vector stores:
```python
# List all vector stores with default pagination
vector_stores = await client.vector_stores.list()
# Custom pagination and ordering
vector_stores = await client.vector_stores.list(
limit=10,
order="asc", # or "desc"
after="vs_12345678", # cursor-based pagination
)
for store in vector_stores.data:
print(f"Store: {store.name}, Files: {store.file_counts.total}")
print(f"Created: {store.created_at}, Status: {store.status}")
```
### Retrieve Vector Store Details
Get detailed information about a specific vector store:
```python
# Get vector store details
store_details = await client.vector_stores.retrieve(vector_store_id="vs_12345678")
print(f"Name: {store_details.name}")
print(f"Status: {store_details.status}")
print(f"File Counts: {store_details.file_counts}")
print(f"Usage: {store_details.usage_bytes} bytes")
print(f"Created: {store_details.created_at}")
print(f"Metadata: {store_details.metadata}")
```
### Update Vector Store
Modify vector store properties such as name, metadata, or expiration settings:
```python
# Update vector store name and metadata
updated_store = await client.vector_stores.update(
vector_store_id="vs_12345678",
name="Updated Document Collection",
metadata={
"description": "Updated collection for research",
"category": "research",
"version": "2.0",
},
)
# Set expiration policy
expired_store = await client.vector_stores.update(
vector_store_id="vs_12345678",
expires_after={"anchor": "last_active_at", "days": 30},
)
print(f"Updated store: {updated_store.name}")
print(f"Last active: {updated_store.last_active_at}")
```
### Delete Vector Store
Remove a vector store and all its associated data:
```python
# Delete a vector store
delete_response = await client.vector_stores.delete(vector_store_id="vs_12345678")
if delete_response.deleted:
print(f"Vector store {delete_response.id} successfully deleted")
else:
print("Failed to delete vector store")
```
**Important Notes:**
- Deleting a vector store removes all files, chunks, and embeddings
- This operation cannot be undone
- The underlying vector database is also cleaned up
- Consider backing up important data before deletion
## Search Capabilities
### Vector Search
Pure similarity search using embeddings:
```python
results = await client.vector_stores.search(
vector_store_id=vector_store.id,
query="machine learning algorithms",
max_num_results=10,
)
```
### Filtered Search
Combine vector search with metadata filtering:
```python
results = await client.vector_stores.search(
vector_store_id=vector_store.id,
query="machine learning algorithms",
filters={"file_type": "pdf", "upload_date": "2024-01-01"},
max_num_results=10,
)
```
### Hybrid Search
[SQLite-vec](../providers/vector_io/inline_sqlite-vec.mdx), [pgvector](../providers/vector_io/remote_pgvector.mdx), and [Milvus](../providers/vector_io/inline_milvus.mdx) support combining vector and keyword search.
## Performance Considerations
> **Note**: For detailed performance optimization strategies, see [Performance Considerations](../providers/files/openai_file_operations_support.md#performance-considerations) in the provider documentation.
**Key Points:**
- **Chunk Size**: 400-600 tokens for precision, 800-1200 for context
- **Storage**: Choose provider based on your performance needs
- **Search**: Optimize for your specific use case
## Error Handling
> **Note**: For comprehensive troubleshooting and error handling, see [Troubleshooting](../providers/files/openai_file_operations_support.md#troubleshooting) in the provider documentation.
**Common Issues:**
- File processing failures (format, size limits)
- Search performance optimization
- Storage and memory issues
## Best Practices
> **Note**: For detailed best practices and recommendations, see [Best Practices](../providers/files/openai_file_operations_support.md#best-practices) in the provider documentation.
**Key Recommendations:**
- File organization and naming conventions
- Chunking strategy optimization
- Metadata and monitoring practices
- Regular cleanup and maintenance
## Integration Examples
### RAG Application
```python
# Build a RAG system with file uploads
async def build_rag_system():
# Create vector store
vector_store = client.vector_stores.create(name="knowledge_base")
# Upload and process documents
documents = ["doc1.pdf", "doc2.pdf", "doc3.pdf"]
for doc in documents:
with open(doc, "rb") as f:
file_info = await client.files.create(file=f, purpose="assistants")
await client.vector_stores.files.create(
vector_store_id=vector_store.id, file_id=file_info.id
)
return vector_store
# Query the RAG system
async def query_rag(vector_store_id, question):
results = await client.vector_stores.search(
vector_store_id=vector_store_id, query=question, max_num_results=5
)
return results
```
### Document Analysis
```python
# Analyze document content through vector search
async def analyze_document(vector_store_id, file_id):
# Get document content
content = await client.vector_stores.files.retrieve_content(
vector_store_id=vector_store_id, file_id=file_id
)
# Search for specific topics
topics = ["introduction", "methodology", "conclusion"]
analysis = {}
for topic in topics:
results = await client.vector_stores.search(
vector_store_id=vector_store_id, query=topic, max_num_results=3
)
analysis[topic] = results.data
return analysis
```
## Next Steps
- Explore the [Files API documentation](../../providers/files/files.mdx) for detailed API reference
- Check [Vector Store Providers](../providers/vector_io/index.mdx) for specific implementation details
- Review [Getting Started](../getting_started/quickstart.mdx) for quick setup instructions


@ -65,7 +65,7 @@ external_providers_dir: /workspace/providers.d
Inside `providers.d/custom_ollama/provider.py`, define `get_provider_spec()` so the CLI can discover dependencies:
```python
from llama_stack.providers.datatypes import ProviderSpec
from llama_stack_api.providers.datatypes import ProviderSpec
def get_provider_spec() -> ProviderSpec:


@ -80,7 +80,7 @@ container_image: custom-vector-store:latest # optional
All providers must contain a `get_provider_spec` function in their `provider` module. This is a standardized structure that Llama Stack expects and is necessary for getting things such as the config class. The `get_provider_spec` method returns a structure identical to the `adapter`. An example function may look like:
```python
from llama_stack.providers.datatypes import (
from llama_stack_api.providers.datatypes import (
ProviderSpec,
Api,
RemoteProviderSpec,


@ -0,0 +1,290 @@
---
sidebar_label: Files
title: Files
---
## Overview
The Files API provides file management capabilities for Llama Stack. It allows you to upload, store, retrieve, and manage files that can be used across various endpoints in your application.
## Features
- **File Upload**: Upload files with metadata and purpose classification
- **File Management**: List, retrieve, and delete files
- **Content Retrieval**: Access raw file content for processing
- **API Compatibility**: Full compatibility with OpenAI Files API endpoints
- **Flexible Storage**: Support for local filesystem and cloud storage backends
## API Endpoints
### Upload File
**POST** `/v1/openai/v1/files`
Upload a file that can be used across various endpoints.
**Request Body:**
- `file`: The file object to be uploaded (multipart form data)
- `purpose`: The intended purpose of the uploaded file
**Supported Purposes:**
- `batch`: Files for batch operations
**Response:**
```json
{
"id": "file-abc123",
"object": "file",
"bytes": 140,
"created_at": 1613779121,
"filename": "mydata.jsonl",
"purpose": "batch"
}
```
**Example:**
```python
import requests
with open("data.jsonl", "rb") as f:
files = {"file": f}
data = {"purpose": "batch"}
response = requests.post(
"http://localhost:8000/v1/openai/v1/files", files=files, data=data
)
file_info = response.json()
```
### List Files
**GET** `/v1/openai/v1/files`
Returns a list of files that belong to the user's organization.
**Query Parameters:**
- `after` (optional): A cursor for pagination
- `limit` (optional): Limit on number of objects (1-10,000, default: 10,000)
- `order` (optional): Sort order by created_at timestamp (`asc` or `desc`, default: `desc`)
- `purpose` (optional): Filter files by purpose
**Response:**
```json
{
"object": "list",
"data": [
{
"id": "file-abc123",
"object": "file",
"bytes": 140,
"created_at": 1613779121,
"filename": "mydata.jsonl",
"purpose": "fine-tune"
}
],
"has_more": false
}
```
**Example:**
```python
import requests
# List all files
response = requests.get("http://localhost:8000/v1/openai/v1/files")
files = response.json()
# List files with pagination
response = requests.get(
"http://localhost:8000/v1/openAi/v1/files",
params={"limit": 10, "after": "file-abc123"},
)
files = response.json()
# Filter by purpose
response = requests.get(
"http://localhost:8000/v1/openAi/v1/files", params={"purpose": "fine-tune"}
)
files = response.json()
```
### Retrieve File
**GET** `/v1/openai/v1/files/{file_id}`
Returns information about a specific file.
**Path Parameters:**
- `file_id`: The ID of the file to retrieve
**Response:**
```json
{
"id": "file-abc123",
"object": "file",
"bytes": 140,
"created_at": 1613779121,
"filename": "mydata.jsonl",
"purpose": "fine-tune"
}
```
**Example:**
```python
import requests
file_id = "file-abc123"
response = requests.get(f"http://localhost:8000/v1/openAi/v1/files/{file_id}")
file_info = response.json()
```
### Delete File
**DELETE** `/v1/openai/v1/files/{file_id}`
Delete a file.
**Path Parameters:**
- `file_id`: The ID of the file to delete
**Response:**
```json
{
"id": "file-abc123",
"object": "file",
"deleted": true
}
```
**Example:**
```python
import requests
file_id = "file-abc123"
response = requests.delete(f"http://localhost:8000/v1/openAi/v1/files/{file_id}")
result = response.json()
```
### Retrieve File Content
**GET** `/v1/openai/v1/files/{file_id}/content`
Returns the raw file content as a binary response.
**Path Parameters:**
- `file_id`: The ID of the file to retrieve content from
**Response:**
Binary file content with appropriate headers:
- `Content-Type`: `application/octet-stream`
- `Content-Disposition`: `attachment; filename="filename"`
**Example:**
```python
import requests
file_id = "file-abc123"
response = requests.get(f"http://localhost:8000/v1/openAi/v1/files/{file_id}/content")
# Save content to file
with open("downloaded_file.jsonl", "wb") as f:
f.write(response.content)
# Or process content directly
content = response.content
```
## Vector Store Integration
The Files API integrates with Vector Stores to enable document processing and search. For detailed information about this integration, see [File Operations and Vector Store Integration](../concepts/file_operations_vector_stores.md).
### Vector Store File Operations
**List Vector Store Files:**
- **GET** `/v1/openai/v1/vector_stores/{vector_store_id}/files`
**Retrieve Vector Store File Content:**
- **GET** `/v1/openai/v1/vector_stores/{vector_store_id}/files/{file_id}/content`
**Attach File to Vector Store:**
- **POST** `/v1/openai/v1/vector_stores/{vector_store_id}/files`
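For instance, attaching an uploaded file to a vector store with `requests` might look like this; the `file_id` body field follows the OpenAI Vector Store Files specification, and the store ID is a placeholder:
```python
import requests

vector_store_id = "vs_12345678"  # placeholder ID
response = requests.post(
    f"http://localhost:8000/v1/openai/v1/vector_stores/{vector_store_id}/files",
    json={"file_id": "file-abc123"},
)
print(response.json())
```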
## Error Handling
The Files API returns standard HTTP status codes and error responses:
- `400 Bad Request`: Invalid request parameters
- `404 Not Found`: File not found
- `429 Too Many Requests`: Rate limit exceeded
- `500 Internal Server Error`: Server error
**Error Response Format:**
```json
{
"error": {
"message": "Error description",
"type": "invalid_request_error",
"code": "file_not_found"
}
}
```
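A minimal sketch of handling these errors with `requests`, using the error format shown above:
```python
import requests

response = requests.get("http://localhost:8000/v1/openai/v1/files/file-abc123")
if response.status_code == 200:
    file_info = response.json()
else:
    # Error payloads follow the format documented above
    error = response.json().get("error", {})
    print(f"{response.status_code} {error.get('type')}: {error.get('message')}")
```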
## Rate Limits
The Files API implements rate limiting to ensure fair usage:
- File uploads: 100 files per minute
- File retrievals: 1000 requests per minute
- File deletions: 100 requests per minute
## Best Practices
1. **File Organization**: Use descriptive filenames and appropriate purpose classifications
2. **Batch Operations**: For multiple files, consider using batch endpoints when available
3. **Error Handling**: Always check response status codes and handle errors gracefully
4. **Content Types**: Ensure files are uploaded with appropriate content types
5. **Cleanup**: Regularly delete unused files to manage storage costs
## Integration Examples
### With Python Client
```python
from llama_stack_client import LlamaStackClient
client = LlamaStackClient(base_url="http://localhost:8000")
# Upload a file
with open("data.jsonl", "rb") as f:
file_info = await client.files.upload(file=f, purpose="fine-tune")
# List files
files = await client.files.list(purpose="fine-tune")
# Retrieve file content
content = await client.files.retrieve_content(file_info.id)
```
### With cURL
```bash
# Upload file
curl -X POST http://localhost:8000/v1/openai/v1/files \
  -F "file=@data.jsonl" \
  -F "purpose=fine-tune"
# List files
curl http://localhost:8000/v1/openai/v1/files
# Download file content
curl http://localhost:8000/v1/openai/v1/files/file-abc123/content \
  -o downloaded_file.jsonl
```
## Provider Support
The Files API supports multiple storage backends:
- **Local Filesystem**: Store files on local disk (inline provider)
- **S3**: Store files in AWS S3 or S3-compatible services (remote provider)
- **Custom Backends**: Extensible architecture for custom storage providers
See the [Files Providers](index.md) documentation for detailed configuration options.


@ -0,0 +1,80 @@
# File Operations Quick Reference
## Overview
As of release 0.2.14, Llama Stack provides comprehensive file operations and Vector Store API integration, following the [OpenAI Vector Store Files API specification](https://platform.openai.com/docs/api-reference/vector-stores-files).
> **Note**: For detailed overview and implementation details, see [Overview](../openai_file_operations_support.md#overview) in the full documentation.
## Supported Providers
> **Note**: For complete provider details and features, see [Supported Providers](../openai_file_operations_support.md#supported-providers) in the full documentation.
**Inline Providers**: FAISS, SQLite-vec, Milvus
**Remote Providers**: ChromaDB, Qdrant, Weaviate, PGVector
## Quick Start
### 1. Upload File
```python
file_info = await client.files.upload(
file=open("document.pdf", "rb"), purpose="assistants"
)
```
### 2. Create Vector Store
```python
vector_store = client.vector_stores.create(name="my_docs")
```
### 3. Attach File
```python
await client.vector_stores.files.create(
vector_store_id=vector_store.id, file_id=file_info.id
)
```
### 4. Search
```python
results = await client.vector_stores.search(
vector_store_id=vector_store.id, query="What is the main topic?", max_num_results=5
)
```
## File Processing & Search
**Processing**: 800-token default chunk size, 400-token overlap
**Formats**: PDF, DOCX, TXT, code files, etc.
**Search**: Vector similarity, Hybrid (SQLite-vec), Filtered with metadata
## Configuration
> **Note**: For detailed configuration examples and options, see [Configuration Examples](../openai_file_operations_support.md#configuration-examples) in the full documentation.
**Basic Setup**: Configure vector_io and files providers in your run.yaml
## Common Use Cases
- **RAG Systems**: Document Q&A with file uploads
- **Knowledge Bases**: Searchable document collections
- **Content Analysis**: Document similarity and clustering
- **Research Tools**: Literature review and analysis
## Performance Tips
> **Note**: For detailed performance optimization strategies, see [Performance Considerations](../openai_file_operations_support.md#performance-considerations) in the full documentation.
**Quick Tips**: Choose a provider based on your needs (speed vs. storage vs. scalability)
## Troubleshooting
> **Note**: For comprehensive troubleshooting, see [Troubleshooting](../openai_file_operations_support.md#troubleshooting) in the full documentation.
**Quick Fixes**: Check file format compatibility, optimize chunk sizes, monitor storage
## Resources
- [Full Documentation](openai_file_operations_support.md)
- [Integration Guide](../concepts/file_operations_vector_stores.md)
- [Files API](files_api.md)
- [Provider Details](../vector_io/index.md)

View file

@ -0,0 +1,291 @@
# File Operations Support in Vector Store Providers
## Overview
This document provides a comprehensive overview of file operations and Vector Store API support across all available vector store providers in Llama Stack. As of release 0.2.24, the following providers support full file operations integration.
## Supported Providers
### ✅ Full File Operations Support
The following providers support complete file operations integration, including file upload, automatic processing, and search:
#### Inline Providers (Single Node)
| Provider | File Operations | Key Features |
|----------|----------------|--------------|
| **FAISS** | ✅ Full Support | Fast in-memory search, GPU acceleration |
| **SQLite-vec** | ✅ Full Support | Hybrid search, disk-based storage |
| **Milvus** | ✅ Full Support | High-performance, scalable indexing |
#### Remote Providers (Hosted)
| Provider | File Operations | Key Features |
|----------|----------------|--------------|
| **ChromaDB** | ✅ Full Support | Metadata filtering, persistent storage |
| **Qdrant** | ✅ Full Support | Payload filtering, advanced search |
| **Weaviate** | ✅ Full Support | GraphQL interface, schema management |
| **Postgres (PGVector)** | ✅ Full Support | SQL integration, ACID compliance |
### 🔄 Partial Support
Some providers may support basic vector operations but lack full file operations integration:
| Provider | Status | Notes |
|----------|--------|-------|
| **Meta Reference** | 🔄 Basic | Core vector operations only |
## File Operations Features
All supported providers offer the following file operations capabilities:
### Core Functionality
- **File Upload & Processing**: Automatic document ingestion and chunking
- **Vector Storage**: Embedding generation and storage
- **Search & Retrieval**: Semantic search with metadata filtering
- **File Management**: List, retrieve, and manage files in vector stores
### Advanced Features
- **Automatic Chunking**: Configurable chunk sizes and overlap
- **Metadata Preservation**: File attributes and chunk metadata
- **Status Tracking**: Monitor file processing progress
- **Error Handling**: Comprehensive error reporting and recovery
## Implementation Details
### File Processing Pipeline
1. **Upload**: File uploaded via Files API
2. **Extraction**: Text content extracted from various formats
3. **Chunking**: Content split into optimal chunks (default: 800 tokens)
4. **Embedding**: Chunks converted to vector embeddings
5. **Storage**: Vectors stored with metadata in vector database
6. **Indexing**: Search index updated for fast retrieval
### Supported File Formats
- **Documents**: PDF, DOCX, DOC
- **Text**: TXT, MD, RST
- **Code**: Python, JavaScript, Java, C++, etc.
- **Data**: JSON, CSV, XML
- **Web**: HTML files
### Chunking Strategies
- **Default**: 800 tokens with 400 token overlap
- **Custom**: Configurable chunk sizes and overlap
- **Static**: Fixed-size chunks with overlap (illustrated in the sketch below)
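The static strategy above is easy to picture in code. A small illustrative sketch, using whitespace-separated words as a stand-in for real model tokens, so counts are only approximate:
```python
def chunk_text(text: str, chunk_size: int = 800, overlap: int = 400) -> list[str]:
    """Split text into fixed-size windows that overlap by `overlap` tokens."""
    tokens = text.split()  # stand-in tokenizer; real chunking counts model tokens
    stride = chunk_size - overlap  # how far each window advances
    chunks: list[str] = []
    for start in range(0, len(tokens), stride):
        window = tokens[start : start + chunk_size]
        if not window:
            break
        chunks.append(" ".join(window))
        if start + chunk_size >= len(tokens):
            break  # last window already reached the end of the text
    return chunks
```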
## Provider-Specific Features
### FAISS
- **Storage**: In-memory with optional persistence
- **Performance**: Optimized for speed and GPU acceleration
- **Use Case**: High-performance, memory-constrained environments
### SQLite-vec
- **Storage**: Disk-based with SQLite backend
- **Search**: Hybrid vector + keyword search
- **Use Case**: Large document collections, frequent updates
### Milvus
- **Storage**: Scalable distributed storage
- **Indexing**: Multiple index types (IVF, HNSW)
- **Use Case**: Production deployments, large-scale applications
### ChromaDB
- **Storage**: Persistent storage with metadata
- **Filtering**: Advanced metadata filtering
- **Use Case**: Applications requiring rich metadata
### Qdrant
- **Storage**: High-performance vector database
- **Filtering**: Payload-based filtering
- **Use Case**: Real-time applications, complex queries
### Weaviate
- **Storage**: GraphQL-native vector database
- **Schema**: Flexible schema management
- **Use Case**: Applications requiring complex data relationships
### Postgres (PGVector)
- **Storage**: SQL database with vector extensions
- **Integration**: ACID compliance, existing SQL workflows
- **Use Case**: Applications requiring transactional guarantees
## Configuration Examples
### Basic Configuration
```yaml
vector_io:
- provider_id: faiss
provider_type: inline::faiss
config:
kvstore:
type: sqlite
db_path: ~/.llama/faiss_store.db
```
### With FileResponse Support
```yaml
vector_io:
- provider_id: faiss
provider_type: inline::faiss
config:
kvstore:
type: sqlite
db_path: ~/.llama/faiss_store.db
files:
- provider_id: local-files
provider_type: inline::localfs
config:
storage_dir: ~/.llama/files
metadata_store:
type: sqlite
db_path: ~/.llama/files_metadata.db
```
## Usage Examples
### Python Client
```python
from llama_stack import LlamaStackClient
client = LlamaStackClient("http://localhost:8000")
# Create vector store
vector_store = client.vector_stores.create(name="documents")
# Upload and process file
with open("document.pdf", "rb") as f:
file_info = await client.files.upload(file=f, purpose="assistants")
# Attach to vector store
await client.vector_stores.files.create(
vector_store_id=vector_store.id, file_id=file_info.id
)
# Search
results = await client.vector_stores.search(
vector_store_id=vector_store.id, query="What is the main topic?", max_num_results=5
)
```
### cURL Commands
```bash
# Upload file
curl -X POST http://localhost:8000/v1/openai/v1/files \
-F "file=@document.pdf" \
-F "purpose=assistants"
# Create vector store
curl -X POST http://localhost:8000/v1/openai/v1/vector_stores \
-H "Content-Type: application/json" \
-d '{"name": "documents"}'
# Attach file to vector store
curl -X POST http://localhost:8000/v1/openai/v1/vector_stores/{store_id}/files \
-H "Content-Type: application/json" \
-d '{"file_id": "file-abc123"}'
# Search vector store
curl -X POST http://localhost:8000/v1/openai/v1/vector_stores/{store_id}/search \
-H "Content-Type: application/json" \
-d '{"query": "What is the main topic?", "max_num_results": 5}'
```
## Performance Considerations
### Chunk Size Optimization
- **Small chunks (400-600 tokens)**: Better precision, more results
- **Large chunks (800-1200 tokens)**: Better context, fewer results
- **Overlap (50%)**: Maintains context between chunks
### Storage Efficiency
- **FAISS**: Fastest, but memory-limited
- **SQLite-vec**: Good balance of performance and storage
- **Milvus**: Scalable, production-ready
- **Remote providers**: Managed, but network-dependent
### Search Performance
- **Vector search**: Fastest for semantic queries
- **Hybrid search**: Best accuracy (SQLite-vec only)
- **Filtered search**: Fast with metadata constraints (see the sketch below)
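Continuing the Python example above, a filtered search might look like the sketch below. The `filters` shape follows the OpenAI vector store search convention; exact support and schema vary by provider, so treat this as illustrative:
```python
# Assumes `client` and `vector_store` from the Usage Examples above,
# and that file attributes include a "department" key.
results = await client.vector_stores.search(
    vector_store_id=vector_store.id,
    query="deployment checklist",
    filters={"type": "eq", "key": "department", "value": "engineering"},
    max_num_results=5,
)
```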
## Troubleshooting
### Common Issues
1. **File Processing Failures**
- Check file format compatibility
- Verify file size limits
- Review error messages in file status
2. **Search Performance**
- Optimize chunk sizes for your use case
- Use filters to narrow search scope
- Monitor vector store metrics
3. **Storage Issues**
- Check available disk space
- Verify database permissions
- Monitor memory usage (for in-memory providers)
### Monitoring
```python
# Check file processing status
file_status = await client.vector_stores.files.retrieve(
vector_store_id=vector_store.id, file_id=file_info.id
)
if file_status.status == "failed":
print(f"Error: {file_status.last_error.message}")
# Monitor vector store health
health = await client.vector_stores.health(vector_store_id=vector_store.id)
print(f"Status: {health.status}")
```
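Processing is asynchronous, so it is often useful to poll until a file reaches a terminal state before searching. A minimal sketch, continuing the monitoring example above and assuming OpenAI-style status values (`in_progress`, `completed`, `failed`):
```python
import asyncio


async def wait_for_file(vector_store_id: str, file_id: str, poll_seconds: float = 2.0):
    """Poll a vector store file until it finishes processing."""
    # `client` comes from the monitoring example above.
    while True:
        f = await client.vector_stores.files.retrieve(
            vector_store_id=vector_store_id, file_id=file_id
        )
        if f.status == "completed":
            return f
        if f.status == "failed":
            raise RuntimeError(f.last_error.message)
        await asyncio.sleep(poll_seconds)
```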
## Best Practices
1. **File Organization**: Use descriptive names and organize by purpose
2. **Chunking Strategy**: Test different sizes for your specific use case
3. **Metadata**: Add relevant attributes for better filtering
4. **Monitoring**: Track processing status and search performance
5. **Cleanup**: Regularly remove unused files to manage storage
## Future Enhancements
Planned improvements for file operations support:
- **Batch Processing**: Process multiple files simultaneously
- **Advanced Chunking**: More sophisticated chunking algorithms
- **Custom Embeddings**: Support for custom embedding models
- **Real-time Updates**: Live file processing and indexing
- **Multi-format Support**: Enhanced file format support
## Support and Resources
- **Documentation**: [File Operations and Vector Store Integration](../../concepts/file_operations_vector_stores.mdx)
- **API Reference**: [Files API](files_api.md)
- **Provider Docs**: [Vector Store Providers](../vector_io/index.md)
- **Examples**: [Getting Started](../getting_started/index.md)
- **Community**: [GitHub Discussions](https://github.com/meta-llama/llama-stack/discussions)

View file

@ -22,6 +22,7 @@ Importantly, Llama Stack always strives to provide at least one fully inline pro
## Provider Categories
- **[External Providers](external/index.mdx)** - Guide for building and using external providers
- **[OpenAI Compatibility](../api-openai/index.mdx)** - OpenAI API compatibility layer
- **[Inference](inference/index.mdx)** - LLM and embedding model providers
- **[Agents](agents/index.mdx)** - Agentic system providers
- **[DatasetIO](datasetio/index.mdx)** - Dataset and data loader providers
@ -30,6 +31,16 @@ Importantly, Llama Stack always strives to provide at least one fully inline pro
- **[Tool Runtime](tool_runtime/index.mdx)** - Tool and protocol providers
- **[Files](files/index.mdx)** - File system and storage providers
## Other information about Providers
- **[OpenAI Compatibility](./openai.mdx)** - OpenAI API compatibility layer
## API Documentation
For comprehensive API documentation and reference:
- **[API Reference](../api/index.mdx)** - Complete API documentation
- **[Experimental APIs](../api-experimental/index.mdx)** - APIs in development
- **[Deprecated APIs](../api-deprecated/index.mdx)** - Legacy APIs being phased out
- **[OpenAI Compatibility](../api-openai/index.mdx)** - OpenAI API compatibility guide
## Additional Provider Information
- **[OpenAI Implementation Guide](./openai.mdx)** - Code examples and implementation details for OpenAI APIs
- **[OpenAI-Compatible Responses Limitations](./openai_responses_limitations.mdx)** - Known limitations of the Responses API in Llama Stack

View file

@ -1,9 +1,14 @@
---
title: OpenAI Compatibility
description: OpenAI API Compatibility
sidebar_label: OpenAI Compatibility
sidebar_position: 1
title: OpenAI Implementation Guide
description: Code examples and implementation details for OpenAI API compatibility
sidebar_label: OpenAI Implementation
sidebar_position: 2
---
# OpenAI Implementation Guide
This guide provides detailed code examples and implementation details for using OpenAI-compatible APIs with Llama Stack. For a comprehensive overview of OpenAI compatibility features, see our [OpenAI API Compatibility Guide](../api-openai/index.mdx).
## OpenAI API Compatibility
### Server path
@ -195,3 +200,9 @@ Lines of code unfurl
Logic whispers in the dark
Art in hidden form
```
## Additional Resources
- **[OpenAI API Compatibility Guide](../api-openai/index.mdx)** - Comprehensive overview of OpenAI compatibility features
- **[OpenAI Responses API Limitations](./openai_responses_limitations.mdx)** - Detailed limitations and known issues
- **[Provider Documentation](../index.mdx)** - Complete provider ecosystem overview

View file

@ -153,7 +153,7 @@ description: |
Example using RAGQueryConfig with different search modes:
```python
from llama_stack.apis.tools import RAGQueryConfig, RRFRanker, WeightedRanker
from llama_stack_api import RAGQueryConfig, RRFRanker, WeightedRanker
# Vector search
config = RAGQueryConfig(mode="vector", max_chunks=5)
@ -358,7 +358,7 @@ Two ranker types are supported:
Example using RAGQueryConfig with different search modes:
```python
from llama_stack.apis.tools import RAGQueryConfig, RRFRanker, WeightedRanker
from llama_stack_api import RAGQueryConfig, RRFRanker, WeightedRanker
# Vector search
config = RAGQueryConfig(mode="vector", max_chunks=5)

View file

@ -16,7 +16,7 @@ import sys
import fire
import ruamel.yaml as yaml
from llama_stack.apis.version import LLAMA_STACK_API_V1 # noqa: E402
from llama_stack_api import LLAMA_STACK_API_V1 # noqa: E402
from llama_stack.core.stack import LlamaStack # noqa: E402
from .pyopenapi.options import Options # noqa: E402

View file

@ -16,27 +16,27 @@ from typing import Annotated, Any, Dict, get_args, get_origin, Set, Union
from fastapi import UploadFile
from llama_stack.apis.datatypes import Error
from llama_stack.strong_typing.core import JsonType
from llama_stack.strong_typing.docstring import Docstring, parse_type
from llama_stack.strong_typing.inspection import (
from llama_stack_api import (
Docstring,
Error,
JsonSchemaGenerator,
JsonType,
Schema,
SchemaOptions,
get_schema_identifier,
is_generic_list,
is_type_optional,
is_type_union,
is_unwrapped_body_param,
json_dump_string,
object_to_json,
parse_type,
python_type_to_name,
register_schema,
unwrap_generic_list,
unwrap_optional_type,
unwrap_union_types,
)
from llama_stack.strong_typing.name import python_type_to_name
from llama_stack.strong_typing.schema import (
get_schema_identifier,
JsonSchemaGenerator,
register_schema,
Schema,
SchemaOptions,
)
from llama_stack.strong_typing.serialization import json_dump_string, object_to_json
from pydantic import BaseModel
from .operations import (
@ -979,8 +979,8 @@ class Generator:
if deprecated:
filtered_operations.append(op)
elif self.options.stability_filter == "stainless":
# Include both stable (v1 non-deprecated) and experimental (v1alpha, v1beta) endpoints
if (stability_level == "v1" and not deprecated) or stability_level in ["v1alpha", "v1beta"]:
# Include stable (v1), deprecated (v1 deprecated), and experimental (v1alpha, v1beta) endpoints
if stability_level == "v1" or stability_level in ["v1alpha", "v1beta"]:
filtered_operations.append(op)
operations = filtered_operations

View file

@ -11,19 +11,21 @@ import typing
from dataclasses import dataclass
from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Tuple, Union
from llama_stack.apis.version import LLAMA_STACK_API_V1, LLAMA_STACK_API_V1BETA, LLAMA_STACK_API_V1ALPHA
from termcolor import colored
from llama_stack.strong_typing.inspection import get_signature
from typing import get_origin, get_args
from fastapi import UploadFile
from fastapi.params import File, Form
from typing import Annotated
from llama_stack.schema_utils import ExtraBodyField
from llama_stack_api import (
ExtraBodyField,
LLAMA_STACK_API_V1,
LLAMA_STACK_API_V1ALPHA,
LLAMA_STACK_API_V1BETA,
get_signature,
)
def split_prefix(

View file

@ -9,7 +9,7 @@ import enum
from dataclasses import dataclass
from typing import Any, ClassVar, Dict, List, Optional, Union
from llama_stack.strong_typing.schema import JsonType, Schema, StrictJsonType
from llama_stack_api import JsonType, Schema, StrictJsonType
URL = str

View file

@ -11,8 +11,7 @@ from pathlib import Path
from typing import Any, List, Optional, TextIO, Union, get_type_hints, get_origin, get_args
from pydantic import BaseModel
from llama_stack.strong_typing.schema import object_to_json, StrictJsonType
from llama_stack.strong_typing.inspection import is_unwrapped_body_param
from llama_stack_api import StrictJsonType, is_unwrapped_body_param, object_to_json
from llama_stack.core.resolver import api_protocol_map
from .generator import Generator
@ -165,12 +164,12 @@ def _validate_api_delete_method_returns_none(method) -> str | None:
return "has no return type annotation"
return_type = hints['return']
# Allow OpenAI endpoints to return response objects since they follow OpenAI specification
method_name = getattr(method, '__name__', '')
if method_name.__contains__('openai_'):
return None
if return_type is not None and return_type is not type(None):
return "does not return None where None is mandatory"

View file

@ -998,6 +998,39 @@ paths:
description: List models using the OpenAI API.
parameters: []
deprecated: false
post:
responses:
'200':
description: A Model.
content:
application/json:
schema:
$ref: '#/components/schemas/Model'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Models
summary: Register model.
description: >-
Register model.
Register a model.
parameters: []
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/RegisterModelRequest'
required: true
deprecated: true
/v1/models/{model_id}:
get:
responses:
@ -1032,6 +1065,36 @@ paths:
schema:
type: string
deprecated: false
delete:
responses:
'200':
description: OK
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Models
summary: Unregister model.
description: >-
Unregister model.
Unregister a model.
parameters:
- name: model_id
in: path
description: >-
The identifier of the model to unregister.
required: true
schema:
type: string
deprecated: true
/v1/moderations:
post:
responses:
@ -1662,6 +1725,32 @@ paths:
description: List all scoring functions.
parameters: []
deprecated: false
post:
responses:
'200':
description: OK
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- ScoringFunctions
summary: Register a scoring function.
description: Register a scoring function.
parameters: []
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/RegisterScoringFunctionRequest'
required: true
deprecated: true
/v1/scoring-functions/{scoring_fn_id}:
get:
responses:
@ -1693,6 +1782,33 @@ paths:
schema:
type: string
deprecated: false
delete:
responses:
'200':
description: OK
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- ScoringFunctions
summary: Unregister a scoring function.
description: Unregister a scoring function.
parameters:
- name: scoring_fn_id
in: path
description: >-
The ID of the scoring function to unregister.
required: true
schema:
type: string
deprecated: true
/v1/scoring/score:
post:
responses:
@ -1781,6 +1897,36 @@ paths:
description: List all shields.
parameters: []
deprecated: false
post:
responses:
'200':
description: A Shield.
content:
application/json:
schema:
$ref: '#/components/schemas/Shield'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Shields
summary: Register a shield.
description: Register a shield.
parameters: []
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/RegisterShieldRequest'
required: true
deprecated: true
/v1/shields/{identifier}:
get:
responses:
@ -1812,6 +1958,33 @@ paths:
schema:
type: string
deprecated: false
delete:
responses:
'200':
description: OK
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Shields
summary: Unregister a shield.
description: Unregister a shield.
parameters:
- name: identifier
in: path
description: >-
The identifier of the shield to unregister.
required: true
schema:
type: string
deprecated: true
/v1/tool-runtime/invoke:
post:
responses:
@ -1907,6 +2080,32 @@ paths:
description: List tool groups with optional provider.
parameters: []
deprecated: false
post:
responses:
'200':
description: OK
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- ToolGroups
summary: Register a tool group.
description: Register a tool group.
parameters: []
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/RegisterToolGroupRequest'
required: true
deprecated: true
/v1/toolgroups/{toolgroup_id}:
get:
responses:
@ -1938,6 +2137,32 @@ paths:
schema:
type: string
deprecated: false
delete:
responses:
'200':
description: OK
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- ToolGroups
summary: Unregister a tool group.
description: Unregister a tool group.
parameters:
- name: toolgroup_id
in: path
description: The ID of the tool group to unregister.
required: true
schema:
type: string
deprecated: true
/v1/tools:
get:
responses:
@ -11420,6 +11645,152 @@ components:
- hyperparam_search_config
- logger_config
title: SupervisedFineTuneRequest
RegisterModelRequest:
type: object
properties:
model_id:
type: string
description: The identifier of the model to register.
provider_model_id:
type: string
description: >-
The identifier of the model in the provider.
provider_id:
type: string
description: The identifier of the provider.
metadata:
type: object
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
description: Any additional metadata for this model.
model_type:
$ref: '#/components/schemas/ModelType'
description: The type of model to register.
additionalProperties: false
required:
- model_id
title: RegisterModelRequest
ParamType:
oneOf:
- $ref: '#/components/schemas/StringType'
- $ref: '#/components/schemas/NumberType'
- $ref: '#/components/schemas/BooleanType'
- $ref: '#/components/schemas/ArrayType'
- $ref: '#/components/schemas/ObjectType'
- $ref: '#/components/schemas/JsonType'
- $ref: '#/components/schemas/UnionType'
- $ref: '#/components/schemas/ChatCompletionInputType'
- $ref: '#/components/schemas/CompletionInputType'
discriminator:
propertyName: type
mapping:
string: '#/components/schemas/StringType'
number: '#/components/schemas/NumberType'
boolean: '#/components/schemas/BooleanType'
array: '#/components/schemas/ArrayType'
object: '#/components/schemas/ObjectType'
json: '#/components/schemas/JsonType'
union: '#/components/schemas/UnionType'
chat_completion_input: '#/components/schemas/ChatCompletionInputType'
completion_input: '#/components/schemas/CompletionInputType'
RegisterScoringFunctionRequest:
type: object
properties:
scoring_fn_id:
type: string
description: >-
The ID of the scoring function to register.
description:
type: string
description: The description of the scoring function.
return_type:
$ref: '#/components/schemas/ParamType'
description: The return type of the scoring function.
provider_scoring_fn_id:
type: string
description: >-
The ID of the provider scoring function to use for the scoring function.
provider_id:
type: string
description: >-
The ID of the provider to use for the scoring function.
params:
$ref: '#/components/schemas/ScoringFnParams'
description: >-
The parameters for the scoring function for benchmark eval, these can
be overridden for app eval.
additionalProperties: false
required:
- scoring_fn_id
- description
- return_type
title: RegisterScoringFunctionRequest
RegisterShieldRequest:
type: object
properties:
shield_id:
type: string
description: >-
The identifier of the shield to register.
provider_shield_id:
type: string
description: >-
The identifier of the shield in the provider.
provider_id:
type: string
description: The identifier of the provider.
params:
type: object
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
description: The parameters of the shield.
additionalProperties: false
required:
- shield_id
title: RegisterShieldRequest
RegisterToolGroupRequest:
type: object
properties:
toolgroup_id:
type: string
description: The ID of the tool group to register.
provider_id:
type: string
description: >-
The ID of the provider to use for the tool group.
mcp_endpoint:
$ref: '#/components/schemas/URL'
description: >-
The MCP endpoint to use for the tool group.
args:
type: object
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
description: >-
A dictionary of arguments to pass to the tool group.
additionalProperties: false
required:
- toolgroup_id
- provider_id
title: RegisterToolGroupRequest
DataSource:
oneOf:
- $ref: '#/components/schemas/URIDataSource'

View file

@ -31,6 +31,7 @@ dependencies = [
"httpx",
"jinja2>=3.1.6",
"jsonschema",
"llama-stack-api", # API and provider specifications (local dev via tool.uv.sources)
"openai>=2.5.0",
"prompt-toolkit",
"python-dotenv",
@ -69,7 +70,7 @@ dev = [
"black",
"ruff",
"mypy",
"pre-commit",
"pre-commit>=4.4.0",
"ruamel.yaml", # needed for openapi generator
]
# Type checking dependencies - includes type stubs and optional runtime dependencies
@ -180,7 +181,7 @@ install-wheel-from-presigned = "llama_stack.cli.scripts.run:install_wheel_from_p
[tool.setuptools.packages.find]
where = ["src"]
include = ["llama_stack", "llama_stack.*"]
include = ["llama_stack", "llama_stack.*", "llama_stack_api", "llama_stack_api.*"]
[[tool.uv.index]]
name = "pytorch-cpu"
@ -190,6 +191,7 @@ explicit = true
[tool.uv.sources]
torch = [{ index = "pytorch-cpu" }]
torchvision = [{ index = "pytorch-cpu" }]
llama-stack-api = [{ path = "src/llama_stack_api", editable = true }]
[tool.ruff]
line-length = 120
@ -257,7 +259,7 @@ unfixable = [
[tool.mypy]
mypy_path = ["src"]
packages = ["llama_stack"]
packages = ["llama_stack", "llama_stack_api"]
plugins = ['pydantic.mypy']
disable_error_code = []
warn_return_any = true
@ -282,12 +284,13 @@ exclude = [
"^src/llama_stack/models/llama/llama3/interface\\.py$",
"^src/llama_stack/models/llama/llama3/tokenizer\\.py$",
"^src/llama_stack/models/llama/llama3/tool_utils\\.py$",
"^src/llama_stack/providers/inline/datasetio/localfs/",
"^src/llama_stack/providers/inline/eval/meta_reference/eval\\.py$",
"^src/llama_stack/providers/inline/inference/meta_reference/inference\\.py$",
"^src/llama_stack/models/llama/llama3/generation\\.py$",
"^src/llama_stack/models/llama/llama3/multimodal/model\\.py$",
"^src/llama_stack/models/llama/llama4/",
"^src/llama_stack/providers/inline/agents/meta_reference/",
"^src/llama_stack/providers/inline/datasetio/localfs/",
"^src/llama_stack/providers/inline/eval/meta_reference/eval\\.py$",
"^src/llama_stack/providers/inline/inference/meta_reference/inference\\.py$",
"^src/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers\\.py$",
"^src/llama_stack/providers/inline/post_training/common/validator\\.py$",
"^src/llama_stack/providers/inline/safety/code_scanner/",
@ -337,7 +340,7 @@ exclude = [
"^src/llama_stack/providers/utils/telemetry/dataset_mixin\\.py$",
"^src/llama_stack/providers/utils/telemetry/trace_protocol\\.py$",
"^src/llama_stack/providers/utils/telemetry/tracing\\.py$",
"^src/llama_stack/strong_typing/auxiliary\\.py$",
"^src/llama_stack_api/strong_typing/auxiliary\\.py$",
"^src/llama_stack/distributions/template\\.py$",
]

View file

@ -15,10 +15,10 @@ from pathlib import Path
import fire
from llama_stack.apis.common.errors import ModelNotFoundError
from llama_stack.models.llama.llama3.generation import Llama3
from llama_stack.models.llama.llama4.generation import Llama4
from llama_stack.models.llama.sku_list import resolve_model
from llama_stack_api import ModelNotFoundError
THIS_DIR = Path(__file__).parent.resolve()

View file

@ -162,6 +162,17 @@ if [[ "$COLLECT_ONLY" == false ]]; then
export LLAMA_STACK_TEST_STACK_CONFIG_TYPE="library_client"
echo "Setting stack config type: library_client"
fi
# Set MCP host for in-process MCP server tests
# - For library client and server mode: localhost (both on same host)
# - For docker mode: host.docker.internal (container needs to reach host)
if [[ "$STACK_CONFIG" == docker:* ]]; then
export LLAMA_STACK_TEST_MCP_HOST="host.docker.internal"
echo "Setting MCP host: host.docker.internal (docker mode)"
else
export LLAMA_STACK_TEST_MCP_HOST="localhost"
echo "Setting MCP host: localhost (library/server mode)"
fi
fi
SETUP_ENV=$(PYTHONPATH=$THIS_DIR/.. python "$THIS_DIR/get_setup_env.py" --suite "$TEST_SUITE" --setup "$TEST_SETUP" --format bash)
@ -338,6 +349,7 @@ if [[ "$STACK_CONFIG" == *"docker:"* && "$COLLECT_ONLY" == false ]]; then
DOCKER_ENV_VARS=""
DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e LLAMA_STACK_TEST_INFERENCE_MODE=$INFERENCE_MODE"
DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e LLAMA_STACK_TEST_STACK_CONFIG_TYPE=server"
DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e LLAMA_STACK_TEST_MCP_HOST=${LLAMA_STACK_TEST_MCP_HOST:-host.docker.internal}"
# Disabled: https://github.com/llamastack/llama-stack/issues/4089
#DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:${COLLECTOR_PORT}"
DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e OTEL_METRIC_EXPORT_INTERVAL=200"
@ -371,8 +383,11 @@ if [[ "$STACK_CONFIG" == *"docker:"* && "$COLLECT_ONLY" == false ]]; then
# Use regular port mapping instead
NETWORK_MODE=""
PORT_MAPPINGS=""
ADD_HOST_FLAG=""
if [[ "$(uname)" != "Darwin" ]] && [[ "$(uname)" != *"MINGW"* ]]; then
NETWORK_MODE="--network host"
# On Linux with host network, also add host.docker.internal mapping for consistency
ADD_HOST_FLAG="--add-host=host.docker.internal:host-gateway"
else
# On non-Linux (macOS, Windows), need explicit port mappings for both app and telemetry
PORT_MAPPINGS="-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT -p $COLLECTOR_PORT:$COLLECTOR_PORT"
@ -381,6 +396,7 @@ if [[ "$STACK_CONFIG" == *"docker:"* && "$COLLECT_ONLY" == false ]]; then
docker run -d $NETWORK_MODE --name "$container_name" \
$PORT_MAPPINGS \
$ADD_HOST_FLAG \
$DOCKER_ENV_VARS \
"$IMAGE_NAME" \
--port $LLAMA_STACK_PORT

View file

@ -22,7 +22,7 @@ def get_api_docstring(api_name: str) -> str | None:
"""Extract docstring from the API protocol class."""
try:
# Import the API module dynamically
api_module = __import__(f"llama_stack.apis.{api_name}", fromlist=[api_name.title()])
api_module = __import__(f"llama_stack_api.{api_name}", fromlist=[api_name.title()])
# Get the main protocol class (usually capitalized API name)
protocol_class_name = api_name.title()
@ -83,8 +83,9 @@ def get_config_class_info(config_class_path: str) -> dict[str, Any]:
# this string replace is ridiculous
field_type = field_type.replace("typing.", "").replace("Optional[", "").replace("]", "")
field_type = field_type.replace("Annotated[", "").replace("FieldInfo(", "").replace(")", "")
field_type = field_type.replace("llama_stack.apis.inference.inference.", "")
field_type = field_type.replace("llama_stack_api.inference.", "")
field_type = field_type.replace("llama_stack.providers.", "")
field_type = field_type.replace("llama_stack_api.datatypes.", "")
default_value = field.default
if field.default_factory is not None:

View file

@ -1,7 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .agents import *

View file

@ -1,9 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .batches import Batches, BatchObject, ListBatchesResponse
__all__ = ["Batches", "BatchObject", "ListBatchesResponse"]

View file

@ -1,7 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .benchmarks import *

View file

@ -1,5 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

View file

@ -1,27 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .conversations import (
Conversation,
ConversationDeletedResource,
ConversationItem,
ConversationItemCreateRequest,
ConversationItemDeletedResource,
ConversationItemList,
Conversations,
Metadata,
)
__all__ = [
"Conversation",
"ConversationDeletedResource",
"ConversationItem",
"ConversationItemCreateRequest",
"ConversationItemDeletedResource",
"ConversationItemList",
"Conversations",
"Metadata",
]

View file

@ -1,7 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .datasetio import *

View file

@ -1,7 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .datasets import *

View file

@ -1,158 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from enum import Enum, EnumMeta
from pydantic import BaseModel, Field
from llama_stack.schema_utils import json_schema_type
class DynamicApiMeta(EnumMeta):
def __new__(cls, name, bases, namespace):
# Store the original enum values
original_values = {k: v for k, v in namespace.items() if not k.startswith("_")}
# Create the enum class
cls = super().__new__(cls, name, bases, namespace)
# Store the original values for reference
cls._original_values = original_values
# Initialize _dynamic_values
cls._dynamic_values = {}
return cls
def __call__(cls, value):
try:
return super().__call__(value)
except ValueError as e:
# If this value was already dynamically added, return it
if value in cls._dynamic_values:
return cls._dynamic_values[value]
# If the value doesn't exist, create a new enum member
# Create a new member name from the value
member_name = value.lower().replace("-", "_")
# If this member name already exists in the enum, return the existing member
if member_name in cls._member_map_:
return cls._member_map_[member_name]
# Instead of creating a new member, raise ValueError to force users to use Api.add() to
# register new APIs explicitly
raise ValueError(f"API '{value}' does not exist. Use Api.add() to register new APIs.") from e
def __iter__(cls):
# Allow iteration over both static and dynamic members
yield from super().__iter__()
if hasattr(cls, "_dynamic_values"):
yield from cls._dynamic_values.values()
def add(cls, value):
"""
Add a new API to the enum.
Used to register external APIs.
"""
member_name = value.lower().replace("-", "_")
# If this member name already exists in the enum, return it
if member_name in cls._member_map_:
return cls._member_map_[member_name]
# Create a new enum member
member = object.__new__(cls)
member._name_ = member_name
member._value_ = value
# Add it to the enum class
cls._member_map_[member_name] = member
cls._member_names_.append(member_name)
cls._member_type_ = str
# Store it in our dynamic values
cls._dynamic_values[value] = member
return member
@json_schema_type
class Api(Enum, metaclass=DynamicApiMeta):
"""Enumeration of all available APIs in the Llama Stack system.
:cvar providers: Provider management and configuration
:cvar inference: Text generation, chat completions, and embeddings
:cvar safety: Content moderation and safety shields
:cvar agents: Agent orchestration and execution
:cvar batches: Batch processing for asynchronous API requests
:cvar vector_io: Vector database operations and queries
:cvar datasetio: Dataset input/output operations
:cvar scoring: Model output evaluation and scoring
:cvar eval: Model evaluation and benchmarking framework
:cvar post_training: Fine-tuning and model training
:cvar tool_runtime: Tool execution and management
:cvar telemetry: Observability and system monitoring
:cvar models: Model metadata and management
:cvar shields: Safety shield implementations
:cvar datasets: Dataset creation and management
:cvar scoring_functions: Scoring function definitions
:cvar benchmarks: Benchmark suite management
:cvar tool_groups: Tool group organization
:cvar files: File storage and management
:cvar prompts: Prompt versions and management
:cvar inspect: Built-in system inspection and introspection
"""
providers = "providers"
inference = "inference"
safety = "safety"
agents = "agents"
batches = "batches"
vector_io = "vector_io"
datasetio = "datasetio"
scoring = "scoring"
eval = "eval"
post_training = "post_training"
tool_runtime = "tool_runtime"
models = "models"
shields = "shields"
vector_stores = "vector_stores" # only used for routing table
datasets = "datasets"
scoring_functions = "scoring_functions"
benchmarks = "benchmarks"
tool_groups = "tool_groups"
files = "files"
prompts = "prompts"
conversations = "conversations"
# built-in API
inspect = "inspect"
@json_schema_type
class Error(BaseModel):
"""
Error response from the API. Roughly follows RFC 7807.
:param status: HTTP status code
:param title: Error title, a short summary of the error which is invariant for an error type
:param detail: Error detail, a longer human-readable description of the error
:param instance: (Optional) A URL which can be used to retrieve more information about the specific occurrence of the error
"""
status: int
title: str
detail: str
instance: str | None = None
class ExternalApiSpec(BaseModel):
"""Specification for an external API implementation."""
module: str = Field(..., description="Python module containing the API implementation")
name: str = Field(..., description="Name of the API")
pip_packages: list[str] = Field(default=[], description="List of pip packages to install the API")
protocol: str = Field(..., description="Name of the protocol class for the API")

View file

@ -1,7 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .eval import *

View file

@ -1,7 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .files import *

View file

@ -1,7 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .inference import *

View file

@ -1,7 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .inspect import *

View file

@ -1,7 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .models import *

View file

@ -1,7 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .post_training import *

View file

@ -1,9 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .prompts import ListPromptsResponse, Prompt, Prompts
__all__ = ["Prompt", "Prompts", "ListPromptsResponse"]

View file

@ -1,7 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .providers import *

View file

@ -1,7 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .safety import *

View file

@ -1,7 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .scoring import *

View file

@ -1,7 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .scoring_functions import *

View file

@ -1,7 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .shields import *

View file

@ -1,8 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .rag_tool import *
from .tools import *

View file

@ -1,7 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .vector_io import *

View file

@ -1,7 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .vector_stores import *

View file

@ -21,7 +21,7 @@ from llama_stack.core.datatypes import (
from llama_stack.core.distribution import get_provider_registry
from llama_stack.core.stack import replace_env_vars
from llama_stack.log import get_logger
from llama_stack.providers.datatypes import Api
from llama_stack_api import Api
TEMPLATES_PATH = Path(__file__).parent.parent.parent / "templates"

View file

@ -32,7 +32,7 @@ from llama_stack.core.storage.datatypes import (
from llama_stack.core.utils.config_dirs import DISTRIBS_BASE_DIR, EXTERNAL_PROVIDERS_DIR
from llama_stack.core.utils.dynamic import instantiate_class_type
from llama_stack.core.utils.image_types import LlamaStackImageType
from llama_stack.providers.datatypes import Api
from llama_stack_api import Api
TEMPLATES_PATH = Path(__file__).parent.parent.parent / "distributions"

View file

@ -13,7 +13,7 @@ from llama_stack.core.datatypes import BuildConfig
from llama_stack.core.distribution import get_provider_registry
from llama_stack.distributions.template import DistributionTemplate
from llama_stack.log import get_logger
from llama_stack.providers.datatypes import Api
from llama_stack_api import Api
log = get_logger(name=__name__, category="core")

View file

@ -15,7 +15,7 @@ import httpx
from pydantic import BaseModel, parse_obj_as
from termcolor import cprint
from llama_stack.providers.datatypes import RemoteProviderConfig
from llama_stack_api import RemoteProviderConfig
_CLIENT_CLASSES = {}

View file

@ -20,7 +20,7 @@ from llama_stack.core.stack import cast_image_name_to_string, replace_env_vars
from llama_stack.core.utils.dynamic import instantiate_class_type
from llama_stack.core.utils.prompt_for_config import prompt_for_config
from llama_stack.log import get_logger
from llama_stack.providers.datatypes import Api, ProviderSpec
from llama_stack_api import Api, ProviderSpec
logger = get_logger(name=__name__, category="core")

View file

@ -10,7 +10,12 @@ from typing import Any, Literal
from pydantic import BaseModel, TypeAdapter
from llama_stack.apis.conversations.conversations import (
from llama_stack.core.datatypes import AccessRule, StackRunConfig
from llama_stack.log import get_logger
from llama_stack.providers.utils.sqlstore.api import ColumnDefinition, ColumnType
from llama_stack.providers.utils.sqlstore.authorized_sqlstore import AuthorizedSqlStore
from llama_stack.providers.utils.sqlstore.sqlstore import sqlstore_impl
from llama_stack_api import (
Conversation,
ConversationDeletedResource,
ConversationItem,
@ -20,11 +25,6 @@ from llama_stack.apis.conversations.conversations import (
Conversations,
Metadata,
)
from llama_stack.core.datatypes import AccessRule, StackRunConfig
from llama_stack.log import get_logger
from llama_stack.providers.utils.sqlstore.api import ColumnDefinition, ColumnType
from llama_stack.providers.utils.sqlstore.authorized_sqlstore import AuthorizedSqlStore
from llama_stack.providers.utils.sqlstore.sqlstore import sqlstore_impl
logger = get_logger(name=__name__, category="openai_conversations")

View file

@ -11,20 +11,6 @@ from urllib.parse import urlparse
from pydantic import BaseModel, Field, field_validator, model_validator
from llama_stack.apis.benchmarks import Benchmark, BenchmarkInput
from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Dataset, DatasetInput
from llama_stack.apis.eval import Eval
from llama_stack.apis.inference import Inference
from llama_stack.apis.models import Model, ModelInput
from llama_stack.apis.resource import Resource
from llama_stack.apis.safety import Safety
from llama_stack.apis.scoring import Scoring
from llama_stack.apis.scoring_functions import ScoringFn, ScoringFnInput
from llama_stack.apis.shields import Shield, ShieldInput
from llama_stack.apis.tools import ToolGroup, ToolGroupInput, ToolRuntime
from llama_stack.apis.vector_io import VectorIO
from llama_stack.apis.vector_stores import VectorStore, VectorStoreInput
from llama_stack.core.access_control.datatypes import AccessRule
from llama_stack.core.storage.datatypes import (
KVStoreReference,
@ -32,7 +18,32 @@ from llama_stack.core.storage.datatypes import (
StorageConfig,
)
from llama_stack.log import LoggingConfig
from llama_stack.providers.datatypes import Api, ProviderSpec
from llama_stack_api import (
Api,
Benchmark,
BenchmarkInput,
Dataset,
DatasetInput,
DatasetIO,
Eval,
Inference,
Model,
ModelInput,
ProviderSpec,
Resource,
Safety,
Scoring,
ScoringFn,
ScoringFnInput,
Shield,
ShieldInput,
ToolGroup,
ToolGroupInput,
ToolRuntime,
VectorIO,
VectorStore,
VectorStoreInput,
)
LLAMA_STACK_BUILD_CONFIG_VERSION = 2
LLAMA_STACK_RUN_CONFIG_VERSION = 2

View file

@ -15,7 +15,7 @@ from pydantic import BaseModel
from llama_stack.core.datatypes import BuildConfig, DistributionSpec
from llama_stack.core.external import load_external_apis
from llama_stack.log import get_logger
from llama_stack.providers.datatypes import (
from llama_stack_api import (
Api,
InlineProviderSpec,
ProviderSpec,

View file

@ -7,9 +7,9 @@
import yaml
from llama_stack.apis.datatypes import Api, ExternalApiSpec
from llama_stack.core.datatypes import BuildConfig, StackRunConfig
from llama_stack.log import get_logger
from llama_stack_api import Api, ExternalApiSpec
logger = get_logger(name=__name__, category="core")

View file

@ -8,17 +8,17 @@ from importlib.metadata import version
from pydantic import BaseModel
from llama_stack.apis.inspect import (
from llama_stack.core.datatypes import StackRunConfig
from llama_stack.core.external import load_external_apis
from llama_stack.core.server.routes import get_all_api_routes
from llama_stack_api import (
HealthInfo,
HealthStatus,
Inspect,
ListRoutesResponse,
RouteInfo,
VersionInfo,
)
from llama_stack.core.datatypes import StackRunConfig
from llama_stack.core.external import load_external_apis
from llama_stack.core.server.routes import get_all_api_routes
from llama_stack.providers.datatypes import HealthStatus
class DistributionInspectConfig(BaseModel):

View file

@ -19,6 +19,8 @@ import httpx
import yaml
from fastapi import Response as FastAPIResponse
from llama_stack_api import is_unwrapped_body_param
try:
from llama_stack_client import (
NOT_GIVEN,
@ -57,7 +59,6 @@ from llama_stack.core.utils.config import redact_sensitive_fields
from llama_stack.core.utils.context import preserve_contexts_async_generator
from llama_stack.core.utils.exec import in_notebook
from llama_stack.log import get_logger, setup_logging
from llama_stack.strong_typing.inspection import is_unwrapped_body_param
logger = get_logger(name=__name__, category="core")

View file

@ -9,9 +9,9 @@ from typing import Any
from pydantic import BaseModel
from llama_stack.apis.prompts import ListPromptsResponse, Prompt, Prompts
from llama_stack.core.datatypes import StackRunConfig
from llama_stack.providers.utils.kvstore import KVStore, kvstore_impl
from llama_stack_api import ListPromptsResponse, Prompt, Prompts
class PromptServiceConfig(BaseModel):

View file

@ -9,9 +9,8 @@ from typing import Any
from pydantic import BaseModel
from llama_stack.apis.providers import ListProvidersResponse, ProviderInfo, Providers
from llama_stack.log import get_logger
from llama_stack.providers.datatypes import HealthResponse, HealthStatus
from llama_stack_api import HealthResponse, HealthStatus, ListProvidersResponse, ProviderInfo, Providers
from .datatypes import StackRunConfig
from .utils.config import redact_sensitive_fields

View file

@ -8,29 +8,6 @@ import importlib.metadata
import inspect
from typing import Any
from llama_stack.apis.agents import Agents
from llama_stack.apis.batches import Batches
from llama_stack.apis.benchmarks import Benchmarks
from llama_stack.apis.conversations import Conversations
from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Datasets
from llama_stack.apis.datatypes import ExternalApiSpec
from llama_stack.apis.eval import Eval
from llama_stack.apis.files import Files
from llama_stack.apis.inference import Inference, InferenceProvider
from llama_stack.apis.inspect import Inspect
from llama_stack.apis.models import Models
from llama_stack.apis.post_training import PostTraining
from llama_stack.apis.prompts import Prompts
from llama_stack.apis.providers import Providers as ProvidersAPI
from llama_stack.apis.safety import Safety
from llama_stack.apis.scoring import Scoring
from llama_stack.apis.scoring_functions import ScoringFunctions
from llama_stack.apis.shields import Shields
from llama_stack.apis.tools import ToolGroups, ToolRuntime
from llama_stack.apis.vector_io import VectorIO
from llama_stack.apis.vector_stores import VectorStore
from llama_stack.apis.version import LLAMA_STACK_API_V1ALPHA
from llama_stack.core.client import get_client_impl
from llama_stack.core.datatypes import (
AccessRule,
@ -44,17 +21,44 @@ from llama_stack.core.external import load_external_apis
from llama_stack.core.store import DistributionRegistry
from llama_stack.core.utils.dynamic import instantiate_class_type
from llama_stack.log import get_logger
from llama_stack.providers.datatypes import (
from llama_stack_api import (
LLAMA_STACK_API_V1ALPHA,
Agents,
Api,
Batches,
Benchmarks,
BenchmarksProtocolPrivate,
Conversations,
DatasetIO,
Datasets,
DatasetsProtocolPrivate,
Eval,
ExternalApiSpec,
Files,
Inference,
InferenceProvider,
Inspect,
Models,
ModelsProtocolPrivate,
PostTraining,
Prompts,
ProviderSpec,
RemoteProviderConfig,
RemoteProviderSpec,
Safety,
Scoring,
ScoringFunctions,
ScoringFunctionsProtocolPrivate,
Shields,
ShieldsProtocolPrivate,
ToolGroups,
ToolGroupsProtocolPrivate,
ToolRuntime,
VectorIO,
VectorStore,
)
from llama_stack_api import (
Providers as ProvidersAPI,
)
logger = get_logger(name=__name__, category="core")

View file

@ -12,8 +12,8 @@ from llama_stack.core.datatypes import (
)
from llama_stack.core.stack import StackRunConfig
from llama_stack.core.store import DistributionRegistry
from llama_stack.providers.datatypes import Api, RoutingTable
from llama_stack.providers.utils.inference.inference_store import InferenceStore
from llama_stack_api import Api, RoutingTable
async def get_routing_table_impl(

View file

@ -6,11 +6,8 @@
from typing import Any
from llama_stack.apis.common.responses import PaginatedResponse
from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import DatasetPurpose, DataSource
from llama_stack.log import get_logger
from llama_stack.providers.datatypes import RoutingTable
from llama_stack_api import DatasetIO, DatasetPurpose, DataSource, PaginatedResponse, RoutingTable
logger = get_logger(name=__name__, category="core::routers")

View file

@ -6,15 +6,18 @@
from typing import Any
from llama_stack.apis.eval import BenchmarkConfig, Eval, EvaluateResponse, Job
from llama_stack.apis.scoring import (
from llama_stack.log import get_logger
from llama_stack_api import (
BenchmarkConfig,
Eval,
EvaluateResponse,
Job,
RoutingTable,
ScoreBatchResponse,
ScoreResponse,
Scoring,
ScoringFnParams,
)
from llama_stack.log import get_logger
from llama_stack.providers.datatypes import RoutingTable
logger = get_logger(name=__name__, category="core::routers")

View file

@ -15,13 +15,25 @@ from openai.types.chat import ChatCompletionToolChoiceOptionParam as OpenAIChatC
from openai.types.chat import ChatCompletionToolParam as OpenAIChatCompletionToolParam
from pydantic import TypeAdapter
from llama_stack.apis.common.errors import ModelNotFoundError, ModelTypeError
from llama_stack.apis.inference import (
from llama_stack.core.telemetry.telemetry import MetricEvent
from llama_stack.core.telemetry.tracing import enqueue_event, get_current_span
from llama_stack.log import get_logger
from llama_stack.models.llama.llama3.chat_format import ChatFormat
from llama_stack.models.llama.llama3.tokenizer import Tokenizer
from llama_stack.providers.utils.inference.inference_store import InferenceStore
from llama_stack_api import (
HealthResponse,
HealthStatus,
Inference,
ListOpenAIChatCompletionResponse,
ModelNotFoundError,
ModelType,
ModelTypeError,
OpenAIAssistantMessageParam,
OpenAIChatCompletion,
OpenAIChatCompletionChunk,
OpenAIChatCompletionContentPartImageParam,
OpenAIChatCompletionContentPartTextParam,
OpenAIChatCompletionRequestWithExtraBody,
OpenAIChatCompletionToolCall,
OpenAIChatCompletionToolCallFunction,
@ -35,19 +47,8 @@ from llama_stack.apis.inference import (
OpenAIMessageParam,
Order,
RerankResponse,
RoutingTable,
)
from llama_stack.apis.inference.inference import (
OpenAIChatCompletionContentPartImageParam,
OpenAIChatCompletionContentPartTextParam,
)
from llama_stack.apis.models import ModelType
from llama_stack.core.telemetry.telemetry import MetricEvent
from llama_stack.core.telemetry.tracing import enqueue_event, get_current_span
from llama_stack.log import get_logger
from llama_stack.models.llama.llama3.chat_format import ChatFormat
from llama_stack.models.llama.llama3.tokenizer import Tokenizer
from llama_stack.providers.datatypes import HealthResponse, HealthStatus, RoutingTable
from llama_stack.providers.utils.inference.inference_store import InferenceStore
logger = get_logger(name=__name__, category="core::routers")
@ -416,7 +417,7 @@ class InferenceRouter(Inference):
prompt_tokens=chunk.usage.prompt_tokens,
completion_tokens=chunk.usage.completion_tokens,
total_tokens=chunk.usage.total_tokens,
model_id=fully_qualified_model_id,
fully_qualified_model_id=fully_qualified_model_id,
provider_id=provider_id,
)
for metric in metrics:
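
Besides the import move, this hunk renames the keyword used when building per-request usage metrics in InferenceRouter: the value previously passed as model_id is now passed as fully_qualified_model_id. A hedged sketch of the call-site shape after the rename; the helper below and its return type are assumptions for illustration, not the actual implementation, which builds MetricEvent objects and enqueues them for telemetry:

# Hypothetical helper showing the renamed keyword only.
def build_usage_metrics(chunk_usage, fully_qualified_model_id: str, provider_id: str) -> list[dict]:
    return [
        {
            "metric": name,
            "value": value,
            # The keyword is now fully_qualified_model_id rather than model_id.
            "fully_qualified_model_id": fully_qualified_model_id,
            "provider_id": provider_id,
        }
        for name, value in [
            ("prompt_tokens", chunk_usage.prompt_tokens),
            ("completion_tokens", chunk_usage.completion_tokens),
            ("total_tokens", chunk_usage.total_tokens),
        ]
    ]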

View file

@ -6,13 +6,9 @@
from typing import Any
from llama_stack.apis.inference import OpenAIMessageParam
from llama_stack.apis.safety import RunShieldResponse, Safety
from llama_stack.apis.safety.safety import ModerationObject
from llama_stack.apis.shields import Shield
from llama_stack.core.datatypes import SafetyConfig
from llama_stack.log import get_logger
from llama_stack.providers.datatypes import RoutingTable
from llama_stack_api import ModerationObject, OpenAIMessageParam, RoutingTable, RunShieldResponse, Safety, Shield
logger = get_logger(name=__name__, category="core::routers")

View file

@ -6,14 +6,12 @@
from typing import Any
from llama_stack.apis.common.content_types import (
from llama_stack.log import get_logger
from llama_stack_api import (
URL,
)
from llama_stack.apis.tools import (
ListToolDefsResponse,
ToolRuntime,
)
from llama_stack.log import get_logger
from ..routing_tables.toolgroups import ToolGroupsRoutingTable

View file

@ -10,13 +10,20 @@ from typing import Annotated, Any
from fastapi import Body
from llama_stack.apis.common.content_types import InterleavedContent
from llama_stack.apis.models import ModelType
from llama_stack.apis.vector_io import (
from llama_stack.core.datatypes import VectorStoresConfig
from llama_stack.log import get_logger
from llama_stack_api import (
Chunk,
HealthResponse,
HealthStatus,
InterleavedContent,
ModelNotFoundError,
ModelType,
ModelTypeError,
OpenAICreateVectorStoreFileBatchRequestWithExtraBody,
OpenAICreateVectorStoreRequestWithExtraBody,
QueryChunksResponse,
RoutingTable,
SearchRankingOptions,
VectorIO,
VectorStoreChunkingStrategy,
@ -33,9 +40,6 @@ from llama_stack.apis.vector_io import (
VectorStoreObject,
VectorStoreSearchResponsePage,
)
from llama_stack.core.datatypes import VectorStoresConfig
from llama_stack.log import get_logger
from llama_stack.providers.datatypes import HealthResponse, HealthStatus, RoutingTable
logger = get_logger(name=__name__, category="core::routers")
@ -122,6 +126,14 @@ class VectorIORouter(VectorIO):
if embedding_model is not None and embedding_dimension is None:
embedding_dimension = await self._get_embedding_model_dimension(embedding_model)
# Validate that embedding model exists and is of the correct type
if embedding_model is not None:
model = await self.routing_table.get_object_by_identifier("model", embedding_model)
if model is None:
raise ModelNotFoundError(embedding_model)
if model.model_type != ModelType.embedding:
raise ModelTypeError(embedding_model, model.model_type, ModelType.embedding)
# Auto-select provider if not specified
if provider_id is None:
num_providers = len(self.routing_table.impls_by_provider_id)
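
The VectorIORouter hunk above also adds a validation step before a vector store is created: when an embedding model is named, it must be registered and must actually be an embedding model. A standalone sketch of that check, assuming a routing table exposing get_object_by_identifier as shown in the hunk; the wrapper function name and signature are illustrative:

from llama_stack_api import ModelNotFoundError, ModelType, ModelTypeError


async def validate_embedding_model(routing_table, embedding_model: str | None) -> None:
    # Skip validation when no embedding model was requested.
    if embedding_model is None:
        return
    model = await routing_table.get_object_by_identifier("model", embedding_model)
    if model is None:
        raise ModelNotFoundError(embedding_model)
    if model.model_type != ModelType.embedding:
        raise ModelTypeError(embedding_model, model.model_type, ModelType.embedding)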

View file

@ -6,11 +6,11 @@
from typing import Any
from llama_stack.apis.benchmarks import Benchmark, Benchmarks, ListBenchmarksResponse
from llama_stack.core.datatypes import (
BenchmarkWithOwner,
)
from llama_stack.log import get_logger
from llama_stack_api import Benchmark, Benchmarks, ListBenchmarksResponse
from .common import CommonRoutingTableImpl

View file

@ -6,9 +6,6 @@
from typing import Any
from llama_stack.apis.common.errors import ModelNotFoundError
from llama_stack.apis.models import Model
from llama_stack.apis.resource import ResourceType
from llama_stack.core.access_control.access_control import AccessDeniedError, is_action_allowed
from llama_stack.core.access_control.datatypes import Action
from llama_stack.core.datatypes import (
@ -21,7 +18,7 @@ from llama_stack.core.datatypes import (
from llama_stack.core.request_headers import get_authenticated_user
from llama_stack.core.store import DistributionRegistry
from llama_stack.log import get_logger
from llama_stack.providers.datatypes import Api, RoutingTable
from llama_stack_api import Api, Model, ModelNotFoundError, ResourceType, RoutingTable
logger = get_logger(name=__name__, category="core::routing_tables")

View file

@ -7,22 +7,22 @@
import uuid
from typing import Any
from llama_stack.apis.common.errors import DatasetNotFoundError
from llama_stack.apis.datasets import (
from llama_stack.core.datatypes import (
DatasetWithOwner,
)
from llama_stack.log import get_logger
from llama_stack_api import (
Dataset,
DatasetNotFoundError,
DatasetPurpose,
Datasets,
DatasetType,
DataSource,
ListDatasetsResponse,
ResourceType,
RowsDataSource,
URIDataSource,
)
from llama_stack.apis.resource import ResourceType
from llama_stack.core.datatypes import (
DatasetWithOwner,
)
from llama_stack.log import get_logger
from .common import CommonRoutingTableImpl

View file

@ -7,8 +7,6 @@
import time
from typing import Any
from llama_stack.apis.common.errors import ModelNotFoundError
from llama_stack.apis.models import ListModelsResponse, Model, Models, ModelType, OpenAIListModelsResponse, OpenAIModel
from llama_stack.core.datatypes import (
ModelWithOwner,
RegistryEntrySource,
@ -16,6 +14,15 @@ from llama_stack.core.datatypes import (
from llama_stack.core.request_headers import PROVIDER_DATA_VAR, NeedsRequestProviderData
from llama_stack.core.utils.dynamic import instantiate_class_type
from llama_stack.log import get_logger
from llama_stack_api import (
ListModelsResponse,
Model,
ModelNotFoundError,
Models,
ModelType,
OpenAIListModelsResponse,
OpenAIModel,
)
from .common import CommonRoutingTableImpl, lookup_model

View file

@ -4,18 +4,18 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.apis.common.type_system import ParamType
from llama_stack.apis.resource import ResourceType
from llama_stack.apis.scoring_functions import (
ListScoringFunctionsResponse,
ScoringFn,
ScoringFnParams,
ScoringFunctions,
)
from llama_stack.core.datatypes import (
ScoringFnWithOwner,
)
from llama_stack.log import get_logger
from llama_stack_api import (
ListScoringFunctionsResponse,
ParamType,
ResourceType,
ScoringFn,
ScoringFnParams,
ScoringFunctions,
)
from .common import CommonRoutingTableImpl

View file

@ -6,12 +6,11 @@
from typing import Any
from llama_stack.apis.resource import ResourceType
from llama_stack.apis.shields import ListShieldsResponse, Shield, Shields
from llama_stack.core.datatypes import (
ShieldWithOwner,
)
from llama_stack.log import get_logger
from llama_stack_api import ListShieldsResponse, ResourceType, Shield, Shields
from .common import CommonRoutingTableImpl

View file

@ -6,11 +6,17 @@
from typing import Any
from llama_stack.apis.common.content_types import URL
from llama_stack.apis.common.errors import ToolGroupNotFoundError
from llama_stack.apis.tools import ListToolDefsResponse, ListToolGroupsResponse, ToolDef, ToolGroup, ToolGroups
from llama_stack.core.datatypes import AuthenticationRequiredError, ToolGroupWithOwner
from llama_stack.log import get_logger
from llama_stack_api import (
URL,
ListToolDefsResponse,
ListToolGroupsResponse,
ToolDef,
ToolGroup,
ToolGroupNotFoundError,
ToolGroups,
)
from .common import CommonRoutingTableImpl

View file

@ -6,12 +6,17 @@
from typing import Any
from llama_stack.apis.common.errors import ModelNotFoundError, ModelTypeError
from llama_stack.apis.models import ModelType
from llama_stack.apis.resource import ResourceType
from llama_stack.core.datatypes import (
VectorStoreWithOwner,
)
from llama_stack.log import get_logger
# Removed VectorStores import to avoid exposing public API
from llama_stack.apis.vector_io.vector_io import (
from llama_stack_api import (
ModelNotFoundError,
ModelType,
ModelTypeError,
ResourceType,
SearchRankingOptions,
VectorStoreChunkingStrategy,
VectorStoreDeleteResponse,
@ -22,10 +27,6 @@ from llama_stack.apis.vector_io.vector_io import (
VectorStoreObject,
VectorStoreSearchResponsePage,
)
from llama_stack.core.datatypes import (
VectorStoreWithOwner,
)
from llama_stack.log import get_logger
from .common import CommonRoutingTableImpl, lookup_model

View file

@ -13,7 +13,6 @@ import httpx
import jwt
from pydantic import BaseModel, Field
from llama_stack.apis.common.errors import TokenValidationError
from llama_stack.core.datatypes import (
AuthenticationConfig,
CustomAuthConfig,
@ -23,6 +22,7 @@ from llama_stack.core.datatypes import (
User,
)
from llama_stack.log import get_logger
from llama_stack_api import TokenValidationError
logger = get_logger(name=__name__, category="core::auth")

View file

@ -12,9 +12,8 @@ from typing import Any
from aiohttp import hdrs
from starlette.routing import Route
from llama_stack.apis.datatypes import Api, ExternalApiSpec
from llama_stack.core.resolver import api_protocol_map
from llama_stack.schema_utils import WebMethod
from llama_stack_api import Api, ExternalApiSpec, WebMethod
EndpointFunc = Callable[..., Any]
PathParams = dict[str, str]

View file

@ -31,8 +31,6 @@ from fastapi.responses import JSONResponse, StreamingResponse
from openai import BadRequestError
from pydantic import BaseModel, ValidationError
from llama_stack.apis.common.errors import ConflictError, ResourceNotFoundError
from llama_stack.apis.common.responses import PaginatedResponse
from llama_stack.core.access_control.access_control import AccessDeniedError
from llama_stack.core.datatypes import (
AuthenticationRequiredError,
@ -58,7 +56,7 @@ from llama_stack.core.utils.config import redact_sensitive_fields
from llama_stack.core.utils.config_resolution import Mode, resolve_config_or_distro
from llama_stack.core.utils.context import preserve_contexts_async_generator
from llama_stack.log import LoggingConfig, get_logger, setup_logging
from llama_stack.providers.datatypes import Api
from llama_stack_api import Api, ConflictError, PaginatedResponse, ResourceNotFoundError
from .auth import AuthenticationMiddleware
from .quota import QuotaMiddleware
@ -526,8 +524,8 @@ def extract_path_params(route: str) -> list[str]:
def remove_disabled_providers(obj):
if isinstance(obj, dict):
keys = ["provider_id", "shield_id", "provider_model_id", "model_id"]
if any(k in obj and obj[k] in ("__disabled__", "", None) for k in keys):
# Filter out items where provider_id is explicitly disabled or empty
if "provider_id" in obj and obj["provider_id"] in ("__disabled__", "", None):
return None
return {k: v for k, v in ((k, remove_disabled_providers(v)) for k, v in obj.items()) if v is not None}
elif isinstance(obj, list):
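
The server hunk above narrows remove_disabled_providers: instead of dropping a dict when any of several keys (provider_id, shield_id, provider_model_id, model_id) is disabled or empty, it now drops an entry only when provider_id itself is disabled or empty. A hedged reconstruction of the full recursive filter; the list branch body and the final fallthrough return are assumptions, since only part of the function appears in the diff:

def remove_disabled_providers(obj):
    if isinstance(obj, dict):
        # Only provider_id triggers removal now; shield_id / model_id no longer do.
        if "provider_id" in obj and obj["provider_id"] in ("__disabled__", "", None):
            return None
        return {k: v for k, v in ((k, remove_disabled_providers(v)) for k, v in obj.items()) if v is not None}
    elif isinstance(obj, list):
        # Assumed completion: recurse into lists and drop removed entries.
        return [item for item in (remove_disabled_providers(v) for v in obj) if item is not None]
    return obj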

View file

@ -13,26 +13,6 @@ from typing import Any
import yaml
from llama_stack.apis.agents import Agents
from llama_stack.apis.batches import Batches
from llama_stack.apis.benchmarks import Benchmarks
from llama_stack.apis.conversations import Conversations
from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Datasets
from llama_stack.apis.eval import Eval
from llama_stack.apis.files import Files
from llama_stack.apis.inference import Inference
from llama_stack.apis.inspect import Inspect
from llama_stack.apis.models import Models
from llama_stack.apis.post_training import PostTraining
from llama_stack.apis.prompts import Prompts
from llama_stack.apis.providers import Providers
from llama_stack.apis.safety import Safety
from llama_stack.apis.scoring import Scoring
from llama_stack.apis.scoring_functions import ScoringFunctions
from llama_stack.apis.shields import Shields
from llama_stack.apis.tools import ToolGroups, ToolRuntime
from llama_stack.apis.vector_io import VectorIO
from llama_stack.core.conversations.conversations import ConversationServiceConfig, ConversationServiceImpl
from llama_stack.core.datatypes import Provider, SafetyConfig, StackRunConfig, VectorStoresConfig
from llama_stack.core.distribution import get_provider_registry
@ -54,7 +34,30 @@ from llama_stack.core.storage.datatypes import (
from llama_stack.core.store.registry import create_dist_registry
from llama_stack.core.utils.dynamic import instantiate_class_type
from llama_stack.log import get_logger
from llama_stack.providers.datatypes import Api
from llama_stack_api import (
Agents,
Api,
Batches,
Benchmarks,
Conversations,
DatasetIO,
Datasets,
Eval,
Files,
Inference,
Inspect,
Models,
PostTraining,
Prompts,
Providers,
Safety,
Scoring,
ScoringFunctions,
Shields,
ToolGroups,
ToolRuntime,
VectorIO,
)
logger = get_logger(name=__name__, category="core")

View file

@ -28,7 +28,7 @@ from pydantic import BaseModel, Field
from llama_stack.log import get_logger
from llama_stack.models.llama.datatypes import Primitive
from llama_stack.schema_utils import json_schema_type, register_schema
from llama_stack_api import json_schema_type, register_schema
ROOT_SPAN_MARKERS = ["__root__", "__root_span__"]

View file

@ -4,7 +4,6 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.apis.models import ModelType
from llama_stack.core.datatypes import (
BuildProvider,
ModelInput,
@ -17,6 +16,7 @@ from llama_stack.providers.inline.inference.sentence_transformers import (
SentenceTransformersInferenceConfig,
)
from llama_stack.providers.remote.vector_io.chroma import ChromaVectorIOConfig
from llama_stack_api import ModelType
def get_distribution_template() -> DistributionTemplate:

View file

@ -6,7 +6,6 @@
from pathlib import Path
from llama_stack.apis.models import ModelType
from llama_stack.core.datatypes import (
BuildProvider,
ModelInput,
@ -22,6 +21,7 @@ from llama_stack.providers.inline.inference.sentence_transformers import (
SentenceTransformersInferenceConfig,
)
from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig
from llama_stack_api import ModelType
def get_distribution_template() -> DistributionTemplate:

View file

@ -5,8 +5,6 @@
# the root directory of this source tree.
from llama_stack.apis.datasets import DatasetPurpose, URIDataSource
from llama_stack.apis.models import ModelType
from llama_stack.core.datatypes import (
BenchmarkInput,
BuildProvider,
@ -34,6 +32,7 @@ from llama_stack.providers.remote.vector_io.pgvector.config import (
PGVectorVectorIOConfig,
)
from llama_stack.providers.utils.inference.model_registry import ProviderModelEntry
from llama_stack_api import DatasetPurpose, ModelType, URIDataSource
def get_inference_providers() -> tuple[list[Provider], dict[str, list[ProviderModelEntry]]]:

View file

@ -19,7 +19,6 @@ from llama_stack.core.datatypes import (
)
from llama_stack.core.utils.dynamic import instantiate_class_type
from llama_stack.distributions.template import DistributionTemplate, RunConfigSettings
from llama_stack.providers.datatypes import RemoteProviderSpec
from llama_stack.providers.inline.files.localfs.config import LocalfsFilesImplConfig
from llama_stack.providers.inline.inference.sentence_transformers import (
SentenceTransformersInferenceConfig,
@ -38,6 +37,7 @@ from llama_stack.providers.remote.vector_io.qdrant.config import QdrantVectorIOC
from llama_stack.providers.remote.vector_io.weaviate.config import WeaviateVectorIOConfig
from llama_stack.providers.utils.kvstore.config import PostgresKVStoreConfig
from llama_stack.providers.utils.sqlstore.sqlstore import PostgresSqlStoreConfig
from llama_stack_api import RemoteProviderSpec
def _get_config_for_provider(provider_spec: ProviderSpec) -> dict[str, Any]:

View file

@ -12,8 +12,6 @@ import rich
import yaml
from pydantic import BaseModel, Field
from llama_stack.apis.datasets import DatasetPurpose
from llama_stack.apis.models import ModelType
from llama_stack.core.datatypes import (
LLAMA_STACK_RUN_CONFIG_VERSION,
Api,
@ -44,6 +42,7 @@ from llama_stack.providers.utils.kvstore.config import SqliteKVStoreConfig
from llama_stack.providers.utils.kvstore.config import get_pip_packages as get_kv_pip_packages
from llama_stack.providers.utils.sqlstore.sqlstore import SqliteSqlStoreConfig
from llama_stack.providers.utils.sqlstore.sqlstore import get_pip_packages as get_sql_pip_packages
from llama_stack_api import DatasetPurpose, ModelType
def filter_empty_values(obj: Any) -> Any:

View file

@ -5,29 +5,29 @@
# the root directory of this source tree.
from llama_stack.apis.agents import (
from llama_stack.core.datatypes import AccessRule
from llama_stack.log import get_logger
from llama_stack.providers.utils.kvstore import InmemoryKVStoreImpl, kvstore_impl
from llama_stack.providers.utils.responses.responses_store import ResponsesStore
from llama_stack_api import (
Agents,
Conversations,
Inference,
ListOpenAIResponseInputItem,
ListOpenAIResponseObject,
OpenAIDeleteResponseObject,
OpenAIResponseInput,
OpenAIResponseInputTool,
OpenAIResponseObject,
OpenAIResponsePrompt,
OpenAIResponseText,
Order,
ResponseGuardrail,
Safety,
ToolGroups,
ToolRuntime,
VectorIO,
)
from llama_stack.apis.agents.agents import ResponseGuardrail
from llama_stack.apis.agents.openai_responses import OpenAIResponsePrompt, OpenAIResponseText
from llama_stack.apis.conversations import Conversations
from llama_stack.apis.inference import (
Inference,
)
from llama_stack.apis.safety import Safety
from llama_stack.apis.tools import ToolGroups, ToolRuntime
from llama_stack.apis.vector_io import VectorIO
from llama_stack.core.datatypes import AccessRule
from llama_stack.log import get_logger
from llama_stack.providers.utils.kvstore import InmemoryKVStoreImpl, kvstore_impl
from llama_stack.providers.utils.responses.responses_store import ResponsesStore
from .config import MetaReferenceAgentsImplConfig
from .responses.openai_responses import OpenAIResponsesImpl

View file

@ -10,12 +10,20 @@ from collections.abc import AsyncIterator
from pydantic import BaseModel, TypeAdapter
from llama_stack.apis.agents import Order
from llama_stack.apis.agents.agents import ResponseGuardrailSpec
from llama_stack.apis.agents.openai_responses import (
from llama_stack.log import get_logger
from llama_stack.providers.utils.responses.responses_store import (
ResponsesStore,
_OpenAIResponseObjectWithInputAndMessages,
)
from llama_stack_api import (
ConversationItem,
Conversations,
Inference,
InvalidConversationIdError,
ListOpenAIResponseInputItem,
ListOpenAIResponseObject,
OpenAIDeleteResponseObject,
OpenAIMessageParam,
OpenAIResponseInput,
OpenAIResponseInputMessageContentText,
OpenAIResponseInputTool,
@ -25,24 +33,13 @@ from llama_stack.apis.agents.openai_responses import (
OpenAIResponsePrompt,
OpenAIResponseText,
OpenAIResponseTextFormat,
)
from llama_stack.apis.common.errors import (
InvalidConversationIdError,
)
from llama_stack.apis.conversations import Conversations
from llama_stack.apis.conversations.conversations import ConversationItem
from llama_stack.apis.inference import (
Inference,
OpenAIMessageParam,
OpenAISystemMessageParam,
)
from llama_stack.apis.safety import Safety
from llama_stack.apis.tools import ToolGroups, ToolRuntime
from llama_stack.apis.vector_io import VectorIO
from llama_stack.log import get_logger
from llama_stack.providers.utils.responses.responses_store import (
ResponsesStore,
_OpenAIResponseObjectWithInputAndMessages,
Order,
ResponseGuardrailSpec,
Safety,
ToolGroups,
ToolRuntime,
VectorIO,
)
from .streaming import StreamingResponseOrchestrator

View file

@ -8,10 +8,21 @@ import uuid
from collections.abc import AsyncIterator
from typing import Any
from llama_stack.apis.agents.openai_responses import (
from llama_stack.core.telemetry import tracing
from llama_stack.log import get_logger
from llama_stack.providers.utils.inference.prompt_adapter import interleaved_content_as_str
from llama_stack_api import (
AllowedToolsFilter,
ApprovalFilter,
Inference,
MCPListToolsTool,
OpenAIAssistantMessageParam,
OpenAIChatCompletion,
OpenAIChatCompletionChunk,
OpenAIChatCompletionRequestWithExtraBody,
OpenAIChatCompletionToolCall,
OpenAIChoice,
OpenAIMessageParam,
OpenAIResponseContentPartOutputText,
OpenAIResponseContentPartReasoningText,
OpenAIResponseContentPartRefusal,
@ -56,19 +67,6 @@ from llama_stack.apis.agents.openai_responses import (
OpenAIResponseUsageOutputTokensDetails,
WebSearchToolTypes,
)
from llama_stack.apis.inference import (
Inference,
OpenAIAssistantMessageParam,
OpenAIChatCompletion,
OpenAIChatCompletionChunk,
OpenAIChatCompletionRequestWithExtraBody,
OpenAIChatCompletionToolCall,
OpenAIChoice,
OpenAIMessageParam,
)
from llama_stack.core.telemetry import tracing
from llama_stack.log import get_logger
from llama_stack.providers.utils.inference.prompt_adapter import interleaved_content_as_str
from .types import ChatCompletionContext, ChatCompletionResult
from .utils import (
@ -1025,9 +1023,9 @@ class StreamingResponseOrchestrator:
"""Process all tools and emit appropriate streaming events."""
from openai.types.chat import ChatCompletionToolParam
from llama_stack.apis.tools import ToolDef
from llama_stack.models.llama.datatypes import ToolDefinition
from llama_stack.providers.utils.inference.openai_compat import convert_tooldef_to_openai_tool
from llama_stack_api import ToolDef
def make_openai_tool(tool_name: str, tool: ToolDef) -> ChatCompletionToolParam:
tool_def = ToolDefinition(

View file

@ -9,7 +9,14 @@ import json
from collections.abc import AsyncIterator
from typing import Any
from llama_stack.apis.agents.openai_responses import (
from llama_stack.core.telemetry import tracing
from llama_stack.log import get_logger
from llama_stack_api import (
ImageContentItem,
OpenAIChatCompletionContentPartImageParam,
OpenAIChatCompletionContentPartTextParam,
OpenAIChatCompletionToolCall,
OpenAIImageURL,
OpenAIResponseInputToolFileSearch,
OpenAIResponseInputToolMCP,
OpenAIResponseObjectStreamResponseFileSearchCallCompleted,
@ -23,24 +30,14 @@ from llama_stack.apis.agents.openai_responses import (
OpenAIResponseObjectStreamResponseWebSearchCallSearching,
OpenAIResponseOutputMessageFileSearchToolCall,
OpenAIResponseOutputMessageFileSearchToolCallResults,
OpenAIResponseOutputMessageMCPCall,
OpenAIResponseOutputMessageWebSearchToolCall,
)
from llama_stack.apis.common.content_types import (
ImageContentItem,
TextContentItem,
)
from llama_stack.apis.inference import (
OpenAIChatCompletionContentPartImageParam,
OpenAIChatCompletionContentPartTextParam,
OpenAIChatCompletionToolCall,
OpenAIImageURL,
OpenAIToolMessageParam,
TextContentItem,
ToolGroups,
ToolInvocationResult,
ToolRuntime,
VectorIO,
)
from llama_stack.apis.tools import ToolGroups, ToolInvocationResult, ToolRuntime
from llama_stack.apis.vector_io import VectorIO
from llama_stack.core.telemetry import tracing
from llama_stack.log import get_logger
from .types import ChatCompletionContext, ToolExecutionResult
@ -398,6 +395,10 @@ class ToolExecutor:
# Build output message
message: Any
if mcp_tool_to_server and function.name in mcp_tool_to_server:
from llama_stack_api import (
OpenAIResponseOutputMessageMCPCall,
)
message = OpenAIResponseOutputMessageMCPCall(
id=item_id,
arguments=function.arguments,

View file

@ -10,7 +10,10 @@ from typing import cast
from openai.types.chat import ChatCompletionToolParam
from pydantic import BaseModel
from llama_stack.apis.agents.openai_responses import (
from llama_stack_api import (
OpenAIChatCompletionToolCall,
OpenAIMessageParam,
OpenAIResponseFormatParam,
OpenAIResponseInput,
OpenAIResponseInputTool,
OpenAIResponseInputToolFileSearch,
@ -26,7 +29,6 @@ from llama_stack.apis.agents.openai_responses import (
OpenAIResponseTool,
OpenAIResponseToolMCP,
)
from llama_stack.apis.inference import OpenAIChatCompletionToolCall, OpenAIMessageParam, OpenAIResponseFormatParam
class ToolExecutionResult(BaseModel):

View file

@ -9,9 +9,23 @@ import re
import uuid
from collections.abc import Sequence
from llama_stack.apis.agents.agents import ResponseGuardrailSpec
from llama_stack.apis.agents.openai_responses import (
from llama_stack_api import (
OpenAIAssistantMessageParam,
OpenAIChatCompletionContentPartImageParam,
OpenAIChatCompletionContentPartParam,
OpenAIChatCompletionContentPartTextParam,
OpenAIChatCompletionToolCall,
OpenAIChatCompletionToolCallFunction,
OpenAIChoice,
OpenAIDeveloperMessageParam,
OpenAIImageURL,
OpenAIJSONSchema,
OpenAIMessageParam,
OpenAIResponseAnnotationFileCitation,
OpenAIResponseFormatJSONObject,
OpenAIResponseFormatJSONSchema,
OpenAIResponseFormatParam,
OpenAIResponseFormatText,
OpenAIResponseInput,
OpenAIResponseInputFunctionToolCallOutput,
OpenAIResponseInputMessageContent,
@ -27,28 +41,12 @@ from llama_stack.apis.agents.openai_responses import (
OpenAIResponseOutputMessageMCPCall,
OpenAIResponseOutputMessageMCPListTools,
OpenAIResponseText,
)
from llama_stack.apis.inference import (
OpenAIAssistantMessageParam,
OpenAIChatCompletionContentPartImageParam,
OpenAIChatCompletionContentPartParam,
OpenAIChatCompletionContentPartTextParam,
OpenAIChatCompletionToolCall,
OpenAIChatCompletionToolCallFunction,
OpenAIChoice,
OpenAIDeveloperMessageParam,
OpenAIImageURL,
OpenAIJSONSchema,
OpenAIMessageParam,
OpenAIResponseFormatJSONObject,
OpenAIResponseFormatJSONSchema,
OpenAIResponseFormatParam,
OpenAIResponseFormatText,
OpenAISystemMessageParam,
OpenAIToolMessageParam,
OpenAIUserMessageParam,
ResponseGuardrailSpec,
Safety,
)
from llama_stack.apis.safety import Safety
async def convert_chat_choice_to_response_message(

View file

@ -6,10 +6,9 @@
import asyncio
from llama_stack.apis.inference import OpenAIMessageParam
from llama_stack.apis.safety import Safety, SafetyViolation, ViolationLevel
from llama_stack.core.telemetry import tracing
from llama_stack.log import get_logger
from llama_stack_api import OpenAIMessageParam, Safety, SafetyViolation, ViolationLevel
log = get_logger(name=__name__, category="agents::meta_reference")

View file

@ -6,11 +6,9 @@
from typing import Any
from llama_stack.apis.files import Files
from llama_stack.apis.inference import Inference
from llama_stack.apis.models import Models
from llama_stack.core.datatypes import AccessRule, Api
from llama_stack.providers.utils.kvstore import kvstore_impl
from llama_stack_api import Files, Inference, Models
from .batches import ReferenceBatchesImpl
from .config import ReferenceBatchesImplConfig

Some files were not shown because too many files have changed in this diff Show more