feat: add base64 encoded PDF support for OpenAI Chat Completions (#2881)

# What does this PR do?

OpenAI Chat Completions supports passing a base64 encoded PDF file to a model, but Llama Stack currently does not allow for this behavior. This PR extends our implementation of the OpenAI API spec to change that.

Closes #2129

## Test Plan

A new functional test has been added to test the validity of such a request.

Signed-off-by: Nathan Weinberg <nweinber@redhat.com>
2025-12-03 09:53:45 +00:00 · 2025-07-29 06:23:41 -04:00 · 2025-07-29 06:23:41 -04:00 · 870a37ff4b
commit 870a37ff4b
parent cf8722079c
6 changed files with 1514 additions and 1200 deletions
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@ -9821,13 +9821,17 @@
                    },
                    {
                        "$ref": "#/components/schemas/OpenAIChatCompletionContentPartImageParam"
                    },
                    {
                        "$ref": "#/components/schemas/OpenAIFile"
                    }
                ],
                "discriminator": {
                    "propertyName": "type",
                    "mapping": {
                        "text": "#/components/schemas/OpenAIChatCompletionContentPartTextParam",
-                        "image_url": "#/components/schemas/OpenAIChatCompletionContentPartImageParam"
+                        "image_url": "#/components/schemas/OpenAIChatCompletionContentPartImageParam",
                        "file": "#/components/schemas/OpenAIFile"
                    }
                }
            },
@ -9974,6 +9978,41 @@
                "title": "OpenAIDeveloperMessageParam",
                "description": "A message from the developer in an OpenAI-compatible chat completion request."
            },
            "OpenAIFile": {
                "type": "object",
                "properties": {
                    "type": {
                        "type": "string",
                        "const": "file",
                        "default": "file"
                    },
                    "file": {
                        "$ref": "#/components/schemas/OpenAIFileFile"
                    }
                },
                "additionalProperties": false,
                "required": [
                    "type",
                    "file"
                ],
                "title": "OpenAIFile"
            },
            "OpenAIFileFile": {
                "type": "object",
                "properties": {
                    "file_data": {
                        "type": "string"
                    },
                    "file_id": {
                        "type": "string"
                    },
                    "filename": {
                        "type": "string"
                    }
                },
                "additionalProperties": false,
                "title": "OpenAIFileFile"
            },
            "OpenAIImageURL": {
                "type": "object",
                "properties": {
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@ -6934,11 +6934,13 @@ components:
      oneOf:
        - $ref: '#/components/schemas/OpenAIChatCompletionContentPartTextParam'
        - $ref: '#/components/schemas/OpenAIChatCompletionContentPartImageParam'
        - $ref: '#/components/schemas/OpenAIFile'
      discriminator:
        propertyName: type
        mapping:
          text: '#/components/schemas/OpenAIChatCompletionContentPartTextParam'
          image_url: '#/components/schemas/OpenAIChatCompletionContentPartImageParam'
          file: '#/components/schemas/OpenAIFile'
    OpenAIChatCompletionContentPartTextParam:
      type: object
      properties:
@ -7050,6 +7052,31 @@ components:
      title: OpenAIDeveloperMessageParam
      description: >-
        A message from the developer in an OpenAI-compatible chat completion request.
    OpenAIFile:
      type: object
      properties:
        type:
          type: string
          const: file
          default: file
        file:
          $ref: '#/components/schemas/OpenAIFileFile'
      additionalProperties: false
      required:
        - type
        - file
      title: OpenAIFile
    OpenAIFileFile:
      type: object
      properties:
        file_data:
          type: string
        file_id:
          type: string
        filename:
          type: string
      additionalProperties: false
      title: OpenAIFileFile
    OpenAIImageURL:
      type: object
      properties:
--- a/llama_stack/apis/inference/inference.py
+++ b/llama_stack/apis/inference/inference.py
@ -455,8 +455,21 @@ class OpenAIChatCompletionContentPartImageParam(BaseModel):
    image_url: OpenAIImageURL
# Payload of a "file" content part in an OpenAI-compatible chat completion request.
# Every field is optional and defaults to None.
@json_schema_type
 class OpenAIFileFile(BaseModel):
    # File content as a string; the integration test sends a data URI of the
    # form "data:application/pdf;base64,<base64 payload>".
    file_data: str | None = None
    # presumably the ID of a previously uploaded file — TODO confirm against the OpenAI spec
    file_id: str | None = None
    # Name to associate with the file.
    filename: str | None = None
# Chat completion content part that carries a file. It is one member of the
# OpenAIChatCompletionContentPartParam union, which discriminates on `type`.
@json_schema_type
 class OpenAIFile(BaseModel):
    # Discriminator value; fixed to "file".
    type: Literal["file"] = "file"
    # Required file payload (no default is provided).
    file: OpenAIFileFile
 OpenAIChatCompletionContentPartParam = Annotated[
-    OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam,
+    OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam | OpenAIFile,
    Field(discriminator="type"),
 ]
 register_schema(OpenAIChatCompletionContentPartParam, name="OpenAIChatCompletionContentPartParam")
--- a/pyproject.toml
+++ b/pyproject.toml
@ -114,6 +114,7 @@ test = [
    "sqlalchemy[asyncio]>=2.0.41",
    "requests",
    "pymilvus>=2.5.12",
    "reportlab",
 ]
 docs = [
    "setuptools",
--- a/tests/integration/inference/test_openai_completion.py
+++ b/tests/integration/inference/test_openai_completion.py
@ -5,8 +5,14 @@
 # the root directory of this source tree.
 import base64
 import os
 import tempfile
 import pytest
 from openai import OpenAI
 from reportlab.lib.pagesizes import letter
 from reportlab.pdfgen import canvas
 from llama_stack.distribution.library_client import LlamaStackAsLibraryClient
@ -82,6 +88,14 @@ def skip_if_provider_isnt_vllm(client_with_models, model_id):
        pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support vllm extra_body parameters.")
 def skip_if_provider_isnt_openai(client_with_models, model_id):
    """Skip the current test unless `model_id` is served by the `remote::openai` provider."""
    provider_type = provider_from_model(client_with_models, model_id).provider_type
    # Guard clause: nothing to do when the model is hosted by the OpenAI provider.
    if provider_type == "remote::openai":
        return
    reason = f"Model {model_id} hosted by {provider_type} doesn't support chat completion calls with base64 encoded files."
    pytest.skip(reason)
@pytest.fixture
 def openai_client(client_with_models):
    base_url = f"{client_with_models.base_url}/v1/openai/v1"
@ -418,3 +432,45 @@ def test_inference_store_tool_calls(compat_client, client_with_models, text_mode
        # failed tool call parses show up as a message with content, so ensure
        # that the retrieve response content matches the original request
        assert retrieved_response.choices[0].message.content == content
 def test_openai_chat_completion_non_streaming_with_file(openai_client, client_with_models, text_model_id):
    """Send a base64 encoded PDF as a "file" content part and check the model reads it.

    A one-page PDF containing "Hello World" is generated, base64 encoded, and
    passed as a data URI in a non-streaming chat completion request. The test
    passes when the reply mentions "hello world" (case-insensitive).
    """
    skip_if_provider_isnt_openai(client_with_models, text_model_id)

    # Generate temporary PDF with "Hello World" text.
    # A TemporaryDirectory is used so that no handle to the PDF is open when it
    # is removed (deleting an open file fails on Windows) and so that cleanup
    # happens even if PDF generation or reading raises.
    with tempfile.TemporaryDirectory() as temp_dir:
        pdf_path = os.path.join(temp_dir, "hello_world.pdf")
        c = canvas.Canvas(pdf_path, pagesize=letter)
        c.drawString(100, 750, "Hello World")
        c.save()

        # Read the PDF and encode to base64
        with open(pdf_path, "rb") as pdf_file:
            pdf_base64 = base64.b64encode(pdf_file.read()).decode("utf-8")

    response = openai_client.chat.completions.create(
        model=text_model_id,
        messages=[
            {
                "role": "user",
                "content": "Describe what you see in this PDF file.",
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "file",
                        "file": {
                            "filename": "my-temp-hello-world-pdf",
                            "file_data": f"data:application/pdf;base64,{pdf_base64}",
                        },
                    }
                ],
            },
        ],
        stream=False,
    )
    message_content = response.choices[0].message.content.lower().strip()
    assert "hello world" in message_content
--- a/uv.lock
+++ b/uv.lock