fix(vector-io): handle missing document_id in insert_chunks (#3521)

Fixed a KeyError raised when chunks have no document_id in either metadata
or chunk_metadata. Logging now extracts document_id safely using getattr,
and RAG memory now handles the different document_id locations. Added a
test for missing document_id scenarios.

Fixes issue #3494 where /v1/vector-io/insert would crash with KeyError.
Fixed KeyError when chunks don't have document_id in metadata or
chunk_metadata. Updated logging to safely extract document_id using
getattr and RAG memory to handle different document_id locations. Added
test for missing document_id scenarios.

 # What does this PR do?

Fixes a KeyError crash in `/v1/vector-io/insert` when chunks are missing
`document_id` fields. The API
was failing even though `document_id` is optional according to the
schema.

  Closes #3494

  ## Test Plan

  **Before fix:**
  - POST to `/v1/vector-io/insert` with chunks → 500 KeyError
  - The error occurred regardless of where `document_id` was placed

  **After fix:**
  - Same request works fine → 200 OK
  - Tested with Postman using FAISS backend
  - Added unit test covering missing `document_id` scenarios
This commit is contained in:
Sumanth Kamenani 2025-10-15 14:02:48 -04:00 committed by GitHub
parent e9b4278a51
commit bc8b377a7c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 51 additions and 2 deletions

View file

@ -128,6 +128,37 @@ async def test_insert_chunks_missing_db_raises(vector_io_adapter):
await vector_io_adapter.insert_chunks("db_not_exist", [])
async def test_insert_chunks_with_missing_document_id(vector_io_adapter):
    """Inserting chunks must not raise KeyError wherever document_id lives, or when it is absent."""
    from llama_stack.apis.vector_io import Chunk, ChunkMetadata

    # Register a mocked index under "db1"; the test expects the adapter to
    # forward the insert to it.
    index_mock = AsyncMock()
    vector_io_adapter.cache["db1"] = index_mock

    # One chunk per document_id placement that must be tolerated.
    with_metadata_id = Chunk(content="has doc_id in metadata", metadata={"document_id": "doc-1"})
    without_any_id = Chunk(content="no doc_id anywhere", metadata={"source": "test"})
    with_chunk_metadata_id = Chunk(
        content="doc_id in chunk_metadata",
        chunk_metadata=ChunkMetadata(document_id="doc-3"),
    )
    mixed_chunks = [with_metadata_id, without_any_id, with_chunk_metadata_id]

    # The call itself is the main check: it must complete without KeyError.
    await vector_io_adapter.insert_chunks("db1", mixed_chunks)

    index_mock.insert_chunks.assert_awaited_once()
async def test_document_id_with_invalid_type_raises_error():
    """Reading ``document_id`` must raise TypeError when metadata holds a non-string value."""
    from llama_stack.apis.vector_io import Chunk

    # An integer document_id in metadata is the invalid case under test.
    bad_chunk = Chunk(content="test", metadata={"document_id": 12345})

    with pytest.raises(TypeError) as raised:
        _ = bad_chunk.document_id

    # Checked outside the `with` block so the assertions run after the
    # exception has been captured.
    error_text = str(raised.value)
    assert "metadata['document_id'] must be a string" in error_text
    assert "got int" in error_text
async def test_query_chunks_calls_underlying_index_and_returns(vector_io_adapter):
expected = QueryChunksResponse(chunks=[Chunk(content="c1")], scores=[0.1])
fake_index = AsyncMock(query_chunks=AsyncMock(return_value=expected))