fix(vector-io): handle missing document_id in insert_chunks (#3521)

Fixed KeyError when chunks don't have document_id in metadata or
chunk_metadata. Updated logging to safely extract document_id using
getattr and RAG memory to handle different document_id locations. Added
test for missing document_id scenarios.

Fixes issue #3494 where /v1/vector-io/insert would crash with KeyError.
Fixed KeyError when chunks don't have document_id in metadata or
chunk_metadata. Updated logging to safely extract document_id using
getattr and RAG memory to handle different document_id locations. Added
test for missing document_id scenarios.

 # What does this PR do?

Fixes a KeyError crash in `/v1/vector-io/insert` when chunks are missing
`document_id` fields. The API
was failing even though `document_id` is optional according to the
schema.

  Closes #3494

  ## Test Plan

  **Before fix:**
  - POST to `/v1/vector-io/insert` with chunks → 500 KeyError
  - Happened regardless of where `document_id` was placed

  **After fix:**
  - Same request works fine → 200 OK
  - Tested with Postman using FAISS backend
  - Added unit test covering missing `document_id` scenarios
This commit is contained in:
Sumanth Kamenani 2025-10-15 14:02:48 -04:00 committed by GitHub
parent e9b4278a51
commit bc8b377a7c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 51 additions and 2 deletions

View file

@ -93,6 +93,22 @@ class Chunk(BaseModel):
return generate_chunk_id(str(uuid.uuid4()), str(self.content))
@property
def document_id(self) -> str | None:
"""Returns the document_id from either metadata or chunk_metadata, with metadata taking precedence."""
# Check metadata first (takes precedence)
doc_id = self.metadata.get("document_id")
if doc_id is not None:
if not isinstance(doc_id, str):
raise TypeError(f"metadata['document_id'] must be a string, got {type(doc_id).__name__}: {doc_id!r}")
return doc_id
# Fall back to chunk_metadata if available (Pydantic ensures type safety)
if self.chunk_metadata is not None:
return self.chunk_metadata.document_id
return None
@json_schema_type
class QueryChunksResponse(BaseModel):

View file

@ -93,8 +93,10 @@ class VectorIORouter(VectorIO):
chunks: list[Chunk],
ttl_seconds: int | None = None,
) -> None:
doc_ids = [chunk.document_id for chunk in chunks[:3]]
logger.debug(
f"VectorIORouter.insert_chunks: {vector_db_id}, {len(chunks)} chunks, ttl_seconds={ttl_seconds}, chunk_ids={[chunk.metadata['document_id'] for chunk in chunks[:3]]}{' and more...' if len(chunks) > 3 else ''}",
f"VectorIORouter.insert_chunks: {vector_db_id}, {len(chunks)} chunks, "
f"ttl_seconds={ttl_seconds}, chunk_ids={doc_ids}{' and more...' if len(chunks) > 3 else ''}"
)
provider = await self.routing_table.get_provider_impl(vector_db_id)
return await provider.insert_chunks(vector_db_id, chunks, ttl_seconds)

View file

@ -272,7 +272,7 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
return RAGQueryResult(
content=picked,
metadata={
"document_ids": [c.metadata["document_id"] for c in chunks[: len(picked)]],
"document_ids": [c.document_id for c in chunks[: len(picked)]],
"chunks": [c.content for c in chunks[: len(picked)]],
"scores": scores[: len(picked)],
"vector_db_ids": [c.metadata["vector_db_id"] for c in chunks[: len(picked)]],