fix: add check for interleavedContent (#1973)

# What does this PR do?
Checks for RAGDocument of type InterleavedContent

I noticed when stepping through the code that the supported types for
`RAGDocument` included `InterleavedContent` as a content type. This type
is not checked against before putting the `doc.content` is regex matched
against. This would cause a runtime error. This change adds an explicit
check for type.

The only other part that I'm unclear on is how to handle the
`ImageContent` type since this would always just return `<image>` which
seems like an undesired behavior. Should the `InterleavedContent` type
be removed from `RAGDocument` and replaced with `URI | str`?

## Test Plan


[//]: # (## Documentation)

---------

Signed-off-by: Kevin <kpostlet@redhat.com>
This commit is contained in:
Kevin Postlethwait 2025-05-06 12:55:07 -04:00 committed by GitHub
parent 1a529705da
commit a57985eeac
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 170 additions and 17 deletions

View file

@ -118,27 +118,25 @@ async def content_from_doc(doc: RAGDocument) -> str:
if isinstance(doc.content, URL):
if doc.content.uri.startswith("data:"):
return content_from_data(doc.content.uri)
else:
async with httpx.AsyncClient() as client:
r = await client.get(doc.content.uri)
if doc.mime_type == "application/pdf":
return parse_pdf(r.content)
else:
return r.text
pattern = re.compile("^(https?://|file://|data:)")
if pattern.match(doc.content):
if doc.content.startswith("data:"):
return content_from_data(doc.content)
else:
async with httpx.AsyncClient() as client:
r = await client.get(doc.content.uri)
if doc.mime_type == "application/pdf":
return parse_pdf(r.content)
return r.text
elif isinstance(doc.content, str):
pattern = re.compile("^(https?://|file://|data:)")
if pattern.match(doc.content):
if doc.content.startswith("data:"):
return content_from_data(doc.content)
async with httpx.AsyncClient() as client:
r = await client.get(doc.content)
if doc.mime_type == "application/pdf":
return parse_pdf(r.content)
else:
return r.text
return interleaved_content_as_str(doc.content)
return r.text
return doc.content
else:
# will raise ValueError if the content is not List[InterleavedContent] or InterleavedContent
return interleaved_content_as_str(doc.content)
def make_overlapped_chunks(document_id: str, text: str, window_len: int, overlap_len: int) -> list[Chunk]: