API Updates (#73)

* API keys are now passed from the client instead of the distro configuration

* delete distribution registry

* Remove the word "package" from names

* Introduce a "Router" layer for providers

Some providers need to be factored out and treated as thin routing
layers on top of other providers. Consider two examples:

- The inference API should be a routing layer over inference providers,
  routed using the "model" key
- The memory banks API is another instance where various memory bank
  types will be provided by independent providers (e.g., a vector store
  is served by Chroma while a key-value memory can be served by Redis or
  PGVector)

This commit introduces a generalized routing layer for this purpose.
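
As a rough sketch of the idea (not the implementation in this commit; all
names here are hypothetical), a router holds a table from routing key to
provider and forwards each call to whichever provider serves that key:

from typing import Dict, List, Protocol


class InferenceProvider(Protocol):
    async def chat_completion(self, model: str, messages: List[str]): ...


class InferenceRouter:
    # Thin routing layer: owns no inference logic, only dispatch.
    def __init__(self, routing_table: Dict[str, InferenceProvider]):
        # Maps a routing key (here, a model id) to a backing provider.
        self.routing_table = routing_table

    async def chat_completion(self, model: str, messages: List[str]):
        provider = self.routing_table.get(model)
        if provider is None:
            raise ValueError(f"No provider serves model: {model}")
        return await provider.chat_completion(model=model, messages=messages)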

* update `apis_to_serve`

* llama_toolchain -> llama_stack

* Codemod from llama_toolchain -> llama_stack

- added providers/registry
- cleaned up api/ subdirectories and moved impls away
- restructured api/api.py
- from llama_stack.apis.<api> import foo should work now (see the import example after this list)
- update imports to do llama_stack.apis.<api>
- update many other imports
- added __init__, fixed some registry imports
- updated registry imports
- create_agentic_system -> create_agent
- AgenticSystem -> Agent
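
For illustration, the import codemod amounts to this kind of change (the
imported symbol below is illustrative, not taken from this commit):

# Before the codemod:
#   from llama_toolchain.inference.api import ChatCompletionRequest
# After the codemod:
from llama_stack.apis.inference import ChatCompletionRequest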

* Moved some stuff out of common/; re-generated OpenAPI spec

* llama-toolchain -> llama-stack (hyphens)

* add control plane API

* add redis adapter + sqlite provider

* move core -> distribution

* Some more toolchain -> stack changes

* Small naming cleanups

* Removing custom tool and agent utilities and moving them client side

* Move control plane to distribution server for now

* Remove control plane from API list

* Remove the stray codeshield dependency

* Add "fire" as a dependency

* add back event loggers

* stack configure fixes

* use brave instead of bing in the example client

* add init file so it gets packaged

* add init files so it gets packaged

* Update MANIFEST

* bug fix

---------

Co-authored-by: Hardik Shah <hjshah@fb.com>
Co-authored-by: Xi Yan <xiyan@meta.com>
Co-authored-by: Ashwin Bharambe <ashwin@meta.com>
Ashwin Bharambe 2024-09-17 19:51:35 -07:00 committed by GitHub
parent f294eac5f5
commit 9487ad8294
213 changed files with 1725 additions and 1204 deletions


@@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.


@@ -0,0 +1,26 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import base64
import mimetypes
import os

from llama_models.llama3.api.datatypes import URL


def data_url_from_file(file_path: str) -> URL:
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    with open(file_path, "rb") as file:
        file_content = file.read()

    base64_content = base64.b64encode(file_content).decode("utf-8")
    mime_type, _ = mimetypes.guess_type(file_path)

    data_url = f"data:{mime_type};base64,{base64_content}"

    return URL(uri=data_url)
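
A quick usage sketch for this helper (the file path is hypothetical):

# Encode a local text file as a data: URL; mimetypes infers text/plain.
url = data_url_from_file("/tmp/notes.txt")
assert url.uri.startswith("data:text/plain;base64,")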


@@ -0,0 +1,180 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import base64
import io
import re
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
from urllib.parse import unquote

import chardet
import httpx
import numpy as np
from numpy.typing import NDArray
from pypdf import PdfReader
from termcolor import cprint

from llama_models.llama3.api.datatypes import *  # noqa: F403
from llama_models.llama3.api.tokenizer import Tokenizer

from llama_stack.apis.memory import *  # noqa: F403

ALL_MINILM_L6_V2_DIMENSION = 384

EMBEDDING_MODEL = None


def get_embedding_model() -> "SentenceTransformer":
    global EMBEDDING_MODEL

    if EMBEDDING_MODEL is None:
        print("Loading sentence transformer")

        from sentence_transformers import SentenceTransformer

        EMBEDDING_MODEL = SentenceTransformer("all-MiniLM-L6-v2")

    return EMBEDDING_MODEL


def parse_data_url(data_url: str):
    data_url_pattern = re.compile(
        r"^"
        r"data:"
        r"(?P<mimetype>[\w/\-+.]+)"
        r"(?P<charset>;charset=(?P<encoding>[\w-]+))?"
        r"(?P<base64>;base64)?"
        r",(?P<data>.*)"
        r"$",
        re.DOTALL,
    )

    match = data_url_pattern.match(data_url)
    if not match:
        raise ValueError("Invalid Data URL format")

    parts = match.groupdict()
    parts["is_base64"] = bool(parts["base64"])
    return parts


def content_from_data(data_url: str) -> str:
    parts = parse_data_url(data_url)
    data = parts["data"]

    if parts["is_base64"]:
        data = base64.b64decode(data)
    else:
        data = unquote(data)
        encoding = parts["encoding"] or "utf-8"
        data = data.encode(encoding)

    encoding = parts["encoding"]
    if not encoding:
        detected = chardet.detect(data)
        encoding = detected["encoding"]

    mime_type = parts["mimetype"]
    mime_category = mime_type.split("/")[0]

    if mime_category == "text":
        # For text-based files (including CSV, MD)
        return data.decode(encoding)
    elif mime_type == "application/pdf":
        # For PDF and DOC/DOCX files, we can't reliably convert to string
        pdf_bytes = io.BytesIO(data)
        pdf_reader = PdfReader(pdf_bytes)
        return "\n".join([page.extract_text() for page in pdf_reader.pages])
    else:
        cprint("Could not extract content from data_url properly.", color="red")
        return ""


async def content_from_doc(doc: MemoryBankDocument) -> str:
    if isinstance(doc.content, URL):
        if doc.content.uri.startswith("data:"):
            return content_from_data(doc.content.uri)
        else:
            async with httpx.AsyncClient() as client:
                r = await client.get(doc.content.uri)
            return r.text

    return interleaved_text_media_as_str(doc.content)


def make_overlapped_chunks(
    document_id: str, text: str, window_len: int, overlap_len: int
) -> List[Chunk]:
    tokenizer = Tokenizer.get_instance()
    tokens = tokenizer.encode(text, bos=False, eos=False)

    chunks = []
    for i in range(0, len(tokens), window_len - overlap_len):
        toks = tokens[i : i + window_len]
        chunk = tokenizer.decode(toks)
        chunks.append(
            Chunk(content=chunk, token_count=len(toks), document_id=document_id)
        )

    return chunks


class EmbeddingIndex(ABC):
    @abstractmethod
    async def add_chunks(self, chunks: List[Chunk], embeddings: NDArray):
        raise NotImplementedError()

    @abstractmethod
    async def query(self, embedding: NDArray, k: int) -> QueryDocumentsResponse:
        raise NotImplementedError()


@dataclass
class BankWithIndex:
    bank: MemoryBank
    index: EmbeddingIndex

    async def insert_documents(
        self,
        documents: List[MemoryBankDocument],
    ) -> None:
        model = get_embedding_model()
        for doc in documents:
            content = await content_from_doc(doc)
            chunks = make_overlapped_chunks(
                doc.document_id,
                content,
                self.bank.config.chunk_size_in_tokens,
                self.bank.config.overlap_size_in_tokens
                or (self.bank.config.chunk_size_in_tokens // 4),
            )
            embeddings = model.encode([x.content for x in chunks]).astype(np.float32)

            await self.index.add_chunks(chunks, embeddings)

    async def query_documents(
        self,
        query: InterleavedTextMedia,
        params: Optional[Dict[str, Any]] = None,
    ) -> QueryDocumentsResponse:
        if params is None:
            params = {}
        k = params.get("max_chunks", 3)

        def _process(c) -> str:
            if isinstance(c, str):
                return c
            else:
                return "<media>"

        if isinstance(query, list):
            query_str = " ".join([_process(c) for c in query])
        else:
            query_str = _process(query)

        model = get_embedding_model()
        query_vector = model.encode([query_str])[0].astype(np.float32)
        return await self.index.query(query_vector, k)
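
To make the EmbeddingIndex contract concrete, here is a minimal in-memory
sketch (not part of this commit). It assumes QueryDocumentsResponse accepts
chunks and scores lists, as the bundled providers use it; a real provider
(e.g. FAISS or Chroma) would replace this.

class NumpyEmbeddingIndex(EmbeddingIndex):
    def __init__(self):
        self.chunks: List[Chunk] = []
        self.vectors = np.empty((0, ALL_MINILM_L6_V2_DIMENSION), dtype=np.float32)

    async def add_chunks(self, chunks: List[Chunk], embeddings: NDArray):
        self.chunks.extend(chunks)
        self.vectors = np.vstack([self.vectors, embeddings.astype(np.float32)])

    async def query(self, embedding: NDArray, k: int) -> QueryDocumentsResponse:
        # Rank stored chunks by dot-product similarity to the query vector.
        scores = self.vectors @ embedding
        top = np.argsort(scores)[::-1][:k]
        return QueryDocumentsResponse(
            chunks=[self.chunks[i] for i in top],
            scores=[float(scores[i]) for i in top],
        )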