Mirror of https://github.com/meta-llama/llama-stack.git
Synced 2025-07-18 02:42:31 +00:00 · 95 commits
---

31b088978a fix: Fix /vector-stores/create API when vector store with duplicate name (#2617)
# What does this PR do?

Resolves https://github.com/meta-llama/llama-stack/issues/2735

Currently, if you test against OpenAI's Vector Stores API, the `client.vector_stores.search` call fails with an invalid vector_db during routing (see the script referenced in the clickable item under the Test Plan section). This PR ensures that `client.vector_stores.search()` is compatible with OpenAI's Vector Stores API.

The two biggest changes:

1. The `name`, which was previously used as the `vector_db_id`, has been changed to be consistent with OpenAI's `vs_{uuid}` format.
2. The vector store has to be referenced by its ID; the name is not reliable, as every `client.vector_stores.create` call results in a new vector store.

NOTE: I believe this is a breaking change for end users, as they'll need to update their VectorDB identifiers.

## Test Plan

Unit tests:

```bash
./scripts/unit-tests.sh tests/unit/providers/vector_io/ -v
```

Integration tests:

```bash
ENABLE_MILVUS=milvus llama stack run /Users/farceo/dev/llama-stack/llama_stack/templates/starter/run.yaml --image-type venv

LLAMA_STACK_CONFIG=http://localhost:8321 pytest -sv tests/integration/vector_io/test_openai_vector_stores.py --embedding-model=all-MiniLM-L6-v2 -vv
```

Unit tests and test script below 👇

<details>
<summary>Click here for script used to test OpenAI and Llama Stack Vector Store implementation</summary>

```python
import argparse
import json
import logging
import os
import traceback

from colorama import Fore, Style, init
from openai import OpenAI, pagination

# Initialize colorama for color support in terminal
init(autoreset=True)

# Setup basic logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

DEMO_VECTOR_STORE_NAME = "Support FAQ FJA"


def colored_print(color, text):
    """Prints text to the console with the specified color."""
    print(f"{color}{text}{Style.RESET_ALL}")


def log_and_print(color, message, level=logging.INFO):
    """Logs a message and prints it to the console with the specified color."""
    logging.log(level, message)
    colored_print(color, message)


def run_tests(client, prefix="openai"):
    """
    Runs all tests using the provided OpenAI client and saves the output
    to JSON files with the given prefix.
    """
    # Create the directory if it doesn't exist
    os.makedirs('openai_testing', exist_ok=True)

    # Default values in case tests fail
    global DEMO_VECTOR_STORE_ID, DEMO_VECTOR_STORE_ID2
    DEMO_VECTOR_STORE_ID = None
    DEMO_VECTOR_STORE_ID2 = None

    def test_idempotent_vector_store_creation():
        """
        Test that creating a vector store with the same name is idempotent.
        """
        global DEMO_VECTOR_STORE_ID, DEMO_VECTOR_STORE_ID2
        log_and_print(Fore.BLUE, "Starting vector store creation test...")
        try:
            vector_store = client.vector_stores.create(
                name=DEMO_VECTOR_STORE_NAME,
            )

            # Attempt to create the same vector store again
            vector_store2 = client.vector_stores.create(
                name=DEMO_VECTOR_STORE_NAME,
            )

            # Check instead of assert
            if vector_store2.id != vector_store.id:
                log_and_print(Fore.YELLOW, f"FAILED IDEMPOTENCY: the same VectorStore name for {prefix.upper()} does not return the same ID", level=logging.WARNING)
            else:
                log_and_print(Fore.GREEN, f"PASSED IDEMPOTENCY: {vector_store2.id} == {vector_store.id} the same VectorStore name for {prefix.upper()} returns the same ID")

            vector_store_data = vector_store.to_dict()
            log_and_print(Fore.WHITE, f"vector_stores.create = {json.dumps(vector_store_data, indent=2)}")
            with open(f'openai_testing/{prefix}_vector_store_create.json', 'w') as f:
                json.dump(vector_store_data, f, indent=2)

            DEMO_VECTOR_STORE_ID = vector_store.id
            DEMO_VECTOR_STORE_ID2 = vector_store2.id
            return DEMO_VECTOR_STORE_ID, DEMO_VECTOR_STORE_ID2
        except Exception as e:
            log_and_print(Fore.RED, f"Idempotent vector store creation test failed: {e}", level=logging.ERROR)
            logging.error(traceback.format_exc())
            # Fall back to the first vector store ID if it was created
            if 'vector_store' in locals() and vector_store:
                DEMO_VECTOR_STORE_ID = vector_store.id
            return DEMO_VECTOR_STORE_ID, DEMO_VECTOR_STORE_ID2

    def test_vector_store_list():
        """
        Test listing vector stores.
        """
        log_and_print(Fore.BLUE, "Starting vector store list test...")
        try:
            vector_stores = client.vector_stores.list()
            # Check instead of assert
            if not isinstance(vector_stores, pagination.SyncCursorPage):
                log_and_print(Fore.YELLOW, f"FAILED: Expected a list of vector stores, got {type(vector_stores)}", level=logging.WARNING)
            else:
                log_and_print(Fore.GREEN, "Vector store list test passed!")
            vector_stores_data = vector_stores.to_dict()
            log_and_print(Fore.WHITE, f"vector_stores.list = {json.dumps(vector_stores_data, indent=2)}")
            with open(f'openai_testing/{prefix}_vector_store_list.json', 'w') as f:
                json.dump(vector_stores_data, f, indent=2)
        except Exception as e:
            log_and_print(Fore.RED, f"Vector store list test failed: {e}", level=logging.ERROR)
            logging.error(traceback.format_exc())

    def test_retrieve_vector_store():
        """
        Test retrieving a specific vector store.
        """
        log_and_print(Fore.BLUE, "Starting retrieve vector store test...")
        if not DEMO_VECTOR_STORE_ID:
            log_and_print(Fore.YELLOW, "Skipping retrieve vector store test - no vector store ID available", level=logging.WARNING)
            return
        try:
            vector_store = client.vector_stores.retrieve(
                vector_store_id=DEMO_VECTOR_STORE_ID,
            )
            # Check instead of assert
            if vector_store.id != DEMO_VECTOR_STORE_ID:
                log_and_print(Fore.YELLOW, "FAILED: Retrieved vector store ID does not match", level=logging.WARNING)
            else:
                log_and_print(Fore.GREEN, "Retrieve vector store test passed!")
            vector_store_data = vector_store.to_dict()
            log_and_print(Fore.WHITE, f"vector_stores.retrieve = {json.dumps(vector_store_data, indent=2)}")
            with open(f'openai_testing/{prefix}_vector_store_retrieve.json', 'w') as f:
                json.dump(vector_store_data, f, indent=2)
        except Exception as e:
            log_and_print(Fore.RED, f"Retrieve vector store test failed: {e}", level=logging.ERROR)
            logging.error(traceback.format_exc())

    def test_modify_vector_store():
        """
        Test modifying a vector store.
        """
        log_and_print(Fore.BLUE, "Starting modify vector store test...")
        if not DEMO_VECTOR_STORE_ID:
            log_and_print(Fore.YELLOW, "Skipping modify vector store test - no vector store ID available", level=logging.WARNING)
            return
        try:
            updated_vector_store = client.vector_stores.update(
                vector_store_id=DEMO_VECTOR_STORE_ID,
                name="Updated Support FAQ FJA",
            )
            # Check instead of assert
            if updated_vector_store.name != "Updated Support FAQ FJA":
                log_and_print(Fore.YELLOW, "FAILED: Vector store name was not updated correctly", level=logging.WARNING)
            else:
                log_and_print(Fore.GREEN, "Modify vector store test passed!")
            updated_vector_store_data = updated_vector_store.to_dict()
            log_and_print(Fore.WHITE, f"vector_stores.modify = {json.dumps(updated_vector_store_data, indent=2)}")
            with open(f'openai_testing/{prefix}_vector_store_modify.json', 'w') as f:
                json.dump(updated_vector_store_data, f, indent=2)
        except Exception as e:
            log_and_print(Fore.RED, f"Modify vector store test failed: {e}", level=logging.ERROR)
            logging.error(traceback.format_exc())

    def test_delete_vector_store():
        """
        Test deleting a vector store.
        """
        log_and_print(Fore.BLUE, "Starting delete vector store test...")
        if not DEMO_VECTOR_STORE_ID2:
            log_and_print(Fore.YELLOW, "Skipping delete vector store test - no second vector store ID available", level=logging.WARNING)
            return
        try:
            response = client.vector_stores.delete(
                vector_store_id=DEMO_VECTOR_STORE_ID2,
            )
            log_and_print(Fore.GREEN, "Delete vector store test passed!")
            response_data = response.to_dict()
            log_and_print(Fore.WHITE, f"Vector store delete response = {json.dumps(response_data, indent=2)}")
            with open(f'openai_testing/{prefix}_vector_store_delete.json', 'w') as f:
                json.dump(response_data, f, indent=2)
        except Exception as e:
            log_and_print(Fore.RED, f"Delete vector store test failed: {e}", level=logging.ERROR)
            logging.error(traceback.format_exc())

    def test_create_vector_store_file():
        log_and_print(Fore.BLUE, "Starting create vector store file test...")
        if not DEMO_VECTOR_STORE_ID:
            log_and_print(Fore.YELLOW, "Skipping create vector store file test - no vector store ID available", level=logging.WARNING)
            return
        try:
            # create jsonl of files as an example
            with open("mydata.jsonl", "w") as f:
                f.write('{"text": "What is the return policy?", "metadata": {"category": "support"}}\n')
                f.write('{"text": "How do I reset my password?", "metadata": {"category": "support"}}\n')
                f.write('{"text": "Where can I find my order history?", "metadata": {"category": "support"}}\n')
                f.write('{"text": "What are the shipping options?", "metadata": {"category": "support"}}\n')
                f.write('{"text": "What is your favorite banana?", "metadata": {"category": "support"}}\n')

            # Create a simple text file if my_data_small.txt doesn't exist
            if not os.path.exists("my_data_small.txt"):
                with open("my_data_small.txt", "w") as f:
                    f.write("This is a test file for vector store testing.\n")

            created_file = client.files.create(
                file=open("my_data_small.txt", "rb"),
                purpose="assistants",
            )
            created_file_data = created_file.to_dict()
            log_and_print(Fore.WHITE, f"Created file {json.dumps(created_file_data, indent=2)}")
            with open(f'openai_testing/{prefix}_file_create.json', 'w') as f:
                json.dump(created_file_data, f, indent=2)

            retrieved_files = client.files.retrieve(created_file.id)
            retrieved_files_data = retrieved_files.to_dict()
            log_and_print(Fore.WHITE, f"Retrieved file {json.dumps(retrieved_files_data, indent=2)}")
            with open(f'openai_testing/{prefix}_file_retrieve.json', 'w') as f:
                json.dump(retrieved_files_data, f, indent=2)

            vector_store_file = client.vector_stores.files.create(
                vector_store_id=DEMO_VECTOR_STORE_ID,
                file_id=created_file.id,
            )
            log_and_print(Fore.GREEN, "Create vector store file test passed!")
        except Exception as e:
            log_and_print(Fore.RED, f"Create vector store file test failed: {e}", level=logging.ERROR)
            logging.error(traceback.format_exc())

    def test_search_vector_store():
        """
        Test searching a vector store.
        """
        log_and_print(Fore.BLUE, "Starting search vector store test...")
        if not DEMO_VECTOR_STORE_ID:
            log_and_print(Fore.YELLOW, "Skipping search vector store test - no vector store ID available", level=logging.WARNING)
            return
        try:
            query = "What is the banana policy?"
            search_results = client.vector_stores.search(
                vector_store_id=DEMO_VECTOR_STORE_ID,
                query=query,
                max_num_results=10,
                ranking_options={
                    'ranker': 'default-2024-11-15',
                    'score_threshold': 0.0,
                },
                rewrite_query=False,
            )
            # Check instead of assert
            if not isinstance(search_results, pagination.SyncPage):
                log_and_print(Fore.YELLOW, f"FAILED: Expected a list of search results, got {type(search_results)}", level=logging.WARNING)
            else:
                log_and_print(Fore.GREEN, "Search vector store test passed!")
            search_results_dict = search_results.to_dict()
            log_and_print(Fore.WHITE, f"Search results = {search_results_dict}")
            with open(f'openai_testing/{prefix}_vector_store_search.json', 'w') as f:
                json.dump(search_results_dict, f, indent=2)
            log_and_print(Fore.WHITE, f"vector_stores.search = {search_results.to_json()}")
        except Exception as e:
            log_and_print(Fore.RED, f"Search vector store test failed: {e}", level=logging.ERROR)
            logging.error(traceback.format_exc())

    # Run all tests in sequence, even if some fail
    test_results = []

    try:
        result = test_idempotent_vector_store_creation()
        if result and len(result) == 2:
            DEMO_VECTOR_STORE_ID, DEMO_VECTOR_STORE_ID2 = result
        test_results.append(True)
    except Exception as e:
        log_and_print(Fore.RED, f"Vector store creation test failed: {e}", level=logging.ERROR)
        logging.error(traceback.format_exc())
        test_results.append(False)

    for test_func in [
        test_vector_store_list,
        test_retrieve_vector_store,
        test_modify_vector_store,
        test_delete_vector_store,
        test_create_vector_store_file,
        test_search_vector_store,
    ]:
        try:
            test_func()
            test_results.append(True)
        except Exception as e:
            log_and_print(Fore.RED, f"{test_func.__name__} failed: {e}", level=logging.ERROR)
            logging.error(traceback.format_exc())
            test_results.append(False)

    if all(test_results):
        log_and_print(Fore.GREEN, f"All {prefix} tests completed successfully!")
    else:
        failed_count = test_results.count(False)
        log_and_print(Fore.YELLOW, f"{failed_count} {prefix} test(s) failed, but script completed.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run OpenAI and/or LlamaStack tests.")
    parser.add_argument(
        "--provider",
        type=str,
        default="llama",
        choices=["openai", "llama", "both"],
        help="Specify which environment to test: openai, llama, or both. Default is llama.",
    )
    args = parser.parse_args()

    try:
        if args.provider in ("openai", "both"):
            openai_client = OpenAI()
            run_tests(openai_client, prefix="openai")

        if args.provider in ("llama", "both"):
            llama_client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")
            run_tests(llama_client, prefix="llama")

        log_and_print(Fore.GREEN, "All tests completed!")
    except Exception as e:
        log_and_print(Fore.RED, f"Tests failed to complete: {e}", level=logging.ERROR)
        logging.error(traceback.format_exc())
```

</details>

Signed-off-by: Francisco Javier Arceo <farceo@redhat.com>
---

618ccea090 feat: add input validation for search mode of rag query config (#2275)
# What does this PR do?

Adds input validation for `mode` in `RagQueryConfig`. This will prevent users from inputting search modes other than `vector` and `keyword` for the time being, with `hybrid` to follow when that functionality is implemented.

## Test Plan

Check out this PR and enter the LS directory:

```
uv sync --extra dev
```

Run the quickstart [example](https://llama-stack.readthedocs.io/en/latest/getting_started/#step-3-run-the-demo) and alter the Agent to include a query_config:

```
agent = Agent(
    client,
    model=model_id,
    instructions="You are a helpful assistant",
    tools=[
        {
            "name": "builtin::rag/knowledge_search",
            "args": {
                "vector_db_ids": [vector_db_id],
                "query_config": {
                    "mode": "i-am-not-vector",  # Test for non valid search mode
                    "max_chunks": 6
                }
            },
        }
    ],
)
```

Ensure you get the following error:

```
400: {'errors': [{'loc': ['mode'], 'msg': "Value error, mode must be either 'vector' or 'keyword' if supported by the vector_io provider", 'type': 'value_error'}]}
```

## Running unit tests

```
uv sync --extra dev
uv run pytest tests/unit/rag/test_rag_query.py -v
```
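For reference, a minimal sketch of the kind of Pydantic validator that produces the error above; the class and field shapes here are assumptions, while the error string matches the one quoted in the test plan:

```python
from pydantic import BaseModel, field_validator


class RagQueryConfig(BaseModel):
    mode: str = "vector"
    max_chunks: int = 5

    @field_validator("mode")
    @classmethod
    def validate_mode(cls, v: str) -> str:
        # Only "vector" and "keyword" are accepted until hybrid search lands.
        if v not in ("vector", "keyword"):
            raise ValueError("mode must be either 'vector' or 'keyword' if supported by the vector_io provider")
        return v
```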
---

cd0ad21111 chore(api): add mypy coverage to apis (#2648)
# What does this PR do?

This PR adds static type coverage to `llama-stack/apis`. Part of https://github.com/meta-llama/llama-stack/issues/2647

Signed-off-by: Mustafa Elbehery <melbeher@redhat.com>
---

be9bf68246 feat: Add webmethod for deleting openai responses (#2160)
# What does this PR do?

This PR creates a webmethod for deleting OpenAI responses, adds an implementation for it, and makes an integration test for the OpenAI delete response method.

Closes #2077

## Test Plan

Ran the standard tests, the pre-commit hooks, and the unit tests.

## Documentation

For this PR I made the routes and implementation based on the current get and create methods. The unit tests were not able to handle this test due to the mock interface in use, which did not allow for effective CRUD to be tested. I instead created an integration test to match the existing ones in test_openai_responses.
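A hypothetical usage sketch of the new route via the OpenAI client pointed at a local Llama Stack server; the model name and base URL are placeholders:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

# Create a response, then delete it by ID.
response = client.responses.create(model="meta-llama/Llama-3.2-3B-Instruct", input="Hello")
deleted = client.responses.delete(response.id)
print(deleted)
```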
---

2d9fd041eb fix: annotations list and web_search_preview in Responses (#2520)
# What does this PR do?

These are a couple of fixes to get an example LangChain app working with our OpenAI Responses API implementation.

The Responses API spec requires an annotations array in `output[*].content[*].annotations`, and we were not providing one. So, this adds that as an empty list, even though we don't do anything to populate it yet. This prevents an error from client libraries like LangChain that expect this field to always exist, even if it's an empty list.

The other fix is that `web_search_preview` is a valid name for the web search tool in the Responses API, but we only responded to `web_search` or `web_search_preview_2025_03_11`.

## Test Plan

The existing Responses unit tests were expanded to test these cases, via:

```
pytest -sv tests/unit/providers/agents/meta_reference/test_openai_responses.py
```

The existing test_openai_responses.py integration tests still pass with this change, tested as below with Fireworks:

```
uv run llama stack run llama_stack/templates/starter/run.yaml

LLAMA_STACK_CONFIG=http://localhost:8321 \
uv run pytest -sv tests/integration/agents/test_openai_responses.py \
  --text-model accounts/fireworks/models/llama4-scout-instruct-basic
```

Lastly, this example LangChain app now works with Llama Stack (tested with Ollama in the starter template in this case). This LangChain code uses the example snippets for the Responses API at https://python.langchain.com/docs/integrations/chat/openai/#responses-api

```python
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(
    base_url="http://localhost:8321/v1/openai/v1",
    api_key="fake",
    model="ollama/meta-llama/Llama-3.2-3B-Instruct",
)

tool = {"type": "web_search_preview"}
llm_with_tools = llm.bind_tools([tool])

response = llm_with_tools.invoke("What was a positive news story from today?")
print(response.content)
```

Signed-off-by: Ben Browning <bbrownin@redhat.com>
---

82f13fe83e feat: Add ChunkMetadata to Chunk (#2497)
# What does this PR do?

Adding `ChunkMetadata` so we can properly delete embeddings later.

More specifically, this PR refactors and extends the chunk metadata handling in the vector database and introduces a distinction between metadata used for model context and backend-only metadata required for chunk management, storage, and retrieval. It also improves chunk ID generation and propagation throughout the stack, enhances test coverage, and adds new utility modules.

```python
class ChunkMetadata(BaseModel):
    """
    `ChunkMetadata` is backend metadata for a `Chunk` that is used to store additional information about the chunk
    that will NOT be inserted into the context during inference, but is required for backend functionality.
    Use `metadata` in `Chunk` for metadata that will be used during inference.
    """
    document_id: str | None = None
    chunk_id: str | None = None
    source: str | None = None
    created_timestamp: int | None = None
    updated_timestamp: int | None = None
    chunk_window: str | None = None
    chunk_tokenizer: str | None = None
    chunk_embedding_model: str | None = None
    chunk_embedding_dimension: int | None = None
    content_token_count: int | None = None
    metadata_token_count: int | None = None
```

Eventually we can migrate the `document_id` out of the `metadata` field. I've introduced the changes so that `ChunkMetadata` is backwards compatible with `metadata`.

Closes https://github.com/meta-llama/llama-stack/issues/2501

## Test Plan

Added unit tests.

Signed-off-by: Francisco Javier Arceo <farceo@redhat.com>
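To make the split concrete, a hypothetical sketch of constructing a `Chunk` with both fields; the `Chunk` constructor shape here is an assumption:

```python
import time

chunk = Chunk(
    content="The return window is 30 days.",
    metadata={"document_id": "doc-123"},  # goes into model context at inference
    chunk_metadata=ChunkMetadata(         # backend-only bookkeeping
        document_id="doc-123",
        chunk_id="doc-123-chunk-0",
        source="faq.txt",
        created_timestamp=int(time.time()),
    ),
)
```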
---

cfee63bd0d feat: Add search_mode support to OpenAI vector store API (#2500)
# What does this PR do?

Add a `search_mode` parameter (vector/keyword/hybrid) to the `openai_search_vector_store` method. Fixes OpenAPI code generation by using `str` instead of a `Literal` type.

Closes: #2459

Signed-off-by: Varsha Prasad Narsing <varshaprasad96@gmail.com>
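A hedged sketch of the resulting signature; the parameter list is abridged and any names beyond what the PR states are assumptions:

```python
async def openai_search_vector_store(
    self,
    vector_store_id: str,
    query: str | list[str],
    max_num_results: int = 10,
    # Plain str ("vector" | "keyword" | "hybrid") rather than Literal,
    # so the OpenAPI generator handles it; validation is left to the provider.
    search_mode: str = "vector",
):
    ...
```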
---

f394c7f2d9 feat: Add missing Vector Store Files API surface (#2468)
# What does this PR do?

This adds the ability to list, retrieve, update, and delete Vector Store Files. It implements these new APIs for the faiss and sqlite-vec providers, since those are the two that also have the rest of the vector store files implementation.

Closes #2445

## Test Plan

### test_openai_vector_stores Integration Tests

There are a number of new integration tests added, which I ran for each provider as outlined below.

faiss (from ollama distro):

```
INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct" \
llama stack run llama_stack/templates/ollama/run.yaml

LLAMA_STACK_CONFIG=http://localhost:8321 \
pytest -sv tests/integration/vector_io/test_openai_vector_stores.py \
  --embedding-model=all-MiniLM-L6-v2
```

sqlite-vec (from starter distro):

```
llama stack run llama_stack/templates/starter/run.yaml

LLAMA_STACK_CONFIG=http://localhost:8321 \
pytest -sv tests/integration/vector_io/test_openai_vector_stores.py \
  --embedding-model=all-MiniLM-L6-v2
```

### file_search verification tests

I also ensured the file_search verification tests continue to work, both for faiss and sqlite-vec.

faiss (ollama distro):

```
INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct" \
llama stack run llama_stack/templates/ollama/run.yaml

pytest -sv tests/verifications/openai_api/test_responses.py \
  -k 'file_search' \
  --base-url=http://localhost:8321/v1/openai/v1 \
  --model=meta-llama/Llama-3.2-3B-Instruct
```

sqlite-vec (starter distro):

```
llama stack run llama_stack/templates/starter/run.yaml

pytest -sv tests/verifications/openai_api/test_responses.py \
  -k 'file_search' \
  --base-url=http://localhost:8321/v1/openai/v1 \
  --model=together/meta-llama/Llama-3.2-3B-Instruct-Turbo
```

Signed-off-by: Ben Browning <bbrownin@redhat.com>
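A hypothetical walk-through of the new surface via the OpenAI client pointed at Llama Stack; the IDs and attributes are placeholders:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

files = client.vector_stores.files.list(vector_store_id="vs_123")
f = client.vector_stores.files.retrieve(vector_store_id="vs_123", file_id="file-abc")
f = client.vector_stores.files.update(
    vector_store_id="vs_123", file_id="file-abc", attributes={"category": "support"}
)
client.vector_stores.files.delete(vector_store_id="vs_123", file_id="file-abc")
```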
---

db2cd9e8f3 feat: support filters in file search (#2472)
# What does this PR do?

Move to using `vector_stores.search` for the file search tool in Responses, which supports filters.

Closes #2435

## Test Plan

Added an e2e test with filters.

```
llama stack run llama_stack/templates/fireworks/run.yaml

pytest -sv tests/verifications/openai_api/test_responses.py \
  -k 'file_search and filters' \
  --base-url=http://localhost:8321/v1/openai/v1 \
  --model=meta-llama/Llama-3.3-70B-Instruct
```
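A hedged example of what a filtered file_search request looks like, following the OpenAI Responses shape this PR targets; the vector store ID and attribute names are placeholders:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

response = client.responses.create(
    model="meta-llama/Llama-3.3-70B-Instruct",
    input="What is the return policy?",
    tools=[{
        "type": "file_search",
        "vector_store_ids": ["vs_123"],
        # Comparison filter restricting search to chunks with this attribute.
        "filters": {"type": "eq", "key": "category", "value": "support"},
    }],
)
print(response.output_text)
```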
---

436c7aa751 feat: Add url field to PaginatedResponse and populate it using route path (#2419)
# What does this PR do?

Closes #1847

Changes:

- llama_stack/apis/common/responses.py: adds an optional `url` field to PaginatedResponse
- llama_stack/distribution/server/server.py: automatically populates the URL field with the route path

## Test Plan

- Built and ran the llama stack server using the following cmds:

```bash
export INFERENCE_MODEL=llama3.1:8b
llama stack build --run --template ollama --image-type container
llama stack run llama_stack/templates/ollama/run.yaml
```

- Ran `curl` to test if we are seeing the `url` param in the response:

```bash
curl -X 'GET' \
  'http://localhost:8321/v1/agents' \
  -H 'accept: application/json'
```

- Expected and received output: `{"data":[],"has_more":false,"url":"/v1/agents"}`

Co-authored-by: Rohan Awhad <rawhad@redhat.com>
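A hedged sketch of the model change, with the surrounding fields inferred from the curl output above:

```python
from typing import Any

from pydantic import BaseModel


class PaginatedResponse(BaseModel):
    data: list[dict[str, Any]]
    has_more: bool
    # Populated by the server with the route path, e.g. "/v1/agents".
    url: str | None = None
```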
---

985d0b156c feat: Add suffix to openai_completions (#2449)
Code completion apps need "fill in the middle" capabilities, so this adds a `suffix` option to `openai_completion` to enable this. Updated the ollama provider to showcase the same.

### Test Plan

```
pytest -sv --stack-config="inference=ollama" tests/integration/inference/test_openai_completion.py --text-model qwen2.5-coder:1.5b -k test_openai_completion_non_streaming_suffix
```

### OpenAI Sample script

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1")

response = client.completions.create(
    model="qwen2.5-coder:1.5b",
    prompt="The capital of ",
    suffix="is Paris.",
    max_tokens=10,
)

print(response.choices[0].text)
```

### Output

```
France is ____. To answer this question, we
```
---

2e8054bede feat: Implement hybrid search in SQLite-vec (#2312)
# What does this PR do?

Add support for a hybrid search mode in the SQLite-vec provider, which combines keyword and vector search for better results. The implementation:

- Adds hybrid search mode as a new option alongside vector and keyword search
- Implements a `query_hybrid` method in `SQLiteVecIndex` that:
  - First performs keyword search to get candidate matches
  - Then applies vector similarity search on those candidates
- Updates documentation to reflect the new search mode

This change improves search quality by leveraging both semantic similarity and keyword matching, while maintaining backward compatibility with existing vector and keyword search modes. A conceptual sketch of the two-stage flow follows the test plan below.

## Test Plan

```
pytest tests/unit/providers/vector_io/test_sqlite_vec.py -v -s --tb=short

/Users/vnarsing/miniconda3/envs/stack-client/lib/python3.10/site-packages/pytest_asyncio/plugin.py:217: PytestDeprecationWarning: The configuration option "asyncio_default_fixture_loop_scope" is unset. The event loop scope for asynchronous fixtures will default to the fixture caching scope. Future versions of pytest-asyncio will default the loop scope for asynchronous fixtures to function scope. Set the default fixture loop scope explicitly in order to avoid unexpected behavior in the future. Valid fixture loop scopes are: "function", "class", "module", "package", "session"
  warnings.warn(PytestDeprecationWarning(_DEFAULT_FIXTURE_LOOP_SCOPE_UNSET))
=============================== test session starts ===============================
platform darwin -- Python 3.10.16, pytest-8.3.5, pluggy-1.5.0 -- /Users/vnarsing/miniconda3/envs/stack-client/bin/python
cachedir: .pytest_cache
metadata: {'Python': '3.10.16', 'Platform': 'macOS-14.7.6-arm64-arm-64bit', 'Packages': {'pytest': '8.3.5', 'pluggy': '1.5.0'}, 'Plugins': {'html': '4.1.1', 'json-report': '1.5.0', 'timeout': '2.4.0', 'metadata': '3.1.1', 'anyio': '4.8.0', 'asyncio': '0.26.0', 'nbval': '0.11.0', 'cov': '6.1.1'}}
rootdir: /Users/vnarsing/go/src/github/meta-llama/llama-stack
configfile: pyproject.toml
plugins: html-4.1.1, json-report-1.5.0, timeout-2.4.0, metadata-3.1.1, anyio-4.8.0, asyncio-0.26.0, nbval-0.11.0, cov-6.1.1
asyncio: mode=strict, asyncio_default_fixture_loop_scope=None, asyncio_default_test_loop_scope=function
collected 10 items

tests/unit/providers/vector_io/test_sqlite_vec.py::test_add_chunks PASSED
tests/unit/providers/vector_io/test_sqlite_vec.py::test_query_chunks_vector PASSED
tests/unit/providers/vector_io/test_sqlite_vec.py::test_query_chunks_full_text_search PASSED
tests/unit/providers/vector_io/test_sqlite_vec.py::test_query_chunks_hybrid PASSED
tests/unit/providers/vector_io/test_sqlite_vec.py::test_query_chunks_full_text_search_k_greater_than_results PASSED
tests/unit/providers/vector_io/test_sqlite_vec.py::test_chunk_id_conflict PASSED
tests/unit/providers/vector_io/test_sqlite_vec.py::test_generate_chunk_id PASSED
tests/unit/providers/vector_io/test_sqlite_vec.py::test_query_chunks_hybrid_no_keyword_matches PASSED
tests/unit/providers/vector_io/test_sqlite_vec.py::test_query_chunks_hybrid_score_threshold PASSED
tests/unit/providers/vector_io/test_sqlite_vec.py::test_query_chunks_hybrid_different_embedding PASSED
```

Signed-off-by: Varsha Prasad Narsing <varshaprasad96@gmail.com>
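A conceptual sketch of the two-stage hybrid flow described above; method names other than `query_hybrid` are illustrative, not the provider's actual code:

```python
async def query_hybrid(index, query_embedding, query_string: str, k: int):
    # Stage 1: full-text (keyword) search to collect a wider candidate pool.
    candidates = await index.query_keyword(query_string, k=k * 5)

    # Stage 2: vector similarity search restricted to those candidates,
    # keeping the top-k by embedding distance.
    return await index.query_vector(query_embedding, k=k, restrict_to=candidates)
```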
---

941f505eb0 feat: File search tool for Responses API (#2426)
# What does this PR do?

This is an initial working prototype of wiring up the `file_search` builtin tool for the Responses API to our existing rag knowledge search tool. This is me seeing what I could pull together on top of the bits we already have merged. This may not be the ideal way to implement this, and things like how I shuffle the vector store ids from the original response API tool request to the actual tool execution feel a bit hacky (grep for `tool_kwargs["vector_db_ids"]` in `_execute_tool_call` to see what I mean).

## Test Plan

I stubbed in some new tests to exercise this using text and pdf documents.

Note that this is currently under tests/verification only because it sometimes flakes with tool calling of the small Llama-3.2-3B model we run in CI (and that I use as an example below). We'd want to make the test a bit more robust in some way if we moved this over to tests/integration and ran it in CI.

### OpenAI SaaS (to verify test correctness)

```
pytest -sv tests/verifications/openai_api/test_responses.py \
  -k 'file_search' \
  --base-url=https://api.openai.com/v1 \
  --model=gpt-4o
```

### Fireworks with faiss vector store

```
llama stack run llama_stack/templates/fireworks/run.yaml

pytest -sv tests/verifications/openai_api/test_responses.py \
  -k 'file_search' \
  --base-url=http://localhost:8321/v1/openai/v1 \
  --model=meta-llama/Llama-3.3-70B-Instruct
```

### Ollama with faiss vector store

This sometimes flakes on Ollama because the quantized small model doesn't always choose to call the tool to answer the user's question. But, it often works.

```
ollama run llama3.2:3b

INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct" \
llama stack run ./llama_stack/templates/ollama/run.yaml \
  --image-type venv \
  --env OLLAMA_URL="http://0.0.0.0:11434"

pytest -sv tests/verifications/openai_api/test_responses.py \
  -k 'file_search' \
  --base-url=http://localhost:8321/v1/openai/v1 \
  --model=meta-llama/Llama-3.2-3B-Instruct
```

### OpenAI provider with sqlite-vec vector store

```
llama stack run ./llama_stack/templates/starter/run.yaml --image-type venv

pytest -sv tests/verifications/openai_api/test_responses.py \
  -k 'file_search' \
  --base-url=http://localhost:8321/v1/openai/v1 \
  --model=openai/gpt-4o-mini
```

### Ensure existing vector store integration tests still pass

```
ollama run llama3.2:3b

INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct" \
llama stack run ./llama_stack/templates/ollama/run.yaml \
  --image-type venv \
  --env OLLAMA_URL="http://0.0.0.0:11434"

LLAMA_STACK_CONFIG=http://localhost:8321 \
pytest -sv tests/integration/vector_io \
  --text-model "meta-llama/Llama-3.2-3B-Instruct" \
  --embedding-model=all-MiniLM-L6-v2
```

Signed-off-by: Ben Browning <bbrownin@redhat.com>
---

0bc1747ed8 feat: update search for vector_stores (#2441)
Updated the `search` functionality's return response to match OpenAI's.

## Test Plan

```
pytest -sv --stack-config=http://localhost:8321 tests/integration/vector_io/test_openai_vector_stores.py --embedding-model all-MiniLM-L6-v2
```
---

de37a04c3e fix: set appropriate defaults for params (#2434)
Setting defaults on `| None` params, otherwise they get marked as required params in the OpenAPI spec.
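An illustrative before/after of the pattern being fixed; the function and parameter names here are hypothetical:

```python
# Before: no default, so the spec generator marks `metadata` as required
# even though its type admits None.
def create_store(name: str, metadata: dict | None): ...

# After: an explicit default keeps it optional in the generated spec.
def create_store(name: str, metadata: dict | None = None): ...
```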
---

d55100d9b7 feat: OpenAIVectorIOMixin for vector_stores common logic (#2427)
Extracts common OpenAI vector-store code into its own mixin so that all providers can share the same core logic. This also makes it easy for Llama Stack to support both vector-stores and Llama Stack APIs in the interim so that both share the same underlying vector-dbs.

Each provider contains storage-specific logic to `create / edit / delete / list` vector dbs while the plumbing logic is standardized in the common code. Ensured that this works well with both faiss and sqlite-vec. A sketch of the split follows the test plan below.

### Test Plan

```
llama stack run starter

pytest -sv --stack-config http://localhost:8321 tests/integration/vector_io/test_openai_vector_stores.py --embedding-model all-MiniLM-L6-v2
```
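A hedged sketch of the mixin split; all method names beyond the mixin name itself are assumptions:

```python
import uuid


class OpenAIVectorIOMixin:
    """Shared OpenAI vector-store plumbing: ID generation, validation,
    and response shaping. Providers supply only storage-specific pieces."""

    async def openai_create_vector_store(self, name: str | None = None, **kwargs):
        store_id = f"vs_{uuid.uuid4()}"
        # ...common bookkeeping, then delegate persistence to the provider...
        await self._save_openai_vector_store(store_id, {"name": name, **kwargs})
        return store_id


class FaissVectorIOAdapter(OpenAIVectorIOMixin):
    async def _save_openai_vector_store(self, store_id: str, store_info: dict):
        # faiss-specific persistence lives here
        ...
```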
---

5ac43268e8 feat: Add OpenAI compat /v1/vector_store APIs (#2423)
Adding OpenAI compat `/v1/vector-store` APIs. This PR implements the `faiss` provider, with follow-up PRs coming up for other providers.

Added routes to create, update, delete, and list vector stores, as well as a route to search a vector store. Inserting into vector stores is missing and will be a follow-up diff.

### Test Plan

- Added a new integration test for testing the faiss provider:

```
pytest -sv --stack-config http://localhost:8321 tests/integration/vector_io/test_openai_vector_stores.py --embedding-model all-MiniLM-L6-v2
```
---

ed69c1b3cc feat(responses): add more streaming response types (#2375)
---

8bee2954be feat: Structured output for Responses API (#2324)
# What does this PR do?

This adds the missing `text` parameter to the Responses API, which is how users control structured outputs. All we do with that parameter is map it to the corresponding chat completion `response_format`.

## Test Plan

The new unit tests exercise the various permutations allowed for this property, while a couple of new verification tests actually use it for real to verify the model outputs are following the format as expected.

Unit tests: `python -m pytest -s -v tests/unit/providers/agents/meta_reference/test_openai_responses.py`

Verification tests:

```
llama stack run llama_stack/templates/together/run.yaml

pytest -s -vv 'tests/verifications/openai_api/test_responses.py' \
  --base-url=http://localhost:8321/v1/openai/v1 \
  --model meta-llama/Llama-4-Scout-17B-16E-Instruct
```

Note that the verification tests can only be run with a real Llama Stack server (as opposed to using the library client via `--provider=stack:together`) because the Llama Stack python client is not yet updated to accept this text field.

Signed-off-by: Ben Browning <bbrownin@redhat.com>
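A hedged example of the `text` parameter in a request, following the OpenAI Responses structured-output shape; the schema, model, and client setup are placeholders:

```python
response = client.responses.create(
    model="meta-llama/Llama-4-Scout-17B-16E-Instruct",
    input="Name a US state and its capital.",
    text={
        "format": {
            "type": "json_schema",
            "name": "state_capital",
            "schema": {
                "type": "object",
                "properties": {"state": {"type": "string"}, "capital": {"type": "string"}},
                "required": ["state", "capital"],
                "additionalProperties": False,
            },
        }
    },
)
print(response.output_text)  # JSON conforming to the schema
```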
---

dbe4e84aca feat(responses): implement full multi-turn support (#2295)
I think the implementation needs more simplification. Spent way too much time trying to get the tests to pass with models not co-operating :( Finally had to switch to claude-sonnet to get things to pass reliably.

### Test Plan

```
export TAVILY_SEARCH_API_KEY=...
export OPENAI_API_KEY=...

uv run pytest -p no:warnings \
  -s -v tests/verifications/openai_api/test_responses.py \
  --provider=stack:starter \
  --model openai/gpt-4o
```
---

31a3ae60f4 feat: openai files api (#2321)
# What does this PR do?

- Adds the OpenAI compatible Files API
- Modified the doc gen script to support multipart parameters

Stack created with [Sapling](https://sapling-scm.com). Best reviewed with [ReviewStack](https://reviewstack.dev/meta-llama/llama-stack/pull/2321).

- #2330
- __->__ #2321
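Hypothetical usage of the new Files API via the OpenAI client pointed at Llama Stack; the file name is a placeholder:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

with open("my_data_small.txt", "rb") as fh:
    uploaded = client.files.create(file=fh, purpose="assistants")

print(client.files.retrieve(uploaded.id))
print([f.id for f in client.files.list()])
```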
---

b21050935e feat: New OpenAI compat embeddings API (#2314)
# What does this PR do?

Adds a new endpoint that is compatible with OpenAI's embeddings API: `/openai/v1/embeddings`. Added providers for OpenAI, LiteLLM, and SentenceTransformer.

## Test Plan

```
LLAMA_STACK_CONFIG=http://localhost:8321 pytest -sv tests/integration/inference/test_openai_embeddings.py --embedding-model all-MiniLM-L6-v2,text-embedding-3-small,gemini/text-embedding-004
```
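A minimal example call against the new endpoint; the model name follows the test plan above:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

resp = client.embeddings.create(model="all-MiniLM-L6-v2", input=["Hello, world!"])
print(len(resp.data[0].embedding))  # embedding dimension
```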
||
|
f328436831
|
feat: Enable ingestion of precomputed embeddings (#2317)
Some checks failed
Integration Auth Tests / test-matrix (oauth2_token) (push) Failing after 3s
Integration Tests / test-matrix (http, inspect) (push) Failing after 9s
Integration Tests / test-matrix (http, post_training) (push) Failing after 9s
Integration Tests / test-matrix (http, agents) (push) Failing after 10s
Integration Tests / test-matrix (http, datasets) (push) Failing after 10s
Integration Tests / test-matrix (http, inference) (push) Failing after 10s
Integration Tests / test-matrix (library, agents) (push) Failing after 9s
Integration Tests / test-matrix (http, scoring) (push) Failing after 9s
Integration Tests / test-matrix (library, datasets) (push) Failing after 8s
Integration Tests / test-matrix (http, providers) (push) Failing after 9s
Integration Tests / test-matrix (http, tool_runtime) (push) Failing after 10s
Integration Tests / test-matrix (library, inference) (push) Failing after 9s
Test External Providers / test-external-providers (venv) (push) Failing after 6s
Integration Tests / test-matrix (library, inspect) (push) Failing after 8s
Integration Tests / test-matrix (library, providers) (push) Failing after 8s
Integration Tests / test-matrix (library, scoring) (push) Failing after 8s
Integration Tests / test-matrix (library, post_training) (push) Failing after 10s
Unit Tests / unit-tests (3.11) (push) Failing after 7s
Unit Tests / unit-tests (3.10) (push) Failing after 9s
Unit Tests / unit-tests (3.13) (push) Failing after 7s
Integration Tests / test-matrix (library, tool_runtime) (push) Failing after 9s
Unit Tests / unit-tests (3.12) (push) Failing after 9s
Update ReadTheDocs / update-readthedocs (push) Failing after 7s
Pre-commit / pre-commit (push) Successful in 1m15s
|
||
|
5cdb29758a
|
feat(responses): add output_text delta events to responses (#2265)
This adds initial streaming support to the Responses API. This PR makes sure that the _first_ inference call made to chat completions streams out. There's more to be done: - tool call output tokens need to stream out when possible - we need to loop through multiple rounds of inference and they all need to stream out. ## Test Plan Added a test. Executed as: ``` FIREWORKS_API_KEY=... \ pytest -s -v 'tests/verifications/openai_api/test_responses.py' \ --provider=stack:fireworks --model meta-llama/Llama-4-Scout-17B-16E-Instruct ``` Then, started a llama stack fireworks distro and tested against it like this: ``` OPENAI_API_KEY=blah \ pytest -s -v 'tests/verifications/openai_api/test_responses.py' \ --base-url http://localhost:8321/v1/openai/v1 \ --model meta-llama/Llama-4-Scout-17B-16E-Instruct ``` |
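A sketch of consuming these delta events with the OpenAI SDK (model and base URL are placeholders); per the note above, only the first inference round streams at this stage:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

stream = client.responses.create(
    model="meta-llama/Llama-4-Scout-17B-16E-Instruct",
    input="Write a haiku about llamas.",
    stream=True,
)
for event in stream:
    # output_text deltas carry incremental text for the in-progress response.
    if event.type == "response.output_text.delta":
        print(event.delta, end="", flush=True)
```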
||
|
ce33d02443
|
fix(tools): do not index tools, only index toolgroups (#2261)
When registering an MCP endpoint, we cannot list tools (like we used to) since the MCP endpoint may be behind an auth wall. Registration can happen much sooner (via run.yaml). Instead, we list tools only when the _user_ actually requests a listing. Furthermore, we cache the list in-memory in the server. Currently, the cache is not invalidated -- we may want to periodically re-list for MCP servers. Note that clients must call `list_tools` before calling `invoke_tool` -- we rely on this critically. This will enable us to list MCP servers in run.yaml. ## Test Plan Existing tests, updated accordingly. |
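A hedged sketch of the resulting flow with the llama-stack client: register the toolgroup up front (no MCP contact), then list lazily before invoking. The toolgroup id and endpoint URI are placeholders, and the exact parameter names are assumptions:

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

# Registration no longer contacts the MCP server, so it works behind auth walls.
client.toolgroups.register(
    toolgroup_id="mcp::my-tools",
    provider_id="model-context-protocol",
    mcp_endpoint={"uri": "http://localhost:8000/sse"},
)

# Listing happens lazily, on demand, and is cached in-memory by the server;
# it must happen before any invoke_tool call.
tools = client.tools.list(toolgroup_id="mcp::my-tools")
print([t.identifier for t in tools])
```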
||
|
3faf1e4a79
|
feat: enable MCP execution in Responses impl (#2240)
## Test Plan ``` pytest -s -v 'tests/verifications/openai_api/test_responses.py' \ --provider=stack:together --model meta-llama/Llama-4-Scout-17B-16E-Instruct ``` |
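For illustration, a minimal sketch of attaching an MCP server to a response using the OpenAI Responses tool format; the server label and URL are placeholders:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

response = client.responses.create(
    model="meta-llama/Llama-4-Scout-17B-16E-Instruct",
    input="Use the available tools to answer: what time is it?",
    tools=[{
        "type": "mcp",
        "server_label": "my-mcp",                   # placeholder label
        "server_url": "http://localhost:8000/sse",  # placeholder MCP endpoint
    }],
)
print(response.output_text)
```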
||
|
15b0a67555
|
feat: add responses input items api (#2239)
# What does this PR do? TSIA ## Test Plan added integration and unit tests |
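A sketch of the new input-items listing via the OpenAI SDK (model is a placeholder):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

response = client.responses.create(
    model="meta-llama/Llama-4-Scout-17B-16E-Instruct",
    input="Hello!",
)
# Retrieve the input items recorded for that response.
items = client.responses.input_items.list(response.id)
for item in items.data:
    print(item.type)
```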
||
|
5844c2da68
|
feat: add list responses API (#2233)
# What does this PR do? This is not part of the official OpenAI API, but we'll use this for the logs UI. In order to support more filtering options, I'm adopting the newly introduced SQL store in place of the KV store. ## Test Plan Added integration/unit tests. |
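Since this endpoint is not in the official OpenAI SDK, a sketch of hitting it directly over HTTP; the path follows the stack's OpenAI-compat prefix, and the `data` envelope is an assumption about the response shape:

```python
import requests

resp = requests.get("http://localhost:8321/v1/openai/v1/responses")
resp.raise_for_status()
for r in resp.json()["data"]:
    print(r["id"], r["created_at"])
```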
||
|
558d109ab7
|
fix: signature change to match OpenAI SDK (#2237) | ||
|
d8c6ab9bfc
|
feat: add MCP tool signature to Responses API (#2232) | ||
|
e92301f2d7
|
feat(sqlite-vec): enable keyword search for sqlite-vec (#1439)
# What does this PR do? This PR introduces support for keyword based FTS5 search with BM25 relevance scoring. It makes changes to the existing EmbeddingIndex base class in order to support a search_mode and query_str parameter, that can be used for keyword based search implementations. [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan run ``` pytest llama_stack/providers/tests/vector_io/test_sqlite_vec.py -v -s --tb=short --disable-warnings --asyncio-mode=auto ``` Output: ``` pytest llama_stack/providers/tests/vector_io/test_sqlite_vec.py -v -s --tb=short --disable-warnings --asyncio-mode=auto /Users/vnarsing/miniconda3/envs/stack-client/lib/python3.10/site-packages/pytest_asyncio/plugin.py:207: PytestDeprecationWarning: The configuration option "asyncio_default_fixture_loop_scope" is unset. The event loop scope for asynchronous fixtures will default to the fixture caching scope. Future versions of pytest-asyncio will default the loop scope for asynchronous fixtures to function scope. Set the default fixture loop scope explicitly in order to avoid unexpected behavior in the future. Valid fixture loop scopes are: "function", "class", "module", "package", "session" warnings.warn(PytestDeprecationWarning(_DEFAULT_FIXTURE_LOOP_SCOPE_UNSET)) ====================================================== test session starts ======================================================= platform darwin -- Python 3.10.16, pytest-8.3.4, pluggy-1.5.0 -- /Users/vnarsing/miniconda3/envs/stack-client/bin/python cachedir: .pytest_cache metadata: {'Python': '3.10.16', 'Platform': 'macOS-14.7.4-arm64-arm-64bit', 'Packages': {'pytest': '8.3.4', 'pluggy': '1.5.0'}, 'Plugins': {'html': '4.1.1', 'metadata': '3.1.1', 'asyncio': '0.25.3', 'anyio': '4.8.0'}} rootdir: /Users/vnarsing/go/src/github/meta-llama/llama-stack configfile: pyproject.toml plugins: html-4.1.1, metadata-3.1.1, asyncio-0.25.3, anyio-4.8.0 asyncio: mode=auto, asyncio_default_fixture_loop_scope=None collected 7 items llama_stack/providers/tests/vector_io/test_sqlite_vec.py::test_add_chunks PASSED llama_stack/providers/tests/vector_io/test_sqlite_vec.py::test_query_chunks_vector PASSED llama_stack/providers/tests/vector_io/test_sqlite_vec.py::test_query_chunks_fts PASSED llama_stack/providers/tests/vector_io/test_sqlite_vec.py::test_chunk_id_conflict PASSED llama_stack/providers/tests/vector_io/test_sqlite_vec.py::test_register_vector_db PASSED llama_stack/providers/tests/vector_io/test_sqlite_vec.py::test_unregister_vector_db PASSED llama_stack/providers/tests/vector_io/test_sqlite_vec.py::test_generate_chunk_id PASSED ``` For reference, with the implementation, the fts table looks like below: ``` Chunk ID: 9fbc39ce-c729-64a2-260f-c5ec9bb2a33e, Content: Sentence 0 from document 0 Chunk ID: 94062914-3e23-44cf-1e50-9e25821ba882, Content: Sentence 1 from document 0 Chunk ID: e6cfd559-4641-33ba-6ce1-7038226495eb, Content: Sentence 2 from document 0 Chunk ID: 1383af9b-f1f0-f417-4de5-65fe9456cc20, Content: Sentence 3 from document 0 Chunk ID: 2db19b1a-de14-353b-f4e1-085e8463361c, Content: Sentence 4 from document 0 Chunk ID: 9faf986a-f028-7714-068a-1c795e8f2598, Content: Sentence 5 from document 0 Chunk ID: ef593ead-5a4a-392f-7ad8-471a50f033e8, Content: Sentence 6 from document 0 Chunk ID: e161950f-021f-7300-4d05-3166738b94cf, Content: Sentence 7 from document 0 Chunk ID: 90610fc4-67c1-e740-f043-709c5978867a, Content: Sentence 8 from document 0 Chunk ID: 97712879-6fff-98ad-0558-e9f42e6b81d3, Content: 
Sentence 9 from document 0 Chunk ID: aea70411-51df-61ba-d2f0-cb2b5972c210, Content: Sentence 0 from document 1 Chunk ID: b678a463-7b84-92b8-abb2-27e9a1977e3c, Content: Sentence 1 from document 1 Chunk ID: 27bd63da-909c-1606-a109-75bdb9479882, Content: Sentence 2 from document 1 Chunk ID: a2ad49ad-f9be-5372-e0c7-7b0221d0b53e, Content: Sentence 3 from document 1 Chunk ID: cac53bcd-1965-082a-c0f4-ceee7323fc70, Content: Sentence 4 from document 1 ``` Query results: Result 1: Sentence 5 from document 0 Result 2: Sentence 5 from document 1 Result 3: Sentence 5 from document 2 [//]: # (## Documentation) --------- Signed-off-by: Varsha Prasad Narsing <varshaprasad96@gmail.com> |
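As a standalone illustration of the underlying technique (not the provider's actual schema; table and column names here are made up), SQLite's FTS5 plus the `bm25()` auxiliary function gives keyword search with relevance ranking out of the box:

```python
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE VIRTUAL TABLE chunks USING fts5(chunk_id, content)")
conn.executemany(
    "INSERT INTO chunks VALUES (?, ?)",
    [
        ("c1", "Sentence 5 from document 0"),
        ("c2", "Sentence 7 from document 0"),
        ("c3", "Sentence 5 from document 1"),
    ],
)
# bm25() scores FTS5 matches; lower (more negative) means more relevant.
rows = conn.execute(
    "SELECT chunk_id, content FROM chunks WHERE chunks MATCH ? ORDER BY bm25(chunks)",
    ('"Sentence 5"',),
).fetchall()
print(rows)  # only c1 and c3 match the phrase query
```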
||
|
3339844fda
|
feat: Add "instructions" support to responses API (#2205)
# What does this PR do? Add support for "instructions" to the responses API. Instructions provide a way to swap out system (or developer) messages in new responses. ## Test Plan Unit tests added. Signed-off-by: Derek Higgins <derekh@redhat.com> |
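For illustration, a minimal sketch of the new parameter via the OpenAI SDK; `instructions` supplies the system/developer message for this response:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

response = client.responses.create(
    model="meta-llama/Llama-4-Scout-17B-16E-Instruct",
    instructions="You are a terse assistant; answer in one sentence.",
    input="What is Llama Stack?",
)
print(response.output_text)
```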
||
|
047303e339
|
feat: introduce APIs for retrieving chat completion requests (#2145)
# What does this PR do? This PR introduces APIs to retrieve past chat completion requests, which will be used in the LS UI. Our current `Telemetry` is ill-suited for this purpose as it's untyped, so we'd need to filter by obscure attribute names, making it brittle. Since these APIs are 'provided by stack' and don't need to be implemented by inference providers, we introduce a new InferenceProvider class, containing the existing inference protocol, which is implemented by inference providers. The APIs are OpenAI-compliant, with an additional `input_messages` field. ## Test Plan This PR just adds the APIs and marks them provided_by_stack. Start the stack server -> it doesn't crash |
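A hedged sketch of querying the stored completions over HTTP; both the path under the OpenAI-compat prefix and the `data` envelope are assumptions based on the description above:

```python
import requests

base = "http://localhost:8321/v1/openai/v1"
completions = requests.get(f"{base}/chat/completions").json()
for c in completions["data"]:
    # Each stored completion carries the extra input_messages field.
    print(c["id"], c.get("input_messages"))
```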
||
|
bb5fca9521
|
chore: more API validators (#2165)
# What does this PR do? We added: * make sure docstrings are present with 'params' and 'returns' * fail if someone sets 'returns: None' * fix the failing APIs Signed-off-by: Sébastien Han <seb@redhat.com> |
||
|
8e7ab146f8
|
feat: Adding support for customizing chunk context in RAG insertion and querying (#2134)
# What does this PR do? This PR allows users to customize the template used for chunks when inserted into the context. Additionally, this enables metadata injection into the context of an LLM for RAG. This makes a naive and crude assumption that each chunk should include the metadata; this is obviously redundant when multiple chunks are returned from the same document. In order to remove any sort of duplication of chunks, we'd have to make much more significant changes, so this is a reasonable first step that unblocks users requesting this enhancement in https://github.com/meta-llama/llama-stack/issues/1767. In the future, this can be extended to support citations. List of Changes: - `llama_stack/apis/tools/rag_tool.py` - Added `chunk_template` field in `RAGQueryConfig`. - Added `field_validator` to validate the `chunk_template` field in `RAGQueryConfig`. - Ensured the `chunk_template` field includes placeholders `{index}` and `{chunk.content}`. - Updated the `query` method to use the `chunk_template` for formatting chunk text content. - `llama_stack/providers/inline/tool_runtime/rag/memory.py` - Modified the `insert` method to pass `doc.metadata` for chunk creation. - Enhanced the `query` method to format results using `chunk_template` and exclude unnecessary metadata fields like `token_count`. - `llama_stack/providers/utils/memory/vector_store.py` - Updated `make_overlapped_chunks` to include metadata serialization and token count for both content and metadata. - Added error handling for metadata serialization issues. - `pyproject.toml` - Added `pydantic.field_validator` as a recognized `classmethod` decorator in the linting configuration. - `tests/integration/tool_runtime/test_rag_tool.py` - Refactored test assertions to separate `assert_valid_chunk_response` and `assert_valid_text_response`. - Added integration tests to validate `chunk_template` functionality with and without metadata inclusion. - Included a test case to ensure `chunk_template` validation errors are raised appropriately. - `tests/unit/rag/test_vector_store.py` - Added unit tests for `make_overlapped_chunks`, verifying chunk creation with overlapping tokens and metadata integrity. - Added tests to handle metadata serialization errors, ensuring proper exception handling. - `docs/_static/llama-stack-spec.html` - Added a new `chunk_template` field of type `string` with a default template for formatting retrieved chunks in RAGQueryConfig. - Updated the `required` fields to include `chunk_template`. - `docs/_static/llama-stack-spec.yaml` - Introduced `chunk_template` field with a default value for RAGQueryConfig. - Updated the required configuration list to include `chunk_template`. - `docs/source/building_applications/rag.md` - Documented the `chunk_template` configuration, explaining how to customize metadata formatting in RAG queries. - Added examples demonstrating the usage of the `chunk_template` field in RAG tool queries. - Highlighted default values for `RAG` agent configurations. # Resolves https://github.com/meta-llama/llama-stack/issues/1767 ## Test Plan Updated both `test_vector_store.py` and `test_rag_tool.py` and tested end-to-end with a script. I also tested the quickstart to enable this and specified this metadata: ```python document = RAGDocument( document_id="document_1", content=source, mime_type="text/html", metadata={"author": "Paul Graham", "title": "How to do great work"}, ) ``` Which produced the output below (screenshot omitted): This highlights the usefulness of the additional metadata. 
Notice how the metadata is redundant for different chunks of the same document. I think we can update that in a subsequent PR. ## Documentation I've added a brief comment about this in the documentation to outline this to users and updated the API documentation. --------- Signed-off-by: Francisco Javier Arceo <farceo@redhat.com> |
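A hedged sketch of using the new field from the llama-stack client; the vector DB id is a placeholder, and per the validator above the template must keep the `{index}` and `{chunk.content}` placeholders:

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

results = client.tool_runtime.rag_tool.query(
    content="How to do great work?",
    vector_db_ids=["my-vector-db"],  # placeholder id
    query_config={
        # Controls how each retrieved chunk is rendered into the LLM context.
        "chunk_template": "Result {index}\nContent: {chunk.content}\n",
    },
)
```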
||
|
8e316c9b1e
|
feat: function tools in OpenAI Responses (#2094)
# What does this PR do? This is a combination of what was previously 3 separate PRs - #2069, #2075, and #2083. It turns out all 3 of those are needed to land a working function calling Responses implementation. The web search builtin tool was already working, but this wires in support for custom function calling. I ended up combining all three into one PR because they all had lots of merge conflicts, both with each other and with #1806, which just landed, and because landing any of them individually would have left only a partially working implementation merged. The new things added here are: * Storing of input items from previous responses and restoring of those input items when adding previous responses to the conversation state * Handling of multiple input item message roles, not just "user" messages. * Support for custom tools passed into the Responses API to enable function calling outside of just the builtin websearch tool. Closes #2074 Closes #2080 ## Test Plan ### Unit Tests Several new unit tests were added, and they all pass. Ran via: ``` python -m pytest -s -v tests/unit/providers/agents/meta_reference/test_openai_responses.py ``` ### Responses API Verification Tests I ran our verification run.yaml against multiple providers to ensure we were getting a decent pass rate. Specifically, I ensured the new custom tool verification test passed across multiple providers and that the multi-turn examples passed across at least some of the providers (some providers struggle with the multi-turn workflows still). Running the stack setup for verification testing: ``` llama stack run --image-type venv tests/verifications/openai-api-verification-run.yaml ``` Together, passing 100% as an example: ``` pytest -s -v 'tests/verifications/openai_api/test_responses.py' --provider=together-llama-stack ``` ## Documentation We will need to start documenting the OpenAI APIs, but for now the Responses stuff is still rapidly evolving so delaying that. --------- Signed-off-by: Derek Higgins <derekh@redhat.com> Signed-off-by: Ben Browning <bbrownin@redhat.com> Co-authored-by: Derek Higgins <derekh@redhat.com> Co-authored-by: Ashwin Bharambe <ashwin.bharambe@gmail.com> |
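For illustration, a minimal sketch of passing a custom function tool through the Responses API; `get_weather` is a hypothetical function, and the model is a placeholder:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

response = client.responses.create(
    model="meta-llama/Llama-4-Scout-17B-16E-Instruct",
    input="What's the weather in Tokyo?",
    tools=[{
        "type": "function",
        "name": "get_weather",  # hypothetical function
        "description": "Get current weather for a city",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    }],
)
# Expect a function_call output item rather than plain text.
for item in response.output:
    print(item.type)
```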
||
|
fe5f5e530c
|
feat: add metrics query API (#1394)
# What does this PR do? Adds the API to query metrics from telemetry. ## Test Plan llama stack run ~/.llama/distributions/fireworks/fireworks-run.yaml --------- Co-authored-by: Ashwin Bharambe <ashwin.bharambe@gmail.com> |
||
|
c91e3552a3
|
feat: implementation for agent/session list and describe (#1606)
Create a new agent: ``` curl --request POST \ --url http://localhost:8321/v1/agents \ --header 'Accept: application/json' \ --header 'Content-Type: application/json' \ --data '{ "agent_config": { "sampling_params": { "strategy": { "type": "greedy" }, "max_tokens": 0, "repetition_penalty": 1 }, "input_shields": [ "string" ], "output_shields": [ "string" ], "toolgroups": [ "string" ], "client_tools": [ { "name": "string", "description": "string", "parameters": [ { "name": "string", "parameter_type": "string", "description": "string", "required": true, "default": null } ], "metadata": { "property1": null, "property2": null } } ], "tool_choice": "auto", "tool_prompt_format": "json", "tool_config": { "tool_choice": "auto", "tool_prompt_format": "json", "system_message_behavior": "append" }, "max_infer_iters": 10, "model": "string", "instructions": "string", "enable_session_persistence": false, "response_format": { "type": "json_schema", "json_schema": { "property1": null, "property2": null } } } }' ``` Get agent: ``` curl http://127.0.0.1:8321/v1/agents/9abad4ab-2c77-45f9-9d16-46b79d2bea1f {"agent_id":"9abad4ab-2c77-45f9-9d16-46b79d2bea1f","agent_config":{"sampling_params":{"strategy":{"type":"greedy"},"max_tokens":0,"repetition_penalty":1.0},"input_shields":["string"],"output_shields":["string"],"toolgroups":["string"],"client_tools":[{"name":"string","description":"string","parameters":[{"name":"string","parameter_type":"string","description":"string","required":true,"default":null}],"metadata":{"property1":null,"property2":null}}],"tool_choice":"auto","tool_prompt_format":"json","tool_config":{"tool_choice":"auto","tool_prompt_format":"json","system_message_behavior":"append"},"max_infer_iters":10,"model":"string","instructions":"string","enable_session_persistence":false,"response_format":{"type":"json_schema","json_schema":{"property1":null,"property2":null}}},"created_at":"2025-03-12T16:18:28.369144Z"}% ``` List agents: ``` curl http://127.0.0.1:8321/v1/agents|jq % Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 100 1680 100 1680 0 0 498k 0 --:--:-- --:--:-- --:--:-- 546k { "data": [ { "agent_id": "9abad4ab-2c77-45f9-9d16-46b79d2bea1f", "agent_config": { "sampling_params": { "strategy": { "type": "greedy" }, "max_tokens": 0, "repetition_penalty": 1.0 }, "input_shields": [ "string" ], "output_shields": [ "string" ], "toolgroups": [ "string" ], "client_tools": [ { "name": "string", "description": "string", "parameters": [ { "name": "string", "parameter_type": "string", "description": "string", "required": true, "default": null } ], "metadata": { "property1": null, "property2": null } } ], "tool_choice": "auto", "tool_prompt_format": "json", "tool_config": { "tool_choice": "auto", "tool_prompt_format": "json", "system_message_behavior": "append" }, "max_infer_iters": 10, "model": "string", "instructions": "string", "enable_session_persistence": false, "response_format": { "type": "json_schema", "json_schema": { "property1": null, "property2": null } } }, "created_at": "2025-03-12T16:18:28.369144Z" }, { "agent_id": "a6643aaa-96dd-46db-a405-333dc504b168", "agent_config": { "sampling_params": { "strategy": { "type": "greedy" }, "max_tokens": 0, "repetition_penalty": 1.0 }, "input_shields": [ "string" ], "output_shields": [ "string" ], "toolgroups": [ "string" ], "client_tools": [ { "name": "string", "description": "string", "parameters": [ { "name": "string", "parameter_type": "string", "description": "string", "required": true, "default": null 
} ], "metadata": { "property1": null, "property2": null } } ], "tool_choice": "auto", "tool_prompt_format": "json", "tool_config": { "tool_choice": "auto", "tool_prompt_format": "json", "system_message_behavior": "append" }, "max_infer_iters": 10, "model": "string", "instructions": "string", "enable_session_persistence": false, "response_format": { "type": "json_schema", "json_schema": { "property1": null, "property2": null } } }, "created_at": "2025-03-12T16:17:12.811273Z" } ] } ``` Create sessions: ``` curl --request POST \ --url http://localhost:8321/v1/agents/{agent_id}/session \ --header 'Accept: application/json' \ --header 'Content-Type: application/json' \ --data '{ "session_name": "string" }' ``` List sessions: ``` curl http://127.0.0.1:8321/v1/agents/9abad4ab-2c77-45f9-9d16-46b79d2bea1f/sessions|jq % Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 100 263 100 263 0 0 90099 0 --:--:-- --:--:-- --:--:-- 128k [ { "session_id": "2b15c4fc-e348-46c1-ae32-f6d424441ac1", "session_name": "string", "turns": [], "started_at": "2025-03-12T17:19:17.784328" }, { "session_id": "9432472d-d483-4b73-b682-7b1d35d64111", "session_name": "string", "turns": [], "started_at": "2025-03-12T17:19:19.885834" } ] ``` Signed-off-by: Sébastien Han <seb@redhat.com> |
||
|
1a529705da
|
chore: more mypy fixes (#2029)
# What does this PR do? Mainly tried to cover the entire llama_stack/apis directory; we only have one module left. Some excludes were just no-ops. Signed-off-by: Sébastien Han <seb@redhat.com> |
||
|
64829947d0
|
feat: Add temperature support to responses API (#2065)
# What does this PR do? Add support for the temperature parameter to the responses API. ## Test Plan Manually tested the simple case; unit tests added for the simple case and for tool calls. Signed-off-by: Derek Higgins <derekh@redhat.com> |
||
|
8dfce2f596
|
feat: OpenAI Responses API (#1989)
# What does this PR do? This provides an initial [OpenAI Responses API](https://platform.openai.com/docs/api-reference/responses) implementation. The API is not yet complete, and this is more a proof-of-concept to show how we can store responses in our key-value stores and use them to support the Responses API concepts like `previous_response_id`. ## Test Plan I've added a new `tests/integration/openai_responses/test_openai_responses.py` as part of a test-driven development for this new API. I'm only testing this locally with the remote-vllm provider for now, but it should work with any of our inference providers since the only API it requires out of the inference provider is the `openai_chat_completion` endpoint. ``` VLLM_URL="http://localhost:8000/v1" \ INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct" \ llama stack build --template remote-vllm --image-type venv --run ``` ``` LLAMA_STACK_CONFIG="http://localhost:8321" \ python -m pytest -v \ tests/integration/openai_responses/test_openai_responses.py \ --text-model "meta-llama/Llama-3.2-3B-Instruct" ``` --------- Signed-off-by: Ben Browning <bbrownin@redhat.com> Co-authored-by: Ashwin Bharambe <ashwin.bharambe@gmail.com> |
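A minimal sketch of the `previous_response_id` concept this implementation stores responses to support (model and inputs are placeholders):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

first = client.responses.create(
    model="meta-llama/Llama-3.2-3B-Instruct",
    input="My name is Ada.",
)
# The stored response is looked up and its items are replayed into
# the conversation state for the follow-up turn.
second = client.responses.create(
    model="meta-llama/Llama-3.2-3B-Instruct",
    input="What is my name?",
    previous_response_id=first.id,
)
print(second.output_text)
```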
||
|
cb874287a4
|
fix: resync api spec (#1987) | ||
|
326cbba579
|
feat(agents): add agent naming functionality (#1922)
# What does this PR do? Allow users to name an agent and use the name in telemetry instead of relying on randomly generated agent_ids. This improves the developer experience by making it easier to find specific agents in telemetry logs. Closes #1832 ## Test Plan - Added tests to verify the agent name is properly stored and retrieved - Ran `uv run -- pytest -v tests/integration/telemetry/test_telemetry.py::test_agent_name_filtering` from the root of the project and made sure the tests pass - Ran `uv run -- pytest -v tests/integration/telemetry/test_telemetry.py::test_agent_query_spans` to verify existing code without agent names still works correctly ## Use Example ``` agent = Agent( llama_stack_client, model=text_model_id, name="CustomerSupportAgent", # New parameter instructions="You are a helpful customer support assistant" ) session_id = agent.create_session(f"test-session-{uuid4()}") ``` ## Implementation Notes - Agent names are optional string parameters with no additional validation - Names are not required to be unique - multiple agents can have the same name - The agent_id remains the unique identifier for an agent --------- Co-authored-by: raghotham <raghotham@gmail.com> |
||
|
5b8e75b392
|
fix: OpenAI spec cleanup for assistant requests (#1963)
# What does this PR do? Some of our multi-turn verification tests were failing because I had accidentally marked content as a required field in the OpenAI chat completion request assistant messages, but it's actually optional. It is required for messages from other roles, but assistant is explicitly allowed to be optional. Similarly, the assistant message tool_calls field should default to None instead of an empty list. These two changes get the openai-llama-stack verification test back to 100% passing, just like it passes 100% when not behind Llama Stack. They also increase the pass rate of some of the other providers in the verification test, but don't get them to 100%. ## Test Plan I started a Llama Stack server setup to run all the verification tests (requires OPENAI_API_KEY env variable) ``` llama stack run --image-type venv tests/verifications/openai-api-verification-run.yaml ``` Then, I manually ran the verification tests to see which were failing, fix them, and ran them again after these changes to ensure they were all passing. ``` python -m pytest -s -v tests/verifications/openai_api/test_chat_completion.py --provider=openai-llama-stack ``` Signed-off-by: Ben Browning <bbrownin@redhat.com> |
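To make the fixed request shape concrete, a sketch of a valid message list; note the assistant message omits `content` entirely, which the spec explicitly allows:

```python
messages = [
    {"role": "user", "content": "What's the weather in Tokyo?"},
    {
        # Assistant messages may omit "content" when carrying tool_calls.
        "role": "assistant",
        "tool_calls": [{
            "id": "call_1",
            "type": "function",
            "function": {"name": "get_weather", "arguments": '{"city": "Tokyo"}'},
        }],
    },
    {"role": "tool", "tool_call_id": "call_1", "content": "22C and sunny"},
]
```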
||
|
7641a5cd0b
|
fix: 100% OpenAI API verification for together and fireworks (#1946)
# What does this PR do? TLDR: Changes needed to get 100% passing tests for OpenAI API verification tests when run against Llama Stack with the `together`, `fireworks`, and `openai` providers. And `groq` is better than before, at 88% passing. This cleans up the OpenAI API support for image message types (specifically `image_url` types) and handling of the `response_format` chat completion parameter. Both of these required a few more Pydantic model definitions in our Inference API, just to move from the not-quite-right stubs I had in place to something fleshed out to match the actual OpenAI API specs. As part of testing this, I also found and fixed a bug in the litellm implementation of openai_completion and openai_chat_completion, so the providers based on those should actually be working now. The method `prepare_openai_completion_params` in `llama_stack/providers/utils/inference/openai_compat.py` was improved to actually recursively clean up input parameters, including handling of lists, dicts, and dumping of Pydantic models to dicts. These changes were required to get to 100% passing tests on the OpenAI API verification against the `openai` provider. With the above, the together.ai provider was passing as well as it is without Llama Stack. But, since we have Llama Stack in the middle, I took the opportunity to clean up the together.ai provider so that it now also passes the OpenAI API spec tests we have at 100%. That means together.ai is now passing our verification test better when using an OpenAI client talking to Llama Stack than it is when hitting together.ai directly, without Llama Stack in the middle. And, another round of work for Fireworks to improve translation of incoming OpenAI chat completion requests to Llama Stack chat completion requests gets the fireworks provider passing at 100%. The server-side fireworks.ai tool calling support with OpenAI chat completions and Llama 4 models isn't great yet, but by pointing the OpenAI clients at Llama Stack's API we can clean things up and get everything working as expected for Llama 4 models. ## Test Plan ### OpenAI API Verification Tests I ran the OpenAI API verification tests as below and 100% of the tests passed. First, start a Llama Stack server that runs the `openai` provider with the `gpt-4o` and `gpt-4o-mini` models deployed. There's not a template setup to do this out of the box, so I added a `tests/verifications/openai-api-verification-run.yaml` to do this. First, ensure you have the necessary API key environment variables set: ``` export TOGETHER_API_KEY="..." export FIREWORKS_API_KEY="..." export OPENAI_API_KEY="..." ``` Then, run a Llama Stack server that serves up all these providers: ``` llama stack run \ --image-type venv \ tests/verifications/openai-api-verification-run.yaml ``` Finally, generate a new verification report against all these providers, both with and without the Llama Stack server in the middle. ``` python tests/verifications/generate_report.py \ --run-tests \ --provider \ together \ fireworks \ groq \ openai \ together-llama-stack \ fireworks-llama-stack \ groq-llama-stack \ openai-llama-stack ``` You'll see that most of the configurations with Llama Stack in the middle now pass at 100%, even though some of them do not pass at 100% when hitting the backend provider's API directly with an OpenAI client. 
### OpenAI Completion Integration Tests with vLLM: I also ran the smaller `test_openai_completion.py` test suite (that's not yet merged with the verification tests) on several of the providers, since I had to adjust the method signature of openai_chat_completion a bit and thus had to touch lots of these providers to match. Here are the tests I ran there, all passing: ``` VLLM_URL="http://localhost:8000/v1" INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct" llama stack build --template remote-vllm --image-type venv --run ``` in another terminal ``` LLAMA_STACK_CONFIG=http://localhost:8321 INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct" python -m pytest -v tests/integration/inference/test_openai_completion.py --text-model "meta-llama/Llama-3.2-3B-Instruct" ``` ### OpenAI Completion Integration Tests with ollama ``` INFERENCE_MODEL="llama3.2:3b-instruct-q8_0" llama stack build --template ollama --image-type venv --run ``` in another terminal ``` LLAMA_STACK_CONFIG=http://localhost:8321 INFERENCE_MODEL="llama3.2:3b-instruct-q8_0" python -m pytest -v tests/integration/inference/test_openai_completion.py --text-model "llama3.2:3b-instruct-q8_0" ``` ### OpenAI Completion Integration Tests with together.ai ``` INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct-Turbo" llama stack build --template together --image-type venv --run ``` in another terminal ``` LLAMA_STACK_CONFIG=http://localhost:8321 INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct-Turbo" python -m pytest -v tests/integration/inference/test_openai_completion.py --text-model "meta-llama/Llama-3.2-3B-Instruct-Turbo" ``` ### OpenAI Completion Integration Tests with fireworks.ai ``` INFERENCE_MODEL="meta-llama/Llama-3.1-8B-Instruct" llama stack build --template fireworks --image-type venv --run ``` in another terminal ``` LLAMA_STACK_CONFIG=http://localhost:8321 INFERENCE_MODEL="meta-llama/Llama-3.1-8B-Instruct" python -m pytest -v tests/integration/inference/test_openai_completion.py --text-model "meta-llama/Llama-3.1-8B-Instruct" ``` --------- Signed-off-by: Ben Browning <bbrownin@redhat.com> |
||
|
69554158fa
|
feat: add health to all providers through providers endpoint (#1418)
The `/v1/providers` now reports the health status of each provider when implemented. ``` curl -L http://127.0.0.1:8321/v1/providers|jq % Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 100 4072 100 4072 0 0 246k 0 --:--:-- --:--:-- --:--:-- 248k { "data": [ { "api": "inference", "provider_id": "ollama", "provider_type": "remote::ollama", "config": { "url": "http://localhost:11434" }, "health": { "status": "OK" } }, { "api": "vector_io", "provider_id": "faiss", "provider_type": "inline::faiss", "config": { "kvstore": { "type": "sqlite", "namespace": null, "db_path": "/Users/leseb/.llama/distributions/ollama/faiss_store.db" } }, "health": { "status": "Not Implemented", "message": "Provider does not implement health check" } }, { "api": "safety", "provider_id": "llama-guard", "provider_type": "inline::llama-guard", "config": { "excluded_categories": [] }, "health": { "status": "Not Implemented", "message": "Provider does not implement health check" } }, { "api": "agents", "provider_id": "meta-reference", "provider_type": "inline::meta-reference", "config": { "persistence_store": { "type": "sqlite", "namespace": null, "db_path": "/Users/leseb/.llama/distributions/ollama/agents_store.db" } }, "health": { "status": "Not Implemented", "message": "Provider does not implement health check" } }, { "api": "telemetry", "provider_id": "meta-reference", "provider_type": "inline::meta-reference", "config": { "service_name": "llama-stack", "sinks": "console,sqlite", "sqlite_db_path": "/Users/leseb/.llama/distributions/ollama/trace_store.db" }, "health": { "status": "Not Implemented", "message": "Provider does not implement health check" } }, { "api": "eval", "provider_id": "meta-reference", "provider_type": "inline::meta-reference", "config": { "kvstore": { "type": "sqlite", "namespace": null, "db_path": "/Users/leseb/.llama/distributions/ollama/meta_reference_eval.db" } }, "health": { "status": "Not Implemented", "message": "Provider does not implement health check" } }, { "api": "datasetio", "provider_id": "huggingface", "provider_type": "remote::huggingface", "config": { "kvstore": { "type": "sqlite", "namespace": null, "db_path": "/Users/leseb/.llama/distributions/ollama/huggingface_datasetio.db" } }, "health": { "status": "Not Implemented", "message": "Provider does not implement health check" } }, { "api": "datasetio", "provider_id": "localfs", "provider_type": "inline::localfs", "config": { "kvstore": { "type": "sqlite", "namespace": null, "db_path": "/Users/leseb/.llama/distributions/ollama/localfs_datasetio.db" } }, "health": { "status": "Not Implemented", "message": "Provider does not implement health check" } }, { "api": "scoring", "provider_id": "basic", "provider_type": "inline::basic", "config": {}, "health": { "status": "Not Implemented", "message": "Provider does not implement health check" } }, { "api": "scoring", "provider_id": "llm-as-judge", "provider_type": "inline::llm-as-judge", "config": {}, "health": { "status": "Not Implemented", "message": "Provider does not implement health check" } }, { "api": "scoring", "provider_id": "braintrust", "provider_type": "inline::braintrust", "config": { "openai_api_key": "********" }, "health": { "status": "Not Implemented", "message": "Provider does not implement health check" } }, { "api": "tool_runtime", "provider_id": "brave-search", "provider_type": "remote::brave-search", "config": { "api_key": "********", "max_results": 3 }, "health": { "status": "Not Implemented", "message": "Provider does 
not implement health check" } }, { "api": "tool_runtime", "provider_id": "tavily-search", "provider_type": "remote::tavily-search", "config": { "api_key": "********", "max_results": 3 }, "health": { "status": "Not Implemented", "message": "Provider does not implement health check" } }, { "api": "tool_runtime", "provider_id": "code-interpreter", "provider_type": "inline::code-interpreter", "config": {}, "health": { "status": "Not Implemented", "message": "Provider does not implement health check" } }, { "api": "tool_runtime", "provider_id": "rag-runtime", "provider_type": "inline::rag-runtime", "config": {}, "health": { "status": "Not Implemented", "message": "Provider does not implement health check" } }, { "api": "tool_runtime", "provider_id": "model-context-protocol", "provider_type": "remote::model-context-protocol", "config": {}, "health": { "status": "Not Implemented", "message": "Provider does not implement health check" } }, { "api": "tool_runtime", "provider_id": "wolfram-alpha", "provider_type": "remote::wolfram-alpha", "config": { "api_key": "********" }, "health": { "status": "Not Implemented", "message": "Provider does not implement health check" } } ] } ``` Per providers too: ``` curl -L http://127.0.0.1:8321/v1/providers/ollama {"api":"inference","provider_id":"ollama","provider_type":"remote::ollama","config":{"url":"http://localhost:11434"},"health":{"status":"OK"}} ``` Signed-off-by: Sébastien Han <seb@redhat.com> |
||
|
f34f22f8c7
|
feat: add batch inference API to llama stack inference (#1945)
# What does this PR do? This PR adds two methods to the Inference API: - `batch_completion` - `batch_chat_completion` The motivation is for evaluations targeting a local inference engine (like meta-reference or vllm) where batch APIs provide a substantial amount of acceleration. Why did I not add this to `Api.batch_inference` though? That just resulted in a _lot_ more book-keeping given the structure of Llama Stack. Had I done that, I would have needed to create a notion of a "batch model" resource, set up routing based on that, etc. This does not sound ideal. So what's the future of the batch inference API? I am not sure. Maybe we can keep it for true _asynchronous_ execution. So you can submit requests, and it can return a Job instance, etc. ## Test Plan Run meta-reference-gpu using: ```bash export INFERENCE_MODEL=meta-llama/Llama-4-Scout-17B-16E-Instruct export INFERENCE_CHECKPOINT_DIR=../checkpoints/Llama-4-Scout-17B-16E-Instruct-20250331210000 export MODEL_PARALLEL_SIZE=4 export MAX_BATCH_SIZE=32 export MAX_SEQ_LEN=6144 LLAMA_MODELS_DEBUG=1 llama stack run meta-reference-gpu ``` Then run the batch inference test case. |
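A heavily hedged sketch of calling the new methods; it assumes the Python client exposes them under these names and that `messages_batch` is a list of message lists, neither of which is confirmed here:

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

batch = client.inference.batch_chat_completion(  # assumed client method name
    model_id="meta-llama/Llama-4-Scout-17B-16E-Instruct",
    messages_batch=[
        [{"role": "user", "content": "What is 2 + 2?"}],
        [{"role": "user", "content": "Name a prime number."}],
    ],
)
```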
||
|
0751a960a5
|
feat: make training config fields optional (#1861)
# What does this PR do? Today, supervised_fine_tune itself and the `TrainingConfig` class have a bunch of required fields that a provider implementation might not need. For example, if a provider wants to handle hyperparameters in its configuration as well as any type of dataset retrieval, optimizer or LoRA config, a user will still need to pass in a virtually empty `DataConfig`, `OptimizerConfig` and `AlgorithmConfig` in some cases. Many of these fields are intended specifically for llama models and for knobs used when customizing inline. Adding remote post_training providers will require loosening these arguments, or forcing users to pass in empty objects to satisfy the pydantic models. Signed-off-by: Charlie Doern <cdoern@redhat.com> |
||
|
2b2db5fbda
|
feat: OpenAI-Compatible models, completions, chat/completions (#1894)
# What does this PR do? This stubs in some OpenAI server-side compatibility with three new endpoints: /v1/openai/v1/models /v1/openai/v1/completions /v1/openai/v1/chat/completions This gives common inference apps using OpenAI clients the ability to talk to Llama Stack using an endpoint like http://localhost:8321/v1/openai/v1 . The two "v1" instances in there aren't awesome, but the thinking is that Llama Stack's API is v1 and then our OpenAI compatibility layer is compatible with OpenAI V1. And, some OpenAI clients implicitly assume the URL ends with "v1", so this gives maximum compatibility. The openai models endpoint is implemented in the routing layer, and just returns all the models Llama Stack knows about. The following providers should be working with the new OpenAI completions and chat/completions API: * remote::anthropic (untested) * remote::cerebras-openai-compat (untested) * remote::fireworks (tested) * remote::fireworks-openai-compat (untested) * remote::gemini (untested) * remote::groq-openai-compat (untested) * remote::nvidia (tested) * remote::ollama (tested) * remote::openai (untested) * remote::passthrough (untested) * remote::sambanova-openai-compat (untested) * remote::together (tested) * remote::together-openai-compat (untested) * remote::vllm (tested) The goal is to support this for every inference provider, proxying directly to the provider's OpenAI endpoint for OpenAI-compatible providers. For providers that don't have an OpenAI-compatible API, we'll add a mixin to translate incoming OpenAI requests to Llama Stack inference requests and translate the Llama Stack inference responses to OpenAI responses. This is related to #1817 but is a bit larger in scope than just chat completions, as I have real use-cases that need the older completions API as well. ## Test Plan ### vLLM ``` VLLM_URL="http://localhost:8000/v1" INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct" llama stack build --template remote-vllm --image-type venv --run LLAMA_STACK_CONFIG=http://localhost:8321 INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct" python -m pytest -v tests/integration/inference/test_openai_completion.py --text-model "meta-llama/Llama-3.2-3B-Instruct" ``` ### ollama ``` INFERENCE_MODEL="llama3.2:3b-instruct-q8_0" llama stack build --template ollama --image-type venv --run LLAMA_STACK_CONFIG=http://localhost:8321 INFERENCE_MODEL="llama3.2:3b-instruct-q8_0" python -m pytest -v tests/integration/inference/test_openai_completion.py --text-model "llama3.2:3b-instruct-q8_0" ``` ## Documentation Run a Llama Stack distribution that uses one of the providers mentioned in the list above. Then, use your favorite OpenAI client to send completion or chat completion requests with the base_url set to http://localhost:8321/v1/openai/v1 . Replace "localhost:8321" with the host and port of your Llama Stack server, if different. --------- Signed-off-by: Ben Browning <bbrownin@redhat.com> |
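Following the Documentation note above, a minimal sketch of pointing a stock OpenAI client at the new endpoints (the model name is a placeholder for one registered with your stack):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

# /models: everything Llama Stack knows about.
print([m.id for m in client.models.list()])

# /chat/completions through the compatibility layer.
chat = client.chat.completions.create(
    model="meta-llama/Llama-3.2-3B-Instruct",
    messages=[{"role": "user", "content": "Hello!"}],
)
print(chat.choices[0].message.content)
```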
||
|
0a895c70d1
|
fix(api): don't return list for runtime tools (#1686)
# What does this PR do? Don't return a bare list for runtime tools. Instead, return a Response object for pagination and consistency with other APIs. --------- Signed-off-by: Ihar Hrachyshka <ihar.hrachyshka@gmail.com> |