Merge branch 'meta-llama:main' into qdrant

commit 1575578446
Author: Anush (committed by GitHub)
Date: 2024-10-22 21:45:31 +05:30
101 changed files with 3310 additions and 722 deletions


@@ -31,4 +31,4 @@ providers:
  persistence_store:
    namespace: null
    type: sqlite
-    db_path: /Users/ashwin/.llama/runtime/kvstore.db
+    db_path: ~/.llama/runtime/kvstore.db
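Editor's note on the hunk above: the hard-coded `/Users/ashwin` home directory becomes a `~` path, which is only portable if the consumer expands the tilde at load time. A minimal sketch of the standard expansion this change presumably relies on (the surrounding loader code is an assumption, it is not shown in this diff):

import os

# Assumption: llama-stack expands "~" when it resolves db_path from the
# run config. Python's standard expansion, for reference:
db_path = os.path.expanduser("~/.llama/runtime/kvstore.db")
print(db_path)  # e.g. /home/<user>/.llama/runtime/kvstore.db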


@@ -64,6 +64,24 @@ def search_query_messages():
    ]


+@pytest.fixture
+def attachment_message():
+    return [
+        UserMessage(
+            content="I am attaching some documentation for Torchtune. Help me answer questions I will ask next.",
+        ),
+    ]
+
+
+@pytest.fixture
+def query_attachment_messages():
+    return [
+        UserMessage(
+            content="What are the top 5 topics that were explained? Only list succinct bullet points."
+        ),
+    ]
+
+
@pytest.mark.asyncio
async def test_create_agent_turn(agents_settings, sample_messages):
    agents_impl = agents_settings["impl"]
@@ -98,7 +116,7 @@ async def test_create_agent_turn(agents_settings, sample_messages):
    )

    turn_response = [
-        chunk async for chunk in agents_impl.create_agent_turn(**turn_request)
+        chunk async for chunk in await agents_impl.create_agent_turn(**turn_request)
    ]

    assert len(turn_response) > 0
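Editor's note: the recurring change in this file (and in the inference tests further down) is the added `await`. The streaming entry points are now coroutines that return an async generator, rather than being async generators themselves, so callers must await the call before iterating. A minimal self-contained sketch of the new calling convention (names are illustrative, not the library's API):

import asyncio
from typing import AsyncGenerator

async def create_agent_turn(**turn_request) -> AsyncGenerator[str, None]:
    # A coroutine that *returns* an async generator, mirroring the new convention.
    async def stream() -> AsyncGenerator[str, None]:
        for chunk in ("turn_start", "step_delta", "turn_complete"):
            yield chunk
    return stream()

async def main() -> None:
    # Old style would have been: [c async for c in create_agent_turn(...)]
    # New style: await the call first, then iterate the returned generator.
    chunks = [c async for c in await create_agent_turn(stream=True)]
    assert len(chunks) > 0

asyncio.run(main())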
@@ -123,6 +141,89 @@ async def test_create_agent_turn(agents_settings, sample_messages):
    assert len(final_event.turn.output_message.content) > 0


+@pytest.mark.asyncio
+async def test_rag_agent_as_attachments(
+    agents_settings, attachment_message, query_attachment_messages
+):
+    urls = [
+        "memory_optimizations.rst",
+        "chat.rst",
+        "llama3.rst",
+        "datasets.rst",
+        "qat_finetune.rst",
+        "lora_finetune.rst",
+    ]
+
+    attachments = [
+        Attachment(
+            content=f"https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/{url}",
+            mime_type="text/plain",
+        )
+        for i, url in enumerate(urls)
+    ]
+
+    agents_impl = agents_settings["impl"]
+
+    agent_config = AgentConfig(
+        model=agents_settings["common_params"]["model"],
+        instructions=agents_settings["common_params"]["instructions"],
+        enable_session_persistence=True,
+        sampling_params=SamplingParams(temperature=0.7, top_p=0.95),
+        input_shields=[],
+        output_shields=[],
+        tools=[
+            MemoryToolDefinition(
+                memory_bank_configs=[],
+                query_generator_config={
+                    "type": "default",
+                    "sep": " ",
+                },
+                max_tokens_in_context=4096,
+                max_chunks=10,
+            ),
+        ],
+        max_infer_iters=5,
+    )
+
+    create_response = await agents_impl.create_agent(agent_config)
+    agent_id = create_response.agent_id
+
+    # Create a session
+    session_create_response = await agents_impl.create_agent_session(
+        agent_id, "Test Session"
+    )
+    session_id = session_create_response.session_id
+
+    # Create and execute a turn
+    turn_request = dict(
+        agent_id=agent_id,
+        session_id=session_id,
+        messages=attachment_message,
+        attachments=attachments,
+        stream=True,
+    )
+
+    turn_response = [
+        chunk async for chunk in await agents_impl.create_agent_turn(**turn_request)
+    ]
+
+    assert len(turn_response) > 0
+
+    # Create a second turn querying the agent
+    turn_request = dict(
+        agent_id=agent_id,
+        session_id=session_id,
+        messages=query_attachment_messages,
+        stream=True,
+    )
+
+    turn_response = [
+        chunk async for chunk in await agents_impl.create_agent_turn(**turn_request)
+    ]
+
+    assert len(turn_response) > 0
+
+
@pytest.mark.asyncio
async def test_create_agent_turn_with_brave_search(
    agents_settings, search_query_messages
@@ -169,7 +270,7 @@ async def test_create_agent_turn_with_brave_search(
    )

    turn_response = [
-        chunk async for chunk in agents_impl.create_agent_turn(**turn_request)
+        chunk async for chunk in await agents_impl.create_agent_turn(**turn_request)
    ]

    assert len(turn_response) > 0


@@ -4,6 +4,10 @@ providers:
    config:
      host: localhost
      port: 11434
+  - provider_id: meta-reference
+    provider_type: meta-reference
+    config:
+      model: Llama3.2-1B-Instruct
  - provider_id: test-tgi
    provider_type: remote::tgi
    config:


@@ -5,6 +5,7 @@
# the root directory of this source tree.

import itertools
+import os

import pytest
import pytest_asyncio
@@ -50,14 +51,17 @@ def get_expected_stop_reason(model: str):
    return StopReason.end_of_message if "Llama3.1" in model else StopReason.end_of_turn


+if "MODEL_IDS" not in os.environ:
+    MODEL_IDS = [Llama_8B, Llama_3B]
+else:
+    MODEL_IDS = os.environ["MODEL_IDS"].split(",")
+
+
# This is going to create multiple Stack impls without tearing down the previous one
# Fix that!
@pytest_asyncio.fixture(
    scope="session",
-    params=[
-        {"model": Llama_8B},
-        {"model": Llama_3B},
-    ],
+    params=[{"model": m} for m in MODEL_IDS],
+    ids=lambda d: d["model"],
)
async def inference_settings(request):
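Editor's note: with this hunk the fixture's model matrix comes from MODEL_IDS, so it can be overridden without editing the test. A short sketch of the override path (the model ids here are examples, not a statement of what the suite supports):

import os

# Assumption: MODEL_IDS is a comma-separated list exported before pytest runs,
# e.g.  MODEL_IDS="Llama3.1-8B-Instruct,Llama3.2-3B-Instruct" pytest ...
os.environ["MODEL_IDS"] = "Llama3.1-8B-Instruct,Llama3.2-3B-Instruct"

MODEL_IDS = os.environ["MODEL_IDS"].split(",")
params = [{"model": m} for m in MODEL_IDS]
print(params)
# -> [{'model': 'Llama3.1-8B-Instruct'}, {'model': 'Llama3.2-3B-Instruct'}]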
@@ -122,6 +126,48 @@ async def test_model_list(inference_settings):
    assert model_def.identifier == params["model"]


+@pytest.mark.asyncio
+async def test_completion(inference_settings):
+    inference_impl = inference_settings["impl"]
+    params = inference_settings["common_params"]
+
+    provider = inference_impl.routing_table.get_provider_impl(params["model"])
+    if provider.__provider_spec__.provider_type not in (
+        "meta-reference",
+        "remote::ollama",
+    ):
+        pytest.skip("Other inference providers don't support completion() yet")
+
+    response = await inference_impl.completion(
+        content="Roses are red,",
+        stream=False,
+        model=params["model"],
+        sampling_params=SamplingParams(
+            max_tokens=50,
+        ),
+    )
+
+    assert isinstance(response, CompletionResponse)
+    assert "violets are blue" in response.content
+
+    chunks = [
+        r
+        async for r in await inference_impl.completion(
+            content="Roses are red,",
+            stream=True,
+            model=params["model"],
+            sampling_params=SamplingParams(
+                max_tokens=50,
+            ),
+        )
+    ]
+
+    assert all(isinstance(chunk, CompletionResponseStreamChunk) for chunk in chunks)
+    assert len(chunks) == 51
+    last = chunks[-1]
+    assert last.stop_reason == StopReason.out_of_tokens
+
+
@pytest.mark.asyncio
async def test_chat_completion_non_streaming(inference_settings, sample_messages):
    inference_impl = inference_settings["impl"]
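Editor's note on the streaming assertions in test_completion above: with max_tokens=50 the test expects exactly 51 chunks, which presumably means one chunk per generated token plus a terminal chunk carrying stop_reason (StopReason.out_of_tokens). A toy model of that accounting (an assumption about the chunking, not confirmed by this diff):

max_tokens = 50
token_chunks = [f"token-{i}" for i in range(max_tokens)]
chunks = token_chunks + ["final-chunk-with-stop-reason"]  # assumed terminal chunk
assert len(chunks) == max_tokens + 1  # 51, matching the assertion in the test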
@@ -142,7 +188,7 @@ async def test_chat_completion_streaming(inference_settings, sample_messages):
    inference_impl = inference_settings["impl"]
    response = [
        r
-        async for r in inference_impl.chat_completion(
+        async for r in await inference_impl.chat_completion(
            messages=sample_messages,
            stream=True,
            **inference_settings["common_params"],
@@ -213,7 +259,7 @@ async def test_chat_completion_with_tool_calling_streaming(

    response = [
        r
-        async for r in inference_impl.chat_completion(
+        async for r in await inference_impl.chat_completion(
            messages=messages,
            tools=[sample_tool_definition],
            stream=True,


@@ -2,8 +2,8 @@ providers:
  - provider_id: test-faiss
    provider_type: meta-reference
    config: {}
-  - provider_id: test-chroma
-    provider_type: remote::chroma
+  - provider_id: test-chromadb
+    provider_type: remote::chromadb
    config:
      host: localhost
      port: 6001


@@ -89,6 +89,30 @@ async def test_banks_list(memory_settings):
    assert len(response) == 0


+@pytest.mark.asyncio
+async def test_banks_register(memory_settings):
+    # NOTE: this needs you to ensure that you are starting from a clean state
+    # but so far we don't have an unregister API unfortunately, so be careful
+    banks_impl = memory_settings["memory_banks_impl"]
+    bank = VectorMemoryBankDef(
+        identifier="test_bank_no_provider",
+        embedding_model="all-MiniLM-L6-v2",
+        chunk_size_in_tokens=512,
+        overlap_size_in_tokens=64,
+    )
+
+    await banks_impl.register_memory_bank(bank)
+    response = await banks_impl.list_memory_banks()
+    assert isinstance(response, list)
+    assert len(response) == 1
+
+    # register same memory bank with same id again will fail
+    await banks_impl.register_memory_bank(bank)
+    response = await banks_impl.list_memory_banks()
+    assert isinstance(response, list)
+    assert len(response) == 1
+
+
@pytest.mark.asyncio
async def test_query_documents(memory_settings, sample_documents):
    memory_impl = memory_settings["memory_impl"]
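Editor's note: the tail of test_banks_register shows that re-registering an existing bank id leaves the listing unchanged, so from the caller's point of view registration of a duplicate id is effectively a no-op. A dict-backed sketch of that observable behavior (illustrative only, not the library's implementation):

class BankRegistry:
    def __init__(self) -> None:
        # identifier -> definition; setdefault makes re-registration a no-op
        self._banks: dict[str, dict] = {}

    def register(self, identifier: str, definition: dict) -> None:
        self._banks.setdefault(identifier, definition)

    def list(self) -> list[dict]:
        return list(self._banks.values())

registry = BankRegistry()
registry.register("test_bank_no_provider", {"embedding_model": "all-MiniLM-L6-v2"})
registry.register("test_bank_no_provider", {"embedding_model": "all-MiniLM-L6-v2"})
assert len(registry.list()) == 1  # same id registered twice, listed once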


@@ -14,7 +14,7 @@ import yaml
from llama_stack.distribution.datatypes import *  # noqa: F403
from llama_stack.distribution.configure import parse_and_maybe_upgrade_config
from llama_stack.distribution.request_headers import set_request_provider_data
-from llama_stack.distribution.resolver import resolve_impls_with_routing
+from llama_stack.distribution.resolver import resolve_impls


async def resolve_impls_for_test(api: Api, deps: List[Api] = None):
@@ -36,7 +36,7 @@ async def resolve_impls_for_test(api: Api, deps: List[Api] = None):
        providers=chosen,
    )
    run_config = parse_and_maybe_upgrade_config(run_config)
-    impls = await resolve_impls_with_routing(run_config)
+    impls = await resolve_impls(run_config)

    if "provider_data" in config_dict:
        provider_id = chosen[api.value][0].provider_id