Merge branch 'meta-llama:main' into qdrant

commit 1575578446
Author: Anush (committed by GitHub)
Date: 2024-10-22 21:45:31 +05:30
101 changed files with 3310 additions and 722 deletions


@@ -31,4 +31,4 @@ providers:
  persistence_store:
    namespace: null
    type: sqlite
-    db_path: /Users/ashwin/.llama/runtime/kvstore.db
+    db_path: ~/.llama/runtime/kvstore.db
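Editor's note on the hunk above: the hard-coded `/Users/ashwin` home directory becomes a `~` path, which is only portable if the consumer expands the tilde at load time. A minimal sketch of the standard expansion this change presumably relies on (the surrounding loader code is an assumption, it is not shown in this diff):

import os

# Assumption: llama-stack expands "~" when it resolves db_path from the
# run config. Python's standard expansion, for reference:
db_path = os.path.expanduser("~/.llama/runtime/kvstore.db")
print(db_path)  # e.g. /home/<user>/.llama/runtime/kvstore.db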


@@ -64,6 +64,24 @@ def search_query_messages():
    ]


+@pytest.fixture
+def attachment_message():
+    return [
+        UserMessage(
+            content="I am attaching some documentation for Torchtune. Help me answer questions I will ask next.",
+        ),
+    ]
+
+
+@pytest.fixture
+def query_attachment_messages():
+    return [
+        UserMessage(
+            content="What are the top 5 topics that were explained? Only list succinct bullet points."
+        ),
+    ]
+
+
@pytest.mark.asyncio
async def test_create_agent_turn(agents_settings, sample_messages):
    agents_impl = agents_settings["impl"]
@@ -98,7 +116,7 @@ async def test_create_agent_turn(agents_settings, sample_messages):
    )

    turn_response = [
-        chunk async for chunk in agents_impl.create_agent_turn(**turn_request)
+        chunk async for chunk in await agents_impl.create_agent_turn(**turn_request)
    ]

    assert len(turn_response) > 0
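Editor's note: the recurring change in this file (and in the inference tests further down) is the added `await`. The streaming entry points are now coroutines that return an async generator, rather than being async generators themselves, so callers must await the call before iterating. A minimal self-contained sketch of the new calling convention (names are illustrative, not the library's API):

import asyncio
from typing import AsyncGenerator

async def create_agent_turn(**turn_request) -> AsyncGenerator[str, None]:
    # A coroutine that *returns* an async generator, mirroring the new convention.
    async def stream() -> AsyncGenerator[str, None]:
        for chunk in ("turn_start", "step_delta", "turn_complete"):
            yield chunk
    return stream()

async def main() -> None:
    # Old style would have been: [c async for c in create_agent_turn(...)]
    # New style: await the call first, then iterate the returned generator.
    chunks = [c async for c in await create_agent_turn(stream=True)]
    assert len(chunks) > 0

asyncio.run(main())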
@@ -123,6 +141,89 @@ async def test_create_agent_turn(agents_settings, sample_messages):
    assert len(final_event.turn.output_message.content) > 0


+@pytest.mark.asyncio
+async def test_rag_agent_as_attachments(
+    agents_settings, attachment_message, query_attachment_messages
+):
+    urls = [
+        "memory_optimizations.rst",
+        "chat.rst",
+        "llama3.rst",
+        "datasets.rst",
+        "qat_finetune.rst",
+        "lora_finetune.rst",
+    ]
+
+    attachments = [
+        Attachment(
+            content=f"https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/{url}",
+            mime_type="text/plain",
+        )
+        for i, url in enumerate(urls)
+    ]
+
+    agents_impl = agents_settings["impl"]
+
+    agent_config = AgentConfig(
+        model=agents_settings["common_params"]["model"],
+        instructions=agents_settings["common_params"]["instructions"],
+        enable_session_persistence=True,
+        sampling_params=SamplingParams(temperature=0.7, top_p=0.95),
+        input_shields=[],
+        output_shields=[],
+        tools=[
+            MemoryToolDefinition(
+                memory_bank_configs=[],
+                query_generator_config={
+                    "type": "default",
+                    "sep": " ",
+                },
+                max_tokens_in_context=4096,
+                max_chunks=10,
+            ),
+        ],
+        max_infer_iters=5,
+    )
+
+    create_response = await agents_impl.create_agent(agent_config)
+    agent_id = create_response.agent_id
+
+    # Create a session
+    session_create_response = await agents_impl.create_agent_session(
+        agent_id, "Test Session"
+    )
+    session_id = session_create_response.session_id
+
+    # Create and execute a turn
+    turn_request = dict(
+        agent_id=agent_id,
+        session_id=session_id,
+        messages=attachment_message,
+        attachments=attachments,
+        stream=True,
+    )
+
+    turn_response = [
+        chunk async for chunk in await agents_impl.create_agent_turn(**turn_request)
+    ]
+
+    assert len(turn_response) > 0
+
+    # Create a second turn querying the agent
+    turn_request = dict(
+        agent_id=agent_id,
+        session_id=session_id,
+        messages=query_attachment_messages,
+        stream=True,
+    )
+
+    turn_response = [
+        chunk async for chunk in await agents_impl.create_agent_turn(**turn_request)
+    ]
+
+    assert len(turn_response) > 0
+
+
@pytest.mark.asyncio
async def test_create_agent_turn_with_brave_search(
    agents_settings, search_query_messages
@@ -169,7 +270,7 @@ async def test_create_agent_turn_with_brave_search(
    )

    turn_response = [
-        chunk async for chunk in agents_impl.create_agent_turn(**turn_request)
+        chunk async for chunk in await agents_impl.create_agent_turn(**turn_request)
    ]

    assert len(turn_response) > 0


@@ -4,6 +4,10 @@ providers:
    config:
      host: localhost
      port: 11434
+  - provider_id: meta-reference
+    provider_type: meta-reference
+    config:
+      model: Llama3.2-1B-Instruct
  - provider_id: test-tgi
    provider_type: remote::tgi
    config:


@@ -5,6 +5,7 @@
# the root directory of this source tree.

import itertools
+import os

import pytest
import pytest_asyncio
@@ -50,14 +51,17 @@ def get_expected_stop_reason(model: str):
    return StopReason.end_of_message if "Llama3.1" in model else StopReason.end_of_turn


+if "MODEL_IDS" not in os.environ:
+    MODEL_IDS = [Llama_8B, Llama_3B]
+else:
+    MODEL_IDS = os.environ["MODEL_IDS"].split(",")
+
+
# This is going to create multiple Stack impls without tearing down the previous one
# Fix that!
@pytest_asyncio.fixture(
    scope="session",
-    params=[
-        {"model": Llama_8B},
-        {"model": Llama_3B},
-    ],
+    params=[{"model": m} for m in MODEL_IDS],
+    ids=lambda d: d["model"],
)
async def inference_settings(request):
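Editor's note: with this hunk the fixture's model matrix comes from MODEL_IDS, so it can be overridden without editing the test. A short sketch of the override path (the model ids here are examples, not a statement of what the suite supports):

import os

# Assumption: MODEL_IDS is a comma-separated list exported before pytest runs,
# e.g.  MODEL_IDS="Llama3.1-8B-Instruct,Llama3.2-3B-Instruct" pytest ...
os.environ["MODEL_IDS"] = "Llama3.1-8B-Instruct,Llama3.2-3B-Instruct"

MODEL_IDS = os.environ["MODEL_IDS"].split(",")
params = [{"model": m} for m in MODEL_IDS]
print(params)
# -> [{'model': 'Llama3.1-8B-Instruct'}, {'model': 'Llama3.2-3B-Instruct'}]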
@@ -122,6 +126,48 @@ async def test_model_list(inference_settings):
    assert model_def.identifier == params["model"]


+@pytest.mark.asyncio
+async def test_completion(inference_settings):
+    inference_impl = inference_settings["impl"]
+    params = inference_settings["common_params"]
+
+    provider = inference_impl.routing_table.get_provider_impl(params["model"])
+    if provider.__provider_spec__.provider_type not in (
+        "meta-reference",
+        "remote::ollama",
+    ):
+        pytest.skip("Other inference providers don't support completion() yet")
+
+    response = await inference_impl.completion(
+        content="Roses are red,",
+        stream=False,
+        model=params["model"],
+        sampling_params=SamplingParams(
+            max_tokens=50,
+        ),
+    )
+
+    assert isinstance(response, CompletionResponse)
+    assert "violets are blue" in response.content
+
+    chunks = [
+        r
+        async for r in await inference_impl.completion(
+            content="Roses are red,",
+            stream=True,
+            model=params["model"],
+            sampling_params=SamplingParams(
+                max_tokens=50,
+            ),
+        )
+    ]
+
+    assert all(isinstance(chunk, CompletionResponseStreamChunk) for chunk in chunks)
+    assert len(chunks) == 51
+    last = chunks[-1]
+    assert last.stop_reason == StopReason.out_of_tokens
+
+
@pytest.mark.asyncio
async def test_chat_completion_non_streaming(inference_settings, sample_messages):
    inference_impl = inference_settings["impl"]
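Editor's note on the streaming assertions in test_completion above: with max_tokens=50 the test expects exactly 51 chunks, which presumably means one chunk per generated token plus a terminal chunk carrying stop_reason (StopReason.out_of_tokens). A toy model of that accounting (an assumption about the chunking, not confirmed by this diff):

max_tokens = 50
token_chunks = [f"token-{i}" for i in range(max_tokens)]
chunks = token_chunks + ["final-chunk-with-stop-reason"]  # assumed terminal chunk
assert len(chunks) == max_tokens + 1  # 51, matching the assertion in the test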
@@ -142,7 +188,7 @@ async def test_chat_completion_streaming(inference_settings, sample_messages):
    inference_impl = inference_settings["impl"]
    response = [
        r
-        async for r in inference_impl.chat_completion(
+        async for r in await inference_impl.chat_completion(
            messages=sample_messages,
            stream=True,
            **inference_settings["common_params"],
@@ -213,7 +259,7 @@ async def test_chat_completion_with_tool_calling_streaming(

    response = [
        r
-        async for r in inference_impl.chat_completion(
+        async for r in await inference_impl.chat_completion(
            messages=messages,
            tools=[sample_tool_definition],
            stream=True,


@@ -2,8 +2,8 @@ providers:
  - provider_id: test-faiss
    provider_type: meta-reference
    config: {}
-  - provider_id: test-chroma
-    provider_type: remote::chroma
+  - provider_id: test-chromadb
+    provider_type: remote::chromadb
    config:
      host: localhost
      port: 6001


@@ -89,6 +89,30 @@ async def test_banks_list(memory_settings):
    assert len(response) == 0


+@pytest.mark.asyncio
+async def test_banks_register(memory_settings):
+    # NOTE: this needs you to ensure that you are starting from a clean state
+    # but so far we don't have an unregister API unfortunately, so be careful
+    banks_impl = memory_settings["memory_banks_impl"]
+    bank = VectorMemoryBankDef(
+        identifier="test_bank_no_provider",
+        embedding_model="all-MiniLM-L6-v2",
+        chunk_size_in_tokens=512,
+        overlap_size_in_tokens=64,
+    )
+
+    await banks_impl.register_memory_bank(bank)
+    response = await banks_impl.list_memory_banks()
+    assert isinstance(response, list)
+    assert len(response) == 1
+
+    # register same memory bank with same id again will fail
+    await banks_impl.register_memory_bank(bank)
+    response = await banks_impl.list_memory_banks()
+    assert isinstance(response, list)
+    assert len(response) == 1
+
+
@pytest.mark.asyncio
async def test_query_documents(memory_settings, sample_documents):
    memory_impl = memory_settings["memory_impl"]
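Editor's note: the tail of test_banks_register shows that re-registering an existing bank id leaves the listing unchanged, so from the caller's point of view registration of a duplicate id is effectively a no-op. A dict-backed sketch of that observable behavior (illustrative only, not the library's implementation):

class BankRegistry:
    def __init__(self) -> None:
        # identifier -> definition; setdefault makes re-registration a no-op
        self._banks: dict[str, dict] = {}

    def register(self, identifier: str, definition: dict) -> None:
        self._banks.setdefault(identifier, definition)

    def list(self) -> list[dict]:
        return list(self._banks.values())

registry = BankRegistry()
registry.register("test_bank_no_provider", {"embedding_model": "all-MiniLM-L6-v2"})
registry.register("test_bank_no_provider", {"embedding_model": "all-MiniLM-L6-v2"})
assert len(registry.list()) == 1  # same id registered twice, listed once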


@@ -14,7 +14,7 @@ import yaml
from llama_stack.distribution.datatypes import *  # noqa: F403
from llama_stack.distribution.configure import parse_and_maybe_upgrade_config
from llama_stack.distribution.request_headers import set_request_provider_data
-from llama_stack.distribution.resolver import resolve_impls_with_routing
+from llama_stack.distribution.resolver import resolve_impls


async def resolve_impls_for_test(api: Api, deps: List[Api] = None):
@@ -36,7 +36,7 @@ async def resolve_impls_for_test(api: Api, deps: List[Api] = None):
        providers=chosen,
    )
    run_config = parse_and_maybe_upgrade_config(run_config)
-    impls = await resolve_impls_with_routing(run_config)
+    impls = await resolve_impls(run_config)

    if "provider_data" in config_dict:
        provider_id = chosen[api.value][0].provider_id