mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-12-31 04:40:01 +00:00
119 lines
3.3 KiB
Python
119 lines
3.3 KiB
Python
|
|
import os
|
|
import os
|
|
import uuid
|
|
|
|
from termcolor import cprint
|
|
|
|
# Configure the run through environment variables. This has to happen before
# the llama-stack imports that follow in this file.
os.environ.update(
    {
        "INFERENCE_MODEL": "llama3.2:3b-instruct-fp16",
        "LLAMA_STACK_CONFIG": "ollama",
    }
)
|
|
|
|
# Import libraries after setting environment variables
|
|
from llama_stack_client.lib.agents.agent import Agent
|
|
from llama_stack_client.lib.agents.event_logger import EventLogger
|
|
from llama_stack_client.types import Document
|
|
from llama_stack_client.types.agent_create_params import AgentConfig
|
|
|
|
from llama_stack.distribution.library_client import LlamaStackAsLibraryClient
|
|
|
|
|
|
def main():
    """Index four torchtune tutorial pages and run sample retrieval queries.

    Creates ``LlamaStackAsLibraryClient("ollama")``, registers a uniquely
    named vector DB with the ``sqlite-vec`` provider, inserts the tutorial
    documents through the RAG tool, then queries ``vector_io`` with each
    sample prompt and prints the raw response in colour.
    """
    # Client setup; the value returned by initialize() is discarded.
    client = LlamaStackAsLibraryClient("ollama")
    vector_db_id = f"test-vector-db-{uuid.uuid4().hex}"
    _ = client.initialize()

    # Only referenced by the disabled agent flow at the end of this function.
    model_id = "llama3.2:3b-instruct-fp16"

    # One Document per tutorial page; `content` carries the page's raw URL.
    tutorial_files = (
        "chat.rst",
        "llama3.rst",
        "memory_optimizations.rst",
        "lora_finetune.rst",
    )
    documents = []
    for index, name in enumerate(tutorial_files):
        documents.append(
            Document(
                document_id=f"num-{index}",
                content=f"https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/{name}",
                mime_type="text/plain",
                metadata={},
            )
        )

    # Register the vector DB, then chunk and insert the documents into it.
    client.vector_dbs.register(
        vector_db_id=vector_db_id,
        provider_id="sqlite-vec",
        embedding_model="all-MiniLM-L6-v2",
        embedding_dimension=384,
    )
    client.tool_runtime.rag_tool.insert(
        vector_db_id=vector_db_id,
        documents=documents,
        chunk_size_in_tokens=512,
    )

    # Sample questions to run against the indexed documents.
    user_prompts = [
        "What are the top 5 topics that were explained in the documentation? Only list succinct bullet points.",
        "Was anything related to 'Llama3' discussed, if so what?",
        "Tell me how to use LoRA",
        "What about Quantization?",
    ]

    # Query the vector store directly for each prompt and show what came back.
    for question in user_prompts:
        cprint(f"User> {question}", "green")
        hits = client.vector_io.query(vector_db_id=vector_db_id, query=question)
        cprint(f"Response> {hits}", "blue")

    # Disabled alternative: send the same prompts through an Agent configured
    # with the builtin::rag tool group instead of querying vector_io directly.
    #
    # # Create agent configuration
    # agent_config = AgentConfig(
    #     model=model_id,
    #     instructions="You are a helpful assistant",
    #     enable_session_persistence=False,
    #     toolgroups=[
    #         {
    #             "name": "builtin::rag",
    #             "args": {
    #                 "vector_db_ids": [vector_db_id],
    #             },
    #         }
    #     ],
    # )
    #
    # # Instantiate the Agent
    # agent = Agent(client, agent_config)
    #
    # # Create a session for the agent
    # session_id = agent.create_session("test-session")
    #
    # # Process each prompt and display the output
    # for prompt in user_prompts:
    #     cprint(f"User> {prompt}", "green")
    #     response = agent.create_turn(
    #         messages=[
    #             {
    #                 "role": "user",
    #                 "content": prompt,
    #             }
    #         ],
    #         session_id=session_id,
    #     )
    #     # Log and print events from the response
    #     for log in EventLogger().log(response):
    #         log.print()
|
|
|
|
|
|
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|
|
|
|