Mirror of https://github.com/meta-llama/llama-stack.git, synced 2025-12-27 21:02:00 +00:00
refactor: demo_script.py (#4409)
- Simplify search result processing in the demo script
- Optimize the demo script by using inline text instead of a big external file
- Improve printout clarity and user experience

Signed-off-by: Costa Shulyupin <costa.shul@redhat.com>
parent 22f84df68b
commit 325a0bd7b3

1 changed file with 25 additions and 51 deletions
@@ -24,16 +24,12 @@ from openai import OpenAI

# Initialize OpenAI client pointing to Llama Stack server
client = OpenAI(base_url="http://localhost:8321/v1/", api_key="none")

# Shared setup: Create vector store and upload document
print("=" * 80)
print("SETUP: Creating vector store and uploading document")
print("=" * 80)
print("RAG demonstration\n")

url = "https://www.paulgraham.com/greatwork.html"
print(f"Fetching document from: {url}")

vs = client.vector_stores.create()
print(f"Vector store created: {vs.id}")

response = requests.get(url)
pseudo_file = io.BytesIO(str(response.content).encode("utf-8"))
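This hunk still shows the old setup, which downloads the essay from paulgraham.com at run time; per the commit message, the new script inlines the text instead. The step that uploads the file and produces `uploaded_file` (referenced in the next hunk's context line) also falls outside the visible diff. Below is a minimal sketch of what the post-change setup might look like, assuming the standard OpenAI files and vector-store endpoints the script already uses; the inline text, file name, and `purpose` value are illustrative assumptions, not lines from the actual script:

```python
import io

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/", api_key="none")

# Hypothetical inline text standing in for the downloaded essay.
document_text = (
    "Doing great work means choosing something important, "
    "working hard at it, and making it your own."
)
pseudo_file = io.BytesIO(document_text.encode("utf-8"))

vs = client.vector_stores.create()

# Upload the in-memory document, then attach it to the vector store;
# this is the step that produces the `uploaded_file` used in the next hunk.
uploaded_file = client.files.create(
    file=("inline_document.txt", pseudo_file),  # illustrative file name
    purpose="assistants",  # purpose value assumed
)
client.vector_stores.files.create(vector_store_id=vs.id, file_id=uploaded_file.id)
```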
@@ -44,15 +40,19 @@ client.vector_stores.files.create(vector_store_id=vs.id, file_id=uploaded_file.i

print(f"File uploaded and added to vector store: {uploaded_file.id}")

query = "How do you do great work?"

# ============================================================================
# APPROACH 1: Responses API (Recommended for most use cases)
# ============================================================================
print("\n" + "=" * 80)
print("APPROACH 1: Responses API (Automatic Tool Calling)")
print("=" * 80)
print(f"Query: {query}\n")

print(
    """
RAG using Responses API:
- Automatic tool calling (model decides when to search)
- Simpler code, less control
- Best for: Conversational agents, automatic workflows

"""
)

print("Reply via Responses API:\n")
resp = client.responses.create(
    model=os.getenv("INFERENCE_MODEL", "ollama/llama3.2:3b"),
    input=query,
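The `tools` argument of this `responses.create` call falls in the gap between this hunk and the next one, so it is not visible here. Given the `include=["file_search_call.results"]` line that follows, the call presumably attaches the vector store through the file_search tool. A hedged sketch of the complete call under that assumption (the script's exact arguments may differ):

```python
# Sketch only; `client`, `query`, and `vs` are assumed to be defined as in the setup above.
import os

resp = client.responses.create(
    model=os.getenv("INFERENCE_MODEL", "ollama/llama3.2:3b"),
    input=query,
    # Assumed: file_search tool wired to the vector store created earlier.
    tools=[{"type": "file_search", "vector_store_ids": [vs.id]}],
    include=["file_search_call.results"],  # also return the raw search hits
)
print(resp.output[-1].content[-1].text)  # text of the final assistant message
```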
@@ -60,39 +60,36 @@ resp = client.responses.create(

    include=["file_search_call.results"],
)

print("Response (Responses API):")
print("-" * 80)
print(resp.output[-1].content[-1].text)
print("-" * 80)

# ============================================================================
# APPROACH 2: Chat Completions API
# ============================================================================
print("\n" + "=" * 80)
print("APPROACH 2: Chat Completions API (Manual Retrieval)")
print("=" * 80)
print(f"Query: {query}\n")
print(
    """

RAG using Chat Completions API:
- Manual retrieval (you control the search)
- More code, more control
- Best for: Custom RAG patterns, batch processing, specialized workflows
"""
)

# Step 1: Search vector store explicitly
print("Searching vector store...")
search_results = client.vector_stores.search(
    vector_store_id=vs.id, query=query, max_num_results=3, rewrite_query=False
)

# Step 2: Extract context from search results
# Extract context from search results
context_chunks = []
for result in search_results.data:
    # result.content is a list of Content objects, extract the text from each
    if hasattr(result, "content") and result.content:
        for content_item in result.content:
            if hasattr(content_item, "text") and content_item.text:
                context_chunks.append(content_item.text)
    for content_item in result.content:
        context_chunks.append(content_item.text)

context = "\n\n".join(context_chunks)
print(f"Found {len(context_chunks)} relevant chunks\n")

# Step 3: Use Chat Completions with retrieved context
print("Generating response with chat completions...")
print("Reply via Chat Completions API:\n")
completion = client.chat.completions.create(
    model=os.getenv("INFERENCE_MODEL", "ollama/llama3.2:3b"),
    messages=[
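The `messages` payload of this call sits in the elided region before the next hunk, so the prompt construction is not shown. In the manual pattern demonstrated here, the retrieved `context` is typically spliced into the prompt together with the user's question; a minimal sketch under that assumption (the script's actual prompt wording is not visible in this diff):

```python
# Sketch only; `client`, `query`, and `context` are assumed to be defined as above.
import os

completion = client.chat.completions.create(
    model=os.getenv("INFERENCE_MODEL", "ollama/llama3.2:3b"),
    messages=[
        {
            "role": "system",
            "content": "Answer the question using only the provided context.",
        },
        {
            "role": "user",
            "content": f"Context:\n{context}\n\nQuestion: {query}",
        },
    ],
    temperature=0.7,  # matches the value visible in the next hunk
)
print(completion.choices[0].message.content)
```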
|
@ -108,29 +105,6 @@ completion = client.chat.completions.create(
|
|||
temperature=0.7,
|
||||
)
|
||||
|
||||
print("Response (Chat Completions API):")
|
||||
print("-" * 80)
|
||||
print(completion.choices[0].message.content)
|
||||
print("-" * 80)
|
||||
|
||||
# ============================================================================
|
||||
# Summary
|
||||
# ============================================================================
|
||||
print("\n" + "=" * 80)
|
||||
print("SUMMARY")
|
||||
print("=" * 80)
|
||||
print(
|
||||
"""
|
||||
Both approaches successfully performed RAG:
|
||||
|
||||
1. Responses API:
|
||||
- Automatic tool calling (model decides when to search)
|
||||
- Simpler code, less control
|
||||
- Best for: Conversational agents, automatic workflows
|
||||
|
||||
2. Chat Completions API:
|
||||
- Manual retrieval (you control the search)
|
||||
- More code, more control
|
||||
- Best for: Custom RAG patterns, batch processing, specialized workflows
|
||||
"""
|
||||
)
|
||||
|
|
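A small usage note: the script reads its model from the `INFERENCE_MODEL` environment variable (defaulting to `ollama/llama3.2:3b`) and talks to a Llama Stack server at `http://localhost:8321/v1/`. A quick, hedged way to confirm the server is reachable and the model is registered before running the demo:

```python
import os

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/", api_key="none")
model_id = os.getenv("INFERENCE_MODEL", "ollama/llama3.2:3b")

# List the models the server exposes and check that the demo's model is among them.
available = [m.id for m in client.models.list()]
print(f"Server reachable; {len(available)} models registered")
if model_id not in available:
    print(f"Warning: {model_id} not found. Set INFERENCE_MODEL to one of: {available}")
```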