llama-stack-mirror/example.py

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import os

os.environ["NVIDIA_API_KEY"] = "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"
# Option 1: Use default NIM URL (will auto-switch to ai.api.nvidia.com for rerank)
# os.environ["NVIDIA_BASE_URL"] = "https://ai.api.nvidia.com"
# Option 2: Use AI Foundation URL directly for rerank models
# os.environ["NVIDIA_BASE_URL"] = "https://ai.api.nvidia.com/v1"
os.environ["NVIDIA_BASE_URL"] = "https://integrate.api.nvidia.com"

import base64
import io
from PIL import Image

from llama_stack.core.library_client import LlamaStackAsLibraryClient

client = LlamaStackAsLibraryClient("nvidia")
client.initialize()

# # response = client.inference.completion(
# #     model_id="meta/llama-3.1-8b-instruct",
# #     content="Complete the sentence using one word: Roses are red, violets are :",
# #     stream=False,
# #     sampling_params={
# #         "max_tokens": 50,
# #     },
# # )
# # print(f"Response: {response.content}")


# response = client.inference.chat_completion(
#     model_id="nvidia/nvidia-nemotron-nano-9b-v2",
#     messages=[
#         {
#             "role": "system",
#             "content": "/think",
#         },
#         {
#             "role": "user",
#             "content": "How are you?",
#         },
#     ],
#     stream=False,
#     sampling_params={
#         "max_tokens": 1024,
#     },
# )
# print(f"Response: {response}")


print(client.models.list())
rerank_response = client.inference.rerank(
    model="nvidia/llama-3.2-nv-rerankqa-1b-v2",
    query="query",
    items=[
        "item_1",
        "item_2",
        "item_3",
    ]
)

print(rerank_response)
for i, result in enumerate(rerank_response):
    print(f"{i+1}. [Index: {result.index}, "
          f"Score: {(result.relevance_score):.3f}]")

# # from llama_stack.models.llama.datatypes import ToolDefinition, ToolParamDefinition

# # tool_definition = ToolDefinition(
# #     tool_name="get_weather",
# #     description="Get current weather information for a location",
# #     parameters={
# #         "location": ToolParamDefinition(
# #             param_type="string",
# #             description="The city and state, e.g. San Francisco, CA",
# #             required=True
# #         ),
# #         "unit": ToolParamDefinition(
# #             param_type="string",
# #             description="Temperature unit (celsius or fahrenheit)",
# #             required=False,
# #             default="celsius"
# #         )
# #     }
# # )

# # # tool_response = client.inference.chat_completion(
# # #     model_id="meta-llama/Llama-3.1-8B-Instruct",
# # #     messages=[
# # #         {"role": "user", "content": "What's the weather like in San Francisco?"}
# # #     ],
# # #     tools=[tool_definition],
# # # )

# # # print(f"Tool Response: {tool_response.completion_message.content}")
# # # if tool_response.completion_message.tool_calls:
# # #     for tool_call in tool_response.completion_message.tool_calls:
# # #         print(f"Tool Called: {tool_call.tool_name}")
# # #         print(f"Arguments: {tool_call.arguments}")


# # # from llama_stack.apis.inference import JsonSchemaResponseFormat, ResponseFormatType

# # # person_schema = {
# # #     "type": "object",
# # #     "properties": {
# # #         "name": {"type": "string"},
# # #         "age": {"type": "integer"},
# # #         "occupation": {"type": "string"},
# # #     },
# # #     "required": ["name", "age", "occupation"]
# # # }

# # # response_format = JsonSchemaResponseFormat(
# # #     type=ResponseFormatType.json_schema,
# # #     json_schema=person_schema
# # # )

# # # structured_response = client.inference.chat_completion(
# # #     model_id="meta-llama/Llama-3.1-8B-Instruct",
# # #     messages=[
# # #         {
# # #             "role": "user",
# # #             "content": "Create a profile for a fictional person named Alice who is 30 years old and is a software engineer. "
# # #         }
# # #     ],
# # #     response_format=response_format,
# # # )

# # # print(f"Structured Response: {structured_response.completion_message.content}")

# # # print("\n" + "="*50)
# # # print("VISION LANGUAGE MODEL (VLM) EXAMPLE")
# # # print("="*50)

# # def load_image_as_base64(image_path):
# #     with open(image_path, "rb") as image_file:
# #         img_bytes = image_file.read()
# #         return base64.b64encode(img_bytes).decode("utf-8")

# # image_path = "/home/jiayin/llama-stack/docs/dog.jpg"
# # demo_image_b64 = load_image_as_base64(image_path)

# # vlm_response = client.inference.chat_completion(
# #     model_id="nvidia/vila",
# #     messages=[
# #         {
# #             "role": "user",
# #             "content": [
# #                 {
# #                     "type": "image",
# #                     "image": {
# #                         "data": demo_image_b64,
# #                     },
# #                 },
# #                 {
# #                     "type": "text",
# #                     "text": "Please describe what you see in this image in detail.",
# #                 },
# #             ],
# #         }
# #     ],
# # )

# # print(f"VLM Response: {vlm_response.completion_message.content}")

# # # print("\n" + "="*50)
# # # print("EMBEDDING EXAMPLE")
# # # print("="*50)

# # # # Embedding example
# # # embedding_response = client.inference.embeddings(
# # #     model_id="nvidia/llama-3.2-nv-embedqa-1b-v2",
# # #     contents=["Hello world", "How are you today?"],
# # #     task_type="query"
# # # )

# # # print(f"Number of embeddings: {len(embedding_response.embeddings)}")
# # # print(f"Embedding dimension: {len(embedding_response.embeddings[0])}")
# # # print(f"First few values: {embedding_response.embeddings[0][:5]}")

# # # # from openai import OpenAI

# # # # client = OpenAI(
# # # #   base_url = "http://10.176.230.61:8000/v1",
# # # #   api_key = "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"
# # # # )

# # # # # completion = client.completions.create(
# # # # #   model="meta/llama-3.1-405b-instruct",
# # # # #   prompt="How are you?",
# # # # #   temperature=0.2,
# # # # #   top_p=0.7,
# # # # #   max_tokens=1024,
# # # # #   stream=False
# # # # # )

# # # # # # completion = client.chat.completions.create(
# # # # # #   model="meta/llama-3.1-8b-instruct",
# # # # # #   messages=[{"role":"user","content":"hi"}],
# # # # # #   temperature=0.2,
# # # # # #   top_p=0.7,
# # # # # #   max_tokens=1024,
# # # # # #   stream=True
# # # # # # )

# # # # # for chunk in completion:
# # # # #   if chunk.choices[0].delta.content is not None:
# # # # #     print(chunk.choices[0].delta.content, end="")


# # # # # response = client.inference.completion(
# # # # #     model_id="meta/llama-3.1-8b-instruct",
# # # # #     content="Complete the sentence using one word: Roses are red, violets are :",
# # # # #     stream=False,
# # # # #     sampling_params={
# # # # #         "max_tokens": 50,
# # # # #     },
# # # # # )
# # # # # print(f"Response: {response.content}")


# from openai import OpenAI

# client = OpenAI(
#   base_url = "https://integrate.api.nvidia.com/v1",
#   api_key = "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"
# )

# completion = client.chat.completions.create(
#   model="nvidia/nvidia-nemotron-nano-9b-v2",
#   messages=[{"role":"system","content":"/think"}],
#   temperature=0.6,
#   top_p=0.95,
#   max_tokens=2048,
#   frequency_penalty=0,
#   presence_penalty=0,
#   stream=True,
#   extra_body={
#     "min_thinking_tokens": 1024,
#     "max_thinking_tokens": 2048
#   }
# )

# for chunk in completion:
#   reasoning = getattr(chunk.choices[0].delta, "reasoning_content", None)
#   if reasoning:
#     print(reasoning, end="")
#   if chunk.choices[0].delta.content is not None:
#     print(chunk.choices[0].delta.content, end="")