llama-stack-mirror/llama_toolchain/inference/client_sdk.py

import fire

from llama_stack import LlamaStack
from llama_stack.types import UserMessage


def main(host: str, port: int):
    client = LlamaStack(
        base_url=f"http://{host}:{port}",
    )

    # Need something like this to work with the server; however, this is not
    # what the SDK generated (?)
    response = client.inference.chat_completion(
        request={
            "messages": [
                UserMessage(
                    content="hello world, troll me in two-paragraphs about 42",
                    role="user",
                ),
            ],
            "model": "Meta-Llama3.1-8B-Instruct",
            "stream": False,
        },
    )
    print(response)
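
    # If the SDK parses the response into a typed object, the generated text is
    # presumably available via something like response.completion_message.content;
    # that attribute path is an assumption for this SDK snapshot, hence the guard:
    # if hasattr(response, "completion_message"):
    #     print(response.completion_message.content)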

    # This does not work with the current server:
    # response = client.inference.chat_completion(
    #     messages=[
    #         UserMessage(
    #             content="hello world, troll me in two-paragraphs about 42",
    #             role="user",
    #         ),
    #     ],
    #     model="Meta-Llama3.1-8B-Instruct",
    #     stream=True,
    # )
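
    # A hypothetical sketch of consuming the streaming variant if/when the
    # keyword-style call above works: iterate the response and print each
    # chunk. The chunk iteration protocol is an assumption, not confirmed by
    # this SDK version:
    # for chunk in response:
    #     print(chunk)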


if __name__ == "__main__":
    fire.Fire(main)