Commit graph

49 commits

Author SHA1 Message Date
Ashwin Bharambe
b72154ce5e Merge remote-tracking branch 'origin/main' into stores
Some checks failed
Installer CI / smoke-test-on-dev (push) Failing after 3s
Installer CI / lint (push) Failing after 3s
2025-10-13 11:07:11 -07:00
Francisco Arceo
e7d21e1ee3
feat: Add support for Conversations in Responses API (#3743)
# What does this PR do?
This PR adds support for Conversations in Responses.

<!-- If resolving an issue, uncomment and update the line below -->
<!-- Closes #[issue-number] -->

## Test Plan
Unit tests
Integration tests

<Details>
<Summary>Manual testing with this script: (click to expand)</Summary>

```python
from openai import OpenAI

client = OpenAI()
client = OpenAI(base_url="http://localhost:8321/v1/", api_key="none")

def test_conversation_create():
    print("Testing conversation create...")
    conversation = client.conversations.create(
        metadata={"topic": "demo"},
        items=[
            {"type": "message", "role": "user", "content": "Hello!"}
        ]
    )
    print(f"Created: {conversation}")
    return conversation

def test_conversation_retrieve(conv_id):
    print(f"Testing conversation retrieve for {conv_id}...")
    retrieved = client.conversations.retrieve(conv_id)
    print(f"Retrieved: {retrieved}")
    return retrieved

def test_conversation_update(conv_id):
    print(f"Testing conversation update for {conv_id}...")
    updated = client.conversations.update(
        conv_id,
        metadata={"topic": "project-x"}
    )
    print(f"Updated: {updated}")
    return updated

def test_conversation_delete(conv_id):
    print(f"Testing conversation delete for {conv_id}...")
    deleted = client.conversations.delete(conv_id)
    print(f"Deleted: {deleted}")
    return deleted

def test_conversation_items_create(conv_id):
    print(f"Testing conversation items create for {conv_id}...")
    items = client.conversations.items.create(
        conv_id,
        items=[
            {
                "type": "message",
                "role": "user",
                "content": [{"type": "input_text", "text": "Hello!"}]
            },
            {
                "type": "message",
                "role": "user",
                "content": [{"type": "input_text", "text": "How are you?"}]
            }
        ]
    )
    print(f"Items created: {items}")
    return items

def test_conversation_items_list(conv_id):
    print(f"Testing conversation items list for {conv_id}...")
    items = client.conversations.items.list(conv_id, limit=10)
    print(f"Items list: {items}")
    return items

def test_conversation_item_retrieve(conv_id, item_id):
    print(f"Testing conversation item retrieve for {conv_id}/{item_id}...")
    item = client.conversations.items.retrieve(conversation_id=conv_id, item_id=item_id)
    print(f"Item retrieved: {item}")
    return item

def test_conversation_item_delete(conv_id, item_id):
    print(f"Testing conversation item delete for {conv_id}/{item_id}...")
    deleted = client.conversations.items.delete(conversation_id=conv_id, item_id=item_id)
    print(f"Item deleted: {deleted}")
    return deleted

def test_conversation_responses_create():
    print("\nTesting conversation create for a responses example...")
    conversation = client.conversations.create()
    print(f"Created: {conversation}")

    response = client.responses.create(
      model="gpt-4.1",
      input=[{"role": "user", "content": "What are the 5 Ds of dodgeball?"}],
      conversation=conversation.id,
    )
    print(f"Created response: {response} for conversation {conversation.id}")

    return response, conversation

def test_conversations_responses_create_followup(
        conversation,
        content="Repeat what you just said but add 'this is my second time saying this'",
    ):
    print(f"Using: {conversation.id}")

    response = client.responses.create(
      model="gpt-4.1",
      input=[{"role": "user", "content": content}],
      conversation=conversation.id,
    )
    print(f"Created response: {response} for conversation {conversation.id}")

    conv_items = client.conversations.items.list(conversation.id)
    print(f"\nRetrieving list of items for conversation {conversation.id}:")
    print(conv_items.model_dump_json(indent=2))

def test_response_with_fake_conv_id():
    fake_conv_id = "conv_zzzzzzzzz5dc81908289d62779d2ac510a2b0b602ef00a44"
    print(f"Using {fake_conv_id}")
    try:
        response = client.responses.create(
          model="gpt-4.1",
          input=[{"role": "user", "content": "say hello"}],
          conversation=fake_conv_id,
        )
        print(f"Created response: {response} for conversation {fake_conv_id}")
    except Exception as e:
        print(f"failed to create response for conversation {fake_conv_id} with error {e}")


def main():
    print("Testing OpenAI Conversations API...")

    # Create conversation
    conversation = test_conversation_create()
    conv_id = conversation.id

    # Retrieve conversation
    test_conversation_retrieve(conv_id)

    # Update conversation
    test_conversation_update(conv_id)

    # Create items
    items = test_conversation_items_create(conv_id)

    # List items
    items_list = test_conversation_items_list(conv_id)

    # Retrieve specific item
    if items_list.data:
        item_id = items_list.data[0].id
        test_conversation_item_retrieve(conv_id, item_id)

        # Delete item
        test_conversation_item_delete(conv_id, item_id)

    # Delete conversation
    test_conversation_delete(conv_id)

    response, conversation2 = test_conversation_responses_create()
    print('\ntesting reseponse retrieval')
    test_conversation_retrieve(conversation2.id)

    print('\ntesting responses follow up')
    test_conversations_responses_create_followup(conversation2)

    print('\ntesting responses follow up x2!')

    test_conversations_responses_create_followup(
        conversation2,
        content="Repeat what you just said but add 'this is my third time saying this'",
    )

    test_response_with_fake_conv_id()

    print("All tests completed!")


if __name__ == "__main__":
    main()
```
</Details>

---------

Signed-off-by: Francisco Javier Arceo <farceo@redhat.com>
Co-authored-by: Ashwin Bharambe <ashwin.bharambe@gmail.com>
2025-10-10 11:57:40 -07:00
Charlie Doern
6389bf5ffb
fix: make telemetry optional for agents (#3705)
# What does this PR do?

there is a lot of code in the agents API using the telemetry API and its
helpers without checking if that API is even enabled.

This is the only API besides inference actively using telemetry code, so
after this telemetry can be optional for the entire stack


resolves #3665


## Test Plan

existing agent tests.

Signed-off-by: Charlie Doern <cdoern@redhat.com>
2025-10-07 16:09:03 +02:00
Ashwin Bharambe
ea233c2134 feat!: providers use unified 'persistence' field
BREAKING CHANGE: Provider config field names changed for semantic clarity

- Rename kvstore → persistence for all providers
- Simple providers: flat persistence with backend reference
- Complex providers (agents): nested persistence.agent_state + persistence.responses
- Files provider: metadata_store → persistence
- Provider configs now clearly express 'how do I persist?' not 'what type of store?'

Example:
  # Before
  config:
    kvstore:
      backend: kvstore
      namespace: faiss

  # After
  config:
    persistence:
      backend: kvstore
      namespace: faiss

  # Agents (nested)
  config:
    persistence:
      agent_state:
        backend: kvstore
        namespace: agents
      responses:
        backend: sqlstore
        namespace: responses
2025-10-05 20:33:03 -07:00
Ashwin Bharambe
61b4238912
feat(api): add extra_body parameter support with shields example (#3670)
## Summary
Introduce `ExtraBodyField` annotation to enable parameters that arrive
via extra_body in client SDKs but are accessible server-side with full
typing.

These parameters are documented in OpenAPI specs under
**`x-llama-stack-extra-body-params`** but excluded from generated SDK
signatures.

Add `shields` parameter to `create_openai_response` as the first
implementation using this pattern.

## Test Plan
- added an integration test which checks that shields parameter passed
via extra_body reaches server implementation

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

---------

Co-authored-by: Claude <noreply@anthropic.com>
2025-10-03 13:25:09 -07:00
Mustafa Elbehery
c3b2b06974
refactor(logging): rename llama_stack logger categories (#3065)
# What does this PR do?
<!-- Provide a short summary of what this PR does and why. Link to
relevant issues if applicable. -->
This PR renames categories of llama_stack loggers.

This PR aligns logging categories as per the package name, as well as
reviews from initial
https://github.com/meta-llama/llama-stack/pull/2868. This is a follow up
to #3061.

<!-- If resolving an issue, uncomment and update the line below -->
<!-- Closes #[issue-number] -->

Replaces https://github.com/meta-llama/llama-stack/pull/2868
Part of https://github.com/meta-llama/llama-stack/issues/2865

cc @leseb @rhuss

Signed-off-by: Mustafa Elbehery <melbeher@redhat.com>
2025-08-21 17:31:04 -07:00
Mustafa Elbehery
3f8df167f3
chore(pre-commit): add pre-commit hook to enforce llama_stack logger usage (#3061)
# What does this PR do?

This PR adds a step in pre-commit to enforce using `llama_stack` logger.

Currently, various parts of the code base uses different loggers. As a
custom `llama_stack` logger exist and used in the codebase, it is better
to standardize its utilization.

Signed-off-by: Mustafa Elbehery <melbeher@redhat.com>
Co-authored-by: Matthew Farrellee <matt@cs.wisc.edu>
2025-08-20 07:15:35 -04:00
ashwinb
47d5af703c
chore(responses): Refactor Responses Impl to be civilized (#3138)
# What does this PR do?
Refactors the OpenAI responses implementation by extracting streaming and tool execution logic into separate modules. This improves code organization by:

1. Creating a new `StreamingResponseOrchestrator` class in `streaming.py` to handle the streaming response generation logic
2. Moving tool execution functionality to a dedicated `ToolExecutor` class in `tool_executor.py`

## Test Plan

Existing tests
2025-08-15 00:05:35 +00:00
Ashwin Bharambe
4fec49dfdb
feat(responses): add include parameter (#3115)
Well our Responses tests use it so we better include it in the API, no?

I discovered it because I want to make sure `llama-stack-client` can be
used always instead of `openai-python` as the client (we do want to be
_truly_ compatible.)
2025-08-12 10:24:01 -07:00
Nathan Weinberg
68b0071861
chore: standardize session not found error (#3031)
# What does this PR do?
1. Creates a new `SessionNotFoundError` class
2. Implements the new class where appropriate 

Relates to #2379

Signed-off-by: Nathan Weinberg <nweinber@redhat.com>
2025-08-04 13:12:02 -07:00
Ashwin Bharambe
2665f00102
chore(rename): move llama_stack.distribution to llama_stack.core (#2975)
We would like to rename the term `template` to `distribution`. To
prepare for that, this is a precursor.

cc @leseb
2025-07-30 23:30:53 -07:00
Krzysztof Malczuk
be9bf68246
feat: Add webmethod for deleting openai responses (#2160)
Some checks failed
Integration Tests / test-matrix (library, 3.13, datasets) (push) Failing after 16s
Integration Tests / test-matrix (http, 3.13, datasets) (push) Failing after 11s
Integration Tests / test-matrix (library, 3.13, inference) (push) Failing after 12s
Integration Tests / test-matrix (http, 3.13, scoring) (push) Failing after 12s
Integration Tests / test-matrix (library, 3.13, post_training) (push) Failing after 9s
Integration Tests / test-matrix (library, 3.12, inspect) (push) Failing after 11s
Integration Tests / test-matrix (library, 3.12, tool_runtime) (push) Failing after 11s
Integration Tests / test-matrix (library, 3.13, tool_runtime) (push) Failing after 8s
Integration Tests / test-matrix (library, 3.12, post_training) (push) Failing after 12s
Integration Tests / test-matrix (library, 3.13, providers) (push) Failing after 12s
Integration Tests / test-matrix (library, 3.13, inspect) (push) Failing after 12s
Integration Tests / test-matrix (library, 3.13, scoring) (push) Failing after 11s
Integration Tests / test-matrix (http, 3.12, providers) (push) Failing after 17s
Integration Tests / test-matrix (http, 3.13, agents) (push) Failing after 11s
Vector IO Integration Tests / test-matrix (3.12, inline::faiss) (push) Failing after 5s
Vector IO Integration Tests / test-matrix (3.12, remote::chromadb) (push) Failing after 7s
Integration Tests / test-matrix (library, 3.13, vector_io) (push) Failing after 16s
Vector IO Integration Tests / test-matrix (3.12, inline::sqlite-vec) (push) Failing after 18s
Vector IO Integration Tests / test-matrix (3.13, inline::sqlite-vec) (push) Failing after 19s
Vector IO Integration Tests / test-matrix (3.12, remote::pgvector) (push) Failing after 21s
Test External Providers / test-external-providers (venv) (push) Failing after 9s
Vector IO Integration Tests / test-matrix (3.13, remote::pgvector) (push) Failing after 19s
Unit Tests / unit-tests (3.12) (push) Failing after 9s
Update ReadTheDocs / update-readthedocs (push) Failing after 7s
Unit Tests / unit-tests (3.13) (push) Failing after 10s
Vector IO Integration Tests / test-matrix (3.13, inline::faiss) (push) Failing after 39s
Vector IO Integration Tests / test-matrix (3.13, remote::chromadb) (push) Failing after 37s
Python Package Build Test / build (3.13) (push) Failing after 33s
Python Package Build Test / build (3.12) (push) Failing after 36s
Pre-commit / pre-commit (push) Failing after 1m19s
# What does this PR do?
This PR creates a webmethod for deleting open AI responses, adds and
implementation for it and makes an integration test for the OpenAI
delete response method.

[//]: # (If resolving an issue, uncomment and update the line below)
# (Closes #2077)

## Test Plan
Ran the standard tests and the pre-commit hooks and the unit tests.

# (## Documentation)
For this pr I made the routes and implementation based on the current
get and create methods. The unit tests were not able to handle this test
due to the mock interface in use, which did not allow for effective CRUD
to be tested. I instead created an integration test to match the
existing ones in the test_openai_responses.
2025-06-30 11:28:02 +02:00
ehhuang
d3b60507d7
feat: support auth attributes in inference/responses stores (#2389)
# What does this PR do?
Inference/Response stores now store user attributes when inserting, and
respects them when fetching.

## Test Plan
pytest tests/unit/utils/test_sqlstore.py
2025-06-20 10:24:45 -07:00
Charlie Doern
d12f195f56
feat: drop python 3.10 support (#2469)
# What does this PR do?

dropped python3.10, updated pyproject and dependencies, and also removed
some blocks of code with special handling for enum.StrEnum

Closes #2458

Signed-off-by: Charlie Doern <cdoern@redhat.com>
2025-06-19 12:07:14 +05:30
ehhuang
db2cd9e8f3
feat: support filters in file search (#2472)
# What does this PR do?
Move to use vector_stores.search for file search tool in Responses,
which supports filters.

closes #2435 

## Test Plan
Added e2e test with fitlers.
myenv ❯ llama stack run llama_stack/templates/fireworks/run.yaml

pytest -sv tests/verifications/openai_api/test_responses.py \
  -k 'file_search and filters' \
  --base-url=http://localhost:8321/v1/openai/v1 \
  --model=meta-llama/Llama-3.3-70B-Instruct
2025-06-18 21:50:55 -07:00
grs
7c1998db25
feat: fine grained access control policy (#2264)
This allows a set of rules to be defined for determining access to
resources. The rules are (loosely) based on the cedar policy format.

A rule defines a list of action either to permit or to forbid. It may
specify a principal or a resource that must match for the rule to take
effect. It may also specify a condition, either a 'when' or an 'unless',
with additional constraints as to where the rule applies.

A list of rules is held for each type to be protected and tried in order
to find a match. If a match is found, the request is permitted or
forbidden depening on the type of rule. If no match is found, the
request is denied. If no rules are specified for a given type, a rule
that allows any action as long as the resource attributes match the user
attributes is added (i.e. the previous behaviour is the default.

Some examples in yaml:

```
    model:
    - permit:
      principal: user-1
      actions: [create, read, delete]
      comment: user-1 has full access to all models
    - permit:
      principal: user-2
      actions: [read]
      resource: model-1
      comment: user-2 has read access to model-1 only
    - permit:
      actions: [read]
      when:
        user_in: resource.namespaces
      comment: any user has read access to models with matching attributes
    vector_db:
    - forbid:
      actions: [create, read, delete]
      unless:
        user_in: role::admin
      comment: only user with admin role can use vector_db resources
```

---------

Signed-off-by: Gordon Sim <gsim@redhat.com>
2025-06-03 14:51:12 -07:00
Ben Browning
8bee2954be
feat: Structured output for Responses API (#2324)
# What does this PR do?

This adds the missing `text` parameter to the Responses API that is how
users control structured outputs. All we do with that parameter is map
it to the corresponding chat completion response_format.

## Test Plan

The new unit tests exercise the various permutations allowed for this
property, while a couple of new verification tests actually use it for
real to verify the model outputs are following the format as expected.

Unit tests:

`python -m pytest -s -v
tests/unit/providers/agents/meta_reference/test_openai_responses.py`

Verification tests:

```
llama stack run llama_stack/templates/together/run.yaml
pytest -s -vv 'tests/verifications/openai_api/test_responses.py' \
  --base-url=http://localhost:8321/v1/openai/v1 \
  --model meta-llama/Llama-4-Scout-17B-16E-Instruct
```

Note that the verification tests can only be run with a real Llama Stack
server (as opposed to using the library client via
`--provider=stack:together`) because the Llama Stack python client is
not yet updated to accept this text field.

Signed-off-by: Ben Browning <bbrownin@redhat.com>
2025-06-03 14:43:00 -07:00
Ashwin Bharambe
dbe4e84aca
feat(responses): implement full multi-turn support (#2295)
I think the implementation needs more simplification. Spent way too much
time trying to get the tests pass with models not co-operating :(
Finally had to switch claude-sonnet to get things to pass reliably.

### Test Plan

```
export TAVILY_SEARCH_API_KEY=...
export OPENAI_API_KEY=...

uv run pytest -p no:warnings \
   -s -v tests/verifications/openai_api/test_responses.py \
 --provider=stack:starter \
  --model openai/gpt-4o
```
2025-06-02 15:35:49 -07:00
ehhuang
15b0a67555
feat: add responses input items api (#2239)
# What does this PR do?
TSIA

## Test Plan
added integration and unit tests
2025-05-24 07:05:53 -07:00
ehhuang
5844c2da68
feat: add list responses API (#2233)
# What does this PR do?
This is not part of the official OpenAI API, but we'll use this for the
logs UI.
In order to support more filtering options, I'm adopting the newly
introduced sql store in in place of the kv store.

## Test Plan
Added integration/unit tests.
2025-05-23 13:16:48 -07:00
Ashwin Bharambe
558d109ab7
fix: signature change to match OpenAI SDK (#2237) 2025-05-23 10:59:30 -07:00
Derek Higgins
3339844fda
feat: Add "instructions" support to responses API (#2205)
# What does this PR do?
Add support for "instructions" to the responses API. Instructions
provide a way to swap out system (or developer) messages in new
responses.


## Test Plan
unit tests added

Signed-off-by: Derek Higgins <derekh@redhat.com>
2025-05-20 09:52:10 -07:00
Ben Browning
8e316c9b1e
feat: function tools in OpenAI Responses (#2094)
# What does this PR do?

This is a combination of what was previously 3 separate PRs - #2069,
#2075, and #2083. It turns out all 3 of those are needed to land a
working function calling Responses implementation. The web search
builtin tool was already working, but this wires in support for custom
function calling.

I ended up combining all three into one PR because they all had lots of
merge conflicts, both with each other but also with #1806 that just
landed. And, because landing any of them individually would have only
left a partially working implementation merged.

The new things added here are:
* Storing of input items from previous responses and restoring of those
input items when adding previous responses to the conversation state
* Handling of multiple input item messages roles, not just "user"
messages.
* Support for custom tools passed into the Responses API to enable
function calling outside of just the builtin websearch tool.

Closes #2074
Closes #2080

## Test Plan

### Unit Tests

Several new unit tests were added, and they all pass. Ran via:

```
python -m pytest -s -v tests/unit/providers/agents/meta_reference/test_openai_responses.py
```

### Responses API Verification Tests

I ran our verification run.yaml against multiple providers to ensure we
were getting a decent pass rate. Specifically, I ensured the new custom
tool verification test passed across multiple providers and that the
multi-turn examples passed across at least some of the providers (some
providers struggle with the multi-turn workflows still).

Running the stack setup for verification testing:

```
llama stack run --image-type venv tests/verifications/openai-api-verification-run.yaml
```

Together, passing 100% as an example:

```
pytest -s -v 'tests/verifications/openai_api/test_responses.py' --provider=together-llama-stack
```

## Documentation

We will need to start documenting the OpenAI APIs, but for now the
Responses stuff is still rapidly evolving so delaying that.

---------

Signed-off-by: Derek Higgins <derekh@redhat.com>
Signed-off-by: Ben Browning <bbrownin@redhat.com>
Co-authored-by: Derek Higgins <derekh@redhat.com>
Co-authored-by: Ashwin Bharambe <ashwin.bharambe@gmail.com>
2025-05-13 11:29:15 -07:00
Sébastien Han
80c349965f
chore(refact): move paginate_records fn outside of datasetio (#2137)
# What does this PR do?

Move under utils.

Signed-off-by: Sébastien Han <seb@redhat.com>
2025-05-12 10:56:14 -07:00
Sébastien Han
c91e3552a3
feat: implementation for agent/session list and describe (#1606)
Create a new agent:

```
curl --request POST \
  --url http://localhost:8321/v1/agents \
  --header 'Accept: application/json' \
  --header 'Content-Type: application/json' \
  --data '{
  "agent_config": {
    "sampling_params": {
      "strategy": {
        "type": "greedy"
      },
      "max_tokens": 0,
      "repetition_penalty": 1
    },
    "input_shields": [
      "string"
    ],
    "output_shields": [
      "string"
    ],
    "toolgroups": [
      "string"
    ],
    "client_tools": [
      {
        "name": "string",
        "description": "string",
        "parameters": [
          {
            "name": "string",
            "parameter_type": "string",
            "description": "string",
            "required": true,
            "default": null
          }
        ],
        "metadata": {
          "property1": null,
          "property2": null
        }
      }
    ],
    "tool_choice": "auto",
    "tool_prompt_format": "json",
    "tool_config": {
      "tool_choice": "auto",
      "tool_prompt_format": "json",
      "system_message_behavior": "append"
    },
    "max_infer_iters": 10,
    "model": "string",
    "instructions": "string",
    "enable_session_persistence": false,
    "response_format": {
      "type": "json_schema",
      "json_schema": {
        "property1": null,
        "property2": null
      }
    }
  }
}'
```

Get agent:

```
curl http://127.0.0.1:8321/v1/agents/9abad4ab-2c77-45f9-9d16-46b79d2bea1f
{"agent_id":"9abad4ab-2c77-45f9-9d16-46b79d2bea1f","agent_config":{"sampling_params":{"strategy":{"type":"greedy"},"max_tokens":0,"repetition_penalty":1.0},"input_shields":["string"],"output_shields":["string"],"toolgroups":["string"],"client_tools":[{"name":"string","description":"string","parameters":[{"name":"string","parameter_type":"string","description":"string","required":true,"default":null}],"metadata":{"property1":null,"property2":null}}],"tool_choice":"auto","tool_prompt_format":"json","tool_config":{"tool_choice":"auto","tool_prompt_format":"json","system_message_behavior":"append"},"max_infer_iters":10,"model":"string","instructions":"string","enable_session_persistence":false,"response_format":{"type":"json_schema","json_schema":{"property1":null,"property2":null}}},"created_at":"2025-03-12T16:18:28.369144Z"}%
```

List agents:

```
curl http://127.0.0.1:8321/v1/agents|jq
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1680  100  1680    0     0   498k      0 --:--:-- --:--:-- --:--:--  546k
{
  "data": [
    {
      "agent_id": "9abad4ab-2c77-45f9-9d16-46b79d2bea1f",
      "agent_config": {
        "sampling_params": {
          "strategy": {
            "type": "greedy"
          },
          "max_tokens": 0,
          "repetition_penalty": 1.0
        },
        "input_shields": [
          "string"
        ],
        "output_shields": [
          "string"
        ],
        "toolgroups": [
          "string"
        ],
        "client_tools": [
          {
            "name": "string",
            "description": "string",
            "parameters": [
              {
                "name": "string",
                "parameter_type": "string",
                "description": "string",
                "required": true,
                "default": null
              }
            ],
            "metadata": {
              "property1": null,
              "property2": null
            }
          }
        ],
        "tool_choice": "auto",
        "tool_prompt_format": "json",
        "tool_config": {
          "tool_choice": "auto",
          "tool_prompt_format": "json",
          "system_message_behavior": "append"
        },
        "max_infer_iters": 10,
        "model": "string",
        "instructions": "string",
        "enable_session_persistence": false,
        "response_format": {
          "type": "json_schema",
          "json_schema": {
            "property1": null,
            "property2": null
          }
        }
      },
      "created_at": "2025-03-12T16:18:28.369144Z"
    },
    {
      "agent_id": "a6643aaa-96dd-46db-a405-333dc504b168",
      "agent_config": {
        "sampling_params": {
          "strategy": {
            "type": "greedy"
          },
          "max_tokens": 0,
          "repetition_penalty": 1.0
        },
        "input_shields": [
          "string"
        ],
        "output_shields": [
          "string"
        ],
        "toolgroups": [
          "string"
        ],
        "client_tools": [
          {
            "name": "string",
            "description": "string",
            "parameters": [
              {
                "name": "string",
                "parameter_type": "string",
                "description": "string",
                "required": true,
                "default": null
              }
            ],
            "metadata": {
              "property1": null,
              "property2": null
            }
          }
        ],
        "tool_choice": "auto",
        "tool_prompt_format": "json",
        "tool_config": {
          "tool_choice": "auto",
          "tool_prompt_format": "json",
          "system_message_behavior": "append"
        },
        "max_infer_iters": 10,
        "model": "string",
        "instructions": "string",
        "enable_session_persistence": false,
        "response_format": {
          "type": "json_schema",
          "json_schema": {
            "property1": null,
            "property2": null
          }
        }
      },
      "created_at": "2025-03-12T16:17:12.811273Z"
    }
  ]
}
```

Create sessions:

```
curl --request POST \
  --url http://localhost:8321/v1/agents/{agent_id}/session \
  --header 'Accept: application/json' \
  --header 'Content-Type: application/json' \
  --data '{
  "session_name": "string"
}'
```

List sessions:

```
 curl http://127.0.0.1:8321/v1/agents/9abad4ab-2c77-45f9-9d16-46b79d2bea1f/sessions|jq
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   263  100   263    0     0  90099      0 --:--:-- --:--:-- --:--:--  128k
[
  {
    "session_id": "2b15c4fc-e348-46c1-ae32-f6d424441ac1",
    "session_name": "string",
    "turns": [],
    "started_at": "2025-03-12T17:19:17.784328"
  },
  {
    "session_id": "9432472d-d483-4b73-b682-7b1d35d64111",
    "session_name": "string",
    "turns": [],
    "started_at": "2025-03-12T17:19:19.885834"
  }
]
```

Signed-off-by: Sébastien Han <seb@redhat.com>
2025-05-07 14:49:23 +02:00
Ashwin Bharambe
272d3359ee
fix: remove code interpeter implementation (#2087)
# What does this PR do?

The builtin implementation of code interpreter is not robust and has a
really weak sandboxing shell (the `bubblewrap` container). Given the
availability of better MCP code interpreter servers coming up, we should
use them instead of baking an implementation into the Stack and
expanding the vulnerability surface to the rest of the Stack.

This PR only does the removal. We will add examples with how to
integrate with MCPs in subsequent ones.

## Test Plan

Existing tests.
2025-05-01 14:35:08 -07:00
Ihar Hrachyshka
9e6561a1ec
chore: enable pyupgrade fixes (#1806)
# What does this PR do?

The goal of this PR is code base modernization.

Schema reflection code needed a minor adjustment to handle UnionTypes
and collections.abc.AsyncIterator. (Both are preferred for latest Python
releases.)

Note to reviewers: almost all changes here are automatically generated
by pyupgrade. Some additional unused imports were cleaned up. The only
change worth of note can be found under `docs/openapi_generator` and
`llama_stack/strong_typing/schema.py` where reflection code was updated
to deal with "newer" types.

Signed-off-by: Ihar Hrachyshka <ihar.hrachyshka@gmail.com>
2025-05-01 14:23:50 -07:00
Derek Higgins
64829947d0
feat: Add temperature support to responses API (#2065)
# What does this PR do?
Add support for the temperature to the responses API 


## Test Plan
Manually tested simple case
unit tests added for simple case and tool calls

Signed-off-by: Derek Higgins <derekh@redhat.com>
2025-05-01 11:47:58 -07:00
Ben Browning
8dfce2f596
feat: OpenAI Responses API (#1989)
# What does this PR do?

This provides an initial [OpenAI Responses
API](https://platform.openai.com/docs/api-reference/responses)
implementation. The API is not yet complete, and this is more a
proof-of-concept to show how we can store responses in our key-value
stores and use them to support the Responses API concepts like
`previous_response_id`.

## Test Plan

I've added a new
`tests/integration/openai_responses/test_openai_responses.py` as part of
a test-driven development for this new API. I'm only testing this
locally with the remote-vllm provider for now, but it should work with
any of our inference providers since the only API it requires out of the
inference provider is the `openai_chat_completion` endpoint.

```
VLLM_URL="http://localhost:8000/v1" \
INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct" \
llama stack build --template remote-vllm --image-type venv --run
```

```
LLAMA_STACK_CONFIG="http://localhost:8321" \
python -m pytest -v \
  tests/integration/openai_responses/test_openai_responses.py \
  --text-model "meta-llama/Llama-3.2-3B-Instruct"
 ```

---------

Signed-off-by: Ben Browning <bbrownin@redhat.com>
Co-authored-by: Ashwin Bharambe <ashwin.bharambe@gmail.com>
2025-04-28 14:06:00 -07:00
Sébastien Han
10882bf478
chore: remove unused tempdir in agent (#1896)
# What does this PR do?

The usage of the tempdir was removed in
094eb6a5ae.

Signed-off-by: Sébastien Han <seb@redhat.com>
2025-04-09 09:43:48 +02:00
ehhuang
b7a9c45477
chore: deprecate ToolResponseMessage in agent.resume API (#1566)
# Summary:
closes #1431 

# Test Plan:
LLAMA_STACK_CONFIG=fireworks pytest -s -v
tests/integration/agents/test_agents.py --safety-shield
meta-llama/Llama-Guard-3-8B --text-model
meta-llama/Llama-3.1-8B-Instruct
2025-03-12 12:10:21 -07:00
Xi Yan
43044f29e2
fix: fix llama stack run with missing agent impl (#1559)
# What does this PR do?

- recent merge https://github.com/meta-llama/llama-stack/pull/1410
introduce error
```
ValueError: Provider meta-reference (Api.agents) does not implement the following methods:
[('list_agent_sessions', 'not_actually_implemented'), ('list_agents', 'not_actually_implemented')]
```

[//]: # (If resolving an issue, uncomment and update the line below)
[//]: # (Closes #[issue-number])

## Test Plan
```
llama stack run
```

```
LLAMA_STACK_CONFIG=fireworks pytest -v tests/integration/agents/test_agents.py --text-model meta-llama/Llama-3.3-70B-Instruct
```

1379530386

[//]: # (## Documentation)
2025-03-11 11:22:22 -07:00
ehhuang
6cf79437b3
feat: support ClientTool output metadata (#1426)
# Summary:
Client side change in
https://github.com/meta-llama/llama-stack-client-python/pull/180
Changes the resume_turn API to accept `ToolResponse` instead of
`ToolResponseMessage`:
1. `ToolResponse` contains `metadata`
2. `ToolResponseMessage` is a concept for model inputs. Here we are just
submitting the outputs of tool execution.

# Test Plan:
Ran integration tests with newly added test using client tool with
metadata

LLAMA_STACK_CONFIG=fireworks pytest -s -v
tests/integration/agents/test_agents.py --safety-shield
meta-llama/Llama-Guard-3-8B --record-responses
2025-03-05 14:30:27 -08:00
Xi Yan
158b6dc404
chore: deprecate allow_turn_resume (#1377)
# What does this PR do?

- Deprecate allow_turn_resume flag as this is used for staying backward
compat.
- Closes https://github.com/meta-llama/llama-stack/issues/1363

[//]: # (If resolving an issue, uncomment and update the line below)
[//]: # (Closes #[issue-number])

## Test Plan
```
LLAMA_STACK_CONFIG=fireworks pytest -v tests/api/agents/test_agents.py --inference-model "meta-llama/Llama-3.3-70B-Instruct" --record-responses
```

<img width="1054" alt="image"
src="https://github.com/user-attachments/assets/d31de2d4-0953-41e1-a71a-7e1579fa351a"
/>


[//]: # (## Documentation)
2025-03-04 12:22:11 -08:00
Xi Yan
fc5aff3ccf
feat: ability to retrieve agents session, turn, step by ids (#1286)
# What does this PR do?

- Fix up rotten implementation for retrieving agent's Session, Turn,
Step with actual working implementation.

- Update `getting_started` notebook with retrieving by agent session_id.
https://github.com/meta-llama/llama-stack/blob/export_agent_dataset/docs/getting_started.ipynb

[//]: # (If resolving an issue, uncomment and update the line below)
[//]: # (Closes #[issue-number])

## Test Plan

Test with script:
https://gist.github.com/yanxi0830/657cecee8f1f0e39d322963d9c0f598e

<img width="503" alt="image"
src="https://github.com/user-attachments/assets/5ea9bc33-83d1-40bc-98e1-b68393158387"
/>


[//]: # (## Documentation)
2025-02-27 09:45:14 -08:00
Ashwin Bharambe
45ffe87d7c Kill noise from test output 2025-02-21 15:37:23 -08:00
Xi Yan
0fe071764f
feat(1/n): api: unify agents for handling server & client tools (#1178)
# Problem

Our current Agent framework has discrepancies in definition on how we
handle server side and client side tools.

1. Server Tools: a single Turn is returned including `ToolExecutionStep`
in agenst
2. Client Tools: `create_agent_turn` is called in loop with client agent
lib yielding the agent chunk

ad6ffc63df/src/llama_stack_client/lib/agents/agent.py (L186-L211)

This makes it inconsistent to work with server & client tools. It also
complicates the logs to telemetry to get information about agents turn /
history for observability.

#### Principle
The same `turn_id` should be used to represent the steps required to
complete a user message including client tools.

## Solution

1. `AgentTurnResponseEventType.turn_awaiting_input` status to indicate
that the current turn is not completed, and awaiting tool input
2. `continue_agent_turn` endpoint to update agent turn with client's
tool response.


# What does this PR do?
- Skeleton API as example

## Test Plan
[Describe the tests you ran to verify your changes with result
summaries. *Provide clear instructions so the plan can be easily
re-executed.*]

- Just API update, no functionality change
```
llama stack run + client-sdk test
```

<img width="842" alt="image"
src="https://github.com/user-attachments/assets/7ac56b5f-f424-4632-9476-7e0f57555bc3"
/>


[//]: # (## Documentation)
2025-02-21 11:48:27 -08:00
Sébastien Han
418645696a
fix: improve signal handling and update dependencies (#1044)
# What does this PR do?
This commit enhances the signal handling mechanism in the server by
improving the `handle_signal` (previously handle_sigint) function. It
now properly retrieves the signal name, ensuring clearer logging when a
termination signal is received. Additionally, it cancels all running
tasks and waits for their completion before stopping the event loop,
allowing for a more graceful shutdown. Support for handling
SIGTERM has also been added alongside SIGINT.

Before the changes, handle_sigint used asyncio.run(run_shutdown()).
However, asyncio.run() is meant to start a new event loop, and calling
it inside an existing one (like when running Uvicorn) raises an error.
The fix replaces asyncio.run(run_shutdown()) with an async function
scheduled on the existing loop using loop.create_task(shutdown()). This
ensures that the shutdown coroutine runs within the current event loop
instead of trying to create a new one.

Furthermore, this commit updates the project dependencies. `fastapi` and
`uvicorn` have been added to the development dependencies in
`pyproject.toml` and `uv.lock`, ensuring that the necessary packages are
available for development and execution.

Closes: https://github.com/meta-llama/llama-stack/issues/1043
Signed-off-by: Sébastien Han <seb@redhat.com>

[//]: # (If resolving an issue, uncomment and update the line below)
[//]: # (Closes #[issue-number])

## Test Plan

Run a server and send SIGINT:

```
INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct" python -m llama_stack.distribution.server.server --yaml-config ./llama_stack/templates/ollama/run.yaml
Using config file: llama_stack/templates/ollama/run.yaml
Run configuration:
apis:
- agents
- datasetio
- eval
- inference
- safety
- scoring
- telemetry
- tool_runtime
- vector_io
container_image: null
datasets: []
eval_tasks: []
image_name: ollama
metadata_store:
  db_path: /Users/leseb/.llama/distributions/ollama/registry.db
  namespace: null
  type: sqlite
models:
- metadata: {}
  model_id: meta-llama/Llama-3.2-3B-Instruct
  model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType
  - llm
  provider_id: ollama
  provider_model_id: null
- metadata:
    embedding_dimension: 384
  model_id: all-MiniLM-L6-v2
  model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType
  - embedding
  provider_id: sentence-transformers
  provider_model_id: null
providers:
  agents:
  - config:
      persistence_store:
        db_path: /Users/leseb/.llama/distributions/ollama/agents_store.db
        namespace: null
        type: sqlite
    provider_id: meta-reference
    provider_type: inline::meta-reference
  datasetio:
  - config: {}
    provider_id: huggingface
    provider_type: remote::huggingface
  - config: {}
    provider_id: localfs
    provider_type: inline::localfs
  eval:
  - config: {}
    provider_id: meta-reference
    provider_type: inline::meta-reference
  inference:
  - config:
      url: http://localhost:11434
    provider_id: ollama
    provider_type: remote::ollama
  - config: {}
    provider_id: sentence-transformers
    provider_type: inline::sentence-transformers
  safety:
  - config: {}
    provider_id: llama-guard
    provider_type: inline::llama-guard
  scoring:
  - config: {}
    provider_id: basic
    provider_type: inline::basic
  - config: {}
    provider_id: llm-as-judge
    provider_type: inline::llm-as-judge
  - config:
      openai_api_key: '********'
    provider_id: braintrust
    provider_type: inline::braintrust
  telemetry:
  - config:
      service_name: llama-stack
      sinks: console,sqlite
      sqlite_db_path: /Users/leseb/.llama/distributions/ollama/trace_store.db
    provider_id: meta-reference
    provider_type: inline::meta-reference
  tool_runtime:
  - config:
      api_key: '********'
      max_results: 3
    provider_id: brave-search
    provider_type: remote::brave-search
  - config:
      api_key: '********'
      max_results: 3
    provider_id: tavily-search
    provider_type: remote::tavily-search
  - config: {}
    provider_id: code-interpreter
    provider_type: inline::code-interpreter
  - config: {}
    provider_id: rag-runtime
    provider_type: inline::rag-runtime
  vector_io:
  - config:
      kvstore:
        db_path: /Users/leseb/.llama/distributions/ollama/faiss_store.db
        namespace: null
        type: sqlite
    provider_id: faiss
    provider_type: inline::faiss
scoring_fns: []
server:
  port: 8321
  tls_certfile: null
  tls_keyfile: null
shields: []
tool_groups:
- args: null
  mcp_endpoint: null
  provider_id: tavily-search
  toolgroup_id: builtin::websearch
- args: null
  mcp_endpoint: null
  provider_id: rag-runtime
  toolgroup_id: builtin::rag
- args: null
  mcp_endpoint: null
  provider_id: code-interpreter
  toolgroup_id: builtin::code_interpreter
vector_dbs: []
version: '2'

INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:213: Resolved 31 providers
INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215:  inner-inference => ollama
INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215:  inner-inference => sentence-transformers
INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215:  models => __routing_table__
INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215:  inference => __autorouted__
INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215:  inner-vector_io => faiss
INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215:  inner-safety => llama-guard
INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215:  shields => __routing_table__
INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215:  safety => __autorouted__
INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215:  vector_dbs => __routing_table__
INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215:  vector_io => __autorouted__
INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215:  inner-tool_runtime => brave-search
INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215:  inner-tool_runtime => tavily-search
INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215:  inner-tool_runtime => code-interpreter
INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215:  inner-tool_runtime => rag-runtime
INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215:  tool_groups => __routing_table__
INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215:  tool_runtime => __autorouted__
INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215:  agents => meta-reference
INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215:  inner-datasetio => huggingface
INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215:  inner-datasetio => localfs
INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215:  datasets => __routing_table__
INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215:  datasetio => __autorouted__
INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215:  telemetry => meta-reference
INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215:  inner-scoring => basic
INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215:  inner-scoring => llm-as-judge
INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215:  inner-scoring => braintrust
INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215:  scoring_functions => __routing_table__
INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215:  scoring => __autorouted__
INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215:  inner-eval => meta-reference
INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215:  eval_tasks => __routing_table__
INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215:  eval => __autorouted__
INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215:  inspect => __builtin__
INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:216: 
INFO 2025-02-12 10:21:03,723 llama_stack.providers.remote.inference.ollama.ollama:148: checking connectivity to Ollama at `http://localhost:11434`...
INFO 2025-02-12 10:21:03,734 httpx:1740: HTTP Request: GET http://localhost:11434/api/ps "HTTP/1.1 200 OK"
INFO 2025-02-12 10:21:03,843 faiss.loader:148: Loading faiss.
INFO 2025-02-12 10:21:03,865 faiss.loader:150: Successfully loaded faiss.
INFO 2025-02-12 10:21:03,868 faiss:173: Failed to load GPU Faiss: name 'GpuIndexIVFFlat' is not defined. Will not load constructor refs for GPU indexes.
Warning: `bwrap` is not available. Code interpreter tool will not work correctly.
INFO 2025-02-12 10:21:04,315 datasets:54: PyTorch version 2.6.0 available.
INFO 2025-02-12 10:21:04,556 httpx:1740: HTTP Request: GET http://localhost:11434/api/ps "HTTP/1.1 200 OK"
INFO 2025-02-12 10:21:04,557 llama_stack.providers.utils.inference.embedding_mixin:42: Loading sentence transformer for all-MiniLM-L6-v2...
INFO 2025-02-12 10:21:07,202 sentence_transformers.SentenceTransformer:210: Use pytorch device_name: mps
INFO 2025-02-12 10:21:07,202 sentence_transformers.SentenceTransformer:218: Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO 2025-02-12 10:21:09,500 llama_stack.distribution.stack:102: Models: all-MiniLM-L6-v2 served by sentence-transformers
INFO 2025-02-12 10:21:09,500 llama_stack.distribution.stack:102: Models: meta-llama/Llama-3.2-3B-Instruct served by ollama
INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:102: Scoring_fns: basic::equality served by basic
INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:102: Scoring_fns: basic::regex_parser_multiple_choice_answer served by basic
INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:102: Scoring_fns: basic::subset_of served by basic
INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:102: Scoring_fns: braintrust::answer-correctness served by braintrust
INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:102: Scoring_fns: braintrust::answer-relevancy served by braintrust
INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:102: Scoring_fns: braintrust::answer-similarity served by braintrust
INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:102: Scoring_fns: braintrust::context-entity-recall served by braintrust
INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:102: Scoring_fns: braintrust::context-precision served by braintrust
INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:102: Scoring_fns: braintrust::context-recall served by braintrust
INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:102: Scoring_fns: braintrust::context-relevancy served by braintrust
INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:102: Scoring_fns: braintrust::factuality served by braintrust
INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:102: Scoring_fns: braintrust::faithfulness served by braintrust
INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:102: Scoring_fns: llm-as-judge::405b-simpleqa served by llm-as-judge
INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:102: Scoring_fns: llm-as-judge::base served by llm-as-judge
INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:102: Tool_groups: builtin::code_interpreter served by code-interpreter
INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:102: Tool_groups: builtin::rag served by rag-runtime
INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:102: Tool_groups: builtin::websearch served by tavily-search
INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:106: 
Serving API eval
 POST /v1/eval/tasks/{task_id}/evaluations
 DELETE /v1/eval/tasks/{task_id}/jobs/{job_id}
 GET /v1/eval/tasks/{task_id}/jobs/{job_id}/result
 GET /v1/eval/tasks/{task_id}/jobs/{job_id}
 POST /v1/eval/tasks/{task_id}/jobs
Serving API agents
 POST /v1/agents
 POST /v1/agents/{agent_id}/session
 POST /v1/agents/{agent_id}/session/{session_id}/turn
 DELETE /v1/agents/{agent_id}
 DELETE /v1/agents/{agent_id}/session/{session_id}
 GET /v1/agents/{agent_id}/session/{session_id}
 GET /v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}
 GET /v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}
Serving API scoring_functions
 GET /v1/scoring-functions/{scoring_fn_id}
 GET /v1/scoring-functions
 POST /v1/scoring-functions
Serving API safety
 POST /v1/safety/run-shield
Serving API inspect
 GET /v1/health
 GET /v1/inspect/providers
 GET /v1/inspect/routes
 GET /v1/version
Serving API tool_runtime
 POST /v1/tool-runtime/invoke
 GET /v1/tool-runtime/list-tools
 POST /v1/tool-runtime/rag-tool/insert
 POST /v1/tool-runtime/rag-tool/query
Serving API datasetio
 POST /v1/datasetio/rows
 GET /v1/datasetio/rows
Serving API shields
 GET /v1/shields/{identifier}
 GET /v1/shields
 POST /v1/shields
Serving API eval_tasks
 GET /v1/eval-tasks/{eval_task_id}
 GET /v1/eval-tasks
 POST /v1/eval-tasks
Serving API models
 GET /v1/models/{model_id}
 GET /v1/models
 POST /v1/models
 DELETE /v1/models/{model_id}
Serving API datasets
 GET /v1/datasets/{dataset_id}
 GET /v1/datasets
 POST /v1/datasets
 DELETE /v1/datasets/{dataset_id}
Serving API vector_io
 POST /v1/vector-io/insert
 POST /v1/vector-io/query
Serving API inference
 POST /v1/inference/chat-completion
 POST /v1/inference/completion
 POST /v1/inference/embeddings
Serving API tool_groups
 GET /v1/tools/{tool_name}
 GET /v1/toolgroups/{toolgroup_id}
 GET /v1/toolgroups
 GET /v1/tools
 POST /v1/toolgroups
 DELETE /v1/toolgroups/{toolgroup_id}
Serving API vector_dbs
 GET /v1/vector-dbs/{vector_db_id}
 GET /v1/vector-dbs
 POST /v1/vector-dbs
 DELETE /v1/vector-dbs/{vector_db_id}
Serving API scoring
 POST /v1/scoring/score
 POST /v1/scoring/score-batch
Serving API telemetry
 GET /v1/telemetry/traces/{trace_id}/spans/{span_id}
 GET /v1/telemetry/spans/{span_id}/tree
 GET /v1/telemetry/traces/{trace_id}
 POST /v1/telemetry/events
 GET /v1/telemetry/spans
 GET /v1/telemetry/traces
 POST /v1/telemetry/spans/export

Listening on ['::', '0.0.0.0']:5001
INFO:     Started server process [65372]
INFO:     Waiting for application startup.
INFO:     ASGI 'lifespan' protocol appears unsupported.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://['::', '0.0.0.0']:5001 (Press CTRL+C to quit)
^CINFO:     Shutting down
INFO:     Finished server process [65372]
Received signal SIGINT (2). Exiting gracefully...
INFO 2025-02-12 10:21:11,215 __main__:151: Shutting down ModelsRoutingTable
INFO 2025-02-12 10:21:11,216 __main__:151: Shutting down InferenceRouter
INFO 2025-02-12 10:21:11,216 __main__:151: Shutting down ShieldsRoutingTable
INFO 2025-02-12 10:21:11,216 __main__:151: Shutting down SafetyRouter
INFO 2025-02-12 10:21:11,216 __main__:151: Shutting down VectorDBsRoutingTable
INFO 2025-02-12 10:21:11,216 __main__:151: Shutting down VectorIORouter
INFO 2025-02-12 10:21:11,216 __main__:151: Shutting down ToolGroupsRoutingTable
INFO 2025-02-12 10:21:11,216 __main__:151: Shutting down ToolRuntimeRouter
INFO 2025-02-12 10:21:11,216 __main__:151: Shutting down MetaReferenceAgentsImpl
INFO 2025-02-12 10:21:11,216 __main__:151: Shutting down DatasetsRoutingTable
INFO 2025-02-12 10:21:11,216 __main__:151: Shutting down DatasetIORouter
INFO 2025-02-12 10:21:11,216 __main__:151: Shutting down TelemetryAdapter
INFO 2025-02-12 10:21:11,216 __main__:151: Shutting down ScoringFunctionsRoutingTable
INFO 2025-02-12 10:21:11,216 __main__:151: Shutting down ScoringRouter
INFO 2025-02-12 10:21:11,216 __main__:151: Shutting down EvalTasksRoutingTable
INFO 2025-02-12 10:21:11,216 __main__:151: Shutting down EvalRouter
INFO 2025-02-12 10:21:11,216 __main__:151: Shutting down DistributionInspectImpl
```

[//]: # (## Documentation)
[//]: # (- [ ] Added a Changelog entry if the change is significant)

Signed-off-by: Sébastien Han <seb@redhat.com>
2025-02-13 08:07:59 -08:00
ehhuang
96c88397da
fix: agent config validation (#1053)
Summary:

Fixes AgentConfig init bug introduced with ToolConfig.

Namely, the below doesn't work
```
    agent_config = AgentConfig(
        **common_params,
        tool_config=ToolConfig(
            tool_choice="required",
        ),
    )
```
bvecause tool_choice was defaulted to 'auto' leading to validation check
failing.

Test Plan:

added unittests

LLAMA_STACK_CONFIG=fireworks pytest -s -v tests/client-sdk/
--safety-shield meta-llama/Llama-Guard-3-8B
2025-02-11 14:48:42 -08:00
ehhuang
3922999118
sys_prompt support in Agent (#938)
# What does this PR do?

The current default system prompt for llama3.2 tends to overindex on
tool calling and doesn't work well when the prompt does not require tool
calling.

This PR adds an option to override the default system prompt, and
organizes tool-related configs into a new config object.

- [ ] Addresses issue (#issue)


## Test Plan


LLAMA_STACK_CONFIG=together pytest
\-\-inference\-model=meta\-llama/Llama\-3\.3\-70B\-Instruct -s -v
tests/client-sdk/agents/test_agents.py::test_override_system_message_behavior


## Sources

Please link relevant resources if necessary.


## Before submitting

- [ ] This PR fixes a typo or improves the docs (you can dismiss the
other checks if that's the case).
- [ ] Ran pre-commit to handle lint / formatting issues.
- [ ] Read the [contributor
guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md),
      Pull Request section?
- [ ] Updated relevant documentation.
- [ ] Wrote necessary unit or integration tests.
2025-02-05 21:11:32 -08:00
Yuan Tang
34ab7a3b6c
Fix precommit check after moving to ruff (#927)
Lint check in main branch is failing. This fixes the lint check after we
moved to ruff in https://github.com/meta-llama/llama-stack/pull/921. We
need to move to a `ruff.toml` file as well as fixing and ignoring some
additional checks.

Signed-off-by: Yuan Tang <terrytangyuan@gmail.com>
2025-02-02 06:46:45 -08:00
Ashwin Bharambe
1a7490470a
[memory refactor][3/n] Introduce RAGToolRuntime as a specialized sub-protocol (#832)
See https://github.com/meta-llama/llama-stack/issues/827 for the broader
design.

Third part:
- we need to make `tool_runtime.rag_tool.query_context()` and
`tool_runtime.rag_tool.insert_documents()` methods work smoothly with
complete type safety. To that end, we introduce a sub-resource path
`tool-runtime/rag-tool/` and make changes to the resolver to make things
work.
- the PR updates the agents implementation to directly call these typed
APIs for memory accesses rather than going through the complex, untyped
"invoke_tool" API. the code looks much nicer and simpler (expectedly.)
- there are a number of hacks in the server resolver implementation
still, we will live with some and fix some

Note that we must make sure the client SDKs are able to handle this
subresource complexity also. Stainless has support for subresources, so
this should be possible but beware.

## Test Plan

Our RAG test is sad (doesn't actually test for actual RAG output) but I
verified that the implementation works. I will work on fixing the RAG
test afterwards.

```bash
pytest -s -v tests/agents/test_agents.py -k "rag and together" --safety-shield=meta-llama/Llama-Guard-3-8B
```
2025-01-22 10:04:16 -08:00
Dinesh Yeduguru
7fb2c1c48d
More idiomatic REST API (#765)
# What does this PR do?

This PR changes our API to follow more idiomatic REST API approaches of
having paths being resources and methods indicating the action being
performed.

Changes made to generator:
1) removed the prefix check of "get" as its not required and is actually
needed for other method types too
2) removed _ check on path since variables can have "_"



## Test Plan

LLAMA_STACK_BASE_URL=http://localhost:5000 pytest -v
tests/client-sdk/agents/test_agents.py
2025-01-15 13:20:09 -08:00
Dinesh Yeduguru
a5c57cd381
agents to use tools api (#673)
# What does this PR do?

PR #639 introduced the notion of Tools API and ability to invoke tools
through API just as any resource. This PR changes the Agents to start
using the Tools API to invoke tools. Major changes include:
1) Ability to specify tool groups with AgentConfig
2) Agent gets the corresponding tool definitions for the specified tools
and pass along to the model
3) Attachements are now named as Documents and their behavior is mostly
unchanged from user perspective
4) You can specify args that can be injected to a tool call through
Agent config. This is especially useful in case of memory tool, where
you want the tool to operate on a specific memory bank.
5) You can also register tool groups with args, which lets the agent
inject these as well into the tool call.
6) All tests have been migrated to use new tools API and fixtures
including client SDK tests
7) Telemetry just works with tools API because of our trace protocol
decorator


## Test Plan
```
pytest -s -v -k fireworks llama_stack/providers/tests/agents/test_agents.py  \
   --safety-shield=meta-llama/Llama-Guard-3-8B \
   --inference-model=meta-llama/Llama-3.1-8B-Instruct

pytest -s -v -k together  llama_stack/providers/tests/tools/test_tools.py \
   --safety-shield=meta-llama/Llama-Guard-3-8B \
   --inference-model=meta-llama/Llama-3.1-8B-Instruct

LLAMA_STACK_CONFIG="/Users/dineshyv/.llama/distributions/llamastack-together/together-run.yaml" pytest -v tests/client-sdk/agents/test_agents.py
```
run.yaml:
https://gist.github.com/dineshyv/0365845ad325e1c2cab755788ccc5994

Notebook:
https://colab.research.google.com/drive/1ck7hXQxRl6UvT-ijNRZ-gMZxH1G3cN2d?usp=sharing
2025-01-08 19:01:00 -08:00
Xi Yan
3c72c034e6
[remove import *] clean up import *'s (#689)
# What does this PR do?

- as title, cleaning up `import *`'s
- upgrade tests to make them more robust to bad model outputs
- remove import *'s in llama_stack/apis/* (skip __init__ modules)
<img width="465" alt="image"
src="https://github.com/user-attachments/assets/d8339c13-3b40-4ba5-9c53-0d2329726ee2"
/>

- run `sh run_openapi_generator.sh`, no types gets affected

## Test Plan

### Providers Tests

**agents**
```
pytest -v -s llama_stack/providers/tests/agents/test_agents.py -m "together" --safety-shield meta-llama/Llama-Guard-3-8B --inference-model meta-llama/Llama-3.1-405B-Instruct-FP8
```

**inference**
```bash
# meta-reference
torchrun $CONDA_PREFIX/bin/pytest -v -s -k "meta_reference" --inference-model="meta-llama/Llama-3.1-8B-Instruct" ./llama_stack/providers/tests/inference/test_text_inference.py
torchrun $CONDA_PREFIX/bin/pytest -v -s -k "meta_reference" --inference-model="meta-llama/Llama-3.2-11B-Vision-Instruct" ./llama_stack/providers/tests/inference/test_vision_inference.py

# together
pytest -v -s -k "together" --inference-model="meta-llama/Llama-3.1-8B-Instruct" ./llama_stack/providers/tests/inference/test_text_inference.py
pytest -v -s -k "together" --inference-model="meta-llama/Llama-3.2-11B-Vision-Instruct" ./llama_stack/providers/tests/inference/test_vision_inference.py

pytest ./llama_stack/providers/tests/inference/test_prompt_adapter.py 
```

**safety**
```
pytest -v -s llama_stack/providers/tests/safety/test_safety.py -m together --safety-shield meta-llama/Llama-Guard-3-8B
```

**memory**
```
pytest -v -s llama_stack/providers/tests/memory/test_memory.py -m "sentence_transformers" --env EMBEDDING_DIMENSION=384
```

**scoring**
```
pytest -v -s -m llm_as_judge_scoring_together_inference llama_stack/providers/tests/scoring/test_scoring.py --judge-model meta-llama/Llama-3.2-3B-Instruct
pytest -v -s -m basic_scoring_together_inference llama_stack/providers/tests/scoring/test_scoring.py
pytest -v -s -m braintrust_scoring_together_inference llama_stack/providers/tests/scoring/test_scoring.py
```


**datasetio**
```
pytest -v -s -m localfs llama_stack/providers/tests/datasetio/test_datasetio.py
pytest -v -s -m huggingface llama_stack/providers/tests/datasetio/test_datasetio.py
```


**eval**
```
pytest -v -s -m meta_reference_eval_together_inference llama_stack/providers/tests/eval/test_eval.py
pytest -v -s -m meta_reference_eval_together_inference_huggingface_datasetio llama_stack/providers/tests/eval/test_eval.py
```

### Client-SDK Tests
```
LLAMA_STACK_BASE_URL=http://localhost:5000 pytest -v ./tests/client-sdk
```

### llama-stack-apps
```
PORT=5000
LOCALHOST=localhost

python -m examples.agents.hello $LOCALHOST $PORT
python -m examples.agents.inflation $LOCALHOST $PORT
python -m examples.agents.podcast_transcript $LOCALHOST $PORT
python -m examples.agents.rag_as_attachments $LOCALHOST $PORT
python -m examples.agents.rag_with_memory_bank $LOCALHOST $PORT
python -m examples.safety.llama_guard_demo_mm $LOCALHOST $PORT
python -m examples.agents.e2e_loop_with_custom_tools $LOCALHOST $PORT

# Vision model
python -m examples.interior_design_assistant.app
python -m examples.agent_store.app $LOCALHOST $PORT
```

### CLI
```
which llama
llama model prompt-format -m Llama3.2-11B-Vision-Instruct
llama model list
llama stack list-apis
llama stack list-providers inference

llama stack build --template ollama --image-type conda
```

### Distributions Tests
**ollama**
```
llama stack build --template ollama --image-type conda
ollama run llama3.2:1b-instruct-fp16
llama stack run ./llama_stack/templates/ollama/run.yaml --env INFERENCE_MODEL=meta-llama/Llama-3.2-1B-Instruct
```

**fireworks**
```
llama stack build --template fireworks --image-type conda
llama stack run ./llama_stack/templates/fireworks/run.yaml
```

**together**
```
llama stack build --template together --image-type conda
llama stack run ./llama_stack/templates/together/run.yaml
```

**tgi**
```
llama stack run ./llama_stack/templates/tgi/run.yaml --env TGI_URL=http://0.0.0.0:5009 --env INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
```

## Sources

Please link relevant resources if necessary.


## Before submitting

- [ ] This PR fixes a typo or improves the docs (you can dismiss the
other checks if that's the case).
- [ ] Ran pre-commit to handle lint / formatting issues.
- [ ] Read the [contributor
guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md),
      Pull Request section?
- [ ] Updated relevant documentation.
- [ ] Wrote necessary unit or integration tests.
2024-12-27 15:45:44 -08:00
Ashwin Bharambe
5335393fe3 Avoid deleting temp directory between agent turns
This brings an interesting aspect -- we need to maintain session-level
tempdir state (!) since the model was told there was some resource at a
given location that it needs to maintain
2024-12-08 22:25:37 -08:00
Ashwin Bharambe
cdfc98cf08 add a warning at least for when bwrap is not available for code execution 2024-12-05 20:54:28 -08:00
Dinesh Yeduguru
501e7c9d64
Fix opentelemetry adapter (#510)
# What does this PR do?

This PR fixes some of the issues with our telemetry setup to enable logs
to be delivered to opentelemetry and jaeger. Main fixes
1) Updates the open telemetry provider to use the latest oltp exports
instead of deprected ones.
2) Adds a tracing middleware, which injects traces into each HTTP
request that the server recieves and this is going to be the root trace.
Previously, we did this in the create_dynamic_route method, which is
actually not the actual exectuion flow, but more of a config and this
causes the traces to end prematurely. Through middleware, we plugin the
trace start and end at the right location.
3) We manage our own methods to create traces and spans and this does
not fit well with Opentelemetry SDK since it does not support provide a
way to take in traces and spans that are already created. it expects us
to use the SDK to create them. For now, I have a hacky approach of just
maintaining a map from our internal telemetry objects to the open
telemetry specfic ones. This is not the ideal solution. I will explore
other ways to get around this issue. for now, to have something that
works, i am going to keep this as is.

Addresses: #509
2024-11-22 18:18:11 -08:00
Ashwin Bharambe
694c142b89
Add provider deprecation support; change directory structure (#397)
* Add provider deprecation support; change directory structure

* fix a couple dangling imports

* move the meta_reference safety dir also
2024-11-07 13:04:53 -08:00
Renamed from llama_stack/providers/inline/meta_reference/agents/agents.py (Browse further)