llama-stack/llama_stack/templates/cerebras/run.yaml
Henry Tu 64c6df8392
Cerebras Inference Integration (#265)
Adding Cerebras Inference as an API provider.

## Testing

### Conda
```
$ llama stack build --template cerebras --image-type conda
$ llama stack run ~/.llama/distributions/llamastack-cerebras/cerebras-run.yaml
...
Listening on ['::', '0.0.0.0']:5000
INFO:     Started server process [12443]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://['::', '0.0.0.0']:5000 (Press CTRL+C to quit)
```

### Chat Completion
```
$ curl --location 'http://localhost:5000/alpha/inference/chat-completion' --header 'Content-Type: application/json' --data '{
    "model_id": "meta-llama/Llama-3.1-8B-Instruct",
    "messages": [
        {
            "role": "user",
            "content": "What is the temperature in Seattle right now?"
        }
    ],
    "stream": false,
    "sampling_params": {
        "strategy": "top_p",
        "temperature": 0.5,
        "max_tokens": 100
    },                   
    "tool_choice": "auto",
    "tool_prompt_format": "json",
    "tools": [                   
        {
            "tool_name": "getTemperature",
            "description": "Gets the current temperature of a location.",
            "parameters": {                                              
                "location": {
                    "param_type": "string",
                    "description": "The name of the place to get the temperature from in degress celsius.",
                    "required": true                                                                       
                }                   
            }    
        }    
    ]    
}' 
```

#### Non-Streaming Response
```
{
  "completion_message": {
    "role": "assistant",
    "content": "",
    "stop_reason": "end_of_message",
    "tool_calls": [
      {
        "call_id": "6f42fdcc-6cbb-46ad-a17b-5d20ac64b678",
        "tool_name": "getTemperature",
        "arguments": {
          "location": "Seattle"
        }
      }
    ]
  },
  "logprobs": null
}
```

#### Streaming Response
```
data: {"event":{"event_type":"start","delta":"","logprobs":null,"stop_reason":null}}
data: {"event":{"event_type":"progress","delta":{"content":"","parse_status":"started"},"logprobs":null,"stop_reason":null}}
data: {"event":{"event_type":"progress","delta":{"content":"{\"","parse_status":"in_progress"},"logprobs":null,"stop_reason":null}}
data: {"event":{"event_type":"progress","delta":{"content":"type","parse_status":"in_progress"},"logprobs":null,"stop_reason":null}}
data: {"event":{"event_type":"progress","delta":{"content":"\":","parse_status":"in_progress"},"logprobs":null,"stop_reason":null}}
data: {"event":{"event_type":"progress","delta":{"content":" \"","parse_status":"in_progress"},"logprobs":null,"stop_reason":null}}
data: {"event":{"event_type":"progress","delta":{"content":"function","parse_status":"in_progress"},"logprobs":null,"stop_reason":null}}
data: {"event":{"event_type":"progress","delta":{"content":"\",","parse_status":"in_progress"},"logprobs":null,"stop_reason":null}}
data: {"event":{"event_type":"progress","delta":{"content":" \"","parse_status":"in_progress"},"logprobs":null,"stop_reason":null}}
data: {"event":{"event_type":"progress","delta":{"content":"name","parse_status":"in_progress"},"logprobs":null,"stop_reason":null}}
data: {"event":{"event_type":"progress","delta":{"content":"\":","parse_status":"in_progress"},"logprobs":null,"stop_reason":null}}
data: {"event":{"event_type":"progress","delta":{"content":" \"","parse_status":"in_progress"},"logprobs":null,"stop_reason":null}}
data: {"event":{"event_type":"progress","delta":{"content":"get","parse_status":"in_progress"},"logprobs":null,"stop_reason":null}}
data: {"event":{"event_type":"progress","delta":{"content":"Temperature","parse_status":"in_progress"},"logprobs":null,"stop_reason":null}}
data: {"event":{"event_type":"progress","delta":{"content":"\",","parse_status":"in_progress"},"logprobs":null,"stop_reason":null}}
data: {"event":{"event_type":"progress","delta":{"content":" \"","parse_status":"in_progress"},"logprobs":null,"stop_reason":null}}
data: {"event":{"event_type":"progress","delta":{"content":"parameters","parse_status":"in_progress"},"logprobs":null,"stop_reason":null}}
data: {"event":{"event_type":"progress","delta":{"content":"\":","parse_status":"in_progress"},"logprobs":null,"stop_reason":null}}
data: {"event":{"event_type":"progress","delta":{"content":" {\"","parse_status":"in_progress"},"logprobs":null,"stop_reason":null}}
data: {"event":{"event_type":"progress","delta":{"content":"location","parse_status":"in_progress"},"logprobs":null,"stop_reason":null}}
data: {"event":{"event_type":"progress","delta":{"content":"\":","parse_status":"in_progress"},"logprobs":null,"stop_reason":null}}
data: {"event":{"event_type":"progress","delta":{"content":" \"","parse_status":"in_progress"},"logprobs":null,"stop_reason":null}}
data: {"event":{"event_type":"progress","delta":{"content":"Seattle","parse_status":"in_progress"},"logprobs":null,"stop_reason":null}}
data: {"event":{"event_type":"progress","delta":{"content":"\"}}","parse_status":"in_progress"},"logprobs":null,"stop_reason":null}}
data: {"event":{"event_type":"progress","delta":{"content":{"call_id":"e742df1f-0ae9-40ad-a49e-18e5c905484f","tool_name":"getTemperature","arguments":{"location":"Seattle"}},"parse_status":"success"},"logprobs":null,"stop_reason":"end_of_message"}}
data: {"event":{"event_type":"complete","delta":"","logprobs":null,"stop_reason":"end_of_message"}}
```

### Completion
```
$ curl --location 'http://localhost:5000/alpha/inference/completion' --header 'Content-Type: application/json' --data '{
    "model_id": "meta-llama/Llama-3.1-8B-Instruct",
    "content": "1,2,3,",
    "stream": true,
    "sampling_params": {
        "strategy": "top_p",
        "temperature": 0.5,
        "max_tokens": 10
    },                   
    "tool_choice": "auto",
    "tool_prompt_format": "json",
    "tools": [                   
        {
            "tool_name": "getTemperature",
            "description": "Gets the current temperature of a location.",
            "parameters": {                                              
                "location": {
                    "param_type": "string",
                    "description": "The name of the place to get the temperature from in degress celsius.",
                    "required": true                                                                       
                }                   
            }    
        }    
    ]    
}'
```

#### Non-Streaming Response
```
{
  "content": "4,5,6,7,8,",
  "stop_reason": "out_of_tokens",
  "logprobs": null
}
```

#### Streaming Response
```
data: {"delta":"4","stop_reason":null,"logprobs":null}
data: {"delta":",","stop_reason":null,"logprobs":null}
data: {"delta":"5","stop_reason":null,"logprobs":null}
data: {"delta":",","stop_reason":null,"logprobs":null}
data: {"delta":"6","stop_reason":null,"logprobs":null}
data: {"delta":",","stop_reason":null,"logprobs":null}
data: {"delta":"7","stop_reason":null,"logprobs":null}
data: {"delta":",","stop_reason":null,"logprobs":null}
data: {"delta":"8","stop_reason":null,"logprobs":null}
data: {"delta":",","stop_reason":null,"logprobs":null}
data: {"delta":"","stop_reason":null,"logprobs":null}
data: {"delta":"","stop_reason":"out_of_tokens","logprobs":null}
```

### Pre-Commit Checks
```
trim trailing whitespace.................................................Passed
check python ast.........................................................Passed
check for merge conflicts................................................Passed
check for added large files..............................................Passed
fix end of files.........................................................Passed
Insert license in comments...............................................Passed
flake8...................................................................Passed
Format files with µfmt...................................................Passed
```

### Testing with `test_inference.py`
```
$ export CEREBRAS_API_KEY=<insert API key here>
$ pytest -v -s llama_stack/providers/tests/inference/test_text_inference.py -m "cerebras and llama_8b" 
/net/henryt-dev/srv/nfs/henryt-data/ws/llama-stack/.venv/lib/python3.12/site-packages/pytest_asyncio/plugin.py:208: PytestDeprecationWarning: The configuration option "asyncio_default_fixture_loop_scope" is unset.
The event loop scope for asynchronous fixtures will default to the fixture caching scope. Future versions of pytest-asyncio will default the loop scope for asynchronous fixtures to function scope. Set the default fixture loop scope explicitly in order to avoid unexpected behavior in the future. Valid fixture loop scopes are: "function", "class", "module", "package", "session"

  warnings.warn(PytestDeprecationWarning(_DEFAULT_FIXTURE_LOOP_SCOPE_UNSET))
=================================================== test session starts ===================================================
platform linux -- Python 3.12.3, pytest-8.3.3, pluggy-1.5.0 -- /net/henryt-dev/srv/nfs/henryt-data/ws/llama-stack/.venv/bin/python3.12
cachedir: .pytest_cache
rootdir: /net/henryt-dev/srv/nfs/henryt-data/ws/llama-stack
configfile: pyproject.toml
plugins: anyio-4.6.2.post1, asyncio-0.24.0
asyncio: mode=Mode.STRICT, default_loop_scope=None
collected 128 items / 120 deselected / 8 selected                                                                         

llama_stack/providers/tests/inference/test_text_inference.py::TestInference::test_model_list[llama_8b-cerebras] Resolved 4 providers
 inner-inference => cerebras
 models => __routing_table__
 inference => __autorouted__
 inspect => __builtin__

Models: meta-llama/Llama-3.1-8B-Instruct served by cerebras

PASSED
llama_stack/providers/tests/inference/test_text_inference.py::TestInference::test_completion[llama_8b-cerebras] PASSED
llama_stack/providers/tests/inference/test_text_inference.py::TestInference::test_completions_structured_output[llama_8b-cerebras] SKIPPED
llama_stack/providers/tests/inference/test_text_inference.py::TestInference::test_chat_completion_non_streaming[llama_8b-cerebras] PASSED
llama_stack/providers/tests/inference/test_text_inference.py::TestInference::test_structured_output[llama_8b-cerebras] SKIPPED
llama_stack/providers/tests/inference/test_text_inference.py::TestInference::test_chat_completion_streaming[llama_8b-cerebras] PASSED
llama_stack/providers/tests/inference/test_text_inference.py::TestInference::test_chat_completion_with_tool_calling[llama_8b-cerebras] PASSED
llama_stack/providers/tests/inference/test_text_inference.py::TestInference::test_chat_completion_with_tool_calling_streaming[llama_8b-cerebras] PASSED

================================ 6 passed, 2 skipped, 120 deselected, 6 warnings in 3.95s =================================
```

I ran `python llama_stack/scripts/distro_codegen.py` to run codegen.
2024-12-03 21:15:32 -08:00

63 lines
1.5 KiB
YAML

version: '2'
image_name: cerebras
docker_image: null
conda_env: cerebras
apis:
- agents
- inference
- memory
- safety
- telemetry
providers:
inference:
- provider_id: cerebras
provider_type: remote::cerebras
config:
base_url: https://api.cerebras.ai
api_key: ${env.CEREBRAS_API_KEY}
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config: {}
memory:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
kvstore:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/cerebras}/faiss_store.db
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/cerebras}/agents_store.db
telemetry:
- provider_id: meta-reference
provider_type: inline::meta-reference
config: {}
metadata_store:
namespace: null
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/cerebras}/registry.db
models:
- metadata: {}
model_id: meta-llama/Llama-3.1-8B-Instruct
provider_id: null
provider_model_id: llama3.1-8b
- metadata: {}
model_id: meta-llama/Llama-3.1-70B-Instruct
provider_id: null
provider_model_id: llama3.1-70b
shields:
- params: null
shield_id: meta-llama/Llama-Guard-3-8B
provider_id: null
provider_shield_id: null
memory_banks: []
datasets: []
scoring_fns: []
eval_tasks: []