Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-06-27 18:50:41 +00:00)

commit 03ac84a829 (parent f1faa9c924)

Update default port from 5000 -> 8321

18 changed files with 27 additions and 27 deletions
@@ -5,7 +5,7 @@ services:
       - ~/.llama:/root/.llama
       - ./run.yaml:/root/llamastack-run-bedrock.yaml
     ports:
-      - "5000:5000"
+      - "8321:8321"
     entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-bedrock.yaml"
     deploy:
       restart_policy:
@@ -6,7 +6,7 @@ services:
       - ~/.llama:/root/.llama
       - ./run.yaml:/root/llamastack-run-cerebras.yaml
     ports:
-      - "5000:5000"
+      - "8321:8321"
     entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-cerebras.yaml"
     deploy:
       restart_policy:
@@ -40,7 +40,7 @@ services:
       # Link to TGI run.yaml file
       - ./run.yaml:/root/my-run.yaml
     ports:
-      - "5000:5000"
+      - "8321:8321"
     # Hack: wait for TGI server to start before starting docker
     entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
     restart_policy:
@@ -6,7 +6,7 @@ services:
       - ~/.llama:/root/.llama
       - ./run.yaml:/root/llamastack-run-fireworks.yaml
     ports:
-      - "5000:5000"
+      - "8321:8321"
     entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-fireworks.yaml"
     deploy:
       restart_policy:
@@ -6,7 +6,7 @@ services:
       - ~/.llama:/root/.llama
       - ./run.yaml:/root/my-run.yaml
     ports:
-      - "5000:5000"
+      - "8321:8321"
     devices:
       - nvidia.com/gpu=all
     environment:
@@ -6,7 +6,7 @@ services:
       - ~/.llama:/root/.llama
       - ./run.yaml:/root/my-run.yaml
     ports:
-      - "5000:5000"
+      - "8321:8321"
     devices:
       - nvidia.com/gpu=all
     environment:
@@ -6,7 +6,7 @@ services:
       - ~/.llama:/root/.llama
       - ./run.yaml:/root/llamastack-run-nvidia.yaml
     ports:
-      - "5000:5000"
+      - "8321:8321"
     environment:
       - INFERENCE_MODEL=${INFERENCE_MODEL:-Llama3.1-8B-Instruct}
      - NVIDIA_API_KEY=${NVIDIA_API_KEY:-}
@@ -6,7 +6,7 @@ services:
       - ~/.llama:/root/.llama
       - ./run.yaml:/root/llamastack-run-together.yaml
     ports:
-      - "5000:5000"
+      - "8321:8321"
     entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-together.yaml"
     deploy:
       restart_policy:
@@ -6,7 +6,7 @@ services:
       - ~/.llama:/root/.llama
       - ./run.yaml:/root/my-run.yaml
     ports:
-      - "5000:5000"
+      - "8321:8321"
     devices:
       - nvidia.com/gpu=all
     environment:
@@ -139,7 +139,7 @@ Querying Traces for a agent session
 The client SDK is not updated to support the new telemetry API. It will be updated soon. You can manually query traces using the following curl command:
 
 ``` bash
-curl -X POST 'http://localhost:5000/alpha/telemetry/query-traces' \
+curl -X POST 'http://localhost:8321/alpha/telemetry/query-traces' \
 -H 'Content-Type: application/json' \
 -d '{
 "attribute_filters": [
@@ -167,7 +167,7 @@ The client SDK is not updated to support the new telemetry API. It will be updat
 Querying spans for a specifc root span id
 
 ``` bash
-curl -X POST 'http://localhost:5000/alpha/telemetry/get-span-tree' \
+curl -X POST 'http://localhost:8321/alpha/telemetry/get-span-tree' \
 -H 'Content-Type: application/json' \
 -d '{ "span_id" : "6cceb4b48a156913", "max_depth": 2 }'
 
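(The same get-span-tree query can be issued without curl; below is a minimal Python sketch, assuming the `requests` package is installed and a server is listening on the new default port. The span id is just the example value from the hunk above.)

```python
# Sketch: POST the get-span-tree query shown above from Python.
# Assumes `pip install requests` and a Llama Stack server on localhost:8321.
import requests

resp = requests.post(
    "http://localhost:8321/alpha/telemetry/get-span-tree",
    headers={"Content-Type": "application/json"},
    json={"span_id": "6cceb4b48a156913", "max_depth": 2},  # example span id from the doc
)
resp.raise_for_status()
print(resp.json())
```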
@@ -207,7 +207,7 @@ curl -X POST 'http://localhost:5000/alpha/telemetry/get-span-tree' \
 ## Example: Save Spans to Dataset
 Save all spans for a specific agent session to a dataset.
 ``` bash
-curl -X POST 'http://localhost:5000/alpha/telemetry/save-spans-to-dataset' \
+curl -X POST 'http://localhost:8321/alpha/telemetry/save-spans-to-dataset' \
 -H 'Content-Type: application/json' \
 -d '{
 "attribute_filters": [
@@ -225,7 +225,7 @@ curl -X POST 'http://localhost:5000/alpha/telemetry/save-spans-to-dataset' \
 
 Save all spans for a specific agent turn to a dataset.
 ```bash
-curl -X POST 'http://localhost:5000/alpha/telemetry/save-spans-to-dataset' \
+curl -X POST 'http://localhost:8321/alpha/telemetry/save-spans-to-dataset' \
 -H 'Content-Type: application/json' \
 -d '{
 "attribute_filters": [
@@ -402,11 +402,11 @@ Serving API agents
 POST /agents/step/get
 POST /agents/turn/get
 
-Listening on ['::', '0.0.0.0']:5000
+Listening on ['::', '0.0.0.0']:8321
 INFO: Started server process [2935911]
 INFO: Waiting for application startup.
 INFO: Application startup complete.
-INFO: Uvicorn running on http://['::', '0.0.0.0']:5000 (Press CTRL+C to quit)
+INFO: Uvicorn running on http://['::', '0.0.0.0']:8321 (Press CTRL+C to quit)
 INFO: 2401:db00:35c:2d2b:face:0:c9:0:54678 - "GET /models/list HTTP/1.1" 200 OK
 ```
 
@@ -27,7 +27,7 @@ If you don't want to run inference on-device, then you can connect to any hosted
 ```swift
 import LlamaStackClient
 
-let agents = RemoteAgents(url: URL(string: "http://localhost:5000")!)
+let agents = RemoteAgents(url: URL(string: "http://localhost:8321")!)
 let request = Components.Schemas.CreateAgentTurnRequest(
   agent_id: agentId,
   messages: [
@@ -41,7 +41,7 @@ The script will first start up TGI server, then start up Llama Stack distributio
 INFO: Started server process [1]
 INFO: Waiting for application startup.
 INFO: Application startup complete.
-INFO: Uvicorn running on http://[::]:5000 (Press CTRL+C to quit)
+INFO: Uvicorn running on http://[::]:8321 (Press CTRL+C to quit)
 ```
 
 To kill the server
@@ -65,7 +65,7 @@ registry.dell.huggingface.co/enterprise-dell-inference-meta-llama-meta-llama-3.1
 #### Start Llama Stack server pointing to TGI server
 
 ```
-docker run --network host -it -p 5000:5000 -v ./run.yaml:/root/my-run.yaml --gpus=all llamastack/distribution-tgi --yaml_config /root/my-run.yaml
+docker run --network host -it -p 8321:8321 -v ./run.yaml:/root/my-run.yaml --gpus=all llamastack/distribution-tgi --yaml_config /root/my-run.yaml
 ```
 
 Make sure in you `run.yaml` file, you inference provider is pointing to the correct TGI server endpoint. E.g.
@@ -23,8 +23,8 @@ subcommands:
 ```bash
 $ llama-stack-client configure
 > Enter the host name of the Llama Stack distribution server: localhost
-> Enter the port number of the Llama Stack distribution server: 5000
-Done! You can now use the Llama Stack Client CLI with endpoint http://localhost:5000
+> Enter the port number of the Llama Stack distribution server: 8321
+Done! You can now use the Llama Stack Client CLI with endpoint http://localhost:8321
 ```
 
 ### `llama-stack-client providers list`
@@ -32,8 +32,8 @@
    "outputs": [],
    "source": [
     "HOST = \"localhost\" # Replace with your host\n",
-    "LOCAL_PORT = 5000 # Replace with your local distro port\n",
-    "CLOUD_PORT = 5001 # Replace with your cloud distro port"
+    "LOCAL_PORT = 8321 # Replace with your local distro port\n",
+    "CLOUD_PORT = 8322 # Replace with your cloud distro port"
    ]
   },
   {
@@ -43,7 +43,7 @@
    "source": [
     "#### 2. Set Up Local and Cloud Clients\n",
     "\n",
-    "Initialize both clients, specifying the `base_url` for each instance. In this case, we have the local distribution running on `http://localhost:5000` and the cloud distribution running on `http://localhost:5001`.\n"
+    "Initialize both clients, specifying the `base_url` for each instance. In this case, we have the local distribution running on `http://localhost:8321` and the cloud distribution running on `http://localhost:5001`.\n"
    ]
   },
   {
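(For readers following that notebook, here is a minimal sketch of the client setup the cells above describe. It assumes `llama-stack-client` is installed and both distributions are already running; the host and port values mirror the changed cells, and the exact notebook code is not reproduced here.)

```python
# Sketch of the two-client setup from the notebook cells above.
# Assumes: pip install llama-stack-client; local and cloud distros already running.
from llama_stack_client import LlamaStackClient

HOST = "localhost"   # replace with your host
LOCAL_PORT = 8321    # new default local distro port per this commit
CLOUD_PORT = 8322    # cloud distro port per the earlier notebook cell

local_client = LlamaStackClient(base_url=f"http://{HOST}:{LOCAL_PORT}")
cloud_client = LlamaStackClient(base_url=f"http://{HOST}:{CLOUD_PORT}")
```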
@@ -34,8 +34,8 @@ class StackRun(Subcommand):
         self.parser.add_argument(
             "--port",
             type=int,
-            help="Port to run the server on. Defaults to 5000",
-            default=int(os.getenv("LLAMA_STACK_PORT", 5000)),
+            help="Port to run the server on. Defaults to 8321",
+            default=int(os.getenv("LLAMA_STACK_PORT", 8321)),
         )
         self.parser.add_argument(
             "--image-name",
@@ -293,7 +293,7 @@ def main():
     parser.add_argument(
         "--port",
         type=int,
-        default=int(os.getenv("LLAMA_STACK_PORT", 5000)),
+        default=int(os.getenv("LLAMA_STACK_PORT", 8321)),
         help="Port to listen on",
     )
     parser.add_argument(
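(A small self-contained sketch of the default-resolution pattern shown in the hunk above, standard library only: an explicit `--port` wins, otherwise `LLAMA_STACK_PORT`, otherwise 8321. The surrounding server code is not reproduced here.)

```python
# Sketch: how the new default is resolved in the argparse pattern above.
# Precedence: explicit --port > LLAMA_STACK_PORT env var > 8321.
import argparse
import os

parser = argparse.ArgumentParser()
parser.add_argument(
    "--port",
    type=int,
    default=int(os.getenv("LLAMA_STACK_PORT", 8321)),
    help="Port to listen on",
)

print(parser.parse_args([]).port)                   # 8321, unless LLAMA_STACK_PORT is set
print(parser.parse_args(["--port", "5000"]).port)   # 5000: the flag wins over env and default
```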
@@ -14,7 +14,7 @@ from llama_stack_client import LlamaStackClient
 class LlamaStackApi:
     def __init__(self):
         self.client = LlamaStackClient(
-            base_url=os.environ.get("LLAMA_STACK_ENDPOINT", "http://localhost:5000"),
+            base_url=os.environ.get("LLAMA_STACK_ENDPOINT", "http://localhost:8321"),
             provider_data={
                 "fireworks_api_key": os.environ.get("FIREWORKS_API_KEY", ""),
                 "together_api_key": os.environ.get("TOGETHER_API_KEY", ""),
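(A quick usage sketch against the new default endpoint, assuming `llama-stack-client` is installed and a server is running; `models.list()` is assumed to correspond to the `GET /models/list` route seen in the server log hunk earlier.)

```python
# Sketch: talk to a stack on the new default port, or override via LLAMA_STACK_ENDPOINT.
# Assumes: pip install llama-stack-client; a running Llama Stack server.
import os

from llama_stack_client import LlamaStackClient

client = LlamaStackClient(
    base_url=os.environ.get("LLAMA_STACK_ENDPOINT", "http://localhost:8321"),
)
print(client.models.list())  # assumed to hit the /models/list route shown in the log above
```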