From 03ac84a829c30b5d3ccc6c783cf917b8ac690e91 Mon Sep 17 00:00:00 2001
From: Ashwin Bharambe
Date: Thu, 16 Jan 2025 15:26:48 -0800
Subject: [PATCH] Update default port from 5000 -> 8321

---
 distributions/bedrock/compose.yaml | 2 +-
 distributions/cerebras/compose.yaml | 2 +-
 distributions/dell-tgi/compose.yaml | 2 +-
 distributions/fireworks/compose.yaml | 2 +-
 distributions/meta-reference-gpu/compose.yaml | 2 +-
 distributions/meta-reference-quantized-gpu/compose.yaml | 2 +-
 distributions/remote-nvidia/compose.yaml | 2 +-
 distributions/together/compose.yaml | 2 +-
 distributions/vllm-gpu/compose.yaml | 2 +-
 docs/source/building_applications/telemetry.md | 8 ++++----
 docs/source/distributions/building_distro.md | 4 ++--
 docs/source/distributions/ondevice_distro/ios_sdk.md | 2 +-
 docs/source/distributions/self_hosted_distro/dell-tgi.md | 4 ++--
 .../source/references/llama_stack_client_cli_reference.md | 4 ++--
 docs/zero_to_hero_guide/01_Local_Cloud_Inference101.ipynb | 6 +++---
 llama_stack/cli/stack/run.py | 4 ++--
 llama_stack/distribution/server/server.py | 2 +-
 llama_stack/distribution/ui/modules/api.py | 2 +-
 18 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/distributions/bedrock/compose.yaml b/distributions/bedrock/compose.yaml
index f988e33d1..055b92c67 100644
--- a/distributions/bedrock/compose.yaml
+++ b/distributions/bedrock/compose.yaml
@@ -5,7 +5,7 @@ services:
       - ~/.llama:/root/.llama
       - ./run.yaml:/root/llamastack-run-bedrock.yaml
     ports:
-      - "5000:5000"
+      - "8321:8321"
     entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-bedrock.yaml"
     deploy:
       restart_policy:
diff --git a/distributions/cerebras/compose.yaml b/distributions/cerebras/compose.yaml
index f2e9a6f42..8dc09a865 100644
--- a/distributions/cerebras/compose.yaml
+++ b/distributions/cerebras/compose.yaml
@@ -6,7 +6,7 @@ services:
       - ~/.llama:/root/.llama
       - ./run.yaml:/root/llamastack-run-cerebras.yaml
     ports:
-      - "5000:5000"
+      - "8321:8321"
     entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-cerebras.yaml"
     deploy:
       restart_policy:
diff --git a/distributions/dell-tgi/compose.yaml b/distributions/dell-tgi/compose.yaml
index 0e325aff5..d26636cbd 100644
--- a/distributions/dell-tgi/compose.yaml
+++ b/distributions/dell-tgi/compose.yaml
@@ -40,7 +40,7 @@ services:
       # Link to TGI run.yaml file
       - ./run.yaml:/root/my-run.yaml
     ports:
-      - "5000:5000"
+      - "8321:8321"
     # Hack: wait for TGI server to start before starting docker
     entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
     restart_policy:
diff --git a/distributions/fireworks/compose.yaml b/distributions/fireworks/compose.yaml
index 71137c040..4b53fcf00 100644
--- a/distributions/fireworks/compose.yaml
+++ b/distributions/fireworks/compose.yaml
@@ -6,7 +6,7 @@ services:
      - ~/.llama:/root/.llama
      - ./run.yaml:/root/llamastack-run-fireworks.yaml
    ports:
-      - "5000:5000"
+      - "8321:8321"
    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-fireworks.yaml"
    deploy:
      restart_policy:
diff --git a/distributions/meta-reference-gpu/compose.yaml b/distributions/meta-reference-gpu/compose.yaml
index 2b88c68fc..d977e92ea 100644
--- a/distributions/meta-reference-gpu/compose.yaml
+++ b/distributions/meta-reference-gpu/compose.yaml
@@ -6,7 +6,7 @@ services:
      - ~/.llama:/root/.llama
      - ./run.yaml:/root/my-run.yaml
    ports:
-      - "5000:5000"
+      - "8321:8321"
    devices:
      - nvidia.com/gpu=all
    environment:
diff --git a/distributions/meta-reference-quantized-gpu/compose.yaml b/distributions/meta-reference-quantized-gpu/compose.yaml
index f9fe9f45d..98e943dce 100644
--- a/distributions/meta-reference-quantized-gpu/compose.yaml
+++ b/distributions/meta-reference-quantized-gpu/compose.yaml
@@ -6,7 +6,7 @@ services:
      - ~/.llama:/root/.llama
      - ./run.yaml:/root/my-run.yaml
    ports:
-      - "5000:5000"
+      - "8321:8321"
    devices:
      - nvidia.com/gpu=all
    environment:
diff --git a/distributions/remote-nvidia/compose.yaml b/distributions/remote-nvidia/compose.yaml
index 04b12d0da..ab8b4ce25 100644
--- a/distributions/remote-nvidia/compose.yaml
+++ b/distributions/remote-nvidia/compose.yaml
@@ -6,7 +6,7 @@ services:
      - ~/.llama:/root/.llama
      - ./run.yaml:/root/llamastack-run-nvidia.yaml
    ports:
-      - "5000:5000"
+      - "8321:8321"
    environment:
      - INFERENCE_MODEL=${INFERENCE_MODEL:-Llama3.1-8B-Instruct}
      - NVIDIA_API_KEY=${NVIDIA_API_KEY:-}
diff --git a/distributions/together/compose.yaml b/distributions/together/compose.yaml
index 8d938990e..c7251d0a7 100644
--- a/distributions/together/compose.yaml
+++ b/distributions/together/compose.yaml
@@ -6,7 +6,7 @@ services:
      - ~/.llama:/root/.llama
      - ./run.yaml:/root/llamastack-run-together.yaml
    ports:
-      - "5000:5000"
+      - "8321:8321"
    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-together.yaml"
    deploy:
      restart_policy:
diff --git a/distributions/vllm-gpu/compose.yaml b/distributions/vllm-gpu/compose.yaml
index f8779c9ce..98267cdc3 100644
--- a/distributions/vllm-gpu/compose.yaml
+++ b/distributions/vllm-gpu/compose.yaml
@@ -6,7 +6,7 @@ services:
      - ~/.llama:/root/.llama
      - ./run.yaml:/root/my-run.yaml
    ports:
-      - "5000:5000"
+      - "8321:8321"
    devices:
      - nvidia.com/gpu=all
    environment:
diff --git a/docs/source/building_applications/telemetry.md b/docs/source/building_applications/telemetry.md
index 6c8067035..70c54ac98 100644
--- a/docs/source/building_applications/telemetry.md
+++ b/docs/source/building_applications/telemetry.md
@@ -139,7 +139,7 @@ Querying Traces for a agent session
 The client SDK is not updated to support the new telemetry API. It will be updated soon. You can manually query traces using the following curl command:

 ``` bash
- curl -X POST 'http://localhost:5000/alpha/telemetry/query-traces' \
+ curl -X POST 'http://localhost:8321/alpha/telemetry/query-traces' \
  -H 'Content-Type: application/json' \
  -d '{
     "attribute_filters": [
@@ -167,7 +167,7 @@ The client SDK is not updated to support the new telemetry API. It will be updat
 Querying spans for a specifc root span id

 ``` bash
-curl -X POST 'http://localhost:5000/alpha/telemetry/get-span-tree' \
+curl -X POST 'http://localhost:8321/alpha/telemetry/get-span-tree' \
 -H 'Content-Type: application/json' \
 -d '{ "span_id" : "6cceb4b48a156913", "max_depth": 2 }'
@@ -207,7 +207,7 @@ curl -X POST 'http://localhost:5000/alpha/telemetry/get-span-tree' \
 ## Example: Save Spans to Dataset
 Save all spans for a specific agent session to a dataset.
 ``` bash
-curl -X POST 'http://localhost:5000/alpha/telemetry/save-spans-to-dataset' \
+curl -X POST 'http://localhost:8321/alpha/telemetry/save-spans-to-dataset' \
 -H 'Content-Type: application/json' \
 -d '{
    "attribute_filters": [
@@ -225,7 +225,7 @@ curl -X POST 'http://localhost:5000/alpha/telemetry/save-spans-to-dataset' \
 Save all spans for a specific agent turn to a dataset.

 ```bash
-curl -X POST 'http://localhost:5000/alpha/telemetry/save-spans-to-dataset' \
+curl -X POST 'http://localhost:8321/alpha/telemetry/save-spans-to-dataset' \
 -H 'Content-Type: application/json' \
 -d '{
    "attribute_filters": [
diff --git a/docs/source/distributions/building_distro.md b/docs/source/distributions/building_distro.md
index cc94fa9db..aaf2462f7 100644
--- a/docs/source/distributions/building_distro.md
+++ b/docs/source/distributions/building_distro.md
@@ -402,11 +402,11 @@ Serving API agents
 POST /agents/step/get
 POST /agents/turn/get

-Listening on ['::', '0.0.0.0']:5000
+Listening on ['::', '0.0.0.0']:8321
 INFO: Started server process [2935911]
 INFO: Waiting for application startup.
 INFO: Application startup complete.
-INFO: Uvicorn running on http://['::', '0.0.0.0']:5000 (Press CTRL+C to quit)
+INFO: Uvicorn running on http://['::', '0.0.0.0']:8321 (Press CTRL+C to quit)
 INFO: 2401:db00:35c:2d2b:face:0:c9:0:54678 - "GET /models/list HTTP/1.1" 200 OK
 ```
diff --git a/docs/source/distributions/ondevice_distro/ios_sdk.md b/docs/source/distributions/ondevice_distro/ios_sdk.md
index 0c3cf09af..c9d3a89b5 100644
--- a/docs/source/distributions/ondevice_distro/ios_sdk.md
+++ b/docs/source/distributions/ondevice_distro/ios_sdk.md
@@ -27,7 +27,7 @@ If you don't want to run inference on-device, then you can connect to any hosted
 ```swift
 import LlamaStackClient

-let agents = RemoteAgents(url: URL(string: "http://localhost:5000")!)
+let agents = RemoteAgents(url: URL(string: "http://localhost:8321")!)
 let request = Components.Schemas.CreateAgentTurnRequest(
   agent_id: agentId,
   messages: [
diff --git a/docs/source/distributions/self_hosted_distro/dell-tgi.md b/docs/source/distributions/self_hosted_distro/dell-tgi.md
index 705bf2fa7..cf0c02983 100644
--- a/docs/source/distributions/self_hosted_distro/dell-tgi.md
+++ b/docs/source/distributions/self_hosted_distro/dell-tgi.md
@@ -41,7 +41,7 @@ The script will first start up TGI server, then start up Llama Stack distributio
 INFO: Started server process [1]
 INFO: Waiting for application startup.
 INFO: Application startup complete.
-INFO: Uvicorn running on http://[::]:5000 (Press CTRL+C to quit)
+INFO: Uvicorn running on http://[::]:8321 (Press CTRL+C to quit)
 ```

 To kill the server
@@ -65,7 +65,7 @@ registry.dell.huggingface.co/enterprise-dell-inference-meta-llama-meta-llama-3.1
 #### Start Llama Stack server pointing to TGI server

 ```
-docker run --network host -it -p 5000:5000 -v ./run.yaml:/root/my-run.yaml --gpus=all llamastack/distribution-tgi --yaml_config /root/my-run.yaml
+docker run --network host -it -p 8321:8321 -v ./run.yaml:/root/my-run.yaml --gpus=all llamastack/distribution-tgi --yaml_config /root/my-run.yaml
 ```
 Make sure in you `run.yaml` file, you inference provider is pointing to the correct TGI server endpoint. E.g.
diff --git a/docs/source/references/llama_stack_client_cli_reference.md b/docs/source/references/llama_stack_client_cli_reference.md
index c3abccfd9..bc5f3e5e6 100644
--- a/docs/source/references/llama_stack_client_cli_reference.md
+++ b/docs/source/references/llama_stack_client_cli_reference.md
@@ -23,8 +23,8 @@ subcommands:
 ```bash
 $ llama-stack-client configure
 > Enter the host name of the Llama Stack distribution server: localhost
-> Enter the port number of the Llama Stack distribution server: 5000
-Done! You can now use the Llama Stack Client CLI with endpoint http://localhost:5000
+> Enter the port number of the Llama Stack distribution server: 8321
+Done! You can now use the Llama Stack Client CLI with endpoint http://localhost:8321
 ```

 ### `llama-stack-client providers list`
diff --git a/docs/zero_to_hero_guide/01_Local_Cloud_Inference101.ipynb b/docs/zero_to_hero_guide/01_Local_Cloud_Inference101.ipynb
index bdfd3520f..39644ee51 100644
--- a/docs/zero_to_hero_guide/01_Local_Cloud_Inference101.ipynb
+++ b/docs/zero_to_hero_guide/01_Local_Cloud_Inference101.ipynb
@@ -32,8 +32,8 @@
    "outputs": [],
    "source": [
     "HOST = \"localhost\" # Replace with your host\n",
-    "LOCAL_PORT = 5000 # Replace with your local distro port\n",
-    "CLOUD_PORT = 5001 # Replace with your cloud distro port"
+    "LOCAL_PORT = 8321 # Replace with your local distro port\n",
+    "CLOUD_PORT = 8322 # Replace with your cloud distro port"
    ]
   },
   {
@@ -43,7 +43,7 @@
    "source": [
     "#### 2. Set Up Local and Cloud Clients\n",
     "\n",
-    "Initialize both clients, specifying the `base_url` for each instance. In this case, we have the local distribution running on `http://localhost:5000` and the cloud distribution running on `http://localhost:5001`.\n"
+    "Initialize both clients, specifying the `base_url` for each instance. In this case, we have the local distribution running on `http://localhost:8321` and the cloud distribution running on `http://localhost:5001`.\n"
    ]
   },
   {
diff --git a/llama_stack/cli/stack/run.py b/llama_stack/cli/stack/run.py
index 7942f603a..9fa82bd61 100644
--- a/llama_stack/cli/stack/run.py
+++ b/llama_stack/cli/stack/run.py
@@ -34,8 +34,8 @@ class StackRun(Subcommand):
         self.parser.add_argument(
             "--port",
             type=int,
-            help="Port to run the server on. Defaults to 5000",
-            default=int(os.getenv("LLAMA_STACK_PORT", 5000)),
+            help="Port to run the server on. Defaults to 8321",
+            default=int(os.getenv("LLAMA_STACK_PORT", 8321)),
         )
         self.parser.add_argument(
             "--image-name",
diff --git a/llama_stack/distribution/server/server.py b/llama_stack/distribution/server/server.py
index 2d216d314..6a0047f69 100644
--- a/llama_stack/distribution/server/server.py
+++ b/llama_stack/distribution/server/server.py
@@ -293,7 +293,7 @@ def main():
     parser.add_argument(
         "--port",
         type=int,
-        default=int(os.getenv("LLAMA_STACK_PORT", 5000)),
+        default=int(os.getenv("LLAMA_STACK_PORT", 8321)),
         help="Port to listen on",
     )
     parser.add_argument(
diff --git a/llama_stack/distribution/ui/modules/api.py b/llama_stack/distribution/ui/modules/api.py
index d3852caee..70c7a0898 100644
--- a/llama_stack/distribution/ui/modules/api.py
+++ b/llama_stack/distribution/ui/modules/api.py
@@ -14,7 +14,7 @@ from llama_stack_client import LlamaStackClient
 class LlamaStackApi:
     def __init__(self):
         self.client = LlamaStackClient(
-            base_url=os.environ.get("LLAMA_STACK_ENDPOINT", "http://localhost:5000"),
+            base_url=os.environ.get("LLAMA_STACK_ENDPOINT", "http://localhost:8321"),
             provider_data={
                 "fireworks_api_key": os.environ.get("FIREWORKS_API_KEY", ""),
                 "together_api_key": os.environ.get("TOGETHER_API_KEY", ""),
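For anyone validating this change locally, here is a small illustrative sketch (not part of the patch). It resolves the endpoint with the same precedence `llama_stack/distribution/ui/modules/api.py` now uses (an explicit `LLAMA_STACK_ENDPOINT`, otherwise the new 8321 default) and checks that something is listening there. Only the `LLAMA_STACK_ENDPOINT`/`LLAMA_STACK_PORT` environment variables, the `llama stack run --port` flag, and the 8321 default come from this patch; the TCP probe is an assumed helper, not a Llama Stack API.

```python
# Minimal sketch, assuming a Llama Stack server is meant to be running locally.
# Resolve the endpoint the way the UI module does, then probe it over TCP.
import os
import socket
from urllib.parse import urlparse

# Explicit LLAMA_STACK_ENDPOINT wins; otherwise fall back to the new default port.
endpoint = os.environ.get("LLAMA_STACK_ENDPOINT", "http://localhost:8321")
parsed = urlparse(endpoint)
host, port = parsed.hostname or "localhost", parsed.port or 8321

try:
    # Open (and immediately close) a TCP connection to see if a server is listening.
    with socket.create_connection((host, port), timeout=2):
        print(f"Llama Stack appears to be listening at {endpoint}")
except OSError:
    print(
        f"Nothing is listening on {host}:{port}. Start the server with "
        "`llama stack run --port 8321 <config>` or point LLAMA_STACK_ENDPOINT "
        "at your deployment."
    )
```

Scripts or compose overrides still hard-coded to the old 5000 default will fail this check until they are updated to 8321 or export the matching environment variables.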