From 03ac84a829c30b5d3ccc6c783cf917b8ac690e91 Mon Sep 17 00:00:00 2001
From: Ashwin Bharambe
Date: Thu, 16 Jan 2025 15:26:48 -0800
Subject: [PATCH] Update default port from 5000 -> 8321

---
 distributions/bedrock/compose.yaml | 2 +-
 distributions/cerebras/compose.yaml | 2 +-
 distributions/dell-tgi/compose.yaml | 2 +-
 distributions/fireworks/compose.yaml | 2 +-
 distributions/meta-reference-gpu/compose.yaml | 2 +-
 distributions/meta-reference-quantized-gpu/compose.yaml | 2 +-
 distributions/remote-nvidia/compose.yaml | 2 +-
 distributions/together/compose.yaml | 2 +-
 distributions/vllm-gpu/compose.yaml | 2 +-
 docs/source/building_applications/telemetry.md | 8 ++++----
 docs/source/distributions/building_distro.md | 4 ++--
 docs/source/distributions/ondevice_distro/ios_sdk.md | 2 +-
 docs/source/distributions/self_hosted_distro/dell-tgi.md | 4 ++--
 .../source/references/llama_stack_client_cli_reference.md | 4 ++--
 docs/zero_to_hero_guide/01_Local_Cloud_Inference101.ipynb | 6 +++---
 llama_stack/cli/stack/run.py | 4 ++--
 llama_stack/distribution/server/server.py | 2 +-
 llama_stack/distribution/ui/modules/api.py | 2 +-
 18 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/distributions/bedrock/compose.yaml b/distributions/bedrock/compose.yaml
index f988e33d1..055b92c67 100644
--- a/distributions/bedrock/compose.yaml
+++ b/distributions/bedrock/compose.yaml
@@ -5,7 +5,7 @@ services:
       - ~/.llama:/root/.llama
       - ./run.yaml:/root/llamastack-run-bedrock.yaml
     ports:
-      - "5000:5000"
+      - "8321:8321"
     entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-bedrock.yaml"
     deploy:
       restart_policy:
diff --git a/distributions/cerebras/compose.yaml b/distributions/cerebras/compose.yaml
index f2e9a6f42..8dc09a865 100644
--- a/distributions/cerebras/compose.yaml
+++ b/distributions/cerebras/compose.yaml
@@ -6,7 +6,7 @@ services:
       - ~/.llama:/root/.llama
       - ./run.yaml:/root/llamastack-run-cerebras.yaml
     ports:
-      - "5000:5000"
+      - "8321:8321"
     entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-cerebras.yaml"
     deploy:
       restart_policy:
diff --git a/distributions/dell-tgi/compose.yaml b/distributions/dell-tgi/compose.yaml
index 0e325aff5..d26636cbd 100644
--- a/distributions/dell-tgi/compose.yaml
+++ b/distributions/dell-tgi/compose.yaml
@@ -40,7 +40,7 @@ services:
       # Link to TGI run.yaml file
       - ./run.yaml:/root/my-run.yaml
     ports:
-      - "5000:5000"
+      - "8321:8321"
     # Hack: wait for TGI server to start before starting docker
     entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
     restart_policy:
diff --git a/distributions/fireworks/compose.yaml b/distributions/fireworks/compose.yaml
index 71137c040..4b53fcf00 100644
--- a/distributions/fireworks/compose.yaml
+++ b/distributions/fireworks/compose.yaml
@@ -6,7 +6,7 @@ services:
      - ~/.llama:/root/.llama
      - ./run.yaml:/root/llamastack-run-fireworks.yaml
    ports:
-      - "5000:5000"
+      - "8321:8321"
    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-fireworks.yaml"
    deploy:
      restart_policy:
diff --git a/distributions/meta-reference-gpu/compose.yaml b/distributions/meta-reference-gpu/compose.yaml
index 2b88c68fc..d977e92ea 100644
--- a/distributions/meta-reference-gpu/compose.yaml
+++ b/distributions/meta-reference-gpu/compose.yaml
@@ -6,7 +6,7 @@ services:
      - ~/.llama:/root/.llama
      - ./run.yaml:/root/my-run.yaml
    ports:
-      - "5000:5000"
+      - "8321:8321"
    devices:
      - nvidia.com/gpu=all
    environment:
diff --git a/distributions/meta-reference-quantized-gpu/compose.yaml b/distributions/meta-reference-quantized-gpu/compose.yaml
index f9fe9f45d..98e943dce 100644
--- a/distributions/meta-reference-quantized-gpu/compose.yaml
+++ b/distributions/meta-reference-quantized-gpu/compose.yaml
@@ -6,7 +6,7 @@ services:
      - ~/.llama:/root/.llama
      - ./run.yaml:/root/my-run.yaml
    ports:
-      - "5000:5000"
+      - "8321:8321"
    devices:
      - nvidia.com/gpu=all
    environment:
diff --git a/distributions/remote-nvidia/compose.yaml b/distributions/remote-nvidia/compose.yaml
index 04b12d0da..ab8b4ce25 100644
--- a/distributions/remote-nvidia/compose.yaml
+++ b/distributions/remote-nvidia/compose.yaml
@@ -6,7 +6,7 @@ services:
      - ~/.llama:/root/.llama
      - ./run.yaml:/root/llamastack-run-nvidia.yaml
    ports:
-      - "5000:5000"
+      - "8321:8321"
    environment:
      - INFERENCE_MODEL=${INFERENCE_MODEL:-Llama3.1-8B-Instruct}
      - NVIDIA_API_KEY=${NVIDIA_API_KEY:-}
diff --git a/distributions/together/compose.yaml b/distributions/together/compose.yaml
index 8d938990e..c7251d0a7 100644
--- a/distributions/together/compose.yaml
+++ b/distributions/together/compose.yaml
@@ -6,7 +6,7 @@ services:
      - ~/.llama:/root/.llama
      - ./run.yaml:/root/llamastack-run-together.yaml
    ports:
-      - "5000:5000"
+      - "8321:8321"
    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-together.yaml"
    deploy:
      restart_policy:
diff --git a/distributions/vllm-gpu/compose.yaml b/distributions/vllm-gpu/compose.yaml
index f8779c9ce..98267cdc3 100644
--- a/distributions/vllm-gpu/compose.yaml
+++ b/distributions/vllm-gpu/compose.yaml
@@ -6,7 +6,7 @@ services:
      - ~/.llama:/root/.llama
      - ./run.yaml:/root/my-run.yaml
    ports:
-      - "5000:5000"
+      - "8321:8321"
    devices:
      - nvidia.com/gpu=all
    environment:
diff --git a/docs/source/building_applications/telemetry.md b/docs/source/building_applications/telemetry.md
index 6c8067035..70c54ac98 100644
--- a/docs/source/building_applications/telemetry.md
+++ b/docs/source/building_applications/telemetry.md
@@ -139,7 +139,7 @@ Querying Traces for a agent session
 The client SDK is not updated to support the new telemetry API. It will be updated soon. You can manually query traces using the following curl command:

 ``` bash
- curl -X POST 'http://localhost:5000/alpha/telemetry/query-traces' \
+ curl -X POST 'http://localhost:8321/alpha/telemetry/query-traces' \
  -H 'Content-Type: application/json' \
  -d '{
     "attribute_filters": [
@@ -167,7 +167,7 @@ The client SDK is not updated to support the new telemetry API. It will be updat
 Querying spans for a specifc root span id

 ``` bash
-curl -X POST 'http://localhost:5000/alpha/telemetry/get-span-tree' \
+curl -X POST 'http://localhost:8321/alpha/telemetry/get-span-tree' \
 -H 'Content-Type: application/json' \
 -d '{ "span_id" : "6cceb4b48a156913", "max_depth": 2 }'
@@ -207,7 +207,7 @@ curl -X POST 'http://localhost:5000/alpha/telemetry/get-span-tree' \
 ## Example: Save Spans to Dataset
 Save all spans for a specific agent session to a dataset.
 ``` bash
-curl -X POST 'http://localhost:5000/alpha/telemetry/save-spans-to-dataset' \
+curl -X POST 'http://localhost:8321/alpha/telemetry/save-spans-to-dataset' \
 -H 'Content-Type: application/json' \
 -d '{
    "attribute_filters": [
@@ -225,7 +225,7 @@ curl -X POST 'http://localhost:5000/alpha/telemetry/save-spans-to-dataset' \
 Save all spans for a specific agent turn to a dataset.

 ```bash
-curl -X POST 'http://localhost:5000/alpha/telemetry/save-spans-to-dataset' \
+curl -X POST 'http://localhost:8321/alpha/telemetry/save-spans-to-dataset' \
 -H 'Content-Type: application/json' \
 -d '{
    "attribute_filters": [
diff --git a/docs/source/distributions/building_distro.md b/docs/source/distributions/building_distro.md
index cc94fa9db..aaf2462f7 100644
--- a/docs/source/distributions/building_distro.md
+++ b/docs/source/distributions/building_distro.md
@@ -402,11 +402,11 @@ Serving API agents
 POST /agents/step/get
 POST /agents/turn/get

-Listening on ['::', '0.0.0.0']:5000
+Listening on ['::', '0.0.0.0']:8321
 INFO: Started server process [2935911]
 INFO: Waiting for application startup.
 INFO: Application startup complete.
-INFO: Uvicorn running on http://['::', '0.0.0.0']:5000 (Press CTRL+C to quit)
+INFO: Uvicorn running on http://['::', '0.0.0.0']:8321 (Press CTRL+C to quit)
 INFO: 2401:db00:35c:2d2b:face:0:c9:0:54678 - "GET /models/list HTTP/1.1" 200 OK
 ```
diff --git a/docs/source/distributions/ondevice_distro/ios_sdk.md b/docs/source/distributions/ondevice_distro/ios_sdk.md
index 0c3cf09af..c9d3a89b5 100644
--- a/docs/source/distributions/ondevice_distro/ios_sdk.md
+++ b/docs/source/distributions/ondevice_distro/ios_sdk.md
@@ -27,7 +27,7 @@ If you don't want to run inference on-device, then you can connect to any hosted
 ```swift
 import LlamaStackClient

-let agents = RemoteAgents(url: URL(string: "http://localhost:5000")!)
+let agents = RemoteAgents(url: URL(string: "http://localhost:8321")!)
 let request = Components.Schemas.CreateAgentTurnRequest(
   agent_id: agentId,
   messages: [
diff --git a/docs/source/distributions/self_hosted_distro/dell-tgi.md b/docs/source/distributions/self_hosted_distro/dell-tgi.md
index 705bf2fa7..cf0c02983 100644
--- a/docs/source/distributions/self_hosted_distro/dell-tgi.md
+++ b/docs/source/distributions/self_hosted_distro/dell-tgi.md
@@ -41,7 +41,7 @@ The script will first start up TGI server, then start up Llama Stack distributio
 INFO: Started server process [1]
 INFO: Waiting for application startup.
 INFO: Application startup complete.
-INFO: Uvicorn running on http://[::]:5000 (Press CTRL+C to quit)
+INFO: Uvicorn running on http://[::]:8321 (Press CTRL+C to quit)
 ```

 To kill the server
@@ -65,7 +65,7 @@ registry.dell.huggingface.co/enterprise-dell-inference-meta-llama-meta-llama-3.1
 #### Start Llama Stack server pointing to TGI server

 ```
-docker run --network host -it -p 5000:5000 -v ./run.yaml:/root/my-run.yaml --gpus=all llamastack/distribution-tgi --yaml_config /root/my-run.yaml
+docker run --network host -it -p 8321:8321 -v ./run.yaml:/root/my-run.yaml --gpus=all llamastack/distribution-tgi --yaml_config /root/my-run.yaml
 ```
 Make sure in you `run.yaml` file, you inference provider is pointing to the correct TGI server endpoint. E.g.
diff --git a/docs/source/references/llama_stack_client_cli_reference.md b/docs/source/references/llama_stack_client_cli_reference.md
index c3abccfd9..bc5f3e5e6 100644
--- a/docs/source/references/llama_stack_client_cli_reference.md
+++ b/docs/source/references/llama_stack_client_cli_reference.md
@@ -23,8 +23,8 @@ subcommands:
 ```bash
 $ llama-stack-client configure
 > Enter the host name of the Llama Stack distribution server: localhost
-> Enter the port number of the Llama Stack distribution server: 5000
-Done! You can now use the Llama Stack Client CLI with endpoint http://localhost:5000
+> Enter the port number of the Llama Stack distribution server: 8321
+Done! You can now use the Llama Stack Client CLI with endpoint http://localhost:8321
 ```

 ### `llama-stack-client providers list`
diff --git a/docs/zero_to_hero_guide/01_Local_Cloud_Inference101.ipynb b/docs/zero_to_hero_guide/01_Local_Cloud_Inference101.ipynb
index bdfd3520f..39644ee51 100644
--- a/docs/zero_to_hero_guide/01_Local_Cloud_Inference101.ipynb
+++ b/docs/zero_to_hero_guide/01_Local_Cloud_Inference101.ipynb
@@ -32,8 +32,8 @@
    "outputs": [],
    "source": [
     "HOST = \"localhost\" # Replace with your host\n",
-    "LOCAL_PORT = 5000 # Replace with your local distro port\n",
-    "CLOUD_PORT = 5001 # Replace with your cloud distro port"
+    "LOCAL_PORT = 8321 # Replace with your local distro port\n",
+    "CLOUD_PORT = 8322 # Replace with your cloud distro port"
    ]
   },
   {
@@ -43,7 +43,7 @@
    "source": [
     "#### 2. Set Up Local and Cloud Clients\n",
     "\n",
-    "Initialize both clients, specifying the `base_url` for each instance. In this case, we have the local distribution running on `http://localhost:5000` and the cloud distribution running on `http://localhost:5001`.\n"
+    "Initialize both clients, specifying the `base_url` for each instance. In this case, we have the local distribution running on `http://localhost:8321` and the cloud distribution running on `http://localhost:5001`.\n"
    ]
   },
   {
diff --git a/llama_stack/cli/stack/run.py b/llama_stack/cli/stack/run.py
index 7942f603a..9fa82bd61 100644
--- a/llama_stack/cli/stack/run.py
+++ b/llama_stack/cli/stack/run.py
@@ -34,8 +34,8 @@ class StackRun(Subcommand):
         self.parser.add_argument(
             "--port",
             type=int,
-            help="Port to run the server on. Defaults to 5000",
-            default=int(os.getenv("LLAMA_STACK_PORT", 5000)),
+            help="Port to run the server on. Defaults to 8321",
+            default=int(os.getenv("LLAMA_STACK_PORT", 8321)),
         )
         self.parser.add_argument(
             "--image-name",
diff --git a/llama_stack/distribution/server/server.py b/llama_stack/distribution/server/server.py
index 2d216d314..6a0047f69 100644
--- a/llama_stack/distribution/server/server.py
+++ b/llama_stack/distribution/server/server.py
@@ -293,7 +293,7 @@ def main():
     parser.add_argument(
         "--port",
         type=int,
-        default=int(os.getenv("LLAMA_STACK_PORT", 5000)),
+        default=int(os.getenv("LLAMA_STACK_PORT", 8321)),
         help="Port to listen on",
     )
     parser.add_argument(
diff --git a/llama_stack/distribution/ui/modules/api.py b/llama_stack/distribution/ui/modules/api.py
index d3852caee..70c7a0898 100644
--- a/llama_stack/distribution/ui/modules/api.py
+++ b/llama_stack/distribution/ui/modules/api.py
@@ -14,7 +14,7 @@ from llama_stack_client import LlamaStackClient
 class LlamaStackApi:
     def __init__(self):
         self.client = LlamaStackClient(
-            base_url=os.environ.get("LLAMA_STACK_ENDPOINT", "http://localhost:5000"),
+            base_url=os.environ.get("LLAMA_STACK_ENDPOINT", "http://localhost:8321"),
             provider_data={
                 "fireworks_api_key": os.environ.get("FIREWORKS_API_KEY", ""),
                 "together_api_key": os.environ.get("TOGETHER_API_KEY", ""),
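For anyone validating this change locally, here is a small illustrative sketch (not part of the patch). It resolves the endpoint with the same precedence `llama_stack/distribution/ui/modules/api.py` now uses (an explicit `LLAMA_STACK_ENDPOINT`, otherwise the new 8321 default) and checks that something is listening there. Only the `LLAMA_STACK_ENDPOINT`/`LLAMA_STACK_PORT` environment variables, the `llama stack run --port` flag, and the 8321 default come from this patch; the TCP probe is an assumed helper, not a Llama Stack API.

```python
# Minimal sketch, assuming a Llama Stack server is meant to be running locally.
# Resolve the endpoint the way the UI module does, then probe it over TCP.
import os
import socket
from urllib.parse import urlparse

# Explicit LLAMA_STACK_ENDPOINT wins; otherwise fall back to the new default port.
endpoint = os.environ.get("LLAMA_STACK_ENDPOINT", "http://localhost:8321")
parsed = urlparse(endpoint)
host, port = parsed.hostname or "localhost", parsed.port or 8321

try:
    # Open (and immediately close) a TCP connection to see if a server is listening.
    with socket.create_connection((host, port), timeout=2):
        print(f"Llama Stack appears to be listening at {endpoint}")
except OSError:
    print(
        f"Nothing is listening on {host}:{port}. Start the server with "
        "`llama stack run --port 8321 <config>` or point LLAMA_STACK_ENDPOINT "
        "at your deployment."
    )
```

Scripts or compose overrides still hard-coded to the old 5000 default will fail this check until they are updated to 8321 or export the matching environment variables.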