docs: concepts and building_applications migration (#3534)
# What does this PR do?

- Migrates the remaining documentation sections to the new documentation format

## Test Plan

- Partial migration
This commit is contained in:
parent 05ff4c4420
commit c71ce8df61

82 changed files with 2535 additions and 1237 deletions
30  docs/docs/deploying/aws_eks_deployment.mdx  Normal file

@@ -0,0 +1,30 @@
---
title: AWS EKS Deployment Guide
description: Deploy Llama Stack on AWS EKS
sidebar_label: AWS EKS Deployment
sidebar_position: 3
---

## AWS EKS Deployment

### Prerequisites

- Set up an [EKS cluster](https://docs.aws.amazon.com/eks/latest/userguide/getting-started.html)
- Create a [GitHub OAuth app](https://docs.github.com/en/apps/oauth-apps/building-oauth-apps/creating-an-oauth-app)
  - Set the authorization callback URL to `http://<your-llama-stack-ui-url>/api/auth/callback/`

### Automated Deployment

```bash
export HF_TOKEN=<your-huggingface-token>
export GITHUB_CLIENT_ID=<your-github-client-id>
export GITHUB_CLIENT_SECRET=<your-github-client-secret>
export LLAMA_STACK_UI_URL=<your-llama-stack-ui-url>

cd docs/source/distributions/eks
./apply.sh
```

This script will:
- Set up a default storage class for AWS EKS
- Deploy the Llama Stack server as Kubernetes pods and services
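Once `apply.sh` finishes, a quick way to confirm the rollout is to list what it created and follow the server logs. This is only a sketch: the exact resource names depend on the manifests the script applies, and the `llama-stack-server` deployment name below is an assumption.

```bash
# List the pods and services the script created
kubectl get pods,svc

# Follow the server logs once its pod is running (deployment name is an assumption)
kubectl logs -f deployment/llama-stack-server
```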
14  docs/docs/deploying/index.mdx  Normal file

@@ -0,0 +1,14 @@
---
title: Deploying Llama Stack
description: Production deployment guides for Llama Stack in various environments
sidebar_label: Overview
sidebar_position: 1
---

import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';

# Deploying Llama Stack

[**→ Kubernetes Deployment Guide**](./kubernetes_deployment.mdx)

[**→ AWS EKS Deployment Guide**](./aws_eks_deployment.mdx)
224  docs/docs/deploying/kubernetes_deployment.mdx  Normal file

@@ -0,0 +1,224 @@
---
title: Kubernetes Deployment Guide
description: Deploy Llama Stack on Kubernetes clusters with vLLM inference service
sidebar_label: Kubernetes
sidebar_position: 2
---

import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';

# Kubernetes Deployment Guide

Deploy Llama Stack and vLLM servers in a Kubernetes cluster instead of running them locally. This guide covers both local development with Kind and production deployment on AWS EKS.

## Prerequisites

### Local Kubernetes Setup

Create a local Kubernetes cluster via Kind:

```bash
kind create cluster --image kindest/node:v1.32.0 --name llama-stack-test
```
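Kind names the kubeconfig context after the cluster, so you can confirm the new cluster is reachable before continuing (assuming the cluster name above):

```bash
kubectl cluster-info --context kind-llama-stack-test
```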
Set your Hugging Face token (base64-encoded, since it is placed directly into the Secret's `data` field in Step 1 below):

```bash
export HF_TOKEN=$(echo -n "your-hf-token" | base64)
```
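If you would rather not base64-encode the token by hand, an equivalent alternative is to let kubectl create the same Secret directly. This is a sketch; if you use it, skip the Secret portion of Step 1:

```bash
# Creates the hf-token-secret Secret and handles the base64 encoding itself
kubectl create secret generic hf-token-secret --from-literal=token="your-hf-token"
```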
## Quick Deployment

### Step 1: Create Storage and Secrets

```yaml
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: vllm-models
spec:
  accessModes:
    - ReadWriteOnce
  volumeMode: Filesystem
  resources:
    requests:
      storage: 50Gi
---
apiVersion: v1
kind: Secret
metadata:
  name: hf-token-secret
type: Opaque
data:
  token: $HF_TOKEN
EOF
```
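A quick sanity check that both resources exist before deploying vLLM (the names come from the manifest above):

```bash
kubectl get pvc vllm-models
kubectl get secret hf-token-secret
```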
### Step 2: Deploy vLLM Server

```yaml
cat <<EOF | kubectl apply -f -
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-server
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: vllm
  template:
    metadata:
      labels:
        app.kubernetes.io/name: vllm
    spec:
      containers:
      - name: vllm
        image: vllm/vllm-openai:latest
        command: ["/bin/sh", "-c"]
        args: ["vllm serve meta-llama/Llama-3.2-1B-Instruct"]
        env:
        - name: HUGGING_FACE_HUB_TOKEN
          valueFrom:
            secretKeyRef:
              name: hf-token-secret
              key: token
        ports:
        - containerPort: 8000
        volumeMounts:
        - name: llama-storage
          mountPath: /root/.cache/huggingface
      volumes:
      - name: llama-storage
        persistentVolumeClaim:
          claimName: vllm-models
---
apiVersion: v1
kind: Service
metadata:
  name: vllm-server
spec:
  selector:
    app.kubernetes.io/name: vllm
  ports:
  - protocol: TCP
    port: 8000
    targetPort: 8000
  type: ClusterIP
EOF
```
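On first start the vLLM pod downloads the model weights into the `vllm-models` volume, which can take several minutes. One way to watch for readiness (deployment and label names come from the manifest above; the timeout is an arbitrary choice):

```bash
# Block until the Deployment reports Available
kubectl wait --for=condition=Available deployment/vllm-server --timeout=900s

# Or follow the download and startup logs
kubectl logs -f -l app.kubernetes.io/name=vllm
```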
### Step 3: Configure Llama Stack

Update your run configuration:

```yaml
providers:
  inference:
  - provider_id: vllm
    provider_type: remote::vllm
    config:
      url: http://vllm-server.default.svc.cluster.local:8000/v1
      max_tokens: 4096
      api_token: fake
```
Save the run configuration updated in Step 3 as `vllm-llama-stack-run-k8s.yaml` in the build directory (the `ADD` below copies it into the image as `/app/config.yaml`), then build the container image:

```bash
tmp_dir=$(mktemp -d) && cat >$tmp_dir/Containerfile.llama-stack-run-k8s <<EOF
FROM distribution-myenv:dev
RUN apt-get update && apt-get install -y git
RUN git clone https://github.com/meta-llama/llama-stack.git /app/llama-stack-source
ADD ./vllm-llama-stack-run-k8s.yaml /app/config.yaml
EOF
podman build -f $tmp_dir/Containerfile.llama-stack-run-k8s -t llama-stack-run-k8s $tmp_dir
```
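If you are deploying to the local Kind cluster from the prerequisites, the node cannot pull `localhost/llama-stack-run-k8s:latest` from a registry, so the locally built image has to be loaded onto the cluster first. A sketch, assuming the image tag and cluster name used earlier in this guide:

```bash
# Export the image from podman's local store and load it onto the Kind node
podman save -o /tmp/llama-stack-run-k8s.tar localhost/llama-stack-run-k8s:latest
kind load image-archive /tmp/llama-stack-run-k8s.tar --name llama-stack-test
```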
### Step 4: Deploy Llama Stack Server

```yaml
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: llama-pvc
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 1Gi
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llama-stack-server
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: llama-stack
  template:
    metadata:
      labels:
        app.kubernetes.io/name: llama-stack
    spec:
      containers:
      - name: llama-stack
        image: localhost/llama-stack-run-k8s:latest
        imagePullPolicy: IfNotPresent
        command: ["python", "-m", "llama_stack.core.server.server", "--config", "/app/config.yaml"]
        ports:
        - containerPort: 5000
        volumeMounts:
        - name: llama-storage
          mountPath: /root/.llama
      volumes:
      - name: llama-storage
        persistentVolumeClaim:
          claimName: llama-pvc
---
apiVersion: v1
kind: Service
metadata:
  name: llama-stack-service
spec:
  selector:
    app.kubernetes.io/name: llama-stack
  ports:
  - protocol: TCP
    port: 5000
    targetPort: 5000
  type: ClusterIP
EOF
```
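Before testing, it can help to confirm the server pod came up cleanly (deployment and label names are from the manifest above):

```bash
kubectl wait --for=condition=Available deployment/llama-stack-server --timeout=300s
kubectl logs -l app.kubernetes.io/name=llama-stack
```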
### Step 5: Test Deployment

```bash
# Port forward and test
kubectl port-forward service/llama-stack-service 5000:5000

# In another terminal (the port-forward above keeps running):
llama-stack-client --endpoint http://localhost:5000 inference chat-completion --message "hello, what model are you?"
```
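The test assumes the `llama-stack-client` CLI is installed on the machine running the port-forward; if it is not, it is available from PyPI:

```bash
pip install llama-stack-client
```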
## Troubleshooting

**Check pod status:**
```bash
kubectl get pods -l app.kubernetes.io/name=vllm
kubectl logs -l app.kubernetes.io/name=vllm
```

**Test service connectivity:**
```bash
kubectl run -it --rm debug --image=curlimages/curl --restart=Never -- curl http://vllm-server:8000/v1/models
```
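If a pod stays in `Pending` or `ImagePullBackOff`, its events usually explain why (unbound PVC, image missing on the node, and so on):

```bash
kubectl describe pod -l app.kubernetes.io/name=vllm
kubectl get events --sort-by=.metadata.creationTimestamp
```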
## Related Resources

- **[Deployment Overview](./index)** - Overview of deployment options
- **[Distributions](/docs/distributions)** - Understanding Llama Stack distributions
- **[Configuration](/docs/distributions/configuration)** - Detailed configuration options