forked from phoenix-oss/llama-stack-mirror
		
	* move docs -> source * Add files via upload * mv image * Add files via upload * colocate iOS setup doc * delete image * Add files via upload * fix * delete image * Add files via upload * Update developer_cookbook.md * toctree * wip subfolder * docs update * subfolder * updates * name * updates * index * updates * refactor structure * depth * docs * content * docs * getting started * distributions * fireworks * fireworks * update * theme * theme * theme * pdj theme * pytorch theme * css * theme * agents example * format * index * headers * copy button * test tabs * test tabs * fix * tabs * tab * tabs * sphinx_design * quick start commands * size * width * css * css * download models * asthetic fix * tab format * update * css * width * css * docs * tab based * tab * tabs * docs * style * image * css * color * typo * update docs * missing links * list templates * links * links update * troubleshooting * fix * distributions * docs * fix table * kill llamastack-local-gpu/cpu * Update index.md * Update index.md * mv ios_setup.md * Update ios_setup.md * Add remote_or_local.gif * Update ios_setup.md * release notes * typos * Add ios_setup to index * nav bar * hide torctree * ios image * links update * rename * rename * docs * rename * links * distributions * distributions * distributions * distributions * remove release * remote --------- Co-authored-by: dltn <6599399+dltn@users.noreply.github.com> Co-authored-by: Ashwin Bharambe <ashwin.bharambe@gmail.com>
		
			
				
	
	
		
			33 lines
		
	
	
	
		
			1.1 KiB
		
	
	
	
		
			YAML
		
	
	
	
	
	
			
		
		
	
	
			33 lines
		
	
	
	
		
			1.1 KiB
		
	
	
	
		
			YAML
		
	
	
	
	
	
# Docker Compose stack with two services: a Hugging Face
# text-generation-inference (TGI) server and a Llama Stack server that is
# started only after TGI reports healthy.
services:
  text-generation-inference:
    image: ghcr.io/huggingface/text-generation-inference:latest
    # Host networking: the container shares the host's network stack, so the
    # server started with `--port 5009` below is reachable on host port 5009
    # directly. The Compose specification does not allow a `ports:` mapping
    # together with host networking, so none is declared.
    network_mode: "host"
    volumes:
      # Host Hugging Face cache directory, mounted at /data in the container.
      - $HOME/.cache/huggingface:/data
    # Arguments passed to the image's entrypoint. "on" and "false" stay
    # quoted so YAML keeps them as strings rather than booleans.
    command:
      - "--dtype"
      - "bfloat16"
      - "--usage-stats"
      - "on"
      - "--sharded"
      - "false"
      - "--model-id"
      - "meta-llama/Llama-3.1-8B-Instruct"
      - "--port"
      - "5009"
      - "--cuda-memory-fraction"
      - "0.3"
    runtime: nvidia
    healthcheck:
      # The probe runs inside this container. With host networking the
      # Compose service name is not registered in DNS, so the check targets
      # localhost on the port given by `--port` above.
      test: ["CMD", "curl", "-f", "http://localhost:5009/health"]
      interval: 5s
      timeout: 5s
      retries: 30
  llamastack:
    # Wait until the TGI healthcheck above passes before starting.
    depends_on:
      text-generation-inference:
        condition: service_healthy
    image: llamastack/llamastack-tgi
    # Host networking, as for the TGI service; no `ports:` mapping is declared
    # for the same reason.
    # NOTE(review): the original file mapped "5000:5000", so the server
    # presumably listens on port 5000 - confirm against run.yaml.
    network_mode: "host"
    volumes:
      - ~/.llama:/root/.llama
      # Link to run.yaml file
      - ./run.yaml:/root/my-run.yaml
    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
    # `restart_policy` is a key of the `deploy` section in the Compose
    # specification, not a service-level key, so it is nested here.
    deploy:
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s