forked from phoenix-oss/llama-stack-mirror
		
	llama stack distributions / templates / docker refactor (#266)
* docker compose ollama * comment * update compose file * readme for distributions * readme * move distribution folders * move distribution/templates to distributions/ * rename * kill distribution/templates * readme * readme * build/developer cookbook/new api provider * developer cookbook * readme * readme * [bugfix] fix case for agent when memory bank registered without specifying provider_id (#264) * fix case where memory bank is registered without provider_id * memory test * agents unit test * Add an option to not use elastic agents for meta-reference inference (#269) * Allow overridding checkpoint_dir via config * Small rename * Make all methods `async def` again; add completion() for meta-reference (#270) PR #201 had made several changes while trying to fix issues with getting the stream=False branches of inference and agents API working. As part of this, it made a change which was slightly gratuitous. Namely, making chat_completion() and brethren "def" instead of "async def". The rationale was that this allowed the user (within llama-stack) of this to use it as: ``` async for chunk in api.chat_completion(params) ``` However, it causes unnecessary confusion for several folks. Given that clients (e.g., llama-stack-apps) anyway use the SDK methods (which are completely isolated) this choice was not ideal. Let's revert back so the call now looks like: ``` async for chunk in await api.chat_completion(params) ``` Bonus: Added a completion() implementation for the meta-reference provider. Technically should have been another PR :) * Improve an important error message * update ollama for llama-guard3 * Add vLLM inference provider for OpenAI compatible vLLM server (#178) This PR adds vLLM inference provider for OpenAI compatible vLLM server. 
* Create .readthedocs.yaml Trying out readthedocs * Update event_logger.py (#275) spelling error * vllm * build templates * delete templates * tmp add back build to avoid merge conflicts * vllm * vllm --------- Co-authored-by: Ashwin Bharambe <ashwin.bharambe@gmail.com> Co-authored-by: Ashwin Bharambe <ashwin@meta.com> Co-authored-by: Yuan Tang <terrytangyuan@gmail.com> Co-authored-by: raghotham <rsm@meta.com> Co-authored-by: nehal-a2z <nehal@coderabbit.ai>
This commit is contained in:
		
							parent
							
								
									c995219731
								
							
						
					
					
						commit
						23210e8679
					
				
					 32 changed files with 850 additions and 335 deletions
				
			
		
							
								
								
									
										48
									
								
								distributions/ollama/gpu/compose.yaml
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										48
									
								
								distributions/ollama/gpu/compose.yaml
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,48 @@ | |||
| services: | ||||
|   # Ollama model server with access to the host's NVIDIA GPU. | ||||
|   ollama: | ||||
|     image: ollama/ollama:latest | ||||
|     # Host networking exposes the server directly on the host (Ollama's | ||||
|     # default port is 11434). Port mappings cannot be combined with | ||||
|     # network_mode: host, so no `ports:` section is declared. | ||||
|     network_mode: "host" | ||||
|     volumes: | ||||
|       # Keep downloaded models in the named `ollama` volume so they are | ||||
|       # reused across container restarts instead of being pulled again. | ||||
|       - ollama:/root/.ollama | ||||
|     devices: | ||||
|       - nvidia.com/gpu=all | ||||
|     environment: | ||||
|       - CUDA_VISIBLE_DEVICES=0 | ||||
|     command: [] | ||||
|     deploy: | ||||
|       resources: | ||||
|         reservations: | ||||
|           devices: | ||||
|             - driver: nvidia | ||||
|               # Closest analogue to `docker run --gpus`: an integer | ||||
|               # number of devices, or 'all'. | ||||
|               count: 1 | ||||
|               # Devices are reserved by capability list, the only | ||||
|               # required field; a device must satisfy every requested | ||||
|               # capability for the reservation to succeed. | ||||
|               capabilities: [gpu] | ||||
|     runtime: nvidia | ||||
|   # Llama Stack server; started after the ollama service. | ||||
|   llamastack-local-cpu: | ||||
|     depends_on: | ||||
|       - ollama | ||||
|     image: llamastack/llamastack-local-cpu | ||||
|     # Host networking lets the server reach Ollama on 127.0.0.1 and is | ||||
|     # itself reachable on the host (presumably port 5000, the port the | ||||
|     # earlier mapping published - confirm). Port mappings cannot be | ||||
|     # combined with network_mode: host, so no `ports:` section is declared. | ||||
|     network_mode: "host" | ||||
|     volumes: | ||||
|       - ~/.llama:/root/.llama | ||||
|       # Mount the run configuration that sits next to this compose file. | ||||
|       # The source must be an existing file: Docker creates an empty | ||||
|       # directory for a missing bind-mount source. | ||||
|       - ./run.yaml:/root/llamastack-run-ollama.yaml | ||||
|     # Hack: wait for the ollama server to start before launching the | ||||
|     # stack server (depends_on only orders container start-up). | ||||
|     entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-ollama.yaml" | ||||
|     deploy: | ||||
|       restart_policy: | ||||
|         condition: on-failure | ||||
|         delay: 3s | ||||
|         max_attempts: 5 | ||||
|         window: 60s | ||||
| # Named volumes shared with the services above. | ||||
| volumes: | ||||
|   # Mounted at /root/.ollama by the ollama service; default driver settings. | ||||
|   ollama: | ||||
							
								
								
									
										46
									
								
								distributions/ollama/gpu/run.yaml
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										46
									
								
								distributions/ollama/gpu/run.yaml
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,46 @@ | |||
| # Run configuration metadata. | ||||
| version: '2' | ||||
| built_at: '2024-10-08T17:40:45.325529' | ||||
| image_name: local | ||||
| docker_image: null | ||||
| conda_env: local | ||||
| # APIs served by this distribution. | ||||
| apis: | ||||
|   - shields | ||||
|   - agents | ||||
|   - models | ||||
|   - memory | ||||
|   - memory_banks | ||||
|   - inference | ||||
|   - safety | ||||
| # Provider implementations, keyed by API name. | ||||
| providers: | ||||
|   inference: | ||||
|     - provider_id: ollama0 | ||||
|       provider_type: remote::ollama | ||||
|       config: | ||||
|         # Ollama listens on port 11434 by default; the server runs with | ||||
|         # host networking, so it is reachable on the loopback address. | ||||
|         url: http://127.0.0.1:11434 | ||||
|   safety: | ||||
|     - provider_id: meta0 | ||||
|       provider_type: meta-reference | ||||
|       config: | ||||
|         llama_guard_shield: | ||||
|           model: Llama-Guard-3-1B | ||||
|           excluded_categories: [] | ||||
|           disable_input_check: false | ||||
|           disable_output_check: false | ||||
|         prompt_guard_shield: | ||||
|           model: Prompt-Guard-86M | ||||
|   memory: | ||||
|     - provider_id: meta0 | ||||
|       provider_type: meta-reference | ||||
|       config: {} | ||||
|   agents: | ||||
|     - provider_id: meta0 | ||||
|       provider_type: meta-reference | ||||
|       config: | ||||
|         # Agent state is persisted in a local SQLite key-value store. | ||||
|         persistence_store: | ||||
|           namespace: null | ||||
|           type: sqlite | ||||
|           db_path: ~/.llama/runtime/kvstore.db | ||||
|   telemetry: | ||||
|     - provider_id: meta0 | ||||
|       provider_type: meta-reference | ||||
|       config: {} | ||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue