forked from phoenix-oss/llama-stack-mirror
		
	## context Currently, llama stack only supports running inference / eval on a finetuned checkpoint with meta-reference as the inference provider. This is sub-optimal since meta-reference is pretty slow. Our vision is that developers can run inference / eval on a finetuned checkpoint produced by the post training APIs with any of the inference providers on the stack. To achieve this, we'd like to define a unified output checkpoint format for post training providers, so that all the inference providers can respect that format for customized model inference. By spot-checking how [ollama](https://github.com/ollama/ollama/blob/main/docs/import.md) and [fireworks](https://docs.fireworks.ai/models/uploading-custom-models) do inference on a customized model, we defined the output checkpoint format as /adapter/adapter_config.json and /adapter/adapter_model.safetensors (as we only support LoRA post training now, we begin with an adapter-only checkpoint). ## test We kicked off a post training job and configured the checkpoint format as 'huggingface' to produce the output files. We then did a proof of concept with ollama to see if ollama can run inference on our finetuned checkpoint: 1. create a Modelfile like <img width="799" alt="Screenshot 2025-01-22 at 5 04 18 PM" src="https://github.com/user-attachments/assets/7fca9ac3-a294-44f8-aab1-83852c600609" /> 2. create a customized model with `ollama create llama_3_2_finetuned` and run inference successfully. This is just a proof of concept with the ollama command line. As a next step, we'd like to wrap the logic for loading and running inference on a customized model in the inference provider implementation.
		
			
				
	
	
		
			91 lines
		
	
	
	
		
			2.1 KiB
		
	
	
	
		
			YAML
		
	
	
	
	
	
			
		
		
	
	
			91 lines
		
	
	
	
		
			2.1 KiB
		
	
	
	
		
			YAML
		
	
	
	
	
	
# Llama Stack run configuration for the `experimental-post-training` image.
#
# Values written as `${env.NAME:default}` are placeholders; presumably the
# stack resolves them from the environment when it loads this file, falling
# back to the text after the colon — TODO confirm against the config loader.
version: '2'
image_name: experimental-post-training
container_image: null
conda_env: experimental-post-training

# APIs enabled for this distribution. Every API listed here has a matching
# provider list under `providers` below.
apis:
- agents
- datasetio
- eval
- inference
- vector_io
- safety
- scoring
- telemetry
- post_training
- tool_runtime

providers:
  # Two inference providers are registered: the inline meta-reference
  # implementation and a remote Ollama server.
  inference:
  - provider_id: meta-reference-inference
    provider_type: inline::meta-reference
    config:
      max_seq_len: 4096
      checkpoint_dir: null
      create_distributed_process_group: false
  - provider_id: ollama
    provider_type: remote::ollama
    config:
      url: ${env.OLLAMA_URL:http://localhost:11434}
  eval:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config: {}
  scoring:
  - provider_id: basic
    provider_type: inline::basic
    config: {}
  - provider_id: braintrust
    provider_type: inline::braintrust
    config:
      # Empty default: no key is embedded in this file.
      openai_api_key: ${env.OPENAI_API_KEY:}
  datasetio:
  - provider_id: localfs
    provider_type: inline::localfs
    config: {}
  telemetry:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config: {}
  post_training:
  - provider_id: torchtune-post-training
    provider_type: inline::torchtune
    config:
      # Presumably selects the layout of the checkpoint the post-training
      # job writes out — confirm against the torchtune provider config.
      checkpoint_format: huggingface
  agents:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      persistence_store:
        type: sqlite
        namespace: null
        # NOTE(review): the default directory is named `meta-reference-gpu`
        # while image_name is `experimental-post-training` — confirm that
        # sharing this directory is intended. The same default is used by
        # vector_io and metadata_store below.
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-gpu}/agents_store.db
  safety:
  - provider_id: llama-guard
    provider_type: inline::llama-guard
    config: {}
  vector_io:
  - provider_id: faiss
    provider_type: inline::faiss
    config:
      kvstore:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-gpu}/faiss_store.db
  tool_runtime:
  - provider_id: brave-search
    provider_type: remote::brave-search
    config:
      # Empty default: no key is embedded in this file.
      api_key: ${env.BRAVE_SEARCH_API_KEY:}
      max_results: 3

metadata_store:
  namespace: null
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-gpu}/registry.db

# No resources are pre-registered; every list below is explicitly empty.
models: []
shields: []
vector_dbs: []
datasets: []
scoring_fns: []
benchmarks: []