Mirror of https://github.com/meta-llama/llama-stack.git, synced 2025-07-30 07:39:38 +00:00
example inference router run config

Commit e2c7a3cea9 (parent bce79617bf)
2 changed files with 75 additions and 10 deletions
@@ -101,16 +101,17 @@ async def run_main(host: str, port: int, stream: bool):
     async for log in EventLogger().log(iterator):
         log.print()
 
-    cprint(f"User>{message.content}", "green")
-    iterator = client.chat_completion(
-        ChatCompletionRequest(
-            model="Meta-Llama3.1-8B",
-            messages=[message],
-            stream=stream,
-        )
-    )
-    async for log in EventLogger().log(iterator):
-        log.print()
+    # For testing models routing
+    # cprint(f"User>{message.content}", "green")
+    # iterator = client.chat_completion(
+    #     ChatCompletionRequest(
+    #         model="Meta-Llama3.1-8B",
+    #         messages=[message],
+    #         stream=stream,
+    #     )
+    # )
+    # async for log in EventLogger().log(iterator):
+    #     log.print()
 
 
 def main(host: str, port: int, stream: bool = True):
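The change keeps the routing test around as commented-out code: when re-enabled, it issues a second chat_completion against the base Meta-Llama3.1-8B model, so the models router configured in the new YAML file below has to resolve a different models_config entry than the Instruct model used earlier in run_main. Below is a minimal sketch of that test as a standalone coroutine; only cprint, ChatCompletionRequest, EventLogger, and the request fields are taken from the hunk, while the function name, its parameters, and the idea of passing client and message in (their construction is not visible here) are assumptions for illustration.

# Hypothetical helper, not part of the commit: the commented-out routing test
# re-enabled. `client` and `message` are assumed to be built the same way as
# for the earlier chat_completion call in run_main.
async def run_routing_test(client, message, stream: bool = True) -> None:
    cprint(f"User>{message.content}", "green")
    # Request the base (non-Instruct) model so the models router must pick the
    # matching provider entry from models_config in the run config.
    iterator = client.chat_completion(
        ChatCompletionRequest(
            model="Meta-Llama3.1-8B",
            messages=[message],
            stream=stream,
        )
    )
    async for log in EventLogger().log(iterator):
        log.print()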
llama_stack/configs/examples/local-router-run.yaml (new file, 64 lines added)
built_at: '2024-09-18T13:41:17.656743'
image_name: local
docker_image: null
conda_env: local
apis_to_serve:
- inference
- memory
- safety
- telemetry
- agents
- models
provider_map:
  inference: models-router
  safety:
    provider_id: meta-reference
    config:
      llama_guard_shield:
        model: Llama-Guard-3-8B
        excluded_categories: []
        disable_input_check: false
        disable_output_check: false
      prompt_guard_shield:
        model: Prompt-Guard-86M
  telemetry:
    provider_id: meta-reference
    config: {}
  agents:
    provider_id: meta-reference
    config: {}
  models:
    provider_id: builtin
    config:
      models_config:
      - core_model_id: Meta-Llama3.1-8B-Instruct
        provider_id: meta-reference
        api: inference
        config:
          model: Meta-Llama3.1-8B-Instruct
          quantization: null
          torch_seed: null
          max_seq_len: 4096
          max_batch_size: 1
      - core_model_id: Meta-Llama3.1-8B
        provider_id: meta-reference
        api: inference
        config:
          model: Meta-Llama3.1-8B
          quantization: null
          torch_seed: null
          max_seq_len: 4096
          max_batch_size: 1
      - core_model_id: Llama-Guard-3-8B
        provider_id: meta-reference
        api: safety
        config:
          model: Llama-Guard-3-8B
          excluded_categories: []
          disable_input_check: false
          disable_output_check: false
      - core_model_id: Prompt-Guard-86M
        provider_id: meta-reference
        api: safety
        config:
          model: Prompt-Guard-86M
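The "builtin" models provider carries the routing table that the inference router relies on: each models_config entry ties a core_model_id to a provider_id, the API it serves, and that provider's own config, while provider_map.inference simply points at models-router. Conceptually, routing a request is a lookup by the requested model id. The sketch below is an illustrative stand-in, not llama-stack's actual router code; the helper name, the dict access path, and the config path passed to open() are assumptions based on the YAML above.

import yaml

def resolve_provider(run_config: dict, model_id: str, api: str = "inference") -> dict:
    # Walk the models_config list registered under the builtin models provider
    # and return the entry whose core_model_id matches the requested model.
    entries = run_config["provider_map"]["models"]["config"]["models_config"]
    for entry in entries:
        if entry["core_model_id"] == model_id and entry["api"] == api:
            return entry
    raise ValueError(f"no {api} provider registered for model {model_id}")

# Hypothetical usage against the example config added by this commit.
with open("llama_stack/configs/examples/local-router-run.yaml") as f:
    cfg = yaml.safe_load(f)

# "Meta-Llama3.1-8B" (the model requested by the commented-out client block)
# resolves to a meta-reference inference provider with max_seq_len 4096.
print(resolve_provider(cfg, "Meta-Llama3.1-8B"))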