feat: add batch inference API to llama stack inference

This commit is contained in:
Ashwin Bharambe 2025-04-08 13:50:52 -07:00
parent ed58a94b30
commit 0cfb2e2473
24 changed files with 1041 additions and 377 deletions

View file

@@ -233,7 +233,7 @@ class Llama4:
source="output",
logprobs=(token_logprobs[idx, cur_pos : cur_pos + 1].tolist() if logprobs else None),
batch_idx=idx,
finished=eos_reached[idx],
finished=eos_reached[idx].item(),
ignore_token=cur_pos < len(prompt_tokens[idx]),
)
)