llama-stack-mirror/tests/integration/recordings/responses/257e85e44508.json

{
  "request": {
    "provider": "tavily",
    "tool_name": "web_search",
    "kwargs": {
      "query": "Llama 4 Maverick experts architecture"
    }
  },
  "response": {
    "body": {
      "__type__": "llama_stack.apis.tools.tools.ToolInvocationResult",
      "__data__": {
        "content": "{\"query\": \"Llama 4 Maverick experts architecture\", \"top_k\": [{\"title\": \"Medium \\ud83e\\udd99 Meta\\u2019s New Llama 4\\u2019s MoE Architecture Makes AI Faster & Cheaper | by Tahir | Medium\", \"url\": \"https://medium.com/@tahirbalarabe2/metas-new-llama-4-s-moe-architecture-makes-ai-faster-cheaper-635339e51e10\", \"content\": \"April 5, 2025 - Both Llama 4 Scout and Llama 4 Maverick are 17 billion active parameter models, but they differ in their MoE architecture. Llama 4 Scout has 16 experts, allowing it to fit on a single H100 GPU with Int4 quantization, while Llama 4 Maverick features 128 experts and fits on a single H100 host.\", \"score\": 0.90039873, \"raw_content\": null}, {\"title\": \"Hugging Face meta-llama/Llama-4-Maverick-17B-128E-Instruct \\u00b7 Hugging Face\", \"url\": \"https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct\", \"content\": \"Model Architecture: The Llama 4 models are auto-regressive language models that use a mixture-of-experts (MoE) architecture and incorporate early fusion for native multimodality.\", \"score\": 0.86358756, \"raw_content\": null}, {\"title\": \"Meta The Llama 4 herd: The beginning of a new era of natively multimodal AI innovation\", \"url\": \"https://ai.meta.com/blog/llama-4-multimodal-intelligence/\", \"content\": \"* Llama 4 Scout, a 17 billion active parameter model with 16 experts, is the best multimodal model in the world in its class and is more powerful than all previous generation Llama models, while fitting in a single NVIDIA H100 GPU. * Llama 4 Maverick, a 17 billion active parameter model with 128 experts, is the best multimodal model in its class, beating GPT-4o and Gemini 2.0 Flash across a broad range of widely reported benchmarks, while achieving comparable results to the new DeepSeek v3 on reasoning and coding\\u2014at less than half the active parameters. \\nWe\\u2019re introducing Llama 4 Scout and Llama 4 Maverick, the first open-weight natively multimodal models with unprecedented context length support and our first built using a mixture-of-experts (MoE) architecture.\", \"score\": 0.84767824, \"raw_content\": null}, {\"title\": \"Medium Llama 4 Technical Analysis: Decoding the Architecture Behind Meta\\u2019s Multimodal MoE Revolution | by Karan_bhutani | Medium\", \"url\": \"https://medium.com/@karanbhutani477/llama-4-technical-analysis-decoding-the-architecture-behind-metas-multimodal-moe-revolution-535b2775d07d\", \"content\": \"**Enter Llama 4**, which tackles these challenges head-on with new approaches: a Mixture-of-Experts architecture to explode parameter count without proportional compute cost, an early-fusion multimodal design to natively handle text+images+video, an *iRoPE* positional encoding to support extremely long contexts, and **FP8** precision for efficient training and deployment. Compared to Llama 3\\u2019s dense transformer baseline, Llama 4 introduces specialist experts (MoE) to vastly expand capacity without a proportional compute hit, blends multiple input modalities from the very start for deeply integrated understanding, extends positional embeddings to support unprecedented context lengths, and leverages low-bit precision to make training and inference faster and more economical.\", \"score\": 0.7393665, \"raw_content\": null}, {\"title\": \"Llama Unmatched Performance and Efficiency | Llama 4\", \"url\": \"https://www.llama.com/models/llama-4/\", \"content\": \"# Llama 4 # Llama 4 Llama 4 Scout Class-leading natively multimodal model that offers superior text and visual intelligence, single H100 GPU efficiency, and a 10M context window for seamless long document analysis. Llama 4 MaverickIndustry-leading natively multimodal model for image and text understanding with groundbreaking intelligence and fast responses at a low cost. \\nWe evaluated model performance on a suite of common benchmarks across a wide range of languages, testing for coding, reasoning, knowledge, vision understanding, multilinguality, and long context. 4. Specialized long context evals are not traditionally reported for generalist models, so we share internal runs to showcase llama's frontier performance. 4. Specialized long context evals are not traditionally reported for generalist models, so we share internal runs to showcase llama's frontier performance.\", \"score\": 0.49758363, \"raw_content\": null}]}",
        "error_message": null,
        "error_code": null,
        "metadata": null
      }
    },
    "is_streaming": false
  }
}
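
A consumer of a recording like this has to decode twice: once for the file itself, and once more because `content` is a JSON-encoded string rather than a nested object. The sketch below is a minimal illustration of that unpacking, not the actual replay logic in llama-stack; `RECORDING_JSON` is a trimmed stand-in for the file above, and the field names are taken from the recording's own shape.

```python
import json

# Trimmed stand-in for a recording like the one above (assumption for
# illustration); the real file carries the full Tavily result list.
RECORDING_JSON = """
{
  "request": {"provider": "tavily", "tool_name": "web_search",
              "kwargs": {"query": "Llama 4 Maverick experts architecture"}},
  "response": {
    "body": {
      "__type__": "llama_stack.apis.tools.tools.ToolInvocationResult",
      "__data__": {
        "content": "{\\"query\\": \\"Llama 4 Maverick experts architecture\\", \\"top_k\\": [{\\"title\\": \\"Meta blog\\", \\"url\\": \\"https://ai.meta.com/blog/llama-4-multimodal-intelligence/\\", \\"score\\": 0.85}]}",
        "error_message": null,
        "error_code": null,
        "metadata": null
      }
    },
    "is_streaming": false
  }
}
"""

# First decode: the recording file itself.
recording = json.loads(RECORDING_JSON)
assert recording["request"]["tool_name"] == "web_search"

body = recording["response"]["body"]["__data__"]
assert body["error_code"] is None and body["error_message"] is None

# Second decode: `content` is itself a JSON-encoded string.
results = json.loads(body["content"])["top_k"]
best = max(results, key=lambda r: r["score"])
print(best["url"])  # the highest-scoring result's URL
```

Because `content` is double-encoded, every quote and unicode escape inside it appears double-escaped in the recording file, which is why the `top_k` payload above is full of `\"` and `\\uXXXX` sequences.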