feat: remote ramalama provider implementation

Implement remote ramalama provider using AsyncOpenAI as the client since ramalama doesn't have its own Async library. Ramalama is similar to ollama, as it is a lightweight local inference server. However, it runs by default in a containerized mode. RAMALAMA_URL is http://localhost:8080 by default Signed-off-by: Charlie Doern <cdoern@redhat.com>
2025-12-30 12:14:19 +00:00 · 2025-03-11 18:15:45 -04:00 · 2025-03-11 18:15:45 -04:00 · 4de45560bf
commit 4de45560bf
parent 94f83382eb
8 changed files with 680 additions and 0 deletions
--- a/llama_stack/providers/remote/inference/ramalama/models.py
+++ b/llama_stack/providers/remote/inference/ramalama/models.py
@ -0,0 +1,103 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.apis.models.models import ModelType
+from llama_stack.models.llama.datatypes import CoreModelId
+from llama_stack.providers.utils.inference.model_registry import (
+    ProviderModelEntry,
+    build_hf_repo_model_entry,
+    build_model_entry,
+)
+
+model_entries = [
+    build_hf_repo_model_entry(
+        "llama3.1:8b-instruct-fp16",
+        CoreModelId.llama3_1_8b_instruct.value,
+    ),
+    build_model_entry(
+        "llama3.1:8b",
+        CoreModelId.llama3_1_8b_instruct.value,
+    ),
+    build_hf_repo_model_entry(
+        "llama3.1:70b-instruct-fp16",
+        CoreModelId.llama3_1_70b_instruct.value,
+    ),
+    build_model_entry(
+        "llama3.1:70b",
+        CoreModelId.llama3_1_70b_instruct.value,
+    ),
+    build_hf_repo_model_entry(
+        "llama3.1:405b-instruct-fp16",
+        CoreModelId.llama3_1_405b_instruct.value,
+    ),
+    build_model_entry(
+        "llama3.1:405b",
+        CoreModelId.llama3_1_405b_instruct.value,
+    ),
+    build_hf_repo_model_entry(
+        "llama3.2:1b-instruct-fp16",
+        CoreModelId.llama3_2_1b_instruct.value,
+    ),
+    build_model_entry(
+        "llama3.2:1b",
+        CoreModelId.llama3_2_1b_instruct.value,
+    ),
+    build_hf_repo_model_entry(
+        "llama3.2:3b-instruct-fp16",
+        CoreModelId.llama3_2_3b_instruct.value,
+    ),
+    build_model_entry(
+        "llama3.2:3b",
+        CoreModelId.llama3_2_3b_instruct.value,
+    ),
+    build_hf_repo_model_entry(
+        "llama3.2-vision:11b-instruct-fp16",
+        CoreModelId.llama3_2_11b_vision_instruct.value,
+    ),
+    build_model_entry(
+        "llama3.2-vision:latest",
+        CoreModelId.llama3_2_11b_vision_instruct.value,
+    ),
+    build_hf_repo_model_entry(
+        "llama3.2-vision:90b-instruct-fp16",
+        CoreModelId.llama3_2_90b_vision_instruct.value,
+    ),
+    build_model_entry(
+        "llama3.2-vision:90b",
+        CoreModelId.llama3_2_90b_vision_instruct.value,
+    ),
+    build_hf_repo_model_entry(
+        "llama3.3:70b",
+        CoreModelId.llama3_3_70b_instruct.value,
+    ),
+    # The Llama Guard models don't have their full fp16 versions
+    # so we are going to alias their default version to the canonical SKU
+    build_hf_repo_model_entry(
+        "llama-guard3:8b",
+        CoreModelId.llama_guard_3_8b.value,
+    ),
+    build_hf_repo_model_entry(
+        "llama-guard3:1b",
+        CoreModelId.llama_guard_3_1b.value,
+    ),
+    ProviderModelEntry(
+        provider_model_id="all-minilm:latest",
+        aliases=["all-minilm"],
+        model_type=ModelType.embedding,
+        metadata={
+            "embedding_dimension": 384,
+            "context_length": 512,
+        },
+    ),
+    ProviderModelEntry(
+        provider_model_id="nomic-embed-text",
+        model_type=ModelType.embedding,
+        metadata={
+            "embedding_dimension": 768,
+            "context_length": 8192,
+        },
+    ),
+]