ci: use ollama container image with loaded models (#2410)

# What does this PR do?

Instead of downloading the models on every run, CI now uses a single Ollama container image with the models already pulled and baked in, ready to use.

This removes the CI flakiness caused by model pulling.
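For reference, a pre-baked image like this can be produced by pulling the models at image build time. A minimal sketch, assuming the official `ollama/ollama` base image and the two models the old action pulled by default; the actual recipe behind `docker.io/leseb/ollama-with-models` is not part of this PR:

```bash
# Sketch only: one way to bake models into an Ollama image at build time.
# The real docker.io/leseb/ollama-with-models image may be built differently.
cat > Containerfile <<'EOF'
FROM docker.io/ollama/ollama:latest
# Start the server in the background, pull the models, and let the layer commit
# capture the downloaded weights. If the base image ever marks the model
# directory as a volume, OLLAMA_MODELS would need to point somewhere else.
RUN ollama serve & sleep 5 && \
    ollama pull llama3.2:3b-instruct-fp16 && \
    ollama pull all-minilm:latest
EOF
docker build -t ollama-with-models -f Containerfile .
```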

Signed-off-by: Sébastien Han <seb@redhat.com>
Sébastien Han 2025-06-06 12:08:20 +02:00 committed by GitHub
parent 692709cd45
commit 0d0b8d2be1
4 changed files with 29 additions and 194 deletions

Changed file: the "Setup Ollama" composite action

@@ -1,26 +1,9 @@
 name: Setup Ollama
-description: Start Ollama and cache model
-inputs:
-  models:
-    description: Comma-separated list of models to pull
-    default: "llama3.2:3b-instruct-fp16,all-minilm:latest"
+description: Start Ollama
 runs:
   using: "composite"
   steps:
-    - name: Install and start Ollama
+    - name: Start Ollama
       shell: bash
       run: |
-        # the ollama installer also starts the ollama service
-        curl -fsSL https://ollama.com/install.sh | sh
-
-    # Do NOT cache models - pulling the cache is actually slower than just pulling the model.
-    # It takes ~45 seconds to pull the models from the cache and unpack it, but only 30 seconds to
-    # pull them directly.
-    # Maybe this is because the cache is being pulled at the same time by all the matrix jobs?
-    - name: Pull requested models
-      if: inputs.models != ''
-      shell: bash
-      run: |
-        for model in $(echo "${{ inputs.models }}" | tr ',' ' '); do
-          ollama pull "$model"
-        done
+        docker run -d --name ollama -p 11434:11434 docker.io/leseb/ollama-with-models
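The switch can be sanity-checked locally: start the same image the action now uses and confirm the models are already present, so no `ollama pull` step is required (illustration only, not part of the PR):

```bash
# Start the pre-baked image exactly as the updated action does.
docker run -d --name ollama -p 11434:11434 docker.io/leseb/ollama-with-models

# Wait for the API to come up, then list the models baked into the image.
# With the old install-and-pull flow this list would be empty until the pulls finished.
until curl -sf http://0.0.0.0:11434/api/tags > /dev/null; do sleep 1; done
curl -s http://0.0.0.0:11434/api/tags
```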

Changed file: the integration tests workflow

@@ -50,7 +50,7 @@ jobs:
         env:
           INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
         run: |
-          LLAMA_STACK_LOG_FILE=server.log nohup uv run llama stack run ./llama_stack/templates/ollama/run.yaml --image-type venv &
+          LLAMA_STACK_LOG_FILE=server.log nohup uv run llama stack run ./llama_stack/templates/ollama/run.yaml --image-type venv --env OLLAMA_URL="http://0.0.0.0:11434" &

       - name: Wait for Llama Stack server to be ready
         if: matrix.client-type == 'http'
@@ -87,6 +87,7 @@ jobs:
       - name: Run Integration Tests
         env:
           INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
+          OLLAMA_URL: "http://0.0.0.0:11434"
         run: |
           if [ "${{ matrix.client-type }}" == "library" ]; then
             stack_config="ollama"
@@ -107,7 +108,7 @@ jobs:
       - name: Write ollama logs to file
         if: ${{ always() }}
         run: |
-          sudo journalctl -u ollama.service > ollama.log
+          sudo docker logs ollama > ollama.log

       - name: Upload all logs to artifacts
         if: ${{ always() }}
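Taken together, the workflow's inference setup now amounts to the following, shown here as a local approximation with values taken from the diff above (the readiness loop stands in for the workflow's own wait step):

```bash
# Same endpoint the workflow exports for both the server and the tests.
export OLLAMA_URL="http://0.0.0.0:11434"
export INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct"

# Wait until the Ollama container answers before starting the stack.
until curl -sf "$OLLAMA_URL/api/tags" > /dev/null; do sleep 1; done

# Start Llama Stack against the containerized Ollama, as the workflow does.
LLAMA_STACK_LOG_FILE=server.log nohup uv run llama stack run \
  ./llama_stack/templates/ollama/run.yaml --image-type venv \
  --env OLLAMA_URL="$OLLAMA_URL" &

# Collect Ollama logs from the container instead of journald.
docker logs ollama > ollama.log
```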