diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml
index 6e7e99ef9..e03c7401c 100644
--- a/.github/workflows/integration-tests.yml
+++ b/.github/workflows/integration-tests.yml
@@ -25,7 +25,8 @@ jobs:
       matrix:
         # Listing tests manually since some of them currently fail
         # TODO: generate matrix list from tests/integration when fixed
-        test-type: [inference, datasets, inspect, scoring, post_training, providers]
+        test-type: [agents, inference, datasets, inspect, scoring, post_training, providers]
+        client-type: [library, http]
       fail-fast: false # we want to run all tests regardless of failure
 
     steps:
@@ -54,6 +55,8 @@
           uv sync --extra dev --extra test
           uv pip install ollama faiss-cpu
           # always test against the latest version of the client
+          # TODO: this is not necessarily a good idea. we need to test against both published and latest
+          # to find out backwards compatibility issues.
           uv pip install git+https://github.com/meta-llama/llama-stack-client-python.git@main
           uv pip install -e .
           llama stack build --template ollama --image-type venv
@@ -74,6 +77,7 @@
             exit 1
 
       - name: Start Llama Stack server in background
+        if: matrix.client-type == 'http'
         env:
           INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
         run: |
@@ -81,6 +85,7 @@
           nohup uv run llama stack run ./llama_stack/templates/ollama/run.yaml --image-type venv > server.log 2>&1 &
 
       - name: Wait for Llama Stack server to be ready
+        if: matrix.client-type == 'http'
         run: |
           echo "Waiting for Llama Stack server..."
           for i in {1..30}; do
@@ -98,4 +103,12 @@
         env:
           INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
         run: |
-          uv run pytest -v tests/integration/${{ matrix.test-type }} --stack-config=ollama --text-model="meta-llama/Llama-3.2-3B-Instruct" --embedding-model=all-MiniLM-L6-v2
+          if [ "${{ matrix.client-type }}" == "library" ]; then
+            stack_config="ollama"
+          else
+            stack_config="http://localhost:8321"
+          fi
+          uv run pytest -v tests/integration/${{ matrix.test-type }} --stack-config=${stack_config} \
+            -k "not(builtin_tool or safety_with_image or code_interpreter or test_rag)" \
+            --text-model="meta-llama/Llama-3.2-3B-Instruct" \
+            --embedding-model=all-MiniLM-L6-v2
diff --git a/tests/integration/inference/test_text_inference.py b/tests/integration/inference/test_text_inference.py
index f558254e5..c1eec35f0 100644
--- a/tests/integration/inference/test_text_inference.py
+++ b/tests/integration/inference/test_text_inference.py
@@ -275,6 +275,7 @@ def test_text_chat_completion_first_token_profiling(client_with_models, text_mod
         model_id=text_model_id,
         messages=messages,
         stream=False,
+        timeout=120,  # Increase timeout to 2 minutes for large conversation history
     )
     message_content = response.completion_message.content.lower().strip()
     assert len(message_content) > 0
@@ -301,6 +302,7 @@ def test_text_chat_completion_streaming(client_with_models, text_model_id, test_
         model_id=text_model_id,
         messages=[{"role": "user", "content": question}],
         stream=True,
+        timeout=120,  # Increase timeout to 2 minutes for large conversation history
     )
     streamed_content = [str(chunk.event.delta.text.lower().strip()) for chunk in response]
     assert len(streamed_content) > 0
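
The new `client-type` matrix dimension runs the same integration suites through two paths: a library client, where `--stack-config` names a template and the stack is built in-process, and an HTTP client, where pytest talks to a server started with `llama stack run`. As a rough local reproduction of what the workflow does above, assuming Ollama is already serving `meta-llama/Llama-3.2-3B-Instruct` and the venv was built with `llama stack build --template ollama --image-type venv`, and using `inference` as one example value of the `test-type` matrix, the two invocations look like this sketch:

```sh
# Library-client mode: --stack-config names the "ollama" template, so the stack
# runs inside the pytest process; no server is needed.
uv run pytest -v tests/integration/inference --stack-config=ollama \
  -k "not(builtin_tool or safety_with_image or code_interpreter or test_rag)" \
  --text-model="meta-llama/Llama-3.2-3B-Instruct" \
  --embedding-model=all-MiniLM-L6-v2

# HTTP-client mode: start the server first (same command as the workflow step),
# then point --stack-config at its URL.
nohup uv run llama stack run ./llama_stack/templates/ollama/run.yaml --image-type venv > server.log 2>&1 &
uv run pytest -v tests/integration/inference --stack-config=http://localhost:8321 \
  -k "not(builtin_tool or safety_with_image or code_interpreter or test_rag)" \
  --text-model="meta-llama/Llama-3.2-3B-Instruct" \
  --embedding-model=all-MiniLM-L6-v2
```

The `-k` expression mirrors the filter added in the workflow, which deselects tests the CI run does not currently exercise (builtin tools, image safety, code interpreter, and RAG).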