diff --git a/llama_stack/providers/tests/eval/provider_config_example.yaml b/llama_stack/providers/tests/eval/provider_config_example.yaml
deleted file mode 100644
index 38f7512f1..000000000
--- a/llama_stack/providers/tests/eval/provider_config_example.yaml
+++ /dev/null
@@ -1,22 +0,0 @@
-providers:
-  datasetio:
-    - provider_id: test-meta
-      provider_type: meta-reference
-      config: {}
-  scoring:
-    - provider_id: test-meta
-      provider_type: meta-reference
-      config: {}
-  eval:
-    - provider_id: test-meta
-      provider_type: meta-reference
-      config: {}
-  inference:
-    - provider_id: test-tgi
-      provider_type: remote::tgi
-      config:
-        url: http://127.0.0.1:5009
-    - provider_id: test-tgi-2
-      provider_type: remote::tgi
-      config:
-        url: http://127.0.0.1:5010
diff --git a/llama_stack/providers/tests/eval/test_eval_old.py b/llama_stack/providers/tests/eval/test_eval_old.py
deleted file mode 100644
index 667be1bd5..000000000
--- a/llama_stack/providers/tests/eval/test_eval_old.py
+++ /dev/null
@@ -1,83 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-import pytest
-import pytest_asyncio
-
-from llama_stack.apis.common.type_system import *  # noqa: F403
-from llama_stack.apis.datasetio import *  # noqa: F403
-from llama_stack.apis.eval.eval import ModelCandidate
-from llama_stack.distribution.datatypes import *  # noqa: F403
-
-from llama_models.llama3.api import SamplingParams
-
-from llama_stack.providers.tests.datasetio.test_datasetio import register_dataset
-from llama_stack.providers.tests.resolver import resolve_impls_for_test
-
-# How to run this test:
-#
-# 1. Ensure you have a conda with the right dependencies installed. This is a bit tricky
-#    since it depends on the provider you are testing. On top of that you need
-#    `pytest` and `pytest-asyncio` installed.
-#
-# 2. Copy and modify the provider_config_example.yaml depending on the provider you are testing.
-#
-# 3. Run:
-#
-# ```bash
-# PROVIDER_ID= \
-#  PROVIDER_CONFIG=provider_config.yaml \
-#  pytest -s llama_stack/providers/tests/eval/test_eval.py \
-#  --tb=short --disable-warnings
-# ```
-
-
-@pytest_asyncio.fixture(scope="session")
-async def eval_settings():
-    impls = await resolve_impls_for_test(
-        Api.eval, deps=[Api.datasetio, Api.scoring, Api.inference]
-    )
-    return {
-        "eval_impl": impls[Api.eval],
-        "scoring_impl": impls[Api.scoring],
-        "datasets_impl": impls[Api.datasets],
-    }
-
-
-@pytest.mark.asyncio
-async def test_eval(eval_settings):
-    datasets_impl = eval_settings["datasets_impl"]
-    await register_dataset(
-        datasets_impl,
-        for_generation=True,
-        dataset_id="test_dataset_for_eval",
-    )
-
-    response = await datasets_impl.list_datasets()
-    assert len(response) == 1
-
-    eval_impl = eval_settings["eval_impl"]
-    response = await eval_impl.evaluate_batch(
-        dataset_id=response[0].identifier,
-        candidate=ModelCandidate(
-            model="Llama3.2-1B-Instruct",
-            sampling_params=SamplingParams(),
-        ),
-        scoring_functions=[
-            "meta-reference::subset_of",
-            "meta-reference::llm_as_judge_8b_correctness",
-        ],
-    )
-    assert response.job_id == "0"
-    job_status = await eval_impl.job_status(response.job_id)
-
-    assert job_status and job_status.value == "completed"
-
-    eval_response = await eval_impl.job_result(response.job_id)
-
-    assert eval_response is not None
-    assert len(eval_response.generations) == 5
-    assert "meta-reference::subset_of" in eval_response.scores
-    assert "meta-reference::llm_as_judge_8b_correctness" in eval_response.scores