From f67081d2d6088a0d3175baffad94977ddf8f6483 Mon Sep 17 00:00:00 2001
From: Doug Edgar
Date: Fri, 12 Sep 2025 02:18:19 -0700
Subject: [PATCH] feat: migrate to FIPS-validated cryptographic algorithms
 (#3423)

# What does this PR do?

Migrates the MD5 and SHA-1 hash usages to the FIPS-validated SHA-256 algorithm. In particular, it replaces:

- MD5 in chunk ID generation,
- MD5 in downloaded-file verification, and
- SHA-1 in model identifier digests,

and updates all related test expectations.

Original discussion: https://github.com/llamastack/llama-stack/discussions/3413

Closes #3424.

## Test Plan

Updated the unit tests run by scripts/unit-tests.sh to match the new hash outputs, then ran them to verify they pass.
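For illustration, the new chunk-ID derivation boils down to the short standalone sketch below (the `chunk_id_sketch` name and the sample input are placeholders for this example; the real helper is `generate_chunk_id` in llama_stack/providers/utils/vector_io/vector_utils.py):

```python
import hashlib
import uuid


def chunk_id_sketch(document_id: str, chunk_text: str, chunk_window: str | None = None) -> str:
    # Hash the "document_id:chunk_text" pair, plus the optional chunk window.
    hash_input = f"{document_id}:{chunk_text}".encode()
    if chunk_window:
        hash_input += f":{chunk_window}".encode()
    # Keep the first 32 hex characters (128 bits) of the SHA-256 digest,
    # which is exactly the width uuid.UUID expects.
    return str(uuid.UUID(hashlib.sha256(hash_input).hexdigest()[:32]))


print(chunk_id_sketch("doc-1", "example chunk text"))
```

Because SHA-256 is truncated to 128 bits here, the same width MD5 produced, the IDs keep their UUID shape and only their values change, which is why the expectations in tests/unit/providers/vector_io/test_vector_utils.py are regenerated rather than restructured.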
""" hash_input = f"{document_id}:{chunk_text}".encode() if chunk_window: hash_input += f":{chunk_window}".encode() - return str(uuid.UUID(hashlib.md5(hash_input, usedforsecurity=False).hexdigest())) + return str(uuid.UUID(hashlib.sha256(hash_input).hexdigest()[:32])) def proper_case(s: str) -> str: diff --git a/llama_stack/testing/inference_recorder.py b/llama_stack/testing/inference_recorder.py index e78f493a6..6f017c51d 100644 --- a/llama_stack/testing/inference_recorder.py +++ b/llama_stack/testing/inference_recorder.py @@ -211,7 +211,7 @@ def _model_identifiers_digest(endpoint: str, response: dict[str, Any]) -> str: return sorted(set(idents)) identifiers = _extract_model_identifiers() - return hashlib.sha1(("|".join(identifiers)).encode("utf-8")).hexdigest()[:8] + return hashlib.sha256(("|".join(identifiers)).encode("utf-8")).hexdigest()[:8] def _combine_model_list_responses(endpoint: str, records: list[dict[str, Any]]) -> dict[str, Any] | None: diff --git a/tests/unit/providers/vector_io/test_vector_utils.py b/tests/unit/providers/vector_io/test_vector_utils.py index a5d803a82..10ebe5bfb 100644 --- a/tests/unit/providers/vector_io/test_vector_utils.py +++ b/tests/unit/providers/vector_io/test_vector_utils.py @@ -26,9 +26,9 @@ def test_generate_chunk_id(): chunk_ids = sorted([chunk.chunk_id for chunk in chunks]) assert chunk_ids == [ - "177a1368-f6a8-0c50-6e92-18677f2c3de3", - "bc744db3-1b25-0a9c-cdff-b6ba3df73c36", - "f68df25d-d9aa-ab4d-5684-64a233add20d", + "31d1f9a3-c8d2-66e7-3c37-af2acd329778", + "d07dade7-29c0-cda7-df29-0249a1dcbc3e", + "d14f75a1-5855-7f72-2c78-d9fc4275a346", ] @@ -36,14 +36,14 @@ def test_generate_chunk_id_with_window(): chunk = Chunk(content="test", metadata={"document_id": "doc-1"}) chunk_id1 = generate_chunk_id("doc-1", chunk, chunk_window="0-1") chunk_id2 = generate_chunk_id("doc-1", chunk, chunk_window="1-2") - assert chunk_id1 == "149018fe-d0eb-0f8d-5f7f-726bdd2aeedb" - assert chunk_id2 == "4562c1ee-9971-1f3b-51a6-7d05e5211154" + assert chunk_id1 == "8630321a-d9cb-2bb6-cd28-ebf68dafd866" + assert chunk_id2 == "13a1c09a-cbda-b61a-2d1a-7baa90888685" def test_chunk_id(): # Test with existing chunk ID chunk_with_id = Chunk(content="test", metadata={"document_id": "existing-id"}) - assert chunk_with_id.chunk_id == "84ededcc-b80b-a83e-1a20-ca6515a11350" + assert chunk_with_id.chunk_id == "11704f92-42b6-61df-bf85-6473e7708fbd" # Test with document ID in metadata chunk_with_doc_id = Chunk(content="test", metadata={"document_id": "doc-1"})