diff --git a/tests/test_hf.py b/tests/test_hf.py index 417a75c..7d72c8b 100644 --- a/tests/test_hf.py +++ b/tests/test_hf.py @@ -87,11 +87,10 @@ def test_build_hf_input_serialisation() -> None: assert lines[7] == "{seed}" # seed placeholder assert lines[9] == "100.0 0.005 20.0 0.045 0.6" # Domain and resolution parameters assert lines[10] == "0.8 0.7 0.9 2.5 0.0" # rupture velocity + czero,alpha - assert lines[11] == "0.0 2.0 1.0 3.0" # shallow depth, deep depth - assert lines[12] == "-1 1.2" # mom (None -> -1) and rupv - assert lines[13] == str(stoch_ffp) # Stoch file path - assert lines[16] == "0 0.1 0.1 0.1 0.1 1" # Sigs and ic_flag (True -> 1) - assert lines[21] == "-1 -1 -1" # Optional stress parameters + assert lines[11] == "-1 1.2" # mom (None -> -1) and rupv + assert lines[12] == str(stoch_ffp) # Stoch file path + assert lines[15] == "0 0.1 0.1 0.1 0.1 1" # Sigs and ic_flag (True -> 1) + assert lines[20] == "-1 -1 -1" # Optional stress parameters STATION_STRATEGY = st.text( @@ -99,12 +98,6 @@ def test_build_hf_input_serialisation() -> None: ) -@given(station=STATION_STRATEGY) -def test_stable_hash(station: str) -> None: - # Check that stable_hash output is always a valid 32-bit integer - assert -(1 << 31) <= hf_sim.stable_hash(station) <= (1 << 31) - 1 - - def test_station_seeds() -> None: seed = hf_sim.station_seeds(0, ["station"]) assert seed.dtype == np.int32 diff --git a/tests/test_utils.py b/tests/test_utils.py index 63b1430..a50bec8 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -4,6 +4,8 @@ import geopandas as gpd import pytest import shapely +from hypothesis import assume, given +from hypothesis import strategies as st from workflow import utils @@ -142,3 +144,41 @@ def test_dict_zip_identical_keys_different_order() -> None: result = utils.dict_zip(d1, d2, strict=True) assert result["a"] == (1, 10) assert result["b"] == (2, 20) + + +@given( + value=st.text(min_size=0, max_size=64, alphabet=st.characters(codec="ascii")), + 
size=st.sampled_from([16, 32, 64]), +) +def test_stable_hash_bounds(value: str, size: int) -> None: + """Check that stable_hash output is always a valid ``size``-bit integer""" + assert ( + -(1 << (size - 1)) + <= utils.stable_hash(value, size=size // 8) + <= (1 << (size - 1)) - 1 + ) + + +@given( + value=st.text(min_size=0, max_size=64, alphabet=st.characters(codec="ascii")), + size=st.sampled_from([16, 32, 64]), +) +def test_stable_hash_determinism(value: str, size: int) -> None: + """Check that stable_hash output is deterministic""" + hash_a = utils.stable_hash(value, size=size // 8) + hash_b = utils.stable_hash(value, size=size // 8) + assert hash_a == hash_b + + +@given( + value_a=st.text(min_size=0, max_size=64, alphabet=st.characters(codec="ascii")), + value_b=st.text(min_size=1, max_size=64, alphabet=st.characters(codec="ascii")), + size=st.sampled_from([16, 32, 64]), +) +def test_stable_hash_collision(value_a: str, value_b: str, size: int) -> None: + """Check that stable_hash output does not collide for distinct inputs""" + # Skip identical inputs; NOTE(review): 16-bit digests can still collide + assume(value_a != value_b) + hash_a = utils.stable_hash(value_a, size=size // 8) + hash_b = utils.stable_hash(value_b, size=size // 8) + assert hash_a != hash_b diff --git a/workflow/scripts/hf_sim.py b/workflow/scripts/hf_sim.py index ea009af..cbffd52 100644 --- a/workflow/scripts/hf_sim.py +++ b/workflow/scripts/hf_sim.py @@ -247,32 +247,6 @@ def hf_simulate_station( return station_name, epicentre_distance, station_waveform -def stable_hash(station: str) -> int: - """Compute stable hashes for station names. - - The HF binary expects seeds. We want the provided seed to be - independent of the order of stations in the stations lists. This - is so setting HF seed reproduces the same outputs, even for - different orders or subsets of the original station file. To do - that, we generate stable hashes based on the station name. - - - Parameters - ---------- - station : str - The station name. 
- - Returns - ------- - int - A hash of the station name. This is guaranteed to be in the - range of a signed 32-bit integer. - """ - return int.from_bytes( - hashlib.blake2b(station.encode("utf-8"), digest_size=4).digest(), signed=True - ) - - def station_seeds(seed: int, stations: Iterable[str]) -> npt.NDArray[np.int32]: """Create a list of per-station seeds in an order-invariant fashion with a root seed. @@ -289,7 +263,9 @@ def station_seeds(seed: int, stations: Iterable[str]) -> npt.NDArray[np.int32]: npt.NDArray[np.int32] A list of station seeds. """ - station_hashes = np.array([stable_hash(name) for name in stations], dtype=np.int32) + station_hashes = np.array( + [utils.stable_hash(name) for name in stations], dtype=np.int32 + ) # Rather than add (which could overflow and cause annoying numpy # warnings), we just xor the hf seed with the station hashes. # Since this is invertible, we ensure that the same hf seed gives diff --git a/workflow/scripts/realisation_to_srf.py b/workflow/scripts/realisation_to_srf.py index 10c6754..5d2e6b2 100644 --- a/workflow/scripts/realisation_to_srf.py +++ b/workflow/scripts/realisation_to_srf.py @@ -546,7 +546,10 @@ def generate_fault_srf( gsf_file_path=gsf_file_path, nx=nx, ny=ny, - seed=environment.seeds.genslip_seed, + # NOTE: This stable hash trick is also used in hf-sim, and is + # designed to give order invariant stable hashes for segments + # based on their names. 
+ seed=environment.seeds.genslip_seed ^ utils.stable_hash(name), velocity_model_path=environment.velocity_model_path, shypo=genslip_hypocentre_coords[0], dhypo=genslip_hypocentre_coords[1], diff --git a/workflow/utils.py b/workflow/utils.py index 3e3c627..b9acb22 100644 --- a/workflow/utils.py +++ b/workflow/utils.py @@ -1,5 +1,6 @@ """Miscellaneous workflow utilities that couldn't go anywhere else.""" +import hashlib import os import tempfile import urllib.request @@ -174,3 +175,27 @@ def dict_zip(*dicts: Mapping[K, Any], strict: bool = True) -> dict[K, tuple[Any, result = {key: tuple(d[key] for d in dicts) for key in list(keys)} return result + + +def stable_hash(value: str, size: int = 4) -> int: + """Compute stable hashes for strings. + + Parameters + ---------- + value : str + String to hash. + size : int, optional + Digest size in bytes. This is passed as ``digest_size`` to + `hashlib.blake2b` and must be within the valid range for + BLAKE2b (1 to 64 bytes). + + Returns + ------- + int + A hash of the value derived from a ``size``-byte BLAKE2b digest. + The result is within the range of a signed integer representable + with ``size`` bytes. + """ + return int.from_bytes( + hashlib.blake2b(value.encode("utf-8"), digest_size=size).digest(), signed=True + )