
Commit bd541e1

dmitriyrepin, BrianMichell, and tasansal authored
Environment variable flag to preserve unmodified, raw binary header from SEG-Y in segy_file_header variable. (#683)
* binary_header_disaster_recovery 2
* Update segy.py: make raw binary header serialization conditional to match the pending raw trace headers PR
* Update test_segy_roundtrip_teapot.py: set the raw headers environment variable
* Refactor a little
* Monkeypatch env vars for teapot

---------

Co-authored-by: Brian Michell <[email protected]>
Co-authored-by: Altay Sansal <[email protected]>
1 parent 1b98255 commit bd541e1

File tree

  src/mdio/converters/segy.py
  tests/conftest.py
  tests/integration/test_segy_import_export_masked.py
  tests/integration/test_segy_roundtrip_teapot.py

4 files changed: +83 −22 lines
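For context, a minimal sketch of how the new opt-in flags are intended to be used at ingestion time. The environment variable names and accepted values are taken from the diff below; the `segy_to_mdio` call itself is elided because its full argument list is outside this change:

```python
import os

from mdio.converters.segy import segy_to_mdio

# Opt in to storing the SEG-Y text and binary headers on the output dataset's
# "segy_file_header" variable; "1", "true", "yes", and "on" are all accepted.
os.environ["MDIO__IMPORT__SAVE_SEGY_FILE_HEADER"] = "true"

# Additionally keep the unmodified raw binary header, base64 encoded,
# under the "rawBinaryHeader" attribute.
os.environ["MDIO__IMPORT__RAW_HEADERS"] = "true"

# segy_to_mdio(...)  # ingest as usual; argument list omitted, see its signature
```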

src/mdio/converters/segy.py

Lines changed: 47 additions & 13 deletions
@@ -2,8 +2,10 @@
 
 from __future__ import annotations
 
+import base64
 import logging
 import os
+from dataclasses import dataclass
 from typing import TYPE_CHECKING
 
 import numpy as np
@@ -123,6 +125,35 @@ def grid_density_qc(grid: Grid, num_traces: int) -> None:
         raise GridTraceSparsityError(grid.shape, num_traces, msg)
 
 
+@dataclass
+class SegyFileHeaderDump:
+    """Segy metadata information."""
+
+    text_header: str
+    binary_header_dict: dict
+    raw_binary_headers: bytes
+
+
+def _get_segy_file_header_dump(segy_file: SegyFile) -> SegyFileHeaderDump:
+    """Reads information from a SEG-Y file."""
+    text_header = segy_file.text_header
+
+    raw_binary_headers: bytes = segy_file.fs.read_block(
+        fn=segy_file.url,
+        offset=segy_file.spec.binary_header.offset,
+        length=segy_file.spec.binary_header.itemsize,
+    )
+
+    # We read here twice, but it's ok for now. Only 400-bytes.
+    binary_header_dict = segy_file.binary_header.to_dict()
+
+    return SegyFileHeaderDump(
+        text_header=text_header,
+        binary_header_dict=binary_header_dict,
+        raw_binary_headers=raw_binary_headers,
+    )
+
+
 def _scan_for_headers(
     segy_file: SegyFile,
     template: AbstractDatasetTemplate,
@@ -151,12 +182,12 @@ def _scan_for_headers(
     return segy_dimensions, segy_headers
 
 
-def _build_and_check_grid(segy_dimensions: list[Dimension], segy_file: SegyFile, segy_headers: SegyHeaderArray) -> Grid:
+def _build_and_check_grid(segy_dimensions: list[Dimension], num_traces: int, segy_headers: SegyHeaderArray) -> Grid:
     """Build and check the grid from the SEG-Y headers and dimensions.
 
     Args:
         segy_dimensions: List of of all SEG-Y dimensions to build grid from.
-        segy_file: Instance of SegyFile to check for trace count.
+        num_traces: Number of traces in the SEG-Y file.
         segy_headers: Headers read in from SEG-Y file for building the trace map.
 
     Returns:
@@ -166,15 +197,15 @@ def _build_and_check_grid(segy_dimensions: list[Dimension], segy_file: SegyFile,
         GridTraceCountError: If number of traces in SEG-Y file does not match the parsed grid
     """
     grid = Grid(dims=segy_dimensions)
-    grid_density_qc(grid, segy_file.num_traces)
+    grid_density_qc(grid, num_traces)
     grid.build_map(segy_headers)
     # Check grid validity by comparing trace numbers
-    if np.sum(grid.live_mask) != segy_file.num_traces:
+    if np.sum(grid.live_mask) != num_traces:
         for dim_name in grid.dim_names:
             dim_min, dim_max = grid.get_min(dim_name), grid.get_max(dim_name)
             logger.warning("%s min: %s max: %s", dim_name, dim_min, dim_max)
         logger.warning("Ingestion grid shape: %s.", grid.shape)
-        raise GridTraceCountError(np.sum(grid.live_mask), segy_file.num_traces)
+        raise GridTraceCountError(np.sum(grid.live_mask), num_traces)
     return grid
 
 
@@ -301,20 +332,19 @@ def _populate_coordinates(
     return dataset, drop_vars_delayed
 
 
-def _add_segy_file_headers(xr_dataset: xr_Dataset, segy_file: SegyFile) -> xr_Dataset:
+def _add_segy_file_headers(xr_dataset: xr_Dataset, segy_file_header_dump: SegyFileHeaderDump) -> xr_Dataset:
     save_file_header = os.getenv("MDIO__IMPORT__SAVE_SEGY_FILE_HEADER", "") in ("1", "true", "yes", "on")
     if not save_file_header:
         return xr_dataset
 
     expected_rows = 40
     expected_cols = 80
 
-    text_header = segy_file.text_header
-    text_header_rows = text_header.splitlines()
+    text_header_rows = segy_file_header_dump.text_header.splitlines()
     text_header_cols_bad = [len(row) != expected_cols for row in text_header_rows]
 
     if len(text_header_rows) != expected_rows:
-        err = f"Invalid text header count: expected {expected_rows}, got {len(text_header)}"
+        err = f"Invalid text header count: expected {expected_rows}, got {len(segy_file_header_dump.text_header)}"
         raise ValueError(err)
 
     if any(text_header_cols_bad):
@@ -324,10 +354,13 @@ def _add_segy_file_headers(xr_dataset: xr_Dataset, segy_file: SegyFile) -> xr_Da
     xr_dataset["segy_file_header"] = ((), "")
     xr_dataset["segy_file_header"].attrs.update(
         {
-            "textHeader": text_header,
-            "binaryHeader": segy_file.binary_header.to_dict(),
+            "textHeader": segy_file_header_dump.text_header,
+            "binaryHeader": segy_file_header_dump.binary_header_dict,
         }
     )
+    if os.getenv("MDIO__IMPORT__RAW_HEADERS") in ("1", "true", "yes", "on"):
+        raw_binary_base64 = base64.b64encode(segy_file_header_dump.raw_binary_headers).decode("ascii")
+        xr_dataset["segy_file_header"].attrs.update({"rawBinaryHeader": raw_binary_base64})
 
     return xr_dataset
 
@@ -428,10 +461,11 @@ def segy_to_mdio( # noqa PLR0913
 
     segy_settings = SegySettings(storage_options=input_path.storage_options)
     segy_file = SegyFile(url=input_path.as_posix(), spec=segy_spec, settings=segy_settings)
+    segy_info: SegyFileHeaderDump = _get_segy_file_header_dump(segy_file)
 
     segy_dimensions, segy_headers = _scan_for_headers(segy_file, mdio_template, grid_overrides)
 
-    grid = _build_and_check_grid(segy_dimensions, segy_file, segy_headers)
+    grid = _build_and_check_grid(segy_dimensions, segy_file.num_traces, segy_headers)
 
     _, non_dim_coords = _get_coordinates(grid, segy_headers, mdio_template)
     header_dtype = to_structured_type(segy_spec.trace.header.dtype)
@@ -461,7 +495,7 @@ def segy_to_mdio( # noqa PLR0913
         coords=non_dim_coords,
     )
 
-    xr_dataset = _add_segy_file_headers(xr_dataset, segy_file)
+    xr_dataset = _add_segy_file_headers(xr_dataset, segy_info)
 
     xr_dataset.trace_mask.data[:] = grid.live_mask
     # IMPORTANT: Do not drop the "trace_mask" here, as it will be used later in
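For readers of the resulting dataset, a minimal sketch of retrieving and decoding the stored header attributes. It assumes the output store can be opened as an xarray Dataset (for example with `xarray.open_zarr`); the path name is only illustrative:

```python
import base64

import xarray as xr

# Illustrative path; open the ingested MDIO/Zarr store as an xarray Dataset.
ds = xr.open_zarr("output.mdio")

file_header = ds["segy_file_header"]
text_header = file_header.attrs["textHeader"]      # 40 rows x 80 columns of text
binary_header = file_header.attrs["binaryHeader"]  # parsed header fields as a dict

# Present only when MDIO__IMPORT__RAW_HEADERS was enabled at import time:
raw_bytes = base64.b64decode(file_header.attrs["rawBinaryHeader"])
# raw_bytes is the unmodified 400-byte SEG-Y binary header.
```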

tests/conftest.py

Lines changed: 7 additions & 5 deletions
@@ -23,7 +23,9 @@
 def fake_segy_tmp(tmp_path_factory: pytest.TempPathFactory) -> Path:
     """Make a temp file for the fake SEG-Y files we are going to create."""
     if DEBUG_MODE:
-        return Path("TMP/fake_segy")
+        tmp_dir = Path("tmp/fake_segy")
+        tmp_dir.mkdir(parents=True, exist_ok=True)
+        return tmp_dir
     return tmp_path_factory.mktemp(r"fake_segy")
 
 
@@ -37,7 +39,7 @@ def segy_input_uri() -> str:
 def segy_input(segy_input_uri: str, tmp_path_factory: pytest.TempPathFactory) -> Path:
     """Download teapot dome dataset for testing."""
     if DEBUG_MODE:
-        tmp_dir = Path("TMP/segy")
+        tmp_dir = Path("tmp/segy")
         tmp_dir.mkdir(parents=True, exist_ok=True)
     else:
         tmp_dir = tmp_path_factory.mktemp("segy")
@@ -50,23 +52,23 @@ def segy_input(segy_input_uri: str, tmp_path_factory: pytest.TempPathFactory) ->
 def zarr_tmp(tmp_path_factory: pytest.TempPathFactory) -> Path:
     """Make a temp file for the output MDIO."""
     if DEBUG_MODE:
-        return Path("TMP/mdio")
+        return Path("tmp/mdio")
     return tmp_path_factory.mktemp(r"mdio")
 
 
 @pytest.fixture(scope="module")
 def zarr_tmp2(tmp_path_factory: pytest.TempPathFactory) -> Path:
     """Make a temp file for the output MDIO."""
     if DEBUG_MODE:
-        return Path("TMP/mdio2")
+        return Path("tmp/mdio2")
     return tmp_path_factory.mktemp(r"mdio2")
 
 
 @pytest.fixture(scope="session")
 def segy_export_tmp(tmp_path_factory: pytest.TempPathFactory) -> Path:
     """Make a temp file for the round-trip IBM SEG-Y."""
     if DEBUG_MODE:
-        tmp_dir = Path("TMP/segy")
+        tmp_dir = Path("tmp/segy")
         tmp_dir.mkdir(parents=True, exist_ok=True)
     else:
         tmp_dir = tmp_path_factory.mktemp("segy")

tests/integration/test_segy_import_export_masked.py

Lines changed: 1 addition & 1 deletion
@@ -290,7 +290,7 @@ def export_masked_path(tmp_path_factory: pytest.TempPathFactory, raw_headers_env
     path_suffix = "with_raw_headers" if raw_headers_enabled else "without_raw_headers"
 
     if DEBUG_MODE:
-        return Path(f"TMP/export_masked_{path_suffix}")
+        return Path(f"tmp/export_masked_{path_suffix}")
     return tmp_path_factory.getbasetemp() / f"export_masked_{path_suffix}"
 

tests/integration/test_segy_roundtrip_teapot.py

Lines changed: 28 additions & 3 deletions
@@ -3,7 +3,6 @@
 from __future__ import annotations
 
 import json
-import os
 from typing import TYPE_CHECKING
 
 import dask
@@ -25,13 +24,20 @@
 from mdio.converters.segy import segy_to_mdio
 
 if TYPE_CHECKING:
+    from collections.abc import Generator
     from pathlib import Path
 
     from segy.schema import SegySpec
 
 
 dask.config.set(scheduler="synchronous")
-os.environ["MDIO__IMPORT__SAVE_SEGY_FILE_HEADER"] = "true"
+
+
+@pytest.fixture
+def set_env_vars(monkeypatch: Generator[pytest.MonkeyPatch]) -> None:
+    """Set environment variables for the Teapot dome tests."""
+    monkeypatch.setenv("MDIO__IMPORT__SAVE_SEGY_FILE_HEADER", "true")
+    monkeypatch.setenv("MDIO__IMPORT__RAW_HEADERS", "true")
 
 
 @pytest.fixture
@@ -130,11 +136,29 @@ def binary_header_teapot_dome() -> dict[str, int]:
     }
 
 
+def raw_binary_header_teapot_dome() -> str:
+    """Return the teapot dome expected raw binary header, base64 encoded."""
+    return (
+        "AAAnDwAAJw8AAAABALwAAAfQAAAF3QXdAAEAOQAEAAEAAAAAAAAAAAAAAAAAAAAAAAIAAQAEAAIAAQAAAAAAAAAAAAAAAAAAAAAA"
+        "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
+        "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
+        "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
+        "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
+        "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=="
+    )
+
+
 class TestTeapotRoundtrip:
     """Tests for Teapot Dome data ingestion and export."""
 
     @pytest.mark.dependency
-    def test_teapot_import(self, segy_input: Path, zarr_tmp: Path, teapot_segy_spec: SegySpec) -> None:
+    @pytest.mark.usefixtures("set_env_vars")
+    def test_teapot_import(
+        self,
+        segy_input: Path,
+        zarr_tmp: Path,
+        teapot_segy_spec: SegySpec,
+    ) -> None:
         """Test importing a SEG-Y file to MDIO.
 
         NOTE: This test must be executed before the 'TestReader' and 'TestExport' tests.
@@ -176,6 +200,7 @@ def test_dataset_metadata(self, zarr_tmp: Path) -> None:
         segy_file_header = ds["segy_file_header"]
         assert segy_file_header.attrs["textHeader"] == text_header_teapot_dome()
         assert segy_file_header.attrs["binaryHeader"] == binary_header_teapot_dome()
+        assert segy_file_header.attrs["rawBinaryHeader"] == raw_binary_header_teapot_dome()
 
     def test_variable_metadata(self, zarr_tmp: Path) -> None:
         """Metadata reading tests."""
