22
33from __future__ import annotations
44
5+ import base64
56import logging
67import os
8+ from dataclasses import dataclass
79from typing import TYPE_CHECKING
810
911import numpy as np
@@ -123,6 +125,35 @@ def grid_density_qc(grid: Grid, num_traces: int) -> None:
123125 raise GridTraceSparsityError (grid .shape , num_traces , msg )
124126
125127
@dataclass
class SegyFileHeaderDump:
    """Container for the file-level headers captured from a SEG-Y file.

    Attributes:
        text_header: Text header of the file as a string.
        binary_header_dict: Binary header of the file as a dictionary.
        raw_binary_headers: Binary header of the file as raw bytes.
    """

    text_header: str
    binary_header_dict: dict
    raw_binary_headers: bytes
135+
136+
def _get_segy_file_header_dump(segy_file: SegyFile) -> SegyFileHeaderDump:
    """Collect the file-level headers of a SEG-Y file into a single object.

    Args:
        segy_file: SEG-Y file instance to read the headers from.

    Returns:
        A SegyFileHeaderDump holding the text header, the binary header as a
        dictionary, and the raw bytes of the binary header.
    """
    header_text = segy_file.text_header

    # Pull the undecoded binary header bytes directly through the file's filesystem,
    # using the offset and size declared in the SEG-Y spec.
    binary_header_spec = segy_file.spec.binary_header
    raw_bytes: bytes = segy_file.fs.read_block(
        fn=segy_file.url,
        offset=binary_header_spec.offset,
        length=binary_header_spec.itemsize,
    )

    # The binary header gets read a second time below; acceptable for now since
    # it is only 400 bytes.
    return SegyFileHeaderDump(
        text_header=header_text,
        binary_header_dict=segy_file.binary_header.to_dict(),
        raw_binary_headers=raw_bytes,
    )
155+
156+
126157def _scan_for_headers (
127158 segy_file : SegyFile ,
128159 template : AbstractDatasetTemplate ,
@@ -151,12 +182,12 @@ def _scan_for_headers(
151182 return segy_dimensions , segy_headers
152183
153184
154- def _build_and_check_grid (segy_dimensions : list [Dimension ], segy_file : SegyFile , segy_headers : SegyHeaderArray ) -> Grid :
185+ def _build_and_check_grid (segy_dimensions : list [Dimension ], num_traces : int , segy_headers : SegyHeaderArray ) -> Grid :
155186 """Build and check the grid from the SEG-Y headers and dimensions.
156187
157188 Args:
158189 segy_dimensions: List of of all SEG-Y dimensions to build grid from.
159- segy_file: Instance of SegyFile to check for trace count .
190+ num_traces: Number of traces in the SEG-Y file .
160191 segy_headers: Headers read in from SEG-Y file for building the trace map.
161192
162193 Returns:
@@ -166,15 +197,15 @@ def _build_and_check_grid(segy_dimensions: list[Dimension], segy_file: SegyFile,
166197 GridTraceCountError: If number of traces in SEG-Y file does not match the parsed grid
167198 """
168199 grid = Grid (dims = segy_dimensions )
169- grid_density_qc (grid , segy_file . num_traces )
200+ grid_density_qc (grid , num_traces )
170201 grid .build_map (segy_headers )
171202 # Check grid validity by comparing trace numbers
172- if np .sum (grid .live_mask ) != segy_file . num_traces :
203+ if np .sum (grid .live_mask ) != num_traces :
173204 for dim_name in grid .dim_names :
174205 dim_min , dim_max = grid .get_min (dim_name ), grid .get_max (dim_name )
175206 logger .warning ("%s min: %s max: %s" , dim_name , dim_min , dim_max )
176207 logger .warning ("Ingestion grid shape: %s." , grid .shape )
177- raise GridTraceCountError (np .sum (grid .live_mask ), segy_file . num_traces )
208+ raise GridTraceCountError (np .sum (grid .live_mask ), num_traces )
178209 return grid
179210
180211
@@ -301,20 +332,19 @@ def _populate_coordinates(
301332 return dataset , drop_vars_delayed
302333
303334
304- def _add_segy_file_headers (xr_dataset : xr_Dataset , segy_file : SegyFile ) -> xr_Dataset :
335+ def _add_segy_file_headers (xr_dataset : xr_Dataset , segy_file_header_dump : SegyFileHeaderDump ) -> xr_Dataset :
305336 save_file_header = os .getenv ("MDIO__IMPORT__SAVE_SEGY_FILE_HEADER" , "" ) in ("1" , "true" , "yes" , "on" )
306337 if not save_file_header :
307338 return xr_dataset
308339
309340 expected_rows = 40
310341 expected_cols = 80
311342
312- text_header = segy_file .text_header
313- text_header_rows = text_header .splitlines ()
343+ text_header_rows = segy_file_header_dump .text_header .splitlines ()
314344 text_header_cols_bad = [len (row ) != expected_cols for row in text_header_rows ]
315345
316346 if len (text_header_rows ) != expected_rows :
317- err = f"Invalid text header count: expected { expected_rows } , got { len (text_header )} "
347+ err = f"Invalid text header count: expected { expected_rows } , got { len (segy_file_header_dump . text_header )} "
318348 raise ValueError (err )
319349
320350 if any (text_header_cols_bad ):
@@ -324,10 +354,13 @@ def _add_segy_file_headers(xr_dataset: xr_Dataset, segy_file: SegyFile) -> xr_Da
324354 xr_dataset ["segy_file_header" ] = ((), "" )
325355 xr_dataset ["segy_file_header" ].attrs .update (
326356 {
327- "textHeader" : text_header ,
328- "binaryHeader" : segy_file . binary_header . to_dict () ,
357+ "textHeader" : segy_file_header_dump . text_header ,
358+ "binaryHeader" : segy_file_header_dump . binary_header_dict ,
329359 }
330360 )
361+ if os .getenv ("MDIO__IMPORT__RAW_HEADERS" ) in ("1" , "true" , "yes" , "on" ):
362+ raw_binary_base64 = base64 .b64encode (segy_file_header_dump .raw_binary_headers ).decode ("ascii" )
363+ xr_dataset ["segy_file_header" ].attrs .update ({"rawBinaryHeader" : raw_binary_base64 })
331364
332365 return xr_dataset
333366
@@ -428,10 +461,11 @@ def segy_to_mdio( # noqa PLR0913
428461
429462 segy_settings = SegySettings (storage_options = input_path .storage_options )
430463 segy_file = SegyFile (url = input_path .as_posix (), spec = segy_spec , settings = segy_settings )
464+ segy_info : SegyFileHeaderDump = _get_segy_file_header_dump (segy_file )
431465
432466 segy_dimensions , segy_headers = _scan_for_headers (segy_file , mdio_template , grid_overrides )
433467
434- grid = _build_and_check_grid (segy_dimensions , segy_file , segy_headers )
468+ grid = _build_and_check_grid (segy_dimensions , segy_file . num_traces , segy_headers )
435469
436470 _ , non_dim_coords = _get_coordinates (grid , segy_headers , mdio_template )
437471 header_dtype = to_structured_type (segy_spec .trace .header .dtype )
@@ -461,7 +495,7 @@ def segy_to_mdio( # noqa PLR0913
461495 coords = non_dim_coords ,
462496 )
463497
464- xr_dataset = _add_segy_file_headers (xr_dataset , segy_file )
498+ xr_dataset = _add_segy_file_headers (xr_dataset , segy_info )
465499
466500 xr_dataset .trace_mask .data [:] = grid .live_mask
467501 # IMPORTANT: Do not drop the "trace_mask" here, as it will be used later in
0 commit comments