
Commit 7b9bba6

Enhanced FedASDF indexing and waveform extraction (#268)
* Added incremental indexing in FedASDF routines
* Miscellaneous changes
* Added new tables to FedASDF db to speed up queries
* Renamed key API to better conform to functionality
* Minor changes to function signature
* Added in-memory optimizations
* Adapted GCMT catalog for interchangeable use with ObsPy
* Major updates to waveform-extractor
* Integrated exclusive use of GCMT catalogs, moving away from ISC catalogs
* Improved logging and reporting of data-extraction stats
* Removed functionality for downloading catalogs
1 parent b169cbc commit 7b9bba6

File tree: 15 files changed (+835, -558 lines)


seismic/ASDFdatabase/FederatedASDFDataSet.py

Lines changed: 44 additions & 30 deletions
@@ -24,14 +24,15 @@
 import click

 class FederatedASDFDataSet():
-    def __init__(self, asdf_source, force_reindex=False, logger=None,
+    def __init__(self, asdf_source, fast=True, force_reindex=False, logger=None,
                  single_item_read_limit_in_mb=1024,
                  single_threaded_access=True):
         """
         Initializer for FederatedASDFDataSet.

         :param asdf_source: Path to a text file containing a list of ASDF files. \
                Entries can be commented out with '#'
+        :param fast: enables in-memory optimizations for faster queries
         :param force_reindex: Force reindex even if a preexisting db file is found
         :param logger: logger instance
         :param single_item_read_limit_in_mb: buffer size for Obspy reads
@@ -41,25 +42,23 @@ def __init__(self, asdf_source, force_reindex=False, logger=None,
         """
         self.logger = logger
         self.asdf_source = asdf_source
-        self._unique_coordinates = None
         self._earth_radius = 6371 # km

         # Instantiate implementation class
-        self.fds = _FederatedASDFDataSetImpl(asdf_source, force_reindex=force_reindex, logger=logger,
+        self.fds = _FederatedASDFDataSetImpl(asdf_source, fast=fast,
+                                             force_reindex=force_reindex, logger=logger,
                                              single_item_read_limit_in_mb=single_item_read_limit_in_mb,
                                              single_threaded_access=single_threaded_access)

         # Populate coordinates
-        self._unique_coordinates = defaultdict(list)
-
         rtps_dict = defaultdict()
         for ds_dict in self.fds.asdf_station_coordinates:
             for key in list(ds_dict.keys()):
-                self._unique_coordinates[key] = [ds_dict[key][0], ds_dict[key][1]]

+                lon, lat, _ = ds_dict[key]
                 rtps_dict[key] = [self._earth_radius,
-                                  np.radians(90 - ds_dict[key][1]),
-                                  np.radians(ds_dict[key][0])]
+                                  np.radians(90 - lat),
+                                  np.radians(lon)]
             # end for
         # end for
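For context, a minimal usage sketch of the new fast flag follows; the import path is inferred from the file's location in the repo, and the source-list file name is a hypothetical placeholder:

# A minimal sketch, assuming this module's import path; 'asdf_files.txt' is a
# placeholder listing one ASDF file per line (entries can be commented with '#').
from seismic.ASDFdatabase.FederatedASDFDataSet import FederatedASDFDataSet

ds = FederatedASDFDataSet('asdf_files.txt', fast=True)  # in-memory optimizations on

# unique_coordinates is now served by the implementation class: a dict of
# [lon, lat] lists keyed by 'net.sta'
for netsta, (lon, lat) in ds.unique_coordinates.items():
    print(netsta, lon, lat)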

@@ -80,8 +79,7 @@ def unique_coordinates(self):

         :return: dictionary containing [lon, lat] coordinates indexed by 'net.sta'
         """
-        return self._unique_coordinates
-
+        return self.fds._unique_coordinates
     # end func

     def corrections_enabled(self):
@@ -124,7 +122,7 @@ def get_closest_stations(self, lon, lat, nn=1):

     # end func

-    def get_global_time_range(self, network, station=None, location=None, channel=None):
+    def get_recording_timespan(self, network, station=None, location=None, channel=None):
         """
         :param network: network code
         :param station: station code
@@ -134,19 +132,18 @@ def get_global_time_range(self, network, station=None, location=None, channel=None):
                  min is set to 2100-01-01T00:00:00.000000Z and max is set to 1900-01-01T00:00:00.000000Z
         """

-        return self.fds.get_global_time_range(network, station=station, location=location, channel=channel)
-
+        return self.fds.get_recording_timespan(network, station=station, location=location, channel=channel)
     # end func

-    def get_nslc_coverage(self):
+    def get_all_recording_timespans(self):
         """
         Get a structured numpy array with named columns
         'net', 'sta', 'loc', 'cha', 'min_st', 'max_et'
         representing contents of the database
         @return:
         """

-        results = self.fds.get_nslc_coverage()
+        results = self.fds.get_all_recording_timespans()
         return results
     # end if
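A hedged sketch of the renamed queries, reusing the ds instance from the earlier sketch; the network and station codes are placeholders, and the (min, max) return shape is inferred from the docstring's sentinel defaults:

# Placeholder codes; get_recording_timespan is assumed to return a
# (min_starttime, max_endtime) pair, per the docstring's 2100/1900 defaults.
min_st, max_et = ds.get_recording_timespan('AU', station='ARMA')
print(min_st, max_et)

# Structured array with named columns 'net', 'sta', 'loc', 'cha', 'min_st', 'max_et'
coverage = ds.get_all_recording_timespans()
print(coverage['net'][:5], coverage['min_st'][:5])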

@@ -255,8 +252,7 @@ def get_inventory(self, network=None, station=None):
         return inv
     # end func

-    def find_gaps(self, network=None, station=None, location=None,
-                  channel=None, start_date_ts=None, end_date_ts=None,
+    def find_gaps(self, network=None, station=None, location=None, channel=None, starttime=None, endtime=None,
                   min_gap_length=86400):
         """
         This function returns gaps in data as a numpy array with columns: net, sta, loc, cha, start_timestamp,
@@ -265,27 +261,39 @@
         @param station: station code
         @param location: location code
         @param channel: channel code
-        @param start_date_ts: start timestamp
-        @param end_date_ts: end timestamp
-        @param min_gap_length: minimum length of gap; smaller gaps in data are ignored
+        @param starttime: start timestamp
+        @param endtime: end timestamp
+        @param min_gap_length: minimum length of gap in seconds; smaller gaps in data are ignored
         @return:
         """
-        return self.fds.find_gaps(network, station, location, channel, start_date_ts, end_date_ts, min_gap_length)
+        return self.fds.find_gaps(network, station, location, channel, starttime, endtime, min_gap_length)
     # end func

-    def get_coverage(self, network=None):
+    def get_recording_duration(self, network=None, station=None, location=None, channel=None,
+                               starttime=None, endtime=None, cumulative=False):
         """
-        Generates coverage for the entire data holdings for a selected network.
-        @param network: network code
-        @return: Numpy record array with columns: net, sta, loc, cha,
-                 start_timestamp, end_timestamp
+        Fetches total recording duration in seconds. Note that 'duration_seconds' in the output excludes data-gaps
+
+        @param network:
+        @param station:
+        @param location:
+        @param channel:
+        @param starttime:
+        @param endtime:
+        @param cumulative: returns cumulative recording times, otherwise blocks of start- and end-times
+        @return: Numpy record array with columns, if cumulative=False:
+                 net, sta, loc, cha, block_st, block_et
+                 , otherwise:
+                 net, sta, loc, cha, lon, lat, min_st, max_et, duration_seconds
         """

-        rows = self.fds.get_coverage(network=network)
+        rows = self.fds.get_recording_duration(network=network, station=station, location=location, channel=channel,
+                                               starttime=starttime, endtime=endtime, cumulative=cumulative)
         return rows
     # end func
 # end class

+
 CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help'])
 @click.command(context_settings=CONTEXT_SETTINGS)
 @click.argument('asdf-source', required=True,
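A sketch of the renamed gap and duration queries, under the same assumptions as above; the time window values are placeholders, passed as epoch seconds to match the '@param starttime: start timestamp' wording:

from obspy import UTCDateTime

# Placeholder window; starttime/endtime are documented as timestamps, so
# epoch seconds are passed here.
gaps = ds.find_gaps(network='AU',
                    starttime=UTCDateTime('2010-01-01').timestamp,
                    endtime=UTCDateTime('2011-01-01').timestamp,
                    min_gap_length=3600)  # ignore gaps shorter than an hour

# cumulative=True returns per-channel totals with duration_seconds (gaps
# excluded); cumulative=False returns individual recording blocks instead.
rows = ds.get_recording_duration(network='AU', cumulative=True)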
@@ -312,16 +320,22 @@ def process(asdf_source, force_reindex, generate_summary):
     with open(ofn, 'w') as fh:
         fh.write('# net, sta, loc, cha, lon, lat, min_starttime, max_endtime, duration_months\n')

-        rows = ds.get_coverage()
+        rows = ds.get_recording_duration(cumulative=True)
         for row in rows:
-            net, sta, loc, cha, lon, lat, min_st, max_et = row
-            duration_months = (max_et - min_st) / (86400 * 30)
+            net, sta, loc, cha, min_st, max_et, duration_seconds = row
+            duration_months = duration_seconds / (86400 * 30)

+            lon, lat = ds.unique_coordinates['{}.{}'.format(net, sta)]
             line = '{},{},{},{},{:3.4f},{:3.4f},{},{},{:5.3f}\n'.\
                    format(net, sta, loc, cha, lon, lat,
                           UTCDateTime(min_st).strftime('%Y-%m-%dT%H:%M:%S'),
                           UTCDateTime(max_et).strftime('%Y-%m-%dT%H:%M:%S'),
                           duration_months)
+
+            if(duration_seconds > (max_et - min_st)):
+                logger.warn('Potential overlapping data found: {}'.format(line.strip()))
+            # end if
+
             fh.write(line)
         # end for
     # end with
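The new overlap warning rests on a simple invariant: the summed per-block recording duration (gaps excluded) can only exceed the wall-clock span max_et - min_st if blocks overlap. A standalone illustration with made-up numbers:

# Hypothetical (start, end) blocks in epoch seconds; the second overlaps the first.
blocks = [(0.0, 100.0), (50.0, 150.0)]

duration_seconds = sum(et - st for st, et in blocks)   # 200.0
min_st = min(st for st, _ in blocks)                   # 0.0
max_et = max(et for _, et in blocks)                   # 150.0

# Mirrors the check added in process(): 200.0 > 150.0, so overlap is flagged.
if duration_seconds > (max_et - min_st):
    print('Potential overlapping data found')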
