forked from gwastro/ml-mock-data-challenge-1
-
Notifications
You must be signed in to change notification settings - Fork 0
/
segments.py
570 lines (502 loc) · 21.2 KB
/
segments.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
import numpy as np
import json
import h5py
import time
from pycbc.types import TimeSeries
from pycbc.inject import InjectionSet
# Notes:
# -Right now the initial time series have to overlap. Why is that
# required?
# -Right now the initial time series overlap must be >= the duration.
# What if both of them are long enough and we could slide them in such
# a way that they would overlap?
class OverlapSegment(object):
"""A class to handle time series of different detectors which are
overlapping in time.
It provides functionality to shift those time series with respect to
one another and thereby create unique combinations.
Arguments
---------
timeseries : tuple of (str, TimeSeries)
One or multiple tuples where the first entry specifies the
detector and the second argument is the actual time series.
duration : {None or float, None}
The required duration of the overlapping region.
"""
def __init__(self, *timeseries, duration=None):
self.duration = duration
self.timeseries = None
self.detectors = None
for ts in timeseries:
self.add_timeseries(ts)
pass
@property
def start_time(self):
if self.timeseries is None:
return None
return max([ts.start_time for ts in self.timeseries])
@property
def end_time(self):
if self.timeseries is None:
return None
return min([ts.end_time for ts in self.timeseries])
@property
def delta_t(self):
if self.timeseries is None:
return None
return self.timeseries[0].delta_t
@property
def sample_rate(self):
if self.timeseries is None:
return None
return self.timeseries[0].sample_rate
def add_timeseries(self, timeseries):
det, ts = timeseries
if not isinstance(ts, TimeSeries):
raise TypeError(f'The input to `add_timeseries` must be an iterable of length two, where the first entry specifies the detector and the second is a time series.')
if self.timeseries is None:
if self.duration is not None:
if ts.duration < self.duration:
raise ValueError(f'Cannot add time series.')
self.timeseries = [ts]
self.detectors = [det]
return
if ts.delta_t != self.delta_t:
raise ValueError(f'Delta t not matching.')
new_start = float(max(self.start_time, ts.start_time))
new_end = float(min(self.end_time, ts.end_time))
new_max_dur = new_end - new_start
if self.duration is None:
if new_max_dur < 0:
raise ValueError(f'Cannot add time series.')
self.timeseries.append(ts)
self.detectors.append(det)
return
if new_max_dur < self.duration:
raise ValueError(f'Cannot add time series.')
self.timeseries.append(ts)
self.detectors.append(det)
return
def get(self, seed=None, shift=True, random_start_time=False):
return self.get3(seed=seed, shift=shift,
random_start_time=random_start_time)
def get1(self, seed=None, shift=True, random_start_time=False):
# This function only shifts the first time series and the last n - 1
# time series. The last n - 1 time series will be shifted all by the same amount
if self.timeseries is None:
return
rs = np.random.RandomState(seed=seed)
if random_start_time and self.duration is not None:
mintime = float(self.start_time)
maxtime = float(self.end_time) - float(self.duration)
if maxtime > mintime:
start_time = rs.uniform(mintime, maxtime)
else:
start_time = mintime
else:
start_time = float(self.start_time)
indices = []
for ts in self.timeseries:
start = int((float(self.start_time) - float(ts.start_time))
/ ts.delta_t)
if self.duration is None:
end = int((float(self.end_time) - float(ts.start_time))
/ ts.delta_t)
else:
end = start + int(self.duration / ts.delta_t)
if len(indices) > 0:
assert (end - start) == (indices[-1][1] - indices[-1][0])
indices.append((start, end))
ret = []
if not shift:
for i, ts in enumerate(self.timeseries):
ret.append(TimeSeries(ts.data[indices[i][0]:indices[i][1]],
epoch=start_time,
delta_t=ts.delta_t))
return ret
#Handle upper
rsmin = -indices[0][0]
rsmax = len(self.timeseries[0]) - indices[0][1]
if rsmin >= rsmax:
shiftidx = 0
else:
shiftidx = rs.randint(rsmin, rsmax)
sidx = indices[0][0] + shiftidx
eidx = indices[0][1] + shiftidx
dat = self.timeseries[0].data[sidx:eidx]
ret.append(TimeSeries(dat, epoch=start_time,
delta_t=self.delta_t))
#Only one segment
if len(self.timeseries) < 2:
return ret
#Handle lower
tss = self.timeseries[1:]
rsmin = -min([pt[0] for pt in indices[1:]])
rsmax = min([len(ts) - ind[1] for (ts, ind) in zip(tss, indices[1:])])
if rsmin >= rsmax:
shiftidx = 0
else:
shiftidx = rs.randint(rsmin, rsmax)
for ind, ts in zip(indices[1:], tss):
sidx = ind[0] + shiftidx
eidx = ind[1] + shiftidx
dat = ts.data[sidx:eidx]
ret.append(TimeSeries(dat, epoch=start_time,
delta_t=self.delta_t))
return ret
def get2(self, seed=None, shift=True, random_start_time=False):
if self.timeseries is None:
return
rs = np.random.RandomState(seed=seed)
if random_start_time and self.duration is not None:
mintime = float(self.start_time)
maxtime = float(self.end_time) - float(self.duration)
if maxtime > mintime:
start_time = rs.uniform(mintime, maxtime)
else:
start_time = mintime
else:
start_time = float(self.start_time)
indices = []
for ts in self.timeseries:
start = int((float(self.start_time) - float(ts.start_time))
/ ts.delta_t)
if self.duration is None:
end = int((float(self.end_time) - float(ts.start_time))
/ ts.delta_t)
else:
end = start + int(self.duration / ts.delta_t)
if len(indices) > 0:
assert (end - start) == (indices[-1][1] - indices[-1][0])
indices.append((start, end))
ret = []
if not shift:
for i, ts in enumerate(self.timeseries):
ret.append(TimeSeries(ts.data[indices[i][0]:indices[i][1]],
epoch=start_time,
delta_t=ts.delta_t))
return ret
#Handle upper
for ind, ts in zip(indices, self.timeseries):
rsmin = -ind[0]
rsmax = len(ts) - ind[1]
if rsmin >= rsmax:
shiftidx = 0
else:
shiftidx = rs.randint(rsmin, rsmax)
sidx = ind[0] + shiftidx
eidx = ind[1] + shiftidx
dat = ts.data[sidx:eidx]
ret.append(TimeSeries(dat, epoch=start_time,
delta_t=self.delta_t))
return ret
def get3(self, seed=None, shift=True, random_start_time=False):
"""Shift only the last n-1 time series and keep the first one
fixed.
"""
if self.timeseries is None:
return
rs = np.random.RandomState(seed=seed)
if random_start_time and self.duration is not None:
mintime = float(self.start_time)
maxtime = float(self.end_time) - float(self.duration)
if maxtime > mintime:
start_time = rs.uniform(mintime, maxtime)
else:
start_time = mintime
else:
start_time = float(self.start_time)
indices = []
for ts in self.timeseries:
start = int((float(self.start_time) - float(ts.start_time))
/ ts.delta_t)
if self.duration is None:
end = int((float(self.end_time) - float(ts.start_time))
/ ts.delta_t)
else:
end = start + int(self.duration / ts.delta_t)
if len(indices) > 0:
assert (end - start) == (indices[-1][1] - indices[-1][0])
indices.append((start, end))
ret = []
if not shift:
for i, ts in enumerate(self.timeseries):
ret.append(TimeSeries(ts.data[indices[i][0]:indices[i][1]],
epoch=start_time,
delta_t=ts.delta_t))
return ret
#Handle upper
for i, (ind, ts) in enumerate(zip(indices, self.timeseries)):
if i == 0:
sidx, eidx = ind
dat = ts.data[sidx:eidx]
ret.append(TimeSeries(dat, epoch=start_time,
delta_t=self.delta_t))
continue
rsmin = -ind[0]
rsmax = len(ts) - ind[1]
if rsmin >= rsmax:
shiftidx = 0
else:
shiftidx = rs.randint(rsmin, rsmax)
sidx = ind[0] + shiftidx
eidx = ind[1] + shiftidx
dat = ts.data[sidx:eidx]
ret.append(TimeSeries(dat, epoch=start_time,
delta_t=self.delta_t))
return ret
@classmethod
def from_hdf_datasets(cls, *datasets, **kwargs):
"""Constructor from an opened HDF5-file dataset.
Arguments
---------
datasets : One or multiple dataset objects of an open HDF5 file
The datasets to read from. They need to contain the
attributes `delta_t` and `start_time`.
kwargs :
All keyword arguments are passed to the usual constructor.
"""
timeseries = []
for ds in datasets:
timseries.append(TimeSeries(ds[()],
epoch=ds.attrs['start_time'],
delta_t=ds.attrs['delta_t']))
return cls(timeseries, **kwargs)
@classmethod
def from_hdf_group(cls, group, **kwargs):
"""Constructor from an opened HDF5-file group.
Arguments
---------
group : group object of an open HDF5 file
The group to read from. It is expected to contain only
datasets each of which is expected to have the two
attributes `delta_t` and `start_time`. The keys of the group
are sorted by the inbuilt `sorted` function before they are
loaded.
kwargs :
All keyword arguments are passed to the usual constructor.
"""
timeseries = []
for key in sorted(group.keys()):
ds = group[key]
timseries.append(TimeSeries(ds[()],
epoch=ds.attrs['start_time'],
delta_t=ds.attrs['delta_t']))
return cls(timeseries, **kwargs)
class SegmentList(object):
def __init__(self, *segments):
self.segments = None
for segment in segments:
self.add_segment(segment)
def add_segment(self, segment):
if self.segments is None:
self.segments = [segment]
return
idx = np.searchsorted([float(seg.start_time) for seg in self.segments],
float(segment.start_time), side='right')
if idx == len(self.segments):
self.segments.append(segment)
return
#TODO: Consider random start times from Segment.get
if float(segment.start_time) + segment.duration > float(self.segments[idx].start_time):
raise ValueError(f'Segment is too long to be added without overlap.')
self.segments.insert(idx, segment)
def apply_injections(self, injection_file, shift_injections=True,
seed=None, shift_data=True, padding_start=0,
padding_end=0, random_start_time=False,
f_lower=None, return_times=False,
return_indices=False):
"""Apply injections from an injection file to a set of segments.
Arguments
---------
injection_file : str
Path to a file containing injections.
shift_injections : {bool, True}
If this option is set the injections are believed to start
tc == 0. The times are then shifted in such a way that they
line up with the disjoint segments. This means the remaining
injections are shifted such that they line up with the start
of the next segment. For an example see the notes below.
seed : {None or int, None}
The seed to use for time-shifting the strain data. See the
documentation of OverlapSegment.get for more information.
shift_data : {bool, True}
Whether or not to shift the strain data of the segments.
padding_start : {float, 0}
The amount of time at the beginning of each segment to not
put injections into.
padding_end : {float, 0}
The amount of time at the end of each segment to not put
injections into.
random_start_time : {bool, False}
Randomize the start time of the segments within the
applicable limits.
f_lower : {float or None, None}
The lower frequency cutoff of injections.
return_times : {bool, False}
Return the shifted injection times.
return_indices : {bool, False}
Return injection-file indices of injected signals.
Returns
-------
list:
Returns a list of dictionaries. The key of the dictionaries
are the detectors available from the corresponding segment.
The values are the TimeSeries containing injections.
np.array, optional:
The injection times as shifted by the function.
np.array, optional:
The indices in the injection file of the injected signals.
Notes
-----
shift_injections:
Consider two segments with with start and end times
[1, 6], [8, 15] and two injections [1, 10]. Since the
injections are believed to have a start-time of 0, the first
injection with t = 1 will be injected into the first segment
[1, 6] at a GPS-time of t = 2 = 1 + start_time = 1 + 1. The
second injection would, therefore, have a GPS injection time
of t = 11, which is outside the segment.
The first segment covered 5 seconds in duration. Hence only
injections with t > 5 from the injection set will be
considered for the remaining segments.
To align the injection times for the second segment the
previously considered duration will be subtracted and the
start time of the segment will be added to the injection
times to obtain the correct GPS times. The injection GPS
times for the second segment would thus be [4, 13]. The
injection GPS time with t = 4 lies outside the the
boundaries of the second segment and would thus be ignored.
The second injection GPS time on the other hand lies within
the second segment and is applied.
"""
if self.segments is None:
return None
injector = InjectionSet(injection_file)
injtable = injector.table
passed_dur = 0
ret = []
indices = []
injtimes = []
for segment in self.segments:
timeseries = segment.get(seed=seed, shift=shift_data,
random_start_time=random_start_time)
detectors = segment.detectors
#All time series should have same start time, end time, and delta_t
ts = timeseries[0]
#Shift injections to appropriate start time
if shift_injections:
addition = float(ts.start_time) - passed_dur + padding_start
injtable['tc'] += addition
idxs = np.where(np.logical_and(float(ts.start_time) + padding_start <= injtable['tc'],
injtable['tc'] <= float(ts.end_time) - padding_end))[0]
indices.append(idxs)
injtimes.append(injtable['tc'][idxs])
tmp = {}
for det, ts in zip(detectors, timeseries):
tscopy = ts.copy()
#Apply injections
injector.apply(tscopy, det, f_lower=f_lower,
simulation_ids=list(idxs))
tmp[det] = tscopy
#Shift injections back to original
if shift_injections:
injtable['tc'] -= addition
if segment.duration is None:
passed_dur += float(segment.end_time) - float(segment.start_time)
else:
passed_dur += segment.duration
ret.append(tmp)
indices = np.concatenate(indices)
injtimes = np.concatenate(injtimes)
if return_times:
if return_indices:
return ret, injtimes, indices
else:
return ret, injtimes
else:
if return_indices:
return ret, indices
else:
return ret
def get(self, seed=None, shift=True, random_start_time=False):
if self.segments is None:
return None
ret = []
for segment in self.segments:
timeseries = segment.get(seed=seed, shift=shift,
random_start_time=random_start_time)
detectors = segment.detectors
ret.append({det: ts for (det, ts) in zip(detectors, timeseries)})
return ret
def get_full(self, **kwargs):
gotten = self.get(**kwargs)
dets = self.detectors
ret = {det: [] for det in dets}
for dic in gotten:
seglen = max([len(ts) for ts in dic.values()])
dt = list(dic.values())[0].delta_t
epoch = float(list(dic.values())[0].start_time)
for det in dets:
if det in dic:
ret[det].append(dic[det])
else:
ret[det].append(TimeSeries(np.zeros(seglen),
delta_t=dt,
epoch=epoch))
return ret
def get_full_seglist(self, **kwargs):
gotten = self.get(**kwargs)
dets = self.detectors
ret = SegmentList()
for dic in gotten:
seglen = len(list(dic.values())[0])
dt = list(dic.values())[0].delta_t
epoch = float(list(dic.values())[0].start_time)
seg = OverlapSegment()
for det in dets:
if det in dic:
ts = dic[det]
else:
ts = TimeSeries(np.zeros(seglen),
delta_t=dt,
epoch=epoch)
seg.add_timeseries((det, ts))
ret.add_segment(seg)
return ret
def min_duration(self, duration):
segs = []
for seg in self.segments:
if seg.duration is None:
dur = float(seg.end_time) - float(seg.start_time)
else:
dur = seg.duration
if dur < duration:
continue
segs.append(seg)
return SegmentList(*segs)
@property
def detectors(self):
detectors = set([])
for seg in self.segments:
detectors = detectors.union(set(seg.detectors))
return list(detectors)
@property
def start_time(self):
if self.segments is None:
return
return min([seg.start_time for seg in self.segments])
@property
def end_time(self):
if self.segments is None:
return
return max([seg.end_time for seg in self.segments])
@property
def duration(self):
ret = 0
for seg in self.segments:
if seg.duration is None:
ret += seg.end_time - seg.start_time
else:
ret += seg.duration
return ret