Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
47 commits
Select commit Hold shift + click to select a range
0ae5704
rch test
joelridden Sep 8, 2025
535e85c
test run
joelridden Sep 8, 2025
ad5a09c
cpus
joelridden Sep 8, 2025
138dfab
script mention
joelridden Sep 8, 2025
5d8db96
exclude node 4
joelridden Sep 9, 2025
2ad14ab
test bash
joelridden Sep 17, 2025
cca9b10
Merge branch 'nzcvm_data_update' into rch
joelridden Sep 17, 2025
e61951d
pickle fix
joelridden Sep 17, 2025
0410d2e
pickle fix
joelridden Sep 17, 2025
b447186
standard
joelridden Sep 19, 2025
c374097
adjust pickled objects
joelridden Oct 5, 2025
fe7be58
testing fixes
joelridden Oct 5, 2025
7e0b70e
rename client
joelridden Oct 5, 2025
26cce3c
geonet fork
joelridden Oct 6, 2025
948bff1
attempt mp sites
joelridden Oct 6, 2025
63687a5
inventory changes
joelridden Oct 7, 2025
6d3ef68
processing fix
joelridden Oct 7, 2025
308b709
distances fix
joelridden Oct 7, 2025
50b715f
big core test rch
joelridden Oct 7, 2025
6cfac04
rch fix
joelridden Oct 8, 2025
2642639
adjust cores
joelridden Oct 8, 2025
a4a56c1
add rch machine
joelridden Oct 8, 2025
53e0aff
test new event cat method
joelridden Oct 8, 2025
a33d86e
extraction improvement
joelridden Oct 9, 2025
6f29f9c
client change
joelridden Oct 9, 2025
e4baab8
rename parameter
joelridden Oct 9, 2025
bd73862
decrease extraction
joelridden Oct 9, 2025
efe9a2b
decrease extraction
joelridden Oct 9, 2025
2799bd6
5000 rows
joelridden Oct 9, 2025
8c241d3
Merge branch '4p4' into rch
joelridden Dec 4, 2025
0e9a11e
quality db single copy
joelridden Dec 4, 2025
638a042
Wait for too many requests
joelridden Dec 5, 2025
b5b0163
2 min
joelridden Dec 5, 2025
6487333
32 limit
joelridden Dec 5, 2025
bca491a
self review
joelridden Dec 7, 2025
5119d7e
default batch size
joelridden Dec 7, 2025
9a170ca
config and BH added
joelridden Dec 7, 2025
36cbfed
type hint fix
joelridden Dec 14, 2025
511616d
fix requirements
joelridden Dec 14, 2025
3296323
add type checking
joelridden Jan 9, 2026
6f96e7d
Merge remote-tracking branch 'origin/rch' into type_check
joelridden Jan 9, 2026
569cac6
change numpy version
joelridden Jan 9, 2026
00d391c
install ty
joelridden Jan 9, 2026
78037d6
adjust type check
joelridden Jan 9, 2026
ffe5299
install dependencies
joelridden Jan 9, 2026
0fad64e
ensure same env
joelridden Jan 9, 2026
4358ff3
revert to uv
joelridden Jan 11, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions .github/workflows/types.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# GitHub Actions workflow: static type checking with `ty` on every pull request.
# The scrape this was recovered from had all YAML indentation stripped; the
# structure below is the conventional restoration of the 23-line file shown
# in the diff header (@@ -0,0 +1,23 @@).
name: Type Check
on: [pull_request]
jobs:
  typecheck:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Setup Python
        uses: actions/setup-python@v5

      # uv manages the virtualenv and lockfile; enable-cache speeds up repeat runs.
      - name: Install uv
        uses: astral-sh/setup-uv@v5
        with:
          enable-cache: true

      # Install the project with all optional extras and dev dependencies so the
      # type checker sees the same environment the tests use.
      - name: Install project with types
        run: uv sync --all-extras --dev

      # setup.py and the vendored ccldpy.py are excluded from checking.
      - name: Run type checking with ty
        run: uv run ty check --exclude setup.py --exclude nzgmdb/CCLD/ccldpy.py
3 changes: 2 additions & 1 deletion nzgmdb/calculation/distances.py
Original file line number Diff line number Diff line change
Expand Up @@ -1044,7 +1044,8 @@ def calc_distances(main_dir: Path, n_procs: int = 1):

# Get the station information
client_NZ = FDSN_Client("GEONET")
inventory = client_NZ.get_stations()
channel_codes = config.get_value("channel_codes")
inventory = client_NZ.get_stations(channel=channel_codes, level="station")
station_info = []
for network in inventory:
for station in network:
Expand Down
20 changes: 19 additions & 1 deletion nzgmdb/calculation/snr.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@

import numpy as np
import pandas as pd
from obspy.clients.fdsn import Client as FDSN_Client
from obspy.core.inventory import Inventory
from pandas.errors import EmptyDataError

from IM import im_calculation, snr_calculation
Expand All @@ -23,6 +25,7 @@ def compute_snr_for_single_mseed(
output_dir: Path,
ko_directory: Path,
common_frequency_vector: np.ndarray = im_calculation.DEFAULT_FREQUENCIES,
inventory: Inventory | None = None,
):
"""
Compute the SNR for a single mseed file
Expand All @@ -39,6 +42,9 @@ def compute_snr_for_single_mseed(
Path to the directory containing the Ko matrices
common_frequency_vector : np.ndarray, optional
Common frequency vector to extract for SNR and FAS, by default None
inventory : Inventory, optional
The inventory information for the mseed file, by default None
(Only used to improve performance when reading the mseed file)

Returns
-------
Expand All @@ -60,7 +66,13 @@ def compute_snr_for_single_mseed(

# Read mseed information
try:
waveform = reading.create_waveform_from_mseed(mseed_file, pre_process=True, apply_taper=False, apply_zero_padding=False)
waveform = reading.create_waveform_from_mseed(
mseed_file,
pre_process=True,
apply_taper=False,
apply_zero_padding=False,
inventory=inventory,
)
except custom_errors.InventoryNotFoundError:
skipped_record_dict = {
"record_id": mseed_file.stem,
Expand Down Expand Up @@ -267,6 +279,11 @@ def compute_snr_for_mseed_data(
# Load the phase arrival table
phase_table = pd.read_csv(phase_table_path)

# Load the inventory
client = FDSN_Client("GEONET")
channel_codes = config.get_value("channel_codes")
inventory = client.get_stations(channel=channel_codes, level="response")

# Load the bypass records if provided
if bypass_records_ffp is not None:
bypass_records = pd.read_csv(bypass_records_ffp)
Expand Down Expand Up @@ -295,6 +312,7 @@ def compute_snr_for_mseed_data(
output_dir=snr_fas_output_dir,
ko_directory=ko_directory,
common_frequency_vector=common_frequency_vector,
inventory=inventory,
),
batch,
)
Expand Down
5 changes: 1 addition & 4 deletions nzgmdb/config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,7 @@ priority_phase_list:
- Pn
- Pg
- Pb
channel_codes:
- HN?
- BN?
- HH?
channel_codes: "HN?,BN?,HH?,BH?"
percentage_gap_allowed: 0.1
is_large_overlap: 0.5
# Mseed Variables
Expand Down
16 changes: 14 additions & 2 deletions nzgmdb/config/machine_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ mantle:
upload: 28
hypocentre:
geonet: 44
extraction: 44
extraction: 32
tec_domain: 44
phase_table: 44
snr: 18
Expand All @@ -33,4 +33,16 @@ hypocentre:
process: 44
im: 12
distances: 44
upload: 44
upload: 44
rch:
geonet: 128
extraction: 32
tec_domain: 128
phase_table: 128
snr: 64
fmax: 128
gmc: 64
process: 128
im: 64
distances: 128
upload: 128
2 changes: 1 addition & 1 deletion nzgmdb/data_processing/merge_flatfiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ def add_ground_level(
"""
# Find the station location information with the inventory lat, lon and elev
config = cfg.Config()
channel_codes = ",".join(config.get_value("channel_codes"))
channel_codes = config.get_value("channel_codes")
client_NZ = FDSN_Client("GEONET")
inventory = client_NZ.get_stations(channel=channel_codes, level="response")
station_info = [
Expand Down
14 changes: 13 additions & 1 deletion nzgmdb/data_processing/process_observed.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,12 @@

import numpy as np
import pandas as pd
from obspy.clients.fdsn import Client as FDSN_Client
from obspy.core.inventory import Inventory

import qcore.timeseries as ts
from nzgmdb.data_processing import waveform_manipulation
from nzgmdb.management import config as cfg
from nzgmdb.management import custom_errors, file_structure
from nzgmdb.mseed_management import reading

Expand All @@ -20,6 +23,7 @@ def process_single_mseed(
gmc_df: pd.DataFrame | None = None,
fmax_df: pd.DataFrame | None = None,
bypass_df: pd.DataFrame | None = None,
inventory: Inventory | None = None,
):
"""
Process a single mseed file and save the processed data to a txt file
Expand All @@ -38,6 +42,8 @@ def process_single_mseed(
The Fmax values
bypass_df : pd.DataFrame, optional
The bypass records containing custom fmin, fmax values
inventory : Inventory, optional
The inventory information for the mseed file

Returns
-------
Expand Down Expand Up @@ -67,7 +73,7 @@ def process_single_mseed(

# Perform initial pre-processing
try:
mseed = waveform_manipulation.initial_preprocessing(mseed)
mseed = waveform_manipulation.initial_preprocessing(mseed, inventory=inventory)
except custom_errors.InventoryNotFoundError:
skipped_record_dict = {
"record_id": mseed_stem,
Expand Down Expand Up @@ -229,6 +235,11 @@ def process_mseeds_to_txt(
)
bypass_df = None if bypass_records_ffp is None else pd.read_csv(bypass_records_ffp)

config = cfg.Config()
channel_codes = config.get_value("channel_codes")
client = FDSN_Client("GEONET")
inventory = client.get_stations(channel=channel_codes, level="response")

# Use multiprocessing to process the mseed files
with multiprocessing.Pool(processes=n_procs) as pool:
skipped_records = pool.map(
Expand All @@ -237,6 +248,7 @@ def process_mseeds_to_txt(
gmc_df=gmc_df,
fmax_df=fmax_df,
bypass_df=bypass_df,
inventory=inventory,
),
mseed_files,
)
Expand Down
35 changes: 17 additions & 18 deletions nzgmdb/data_processing/quality_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -760,6 +760,7 @@ def filter_duplicate_channels(
2. HN channels (Strong motion, high frequency)
3. BN channels (Strong motion, lower frequency)
4. HH channels (Broadband, high frequency)
5. BH channels (Broadband, lower frequency)

If multiple records have the same priority, the first one encountered is kept.
All other duplicates are removed and returned in the skipped records.
Expand All @@ -785,15 +786,15 @@ def filter_duplicate_channels(
catalogue["bypass"] = catalogue["record_id"].isin(bypass_records)

# Step 3: Define priority levels
priority = {"HN": 1, "BN": 2, "HH": 3}
catalogue["chan_priority"] = catalogue["chan"].map(priority).fillna(4)
priority = {"HN": 1, "BN": 2, "HH": 3, "BH": 4}
catalogue["chan_priority"] = catalogue["chan"].map(priority).fillna(5)
# Step 4: Override priority for bypass records
catalogue.loc[catalogue["bypass"], "chan_priority"] = 0

# Step 5: Sort by priority and select top-priority row per group
catalog_sorted = catalogue.sort_values(by=["evid_sta", "chan_priority"])
# Remove records with priority 4 (not HN, BN, HH)
catalog_sorted = catalog_sorted[catalog_sorted["chan_priority"] < 4]
# Remove records with priority 4 (not HN, BN, HH, BH)
catalog_sorted = catalog_sorted[catalog_sorted["chan_priority"] < 5]
best_dups = catalog_sorted.groupby("evid_sta", as_index=False).nth(0)

# Step 6: Identify which records to drop (the non-best ones)
Expand Down Expand Up @@ -872,35 +873,33 @@ def apply_all_filters(
fmin_max = fmin_max if fmin_max is not None else config.get_value("fmin_max")
mag_min = min_mag if min_mag is not None else config.get_value("quality_min_mag")

catalogue_copy = catalogue.copy()

# Filter by magnitude
skipped_records_mag = filter_mag(catalogue.copy(), mag_min)
skipped_records_mag = filter_mag(catalogue_copy, mag_min)

# Find ground level locations
skipped_records_ground = filter_ground_level_locations(
catalogue.copy(), bypass_records
catalogue_copy, bypass_records
)

# Find has score mean
skipped_records_has_score = filter_has_score_mean(catalogue.copy(), bypass_records)
skipped_records_has_score = filter_has_score_mean(catalogue_copy, bypass_records)

# Find score mean
skipped_records_score = filter_score_mean(
catalogue.copy(), score_min, bypass_records
)
skipped_records_score = filter_score_mean(catalogue_copy, score_min, bypass_records)

# Find multi mean
skipped_records_multi = filter_multi_mean(
catalogue.copy(), multi_max, bypass_records
)
skipped_records_multi = filter_multi_mean(catalogue_copy, multi_max, bypass_records)

# Find fmax
skipped_records_fmax = filter_fmax(catalogue.copy(), fmax_min, bypass_records)
skipped_records_fmax = filter_fmax(catalogue_copy, fmax_min, bypass_records)

# Find fmin
skipped_records_fmin = filter_fmin(catalogue.copy(), fmin_max, bypass_records)
skipped_records_fmin = filter_fmin(catalogue_copy, fmin_max, bypass_records)

# Find missing station information
skipped_records_sta = filter_missing_sta_info(catalogue.copy(), bypass_records)
skipped_records_sta = filter_missing_sta_info(catalogue_copy, bypass_records)

# Find clipped records
skipped_records_clipped = apply_clipNet_filter(clipped_records_ffp, bypass_records)
Expand All @@ -910,12 +909,12 @@ def apply_all_filters(

# Find troublesome sensitivity records
skipped_records_sensitivity = filter_troublesome_sensitivity(
catalogue.copy(), bypass_records
catalogue_copy, bypass_records
)

# Find empirical predictions
skipped_records_empirical = filter_empirical_predictions(
catalogue.copy(), bypass_records
catalogue_copy, bypass_records
)

# Combine all the skipped records
Expand Down
41 changes: 26 additions & 15 deletions nzgmdb/data_processing/waveform_manipulation.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import numpy as np
from obspy.clients.fdsn import Client as FDSN_Client
from obspy.clients.fdsn.header import FDSNNoDataException
from obspy.core.inventory import Inventory
from obspy.core.stream import Stream
from scipy import integrate, signal

Expand All @@ -13,7 +14,10 @@


def initial_preprocessing(
mseed: Stream, apply_taper: bool = True, apply_zero_padding: bool = True
mseed: Stream,
apply_taper: bool = True,
apply_zero_padding: bool = True,
inventory: Inventory = None,
):
"""
Basic pre-processing of the waveform data
Expand All @@ -33,6 +37,8 @@ def initial_preprocessing(
Whether to apply the tapering, by default True
apply_zero_padding : bool, optional
Whether to apply zero padding, by default True
inventory : Inventory, optional
The inventory object containing the response information, by default None

Returns
-------
Expand Down Expand Up @@ -73,29 +79,34 @@ def initial_preprocessing(
location = mseed[0].stats.location
channel = mseed[0].stats.channel

# Get Station Information from geonet clients
# Fetching here instead of passing the inventory object as searching for the station, network, and channel
# information takes a long time as it's implemented in a for loop
try:
client_NZ = FDSN_Client("GEONET")
inv = client_NZ.get_stations(
level="response", network="NZ", station=station, location=location
)
except FDSNNoDataException:
raise custom_errors.InventoryNotFoundError(
f"No inventory information found for station {station} with location {location}"
)
if inventory is not None:
# Select only the required station and location from the inventory
inv_selected = inventory.select(station=station, location=location)
if len(inv_selected) == 0:
raise custom_errors.InventoryNotFoundError(
f"No inventory information found for station {station} with location {location}"
)
else:
try:
client_NZ = FDSN_Client("GEONET")
inv_selected = client_NZ.get_stations(
level="response", network="NZ", station=station, location=location
)
except FDSNNoDataException:
raise custom_errors.InventoryNotFoundError(
f"No inventory information found for station {station} with location {location}"
)

try:
mseed = mseed.remove_sensitivity(inventory=inv)
mseed = mseed.remove_sensitivity(inventory=inv_selected)
except ValueError:
raise custom_errors.SensitivityRemovalError(
f"Failed to remove sensitivity for station {station} with location {location}"
)

# Rotate
try:
mseed.rotate("->ZNE", inventory=inv)
mseed.rotate("->ZNE", inventory=inv_selected)
except (
Exception # noqa: BLE001
): # Due to obspy raising an Exception instead of a specific error
Expand Down
Loading
Loading