# --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: sphinx # format_version: '1.1' # jupytext_version: 1.19.1 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- """ # CC vs PCC — Side-by-side Comparison This notebook builds a minimal MSNoise project from scratch and runs two parallel correlation lineages on the same dataset: | Step name | Method | Notes | |-----------------|--------|------------------------------------| | `preprocess_1` | — | shared preprocessing | | `cc_1` | CC | standard geometrically-normalised | | `cc_2` | PCC | phase cross-correlation (v=2) | | `filter_1` | — | shared band-pass filter | | `stack_1` | linear | shared stacking config | Both `cc_1` and `cc_2` feed the same `filter_1` → `stack_1` steps. The final section retrieves stacked CCFs from both lineages and plots them side-by-side for a direct comparison. ## Prerequisites * MSNoise installed (with the PCC patch applied). * A one-day MiniSEED dataset for **at least two stations**. The classic MSNoise tutorial dataset (network `YA`, stations `UV05`, `UV06`, `UV10`, day 2010-09-01 / Julian day 244, stored in PDF layout) is used as the reference, but **any SDS or PDF archive works** — just adjust the USER SETTINGS cell below. """ ############################################################################### # ## 0 · Imports # %matplotlib inline import os import shutil import logging import tempfile import numpy as np import matplotlib matplotlib.use('Agg') import matplotlib.pyplot as plt # MSNoise core from msnoise.core.db import connect, create_database_inifile from msnoise.core.config import ( create_config_set, update_config ) from msnoise.core.stations import update_station from msnoise.core.workflow import ( create_workflow_steps_from_config_sets, create_workflow_links_from_steps, get_workflow_steps, reset_jobs ) from msnoise.msnoise_table_def import declare_tables, DataAvailability from msnoise.results import MSNoiseResult # MSNoise compute steps from msnoise.s01_scan_archive import main as scan_archive from msnoise.s02_new_jobs import main as new_jobs from msnoise.s02_preprocessing import main as preprocess from msnoise.s03_compute_no_rotation import main as compute_cc from msnoise.s04_stack_mov import main as stack_mov logging.basicConfig(level=logging.WARNING) # suppress INFO noise in notebook ############################################################################### # ## 1 · User Settings # # **Edit this cell to match your dataset.** # # The defaults match the classic MSNoise tutorial dataset (YA network, # stations UV05/UV06/UV10, PDF archive layout). # ── Path to your waveform archive ──────────────────────────────────────────── # Can be an absolute path or relative. # The installer will store it verbatim in the DataSource table. DATA_PATH = r"C:\Users\tlecocq\AppData\Local\msnoise-testdata\msnoise-testdata\Cache\1.1\classic\data" # ── Archive layout ──────────────────────────────────────────────────────────── # "PDF" → YEAR/STA/CHAN.D/NET.STA.LOC.CHAN.D.YEAR.JDAY # "SDS" → YEAR/NET/STA/CHAN.D/NET.STA.LOC.CHAN.D.YEAR.JDAY DATA_STRUCTURE = "PDF" # ← "SDS" or "PDF" # ── Network / channel filter used when scanning the archive ────────────────── NETWORK_CODE = "YA" CHANNELS = "HHZ" # ── Date range to process (ISO format, inclusive) ──────────────────────────── STARTDATE = "2010-09-01" ENDDATE = "2010-09-01" # ── Station coordinates (net, sta, lon°E, lat°N, elev_m) ───────────────────── # Fill in your actual stations. These are the YA tutorial stations. STATIONS = [ ("YA", "UV05", 29.735, -17.817, 1174.0), ("YA", "UV06", 29.785, -17.827, 1162.0), ("YA", "UV10", 29.790, -17.847, 1180.0), ] # ── CC config (shared by both cc_1 and cc_2) ───────────────────────────────── CC_SAMPLING_RATE = 20.0 # Hz — resample target MAXLAG = 120.0 # seconds CORR_DURATION = 1800.0 # seconds per window COMPONENTS = "ZZ" # ── Filter passband ─────────────────────────────────────────────────────────── FREQMIN = 0.1 # Hz FREQMAX = 4.0 # Hz # ── Stack config ────────────────────────────────────────────────────────────── MOV_STACK = "(('1h','1h'),)" # 1-hour stack # ── Working directory (project folder) ─────────────────────────────────────── # A fresh temporary directory is created automatically. # Set to a fixed path if you want a persistent project between kernel restarts. WORK_DIR = None # None → auto temp dir ############################################################################### # ## 2 · Create Project Directory and MSNoise Database if WORK_DIR is None: WORK_DIR = tempfile.mkdtemp(prefix="msnoise_ccvspcc_") print(f"Created temporary project directory: {WORK_DIR}") else: os.makedirs(WORK_DIR, exist_ok=True) print(f"Using project directory: {WORK_DIR}") os.chdir(WORK_DIR) # Initialise SQLite database create_database_inifile( tech=1, hostname=os.path.join(WORK_DIR, "msnoise.sqlite"), database="", username="", password="", prefix="", ) # Create all tables db = connect() declare_tables().Base.metadata.create_all(db.get_bind()) print("Database created:", os.path.join(WORK_DIR, "msnoise.sqlite")) ############################################################################### # ## 3 · Create Default Config Sets and Workflow Skeleton # # We create **one** config set for every category except `cc`, for which we # will create **two** (cc_1 = CC, cc_2 = PCC). ALL_CATEGORIES = [ "global", "preprocess", "filter", "stack", "refstack", "mwcs", "mwcs_dtt", "mwcs_dtt_dvv", "stretching", "stretching_dvv", "wavelet", "wavelet_dtt", "wavelet_dtt_dvv", "psd", "psd_rms", ] for cat in ALL_CATEGORIES: sn = create_config_set(db, cat) print(f" {cat}_1 created (set_number={sn})") # CC set 1 — standard cross-correlation cc1_sn = create_config_set(db, "cc") print(f" cc_{cc1_sn} created ← standard CC") # CC set 2 — phase cross-correlation cc2_sn = create_config_set(db, "cc") print(f" cc_{cc2_sn} created ← PCC") db.commit() ############################################################################### # ## 4 · Configure Global Parameters OUTPUT_FOLDER = os.path.join(WORK_DIR, "OUTPUT") global_params = { "output_folder": OUTPUT_FOLDER, "startdate": STARTDATE, "enddate": ENDDATE, "hpc": "N", # propagate_downstream fires automatically } for k, v in global_params.items(): update_config(db, k, v, category="global", set_number=1) print(f" global.{k} = {v!r}") ############################################################################### # ## 5 · Configure the Two CC Sets # ── Shared CC parameters ────────────────────────────────────────────────────── cc_shared = { "cc_sampling_rate": str(CC_SAMPLING_RATE), "maxlag": str(MAXLAG), "corr_duration": str(CORR_DURATION), "components_to_compute": COMPONENTS, "keep_all": "Y", "keep_days": "Y", "whitening": "A", # whiten all inter-station pairs "winsorizing": "3.0", # 3× RMS clipping "clip_after_whiten": "N", "stack_method": "linear", } for k, v in cc_shared.items(): update_config(db, k, v, category="cc", set_number=cc1_sn) update_config(db, k, v, category="cc", set_number=cc2_sn) # ── cc_1: standard CC ───────────────────────────────────────────────────────── update_config(db, "cc_type", "CC", category="cc", set_number=cc1_sn) update_config(db, "whitening", "A", category="cc", set_number=cc1_sn) print(f"cc_{cc1_sn}: cc_type = CC (standard cross-correlation)") # ── cc_2: PCC (Phase Cross-Correlation v=2) ────────────────────────────────── # # What happens in the PCC branch of s03: # 1. The filter_N bandpass is applied to the time-domain traces so each # filter produces a genuinely distinct CCF (CC does this via whiten2's # frequency window; PCC needs it done explicitly). # 2. If whitening != "N", whiten2 is also applied within the band BEFORE # AmpNorm. This flattens the spectrum so the phase signal is broadband # across the full filter passband rather than dominated by the most # energetic frequency in the band ("spectrally whitened PCC", # Ventosa 2019 §3.1). With whitening="N" you get the classic PCC # result whose bandwidth is set solely by the preprocessing bandpass. # # Winsorising: AmpNorm already discards all amplitude information per-sample, # so winsorising (clipping at N×RMS) is redundant for PCC. Disable it. update_config(db, "cc_type", "PCC", category="cc", set_number=cc2_sn) update_config(db, "winsorizing", "0", category="cc", set_number=cc2_sn) update_config(db, "whitening", "A", category="cc", set_number=cc2_sn) print(f"cc_{cc2_sn}: cc_type = PCC (phase cross-correlation v=2)") print(f" winsorizing=0 (redundant: AmpNorm already discards amplitude)") print(f" whitening=A (spectral whitening within band before AmpNorm)") db.commit() "" reset_jobs(db, "cc_1", alljobs=True) reset_jobs(db, "cc_2", alljobs=True) ############################################################################### # ## 6 · Configure Filter and Stack update_config(db, "freqmin", str(FREQMIN), category="filter", set_number=1) update_config(db, "freqmax", str(FREQMAX), category="filter", set_number=1) update_config(db, "CC", "Y", category="filter", set_number=1) update_config(db, "mov_stack", MOV_STACK, category="stack", set_number=1) update_config(db, "ref_begin", STARTDATE, category="refstack", set_number=1) update_config(db, "ref_end", ENDDATE, category="refstack", set_number=1) db.commit() print(f"filter_1: {FREQMIN}–{FREQMAX} Hz") print(f"stack_1: {MOV_STACK}") ############################################################################### # ## 7 · Configure DataSource and Station Table # Create the default DataSource (the installer normally does this; when # initialising the DB manually we must insert it ourselves). DataSource = declare_tables().DataSource ds = DataSource( name="local", uri=os.path.realpath(DATA_PATH), data_structure=DATA_STRUCTURE, auth_env="MSNOISE", network_code=NETWORK_CODE, channels=CHANNELS, ) db.add(ds) db.commit() print(f"DataSource created: uri={os.path.realpath(DATA_PATH)!r}") print(f" data_structure={DATA_STRUCTURE!r}") # Add stations for net, sta, lon, lat, elev in STATIONS: update_station(db, net=net, sta=sta, X=lon, Y=lat, altitude=elev, coordinates="DEG", used=1) print(f" Added station {net}.{sta} ({lon:.3f}°E, {lat:.3f}°N)") db.commit() ############################################################################### # ## 8 · Build the Workflow Graph # # The topology we want: # # ``` # preprocess_1 ──► cc_1 ──► filter_1 ──► stack_1 # └──► cc_2 ──► ↑ # ``` # # Both cc steps feed the **same** filter_1 and stack_1. # Create WorkflowSteps from all config sets (auto-discovers preprocess_1, # cc_1, cc_2, filter_1, stack_1, …) created, existing, err = create_workflow_steps_from_config_sets(db) assert err is None, f"Error creating workflow steps: {err}" print(f"Workflow steps: {created} created, {existing} already existed") # Auto-link following MSNoise's canonical dependency rules created_links, existing_links, err = create_workflow_links_from_steps(db) assert err is None, f"Error creating workflow links: {err}" print(f"Workflow links: {created_links} created, {existing_links} already existed") # Verify the topology steps = {s.step_name: s for s in get_workflow_steps(db)} print("\nWorkflow steps present:", sorted(steps.keys())) ############################################################################### # ## 9 · Scan Archive and Seed Jobs # Scan the waveform archive → populate DataAvailability table scan_archive(init=True, threads=1) # Update loc/chan on Station rows from DataAvailability from sqlalchemy import text as _text _db2 = connect() for sta in _db2.query(declare_tables().Station): data = _db2.query(DataAvailability). \ filter(_text("net=:net")).filter(_text("sta=:sta")). \ group_by(DataAvailability.net, DataAvailability.sta, DataAvailability.loc, DataAvailability.chan). \ params(net=sta.net, sta=sta.sta).all() sta.used_location_codes = ",".join(sorted({d.loc for d in data})) sta.used_channel_names = ",".join(sorted({d.chan for d in data})) _db2.commit() _db2.close() # Verify availability _db3 = connect() n_da = _db3.query(DataAvailability).count() print(f"DataAvailability rows: {n_da}") _db3.close() # Seed initial jobs (creates preprocess_1 T-jobs) new_jobs(init=True) ############################################################################### # ## 10 · Run the Pipeline # # Steps run sequentially in-process. For large datasets use `msnoise` CLI # or HPC submission instead. print("── preprocess ─────────────────────────────────────────────────────") preprocess() print("── new_jobs --after preprocess ────────────────────────────────────") new_jobs(after="preprocess") print("── compute_cc (cc_1 = CC and cc_2 = PCC) ───────────────────────") compute_cc() print("── new_jobs --after cc ────────────────────────────────────────────") new_jobs(after="cc") print("── stack_mov ──────────────────────────────────────────────────────") stack_mov(stype="mov") print("── new_jobs --after stack ─────────────────────────────────────────") new_jobs(after="stack") print("Done.") ############################################################################### # ## 11 · Gather MSNoiseResult Objects db = connect() # Lineage for CC branch: preprocess_1 / cc_1 / filter_1 / stack_1 result_cc = MSNoiseResult.from_ids( db, preprocess=1, cc=cc1_sn, # cc_1 filter=1, stack=1, ) print("CC lineage:", result_cc.lineage_names) # Lineage for PCC branch: preprocess_1 / cc_2 / filter_1 / stack_1 result_pcc = MSNoiseResult.from_ids( db, preprocess=1, cc=cc2_sn, # cc_2 filter=1, stack=1, ) print("PCC lineage:", result_pcc.lineage_names) ############################################################################### # ## 12 · Compare Stacked CCFs # Retrieve all stacked CCFs for both lineages ccfs_cc = result_cc.get_ccf() ccfs_pcc = result_pcc.get_ccf() print(f"CC result: {len(ccfs_cc)} CCF(s) found") print(f"PCC result: {len(ccfs_pcc)} CCF(s) found") for key in sorted(ccfs_cc): pair, comp, ms = key print(f" {pair} {comp} {ms[0]}–{ms[1]}") ############################################################################### # ## 13 · Plot: CC vs PCC per Pair fig_dir = os.path.join(WORK_DIR, "figures") os.makedirs(fig_dir, exist_ok=True) common_keys = sorted(set(ccfs_cc) & set(ccfs_pcc)) if not common_keys: print("No common keys found between CC and PCC results — check that " "the pipeline completed successfully.") else: n = len(common_keys) fig, axes = plt.subplots(n, 2, figsize=(14, 3.5 * n), squeeze=False) fig.suptitle( f"Stacked CCF comparison: CC (left) vs PCC (right)\n" f"Filter {FREQMIN}–{FREQMAX} Hz | {STARTDATE}", fontsize=13, y=1.01, ) for row, key in enumerate(common_keys): pair, comp, ms = key da_cc = ccfs_cc[key] da_pcc = ccfs_pcc[key] # Time axis (seconds) taxis = da_cc.coords["taxis"].values if "taxis" in da_cc.coords \ else np.linspace(-MAXLAG, MAXLAG, da_cc.shape[-1]) # Stack over the time dimension if multiple days present ccf_cc = float(da_cc.mean()) if da_cc.ndim == 0 else da_cc.values ccf_pcc = float(da_pcc.mean()) if da_pcc.ndim == 0 else da_pcc.values if ccf_cc.ndim > 1: ccf_cc = ccf_cc.mean(axis=0) if ccf_pcc.ndim > 1: ccf_pcc = ccf_pcc.mean(axis=0) label = f"{pair} {comp}" # ── CC panel ────────────────────────────────────────────────────── ax = axes[row, 0] ax.plot(taxis, ccf_cc, color="#1f77b4", lw=0.9) ax.axvline(0, color="k", lw=0.6, ls="--", alpha=0.4) ax.set_title(f"{label} — CC") ax.set_xlabel("Lag (s)") ax.set_ylabel("Amplitude") ax.set_xlim(-MAXLAG, MAXLAG) # ── PCC panel ───────────────────────────────────────────────────── ax = axes[row, 1] ax.plot(taxis, ccf_pcc, color="#d62728", lw=0.9) ax.axvline(0, color="k", lw=0.6, ls="--", alpha=0.4) ax.set_title(f"{label} — PCC") ax.set_xlabel("Lag (s)") ax.set_ylabel("Amplitude") ax.set_xlim(-MAXLAG, MAXLAG) plt.tight_layout() outfig = os.path.join(fig_dir, "cc_vs_pcc.png") fig.savefig(outfig, dpi=150, bbox_inches="tight") plt.show() print(f"Saved: {outfig}") "" import numpy as np A = np.fft.fft(ccf_cc) f = np.fft.fftfreq(len(A), d=1/20.)[:len(A)//2] A /= A.max() B = np.fft.fft(ccf_pcc) B /= B.max() plt.figure(figsize=(15,5)) plt.plot(f, abs(A[:len(A)//2])) plt.plot(f, abs(B[:len(B)//2])) # plt.ylim(0,0.15) plt.xlim(0,5) ############################################################################### # ## 14 · Overlay: CC vs PCC on the same axes (one panel per pair) if common_keys: fig2, axes2 = plt.subplots(1, len(common_keys), figsize=(6 * len(common_keys), 4), squeeze=False) fig2.suptitle( f"CC (blue) vs PCC (red) | {FREQMIN}–{FREQMAX} Hz | {STARTDATE}", fontsize=12, ) for col, key in enumerate(common_keys): pair, comp, ms = key da_cc = ccfs_cc[key] da_pcc = ccfs_pcc[key] taxis = da_cc.coords["taxis"].values if "taxis" in da_cc.coords \ else np.linspace(-MAXLAG, MAXLAG, da_cc.values.shape[-1]) ccf_cc = da_cc.values ccf_pcc = da_pcc.values if ccf_cc.ndim > 1: ccf_cc = ccf_cc.mean(axis=0) if ccf_pcc.ndim > 1: ccf_pcc = ccf_pcc.mean(axis=0) # Normalise to ABSMAX for visual overlay def _norm(x): m = np.abs(x).max() return x / m if m > 0 else x ax = axes2[0, col] ax.plot(taxis, _norm(ccf_cc), color="#1f77b4", lw=1.0, label="CC", alpha=0.85) ax.plot(taxis, _norm(ccf_pcc), color="#d62728", lw=1.0, label="PCC", alpha=0.85) ax.axvline(0, color="k", lw=0.6, ls="--", alpha=0.3) ax.set_title(f"{pair} {comp}") ax.set_xlabel("Lag (s)") ax.set_ylabel("Normalised amplitude") ax.set_xlim(-MAXLAG, MAXLAG) ax.legend(loc="upper right", fontsize=9) plt.tight_layout() outfig2 = os.path.join(fig_dir, "cc_vs_pcc_overlay.png") fig2.savefig(outfig2, dpi=150, bbox_inches="tight") plt.show() print(f"Saved: {outfig2}") ############################################################################### # ## 15 · Summary Table import pandas as pd rows = [] for key in common_keys: pair, comp, ms = key cc_arr = ccfs_cc[key].values pcc_arr = ccfs_pcc[key].values if cc_arr.ndim > 1: cc_arr = cc_arr.mean(axis=0) if pcc_arr.ndim > 1: pcc_arr = pcc_arr.mean(axis=0) rows.append({ "Pair": pair, "Component": comp, "Mov. stack": f"{ms[0]}–{ms[1]}", "CC peak": f"{np.abs(cc_arr).max():.4g}", "PCC peak": f"{np.abs(pcc_arr).max():.4g}", "CC lag@peak (s)": f"{np.where(np.abs(cc_arr) == np.abs(cc_arr).max())[0][0] - len(cc_arr)//2:.1f}", "PCC lag@peak (s)": f"{np.where(np.abs(pcc_arr) == np.abs(pcc_arr).max())[0][0] - len(pcc_arr)//2:.1f}", }) df = pd.DataFrame(rows) print(df.to_string(index=False)) ############################################################################### # ## Notes # # ### Why PCC output looks different from CC # # * **CC** retains amplitude information: large-energy windows (earthquakes, # transients) dominate the stack even after winsorising. The CCF contains # the full filter-band content weighted by window energy. # # * **PCC** (v=2) projects every sample of the analytic signal onto the unit # circle before correlating — amplitude is discarded per-sample. The output # is bounded to [−1, 1] by construction. The PCC branch in s03: # (a) bandpasses to the filter_N frequencies so each filter is distinct, # (b) optionally whitens within the band (controlled by `whitening` config) # to give a broadband phase signal ("spectrally whitened PCC"), # (c) then calls `pcc_xcorr` which computes AmpNorm + FFT cross-correlation. # # * **Winsorising** is disabled for `cc_2`: it clips amplitude extremes, which # is meaningless after AmpNorm has already equalised every sample to unit # amplitude. # # * **Whitening = "N" variant**: set `whitening` to `"N"` on `cc_2` for # classic (non-whitened) PCC. The result will be dominated by the most # energetic frequency within the preprocessing bandpass — often visually # narrower than CC. The default (`whitening="A"`) gives a more # spectrally balanced PCC that compares directly to CC. # # ### Adjusting the comparison # # * Change `FREQMIN` / `FREQMAX` in the User Settings cell and re-run from # §6 onwards — both lineages share `filter_1`, so re-running the stack # step is sufficient (no need to redo preprocessing or CC). # * The `result_cc` and `result_pcc` objects expose the full MSNoiseResult # API: call `.get_mwcs()`, `.get_dvv()`, etc. after running the downstream # steps on each lineage.