[seiscomp, scanloc] Install, add .gitignore

2025-10-09 15:07:02 +02:00
commit 20f5301bb1
2848 changed files with 1315858 additions and 0 deletions

bin/scmssort (executable file, 532 additions)

@@ -0,0 +1,532 @@
#!/usr/bin/env seiscomp-python
# -*- coding: utf-8 -*-
############################################################################
# Copyright (C) GFZ Potsdam #
# All rights reserved. #
# #
# GNU Affero General Public License Usage #
# This file may be used under the terms of the GNU Affero #
# Public License version 3.0 as published by the Free Software Foundation #
# and appearing in the file LICENSE included in the packaging of this #
# file. Please review the following information to ensure the GNU Affero #
# Public License version 3.0 requirements will be met: #
# https://www.gnu.org/licenses/agpl-3.0.html. #
############################################################################
import argparse
import os
import re
import sys
import traceback
from seiscomp import core, io
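# seiscomp.core provides the Time and Array types used below;
# seiscomp.io provides RecordStream and RecordInput for reading miniSEED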
VERBOSITY = 0
INFO = 1
DEBUG = 2
TRACE = 3
def log(level, msg):
print(f"[{level}] {msg}", file=sys.stderr)
def info_enabled():
return VERBOSITY >= INFO
def debug_enabled():
return VERBOSITY >= DEBUG
def trace_enabled():
return VERBOSITY >= TRACE
def error(msg):
log("error", msg)
def warning(msg):
log("warning", msg)
def info(msg):
if info_enabled():
log("info", msg)
def debug(msg):
if debug_enabled():
log("debug", msg)
def trace(msg):
if trace_enabled():
log("trace", msg)
def parse_args():
description = (
"Read unsorted and possibly multiplexed miniSEED files. Sort data by time "
"(multiplexing) and filter the individual records by time and/or streams. "
"Apply this before playbacks and waveform archiving."
)
epilog = """Examples:
Read data from multiple files, extract streams by time, sort records by start time, \
ignore duplicated and empty records
cat f1.mseed f2.mseed f3.mseed | \
    scmssort -v -t '2007-03-28T15:48~2007-03-28T16:18' -ui > sorted.mseed
Extract streams by time, stream code and sort records by end time
echo CX.PB01..BH? | \
scmssort -v -E -t '2007-03-28T15:48~2007-03-28T16:18' \
-u -l - test.mseed > sorted.mseed
"""
p = argparse.ArgumentParser(
description=description,
epilog=epilog,
formatter_class=argparse.RawDescriptionHelpFormatter,
)
p.add_argument(
"file",
nargs="*",
default="-",
help="miniSEED file(s) to sort. If no file name or '-' is specified then "
"standard input is used.",
)
p.add_argument(
"-E",
"--sort-by-end-time",
action="store_true",
help="Sort according to record end time; default is start time.",
)
p.add_argument(
"-i",
"--ignore",
action="store_true",
help="Ignore all records which have no data samples.",
)
p.add_argument(
"-l",
"--list",
action="store",
help="Filter records by a list of stream codes specified in a file or on stdin "
"(-). One stream per line of format: NET.STA.LOC.CHA - wildcards and regular "
"expressions are considered. Example: CX.*..BH?.",
)
p.add_argument(
"-o",
"--output",
action="store",
help="Name of output file for miniSEED data (default is stdout).",
)
p.add_argument(
"-r",
"--rm",
action="store_true",
help="Remove all traces in stream list given by '--list' instead of keeping "
"them.",
)
p.add_argument(
"-t",
"--time-window",
action="store",
help="Time window to filter the records, format: <START TIME> ~ <END TIME>. "
"Time values are in UTC, must start with an ISO date and may include time "
"components starting on the hour down to milliseconds. Example: "
"2023-01-15T12:15",
)
p.add_argument(
"-u",
"--uniqueness",
action="store_true",
help="Ensure uniqueness of output by skipping duplicate records.",
)
p.add_argument(
"-v",
"--verbose",
action="count",
default=0,
help="Run in verbose mode. This option may be repeated several time to "
"increase the level of verbosity. Example: -vvv.",
)
opt = p.parse_args()
global VERBOSITY
VERBOSITY += int(opt.verbose)
if opt.rm and not opt.list:
error("The '--rm' requires the '--list' option to be present as well.")
sys.exit(1)
return opt
def rec2id(record):
return (
f"{record.networkCode()}.{record.stationCode()}."
f"{record.locationCode()}.{record.channelCode()}"
)
def str2time(timeString):
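    # core.Time.FromString() returns None when the string cannot be parsed,
    # so callers must check the result instead of catching an exception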
return core.Time.FromString(timeString)
def time2str(time):
"""
Convert a seiscomp.core.Time to a string
"""
if not time:
return ""
return time.toString("%Y-%m-%dT%H:%M:%S.%f000")[:23]
def read_time_window(opt):
if not opt.time_window:
return None, None
toks = opt.time_window.split("~")
if len(toks) != 2:
if len(toks) < 2:
raise ValueError(
"Time window has wrong format: Use (~) for separating start and end time"
)
raise ValueError("Time window has wrong format: Too many tildes (~) found")
start = core.Time.FromString(toks[0])
end = core.Time.FromString(toks[1])
    if start is None or end is None:
        error(f"Could not read time window: {opt.time_window}")
        sys.exit(1)
return start, end
def read_lines(file):
# read from stdin
if file == "-":
yield from sys.stdin
return
# read from file
with open(file, "r", encoding="utf-8") as f:
yield from f
return
def compile_stream_pattern(opt):
if not opt.list:
return None
streams = []
pattern = None
try:
        line_number = 0
for line in map(str.strip, read_lines(opt.list)):
line_number += 1
# ignore empty lines and comments
if not line or line.startswith("#"):
continue
toks = line.split(".")
if len(toks) != 4:
                raise ValueError(
                    f"Invalid stream definition at line {line_number}. Expected "
                    "the 4 stream components NET.STA.LOC.CHA separated by a dot, "
                    f"got: {line}."
                )
streams.append(line)
if not streams:
raise ValueError("No stream definition found.")
pattern = re.compile("|".join(streams))
except Exception as e:
error(f"Could not compile pattern from stream list file '{opt.list}': {e}")
if debug_enabled():
debug(traceback.format_exc())
sys.exit(1)
info(
f"Using stream id {'DENY' if opt.rm else 'ALLOW'} list with {len(streams)} "
"stream masks"
)
if debug_enabled():
masks = "\n + ".join(streams)
debug(f"Stream masks:\n + {masks}")
return pattern
def record_input(file, datatype=core.Array.INT):
"""
Simple record iterator that reads from a file (or stdin in case of '-')
"""
stream = io.RecordStream.Create("file")
if not stream:
raise IOError("Failed to create a RecordStream")
if file != "-" and not os.path.exists(file):
raise FileNotFoundError("Could not find file")
if not stream.setSource(file):
raise ValueError("Could not set record stream source")
it = io.RecordInput(stream, datatype, core.Record.SAVE_RAW)
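    # the trace_enabled() check is hoisted out of the per-record loop, at the
    # cost of duplicating the loop body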
if trace_enabled():
while True:
record = it.next()
if not record:
return
trace(
f" + {time2str(record.startTime())}~{time2str(record.endTime())} "
f"{rec2id(record)}"
)
yield record
else:
while True:
record = it.next()
if not record:
return
yield record
def unique(sequence):
seen = set()
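    # set.add() returns None, so the condition below is falsy exactly once per
    # value; this keeps the first occurrence of each item in order
    # (order-preserving dedup helper, not referenced elsewhere in this script)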
return [x for x in sequence if not (x in seen or seen.add(x))]
def main():
# parse commandline
opt = parse_args()
# time window
t_min, t_max = read_time_window(opt)
if t_max and t_min and t_max <= t_min:
error(
f"Invalid time window: {time2str(t_min)}~{time2str(t_max)}\n"
" + end time must be greater than start time"
)
        return 1
info(f"Filtering records by time window: {time2str(t_min)}~{time2str(t_max)}")
# stream filter
pattern = compile_stream_pattern(opt)
outputFile = None
if opt.output:
outputFile = opt.output
# record buffer to be sorted later on, each item is a tuple of
# (delta_time, raw_binary_record_data)
rec_buf = []
# statistics
records_read = 0
records_window = 0
records_empty = 0
# statistics (info mode)
networks = set()
stations = set()
streams = set()
buf_min = None
buf_max = None
# make sure to read from stdin only once
files = [x for x in opt.file if x != "-"]
if len(files) == len(opt.file):
info(f"Reading data from {len(opt.file)} file(s)")
elif not files:
files = "-"
info("Reading data from stdin. Use Ctrl + C to interrupt.")
else:
info(
f"Reading data from stdin and {len(files)} files. Use Ctrl + C to "
"interrupt."
)
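        # re-insert stdin at the position where it first appeared among the
        # file arguments so the overall input order is preserved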
files.insert(opt.file.index("-"), "-")
    # the time of the first valid record is used as the reference for sorting
ref_time = None
# read records from input file
for file in files:
records_file = 0
records_empty_file = 0
try:
for rec in record_input(file):
records_file += 1
stream_id = ""
# skip record if outside time window
if (t_min and rec.endTime() < t_min) or (
t_max and rec.startTime() > t_max
):
continue
if pattern or info_enabled():
records_window += 1
stream_id = rec2id(rec)
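                # the list is an ALLOW list by default: skip streams that do not
                # match; with --rm it is a DENY list: skip streams that do match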
if pattern and bool(pattern.match(stream_id)) == bool(opt.rm):
continue
if not rec.sampleCount():
trace(
f" + found empty record staring at {time2str(rec.startTime())} "
f"{rec2id(rec)}"
)
records_empty_file += 1
if opt.ignore:
trace(" + ignored")
continue
# record time reference set to start or end time depending on sort
# option
t = rec.endTime() if opt.sort_by_end_time else rec.startTime()
if ref_time is None:
ref_time = core.Time(t)
t = 0
else:
t = float(t - ref_time) # float needs less memory
# buffer tuple of (time delta, binary record data)
rec_buf.append((t, rec.raw().str()))
                # collect statistics for info mode
if info_enabled():
networks.add(rec.networkCode())
stations.add(f"{rec.networkCode()}.{rec.stationCode()}")
streams.add(stream_id)
                    # copies of the time objects are required because the record
                    # may be freed before the statistics are printed
                    if not buf_min or rec.startTime() < buf_min:
                        buf_min = core.Time(rec.startTime())
                    if not buf_max or rec.endTime() > buf_max:
                        buf_max = core.Time(rec.endTime())
name = "<stdin>" if file == "-" else file
empty = f", empty: {records_empty_file}" if records_empty_file else ""
debug(f" + {name}: {records_file} records{empty}")
except Exception as e:
error(f"Could not read file '{file}: {e}")
if debug_enabled():
debug(traceback.format_exc())
return 1
records_read += records_file
records_empty += records_empty_file
# stop if no records have been read
if not records_read:
warning("No records found in input file(s).")
return 0
buf_len = len(rec_buf)
# statistics about records read and filtered
if info_enabled() and buf_len != records_read:
info(
f"""{records_read-buf_len}/{records_read} records filtered:
+ by time window: {records_read-records_window}
+ by stream id {'DENY' if opt.rm else 'ALLOW'} list: {records_window-buf_len}"""
)
# stop if no record passed the filter
if not buf_len:
warning("All records filtered, nothing to write.")
return 0
# network, station and stream information
if info_enabled():
info(
f"Found data for {len(networks)} networks, {len(stations)} stations "
f"and {len(streams)} streams",
)
if debug_enabled() and streams:
streamList = "\n + ".join(streams)
debug(f"streams:\n + {streamList}")
    # sort records by time; ties are broken by the raw record bytes, which
    # makes duplicate records adjacent
if buf_len > 1:
info(f"Sorting {buf_len} records")
rec_buf.sort()
# write sorted records, count duplicates and optional remove them
info(f"Writing {buf_len} records")
prev_rec = None
duplicates = 0
if outputFile:
print(f"Output data to file: {outputFile}", file=sys.stderr)
        try:
            out = open(outputFile, "wb")
        except Exception as e:
            print(f"Cannot create output file '{outputFile}': {e}", file=sys.stderr)
            return -1
else:
out = sys.stdout.buffer
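    # rec_buf was sorted by (time delta, raw bytes), so identical records are
    # adjacent and duplicates can be detected by comparing neighbours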
for _t, rec in rec_buf:
if rec == prev_rec:
duplicates += 1
if opt.uniqueness:
continue
else:
prev_rec = rec
out.write(rec)
# statistics about records written
if info_enabled():
records_written = buf_len - duplicates if opt.uniqueness else buf_len
msg = f"""Wrote {records_written} records
+ time window: {time2str(buf_min)}~{time2str(buf_max)}"""
if opt.uniqueness:
msg += f"""
+ found and removed {duplicates} duplicate records"""
elif not duplicates:
msg += """
+ no duplicate records found"""
if opt.ignore:
msg += f"""
+ {records_empty} empty records found and ignored"""
info(msg)
# additional warning output
if records_empty and not opt.ignore:
warning(f"Found {records_empty} empty records - remove with: scmssort -i")
# This is an important hint which should always be printed
if duplicates > 0 and not opt.uniqueness:
warning(f"Found {duplicates} duplicate records - remove with: scmssort -u")
return 0
if __name__ == "__main__":
sys.exit(main())