seiscomp-training/bin/scmssort

#!/usr/bin/env seiscomp-python
# -*- coding: utf-8 -*-
############################################################################
# Copyright (C) GFZ Potsdam                                                #
# All rights reserved.                                                     #
#                                                                          #
# GNU Affero General Public License Usage                                  #
# This file may be used under the terms of the GNU Affero                  #
# Public License version 3.0 as published by the Free Software Foundation  #
# and appearing in the file LICENSE included in the packaging of this      #
# file. Please review the following information to ensure the GNU Affero   #
# Public License version 3.0 requirements will be met:                     #
# https://www.gnu.org/licenses/agpl-3.0.html.                              #
############################################################################

import argparse
import os
import re
import sys
import traceback

from seiscomp import core, io

# Current verbosity level of the script; parse_args() adds the number of
# '-v' flags given on the command line to this value.
VERBOSITY = 0

# Verbosity thresholds which the *_enabled() helpers compare VERBOSITY against.
INFO = 1
DEBUG = 2
TRACE = 3


def log(level, msg):
    """
    Print a message to stderr, prefixed with the log level in square brackets.
    """
    line = "[{}] {}".format(level, msg)
    print(line, file=sys.stderr)


def info_enabled():
    """
    Return True if the current verbosity is at least the INFO level.
    """
    return INFO <= VERBOSITY


def debug_enabled():
    """
    Return True if the current verbosity is at least the DEBUG level.
    """
    return DEBUG <= VERBOSITY


def trace_enabled():
    """
    Return True if the current verbosity is at least the TRACE level.
    """
    return TRACE <= VERBOSITY


def error(msg):
    """
    Log a message with the 'error' level. Errors are always printed.
    """
    log(level="error", msg=msg)


def warning(msg):
    """
    Log a message with the 'warning' level. Warnings are always printed.
    """
    log(level="warning", msg=msg)


def info(msg):
    """
    Log a message with the 'info' level if info output is enabled.
    """
    if not info_enabled():
        return

    log("info", msg)


def debug(msg):
    """
    Log a message with the 'debug' level if debug output is enabled.
    """
    if not debug_enabled():
        return

    log("debug", msg)


def trace(msg):
    """
    Log a message with the 'trace' level if trace output is enabled.
    """
    if not trace_enabled():
        return

    log("trace", msg)


def parse_args():
    """
    Parse the command line and return the resulting argparse namespace.

    As a side effect the number of '-v' flags is added to the global VERBOSITY.
    The program is terminated with exit code 1 if '--rm' is given without
    '--list'.
    """
    global VERBOSITY

    description = (
        "Read unsorted and possibly multiplexed miniSEED files. Sort data by time "
        "(multiplexing) and filter the individual records by time and/or streams. "
        "Apply this before playbacks and waveform archiving."
    )

    # NOTE: a backslash at the end of a line joins it with the next line, the
    # examples below are therefore printed as single command lines.
    epilog = """Examples:
Read data from multiple files, extract streams by time, sort records by start time, \
ignore duplicated and empty records
  cat f1.mseed f2.mseed f3.mseed | \
scmssort -v -t '2007-03-28T15:48~2007-03-28T16:18' -ui > sorted.mseed

Extract streams by time, stream code and sort records by end time
  echo CX.PB01..BH? | \
scmssort -v -E -t '2007-03-28T15:48~2007-03-28T16:18' \
-u -l - test.mseed > sorted.mseed
"""

    p = argparse.ArgumentParser(
        description=description,
        epilog=epilog,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p.add_argument(
        "file",
        nargs="*",
        # use a list as default so that the option has the same type regardless
        # of whether file names were given on the command line
        default=["-"],
        help="miniSEED file(s) to sort. If no file name or '-' is specified then "
        "standard input is used.",
    )
    p.add_argument(
        "-E",
        "--sort-by-end-time",
        action="store_true",
        help="Sort according to record end time; default is start time.",
    )
    p.add_argument(
        "-i",
        "--ignore",
        action="store_true",
        help="Ignore all records which have no data samples.",
    )
    p.add_argument(
        "-l",
        "--list",
        action="store",
        help="Filter records by a list of stream codes specified in a file or on stdin "
        "(-). One stream per line of format: NET.STA.LOC.CHA - wildcards and regular "
        "expressions are considered. Example: CX.*..BH?.",
    )
    p.add_argument(
        "-o",
        "--output",
        action="store",
        help="Name of output file for miniSEED data (default is stdout).",
    )
    p.add_argument(
        "-r",
        "--rm",
        action="store_true",
        help="Remove all traces in stream list given by '--list' instead of keeping "
        "them.",
    )
    p.add_argument(
        "-t",
        "--time-window",
        action="store",
        help="Time window to filter the records, format: <START TIME> ~ <END TIME>. "
        "Time values are in UTC, must start with an ISO date and may include time "
        "components starting on the hour down to milliseconds. Example: "
        "2023-01-15T12:15",
    )
    p.add_argument(
        "-u",
        "--uniqueness",
        action="store_true",
        help="Ensure uniqueness of output by skipping duplicate records.",
    )
    p.add_argument(
        "-v",
        "--verbose",
        action="count",
        default=0,
        help="Run in verbose mode. This option may be repeated several times to "
        "increase the level of verbosity. Example: -vvv.",
    )

    opt = p.parse_args()

    # each '-v' raises the verbosity by one level
    VERBOSITY += int(opt.verbose)

    # '--rm' only inverts the meaning of the stream list and is useless without it
    if opt.rm and not opt.list:
        error("The '--rm' requires the '--list' option to be present as well.")
        sys.exit(1)

    return opt


def rec2id(record):
    """
    Build the stream ID of a record in the form NET.STA.LOC.CHA
    """
    codes = (
        record.networkCode(),
        record.stationCode(),
        record.locationCode(),
        record.channelCode(),
    )
    return "{}.{}.{}.{}".format(*codes)


def str2time(timeString):
    """
    Convert a string to a time object using seiscomp.core.Time.FromString and
    return whatever that call returns
    """
    parsed = core.Time.FromString(timeString)
    return parsed


def time2str(time):
    """
    Convert a seiscomp.core.Time to a string. A false value, e.g., None, results
    in an empty string.
    """
    if time:
        # the formatted string is cut after 23 characters
        formatted = time.toString("%Y-%m-%dT%H:%M:%S.%f000")
        return formatted[:23]

    return ""


def read_time_window(opt):
    """
    Read the start and end time from the '--time-window' option.

    Returns a tuple (start, end) as produced by seiscomp.core.Time.FromString,
    or (None, None) if no time window was given.

    Raises ValueError if the option does not consist of exactly two parts
    separated by a tilde (~). Terminates the program with exit code 1 if one
    of the two time strings could not be converted.
    """
    if not opt.time_window:
        return None, None

    # The documented format '<START TIME> ~ <END TIME>' contains spaces around
    # the tilde, hence surrounding white spaces are removed from both parts
    toks = [tok.strip() for tok in opt.time_window.split("~")]
    if len(toks) < 2:
        raise ValueError(
            "Time window has wrong format: Use (~) for separating start and end time"
        )
    if len(toks) > 2:
        raise ValueError("Time window has wrong format: Too many tildes (~) found")

    start = core.Time.FromString(toks[0])
    end = core.Time.FromString(toks[1])

    # No exception is active at this point, therefore no traceback is logged
    if start is None or end is None:
        error(f"Could not read time window: {toks}")
        sys.exit(1)

    return start, end


def read_lines(file):
    """
    Line iterator that reads from a UTF-8 encoded text file (or stdin in case
    of '-'). Lines are yielded as read, including their line endings.
    """
    if file != "-":
        # read from file
        with open(file, "r", encoding="utf-8") as handle:
            for line in handle:
                yield line
        return

    # read from stdin
    for line in sys.stdin:
        yield line


def compile_stream_pattern(opt):
    """
    Compile the stream list given by the '--list' option into one regular
    expression.

    The list is read line by line from the file named by the option (or from
    stdin in case of '-'). Empty lines and lines starting with '#' are skipped.
    Every remaining line must consist of 4 components separated by a dot. All
    lines are joined with '|' and compiled as a single regular expression.

    Returns the compiled pattern, or None if no stream list was requested.
    Terminates the program with exit code 1 if the list could not be read, is
    malformed, contains no stream definition or does not compile.
    """
    if not opt.list:
        return None

    streams = []
    pattern = None
    try:
        # line numbers reported to the user start at 1
        for line_number, line in enumerate(map(str.strip, read_lines(opt.list)), 1):
            # ignore empty lines and comments
            if not line or line.startswith("#"):
                continue

            if len(line.split(".")) != 4:
                raise ValueError(
                    f"Invalid stream definition at line {line_number}. Expected the 4 "
                    "stream components NET.STA.LOC.CHA separated by a dot, "
                    f"got: {line}."
                )

            streams.append(line)

        if not streams:
            raise ValueError("No stream definition found.")

        pattern = re.compile("|".join(streams))

    # any failure, e.g., unreadable file or invalid expression, is fatal
    except Exception as e:
        error(f"Could not compile pattern from stream list file '{opt.list}': {e}")
        if debug_enabled():
            debug(traceback.format_exc())
        sys.exit(1)

    info(
        f"Using stream id {'DENY' if opt.rm else 'ALLOW'} list with {len(streams)} "
        "stream masks"
    )

    if debug_enabled():
        masks = "\n  + ".join(streams)
        debug(f"Stream masks:\n  + {masks}")

    return pattern


def record_input(file, datatype=core.Array.INT):
    """
    Record generator that reads from a file (or stdin in case of '-') through
    a RecordStream of type 'file'. In trace mode each record is logged before
    it is yielded.

    Raises IOError if the record stream could not be created,
    FileNotFoundError if the file does not exist and ValueError if the source
    could not be set.
    """
    stream = io.RecordStream.Create("file")
    if not stream:
        raise IOError("Failed to create a RecordStream")

    if not (file == "-" or os.path.exists(file)):
        raise FileNotFoundError("Could not find file")

    if not stream.setSource(file):
        raise ValueError("Could not set record stream source")

    reader = io.RecordInput(stream, datatype, core.Record.SAVE_RAW)

    # evaluate the trace state only once, not for every record
    tracing = trace_enabled()

    record = reader.next()
    while record:
        if tracing:
            trace(
                f"    + {time2str(record.startTime())}~{time2str(record.endTime())} "
                f"{rec2id(record)}"
            )
        yield record
        record = reader.next()


def unique(sequence):
    """
    Return the items of a sequence as list without duplicates. The order of
    the first occurrences is preserved.
    """
    seen = set()
    result = []
    for item in sequence:
        if item in seen:
            continue
        seen.add(item)
        result.append(item)

    return result


def main():
    """
    Read miniSEED records from the input files and/or stdin, filter them by
    time window and stream list, sort them by time and write them to the
    output file or stdout.

    Returns the exit code of the program:
      0 - success, also if no record was found or all records were filtered
      1 - invalid time window or an input could not be read
     -1 - the output file could not be created
    """
    # parse commandline
    opt = parse_args()

    # time window, a malformed option is reported as ValueError
    try:
        t_min, t_max = read_time_window(opt)
    except ValueError as e:
        error(str(e))
        if debug_enabled():
            debug(traceback.format_exc())
        return 1

    if t_max and t_min and t_max <= t_min:
        error(
            f"Invalid time window: {time2str(t_min)}~{time2str(t_max)}\n"
            "  + end time must be greater than start time"
        )
        # the return value is passed to sys.exit(): False would be reported as
        # success to the calling process
        return 1

    if opt.time_window:
        info(f"Filtering records by time window: {time2str(t_min)}~{time2str(t_max)}")

    # stream filter
    pattern = compile_stream_pattern(opt)

    output_file = opt.output if opt.output else None

    # record buffer to be sorted later on, each item is a tuple of
    # (delta_time, raw_binary_record_data)
    rec_buf = []

    # statistics
    records_read = 0
    records_window = 0
    records_empty = 0

    # statistics (info mode)
    networks = set()
    stations = set()
    streams = set()
    buf_min = None
    buf_max = None

    # make sure to read from stdin only once
    files = [x for x in opt.file if x != "-"]
    if len(files) == len(opt.file):
        info(f"Reading data from {len(opt.file)} file(s)")
    elif not files:
        files = ["-"]
        info("Reading data from stdin. Use Ctrl + C to interrupt.")
    else:
        info(
            f"Reading data from stdin and {len(files)} files. Use Ctrl + C to "
            "interrupt."
        )
        # read stdin at the position of its first occurrence
        files.insert(opt.file.index("-"), "-")

    # time of first valid record used as reference for sorting
    ref_time = None

    # read records from input file
    for file in files:
        records_file = 0
        records_empty_file = 0

        try:
            for rec in record_input(file):
                records_file += 1
                stream_id = ""

                # skip record if outside time window
                if (t_min and rec.endTime() < t_min) or (
                    t_max and rec.startTime() > t_max
                ):
                    continue

                records_window += 1

                # the stream id is only required for filtering and statistics
                if pattern or info_enabled():
                    stream_id = rec2id(rec)

                # ALLOW list: skip records which do not match,
                # DENY list (--rm): skip records which match
                if pattern and bool(pattern.match(stream_id)) == bool(opt.rm):
                    continue

                if not rec.sampleCount():
                    trace(
                        f"    + found empty record starting at {time2str(rec.startTime())} "
                        f"{rec2id(rec)}"
                    )
                    records_empty_file += 1
                    if opt.ignore:
                        trace("      + ignored")
                        continue

                # record time reference set to start or end time depending on sort
                # option
                t = rec.endTime() if opt.sort_by_end_time else rec.startTime()

                if ref_time is None:
                    ref_time = core.Time(t)
                    t = 0
                else:
                    t = float(t - ref_time)  # float needs less memory

                # buffer tuple of (time delta, binary record data)
                rec_buf.append((t, rec.raw().str()))

                # collect statistics for info mode
                if info_enabled():
                    networks.add(rec.networkCode())
                    stations.add(f"{rec.networkCode()}.{rec.stationCode()}")
                    streams.add(stream_id)
                    # copy of time object is required because record may be freed before
                    if buf_min is None or rec.startTime() < buf_min:
                        buf_min = core.Time(rec.startTime())
                    # the maximum is an end time and must be compared to end times
                    if buf_max is None or rec.endTime() > buf_max:
                        buf_max = core.Time(rec.endTime())

            name = "<stdin>" if file == "-" else file
            empty = f", empty: {records_empty_file}" if records_empty_file else ""
            debug(f"  + {name}: {records_file} records{empty}")

        except Exception as e:
            error(f"Could not read file '{file}': {e}")
            if debug_enabled():
                debug(traceback.format_exc())
            return 1

        records_read += records_file
        records_empty += records_empty_file

    # stop if no records have been read
    if not records_read:
        warning("No records found in input file(s).")
        return 0

    buf_len = len(rec_buf)

    # empty records are only counted after they passed the time window and stream
    # filter, with '--ignore' all of them have been skipped
    records_ignored = records_empty if opt.ignore else 0

    # statistics about records read and filtered
    if info_enabled() and buf_len != records_read:
        msg = f"""{records_read-buf_len}/{records_read} records filtered:
  + by time window: {records_read-records_window}
  + by stream id {'DENY' if opt.rm else 'ALLOW'} list: {records_window-buf_len-records_ignored}"""
        if opt.ignore:
            msg += f"""
  + empty records ignored: {records_ignored}"""
        info(msg)

    # stop if no record passed the filter
    if not buf_len:
        warning("All records filtered, nothing to write.")
        return 0

    # network, station and stream information
    if info_enabled():
        info(
            f"Found data for {len(networks)} networks, {len(stations)} stations "
            f"and {len(streams)} streams",
        )
        if debug_enabled() and streams:
            stream_list = "\n  + ".join(streams)
            debug(f"streams:\n  + {stream_list}")

    # sort records by time. The buffer items are tuples, records of equal time
    # are therefore ordered by their raw data which places identical records
    # next to each other as expected by the duplicate detection below
    if buf_len > 1:
        info(f"Sorting {buf_len} records")
        rec_buf.sort()

    # write sorted records, count duplicates and optionally remove them
    info(f"Writing {buf_len} records")
    prev_rec = None
    duplicates = 0

    if output_file:
        print(f"Output data to file: {output_file}", file=sys.stderr)
        try:
            out = open(output_file, "wb")
        except Exception:
            print(f"Cannot create output file {output_file}", file=sys.stderr)
            return -1
    else:
        out = sys.stdout.buffer

    try:
        for _t, rec in rec_buf:
            if rec == prev_rec:
                duplicates += 1
                if opt.uniqueness:
                    continue
            else:
                prev_rec = rec

            out.write(rec)
    finally:
        # close the file opened above, stdout is left untouched
        if output_file:
            out.close()

    # statistics about records written
    if info_enabled():
        records_written = buf_len - duplicates if opt.uniqueness else buf_len
        msg = f"""Wrote {records_written} records
  + time window: {time2str(buf_min)}~{time2str(buf_max)}"""

        if opt.uniqueness:
            msg += f"""
  + found and removed {duplicates} duplicate records"""
        elif not duplicates:
            msg += """
  + no duplicate records found"""

        if opt.ignore:
            msg += f"""
  + {records_empty} empty records found and ignored"""

        info(msg)

    # additional warning output
    if records_empty and not opt.ignore:
        warning(f"Found {records_empty} empty records - remove with: scmssort -i")

    # This is an important hint which should always be printed
    if duplicates > 0 and not opt.uniqueness:
        warning(f"Found {duplicates} duplicate records - remove with: scmssort -u")

    return 0


# Script entry point: the return value of main() is passed to sys.exit() and
# thereby becomes the exit status of the process.
if __name__ == "__main__":
    sys.exit(main())