533 lines
15 KiB
Plaintext
Executable File
533 lines
15 KiB
Plaintext
Executable File
#!/usr/bin/env seiscomp-python
|
|
# -*- coding: utf-8 -*-
|
|
############################################################################
|
|
# Copyright (C) GFZ Potsdam #
|
|
# All rights reserved. #
|
|
# #
|
|
# GNU Affero General Public License Usage #
|
|
# This file may be used under the terms of the GNU Affero #
|
|
# Public License version 3.0 as published by the Free Software Foundation #
|
|
# and appearing in the file LICENSE included in the packaging of this #
|
|
# file. Please review the following information to ensure the GNU Affero #
|
|
# Public License version 3.0 requirements will be met: #
|
|
# https://www.gnu.org/licenses/agpl-3.0.html. #
|
|
############################################################################
|
|
|
|
import argparse
|
|
import os
|
|
import re
|
|
import sys
|
|
import traceback
|
|
|
|
from seiscomp import core, io
|
|
|
|
VERBOSITY = 0
|
|
|
|
INFO = 1
|
|
DEBUG = 2
|
|
TRACE = 3
|
|
|
|
|
|
def log(level, msg):
|
|
print(f"[{level}] {msg}", file=sys.stderr)
|
|
|
|
|
|
def info_enabled():
|
|
return VERBOSITY >= INFO
|
|
|
|
|
|
def debug_enabled():
|
|
return VERBOSITY >= DEBUG
|
|
|
|
|
|
def trace_enabled():
|
|
return VERBOSITY >= TRACE
|
|
|
|
|
|
def error(msg):
|
|
log("error", msg)
|
|
|
|
|
|
def warning(msg):
|
|
log("warning", msg)
|
|
|
|
|
|
def info(msg):
|
|
if info_enabled():
|
|
log("info", msg)
|
|
|
|
|
|
def debug(msg):
|
|
if debug_enabled():
|
|
log("debug", msg)
|
|
|
|
|
|
def trace(msg):
|
|
if trace_enabled():
|
|
log("trace", msg)
|
|
|
|
|
|
def parse_args():
|
|
description = (
|
|
"Read unsorted and possibly multiplexed miniSEED files. Sort data by time "
|
|
"(multiplexing) and filter the individual records by time and/or streams. "
|
|
"Apply this before playbacks and waveform archiving."
|
|
)
|
|
|
|
epilog = """Examples:
|
|
Read data from multiple files, extract streams by time, sort records by start time, \
|
|
ignore duplicated and empty records
|
|
cat f1.mseed f2.mseed f3.mseed | \
|
|
scmssort -v -t 2007-03-28T15:48~2007-03-28T16:18' -ui > sorted.mseed
|
|
|
|
Extract streams by time, stream code and sort records by end time
|
|
echo CX.PB01..BH? | \
|
|
scmssort -v -E -t '2007-03-28T15:48~2007-03-28T16:18' \
|
|
-u -l - test.mseed > sorted.mseed
|
|
"""
|
|
|
|
p = argparse.ArgumentParser(
|
|
description=description,
|
|
epilog=epilog,
|
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
)
|
|
p.add_argument(
|
|
"file",
|
|
nargs="*",
|
|
default="-",
|
|
help="miniSEED file(s) to sort. If no file name or '-' is specified then "
|
|
"standard input is used.",
|
|
)
|
|
p.add_argument(
|
|
"-E",
|
|
"--sort-by-end-time",
|
|
action="store_true",
|
|
help="Sort according to record end time; default is start time.",
|
|
)
|
|
p.add_argument(
|
|
"-i",
|
|
"--ignore",
|
|
action="store_true",
|
|
help="Ignore all records which have no data samples.",
|
|
)
|
|
p.add_argument(
|
|
"-l",
|
|
"--list",
|
|
action="store",
|
|
help="Filter records by a list of stream codes specified in a file or on stdin "
|
|
"(-). One stream per line of format: NET.STA.LOC.CHA - wildcards and regular "
|
|
"expressions are considered. Example: CX.*..BH?.",
|
|
)
|
|
p.add_argument(
|
|
"-o",
|
|
"--output",
|
|
action="store",
|
|
help="Name of output file for miniSEED data (default is stdout).",
|
|
)
|
|
p.add_argument(
|
|
"-r",
|
|
"--rm",
|
|
action="store_true",
|
|
help="Remove all traces in stream list given by '--list' instead of keeping "
|
|
"them.",
|
|
)
|
|
p.add_argument(
|
|
"-t",
|
|
"--time-window",
|
|
action="store",
|
|
help="Time window to filter the records, format: <START TIME> ~ <END TIME>. "
|
|
"Time values are in UTC, must start with an ISO date and may include time "
|
|
"components starting on the hour down to milliseconds. Example: "
|
|
"2023-01-15T12:15",
|
|
)
|
|
p.add_argument(
|
|
"-u",
|
|
"--uniqueness",
|
|
action="store_true",
|
|
help="Ensure uniqueness of output by skipping duplicate records.",
|
|
)
|
|
p.add_argument(
|
|
"-v",
|
|
"--verbose",
|
|
action="count",
|
|
default=0,
|
|
help="Run in verbose mode. This option may be repeated several time to "
|
|
"increase the level of verbosity. Example: -vvv.",
|
|
)
|
|
|
|
opt = p.parse_args()
|
|
|
|
global VERBOSITY
|
|
VERBOSITY += int(opt.verbose)
|
|
|
|
if opt.rm and not opt.list:
|
|
error("The '--rm' requires the '--list' option to be present as well.")
|
|
sys.exit(1)
|
|
|
|
return opt
|
|
|
|
|
|
def rec2id(record):
|
|
return (
|
|
f"{record.networkCode()}.{record.stationCode()}."
|
|
f"{record.locationCode()}.{record.channelCode()}"
|
|
)
|
|
|
|
|
|
def str2time(timeString):
|
|
return core.Time.FromString(timeString)
|
|
|
|
|
|
def time2str(time):
|
|
"""
|
|
Convert a seiscomp.core.Time to a string
|
|
"""
|
|
if not time:
|
|
return ""
|
|
|
|
return time.toString("%Y-%m-%dT%H:%M:%S.%f000")[:23]
|
|
|
|
|
|
def read_time_window(opt):
|
|
if not opt.time_window:
|
|
return None, None
|
|
|
|
toks = opt.time_window.split("~")
|
|
if len(toks) != 2:
|
|
if len(toks) < 2:
|
|
raise ValueError(
|
|
"Time window has wrong format: Use (~) for separating start and end time"
|
|
)
|
|
raise ValueError("Time window has wrong format: Too many tildes (~) found")
|
|
|
|
start = core.Time.FromString(toks[0])
|
|
end = core.Time.FromString(toks[1])
|
|
|
|
if start is None or end is None:
|
|
error(f"Could not read time window: {toks}")
|
|
if debug_enabled():
|
|
debug(traceback.format_exc())
|
|
sys.exit(1)
|
|
|
|
return start, end
|
|
|
|
|
|
def read_lines(file):
|
|
# read from stdin
|
|
if file == "-":
|
|
yield from sys.stdin
|
|
return
|
|
|
|
# read from file
|
|
with open(file, "r", encoding="utf-8") as f:
|
|
yield from f
|
|
return
|
|
|
|
|
|
def compile_stream_pattern(opt):
|
|
if not opt.list:
|
|
return None
|
|
|
|
streams = []
|
|
pattern = None
|
|
try:
|
|
line_number = -1
|
|
for line in map(str.strip, read_lines(opt.list)):
|
|
line_number += 1
|
|
|
|
# ignore empty lines and comments
|
|
if not line or line.startswith("#"):
|
|
continue
|
|
|
|
toks = line.split(".")
|
|
if len(toks) != 4:
|
|
raise ValueError(
|
|
f"Invalid stream definition at line {line_number}. Expected the 4 "
|
|
"stream components NET.STA.LOC.CHA separated by a dot, "
|
|
"got: {line}."
|
|
)
|
|
|
|
streams.append(line)
|
|
|
|
if not streams:
|
|
raise ValueError("No stream definition found.")
|
|
|
|
pattern = re.compile("|".join(streams))
|
|
|
|
except Exception as e:
|
|
error(f"Could not compile pattern from stream list file '{opt.list}': {e}")
|
|
if debug_enabled():
|
|
debug(traceback.format_exc())
|
|
sys.exit(1)
|
|
|
|
info(
|
|
f"Using stream id {'DENY' if opt.rm else 'ALLOW'} list with {len(streams)} "
|
|
"stream masks"
|
|
)
|
|
|
|
if debug_enabled():
|
|
masks = "\n + ".join(streams)
|
|
debug(f"Stream masks:\n + {masks}")
|
|
|
|
return pattern
|
|
|
|
|
|
def record_input(file, datatype=core.Array.INT):
|
|
"""
|
|
Simple record iterator that reads from a file (or stdin in case of '-')
|
|
"""
|
|
stream = io.RecordStream.Create("file")
|
|
if not stream:
|
|
raise IOError("Failed to create a RecordStream")
|
|
|
|
if file != "-" and not os.path.exists(file):
|
|
raise FileNotFoundError("Could not find file")
|
|
|
|
if not stream.setSource(file):
|
|
raise ValueError("Could not set record stream source")
|
|
|
|
it = io.RecordInput(stream, datatype, core.Record.SAVE_RAW)
|
|
|
|
if trace_enabled():
|
|
while True:
|
|
record = it.next()
|
|
if not record:
|
|
return
|
|
|
|
trace(
|
|
f" + {time2str(record.startTime())}~{time2str(record.endTime())} "
|
|
f"{rec2id(record)}"
|
|
)
|
|
yield record
|
|
else:
|
|
while True:
|
|
record = it.next()
|
|
if not record:
|
|
return
|
|
|
|
yield record
|
|
|
|
|
|
def unique(sequence):
|
|
seen = set()
|
|
return [x for x in sequence if not (x in seen or seen.add(x))]
|
|
|
|
|
|
def main():
|
|
# parse commandline
|
|
opt = parse_args()
|
|
|
|
# time window
|
|
t_min, t_max = read_time_window(opt)
|
|
if t_max and t_min and t_max <= t_min:
|
|
error(
|
|
f"Invalid time window: {time2str(t_min)}~{time2str(t_max)}\n"
|
|
" + end time must be greater than start time"
|
|
)
|
|
return False
|
|
|
|
info(f"Filtering records by time window: {time2str(t_min)}~{time2str(t_max)}")
|
|
|
|
# stream filter
|
|
pattern = compile_stream_pattern(opt)
|
|
|
|
outputFile = None
|
|
if opt.output:
|
|
outputFile = opt.output
|
|
|
|
# record buffer to be sorted later on, each item is a tuple of
|
|
# (delta_time, raw_binary_record_data)
|
|
rec_buf = []
|
|
|
|
# statistics
|
|
records_read = 0
|
|
records_window = 0
|
|
records_empty = 0
|
|
|
|
# statistics (info mode)
|
|
networks = set()
|
|
stations = set()
|
|
streams = set()
|
|
buf_min = None
|
|
buf_max = None
|
|
|
|
# make sure to read from stdin only once
|
|
files = [x for x in opt.file if x != "-"]
|
|
if len(files) == len(opt.file):
|
|
info(f"Reading data from {len(opt.file)} file(s)")
|
|
elif not files:
|
|
files = "-"
|
|
info("Reading data from stdin. Use Ctrl + C to interrupt.")
|
|
else:
|
|
info(
|
|
f"Reading data from stdin and {len(files)} files. Use Ctrl + C to "
|
|
"interrupt."
|
|
)
|
|
files.insert(opt.file.index("-"), "-")
|
|
|
|
# time or first valid record use as reference for sorting
|
|
ref_time = None
|
|
|
|
# read records from input file
|
|
for file in files:
|
|
records_file = 0
|
|
records_empty_file = 0
|
|
|
|
try:
|
|
for rec in record_input(file):
|
|
records_file += 1
|
|
stream_id = ""
|
|
|
|
# skip record if outside time window
|
|
if (t_min and rec.endTime() < t_min) or (
|
|
t_max and rec.startTime() > t_max
|
|
):
|
|
continue
|
|
|
|
if pattern or info_enabled():
|
|
records_window += 1
|
|
stream_id = rec2id(rec)
|
|
|
|
if pattern and bool(pattern.match(stream_id)) == bool(opt.rm):
|
|
continue
|
|
|
|
if not rec.sampleCount():
|
|
trace(
|
|
f" + found empty record staring at {time2str(rec.startTime())} "
|
|
f"{rec2id(rec)}"
|
|
)
|
|
records_empty_file += 1
|
|
if opt.ignore:
|
|
trace(" + ignored")
|
|
continue
|
|
|
|
# record time reference set to start or end time depending on sort
|
|
# option
|
|
t = rec.endTime() if opt.sort_by_end_time else rec.startTime()
|
|
|
|
if ref_time is None:
|
|
ref_time = core.Time(t)
|
|
t = 0
|
|
else:
|
|
t = float(t - ref_time) # float needs less memory
|
|
|
|
# buffer tuple of (time delta, binary record data)
|
|
rec_buf.append((t, rec.raw().str()))
|
|
|
|
# collect statistics for debug mode
|
|
if info_enabled():
|
|
networks.add(rec.networkCode())
|
|
stations.add(f"{rec.networkCode()}.{rec.stationCode()}")
|
|
streams.add(stream_id)
|
|
# copy of time object is required because record may be freed before
|
|
if not buf_min or rec.startTime() < buf_min:
|
|
buf_min = core.Time(rec.startTime())
|
|
if not buf_max or rec.startTime() > buf_max:
|
|
buf_max = core.Time(rec.endTime())
|
|
|
|
name = "<stdin>" if file == "-" else file
|
|
empty = f", empty: {records_empty_file}" if records_empty_file else ""
|
|
debug(f" + {name}: {records_file} records{empty}")
|
|
|
|
except Exception as e:
|
|
error(f"Could not read file '{file}: {e}")
|
|
if debug_enabled():
|
|
debug(traceback.format_exc())
|
|
return 1
|
|
|
|
records_read += records_file
|
|
records_empty += records_empty_file
|
|
|
|
# stop if no records have been read
|
|
if not records_read:
|
|
warning("No records found in input file(s).")
|
|
return 0
|
|
|
|
buf_len = len(rec_buf)
|
|
|
|
# statistics about records read and filtered
|
|
if info_enabled() and buf_len != records_read:
|
|
info(
|
|
f"""{records_read-buf_len}/{records_read} records filtered:
|
|
+ by time window: {records_read-records_window}
|
|
+ by stream id {'DENY' if opt.rm else 'ALLOW'} list: {records_window-buf_len}"""
|
|
)
|
|
|
|
# stop if no record passed the filter
|
|
if not buf_len:
|
|
warning("All records filtered, nothing to write.")
|
|
return 0
|
|
|
|
# network, station and stream information
|
|
if info_enabled():
|
|
info(
|
|
f"Found data for {len(networks)} networks, {len(stations)} stations "
|
|
f"and {len(streams)} streams",
|
|
)
|
|
if debug_enabled() and streams:
|
|
streamList = "\n + ".join(streams)
|
|
debug(f"streams:\n + {streamList}")
|
|
|
|
# sort records by time only
|
|
if buf_len > 1:
|
|
info(f"Sorting {buf_len} records")
|
|
rec_buf.sort()
|
|
|
|
# write sorted records, count duplicates and optional remove them
|
|
info(f"Writing {buf_len} records")
|
|
prev_rec = None
|
|
duplicates = 0
|
|
|
|
if outputFile:
|
|
print(f"Output data to file: {outputFile}", file=sys.stderr)
|
|
try:
|
|
out = open(outputFile, "wb")
|
|
except Exception:
|
|
print("Cannot create output file {outputFile}", file=sys.stderr)
|
|
return -1
|
|
else:
|
|
out = sys.stdout.buffer
|
|
|
|
for _t, rec in rec_buf:
|
|
if rec == prev_rec:
|
|
duplicates += 1
|
|
if opt.uniqueness:
|
|
continue
|
|
else:
|
|
prev_rec = rec
|
|
|
|
out.write(rec)
|
|
|
|
# statistics about records written
|
|
if info_enabled():
|
|
records_written = buf_len - duplicates if opt.uniqueness else buf_len
|
|
msg = f"""Wrote {records_written} records
|
|
+ time window: {time2str(buf_min)}~{time2str(buf_max)}"""
|
|
|
|
if opt.uniqueness:
|
|
msg += f"""
|
|
+ found and removed {duplicates} duplicate records"""
|
|
elif not duplicates:
|
|
msg += """
|
|
+ no duplicate records found"""
|
|
|
|
if opt.ignore:
|
|
msg += f"""
|
|
+ {records_empty} empty records found and ignored"""
|
|
|
|
info(msg)
|
|
|
|
# additional warning output
|
|
if records_empty and not opt.ignore:
|
|
warning(f"Found {records_empty} empty records - remove with: scmssort -i")
|
|
|
|
# This is an important hint which should always be printed
|
|
if duplicates > 0 and not opt.uniqueness:
|
|
warning(f"Found {duplicates} duplicate records - remove with: scmssort -u")
|
|
|
|
return 0
|
|
|
|
|
|
if __name__ == "__main__":
|
|
sys.exit(main())
|