2025/bin/gempa-check-database

#!/usr/bin/env seiscomp-python

############################################################################
# Copyright (C) 2021 by gempa GmbH                                         #
#                                                                          #
# All Rights Reserved.                                                     #
#                                                                          #
# NOTICE: All information contained herein is, and remains                 #
# the property of gempa GmbH and its suppliers, if any. The intellectual   #
# and technical concepts contained herein are proprietary to gempa GmbH    #
# and its suppliers.                                                       #
# Dissemination of this information or reproduction of this material       #
# is strictly forbidden unless prior written permission is obtained        #
# from gempa GmbH.                                                         #
#                                                                          #
#  Author: Stephan Herrnkind                                               #
#  Email: herrnkind@gempa.de                                               #
############################################################################

import os
import sys
from collections import OrderedDict

from seiscomp import client, logging


def writeUpdateStatements(database, tables, charset, directory="/tmp"):
    """Write an SQL migration script converting a database to *charset*.

    The script contains one ALTER DATABASE statement, one
    ALTER TABLE ... CONVERT TO CHARACTER SET statement per table and, after
    an empty line, one ANALYZE TABLE statement per table. The collation is
    always the binary collation of the character set (``<charset>_bin``).

    Args:
        database: Name of the database (schema) to convert.
        tables: Iterable of table names located in ``database``.
        charset: Target character set name, e.g. ``utf8mb4``.
        directory: Directory the script is written to. Defaults to ``/tmp``.

    Returns:
        Path of the written file:
        ``<directory>/update-mysql-charset-<charset>-<database>.sql``.
        An existing file of the same name is overwritten.
    """

    def quoteIdentifier(identifier):
        # Inside a backtick-quoted MySQL identifier a literal backtick has
        # to be doubled, otherwise the generated statement is malformed.
        return "`" + str(identifier).replace("`", "``") + "`"

    # The names are needed twice (ALTER and ANALYZE), so materialize them to
    # also support one-shot iterables such as generators.
    tableNames = [quoteIdentifier(table) for table in tables]
    dbName = quoteIdentifier(database)

    filename = os.path.join(
        directory, f"update-mysql-charset-{charset}-{database}.sql"
    )
    with open(filename, "w", encoding="utf8") as f:
        print(
            f"ALTER DATABASE {dbName} "
            f"CHARACTER SET {charset} COLLATE {charset}_bin;",
            file=f,
        )
        for table in tableNames:
            print(
                f"ALTER TABLE {dbName}.{table} "
                f"CONVERT TO CHARACTER SET {charset} COLLATE {charset}_bin;",
                file=f,
            )
        print("", file=f)
        for table in tableNames:
            print(f"ANALYZE TABLE {dbName}.{table};", file=f)

    return filename


def checkBinaryCollation(charset, collation):
    """Return True if *collation* is the binary collation of *charset*.

    The binary collation of a character set is expected to be named
    ``<charset>_bin``; any other collation name yields False.
    """
    expected = "{}_bin".format(charset)
    return expected == collation


class CheckDatabase(client.Application):
    """Application checking character sets and collations of a MySQL database.

    The check reads the character set and collation configured at database,
    table and column level from ``information_schema``. It reports an issue
    if the ``utf8mb4`` character set is not in use, if more than one
    character set is in use, or if a collation other than the binary
    collation of its character set is found. For each issue one or more SQL
    migration scripts are written via ``writeUpdateStatements``.
    """

    def __init__(self, argc, argv):
        """Create the application and set its messaging/database options."""
        super().__init__(argc, argv)
        self.setDaemonEnabled(False)
        self.setMessagingEnabled(True)
        self.setDatabaseEnabled(True, True)
        self.setConnectionRetries(0)
        self.setLoggingToStdErr(True)

    def validateParameters(self):
        """Validate the parameters and skip messaging if a database is given.

        Returns:
            False if the base class validation fails, True otherwise.
        """
        if not super().validateParameters():
            return False

        # Disable messaging if database connection string is provided
        if self.databaseURI():
            self.setMessagingEnabled(False)

        return True

    def printUsage(self):
        """Print the usage line, the base class usage and examples."""
        script = os.path.basename(__file__)

        print(
            f"""Usage:
  {script} [options]"""
        )

        client.Application.printUsage(self)

        print(
            f"""Examples:
Run the script getting the database parameters from default messaging
  {script}

Run the script specifiying the database parameters
  {script} --debug -d mysql://sysop:sysop@localhost/seiscomp"""
        )

    @staticmethod
    def _fetchRows(db, query, columns):
        """Run *query* and return all result rows as tuples of strings.

        Args:
            db: Database driver providing beginQuery(), fetchRow(),
                getRowFieldCount(), getRowFieldString() and endQuery().
            query: SQL statement to execute.
            columns: Number of fields each result row must have.

        Returns:
            List of ``columns``-sized tuples (empty if the query yields no
            rows), or None if the query could not be started or a row has
            an unexpected number of fields.
        """
        if not db.beginQuery(query):
            return None

        # Once the query was started it is always ended again, including the
        # failure paths, so that the result set is released before the next
        # query is issued.
        try:
            rows = []
            while db.fetchRow():
                if db.getRowFieldCount() != columns:
                    return None
                rows.append(
                    tuple(db.getRowFieldString(i) for i in range(columns))
                )
            return rows
        finally:
            db.endQuery()

    def run(self):
        """Run the database checks and print the result.

        Returns:
            True if no issue was found or if the database type is not
            ``mysql`` (no tests available). False if no database connection
            is available, a query failed or returned no rows, or at least
            one issue was found.
        """
        if not self.query():
            logging.error("No database connection available")
            return False

        dbType = self.databaseType()

        if dbType and dbType != "mysql":
            print(f"No tests for database type {dbType} available.")
            return True

        db = self.query().driver()

        # query database name
        rows = self._fetchRows(db, "SELECT DATABASE()", 1)
        if not rows or not rows[0][0]:
            logging.error("Could not query database name")
            return False

        # The name is the server's own DATABASE() result; it is interpolated
        # unescaped into the information_schema queries below.
        dbName = rows[0][0]
        logging.info(f"Checking encoding of database: {dbName}")

        # collect charset found at table and column level
        # NOTE(review): the database default charset queried next is only
        # used for the collation check and is not added to this set -
        # confirm whether that is intended.
        charsets = set()

        # select default database character set and collation
        q = (
            "SELECT default_character_set_name, default_collation_name "
            "FROM information_schema.SCHEMATA "
            f"WHERE schema_name = '{dbName}'"
        )
        rows = self._fetchRows(db, q, 2)
        if not rows:
            logging.error("Could not query default database charset and collation")
            return False

        charset, collation = rows[0]
        binCollation = checkBinaryCollation(charset, collation)
        logging.debug(f"{dbName:<48}{charset} -> {collation}")

        # select default table character set and collation
        q = (
            "SELECT T.table_name, CCSA.character_set_name, CCSA.collation_name "
            "FROM information_schema.`TABLES` T, "
            "information_schema.`COLLATION_CHARACTER_SET_APPLICABILITY` CCSA "
            "WHERE CCSA.collation_name = T.table_collation AND "
            f"T.table_schema = '{dbName}' "
            "ORDER BY T.table_name"
        )
        rows = self._fetchRows(db, q, 3)
        if not rows:
            logging.error("Could not query default charset and collation of tables")
            return False

        # table name -> (charset, collation), in query order
        tables = OrderedDict()
        for table, charset, collation in rows:
            tables[table] = (charset, collation)
            charsets.add(charset)
            binCollation = binCollation and checkBinaryCollation(charset, collation)

        # select charset and collation of all tables and columns
        q = (
            "SELECT table_name, column_name, character_set_name, collation_name "
            "FROM information_schema.`COLUMNS` "
            f"WHERE table_schema = '{dbName}' "
            "ORDER BY table_name, column_name"
        )
        rows = self._fetchRows(db, q, 4)
        if not rows:
            logging.error("Could not query charset and collation of columns")
            return False

        prevTable = None
        for table, col, charset, collation in rows:
            # Rows are ordered by table: log the table line once when the
            # table name changes.
            if prevTable != table:
                if table not in tables:
                    # table missing in the result of the table query above
                    tables[table] = ("?", "?")
                tCharset, tCollation = tables[table]
                logging.debug(f"    {table:<44}{tCharset} -> {tCollation}")
                prevTable = table

            # only columns reporting a character set take part in the check
            if charset:
                logging.debug(f"        {col:<40}{charset} -> {collation}")

                charsets.add(charset)
                binCollation = binCollation and checkBinaryCollation(charset, collation)

        issues = []
        utf8mb4 = "utf8mb4"
        tableNames = list(tables)

        if utf8mb4 not in charsets:
            filename = writeUpdateStatements(dbName, tableNames, utf8mb4)
            issues.append(
                f"Your database is not configured with the {utf8mb4} character set. "
                "Certain unicode characters may not be stored correctly. Consider "
                f"applying the migrations in:\n    - {filename}"
            )

        noBinText = (
            "Found collation other than 'binary'. Case-insensitive collations should "
            "be avoided because they may lead to publicID collisions. "
        )

        if len(charsets) > 1:
            filenames = []
            # Iterate in sorted order: the iteration order of a set of
            # strings may differ between runs, the report should not.
            for charset in sorted(charsets):
                filename = writeUpdateStatements(dbName, tableNames, charset)
                if charset == utf8mb4:
                    filename += " (preferred)"
                filenames.append(filename)
            fileNamesText = "\n    - ".join(filenames)
            issues.append(
                "Found more than one character set. It is recommended to use the same "
                f"character set across all tables. {'' if binCollation else noBinText}"
                "Consider applying the migrations in one of the following files:"
                f"\n    - {fileNamesText}"
            )
        elif not binCollation:
            # exactly one charset is in use: keep it, switch to its binary
            # collation
            filename = writeUpdateStatements(dbName, tableNames, next(iter(charsets)))
            issues.append(
                f"{noBinText}Consider applying the migrations in:\n    - {filename}"
            )

        if issues:
            print("Found database issues:")
            for issue in issues:
                print(f"  * {issue}")

            print(
                """
Update instructions:
  * Stop scmaster
  * Ensure, no other modules like scdb, scardac, etc. or custom tools from internal or
    external clients attempt accessing the database.
  * Login to your database, e.g.:
      mysql -u sysop -p
  * Source one of the suggested update scripts:
      SOURCE /tmp/update-mysql-charset-CHARACTERSET-DATABASE.sql"""
            )

            return False

        print("No database issues found.")

        return True


# Entry point: build the application and execute it
def main(argc, argv):
    """Create a CheckDatabase application and return the result of calling it.

    Args:
        argc: Number of command line arguments.
        argv: List of command line arguments.
    """
    return CheckDatabase(argc, argv)()


# Only start the application when executed as a script, not on import
if __name__ == "__main__":
    arguments = sys.argv
    sys.exit(main(len(arguments), arguments))