mysql8で使えるようになった正規表現関数を、UDFで似たようなものを作って5.7でも使えるようにしてみた

Last updated at 2020-06-22 / Posted at 2020-06-21

私はC言語とMySQLの初心者です。実際のMySQL 8の関数とは動きが異なる部分もあります。自分の勉強のためにUDFを作成してみただけの投稿ですので、ご注意ください。

mysql8の正規表現関数の参考
https://dev.mysql.com/doc/refman/8.0/en/regexp.html

8で使えるようになったこちらの関数（と似たような動きの関数）を5.7でも動作させてみました

REGEXP_LIKE() 一致していたら1、不一致だったら0を返す
REGEXP_SUBSTR() 一致していた部分を抜き出す
REGEXP_REPLACE() 一致していたら置換する
REGEXP_INSTR() 一致していた文字の開始インデックスを返す

作成したもの

regex.c

# include <mysql.h>
# include <regex.h>
# include <stdio.h>
# include <stdlib.h>
# include <string.h>

/*
 * Single file-scope regex_t object.
 * Any UDF instance whose initid->ptr is pointed at this object shares it with
 * every other such instance, so they would all compile into, execute from and
 * free the same storage.
 * NOTE(review): the MySQL server can evaluate UDFs for several connections at
 * the same time -- confirm whether sharing one regex_t is acceptable.
 */
regex_t regex;

/*
 * Returns a heap-allocated, NUL-terminated copy of the first `len` bytes of
 * `src`, or NULL if the allocation fails. The caller owns the copy and must
 * free() it.
 *
 * MySQL passes string arguments as (pointer, length) pairs and does not
 * promise a terminating NUL, while regcomp()/regexec() need C strings.
 */
static char *rx_like_copy_arg(const char *src, unsigned long len) {
    char *dst = malloc((size_t) len + 1);

    if (dst != NULL) {
        memcpy(dst, src, len);
        dst[len] = '\0';
    }
    return dst;
}

/*
 * regexp_like_init - checks the call signature of REGEXP_LIKE(expr, pattern).
 *
 * Returns 0 when exactly two string arguments were supplied; otherwise copies
 * an error text into `message` and returns 1.
 */
my_bool regexp_like_init(UDF_INIT *initid, UDF_ARGS *args, char *message) {
    if (args->arg_count != 2) {
        strcpy(message, "引数の数は2つにしてください");
        return 1;
    }
    if ((args->arg_type[0] != STRING_RESULT)
        || (args->arg_type[1] != STRING_RESULT)) {
        strcpy(message, "引数は (string, string) です");
        return 1;
    }
    /* The row function returns NULL whenever one of its arguments is NULL. */
    initid->maybe_null = 1;
    /* No per-statement state is kept; everything lives inside one row call. */
    initid->ptr = NULL;
    return 0;
}

/*
 * regexp_like - returns 1 when `pattern` (args[1]) matches anywhere in `expr`
 * (args[0]), 0 when it does not.
 *
 * The pattern is a POSIX extended regular expression compiled with
 * REG_ICASE | REG_NEWLINE. Matching stops at the first NUL byte contained in
 * `expr`, because the POSIX API works on C strings.
 *
 * Sets *is_null when either argument is NULL for this row.
 * Sets *error when the pattern does not compile or memory runs out.
 */
long long regexp_like(
    UDF_INIT *initid,
    UDF_ARGS *args,
    char *is_null,
    char *error
) {
    regex_t re;
    char *subject;
    char *pattern;
    long long is_match = 0;

    (void) initid;

    /*
     * A NULL value for the current row is signalled by a NULL args pointer.
     * maybe_null[] only says whether an argument *could* be NULL, so testing
     * it here would turn every nullable column into a NULL result.
     */
    if ((args->args[0] == NULL) || (args->args[1] == NULL)) {
        *is_null = 1;
        return 0;
    }

    subject = rx_like_copy_arg(args->args[0], args->lengths[0]);
    pattern = rx_like_copy_arg(args->args[1], args->lengths[1]);
    if ((subject == NULL) || (pattern == NULL)) {
        *error = 1;
    } else if (regcomp(&re, pattern, REG_EXTENDED | REG_ICASE | REG_NEWLINE) != 0) {
        /* The regular expression failed to compile. */
        *error = 1;
    } else {
        is_match = (regexec(&re, subject, 0, NULL, 0) == 0) ? 1 : 0;
        /* Release the compiled pattern on every row so nothing accumulates. */
        regfree(&re);
    }

    free(pattern);
    free(subject);
    return is_match;
}

/*
 * regexp_like_deinit - nothing to release: the row function frees everything
 * it allocates before returning.
 */
void regexp_like_deinit(UDF_INIT *initid) {
    (void) initid;
}

my_bool regexp_substr_init(UDF_INIT *initid, UDF_ARGS *args, char *message) {
    if ((args->arg_count < 2) || (args->arg_count > 4)) {
        strcpy(message, "引数の数は2つ以上,4つ以下にしてください");
        return 1;
    }
    if ((args->arg_type[0] != STRING_RESULT)
        || (args->arg_type[1] != STRING_RESULT)
        || ((args->arg_count >= 3) && (args->arg_type[2] != INT_RESULT))
        || ((args->arg_count == 4) && (args->arg_type[3] != INT_RESULT))) {
        strcpy(message, "引数は (string, string[, integer[, integer]]) です");
        return 1;
    }
    if (args->arg_count >= 3) {
        if (0 >= *((long long*) args->args[2])) {
            strcpy(message, "開始位置が0以下");
            return 1;
        }
        if ((args->args[0] != NULL)
            && (strlen(args->args[0]) < *((long long*) args->args[2]))) {
            strcpy(message, "開始位置が文字数よりも多い");
            return 1;
        }
    }
    if (args->arg_count == 4) {
        if (0 >= *((long long*) args->args[3])) {
            strcpy(message, "発生位置が0以下");
            return 1;
        }
    }
    initid->ptr = (char *) &regex;
    return 0;
}
char *regexp_substr(
    UDF_INIT *initid,
    UDF_ARGS *args,
    char *result,
    unsigned long *length,
    char *is_null,
    char *error
) {
    regmatch_t regmatch;
    long max_length = args->lengths[0];
    char *expr = args->args[0];
    char *patten = args->args[1];
    long long position = (args->arg_count >= 3) ? *((long long*) args->args[2]) - 1 : 0;
    long long occurrence = (args->arg_count == 4) ? *((long long*) args->args[3]) : 1;
    char tmp_expr[max_length];
    int expr_length;
    int i = 0;
    int is_match;
    long long start;
    long long end;
    char match[max_length];

    if (args->maybe_null[0] || args->maybe_null[1]) {
        // 引数のどちらかが NULL だったら NULL を返す
        *is_null = 1;
        return result;
    }

    if (regcomp((regex_t *) initid->ptr, patten, REG_EXTENDED | REG_ICASE | REG_NEWLINE) != 0) {
        // 正規表現のコンパイル失敗
        *error = 1;
        return result;
    }

    // 開始位置からの文字列を取得
    expr_length = strlen(expr);
    strncpy(tmp_expr, expr + position, expr_length - position);
    tmp_expr[expr_length - position] = '\x0';
    strcpy(expr, tmp_expr);

    do {
        is_match = (regexec((regex_t *) initid->ptr, expr, 1, &regmatch, 0) == 0);
        if (is_match) {
            // 一致あり
            i++;
            start = regmatch.rm_so;
            end = regmatch.rm_eo;
            if (occurrence > i) {
                expr_length = strlen(expr);
                strncpy(tmp_expr, expr + end, expr_length - end);
                tmp_expr[expr_length - end] = '\x0';
                strcpy(expr, tmp_expr);
                continue;
            }

            strncpy(match, expr + start, end - start);
            match[end - start] = '\x0';
            break;
        }

        // 一致なし
        *is_null = 1;
        return result;
    } while (1);

    strcpy(result, match);
    *length = strlen(result);
    return result;
}
void regexp_substr_deinit(UDF_INIT *initid) {
    regfree((regex_t *) initid->ptr);
}

my_bool regexp_replace_init(UDF_INIT *initid, UDF_ARGS *args, char *message) {
    if ((args->arg_count < 3) || (args->arg_count > 5)) {
        strcpy(message, "引数の数は3つ以上,5つ以下にしてください");
        return 1;
    }
    if ((args->arg_type[0] != STRING_RESULT)
        || (args->arg_type[1] != STRING_RESULT)
        || (args->arg_type[2] != STRING_RESULT)
        || ((args->arg_count >= 4) && (args->arg_type[3] != INT_RESULT))
        || ((args->arg_count == 5) && (args->arg_type[4] != INT_RESULT))) {
        strcpy(message, "引数は (string, string, string[, integer[, integer]]) です");
        return 1;
    }
    if (args->arg_count >= 4) {
        if (0 >= *((long long*) args->args[3])) {
            strcpy(message, "開始位置が0以下");
            return 1;
        }
        if ((args->args[0] != NULL)
            && (strlen(args->args[0]) < *((long long*) args->args[3]))) {
            strcpy(message, "開始位置が文字数よりも多い");
            return 1;
        }
    }
    if (args->arg_count == 5) {
        if (0 > *((long long*) args->args[4])) {
            strcpy(message, "発生位置が0より小さい");
            return 1;
        }
    }
    initid->ptr = (char *) &regex;
    return 0;
}
char *regexp_replace(
    UDF_INIT *initid,
    UDF_ARGS *args,
    char *result,
    unsigned long *length,
    char *is_null,
    char *error
) {
    regmatch_t regmatch;
    char *expr = args->args[0];
    char *patten = args->args[1];
    char *replacement = args->args[2];
    long long position = (args->arg_count >= 4) ? *((long long*) args->args[3]) - 1 : 0;
    long long occurrence = (args->arg_count == 5) ? *((long long*) args->args[4]) : 0;
    char result_expr[1000];
    char tmp_expr[1000];
    char start_expr[1000];
    char match_expr[1000];
    char end_expr[1000];
    int expr_length;
    int i = 0;
    int is_match;
    long long start;
    long long end;

    if (args->maybe_null[0] || args->maybe_null[1] || args->maybe_null[2]) {
        // 引数のどちらかが NULL だったら NULL を返す
        *is_null = 1;
        return result;
    }

    if (regcomp((regex_t *) initid->ptr, patten, REG_EXTENDED | REG_ICASE | REG_NEWLINE) != 0) {
        // 正規表現のコンパイル失敗
        *error = 1;
        return result;
    }

    expr_length = strlen(expr);

    strncpy(tmp_expr, expr, position);
    tmp_expr[position] = '\x0';
    strcpy(result_expr, tmp_expr);

    strncpy(tmp_expr, expr + position, expr_length - position);
    tmp_expr[expr_length - position] = '\x0';
    strcpy(expr, tmp_expr);

    do {
        is_match = (regexec((regex_t *) initid->ptr, expr, 1, &regmatch, 0) == 0);
        if (is_match) {
            // 一致あり
            i++;
            start = regmatch.rm_so;
            end = regmatch.rm_eo;

            expr_length = strlen(expr);

            strncpy(start_expr, expr, start);
            start_expr[start] = '\x0';
            strncpy(match_expr, expr + start, end - start);
            match_expr[end - start] = '\x0';
            strncpy(end_expr, expr + end, expr_length - end);
            end_expr[expr_length - end] = '\x0';

            if ((occurrence == i) || (occurrence == 0)) {
                // 発生位置が指定されていた場合はそこで置換
                // 発生位置が0で指定されていた場合は全て置換
                sprintf(tmp_expr, "%s%s", start_expr, replacement);
            } else {
                // 発生位置が指定されてるところではなかった場合は置換しない
                sprintf(tmp_expr, "%s%s", start_expr, match_expr);
            }
            strcat(result_expr, tmp_expr);
            strcpy(expr, end_expr);
            continue;
        }

        // 一致がなくなったらループを抜ける
        strcat(result_expr, expr);
        break;
    } while (1);

    strcpy(result, result_expr);
    *length = strlen(result);
    return result;
}
void regexp_replace_deinit(UDF_INIT *initid) {
    regfree((regex_t *) initid->ptr);
}

my_bool regexp_instr_init(UDF_INIT *initid, UDF_ARGS *args, char *message) {
    if ((args->arg_count < 2) || (args->arg_count > 5)) {
        strcpy(message, "引数の数は2つ以上,5つ以下にしてください");
        return 1;
    }
    if ((args->arg_type[0] != STRING_RESULT)
        || (args->arg_type[1] != STRING_RESULT)
        || ((args->arg_count >= 3) && (args->arg_type[2] != INT_RESULT))
        || ((args->arg_count >= 4) && (args->arg_type[3] != INT_RESULT))
        || ((args->arg_count == 5) && (args->arg_type[4] != INT_RESULT))) {
        strcpy(message, "引数は (string, string[, integer[, integer[, integer]]]) です");
        return 1;
    }
    if (args->arg_count >= 3) {
        if (0 >= *((long long*) args->args[2])) {
            strcpy(message, "開始位置が0以下");
            return 1;
        }
        if ((args->args[0] != NULL)
            && (strlen(args->args[0]) < *((long long*) args->args[2]))) {
            strcpy(message, "開始位置が文字数よりも多い");
            return 1;
        }
    }
    if (args->arg_count >= 4) {
        if (0 >= *((long long*) args->args[3])) {
            strcpy(message, "発生位置が0以下");
            return 1;
        }
    }
    if (args->arg_count == 5) {
        if ((*((long long*) args->args[4]) != 0) && (*((long long*) args->args[4]) != 1)) {
            strcpy(message, "return_optionが不正");
            return 1;
        }
    }
    initid->ptr = (char *) &regex;
    return 0;
}
long long regexp_instr(
    UDF_INIT *initid,
    UDF_ARGS *args,
    char *is_null,
    char *error
) {
    regmatch_t regmatch;
    long max_length = args->lengths[0];
    char *expr = args->args[0];
    char *patten = args->args[1];
    long long position = (args->arg_count >= 3) ? *((long long*) args->args[2]) - 1 : 0;
    long long occurrence = (args->arg_count >= 4) ? *((long long*) args->args[3]) : 1;
    long long return_option = (args->arg_count == 5) ? *((long long*) args->args[4]) : 0;
    long long res_position = 1;
    char tmp_expr[max_length];
    int expr_length;
    int i = 0;
    long long is_match;
    long long start;
    long long end;

    if (args->maybe_null[0] || args->maybe_null[1]) {
        // 引数のどちらかが NULL だったら NULL を返す
        *is_null = 1;
        return 0;
    }

    if (regcomp((regex_t *) initid->ptr, patten, REG_EXTENDED | REG_ICASE | REG_NEWLINE) != 0) {
        // 正規表現のコンパイル失敗
        *error = 1;
        return 0;
    }

    expr_length = strlen(expr);

    // 開始位置からの文字列を取得
    strncpy(tmp_expr, expr + position, expr_length - position);
    tmp_expr[expr_length - position] = '\x0';
    strcpy(expr, tmp_expr);

    res_position += position;

    do {
        is_match = (regexec((regex_t *) initid->ptr, expr, 1, &regmatch, 0) == 0);
        if (is_match) {
            // 一致あり
            i++;
            start = regmatch.rm_so;
            end = regmatch.rm_eo;
            if (occurrence > i) {
                expr_length = strlen(expr);
                strncpy(tmp_expr, expr + end, expr_length - end);
                tmp_expr[expr_length - end] = '\x0';
                strcpy(expr, tmp_expr);
                res_position += end;
                continue;
            }

            res_position += (return_option ? end : start);
            break;
        }

        // 一致なし
        return 0;
    } while (1);

    return res_position;
}
void regexp_instr_deinit(UDF_INIT *initid) {
    regfree((regex_t *) initid->ptr);
}

コンパイル

$ mysql_config --cflags #mysql.hがあるパスを確認
-I/usr/local/opt/mysql@5.7/include/mysql
$ mysql_config --plugindir #プラグインのパスを確認
/usr/local/opt/mysql@5.7/lib/plugin

$ gcc regex.c -shared \
 -o /usr/local/opt/mysql@5.7/lib/plugin/regex.so \
 -I /usr/local/opt/mysql@5.7/include/mysql/ #コンパイル

c言語の関数をmysqlで使えるように↓を実行する

-- Register the four UDFs from regex.so. Each one is dropped first so the
-- script can be re-run. The RETURNS type mirrors the C return type:
-- integer for the long long functions, string for the char * functions.

-- regexp_like: 1 on match, 0 otherwise.
drop function if exists regexp_like;
create function regexp_like returns integer soname 'regex.so';

-- regexp_substr: the matched text.
drop function if exists regexp_substr;
create function regexp_substr returns string soname 'regex.so';

-- regexp_replace: the input with matches replaced.
drop function if exists regexp_replace;
create function regexp_replace returns string soname 'regex.so';

-- regexp_instr: position of the match.
drop function if exists regexp_instr;
create function regexp_instr returns integer soname 'regex.so';

mysql> select version();
+-----------+
| version() |
+-----------+
| 5.7.29    |
+-----------+
1 row in set (0.00 sec)

mysql> drop function if exists regexp_like;
Query OK, 0 rows affected, 1 warning (0.00 sec)

mysql> create function regexp_like returns integer soname 'regex.so';
Query OK, 0 rows affected (0.01 sec)

mysql> drop function if exists regexp_substr;
Query OK, 0 rows affected, 1 warning (0.00 sec)

mysql> create function regexp_substr returns string soname 'regex.so';
Query OK, 0 rows affected (0.00 sec)

mysql> drop function if exists regexp_replace;
Query OK, 0 rows affected, 1 warning (0.00 sec)

mysql> create function regexp_replace returns string soname 'regex.so';
Query OK, 0 rows affected (0.00 sec)

mysql> drop function if exists regexp_instr;
Query OK, 0 rows affected, 1 warning (0.00 sec)

mysql> create function regexp_instr returns integer soname 'regex.so';
Query OK, 0 rows affected (0.00 sec)

動作を確認

-- regexp_like(expr, pattern): one matching and one non-matching input.
select
  regexp_like('abc', '[a-z]{3}') 一致
  ,regexp_like('ab', '[a-z]{3}') 不一致;

-- regexp_substr(expr, pattern[, position[, occurrence]]):
-- default call, explicit start positions, and explicit occurrences.
select 
  regexp_substr('abc def ghi', '[a-z]+') デフォルト
  ,regexp_substr('abc def ghi', '[a-z]+', 1) 1文字目から検索
  ,regexp_substr('abc def ghi', '[a-z]+', 5) 5文字目から検索
  ,regexp_substr('abc def ghi', '[a-z]+', 1, 1) 1回目に出現
  ,regexp_substr('abc def ghi', '[a-z]+', 1, 2) 2回目に出現;

-- regexp_replace(expr, pattern, replacement[, position[, occurrence]]):
-- occurrence 0 replaces every match, occurrence N replaces only the N-th.
select 
  regexp_replace('abc def ghi', '[a-z]+', 'xxx') デフォルト
  ,regexp_replace('abc def ghi', '[a-z]+', 'xxx', 5) 5文字目から検索
  ,regexp_replace('abc def ghi', '[a-z]+', 'xxx', 9) 9文字目から検索
  ,regexp_replace('abc def ghi', '[a-z]+', 'xxx', 1, 0) 0は全て置換
  ,regexp_replace('abc def ghi', '[a-z]+', 'xxx', 1, 3) 3回目を置換;

-- regexp_instr(expr, pattern[, position[, occurrence[, return_option]]]):
-- return_option 0 gives the start of the match, 1 the position after it.
select 
  regexp_instr('abc def ghi', '[a-z]{3}') デフォルト
  ,regexp_instr('abc def ghi', '[a-z]{3}', 7) 9文字目から
  ,regexp_instr('abc def ghi', '[a-z]{3}', 1, 3) 3回目に出現
  ,regexp_instr('abc def ghi', '[a-z]{3}', 1, 1, 0) 一致した位置
  ,regexp_instr('abc def ghi', '[a-z]{3}', 1, 1, 1) 一致した後の位置;

mysql> select version();
+-----------+
| version() |
+-----------+
| 5.7.29    |
+-----------+
1 row in set (0.00 sec)

mysql> select
    ->   regexp_like('abc', '[a-z]{3}') 一致
    ->   ,regexp_like('ab', '[a-z]{3}') 不一致;
+--------+-----------+
| 一致   | 不一致    |
+--------+-----------+
|      1 |         0 |
+--------+-----------+
1 row in set (0.00 sec)

mysql> select 
    ->   regexp_substr('abc def ghi', '[a-z]+') デフォルト
    ->   ,regexp_substr('abc def ghi', '[a-z]+', 1) 1文字目から検索
    ->   ,regexp_substr('abc def ghi', '[a-z]+', 5) 5文字目から検索
    ->   ,regexp_substr('abc def ghi', '[a-z]+', 1, 1) 1回目に出現
    ->   ,regexp_substr('abc def ghi', '[a-z]+', 1, 2) 2回目に出現;
+-----------------+------------------------+------------------------+------------------+------------------+
| デフォルト      | 1文字目から検索        | 5文字目から検索        | 1回目に出現      | 2回目に出現      |
+-----------------+------------------------+------------------------+------------------+------------------+
| abc             | abc                    | def                    | abc              | def              |
+-----------------+------------------------+------------------------+------------------+------------------+
1 row in set (0.00 sec)

mysql> select 
    ->   regexp_replace('abc def ghi', '[a-z]+', 'xxx') デフォルト
    ->   ,regexp_replace('abc def ghi', '[a-z]+', 'xxx', 5) 5文字目から検索
    ->   ,regexp_replace('abc def ghi', '[a-z]+', 'xxx', 9) 9文字目から検索
    ->   ,regexp_replace('abc def ghi', '[a-z]+', 'xxx', 1, 0) 0は全て置換
    ->   ,regexp_replace('abc def ghi', '[a-z]+', 'xxx', 1, 3) 3回目を置換;
+-----------------+------------------------+------------------------+------------------+------------------+
| デフォルト      | 5文字目から検索        | 9文字目から検索        | 0は全て置換      | 3回目を置換      |
+-----------------+------------------------+------------------------+------------------+------------------+
| xxx xxx xxx     | abc xxx xxx            | abc def xxx            | xxx xxx xxx      | abc def xxx      |
+-----------------+------------------------+------------------------+------------------+------------------+
1 row in set (0.00 sec)

mysql> select 
    ->   regexp_instr('abc def ghi', '[a-z]{3}') デフォルト
    ->   ,regexp_instr('abc def ghi', '[a-z]{3}', 7) 9文字目から
    ->   ,regexp_instr('abc def ghi', '[a-z]{3}', 1, 3) 3回目に出現
    ->   ,regexp_instr('abc def ghi', '[a-z]{3}', 1, 1, 0) 一致した位置
    ->   ,regexp_instr('abc def ghi', '[a-z]{3}', 1, 1, 1) 一致した後の位置;
+-----------------+------------------+------------------+--------------------+--------------------------+
| デフォルト      | 9文字目から      | 3回目に出現      | 一致した位置       | 一致した後の位置         |
+-----------------+------------------+------------------+--------------------+--------------------------+
|               1 |                9 |                9 |                  1 |                        4 |
+-----------------+------------------+------------------+--------------------+--------------------------+
1 row in set (0.00 sec)

見ていただいてありがとうございましたm(_ _)m

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up