実装してみました
皆さんのお役に立てれば幸いです。
- 環境
$ cat /etc/lsb-release
DISTRIB_ID=Ubuntu
DISTRIB_RELEASE=18.04
DISTRIB_CODENAME=bionic
DISTRIB_DESCRIPTION="Ubuntu 18.04.2 LTS"
$ sqlite3 --version
3.29.0 2019-07-10 17:32:03 fc82b73eaac8b36950e527f12c4b5dc1e147e6f4ad2217ae43ad82882a88bfa6
$ mecab --version
mecab of 0.996
- ビルド環境の構築
  - ビルドに必要なパッケージのインストール
  - sqliteはソースからコンパイルして、$HOME/usr にインストール(fts5を有効化)
  - mecabはソースからコンパイルして、$HOME/usr にインストール
  - 辞書はmecab-ipadic-neologdを使用
- ソース:fts5_mecab.c
https://github.com/thino-rma/fts5_mecab
備忘のためコメントを大量に入れてあります。
ビルド方法は後述
/*
** ** Implementation Help **
** https://www.sqlite.org/loadext.html
** 4. Programming Loadable Extensions
** A template loadable extension contains the following three elements:
** 1. Use "#include <sqlite3ext.h>" at the top of your source code files instead of "#include <sqlite3.h>".
** 2. Put the macro "SQLITE_EXTENSION_INIT1" on a line by itself right after the "#include <sqlite3ext.h>" line.
*/
#include <sqlite3ext.h>
SQLITE_EXTENSION_INIT1
#include <assert.h>
#include <mecab.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
** ** Implementation Help **
** https://www.sqlite.org/fts5.html
** 7. Extending FTS5
** Before a new auxiliary function or tokenizer implementation may
** be registered with FTS5, an application must obtain a pointer
** to the "fts5_api" structure. ... The following example code
** demonstrates the technique:
*/
/*
** Look up the fts5_api object of database connection db.
**
** The address of a local fts5_api pointer is bound, tagged with the
** pointer type "fts5_api_ptr", to the single parameter of
** "SELECT fts5(?1)"; stepping the statement is what is expected to fill
** that pointer in.
**
** Returns the fts5_api pointer, or NULL if the statement could not be
** prepared or the pointer was not filled in. In that case the error is
** left in the database handle (sqlite3_errcode()/sqlite3_errmsg()).
*/
fts5_api *fts5_api_from_db(sqlite3 *db){
  fts5_api *api = NULL;
  sqlite3_stmt *stmt = NULL;
  int rc = sqlite3_prepare(db, "SELECT fts5(?1)", -1, &stmt, 0);

  if( rc!=SQLITE_OK ){
    /* Nothing was stepped; release whatever the prepare left behind. */
    sqlite3_finalize(stmt);
    return api;
  }

  /* SQL parameters are numbered from 1; no destructor is registered
  ** because the bound object is a local variable. */
  sqlite3_bind_pointer(stmt, 1, (void*)&api, "fts5_api_ptr", NULL);
  sqlite3_step(stmt);
  sqlite3_finalize(stmt);
  return api;
}
/*
** ** Implementation Help **
** https://www.sqlite.org/fts5.html
** 7. Extending FTS5
** 7.1. Custom Tokenizers
** To create a custom tokenizer, an application must implement
** three functions: a tokenizer constructor (xCreate),
** a destructor (xDelete) and a function to do the actual
** tokenization (xTokenize). ...
**
** typedef struct Fts5Tokenizer Fts5Tokenizer;
** typedef struct fts5_tokenizer fts5_tokenizer;
**
*/
/*
** Implementation Help
** https://taku910.github.io/mecab/libmecab.html
*/
/*
** Tokenizer instance state. mecabCreate() allocates one of these and hands
** it to FTS5 cast to Fts5Tokenizer*; mecabTokenize() and mecabDelete() cast
** it back.
*/
typedef struct MecabTokenizer {
/* Not referenced by any function visible in this file. */
fts5_tokenizer base;
/* MeCab handle: created by mecab_new() in mecabCreate(), released by
** mecab_destroy() in mecabDelete(). */
mecab_t *mecab;
/* Debug level 0, 1 or 2, parsed from the "v" / "vv" tokenizer arguments;
** only consulted by code compiled with DEBUG. */
int verbose;
/* 1 when the "stop789" tokenizer argument was given; only consulted by
** code compiled with STOP789, where nodes with posid 7..9 are not emitted. */
int stop789;
} MecabTokenizer;
/*
** ** Implementation Help **
** https://www.sqlite.org/fts5.html
** 7. Extending FTS5
** 7.1. Custom Tokenizers
** To create a custom tokenizer, an application must implement
** three functions: a tokenizer constructor (xCreate),
** a destructor (xDelete) and a function to do the actual
** tokenization (xTokenize). ...
**
** struct fts5_tokenizer {
** int (*xCreate)(void*, const char **azArg, int nArg, Fts5Tokenizer **ppOut);
** ....
** }
**
** This function is used to allocate and initialize a tokenizer
** instance. A tokenizer instance is required to actually
** tokenize text.
**
** The first argument passed to this function is a copy of
** the (void*) pointer provided by the application when the
** fts5_tokenizer object was registered with FTS5 (the third
** argument to xCreateTokenizer()). The second and third
** arguments are an array of nul-terminated strings containing
** the tokenizer arguments, if any, specified following the
** tokenizer name as part of the CREATE VIRTUAL TABLE statement
** used to create the FTS5 table.
**
** The final argument is an output variable. If successful,
** (*ppOut) should be set to point to the new tokenizer handle
** and SQLITE_OK returned. If an error occurs, some value other
** than SQLITE_OK should be returned. In this case, fts5
** assumes that the final value of *ppOut is undefined.
*/
/*
** xCreate implementation: allocate a MecabTokenizer and its MeCab handle.
**
** Tokenizer arguments recognised here (azArg[0..nArg-1]):
**   "v"        raise the debug level by one, up to 2
**   "vv"       set the debug level to 2
**   "stop789"  set the stop789 flag read by mecabTokenize()
** Any other argument is reported on stdout when the debug level is already
** above 0 at that point, and is otherwise ignored by this parser.
**
** pContext is not used. On success *ppOut receives the new handle and
** SQLITE_OK is returned. SQLITE_NOMEM is returned when the allocation
** fails and SQLITE_ERROR when mecab_new() fails; *ppOut is not written in
** either case.
*/
static int mecabCreate(
  void *pContext,
  const char **azArg, int nArg,
  Fts5Tokenizer **ppOut
){
  int verbose = 0;   /* debug level: 0, 1 or 2 */
  int stop789 = 0;   /* 0: false, 1: true */
  MecabTokenizer *p;

  (void)pContext;

  /* parse args */
  for (int i = 0; i < nArg; i++) {
    if (strcmp(azArg[i], "stop789") == 0) {
      stop789 = 1;
    } else if (strcmp(azArg[i], "vv") == 0) {
      if (verbose < 2) {
        verbose = 2;
      }
    } else if (strcmp(azArg[i], "v") == 0) {
      if (verbose < 2) {
        verbose += 1;
      }
    } else {
      if (verbose > 0) {
        printf("ignored unknown option: %s\n", azArg[i]);
      }
    }
  }

#ifdef DEBUG
  if (verbose > 0) {      /* DEBUG LEVEL 1 */
    printf("mecabCreate()\n");
    printf("nArg: %d\n", nArg);
    if (verbose > 1) {    /* DEBUG LEVEL 2 */
      for (int i = 0; i < nArg; i++) {
        printf(" %d: %s\n", i, azArg[i]);
      }
    }
    printf("verbose = %d\n", verbose);
    printf("stop789 = %d\n", stop789);
  }
#endif

  p = sqlite3_malloc((int)sizeof(*p));
  if (p == NULL) {
    return SQLITE_NOMEM;
  }
  memset(p, 0, sizeof(*p));

  /* The tokenizer arguments are forwarded verbatim as MeCab's argv.
  ** NOTE(review): MeCab presumably treats argv[0] as a program name, so a
  ** MeCab option placed first would not take effect - confirm against the
  ** libmecab documentation. */
  p->mecab = mecab_new(nArg, (char**)azArg);
  if (p->mecab == NULL) {
    sqlite3_free(p);
    return SQLITE_ERROR;
  }
  p->verbose = verbose;
  p->stop789 = stop789;

#ifdef DEBUG
  /* Dictionary info */
  if (verbose > 0) {      /* DEBUG LEVEL 1 */
    const mecab_dictionary_info_t *d = mecab_dictionary_info(p->mecab);
    for (; d; d = d->next) {
      printf("mecab_dictionary_info()\n");
      printf(" filename: %s\n", d->filename);
      printf(" charset: %s\n", d->charset);
      /* size, lsize and rsize are unsigned in mecab.h, hence %u. */
      printf(" size: %u\n", d->size);
      printf(" type: %d\n", d->type);
      printf(" lsize: %u\n", d->lsize);
      printf(" rsize: %u\n", d->rsize);
      printf(" version: %d\n", d->version);
    }
  }
#endif

  *ppOut = (Fts5Tokenizer*)p;
  return SQLITE_OK;
}
/*
** ** Implementation Help **
** https://www.sqlite.org/fts5.html
** 7. Extending FTS5
** 7.1. Custom Tokenizers
** To create a custom tokenizer, an application must implement
** three functions: a tokenizer constructor (xCreate),
** a destructor (xDelete) and a function to do the actual
** tokenization (xTokenize). ...
**
** struct fts5_tokenizer {
** ....
** void (*xDelete)(Fts5Tokenizer*);
** ....
** }
**
** This function is invoked to delete a tokenizer handle
** previously allocated using xCreate(). Fts5 guarantees
** that this function will be invoked exactly once for each
** successful call to xCreate().
*/
/*
** xDelete implementation: release a handle created by mecabCreate().
** The MeCab handle is destroyed first, then the MecabTokenizer itself is
** returned to sqlite3_free().
*/
static void mecabDelete(Fts5Tokenizer *pTokenizer){
  MecabTokenizer *tok = (MecabTokenizer*)pTokenizer;

#ifdef DEBUG
  if (tok->verbose >= 1) {   /* DEBUG LEVEL 1 */
    printf("mecabDelete()\n");
  }
#endif

  mecab_destroy(tok->mecab);

  /* Reset the option fields before the memory is handed back. */
  tok->stop789 = 0;
  tok->verbose = 0;
  sqlite3_free(tok);
}
/*
** ** Implementation Help **
** https://www.sqlite.org/fts5.html
** 7. Extending FTS5
** 7.1. Custom Tokenizers
** To create a custom tokenizer, an application must implement
** three functions: a tokenizer constructor (xCreate),
** a destructor (xDelete) and a function to do the actual
** tokenization (xTokenize). ...
**
**
** struct fts5_tokenizer {
** ....
** int (*xTokenize)(Fts5Tokenizer*,
** void *pCtx,
** int flags, /* Mask of FTS5_TOKENIZE_* flags * /
** const char *pText, int nText,
** int (*xToken)(
** void *pCtx, /* Copy of 2nd argument to xTokenize() * /
** int tflags, /* Mask of FTS5_TOKEN_* flags * /
** const char *pToken, /* Pointer to buffer containing token * /
** int nToken, /* Size of token in bytes * /
** int iStart, /* Byte offset of token within input text * /
** int iEnd /* Byte offset of end of token within input text * /
** )
** );
** }
**
** This function is expected to tokenize the nText byte string
** indicated by argument pText. pText may or may not be
** nul-terminated. The first argument passed to this function
** is a pointer to an Fts5Tokenizer object returned by an
** earlier call to xCreate().
**
** The second argument indicates the reason that FTS5 is
** requesting tokenization of the supplied text. This is
** always one of the following four values:
**
** FTS5_TOKENIZE_DOCUMENT - A document is being inserted
** into or removed from the FTS table. The tokenizer is
** being invoked to determine the set of tokens to add to
** (or delete from) the FTS index.
**
** FTS5_TOKENIZE_QUERY - A MATCH query is being executed
** against the FTS index. The tokenizer is being called
** to tokenize a bareword or quoted string specified as
** part of the query.
**
** (FTS5_TOKENIZE_QUERY | FTS5_TOKENIZE_PREFIX) - Same as
** FTS5_TOKENIZE_QUERY, except that the bareword or quoted
** string is followed by a "*" character, indicating that
** the last token returned by the tokenizer will be treated
** as a token prefix.
**
** FTS5_TOKENIZE_AUX - The tokenizer is being invoked to
** satisfy an fts5_api.xTokenize() request made by an
** auxiliary function. Or an fts5_api.xColumnSize() request
** made by the same on a columnsize=0 database.
**
** For each token in the input string, the supplied callback
** xToken() must be invoked. The first argument to it should be
** a copy of the pointer passed as the second argument to
** xTokenize(). The third and fourth arguments are a pointer to
** a buffer containing the token text, and the size of the token
** in bytes. The 4th and 5th arguments are the byte offsets of
** the first byte of and first byte immediately following the
** text from which the token is derived within the input.
**
** The second argument passed to the xToken() callback
** ("tflags") should normally be set to 0. The exception is if
** the tokenizer supports synonyms. In this case see the
** discussion below for details.
**
** FTS5 assumes the xToken() callback is invoked for each token
** in the order that they occur within the input text.
**
** If an xToken() callback returns any value other than
** SQLITE_OK, then the tokenization should be abandoned and the
** xTokenize() method should immediately return a copy of the
** xToken() return value. Or, if the input buffer is exhausted,
** xTokenize() should return SQLITE_OK. Finally, if an error
** occurs with the xTokenize() implementation itself, it may
** abandon the tokenization and return any error code other
** than SQLITE_OK or SQLITE_DONE.
*/
/*
** xTokenize implementation: split pText into MeCab nodes and report each
** non-empty node to FTS5 through xToken().
**
**   pTokenizer   handle created by mecabCreate()
**   pCtx         opaque context, forwarded unchanged to xToken()
**   flags        FTS5_TOKENIZE_* mask; not used by this tokenizer
**   pText/nText  input buffer and its size in bytes. FTS5 does not promise
**                a nul terminator, so exactly nText bytes are parsed and
**                the buffer is never treated as a C string.
**   xToken       callback invoked once per emitted token, in input order
**
** Returns SQLITE_OK once every node has been visited, SQLITE_ERROR when
** MeCab fails to parse the text, or the first non-SQLITE_OK value returned
** by xToken().
**
** Token bytes are handed to xToken() directly from node->surface with an
** explicit length, so no intermediate copy buffer is needed.
*/
static int mecabTokenize(
  Fts5Tokenizer *pTokenizer,
  void *pCtx,
  int flags,                 /* Mask of FTS5_TOKENIZE_* flags */
  const char *pText, int nText,
  int (*xToken)(
    void *pCtx,              /* Copy of 2nd argument to xTokenize() */
    int tflags,              /* Mask of FTS5_TOKEN_* flags */
    const char *pToken,      /* Pointer to buffer containing token */
    int nToken,              /* Size of token in bytes */
    int iStart,              /* Byte offset of token within input text */
    int iEnd                 /* Byte offset of end of token within input text */
  )
){
  MecabTokenizer *p = (MecabTokenizer*)pTokenizer;
  const mecab_node_t *node;
  int offset = 0;            /* byte position reached within pText */
  int rc = SQLITE_OK;
#ifdef DEBUG
  int nodeCount = 0;         /* nodes visited */
  int tokenCount = 0;        /* tokens passed to xToken() */
#endif

  (void)flags;

#ifdef DEBUG
  if (p->verbose > 0) {      /* DEBUG LEVEL 1 */
    printf("mecabTokenize()\n");
    /* Bounded print: pText is not required to be nul-terminated. */
    printf("pText (nText) = %.*s (%d)\n",
           nText > 0 ? nText : 0, pText ? pText : "", nText);
  }
#endif

  /* Nothing to tokenize. */
  if (pText == NULL || nText <= 0) {
    return SQLITE_OK;
  }

  /* parse exactly nText bytes */
  node = mecab_sparse_tonode2(p->mecab, pText, (size_t)nText);
  if (node == NULL) {
    return SQLITE_ERROR;
  }

  for (; node != NULL; node = node->next) {
    int nlen = node->length;
    int iStart;

    /* rlength covers the node's text plus whatever precedes it, so the
    ** difference is the gap to skip before the token starts. */
    offset += node->rlength - nlen;
    iStart = offset;
    offset += nlen;
#ifdef DEBUG
    nodeCount += 1;
#endif

    /* Zero-length nodes (sentence start/end markers) carry no text and
    ** must not be reported as empty tokens. */
    if (nlen == 0) {
#ifdef DEBUG
      if (p->verbose > 0) {  /* DEBUG LEVEL 1 */
        printf("increment _node_count [1]: %s\n", node->feature);
      }
#endif
      continue;
    }

#ifdef DEBUG
    if (p->verbose > 1) {    /* DEBUG LEVEL 2 */
      printf("node info\n");
      printf(" feature = %s\n", node->feature);
      printf(" surface = %.*s\n", nlen, node->surface);
      printf(" length = %d\n", node->length);
      printf(" rlength = %d\n", node->rlength);
      printf(" posid = %d\n", node->posid);
      printf(" char_type = %d\n", node->char_type);
      printf(" stat = %d\n", node->stat);
      printf("--------------\n");
    }
#endif

#ifdef STOP789
    /* With the "stop789" argument, nodes whose posid is 7, 8 or 9 are
    ** dropped from the token stream. */
    if (p->stop789 && node->posid >= 7 && node->posid <= 9) {
#ifdef DEBUG
      if (p->verbose > 0) {  /* DEBUG LEVEL 1 */
        printf("increment _node_count [3]: %s\n", node->feature);
      }
#endif
      continue;
    }
#endif

#ifdef DEBUG
    if (p->verbose > 1) {    /* DEBUG LEVEL 2 */
      printf("calling xToken()\n");
      printf(" tflags = 0\n");
      printf(" pToken = %.*s\n", nlen, node->surface);
      printf(" nToken = %d\n", nlen);
      printf(" iStart = %d\n", iStart);
      printf(" iEnd = %d\n", iStart + nlen);
      printf("==============\n");
    }
#endif

    rc = xToken(pCtx, 0, node->surface, nlen, iStart, iStart + nlen);
#ifdef DEBUG
    tokenCount += 1;
#endif
    if (rc != SQLITE_OK) {
      /* Abandon tokenization and return the callback's value. */
#ifdef DEBUG
      if (p->verbose > 0) {  /* DEBUG LEVEL 1 */
        printf("break [1]: xToken() rc = %d\n", rc);
      }
#endif
      break;
    }
  }

#ifdef DEBUG
  if (p->verbose > 0) {      /* DEBUG LEVEL 1 */
    printf("_node_count, _token_count = %d, %d\n", nodeCount, tokenCount);
  }
#endif
  return rc;
}
/*
** ** Implementation Help **
** https://www.sqlite.org/loadext.html
** 4. Programming Loadable Extensions
** A template loadable extension contains the following three
** elements:
** 3. Add an extension loading entry point routine that looks
** like something the following:
*/
/*
** Extension entry point: registers the "mecab" tokenizer with the FTS5
** module of database connection db.
**
**   db        connection the extension is being loaded into
**   pzErrMsg  receives an sqlite3_mprintf() allocated message when the
**             fts5_api object cannot be obtained (skipped when NULL)
**   pApi      SQLite API routine table, consumed by SQLITE_EXTENSION_INIT2
**
** Returns SQLITE_ERROR when fts5_api_from_db() yields no API object,
** otherwise the result of xCreateTokenizer().
*/
#ifdef _WIN32
__declspec(dllexport)
#endif
int sqlite3_ftsmecab_init(
  sqlite3 *db,
  char **pzErrMsg,
  const sqlite3_api_routines *pApi
){
  fts5_api *pApi_fts5;
  fts5_tokenizer t;

  SQLITE_EXTENSION_INIT2(pApi);

#ifdef DEBUG
  /* Trace output is limited to DEBUG builds so that loading the extension
  ** does not write to the host application's stdout. */
  printf("sqlite3_ftsmecab_init()\n");
#endif

  /* A tokenizer can only be registered through the fts5_api object. */
  pApi_fts5 = fts5_api_from_db(db);
  if( pApi_fts5==0 ){
    if( pzErrMsg ){
      *pzErrMsg = sqlite3_mprintf("fts5_api_from_db: %s", sqlite3_errmsg(db));
    }
    return SQLITE_ERROR;
  }

  /* The three callbacks that make up a custom tokenizer. */
  t.xCreate = mecabCreate;
  t.xDelete = mecabDelete;
  t.xTokenize = mecabTokenize;

  /* The fts5_api pointer doubles as the context handed to mecabCreate();
  ** no destructor is registered for it. */
  return pApi_fts5->xCreateTokenizer(pApi_fts5, "mecab", (void*)pApi_fts5, &t, 0);
}
- ビルド方法
$ gcc -g -fPIC -shared fts5_mecab.c -o fts5_mecab.so -I$HOME/usr/include -L$HOME/usr/lib -lmecab -DDEBUG -DSTOP789
- ビルドスクリプト: compile.sh
手前ミソですみません。
#!/bin/bash
# Compiler used for the build command.
CC=gcc
# Prefix whose include/ and lib/ directories are passed to the compiler
# with -I and -L; replaced by the --prefix=PREFIX option.
PREFIX=/usr/local
# Preprocessor symbols appended to the build command; emptied by
# --clear-cflags and extended by each -DSYMBOL option.
CFLAGS="-DDEBUG -DSTOP789"
# Set to 1 by --dry-run, in which case the build command is printed but
# not executed.
DRY_RUN=0
# Base name shared by the source file and the shared object.
NAME=fts5_mecab
SRC=$NAME.c
OUT=$NAME.so
# Print the option summary together with the current values of PREFIX,
# CFLAGS and DRY_RUN, followed by a sample invocation, then exit with
# status 0.
usage_exit() {
    printf '%s\n' \
        "Usage: $0 [-h|--help] [--prefix=PREFIX] [--clear-cflags] [-DSYMBOL] [--dry-run]" \
        " \$PREFIX=$PREFIX" \
        " \$CFLAGS=\"$CFLAGS\"" \
        " \$DRY_RUN=$DRY_RUN" \
        "Sample" \
        " $0 --prefix=\$HOME/usr --clear-cflags -DDEBUG -DSTOP789 --dry-run"
    exit 0
}
# Print a command line and, unless this is a dry run, execute it.
#   $1  command line; it is passed to eval, so the shell parses it again
#   $2  dry-run flag: "0" executes the command, any other value (including
#       a missing or empty argument) only prints it
# Returns the command's exit status when it is executed, 0 on a dry run.
echo_and_do() {
    local cmd=$1
    local dry_run=$2

    echo "$cmd"
    # [[ ]] does not word-split, so an empty flag is a plain "not 0"
    # instead of a test syntax error.
    if [[ $dry_run == 0 ]]; then
        eval "$cmd"
    else
        echo "!! DRY RUN !!"
    fi
}
# Parse the command line. Arguments that match none of the patterns are
# ignored.
_HELP=0
for x in "$@"; do
    case $x in
        -h|--help)      _HELP=1 ;;
        --dry-run)      DRY_RUN=1 ;;
        --prefix=*)     PREFIX=${x#--prefix=} ;;
        --clear-cflags) CFLAGS="" ;;
        -D*)            CFLAGS="$CFLAGS $x" ;;
    esac
done

# Help is printed after parsing so that it shows the effective settings.
if [[ $_HELP == 1 ]]; then
    usage_exit
fi

# Create the directory named in the copy hint below, but leave the
# filesystem untouched on a dry run.
if [[ $DRY_RUN == 0 && ! -d $PREFIX/lib ]]; then
    mkdir -p "$PREFIX/lib"
fi

status=0
if echo_and_do "$CC -g -fPIC -shared $SRC -o $OUT -I$PREFIX/include -L$PREFIX/lib -lmecab $CFLAGS" "$DRY_RUN"; then
    echo "Compilation succeeded. execute command below."
    echo " cp -a $OUT $PREFIX/lib/"
else
    status=1
    echo "compile error."
fi
echo ""
# Report a failed compilation to the caller through the exit status.
exit "$status"
- 実行方法
ビルドの際にDEBUGシンボルを含めると、引数'v'や'vv'を付けることで、デバッグメッセージを表示します。'v'の数で、ログレベルが変わります。
ビルドの際にSTOP789シンボルを含めると、引数'stop789'を付けることで、posid=7,8,9を形態素解析結果から除外します。
引数を与えない場合。(fts5を有効化したsqlite3コマンドを起動)
$ /PATH/TO/sqlite3
sqlite> .load /PATH/TO/fts5_mecab
sqlite> CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = 'mecab');
引数を与える場合。(fts5を有効化したsqlite3コマンドを起動)
$ /PATH/TO/sqlite3
sqlite> .load /PATH/TO/fts5_mecab
sqlite> CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = 'mecab vv stop789');