More than 3 years have passed since last update.

SQLのwhere的な条件指定によってテーブルから行を抽出【Javascript】【構文解析】

Last updated at 2021-07-18Posted at 2021-07-14

目的

SQLのwhere構文のように、列名と値をandやorでつなげて指定した条件によって、テーブルから行を抽出する機能を作ることを目指します。例えば、ポケモンの一覧表から"type1=でんき and type2=ひこう"みたいに指定したら、でんき・ひこうタイプのポケモンだけを抽出できるようにしたいです。

20210719追記：カプセル化しなさすぎてでごちゃついてしまったので、以下でもう少しスッキリ実装しました。
【Javascript】SQLのwhere的な文字列で真偽判定【構文解析】【改良版】

ステップ０（下準備)

テスト用の関数を作っておきます。
あとでリスト同士の比較をしたくなることがあるので、それを考慮した関数にしておきます。

function typeOf(obj) {
    return Object.prototype.toString.call(obj).slice(8, -1).toLowerCase();
}
function isSameObject(object1, object2){
    const type1 = typeOf(object1);
    const type2 = typeOf(object2);
    if(type1 != type2)return false;
    if(["undefined","null","boolean","number","string"].includes(type1)){
        return object1 == object2;
    }else if(type1 == "array"){
        if(object1.length != object2.length)return false;
        for(let i=0;i<object1.length;i++){
            if(!isSameObject(object1[i], object2[i]))return false;
        }
        return true;
    }else if(type1 == "object"){
        if(object1.length != object2.length)return false;
        for(let k in object1){
            if(k in object2 == false) return false;
            if(!isSameObject(object1[k], object2[k]))return false;
        }
        return true;
    }else{//mapとかsetとかsymbolとか
        console.log("warning: unsupposed type", type1, "is detected!");
        return object1 === object2;
    }
}
function test(value, expected){
  const OK = "ok";
  const NO = "no";
  if(isSameObject(value, expected)){
    console.log(OK);
  }else{
    console.log(NO);
  }
}

ステップ１: 単純式

まずシンプルに考えるため、{"name":"ピカチュウ","type1":"でんき","type2":"NA"}のようなobjectが条件式とマッチするかを判定する関数を作ってみます。

ステップ１として、"type1 = でんき"のようなand、orを含まないシンプルな条件式による判定を実装します。このような式をここでは単純式と呼ぶことにします。
比較演算子は=または!=とします。また簡単のため、比較演算子の前後には必ずスペースが挿入されるものとします。
この条件において判定をおこなう関数は以下のとおりです。わかりやすさ優先で不正な条件式の場合のエラー処理は省略しています。

function isMatch1(obj, condition){
  const tokens = condition.trim().split(/\s+/)
  if(tokens[1]==="="){
    const key = tokens[0].trim()
    const value = tokens[2].trim()
    return obj[key]===value;
  }else if(tokens[1]==="!="){
    const key = tokens[0].trim()
    const value = tokens[2].trim()
    return obj[key]!==value;
  }
}

上記でも動きますが、ステップ２以降のことを考えて、少し機能を分離しておきます。
具体的には、条件式をスプリットしてリスト化する部分と、リスト化された条件式のうち指定したindexから始まる単純式を評価する関数に分離します。

//conditionをtokenに分離する
function tokenize(condition){
  let tokens = condition.split(/\s+/);
  return tokens
}

//startで指定されるindexで始まる単純式による評価結果と、その単純式の次のindexを返す
//obj: 評価対象のobject
//tokens: 条件式の構成要素のリスト（スペースでスプリット済み）
//start: 評価に用いる単純式の先頭のindex
//戻り値：[result, next_start]
//result: 単純式の評価結果
//next_start: 続きのstart_index
function evaluateSimpleEquation(obj, tokens, start){
  const operator = tokens[start+1]
  if(operator === "="){
    const key = tokens[start]
    const value = tokens[start+2]
    const result = (obj[key] === value)
    return [result, start+3]
  }else if(operator === "!="){
    const key = tokens[start]
    const value = tokens[start+2]
    const result = (obj[key] !== value)
    return [result, start+3]
  }
}

function isMatch1(obj, condition){
  const tokens = tokenize(condition);
  const r = evaluateSimpleEquation(obj, tokens, 0);
  return r[0]
}

テストしてみます。

let obj = {"name":"ピカチュウ","type1":"でんき","type2":"NA"}
test(isMatch1(obj, "type1 = でんき"), true)
//ok
test(isMatch1(obj, "type1 = みず"), false)
//ok
test(isMatch1(obj, "type1 != でんき"), false)
//ok
test(isMatch1(obj, "type1 != みず"), true)
//ok

ステップ２：and/or

andやorを導入します。
単純式をand/orで接続した条件式をここでは 複合式 と呼ぶことにします。
中括弧{}を中身の0回以上の繰り返しと定義するとき、複合式は次のように表すことができます。
単純式 {and|or 単純式}
と表せます。バー|はそれを挟む２単語のどちらかが書かれていることを意味します。この表現から、単純式は複合式の特別な場合とみなされます。

複合式の始まりは必ず単純式なので（言い換えればand/orで式が始まることはない）ので、まず最初の単純式を評価し、その後ろに続くandやorの数だけand/or 単純式の部分に対する評価をするようなプログラムを書けばよいです。

以下において、tokenize、evaluateSimpleEquationはステップ1と同じものを使っています。

//and, orを含む条件式を評価する関数
function evaluateComplexEquation(obj, tokens){
  //はじめの単純式を評価
  const first = evaluateSimpleEquation(obj, tokens, 0);
  let result = first[0];
  let next_index = first[1];
  //nextがqueryの長さを超えるまで単純式の評価を繰り返す
  while(next_index < tokens.length){
    //始点が接続詞でなければループを終了する
    if(tokens[next_index] !== "and" && tokens[next_index] !== "or"){
      break;
    }

    const conjunction = tokens[next_index]
    //andの場合
    if(conjunction === "and"){
      const next = evaluateSimpleEquation(obj, tokens, next_index+1);
      const next_result = next[0]
      result = (result && next_result)
      next_index = next[1]
    }
    //orの場合
    else if(conjunction === "or"){
      const next = evaluateSimpleEquation(obj, tokens, next_index+1);
      const next_result = next[0]
      result = (result || next_result)      
      next_index = next[1]
    }
  }
  //next_indexを返す意味はあまりないけど、evaluate系の関数の戻り値の型を統一するため、一応この形式で返しておく
  return [result, next_index];
}

function isMatch2(obj, condition){
  const tokens = tokenize(condition);
  const r = evaluateComplexEquation(obj, tokens, 0);
  return r[0]
}

テストしてみます。

let obj = {"name":"ピカチュウ","type1":"でんき","type2":"NA"}
//単純式の評価
test(isMatch2(obj, "type1 = でんき"), true)
//ok
test(isMatch2(obj, "type1 = みず"), false)
//ok
test(isMatch2(obj, "type1 != でんき"), false)
//ok
test(isMatch2(obj, "type1 != みず"), true)
//ok
//複雑式の評価
test(isMatch2(obj, "type1 = でんき and type2 = NA"), true)
//ok
test(isMatch2(obj, "type1 = みず or type1 = でんき"), true)
//ok
test(isMatch2(obj, "type1 != でんき or type2 != NA"), false)
//ok
test(isMatch2(obj, "type1 = でんき and type2 = ひこう"), false)
//ok

ステップ３：カッコ

半角丸括弧()による評価順序の指定に対応させます。
たとえば{"name":"ピカチュウ","type1":"でんき","type2":"NA"}に対して、( type1 = でんき and type2 = ひこう ) or type2 = NA'はfalse、type1 = でんき and ( type2 = ひこう or type2 = NA )'はtrueとなるようにしたいです。

()をつかって評価順序を指定できる条件式のことを 括弧付き条件式 と呼ぶことにします。
括弧つき条件式は次のように表すことができます。
単純式|( 括弧付き条件式 ) {and|or 単純式|( 括弧付き条件式 )}

ステップ2で単純式だった部分が、単純式または括弧で囲まれた括弧付き条件式に置き換わっています。
括弧つき条件式の表現に括弧付き条件式が使われているので、変な感じがしますが、括弧付き条件式はいくらでもネストして記述できる（つまり括弧のなかに更に括弧を記述できる）ので、このような表現になります。

表現から、なんとなく再帰関数を使うことでうまく実装できそうな気配があります。
括弧付き条件式の始まりは単純式か括弧なので、"("で始まっていれば、括弧付き条件式として、そうでなければ単純式として処理をします。再帰関数を使うことで、カッコの中身を先に評価することができます。

実装は以下のとおりです。ステップ2でevaluateSimpleEquationだった部分がevaluateSimpleOrBraketに置き換わっており、表現式と対応していることがわかります。

function evaluateBraketEquation(obj, tokens, start){

  //単純式または括弧付き条件式の評価結果を返す
  //よく使うので関数内関数としてまとめておく
  function evaluateSimpleOrBraket(obj_,tokens_,start_){
    const first = tokens_[start_]
    //"("で始まっていれば括弧付き条件式として評価
    if(first === "("){
      const r = evaluateBraketEquation(obj_, tokens_, start_+1);
      if(tokens_[r[1]] === ")"){
        return [r[0], r[1]+1];
      }
    }
    //"("で始まっていなければ単純式として評価
    else{
      return evaluateSimpleEquation(obj_, tokens_, start_);
    }
  }

  let result, next_index;
  const first_result = evaluateSimpleOrBraket(obj,tokens,start);
  result = first_result[0]
  next_index = first_result[1]

  //nextがqueryの長さを超えるまで単純式の評価を繰り返す
  while(next_index < tokens.length){
    //始点が括弧閉じであればループを終了する
    if(tokens[next_index] === ")"){
      break;
    }
    //始点が接続詞でなければループを終了する
    if(tokens[next_index] !== "and" && tokens[next_index] !== "or"){
      break;
    }
    const conjunction = tokens[next_index]
    //andの場合
    if(conjunction === "and"){
      const next = evaluateSimpleOrBraket(obj, tokens, next_index+1);//ステップ２で単純式評価だった部分（evaluateSimpleEquation)がevaluateSimpleOrBraketに置き換わっている
      const next_result = next[0]
      result = (result && next_result)
      next_index = next[1]
    }
    //orの場合
    else if(conjunction === "or"){
      const next = evaluateSimpleOrBraket(obj, tokens, next_index+1);//ステップ２で単純式評価だった部分（evaluateSimpleEquation)がevaluateSimpleOrBraketに置き換わっている
      const next_result = next[0]
      result = (result || next_result)      
      next_index = next[1]
    }
  }
  return [result, next_index];
}

function isMatch3(obj, condition){
  const tokens = tokenize(condition);
  const r = evaluateBraketEquation(obj, tokens, 0);
  return r[0];
}

テストしてみます

let obj = {"name":"ピカチュウ","type1":"でんき","type2":"NA"}
//単純式の評価
test(isMatch3(obj, "type1 = でんき"), true)
//ok
test(isMatch3(obj, "type1 = みず"), false)
//ok
test(isMatch3(obj, "type1 != でんき"), false)
//ok
test(isMatch3(obj, "type1 != みず"), true)
//ok
//複雑式の評価
test(isMatch3(obj, "type1 = でんき and type2 = NA"), true)
//ok
test(isMatch3(obj, "type1 = みず or type1 = でんき"), true)
//ok
test(isMatch3(obj, "type1 != でんき or type2 != NA"), false)
//ok
test(isMatch3(obj, "type1 = でんき and type2 = ひこう"), false)
//ok
//括弧付き条件式の評価
test(isMatch3(obj, "( type1 = でんき and type1 = みず ) and ( type2 = ひこう or type2 = NA )"), false);
//ok
test(isMatch3(obj, "type1 = でんき and ( type1 = みず and type2 = ひこう or type2 = NA )"), true);
//ok

ステップ4：スペースの自動補完

あまり本質的ではない部分ですが、"="や"("の前後にスペースを入れ忘れる人もいると思うので、そういった場合でもトークンに分解できるようにtokenize関数を修正します。
"!="、"="、"("、")"の前後にスペースを入れてからsplitするようにします。
この修正をすることによって、マッチさせるkeyやvalue名に"!="などを使えなくなることに注意してください。

function tokenize(condition){
  //空文字に対しては空のリストを返す
  if(condition.trim() === "")return [];

  condition = condition.replace(/!=|=|\(|\)/g," $& ");//!= or = or ( or )
  condition = condition.trim();
  const tokens = condition.split(/\s+/);
  return tokens
}

一応テストします。

test(tokenize(""), [])
//ok
test(tokenize("(type1=でんき and type1=みず) and (type2=ひこう or type2=NA)"), ["(","type1","=","でんき","and","type1","=","みず",")","and","(","type2","=","ひこう","or","type2","=","NA",")"])
//ok

ステップ５：文法チェック

これまで入力された構文が文法的に正しいことを前提として機能を作ってきたので、入力された条件式が文法的に正しいことをチェックする関数を作ります。

ステップ３で作ったevaluateBraketEquation関数を参考に、期待する演算子やトークン数になっているかを評価する関数を作ります。

//エラーメッセージの定義
let ErrorMessage = {
  valueExpectedButTokenEnd: ()=>"value type string is expected but token ended",
  operatorExpectedButTokenEnd: ()=>"operator type string is expected but token ended",
  endingBraketExpectedButTokenEnd: ()=>"ending braket ')' is expected but token ended",
  endingBraketExpectedButDoesNotExist: ()=>"ending braket ')' is expected but does not exist",
  unknownOperator: ()=>"unkonwn operator",
  unknownConjunction: ()=>"unknown conjunction",
  equationExpectedButTokenEnd: ()=>"equation is expected but token" 
}

//エラーメッセージを出力するための関数
function getErrorMessage(index, value, error_type){
  let message = []
  message.push("Uncorrect grammer at index = ");
  message.push(String(index));
  message.push("( value = '");
  message.push(String(value));
  message.push("' )");
  message.push("or later. Error type:");
  message.push(error_type+".");
  return message.join(" ");
}
//単純式であることのチェック
function isSimpleEquation(tokens, start){
  if(start+1 >= tokens.length){
    const message = getErrorMessage(start+1, null, ErrorMessage.operatorExpectedButTokenEnd());
    return [false, message];
  }
  const operator = tokens[start+1]
  if(operator === "="){
    if(start+2>=tokens.length){
      const message = getErrorMessage(start+2, null, ErrorMessage.valueExpectedButTokenEnd());
      return [false, message];
    }
    return [true, start+3]
  }
  //"!="のチェック
  else if(operator === "!="){
    if(start+2>=tokens.length){
      const message = getErrorMessage(start+2, null, ErrorMessage.valueExpectedButTokenEnd());
      return [false, message];
    }
    return [true, start+3]
  }

  else{
    const message = getErrorMessage(start+1, tokens[start+1], ErrorMessage.unknownOperator());
    return [false, message]
  }
}

function isBraketEquation(tokens, start){
  //単純式または括弧付き条件式として正しいかどうかの評価結果を返す
  //よく使うので関数内関数としてまとめておく
  function isSimpleOrBraket(tokens_, start_){
    //start_がtokens_の長さより大きいときはエラーを返す
    if(start_ >= tokens_.length){
      const message = getErrorMessage(start_, null, ErrorMessage.equationExpectedButTokenEnd());
      return [false, message];
    }

    const first = tokens_[start_]
    //"("で始まっていれば、括弧付き条件式として評価
    if(first === "("){
      const r = isBraketEquation(tokens_, start_+1);
      if(r[0] === false){
        return r;
      }
      else if(r[1] >= tokens_.length){
        const message = getErrorMessage(r[1], null, ErrorMessage.endingBraketExpectedButTokenEnd());
        return [false, message]
      }
      else if(tokens_[r[1]] === ")"){
        return [true, r[1]+1];
      }else{
        const message = getErrorMessage(r[1], tokens_[r[1]], ErrorMessage.endingBraketExpectedButDoesNotExist());
        return [false, message]
      }
    }
    //"("で始まっていなければ単純式として評価
    else{
      const r = isSimpleEquation(tokens_, start_);
      return r;
    }
  }
  const first_result = isSimpleOrBraket(tokens,start);
  if(first_result[0] === false){
    return first_result;
  }
  let next_index = first_result[1];
  //next_indexがqueryの長さを超えるまで単純式の評価を繰り返す
  while(next_index < tokens.length){
    const conjunction = tokens[next_index]
    //and、orの場合
    if(conjunction !== "and" && conjunction !== "or"){
      break;
    }
    const r = isSimpleOrBraket(tokens, next_index+1);
    if(r[0] === false){
      return r;
    }
    next_index = r[1];
  }
  return [true, next_index];
}
function isCorrectGrammer(tokens){
  const r = isBraketEquation(tokens, 0);
  if(r[0] === true && r[1] < tokens.length){
    const message = getErrorMessage(r[1], tokens[r[1]], ErrorMessage.unknownConjunction());
    return [false, message]
  }
  if(r[0] === true) return [true, null];
  else return r;
}

テストします。

test(isCorrectGrammer(tokenize("type1=でんき")), [true, null]);

test(isCorrectGrammer(tokenize("(type1=でんき)")), [true, null]);

test(isCorrectGrammer(tokenize("(((type1=でんき)))")), [true, null]);

test(isCorrectGrammer(tokenize("(((type1=でんき))) and (type2=ひこう or (type2=NA))")), [true, null]);

test(isCorrectGrammer(tokenize("(type1=でんき))")), [false, getErrorMessage(5, ")", ErrorMessage.unknownConjunction())]);

test(isCorrectGrammer(tokenize("((type1=でんき)")), [false, getErrorMessage(6, null, ErrorMessage.endingBraketExpectedButTokenEnd())]);

test(isCorrectGrammer(tokenize("type1=でんき and_ type2 = NA")), [false,getErrorMessage(3, "and_", ErrorMessage.unknownConjunction())]);

test(isCorrectGrammer(tokenize("type1=でんき and")), [false,getErrorMessage(4, null, ErrorMessage.equationExpectedButTokenEnd())]);

test(isCorrectGrammer(tokenize("type1==でんき")), [false,getErrorMessage(3, "でんき", ErrorMessage.unknownConjunction())]);

test(isCorrectGrammer(tokenize("type1 is でんき")), [false,getErrorMessage(1, "is", ErrorMessage.unknownOperator())]);

test(isCorrectGrammer(tokenize("")), [false, getErrorMessage(0, null, ErrorMessage.equationExpectedButTokenEnd())]);

ステップ6：表形式データから行の抽出

表形式のデータから行を抽出できるようにします。
基本は行をひとつずつ見て、ステップ３で作成した関数でtrueとなったものを抽出するだけなので難しくありません。
念の為文法チェックの処理も入れて、不正な条件式を検出できるようにしておきます。

//tokenizeされた条件式によるマッチ判定
function isMatch(obj, tokens){
  const r = evaluateBraketEquation(obj, tokens, 0);
  return r[0];
}

function filterTable(table, condition){
  let tokens = tokenize(condition);
  const grammerCheckResult = isCorrectGrammer(tokens);
  //もし正しくない条件式であれば、エラーメッセージを出力し、処理を中断する
  if(grammerCheckResult[0] === false){
    console.log(grammerCheckResult[1]);
    return false;
  }

  const header = table[0]
  const filtered_table = [header]

  for(let i=1;i<table.length;i++){
    const row = table[i]
    const obj = {}
    for(let j=0;j<header.length;j++){
      obj[header[j]]=row[j]
    }
    const r = isMatch(obj, tokens);
    if(r === true){
      filtered_table.push(row);
    }
  }
  return filtered_table;
}

テストします。

let table = [
  ["name","type1","type2"],
  ["ピカチュウ","でんき","NA"],
  ["カイリュー","ドラゴン","ひこう"],
  ["ヤドラン","みず","エスパー"],
  ["ピジョン","ノーマル","ひこう"],
  ["コダック","みず","NA"],
  ["コラッタ","ノーマル","NA"],
  ["ズバット","どく","ひこう"],
  ["ギャロップ","ほのお","NA"],
  ["サンダース","でんき","NA"],
  ["メノクラゲ","みず","どく"]
]
test(filterTable(table, "type1=でんき"), [table[0],table[1],table[9]]);
//[["name","type1","type2"],["ピカチュウ","でんき","NA"],["サンダース","でんき","NA"]]
test(filterTable(table, "type1=どく or type2 = どく"), [table[0],table[7],table[10]]);
//[["name","type1","type2"],["ズバット","どく","ひこう"],["メノクラゲ","みず","どく"]]

おわりに

想像以上に長くなりました。構文解析は素人なので冗長な処理になっているところがあるかもしれません。
また単純式、複合式、括弧付き条件式などの言葉は、今回のために考えた言葉で、特に専門書などから引っ張ってきたわけではないので、ご留意ください。
実用面では括弧付き条件式を使いたくなることは少なそうなので、複合式くらいまでの実装でとどめても良いかもしれません。

参考にさせていただいた記事

JavaScriptでゆるく学ぶ再帰下降構文解析
 ある配列が別の配列の部分配列になっているか判定する【javascript】

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up