Help us understand the problem. What is going on with this article?

Go小ネタ: 正規表現を全角半角問わずマッチするよう変換する

More than 3 years have passed since last update.

なぜ必要になったか

Google Analytics の Core Reporting API では検索キーワードの一覧を取得できるが、取得したキーワードに含まれる全角英数字は半角に変換されている。

一方、正規表現でクエリを投げることもできるが、こちらは変換前のローデータに対してマッチしているようだ。
さらに悪いことに、そのローデータ自体は取得できない。

なので、ユーザーが入力した正規表現を、全角半角英数両方にマッチする正規表現に変換するコードを書いた。

テストコード

package util

import "testing"

// TestZenHanRegexp checks that ZenHanRegexp rewrites a pattern so that ASCII
// letters and digits match both their half-width and their full-width forms.
//
// Full-width characters are spelled as \uXXXX escapes so that the two forms
// stay distinguishable in the source (U+FF10.. digits, U+FF21.. upper case,
// U+FF41.. lower case).
//
// NOTE(review): the expected strings depend on how regexp/syntax renders a
// tree (for example `(?-s:.)`); confirm them against the Go version in use.
func TestZenHanRegexp(t *testing.T) {
	testPatterns := [][]string{
		// literals
		{`1923`, "[1\uFF11][9\uFF19][2\uFF12][3\uFF13]"},
		{`abc`, "[a\uFF41][b\uFF42][c\uFF43]"},
		{`aBc`, "[a\uFF41][B\uFF22][c\uFF43]"},
		{`ABC`, "[A\uFF21][B\uFF22][C\uFF23]"},
		// full-width input must produce the same result as half-width input
		{"\uFF41\uFF42\uFF43", "[a\uFF41][b\uFF42][c\uFF43]"},
		{"\uFF21\uFF22\uFF23", "[A\uFF21][B\uFF22][C\uFF23]"},
		{`abc.*`, "[a\uFF41][b\uFF42][c\uFF43](?-s:.)*"},
		{`あいうえおabc.*`, "あいうえお[a\uFF41][b\uFF42][c\uFF43](?-s:.)*"},

		// character classes
		{`[a-z]`, "[a-z\uFF41-\uFF5A]"},
		{`[0-9a-zA-Z]`, "[0-9\uFF10-\uFF19A-Z\uFF21-\uFF3Aa-z\uFF41-\uFF5A]"},
		{"[\uFF41-\uFF5A]", "[a-z\uFF41-\uFF5A]"},
		// a range whose ends belong to different groups is left untouched
		{`[A-z]`, `[A-z]`},
		{`[acz]`, "[a\uFF41c\uFF43z\uFF5A]"},
		{`\ba`, "\\b[a\uFF41]"},
		{`[[:alpha:]]`, "[A-Z\uFF21-\uFF3Aa-z\uFF41-\uFF5A]"},
		{`\d`, "[0-9\uFF10-\uFF19]"},

		// complex
		{`\Qaiueo\E`, "[a\uFF41][i\uFF49][u\uFF55][e\uFF45][o\uFF4F]"},
		{
			`test(test)(あいうえおb)+[a-z]{0,4}$`,
			"[t\uFF54][e\uFF45][s\uFF53][t\uFF54]([t\uFF54][e\uFF45][s\uFF53][t\uFF54])(あいうえお[b\uFF42])+[a-z\uFF41-\uFF5A]{0,4}$",
		},
	}

	for _, pt := range testPatterns {
		out, err := ZenHanRegexp(pt[0])
		if err != nil {
			t.Errorf("parse error for %s, %s", pt[0], err.Error())
			// out carries no meaning after a failed conversion, so do not
			// report a second, misleading mismatch for the same pattern.
			continue
		}
		if out != pt[1] {
			t.Errorf("original %s: %s expected to be %s", pt[0], out, pt[1])
		}
	}
}

実装

package util

import (
    "bytes"
    "fmt"
    "regexp/syntax"
)

// ZenHanRegexp converts the regular expression src into one that matches
// both the half-width and the full-width form of the characters it names,
// and returns the converted pattern as a string.
//
// An empty src yields an empty pattern. If src cannot be parsed, the parse
// error is returned together with an empty string.
func ZenHanRegexp(src string) (string, error) {
	if len(src) == 0 {
		return "", nil
	}

	parsed, err := syntax.Parse(src, syntax.PerlX|syntax.Simple)
	if err != nil {
		return "", err
	}

	// Rewrite the syntax tree, then render it back to pattern text.
	return replaceZenHan(parsed).String(), nil
}

// replaceZenHan returns a rewritten copy of the syntax tree rooted at re.
//
// Literal and character-class nodes are replaced by the results of
// createZenHanLiteral and createZenHanCharClass. Every other node is copied
// and its children are rewritten recursively. A nil re yields nil.
//
// The input tree is left untouched: the copy gets its own Sub slice, because
// a plain struct copy would still share the child slice's backing array with
// re and writing the rewritten children into it would modify re as well.
func replaceZenHan(re *syntax.Regexp) *syntax.Regexp {
	if re == nil {
		return nil
	}

	switch re.Op {
	case syntax.OpLiteral:
		return createZenHanLiteral(re)
	case syntax.OpCharClass:
		return createZenHanCharClass(re)
	}

	copied := *re
	if len(re.Sub) > 0 {
		copied.Sub = make([]*syntax.Regexp, len(re.Sub))
		for i, sub := range re.Sub {
			copied.Sub[i] = replaceZenHan(sub)
		}
	}
	return &copied
}

func createZenHanLiteral(re *syntax.Regexp) *syntax.Regexp {
    var buf bytes.Buffer
    for _, r := range re.Rune {
        buf.Write([]byte(runeToZenHanString(r)))
    }
    newRe, _ := syntax.Parse(buf.String(), 0)
    return newRe
}

// runeToZenHanString returns the pattern text for a single rune: a bracket
// expression holding both runes of its allRuneMap pair when r has an entry,
// or r itself when it does not.
func runeToZenHanString(r rune) string {
	if pair, ok := allRuneMap[r]; ok {
		return fmt.Sprintf("[%c%c]", pair[0], pair[1])
	}
	return string(r)
}

// createZenHanCharClass returns a copy of the character-class node re whose
// ranges have been passed through classRunesToZenHanString. re itself is not
// modified.
func createZenHanCharClass(re *syntax.Regexp) *syntax.Regexp {
	var expanded bytes.Buffer

	// re.Rune is read as consecutive (start, end) pairs; i always points at
	// the end rune of the current pair.
	for i := 1; i < len(re.Rune); i += 2 {
		expanded.WriteString(classRunesToZenHanString(re.Rune[i-1], re.Rune[i]))
	}

	result := *re
	result.Rune = []rune(expanded.String())
	return &result
}

// classRunesToZenHanString returns the runes that replace the character-class
// range start-end, encoded as a string.
//
// The range is expanded to its half-width and full-width forms only when both
// ends belong to the same group (lower-case letters, upper-case letters or
// digits). Any other range is returned unchanged as its two end runes.
func classRunesToZenHanString(start, end rune) string {
	groups := [...]map[rune][2]rune{lowerAlphaRuneMap, upperAlphaRuneMap, digitRuneMap}

	for i := range groups {
		if !runeMapExists(groups[i], start) {
			continue
		}
		if !runeMapExists(groups[i], end) {
			continue
		}
		return convertZenHanCharClass(start, end)
	}

	return string([]rune{start, end})
}

// convertZenHanCharClass looks up start and end in allRuneMap and returns two
// ranges as four runes encoded in a string: the first runes of both pairs,
// followed by the second runes of both pairs.
func convertZenHanCharClass(start, end rune) string {
	lo, hi := allRuneMap[start], allRuneMap[end]

	ranges := make([]rune, 0, 4)
	ranges = append(ranges, lo[0], hi[0]) // range built from the first members
	ranges = append(ranges, lo[1], hi[1]) // range built from the second members
	return string(ranges)
}

// runeMapExists reports whether r is a key of m.
func runeMapExists(m map[rune][2]rune, r rune) bool {
	if _, ok := m[r]; ok {
		return true
	}
	return false
}

// Conversion tables, filled once by init.
//
// Every pair is {half-width rune, full-width rune}. The *Pairs slices list
// the pairs of one group in code point order; the *RuneMap maps are keyed by
// both runes of each pair, so a lookup works for either form. allRuneMap is
// the union of the three group maps.
var (
	lowerAlphaPairs   [][2]rune
	upperAlphaPairs   [][2]rune
	digitPairs        [][2]rune
	lowerAlphaRuneMap map[rune][2]rune
	upperAlphaRuneMap map[rune][2]rune
	digitRuneMap      map[rune][2]rune
	allRuneMap        map[rune][2]rune
)

// init builds the conversion tables for a-z, A-Z and 0-9.
func init() {
	// create maps
	allRuneMap = map[rune][2]rune{}
	lowerAlphaRuneMap = map[rune][2]rune{}
	upperAlphaRuneMap = map[rune][2]rune{}
	digitRuneMap = map[rune][2]rune{}

	// The full-width forms are written as escapes so they cannot be confused
	// with (or silently turned into) their ASCII look-alikes:
	// U+FF41 is full-width 'a', U+FF21 full-width 'A', U+FF10 full-width '0'.
	lowerAlphaPairs = registerZenHanRange(lowerAlphaRuneMap, 'a', '\uFF41', 26)
	upperAlphaPairs = registerZenHanRange(upperAlphaRuneMap, 'A', '\uFF21', 26)
	digitPairs = registerZenHanRange(digitRuneMap, '0', '\uFF10', 10)
}

// registerZenHanRange pairs the n consecutive runes starting at hanStart with
// the n consecutive runes starting at zenStart, records every pair in group
// and in allRuneMap under both of its runes, and returns the pairs in order.
func registerZenHanRange(group map[rune][2]rune, hanStart, zenStart rune, n int) [][2]rune {
	pairs := make([][2]rune, 0, n)
	for i := rune(0); i < rune(n); i++ {
		pair := [2]rune{hanStart + i, zenStart + i}
		pairs = append(pairs, pair)
		for _, r := range pair {
			group[r] = pair
			allRuneMap[r] = pair
		}
	}
	return pairs
}

Why don't you register as a user and use Qiita more conveniently?
  1. We will deliver articles that match your interests
    By following users and tags, you can keep up with information across the technical fields you are interested in
  2. You can efficiently read useful information later
    By "stocking" the articles you like, you can find them again right away
Comments
Sign up for free and join this conversation.
If you already have a Qiita account
Why don't you register as a user and use Qiita more conveniently?
You need to log in to use this function. Qiita can be used more conveniently after logging in.
You seem to be reading articles frequently this month. Qiita can be used more conveniently after logging in.
  1. We will deliver articles that match your interests
    By following users and tags, you can keep up with information across the technical fields you are interested in
  2. You can efficiently read useful information later
    By "stocking" the articles you like, you can find them again right away