6
7

Delete article

Deleted articles cannot be recovered.

The draft of this article will also be deleted.

Are you sure you want to delete this article?

More than 5 years have passed since the last update.

Go小ネタ: 正規表現を全角半角問わずマッチするよう変換する

Last updated at Posted at 2015-08-06

なぜ必要になったか

Google Analytics の Core Reporting API では検索キーワードの一覧を取得できるが、取得したキーワードに含まれる全角英数字は半角に変換されている。

一方、正規表現でクエリを投げることもできるが、そちらはローデータに対してマッチしているようだ。
さらに悪いことに、ローデータそのものは取得できない。

なので、ユーザーが入力した正規表現を、全角半角英数両方にマッチする正規表現に変換するコードを書いた。

テストコード

package util

import "testing"

// TestZenHanRegexp feeds patterns to ZenHanRegexp and compares the rewritten
// pattern with the expected text. Each row is {input pattern, expected output}.
//
// In the expected output every ASCII letter or digit is paired with its
// fullwidth form (U+FF10-U+FF19, U+FF21-U+FF3A, U+FF41-U+FF5A), halfwidth
// first. Rows whose input is written in fullwidth must give the same result
// as their halfwidth twin.
//
// NOTE(review): the expected strings rely on the exact text produced by
// regexp/syntax's Regexp.String (e.g. "(?-s:.)" and "$"); newer Go releases
// may print flag groups differently — confirm against the toolchain in use.
func TestZenHanRegexp(t *testing.T) {
	testPatterns := [][]string{
		// literal
		{`1923`, `[11][99][22][33]`},
		{`abc`, `[aa][bb][cc]`},
		{`aBc`, `[aa][BB][cc]`},
		{`ABC`, `[AA][BB][CC]`},
		{`abc`, `[aa][bb][cc]`},
		{`ABC`, `[AA][BB][CC]`},
		{`abc.*`, `[aa][bb][cc](?-s:.)*`},
		{`あいうえおabc.*`, `あいうえお[aa][bb][cc](?-s:.)*`},

		// char class
		{`[a-z]`, `[a-za-z]`},
		{`[0-9a-zA-Z]`, `[0-90-9A-ZA-Za-za-z]`},
		{`[a-z]`, `[a-za-z]`},
		{`[A-z]`, `[A-z]`},
		{`[acz]`, `[aacczz]`},
		{`\ba`, `\b[aa]`},
		{`[[:alpha:]]`, `[A-ZA-Za-za-z]`},
		{`\d`, `[0-90-9]`},

		// complex
		{`\Qaiueo\E`, `[aa][ii][uu][ee][oo]`},
		{`test(test)(あいうえおb)+[a-z]{0,4}$`, `[tt][ee][ss][tt]([tt][ee][ss][tt])(あいうえお[bb])+[a-za-z]{0,4}$`},
	}

	for _, pt := range testPatterns {
		out, err := ZenHanRegexp(pt[0])
		if err != nil {
			t.Errorf("parse error for %s, %s", pt[0], err.Error())
			// The output is meaningless after an error; do not also report
			// a misleading mismatch for the same row.
			continue
		}
		if out != pt[1] {
			t.Errorf("original %s: %s expected to be %s", pt[0], out, pt[1])
		}
	}
}

実装

package util

import (
	"bytes"
	"fmt"
	"regexp/syntax"
)

// ZenHanRegexp parses the regular expression src, passes the parsed tree
// through replaceZenHan, and returns the string form of the rewritten tree.
//
// An empty src yields an empty result without parsing. A pattern that fails
// to parse returns the parser's error unchanged. If the rewrite produces no
// expression at all, an error is returned instead of dereferencing nil.
func ZenHanRegexp(src string) (string, error) {
	if src == "" {
		return "", nil
	}
	// NOTE(review): syntax.Simple is documented as "regexp contains no counted
	// repetition"; it presumably has no effect as a Parse flag — confirm
	// before removing it.
	r, err := syntax.Parse(src, syntax.PerlX|syntax.Simple)
	if err != nil {
		return "", err
	}
	newR := replaceZenHan(r)
	if newR == nil {
		// Calling String on a nil *syntax.Regexp would panic.
		return "", fmt.Errorf("zenhan: rewriting pattern %q produced no expression", src)
	}
	return newR.String(), nil
}

// replaceZenHan returns a rewritten copy of the expression tree rooted at re.
//
// Literal nodes are handed to createZenHanLiteral and character-class nodes
// to createZenHanCharClass; every other node is copied and its children are
// rewritten recursively. A nil re yields nil. If a helper returns nil, the
// unmodified copy of that node is kept so the resulting tree never contains
// nil children.
//
// The input tree is left untouched: the children are written into a freshly
// allocated slice, because the Sub slice of a struct copy still shares its
// backing array with the original node.
func replaceZenHan(re *syntax.Regexp) *syntax.Regexp {
	if re == nil {
		return nil
	}

	copied := *re
	switch copied.Op {
	case syntax.OpLiteral:
		if out := createZenHanLiteral(&copied); out != nil {
			return out
		}
		return &copied
	case syntax.OpCharClass:
		if out := createZenHanCharClass(&copied); out != nil {
			return out
		}
		return &copied
	}

	if len(re.Sub) > 0 {
		copied.Sub = make([]*syntax.Regexp, len(re.Sub))
		for i, cr := range re.Sub {
			copied.Sub[i] = replaceZenHan(cr)
		}
	}
	return &copied
}

func createZenHanLiteral(re *syntax.Regexp) *syntax.Regexp {
	var buf bytes.Buffer
	for _, r := range re.Rune {
		buf.Write([]byte(runeToZenHanString(r)))
	}
	newRe, _ := syntax.Parse(buf.String(), 0)
	return newRe
}

// runeToZenHanString returns regular-expression source text for the single
// rune r.
//
// A rune present in allRuneMap becomes a bracket expression holding both
// members of its pair. Any other rune is returned as itself, with a leading
// backslash when it is a regular-expression metacharacter, so that the text
// still denotes the literal rune once parsed.
func runeToZenHanString(r rune) string {
	if runes, found := allRuneMap[r]; found {
		return fmt.Sprintf("[%c%c]", runes[0], runes[1])
	}
	switch r {
	case '\\', '.', '+', '*', '?', '(', ')', '|', '[', ']', '{', '}', '^', '$':
		return `\` + string(r)
	}
	return string(r)
}

// createZenHanCharClass returns a copy of the character-class node re whose
// range list has been rebuilt: each lo/hi pair of re.Rune is passed through
// classRunesToZenHanString and the results are concatenated in order. A
// trailing unpaired rune is ignored. re itself is not modified.
func createZenHanCharClass(re *syntax.Regexp) *syntax.Regexp {
	var expanded bytes.Buffer
	for lo := 0; lo+1 < len(re.Rune); lo += 2 {
		expanded.WriteString(classRunesToZenHanString(re.Rune[lo], re.Rune[lo+1]))
	}

	out := *re
	out.Rune = []rune(expanded.String())
	return &out
}

// classRunesToZenHanString returns the runes that replace the character-class
// range start..end, encoded as a string.
//
// The range is expanded through convertZenHanCharClass only when both
// endpoints belong to the same table (lowercase letters, uppercase letters,
// or digits). Any other range is returned unchanged as its two endpoints.
func classRunesToZenHanString(start, end rune) string {
	families := [...]map[rune][2]rune{lowerAlphaRuneMap, upperAlphaRuneMap, digitRuneMap}
	for _, family := range families {
		if _, ok := family[start]; !ok {
			continue
		}
		if _, ok := family[end]; !ok {
			continue
		}
		return convertZenHanCharClass(start, end)
	}

	return string([]rune{start, end})
}

// convertZenHanCharClass looks up the pairs registered in allRuneMap for the
// two range endpoints and returns four runes as a string: the first members
// of the start and end pairs, followed by the second members of both. Read as
// lo/hi pairs, that is one range per member of the pair.
func convertZenHanCharClass(start, end rune) string {
	lo, hi := allRuneMap[start], allRuneMap[end]

	out := make([]rune, 0, 2*len(lo))
	for col := range lo {
		out = append(out, lo[col], hi[col])
	}
	return string(out)
}

// runeMapExists reports whether r is a key of m. A nil map contains no keys.
func runeMapExists(m map[rune][2]rune, r rune) bool {
	if _, ok := m[r]; ok {
		return true
	}
	return false
}

// Lookup tables, filled once by init.
//
// Every pair is {halfwidth rune, fullwidth rune}. Each *RuneMap maps both
// members of a pair to that pair, so a lookup works regardless of the width
// of the rune being looked up.
var (
	lowerAlphaPairs   [][2]rune        // a-z with their fullwidth forms
	upperAlphaPairs   [][2]rune        // A-Z with their fullwidth forms
	digitPairs        [][2]rune        // 0-9 with their fullwidth forms
	lowerAlphaRuneMap map[rune][2]rune // either width of a-z -> pair
	upperAlphaRuneMap map[rune][2]rune // either width of A-Z -> pair
	digitRuneMap      map[rune][2]rune // either width of 0-9 -> pair
	allRuneMap        map[rune][2]rune // union of the three maps above
)

// init builds the pair lists and the lookup maps.
//
// The fullwidth forms are written as code-point escapes ('\uFF41' is
// fullwidth 'a', '\uFF21' fullwidth 'A', '\uFF10' fullwidth '0') rather than
// as literal characters, so that width normalisation of the source text
// cannot silently turn them back into ASCII and collapse every pair into two
// identical runes.
func init() {
	// create map
	allRuneMap = map[rune][2]rune{}
	lowerAlphaRuneMap = map[rune][2]rune{}
	upperAlphaRuneMap = map[rune][2]rune{}
	digitRuneMap = map[rune][2]rune{}

	// register indexes every pair under both of its runes, in the given
	// table and in allRuneMap.
	register := func(table map[rune][2]rune, pairs [][2]rune) {
		for _, pair := range pairs {
			for _, r := range pair {
				table[r] = pair
				allRuneMap[r] = pair
			}
		}
	}

	// init alphabets
	for i := rune(0); i < 26; i++ {
		lowerAlphaPairs = append(lowerAlphaPairs, [2]rune{'a' + i, '\uFF41' + i})
		upperAlphaPairs = append(upperAlphaPairs, [2]rune{'A' + i, '\uFF21' + i})
	}
	register(lowerAlphaRuneMap, lowerAlphaPairs)
	register(upperAlphaRuneMap, upperAlphaPairs)

	// init digits
	for i := rune(0); i < 10; i++ {
		digitPairs = append(digitPairs, [2]rune{'0' + i, '\uFF10' + i})
	}
	register(digitRuneMap, digitPairs)
}

6
7
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do by signing up
6
7

Delete article

Deleted articles cannot be recovered.

The draft of this article will also be deleted.

Are you sure you want to delete this article?