LoginSignup
1
1

More than 3 years have passed since last update.

C#の正規表現Regexの内部を覗いてみた(RegexCode)

Last updated at Posted at 2019-09-15

目的

正規表現でやらかすことがそこそこあるので、c#のRegexが解釈した正規表現を可視化したい。
図示はできていないが、内部構造を出力できた。

アプローチ

マッチした部分を可視化しようと思ったが(※)、オートマトン的なものが表示できたほうがうれしい気がしたので、ILSpy.exeでRegexクラス周辺をあさってみた。
(※:結局やってみた)

private Regex(string pattern, RegexOptions options, TimeSpan matchTimeout, bool useCache)

に下記のようなコードがあったので、下記3行目のcodeが使えそうと判断した。

Regexのコードから抜粋
// 注意:要所だけ抜粋して、間のコードも消してます。
1 if (cachedCodeEntry == null) {
2     RegexTree regexTree = RegexParser.Parse(pattern, roptions);
3     code = RegexWriter.Write(regexTree);
4     regexTree = null;
5 }
6 if (UseOptionC() && factory == null) {
7     code = null;
8 }

で、このメンバ codeRegexinternal なフィールドなのですが、
リフレクションで強引にアクセスできます。(普通はやっちゃダメ)

codeのクラスRegexCodeを見てみると、


internal const int Goto = 38;
internal int[] _codes;
internal string[] _strings;

オペコードっぽい定義がある。
というわけでこれを表示してみます。

出力結果

正規表現 abc(de)f を分析にかけてみた。
※ [a-c]とかをかけると、stringsに表示できないデータが含まれるので注意。

>regexReflectionTest.exe abc(de)f
--codes--
n:17
0:Lazybranch(16)
2:Setmark
3:Multi(0)
5:Setmark
6:Multi(1)
8:Capturemark(1,-1)
11:One(102)
13:Capturemark(0,-1)
16:Stop
--strings--
n:2
0:abc
1:de

どうやら
Multi は部分文字列を示す。(直後の_code[i+1]_stringsのindex)
One は文字を示す。(直後の_code[i+1]が文字コード)
SetmarkとCapturemarkはMatchGroupsで取り出せる情報と関係していそう。
といった感じ。

環境

RegexCodeクラスのコードが変わると正常に動作しない懸念があります。
下記で特定できているのか分からんですが、環境を晒しておきます。


C:\Windows\Microsoft.NET\Framework64\v4.0.30319\csc.exe
Microsoft (R) Visual C# Compiler version 4.8.3752.0
for C# 5

ソースコード

ソースコード
regexReflectionTest.cs

using System;
using System.Drawing;
using System.Text.RegularExpressions;
using System.Runtime.InteropServices;
using System.Windows.Forms;
using System.Reflection;

class Test
{
    enum RegexCodeOpCode {
        Onerep = 0,
        Notonerep = 1,
        Setrep = 2,
        Oneloop = 3,
        Notoneloop = 4,
        Setloop = 5,
        Onelazy = 6,
        Notonelazy = 7,
        Setlazy = 8,
        One = 9,
        Notone = 10,
        Set = 11,
        Multi = 12,
        Ref = 13,
        Bol = 14,
        Eol = 15,
        Boundary = 16,
        Nonboundary = 17,
        Beginning = 18,
        Start = 19,
        EndZ = 20,
        End = 21,
        Nothing = 22,
        Lazybranch = 23,
        Branchmark = 24,
        Lazybranchmark = 25,
        Nullcount = 26,
        Setcount = 27,
        Branchcount = 28,
        Lazybranchcount = 29,
        Nullmark = 30,
        Setmark = 31,
        Capturemark = 32,
        Getmark = 33,
        Setjump = 34,
        Backjump = 35,
        Forejump = 36,
        Testref = 37,
        Goto = 38,
        Prune = 39,
        Stop = 40,
        ECMABoundary = 41,
        NonECMABoundary = 42,
        Mask = 63,
        Rtl = 64,
        Back           = 128,  // bit to indicate that we're backtracking.
        Back2          = 256,  // bit to indicate that we're backtracking on a second branch.
        Ci             = 512,  
    }


    [STAThread]
    public static void Main(string[] args)
    {
        if(args.Length==0)return;

        Regex r=null;

        try{
            r = new Regex(args[0]);
        }
        catch(Exception e){
            Console.WriteLine(e);
            return;
        }

        Type typeR = r.GetType();
        FieldInfo fieldRC = typeR.GetField("code", BindingFlags.Public | BindingFlags.Instance | BindingFlags.NonPublic);

        dynamic rc = fieldRC.GetValue(r);

        Type typeRC = rc.GetType();
        FieldInfo fieldRcCodes  = typeRC.GetField("_codes", BindingFlags.Public | BindingFlags.Instance | BindingFlags.NonPublic);
        FieldInfo fieldRcString = typeRC.GetField("_strings", BindingFlags.Public | BindingFlags.Instance | BindingFlags.NonPublic);

        MethodInfo miOpcodeSize = typeRC.GetMethod("OpcodeSize", BindingFlags.Static | BindingFlags.Public | BindingFlags.NonPublic);


        int[] t    = (int[])fieldRcCodes.GetValue(rc);
        string[] s = (string[])fieldRcString.GetValue(rc);

        Console.WriteLine("--codes--");

        Console.Write("n:");
        Console.WriteLine(t.Length);
        for(int i=0;i<t.Length;i++) {
            int x = t[i];
            Console.Write(i);
            Console.Write(":");
            Console.Write((RegexCodeOpCode)x);

            int argCount = ((int)miOpcodeSize.Invoke(null, new object[] {x} )) - 1;

            if (argCount > 0) {
                Console.Write("(");
                for(int j=i+1;j<=i+argCount&&j<t.Length;j++) {
                    if(j>i+1){Console.Write(",");}
                    Console.Write(t[j]);
                }
                Console.Write(")");
                i += argCount;
            }
            Console.WriteLine();
        }

        Console.WriteLine("--strings--");

        Console.Write("n:");
        Console.WriteLine(s.Length);
        for(int i=0;i<s.Length;i++) {
            Console.Write(i);
            Console.Write(":");
            Console.WriteLine(s[i]??"<<NULL>>");
        }
    }
}

参考サイト

分析追記

a|bb を分析してみた。


--codes--
n:15
0:Lazybranch(14)
2:Setmark
3:Lazybranch(9)
5:One(97)
7:Goto(11)
9:Multi(0)
11:Capturemark(0,-1)
14:Stop
--strings--
n:1
0:bb

↓の図解は手書き。
image.png

さらにその後

グラフ表示させてみた。※箱は手動でドラッグして配置
image.png


using System;
using System.Drawing;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
using System.Runtime.InteropServices;
using System.Windows.Forms;
using System.Reflection;


public class MyNode
{
    int _codeIndex; // RegexCode._codeのindex
    int _selfIndex; // nodeとしてのindex
    List<int> _destCodeIndices;
    int[] _destNodeIndices; // nodeとしてのindex
    string _textOpCode;
    string _text;
    bool _charClassNotFlag;

    public int CodeIndex{get{return _codeIndex;}}
    public int[] NextIndices{get{return _destNodeIndices;}} // 横着...

    public bool CharClassNotFlag{get{return _charClassNotFlag;}}

    public string Text{get{return _text;}}
    public string TextOpCode{get{return _textOpCode;}}

    public enum RegexCodeOpCode {
        Onerep = 0,//
        Notonerep = 1,
        Setrep = 2,//
        Oneloop = 3,//
        Notoneloop = 4,
        Setloop = 5,
        Onelazy = 6,//
        Notonelazy = 7,
        Setlazy = 8,//
        One = 9,//
        Notone = 10,//
        Set = 11,//
        Multi = 12,//
        Ref = 13,
        Bol = 14,
        Eol = 15,
        Boundary = 16,
        Nonboundary = 17,
        Beginning = 18,
        Start = 19,
        EndZ = 20,
        End = 21,
        Nothing = 22,
        Lazybranch = 23,//
        Branchmark = 24,//
        Lazybranchmark = 25,
        Nullcount = 26,
        Setcount = 27,
        Branchcount = 28,
        Lazybranchcount = 29,
        Nullmark = 30,//
        Setmark = 31,//
        Capturemark = 32,//
        Getmark = 33,
        Setjump = 34,
        Backjump = 35,
        Forejump = 36,
        Testref = 37,
        Goto = 38,//
        Prune = 39,
        Stop = 40,//
        ECMABoundary = 41,
        NonECMABoundary = 42,
        Mask = 63,
        Rtl = 64,
        Back           = 128,  // bit to indicate that we're backtracking.
        Back2          = 256,  // bit to indicate that we're backtracking on a second branch.
        Ci             = 512,  
    }


    public MyNode(int selfIndex, int codeIndex, RegexCodeOpCode opCode, int[] args, string[] strings)
    {
        _selfIndex = selfIndex;
        _codeIndex = codeIndex;
        _text = "";
        _textOpCode = "";

        switch ( opCode ) {
        case RegexCodeOpCode.Notone:/* through */
            _charClassNotFlag = true;
            break;
        default:
            _charClassNotFlag = false;
            break;
        }

        _destCodeIndices = new List<int>();

        switch ( opCode ) {
        case RegexCodeOpCode.Lazybranch:/* through */
        case RegexCodeOpCode.Branchmark:
            _destCodeIndices.Add(codeIndex + args.Length + 1);
            _destCodeIndices.Add(args[0]);
            break;
        case RegexCodeOpCode.Goto: 
            _destCodeIndices.Add(args[0]);
            break;
        case RegexCodeOpCode.Stop:
            break;
        default:
            _destCodeIndices.Add(codeIndex + args.Length + 1);
            break;
        }

        _textOpCode = opCode.ToString();

        switch ( opCode ) {
        case RegexCodeOpCode.Lazybranch:
            _textOpCode = "LazyBr";
            break;
        case RegexCodeOpCode.One:/* through */
        case RegexCodeOpCode.Notone:
            _text = MyParseCharClass(((char)args[0]).ToString());
            break;
        case RegexCodeOpCode.Onerep: // 指定回数繰り返し
            _textOpCode += "{"+args[1].ToString()+"}";
            _text = MyParseCharClass(((char)args[0]).ToString());
            break;
        case RegexCodeOpCode.Setrep: // 指定回数繰り返し
            _textOpCode += "{"+args[1].ToString()+"}";
            _text = MyParseCharClass(strings[args[0]]);
            break;
        case RegexCodeOpCode.Oneloop: // 0~最大で指定回数繰り返し
            _textOpCode += "{0 to "+args[1].ToString()+"}";
            _text = MyParseCharClass(((char)args[0]).ToString());
            break;
        case RegexCodeOpCode.Onelazy: // 0~最大で指定回数繰り返し (最短マッチ)
            _text = MyParseCharClass(((char)args[0]).ToString());
            break;
        case RegexCodeOpCode.Setlazy: // 0~最大で指定回数繰り返し (最短マッチ)
            _text = MyParseCharClass(strings[args[0]]);
            break;
        case RegexCodeOpCode.Set:
            _text = MyParseCharClass(strings[args[0]]);
            break;
        case RegexCodeOpCode.Multi:
            _text = MyParseCharClass(strings[args[0]]);
            break;
        default:
            break;
        }
    }

    public void UpdateDestIndices(List<MyNode> nodes)
    {
        _destNodeIndices = new int[_destCodeIndices.Count];
        for ( int i = 0 ; i < _destCodeIndices.Count ; i++ ) {
            _destNodeIndices[i] = -1;
            foreach ( MyNode node in nodes ) {
                if ( node._codeIndex == _destCodeIndices[i] ) {
                    _destNodeIndices[i] = node._selfIndex;
                    break;
                }
            }
            if ( _destNodeIndices[i] == -1 ) {
                throw new Exception("bug!!!");
            }
        }
    }

    public static string MyParseCharClass(string s)
    {
        StringBuilder sb = new StringBuilder();

        foreach ( char c in s ) {
            if ( (int)c<0x20 || ((int)c>=0x7F && (int)c<=0xFF) ) {
                sb.Append("[x" + ((int)c).ToString("X02")+"]");
            }
            else {
                sb.Append(c);
            }
        }
        return sb.ToString();
    }


    public static List<MyNode> CreateNodes(Regex r)
    {
        var nodes = new List<MyNode>();

        Type typeR = r.GetType();
        FieldInfo fieldRC = typeR.GetField("code", BindingFlags.Public | BindingFlags.Instance | BindingFlags.NonPublic);

        dynamic rc = fieldRC.GetValue(r);

        Type typeRC = rc.GetType();
        FieldInfo fieldRcCodes  = typeRC.GetField("_codes", BindingFlags.Public | BindingFlags.Instance | BindingFlags.NonPublic);
        FieldInfo fieldRcString = typeRC.GetField("_strings", BindingFlags.Public | BindingFlags.Instance | BindingFlags.NonPublic);

        MethodInfo miOpcodeSize = typeRC.GetMethod("OpcodeSize", BindingFlags.Static | BindingFlags.Public | BindingFlags.NonPublic);


        int[] opCodes = (int[])fieldRcCodes.GetValue(rc);
        string[] strings = (string[])fieldRcString.GetValue(rc);

        int nodeCount = 0;

        for ( int i=0 ; i<opCodes.Length ; i++ ) {
            int x = opCodes[i];
            int argCount = ((int)miOpcodeSize.Invoke(null, new object[] {x} )) - 1;
            if ( argCount < 0 ) {
                throw new Exception("Unexcepted opcode size.");
            }

            int[] args = new int[argCount];

            for ( int j = 0 ; j < argCount ; j++ ) {
                if ( i+j+1 >= opCodes.Length ) {
                    throw new Exception("Argument parse error.");
                }
                args[j] = opCodes[i+j+1];
            }

            nodes.Add(new MyNode(nodeCount, i, (RegexCodeOpCode)x, args, strings));

            nodeCount++;
            i += argCount;
        }

        foreach ( var node in nodes ) {
            node.UpdateDestIndices(nodes);
        }

        return nodes;
    }
}


// 描画用
public class MyEdge
{
    int _fromIndex;
    int _toIndex;
    Point[] _travelPoints; // 経由点の座標 (両端座標はダミー)

    public Point[] TravelPoints{get{return _travelPoints;}} // ちょっと横着..


    public MyEdge(int fromI, int toI)
    {
        _fromIndex = fromI;
        _toIndex = toI;
        _travelPoints = new Point[2];
    }

    public void UpdateTerminalPoints(MyGraph graph)
    {
        _travelPoints[0] = graph.CenterPointOf(_fromIndex);
        _travelPoints[_travelPoints.Length-1] = graph.CenterPointOf(_toIndex);
        Point tmp0 = graph.MyIntersect(_fromIndex, _travelPoints[1]);
        Point tmp1 = graph.MyIntersect(_toIndex  , _travelPoints[_travelPoints.Length-2]);
        _travelPoints[0]                      = tmp0;
        _travelPoints[_travelPoints.Length-1] = tmp1;
    }
}

public class MyGraph
{
    List<MyNode> _nodes;
    Rectangle[] _rects; // same length of _nodes

    List<MyEdge> _edges;

    public Point CenterPointOf(int i)
    {
        return new Point( ( _rects[i].Left + _rects[i].Right  ) / 2 ,
                          ( _rects[i].Top  + _rects[i].Bottom ) / 2 );
    }

    public MyGraph(List<MyNode> nodes)
    {
        _nodes = nodes;

        _rects = new Rectangle[nodes.Count];

        for ( int i = 0 ; i < _rects.Length ; i++ ) {
            _rects[i] = new Rectangle(new Point(5+i*60, 100), new Size(50,50));
        }
        if ( _rects.Length > 0 ) {
            _rects[0].Y -= 50;
        }
        if ( _rects.Length >= 2 ) {
            _rects[_rects.Length-1].Y -= 50;
        }

        _edges = new List<MyEdge>();

        for ( int i=0 ; i<nodes.Count ; i++ ) {
            foreach ( int toI in nodes[i].NextIndices ) {
                _edges.Add(new MyEdge(i,toI));
            }
        }
    }


    public void DrawGraph(Graphics g, Font fnt)
    {
        Pen pen = new Pen(Color.Blue, 1.5f);
        pen.CustomEndCap = new System.Drawing.Drawing2D.AdjustableArrowCap(4, 4); // 矢印

        foreach ( MyEdge edge in _edges ) {
            edge.UpdateTerminalPoints(this);
            g.DrawLines(pen, edge.TravelPoints);
        }

        for ( int i = 0 ; i < _rects.Length ; i++ ) {
            Rectangle rect = _rects[i];
            if ( rect.Width >= 1 ) {
                g.FillRectangle(Brushes.White, rect);
                g.DrawRectangle(Pens.Blue,     rect);
                g.DrawString(_nodes[i].CodeIndex.ToString(), fnt, Brushes.Blue,  rect.Left+2, rect.Top);
                g.DrawString(_nodes[i].TextOpCode,           fnt, Brushes.Blue,  rect.Left+2, rect.Top+18);
                g.DrawString(_nodes[i].Text,                 fnt, Brushes.Black, rect.Left+2, rect.Top+36);
            }
        }
    }

    public int MyHitTest(Point p, out Point location)
    {
        // 昇順で検索すると、z-order(描画の順序)が奥のほうを先に拾ってしまうので、降順で検索する
        for ( int i = _rects.Length - 1 ; i>=0 ; i-- ) {
            if ( _rects[i].Contains(p) ) {
                location = _rects[i].Location;
                return i;
            }
        }
        location = new Point(0,0);
        return -1;
    }

    public void MoveNodeTo(int nodeIndex, Point p)
    {
        _rects[nodeIndex].X = p.X;
        _rects[nodeIndex].Y = p.Y;
    }

    // a が -intmax の場合はダメだけど..
    private int MyAbsInt(int a)
    {
        return (a<0)?(-a):a;
    }

    // p <---> CenterPointOf(nodeIndex)(=cと置く) をつなぐ線L(端点をcとする半直線)と、rectの交点を求める
    public Point MyIntersect(int nodeIndex, Point p)
    {
        // まず、cを中央にもつ長方形rectと、半直線Lの交点がどの線と交わるかを判定する
        // (memo: y座標は数学の座標系で考える)
        //
        // \1 /
        // 2 c 0
        // / 3\
        //
        // 傾きと、x同士,y同士の大小関係から判定する。ゼロ除算に注意する。

        Point t = new Point();
        Point c = CenterPointOf(nodeIndex);
        int W = _rects[nodeIndex].Width;
        int H = _rects[nodeIndex].Height;

        if ( W == 0 || H == 0 ) { // 例外処理
            return c;
        }
        if ( p.X == c.X && p.Y == c.Y ) { // 例外処理
            return c;
        }

        //   abs((p.Y-c.Y)/(p.X-c.X)) <= H/W  なら  領域0か2
        if ( MyAbsInt(p.Y-c.Y)*(long)W <= MyAbsInt(p.X-c.X)*(long)H ) {
            int dY = (int)((((long)W/2) * (p.Y-c.Y))/(p.X-c.X));
            // p.X==c.X なら、p.Y==c.Yであり、先の例外処理でこのifには入らないのでゼロ除算は回避できる(W>0が前提)

            if ( c.X < p.X ) {
                t.X = c.X + W/2;
                t.Y = c.Y + dY;
            }
            else{
                t.X = c.X - W/2;
                t.Y = c.Y - dY;
            }
        }
        else {
            int dX = (int)((((long)H/2) * (p.X-c.X))/(p.Y-c.Y));

            if ( c.Y < p.Y ) {
                t.X = c.X + dX;
                t.Y = c.Y + H/2;
            }
            else{
                t.X = c.X - dX;
                t.Y = c.Y - H/2;
            }
        }

        return t;
    }
}



class Test : Form
{
    MyGraph graph;
    PictureBox pct;
    const int WIDTH = 800;
    const int HEIGHT = 300;
    Font fnt;
    Point dragStartPoint;
    Point dragInitialNodeLocation;
    int dragStartNodeIndex;

    Test(Regex r)
    {   
        fnt = this.Font;
        dragStartNodeIndex = -1;

        ClientSize = new Size(WIDTH, HEIGHT);

        pct = new PictureBox();
        pct.Dock = DockStyle.Fill;
        pct.Image = new Bitmap(WIDTH, HEIGHT);
        Controls.Add(pct);

        Console.WriteLine("Creating regex nodes...");
        List<MyNode> nodes = MyNode.CreateNodes(r);
        Console.WriteLine("done.");

        graph = new MyGraph(nodes);

        Load += (sender, e)=>{RedrawGraph();};
        pct.MouseDown += Pct_MouseDown;
        pct.MouseMove += Pct_MouseMove;
        pct.MouseUp   += (sender,e)=>{dragStartNodeIndex = -1;};
    }

    void RedrawGraph()
    {
        Graphics g = Graphics.FromImage(pct.Image);
        g.Clear(Color.LightGray);
        graph.DrawGraph(g, fnt);
        g.Dispose();
        pct.Refresh();
    }

    void Pct_MouseDown(object sender, MouseEventArgs e)
    {
        if ( (e.Button & MouseButtons.Left) == MouseButtons.Left ) {
            dragStartPoint = e.Location;
            dragStartNodeIndex = graph.MyHitTest(dragStartPoint, out dragInitialNodeLocation);
        }
    }

    void Pct_MouseMove(object sender, MouseEventArgs e)
    {
        if ( (e.Button & MouseButtons.Left) == MouseButtons.Left ) {
            if ( dragStartNodeIndex >= 0 ) {
                if ( e.X > 0 && e.Y > 0 && e.X < WIDTH && e.Y < HEIGHT ) {
                    Point p = new Point();
                    p.X = dragInitialNodeLocation.X + (e.X - dragStartPoint.X);
                    p.Y = dragInitialNodeLocation.Y + (e.Y - dragStartPoint.Y);
                    graph.MoveNodeTo(dragStartNodeIndex, p);
                    RedrawGraph();
                }
            }
        }
    }

    [STAThread]
    public static void Main(string[] args)
    {
        if(args.Length==0)return;

        Regex r=null;

        Console.WriteLine("Creating regex...");
        try{
            r = new Regex(args[0]);
            Console.WriteLine("done.");
        }
        catch(Exception e){
            Console.WriteLine(e);
            return;
        }

        Application.Run(new Test(r));
    }
}

1
1
2

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
1
1