More than 5 years have passed since last update.

Menhir と ocamllex のサンプルコードをいくつか

Posted at 2018-03-04

はじめに

プログラムを書いていると、ちょっとした言語処理系を作りたくなることがあります。ちょっとした言語処理系を作ろうとしたとき、OCamlは良い選択肢です。何故良い選択肢なのかはよくわかりませんが、実際書いてみて快適でしたし、えらい人もそう言ってましたし、まあいいや、とにかく良い選択肢です。

しかし、素のOCamlで構文解析器や字句解析器を作るのはしんどいです。手を抜こうと思ったら構文解析器生成器Menhirと字句解析器生成器ocamllexを使うのが常套手段でしょう。

が、普通の人はそう頻繁に新しく言語処理系を作ったりしませんので、 Menhir や ocamllex の使い方を忘れがちです。こうなると困ります。例えば「半年ぶりに言語処理系を作りたい気分になっちゃった」というとき、「あれっ？ Menhir ってどう書くんだっけ？」となり、こうなると気分が重くなり、結果「言語処理系つくらなくていいやー」となってしまいがちです。

というわけで気が向いたときに軽く構文解析器を作れるように、サンプルコードを残しておこうと思いました。

外部の情報

まず外部の情報です。私の記事よりはるかに信頼できるかと思います。

OCaml 公式サイトの ocamllex(とocamlyacc)のページ
- http://caml.inria.fr/pub/docs/manual-ocaml/lexyacc.html
- 下の方にサンプルコードもある
OCaml 公式サイトの Lexing のページ
- https://caml.inria.fr/pub/docs/manual-ocaml/libref/Lexing.html
Menhir公式サイト
- http://gallium.inria.fr/~fpottier/menhir/
min-camlのソースコード
- lexer.mll: https://github.com/esumii/min-caml/blob/master/lexer.mll
- parser.mly: https://github.com/esumii/min-caml/blob/master/parser.mly
OCaml のソースコード
- lexer.mll: https://github.com/ocaml/ocaml/blob/trunk/parsing/lexer.mll
- parser.mly: https://github.com/ocaml/ocaml/blob/trunk/parsing/parser.mly

サンプルコード用 Makefile

まずサンプルコード用 Makefile です。sub.ml, parser.mly, lexer.mll, main.ml から a.out を作ります。本記事のサンプルコードは全てこの Makefile で make できるようにしてあります。

Makefile

a.out: sub.ml parser.ml lexer.ml main.ml
	ocamlc sub.ml parser.mli parser.ml lexer.ml main.ml

parser.ml: parser.mly
	menhir parser.mly

lexer.ml: lexer.mll
	ocamllex lexer.mll

.PHONY: clean
clean:
	rm -f parser.ml parser.mli lexer.ml *.cmi *.cmx *.cmo *.o *~ a.out

最小の例

意味がありそうな最小構成のサンプルコードです。次のように動きます。

lexer は、1バイト読むたび、読んだ文字を表示する
parser は、reduce するたび、reduceした文字を表示する

sub.ml

(* emtpy *)

parser.mly

%{
  (* This part is inserted into the generated file *)
%}

%token <string> CHAR
%token EOF
%type <unit> program
%start program

%%

program:
| EOF { Printf.printf "parser: eof\n" }
| CHAR program { Printf.printf "parser: %s\n" $1 }

lexer.mll

{
  (* This part is inserted into the head of the generated file. *)
}

rule token = parse
| ['\x00'-'\xff'] { (* This means 'any byte' *) 
    Printf.printf "lexer: %s\n" (Lexing.lexeme lexbuf);
    Parser.CHAR (Lexing.lexeme lexbuf)
  }
| eof {
    Printf.printf "lexer: eof\n";
    Parser.EOF
  }

{
  (* This part is inserted into the end of the generated file. *)
}

main.ml

let () =
  Parser.program Lexer.token (Lexing.from_channel stdin)

実行例

$ echo -n hoge | ./a.out
lexer: h
lexer: o
lexer: g
lexer: e
lexer: eof
parser: eof
parser: e
parser: g
parser: o
parser: h

tokenizer

いわゆる tokenizer です。読み込んだファイルの中身を空白で分解し、空白以外の部分を文字列として取り出します。

sub.ml

(* emtpy *)

parser.mly

%{
  (* This part is inserted into the generated file *)
%}

%token <string> STRING
%token EOF
%type <unit> program
%start program

%%

program:
| EOF { Printf.printf "parser: eof\n" }
| STRING program { Printf.printf "parser: %s\n" $1 }

lexer.mll

{
  (* This part is inserted into the head of the generated file. *)
}

let space = ['\t' '\n' '\r' ' ']
let char = ['\x21'-'\xfe'] (* ascii graphic characters *)

rule token = parse
| space+ { token lexbuf (* use recursion to ignore *) }
| char+ as lexeme {
    Printf.printf "lexer: %s\n" lexeme;
    Parser.STRING(lexeme)
  }
| eof {
    Printf.printf "lexer: eof\n";
    Parser.EOF
  }

{
  (* This part is inserted into the end of the generated file. *)
}

main.ml

let () =
  Parser.program Lexer.token (Lexing.from_channel stdin)

実行例

$ echo "foo bar baz" | ./a.out
lexer: foo
lexer: bar
lexer: baz
lexer: eof
parser: eof
parser: baz
parser: bar
parser: foo

シンプルなS式パーサ

シンプルなS式パーサのようなものです。S式を読んで吐きます。以下の縛りがあります。

Symbol と Cons しかない
ドット対構文(正式名称はなんて言うんでしょう)は読めない
出力は全部ドット対形式
コメントはない

sub.ml

type sexp = Symbol of string | Cons of sexp * sexp

let rec print_sexp = function
  | Symbol s ->
     print_string s
  | Cons(car, cdr) ->
     print_char '(';
     print_sexp car;
     print_string " . ";
     print_sexp cdr;
     print_char ')'

parser.mly

%{
  (* This part is inserted into the generated file *)
%}

%token <string> SYMBOL
%token LPAREN RPAREN EOF
%type <Sub.sexp list> program
%type <Sub.sexp> sexp
%start program

%%

program:
| sexp* EOF { $1 }

sexp:
| SYMBOL { Sub.Symbol $1 }
| LPAREN list_body { $2 }

list_body:
| RPAREN { Sub.Symbol("nil") }
| sexp list_body { Sub.Cons($1, $2) }

lexer.mll

{
  (* This part is inserted into the head of the generated file. *)
}

let space = ['\t' '\n' '\r' ' ']
let symbol_char =
  ['!' '#' '$' '%' '&' '*' '+' '-' '.' '/' '0'-'9' '<' '=' '>' '?'
   '@' 'A'-'Z' '^' '_' 'a'-'z' '~']

rule token = parse
| space+ { token lexbuf (* use recursion to ignore *) }
| symbol_char+ as lexeme { Parser.SYMBOL (lexeme) }
| '(' { Parser.LPAREN }
| ')' { Parser.RPAREN }
| eof { Parser.EOF }

{
  (* This part is inserted into the end of the generated file. *)
}

main.ml

let () =
  let sexps = Parser.program Lexer.token (Lexing.from_channel stdin) in
  List.iter (fun sexp -> Sub.print_sexp sexp; print_newline ()) sexps

実行例

$ echo "(foo bar) (foo (bar) (baz))" | ./a.out
(foo . (bar . nil))
(foo . ((bar . nil) . ((baz . nil) . nil)))

少しリッチなS式パーサ

少しリッチなS式パーサです。以下の要素を追加しました。

数
- 数は全て OCaml の float とした
- 構文解析的には「数と見做せるシンボルは数である」とした
- 「シンボルが数と見做せるか」は float_of_string_opt で判定する
文字列リテラル
- escape 関連は OCaml の Scanf.unescaped と String.escaped に任せた
ドット対記法
quote の構文糖
ネスト可能なブロックコメント(#| |#)
行コメント(;)

sub.ml

type sexp =
  | Symbol of string
  | String of string
  | Number of float
  | Cons of sexp * sexp

let rec print_sexp outchan = function
  | Symbol s ->
     Printf.printf "%s" s
  | String s ->
     Printf.printf "\"%s\"" (String.escaped s)
  | Number f ->
     Printf.fprintf outchan "%f" f
  | Cons(car, cdr) ->
     Printf.fprintf outchan "(%a . %a)" print_sexp car print_sexp cdr

parser.mly

%{
  (* This part is inserted into the generated file *)
%}

%token <string> SYMBOL
%token <string> STRING
%token <float> NUMBER
%token LPAREN RPAREN DOT QUOTE EOF
%type <Sub.sexp list> program
%start program

%%

program:
| sexp* EOF { $1 }

sexp:
| atom { $1 }
| LPAREN list_body1 { $2 }
| QUOTE sexp { Sub.Cons(Sub.Symbol("quote"), Sub.Cons($2, Sub.Symbol("nil"))) }

atom:
| SYMBOL { Sub.Symbol($1) }
| STRING { Sub.String($1) }
| NUMBER { Sub.Number($1) }

list_body1:
| RPAREN { Sub.Symbol("nil") }
| sexp list_body2 { Sub.Cons($1, $2) }

list_body2:
| DOT sexp RPAREN { $2 }
| RPAREN { Sub.Symbol("nil") }
| sexp list_body2 { Sub.Cons($1, $2) }

lexer.mll

{
  (* This part is inserted into the head of the generated file. *)
}

let space = ['\t' '\n' '\r' ' ']
let symbol_char =
  ['!' '$' '%' '&' '*' '+' '-' '.' '/' '0'-'9' '<' '=' '>' '?'
   '@' 'A'-'Z' '^' '_' 'a'-'z' '~']

rule token = parse
| space+ { token lexbuf (* use recursion to ignore *) }
| symbol_char+ as lexeme {
    match float_of_string_opt lexeme with
    | Some f ->
        Parser.NUMBER f
    | None when lexeme = "." ->
        Parser.DOT
    | None ->
        Parser.SYMBOL lexeme
  }
| "#|" { block_comment lexbuf; token lexbuf }
| ";" { line_comment lexbuf; token lexbuf }
| '\'' { Parser.QUOTE }
| '"' ([^'"''\\'] | '\\'['\x00'-'\xff'])+ '"' {
    let lexeme = Bytes.sub_string
      lexbuf.lex_buffer
      (lexbuf.lex_start_pos + 1)
      (lexbuf.lex_curr_pos - lexbuf.lex_start_pos - 2) in
    Parser.STRING(Scanf.unescaped lexeme)
  }
| '(' { Parser.LPAREN }
| ')' { Parser.RPAREN }
| eof { Parser.EOF }

and line_comment = parse
| ('\n' | eof) { () }
| _ { line_comment lexbuf }

and block_comment = parse
| "|#" { () }
| "#|" { block_comment lexbuf; block_comment lexbuf }
| eof { () }
| _ { block_comment lexbuf }

{
  (* This part is inserted into the end of the generated file. *)
}

main.ml

let () =
  let sexps = Parser.program Lexer.token (Lexing.from_channel stdin) in
  List.iter (fun sexp -> Sub.print_sexp stdout sexp; print_newline ()) sexps

実行例

$ echo "'(foo #|( #| bar |# )|# . baz) ; comment" | ./a.out
(quote . ((foo . baz) . nil))
$ echo '(42 foo "bar\"baz")' | ./a.out
(42.000000 . (foo . ("bar\"baz" . nil)))

簡易数式パーサ

S式から離れて、二項演算子が使えるパーサです。だいたい次のような感じです。

識別子・演算子・文字列・数が使える
演算子に優先順位はない
演算子は全て左結合
「記号の列」が演算子
「英数字もしくは.の列」が「識別子もしくは数」(数か否かは float_of_string_opt で判定)
C/C++風のコメント(ブロックコメントはネスト可能)

sub.ml

type expr =
  | Bin of expr * string * expr
  | Id of string
  | String of string
  | Number of float

let rec print_expr outchan = function
  | Bin(lhs, op, rhs) ->
     Printf.printf "(%a %s %a)" print_expr lhs op print_expr rhs
  | Id(id) ->
     Printf.printf "%s" id
  | String(str) ->
     Printf.printf "%s" str
  | Number(f) ->
     Printf.printf "%f" f

parser.mly

%{
  (* This part is inserted into the generated file *)
%}

%token <string> ID
%token <string> OP
%token <string> STRING
%token <float> NUMBER
%token LPAREN RPAREN EOF
%type <Sub.expr list> program
%left OP
%start program

%%

program:
| expr* EOF { $1 }

expr:
| expr OP expr { Sub.Bin($1, $2, $3) }
| LPAREN expr RPAREN { $2 }
| ID { Sub.Id($1) }
| NUMBER { Sub.Number($1) }
| STRING { Sub.String($1) }

lexer.mll

{
  (* This part is inserted into the head of the generated file. *)
}

let space = ['\t' '\n' '\r' ' ']
let op_char =
  ['!' '#' '$' '%' '&' '\'' '*' '+' ',' '-' '.' '/'
   ':' ';' '<' '=' '>' '?' '@' '\\' '^' '`' '|' '~' ]
let id_char = ['0'-'9' 'A'-'Z' '_' 'a'-'z' '.']

rule token = parse
| space+ { token lexbuf (* use recursion to ignore *) }
| "/*" { block_comment lexbuf; token lexbuf }
| "//" { line_comment lexbuf; token lexbuf }
| op_char+ as lexeme { Parser.OP(lexeme) }
| id_char+ as lexeme {
    match float_of_string_opt lexeme with
    | Some f ->
        Parser.NUMBER f
    | None ->
        Parser.ID(Lexing.lexeme lexbuf)
  }
| '"' ([^'"''\\'] | '\\'['\x00'-'\xff'])+ '"' {
    let lexeme = Bytes.sub_string
      lexbuf.lex_buffer
      (lexbuf.lex_start_pos + 1)
      (lexbuf.lex_curr_pos - lexbuf.lex_start_pos - 2) in
    Parser.STRING(Scanf.unescaped lexeme)
  }
| '(' { Parser.LPAREN }
| ')' { Parser.RPAREN }
| eof { Parser.EOF }

and line_comment = parse
| ('\n' | eof) { () }
| _ { line_comment lexbuf }

and block_comment = parse
| "*/" { () }
| "/*" { block_comment lexbuf; block_comment lexbuf }
| eof { () }
| _ { block_comment lexbuf }

{
  (* This part is inserted into the end of the generated file. *)
}

main.ml

let () =
  let exprs = Parser.program Lexer.token (Lexing.from_channel stdin) in
  List.iter (fun expr -> Sub.print_expr stdout expr; print_newline ()) exprs

実行例

$ echo 'f = (42++"foo" /* /* comment */ */) + bar + 99 // comment' | ./a.out
(((f = (42.000000 ++ foo)) + bar) + 99.000000)

演算子に優先順がある数式パーサ

演算子の優先順位は scala と同じく最初の一文字で決まります。 scala と異なり、演算子は全て左結合です。

sub.ml

type expr =
  | Bin of expr * string * expr
  | Id of string
  | String of string
  | Number of float

let rec print_expr outchan = function
  | Bin(lhs, op, rhs) ->
     Printf.printf "(%a %s %a)" print_expr lhs op print_expr rhs
  | Id(id) ->
     Printf.printf "%s" id
  | String(str) ->
     Printf.printf "%s" str
  | Number(f) ->
     Printf.printf "%f" f

parser.mly

%{
  (* This part is inserted into the generated file *)
%}

%token <string> ID
%token <string> OP0 OP1 OP2 OP3 OP4 OP5 OP6 OP7 OP8
%token <string> STRING
%token <float> NUMBER
%token LPAREN RPAREN EOF
%type <Sub.expr list> program
%left OP8
%left OP7
%left OP6
%left OP5
%left OP4
%left OP3
%left OP2
%left OP1
%left OP0
%start program

%%

program:
| expr* EOF { $1 }

expr:
| expr OP0 expr { Sub.Bin($1, $2, $3) }
| expr OP1 expr { Sub.Bin($1, $2, $3) }
| expr OP2 expr { Sub.Bin($1, $2, $3) }
| expr OP3 expr { Sub.Bin($1, $2, $3) }
| expr OP4 expr { Sub.Bin($1, $2, $3) }
| expr OP5 expr { Sub.Bin($1, $2, $3) }
| expr OP6 expr { Sub.Bin($1, $2, $3) }
| expr OP7 expr { Sub.Bin($1, $2, $3) }
| expr OP8 expr { Sub.Bin($1, $2, $3) }
| LPAREN expr RPAREN { $2 }
| ID { Sub.Id($1) }
| NUMBER { Sub.Number($1) }
| STRING { Sub.String($1) }

lexer.mll

{
  (* This part is inserted into the head of the generated file. *)
}

let space = ['\t' '\n' '\r' ' ']
let op_char =
  ['!' '#' '$' '%' '&' '\'' '*' '+' ',' '-' '.' '/'
   ':' ';' '<' '=' '>' '?' '@' '\\' '^' '`' '|' '~' ]
let id_char = ['0'-'9' 'A'-'Z' '_' 'a'-'z' '.']

rule token = parse
| space+ { token lexbuf (* use recursion to ignore *) }
| "/*" { block_comment lexbuf; token lexbuf }
| "//" { line_comment lexbuf; token lexbuf }
| op_char+ as lexeme {
    match lexeme.[0] with
    | '*' | '/' | '%' -> Parser.OP0(lexeme)
    | '+' | '-' -> Parser.OP1(lexeme)
    | ':' -> Parser.OP2(lexeme)
    | '<' | '>' -> Parser.OP3(lexeme)
    | '=' | '!' -> Parser.OP4(lexeme)
    | '&' -> Parser.OP5(lexeme)
    | '^' -> Parser.OP6(lexeme)
    | '|' -> Parser.OP7(lexeme)
    | _ -> Parser.OP8(lexeme)) 
  }
| id_char+ as lexeme {
    match float_of_string_opt lexeme with
    | Some f ->
        Parser.NUMBER f
    | None ->
        Parser.ID(Lexing.lexeme lexbuf)
  }
| '"' ([^'"''\\'] | '\\'['\x00'-'\xff'])+ '"' {
    let lexeme = Bytes.sub_string
      lexbuf.lex_buffer
      (lexbuf.lex_start_pos + 1)
      (lexbuf.lex_curr_pos - lexbuf.lex_start_pos - 2) in
    Parser.STRING(Scanf.unescaped lexeme)
  }
| '(' { Parser.LPAREN }
| ')' { Parser.RPAREN }
| eof { Parser.EOF }

and line_comment = parse
| ('\n' | eof) { () }
| _ { line_comment lexbuf }

and block_comment = parse
| "*/" { () }
| "/*" { block_comment lexbuf; block_comment lexbuf }
| eof { () }
| _ { block_comment lexbuf }

{
  (* This part is inserted into the end of the generated file. *)
}

main.ml

let () =
  let exprs = Parser.program Lexer.token (Lexing.from_channel stdin) in
  List.iter (fun expr -> Sub.print_expr stdout expr; print_newline ()) exprs

実行例

$ echo 'f = 42&&"foo" /* /* comment */ */ + bar * 99 // comment' | ./a.out
((f = 42.000000) && (foo + (bar * 99.000000)))

さいごに

というわけでサンプルコードでした。どこかで役に立ちましたら幸いです。

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up