
Reading CPython (1): The Boundary Between the Parser and the Tokenizer

Posted at 2021-01-31

The source version used here is Python 3.9.1.

Structures

The structures that matter at this boundary are Parser and Token (Parser/pegen/pegen.h) and struct tok_state (Parser/tokenizer.h).

typedef struct {
    struct tok_state *tok;
    Token **tokens;
    int mark;
    int fill, size;
    PyArena *arena;
    KeywordToken **keywords;
    int n_keyword_lists;
    int start_rule;
    int *errcode;
    int parsing_started;
    PyObject* normalize;
    int starting_lineno;
    int starting_col_offset;
    int error_indicator;
    int flags;
    int feature_version;
    growable_comment_array type_ignore_comments;
    Token *known_err_token;
    int level;
    int call_invalid_rules;
} Parser;

/* Tokenizer state */
struct tok_state {
    /* Input state; buf <= cur <= inp <= end */
    /* NB an entire line is held in the buffer */
    char *buf;          /* Input buffer, or NULL; malloc'ed if fp != NULL */
    char *cur;          /* Next character in buffer */
    char *inp;          /* End of data in buffer */
    const char *end;    /* End of input buffer if buf != NULL */
    const char *start;  /* Start of current token if not NULL */
    int done;           /* E_OK normally, E_EOF at EOF, otherwise error code */
    /* NB If done != E_OK, cur must be == inp!!! */
    FILE *fp;           /* Rest of input; NULL if tokenizing a string */
    int tabsize;        /* Tab spacing */
    int indent;         /* Current indentation index */
    int indstack[MAXINDENT];            /* Stack of indents */
    int atbol;          /* Nonzero if at begin of new line */
    int pendin;         /* Pending indents (if > 0) or dedents (if < 0) */
    const char *prompt, *nextprompt;          /* For interactive prompting */
    int lineno;         /* Current line number */
    int first_lineno;   /* First line of a single line or multi line string
                           expression (cf. issue 16806) */
    int level;          /* () [] {} Parentheses nesting level */
            /* Used to allow free continuations inside them */
    char parenstack[MAXLEVEL];
    int parenlinenostack[MAXLEVEL];
    PyObject *filename;
    /* Stuff for checking on different tab sizes */
    int altindstack[MAXINDENT];         /* Stack of alternate indents */
    /* Stuff for PEP 0263 */
    enum decoding_state decoding_state;
    int decoding_erred;         /* whether erred in decoding  */
    int read_coding_spec;       /* whether 'coding:...' has been read  */
    char *encoding;         /* Source encoding. */
    int cont_line;          /* whether we are in a continuation line. */
    const char* line_start;     /* pointer to start of current line */
    const char* multi_line_start; /* pointer to start of first line of
                                     a single line or multi line string
                                     expression (cf. issue 16806) */
    PyObject *decoding_readline; /* open(...).readline */
    PyObject *decoding_buffer;
    const char* enc;        /* Encoding for the current str. */
    char* str;
    char* input;       /* Tokenizer's newline translated copy of the string. */

    int type_comments;      /* Whether to look for type comments */

    /* async/await related fields (still needed depending on feature_version) */
    int async_hacks;     /* =1 if async/await aren't always keywords */
    int async_def;        /* =1 if tokens are inside an 'async def' body. */
    int async_def_indent; /* Indentation level of the outermost 'async def'. */
    int async_def_nl;     /* =1 if the outermost 'async def' had at least one
                             NEWLINE token after it. */
};

typedef struct {
    int type;
    PyObject *bytes;
    int lineno, col_offset, end_lineno, end_col_offset;
    Memo *memo;
} Token;

_PyPegen_expect_token(Parser *p, int type)

  • Defined in Parser/pegen/pegen.c (it is called from the generated parse.c).
  • Calls _PyPegen_fill_token(Parser *p) when the token at p->mark has not been read into the buffer yet.
  • Checks whether that token is of the requested type; if it is, the Token is returned (and p->mark is advanced), otherwise NULL is returned.
  • The Token comes from the parser's token buffer: Token *t = p->tokens[p->mark];
  • p is the Parser shown above. Putting all of this together, the function looks roughly as follows.
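A condensed sketch, read from the pegen.c source (assertions and some details omitted; treat it as a sketch rather than a verbatim copy):

Token *
_PyPegen_expect_token(Parser *p, int type)
{
    if (p->mark == p->fill) {              /* token not in the buffer yet */
        if (_PyPegen_fill_token(p) < 0) {  /* read it; bail out on error */
            p->error_indicator = 1;
            return NULL;
        }
    }
    Token *t = p->tokens[p->mark];
    if (t->type != type) {                 /* not the requested token type */
        return NULL;
    }
    p->mark += 1;                          /* consume the token */
    return t;
}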

_PyPegen_fill_token(Parser *p)

  • Returns an int status code (0 on success, -1 on error).
  • Calls int type = PyTokenizer_Get(p->tok, &start, &end); to obtain the next token.
  • A while (type == TYPE_IGNORE) loop skips tokens the grammar itself never consumes; TYPE_IGNORE tokens are # type: ignore comments.
  • The skipped comments are accumulated in p->type_ignore_comments, as shown in the sketch below.
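A condensed sketch of that flow. The real function also grows the p->tokens array, allocates the memo, and does more error checking; growable_comment_array_add() is the helper in pegen.c that appends an entry to p->type_ignore_comments.

int
_PyPegen_fill_token(Parser *p)
{
    const char *start;
    const char *end;
    int type = PyTokenizer_Get(p->tok, &start, &end);

    /* Skip '# type: ignore' comments, stashing their text and line
       number in p->type_ignore_comments. */
    while (type == TYPE_IGNORE) {
        Py_ssize_t len = end - start;
        char *tag = PyMem_Malloc(len + 1);
        if (tag == NULL) {
            return -1;
        }
        strncpy(tag, start, len);
        tag[len] = '\0';
        growable_comment_array_add(&p->type_ignore_comments, p->tok->lineno, tag);
        type = PyTokenizer_Get(p->tok, &start, &end);
    }

    /* The token type, its text (bytes) and its source position are then
       stored into p->tokens[p->fill], and p->fill is incremented. */
    return 0;
}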

PyTokenizer_Get(struct tok_state *tok, const char **p_start, const char **p_end)

  • Defined in tokenizer.c.
  • It only takes care of error handling (a decoding error turns the result into ERRORTOKEN).
  • The real work is done by tok_get(); the wrapper itself is sketched below.
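The whole function is essentially this thin wrapper (a sketch of the tokenizer.c code):

int
PyTokenizer_Get(struct tok_state *tok, const char **p_start, const char **p_end)
{
    int result = tok_get(tok, p_start, p_end);
    if (tok->decoding_erred) {       /* the only extra work: decoding errors */
        result = ERRORTOKEN;
        tok->done = E_DECODE;
    }
    return result;
}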

tok_get(struct tok_state *tok, const char **p_start, const char **p_end)

  • Reads the input one character at a time via tok_nextc().
  • Handles the indentation level (emitting INDENT/DEDENT tokens).
  • This is the core of token extraction. It is long, so it will be explained separately; a tiny preview fragment follows below.
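As a preview, here is a heavily simplified fragment in the style of tok_get(), showing how an identifier (NAME token) is picked out character by character. The real code additionally handles keywords such as async/await, string prefixes, and non-ASCII identifiers.

    /* tok_nextc() returns the next character and advances tok->cur;
       tok_backup() pushes one character back into the buffer. */
    c = tok_nextc(tok);
    tok->start = tok->cur - 1;          /* the token starts at the character just read */
    if (is_potential_identifier_start(c)) {
        while (is_potential_identifier_char(c)) {
            c = tok_nextc(tok);
        }
        tok_backup(tok, c);             /* the last character read is not part of the NAME */
        *p_start = tok->start;
        *p_end = tok->cur;
        return NAME;
    }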

Where is Parser *p created (malloc'ed)?

_PyPegen_run_parser_from_file_pointer(FILE *fp, int start_rule, PyObject *filename_ob, ...)

  • Defined in pegen.c.
  • struct tok_state *tok = PyTokenizer_FromFile(fp, enc, ps1, ps2); hands the interactive prompts and the input FILE pointer over to the tok_state.
  • The Parser is then created by calling the following:
Parser *
_PyPegen_Parser_New(struct tok_state *tok, int start_rule, int flags,
                    int feature_version, int *errcode, PyArena *arena)
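This is where the Parser is actually allocated. A condensed sketch of _PyPegen_Parser_New() (most field initializations and error checks omitted):

Parser *
_PyPegen_Parser_New(struct tok_state *tok, int start_rule, int flags,
                    int feature_version, int *errcode, PyArena *arena)
{
    Parser *p = PyMem_Malloc(sizeof(Parser));   /* <-- the Parser is malloc'ed here */
    if (p == NULL) {
        return (Parser *) PyErr_NoMemory();
    }
    p->tok = tok;                               /* the tok_state made by PyTokenizer_FromFile() */
    p->tokens = PyMem_Malloc(sizeof(Token *));  /* token buffer; grown as tokens are read */
    p->tokens[0] = PyMem_Calloc(1, sizeof(Token));
    p->mark = 0;
    p->fill = 0;
    p->size = 1;
    p->errcode = errcode;
    p->arena = arena;
    p->start_rule = start_rule;
    p->flags = flags;
    p->feature_version = feature_version;
    /* ...the remaining fields (keywords, error_indicator, level, the
       type_ignore_comments array, ...) are initialized here as well... */
    return p;
}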