
Reading CPython (1): The Boundary Between the Parser and the Tokenizer

Posted at 2021-01-31

The source version used here is Python 3.9.1.

Structures

The structures that matter at this boundary are Parser and Token (Parser/pegen/pegen.h) and struct tok_state (Parser/tokenizer.h).

typedef struct {
    struct tok_state *tok;
    Token **tokens;
    int mark;
    int fill, size;
    PyArena *arena;
    KeywordToken **keywords;
    int n_keyword_lists;
    int start_rule;
    int *errcode;
    int parsing_started;
    PyObject* normalize;
    int starting_lineno;
    int starting_col_offset;
    int error_indicator;
    int flags;
    int feature_version;
    growable_comment_array type_ignore_comments;
    Token *known_err_token;
    int level;
    int call_invalid_rules;
} Parser;

/* Tokenizer state */
struct tok_state {
    /* Input state; buf <= cur <= inp <= end */
    /* NB an entire line is held in the buffer */
    char *buf;          /* Input buffer, or NULL; malloc'ed if fp != NULL */
    char *cur;          /* Next character in buffer */
    char *inp;          /* End of data in buffer */
    const char *end;    /* End of input buffer if buf != NULL */
    const char *start;  /* Start of current token if not NULL */
    int done;           /* E_OK normally, E_EOF at EOF, otherwise error code */
    /* NB If done != E_OK, cur must be == inp!!! */
    FILE *fp;           /* Rest of input; NULL if tokenizing a string */
    int tabsize;        /* Tab spacing */
    int indent;         /* Current indentation index */
    int indstack[MAXINDENT];            /* Stack of indents */
    int atbol;          /* Nonzero if at begin of new line */
    int pendin;         /* Pending indents (if > 0) or dedents (if < 0) */
    const char *prompt, *nextprompt;          /* For interactive prompting */
    int lineno;         /* Current line number */
    int first_lineno;   /* First line of a single line or multi line string
                           expression (cf. issue 16806) */
    int level;          /* () [] {} Parentheses nesting level */
            /* Used to allow free continuations inside them */
    char parenstack[MAXLEVEL];
    int parenlinenostack[MAXLEVEL];
    PyObject *filename;
    /* Stuff for checking on different tab sizes */
    int altindstack[MAXINDENT];         /* Stack of alternate indents */
    /* Stuff for PEP 0263 */
    enum decoding_state decoding_state;
    int decoding_erred;         /* whether erred in decoding  */
    int read_coding_spec;       /* whether 'coding:...' has been read  */
    char *encoding;         /* Source encoding. */
    int cont_line;          /* whether we are in a continuation line. */
    const char* line_start;     /* pointer to start of current line */
    const char* multi_line_start; /* pointer to start of first line of
                                     a single line or multi line string
                                     expression (cf. issue 16806) */
    PyObject *decoding_readline; /* open(...).readline */
    PyObject *decoding_buffer;
    const char* enc;        /* Encoding for the current str. */
    char* str;
    char* input;       /* Tokenizer's newline translated copy of the string. */

    int type_comments;      /* Whether to look for type comments */

    /* async/await related fields (still needed depending on feature_version) */
    int async_hacks;     /* =1 if async/await aren't always keywords */
    int async_def;        /* =1 if tokens are inside an 'async def' body. */
    int async_def_indent; /* Indentation level of the outermost 'async def'. */
    int async_def_nl;     /* =1 if the outermost 'async def' had at least one
                             NEWLINE token after it. */
};

typedef struct {
    int type;
    PyObject *bytes;
    int lineno, col_offset, end_lineno, end_col_offset;
    Memo *memo;
} Token;

_PyPegen_expect_token(Parser *p, int type)

  • Defined in Parser/pegen/pegen.c (it is called from the generated parse.c).
  • Calls _PyPegen_fill_token(Parser *p) when the token at p->mark has not been read into the buffer yet.
  • Checks whether that token is of the requested type; if it is, the Token is returned (and p->mark is advanced), otherwise NULL is returned.
  • The Token comes from the parser's token buffer: Token *t = p->tokens[p->mark];
  • p is the Parser shown above. Putting all of this together, the function looks roughly as follows.
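A condensed sketch, read from the pegen.c source (assertions and some details omitted; treat it as a sketch rather than a verbatim copy):

Token *
_PyPegen_expect_token(Parser *p, int type)
{
    if (p->mark == p->fill) {              /* token not in the buffer yet */
        if (_PyPegen_fill_token(p) < 0) {  /* read it; bail out on error */
            p->error_indicator = 1;
            return NULL;
        }
    }
    Token *t = p->tokens[p->mark];
    if (t->type != type) {                 /* not the requested token type */
        return NULL;
    }
    p->mark += 1;                          /* consume the token */
    return t;
}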

_PyPegen_fill_token(Parser *p)

  • Returns an int status code (0 on success, -1 on error).
  • Calls int type = PyTokenizer_Get(p->tok, &start, &end); to obtain the next token.
  • A while (type == TYPE_IGNORE) loop skips tokens the grammar itself never consumes; TYPE_IGNORE tokens are # type: ignore comments.
  • The skipped comments are accumulated in p->type_ignore_comments, as shown in the sketch below.
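A condensed sketch of that flow. The real function also grows the p->tokens array, allocates the memo, and does more error checking; growable_comment_array_add() is the helper in pegen.c that appends an entry to p->type_ignore_comments.

int
_PyPegen_fill_token(Parser *p)
{
    const char *start;
    const char *end;
    int type = PyTokenizer_Get(p->tok, &start, &end);

    /* Skip '# type: ignore' comments, stashing their text and line
       number in p->type_ignore_comments. */
    while (type == TYPE_IGNORE) {
        Py_ssize_t len = end - start;
        char *tag = PyMem_Malloc(len + 1);
        if (tag == NULL) {
            return -1;
        }
        strncpy(tag, start, len);
        tag[len] = '\0';
        growable_comment_array_add(&p->type_ignore_comments, p->tok->lineno, tag);
        type = PyTokenizer_Get(p->tok, &start, &end);
    }

    /* The token type, its text (bytes) and its source position are then
       stored into p->tokens[p->fill], and p->fill is incremented. */
    return 0;
}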

PyTokenizer_Get(struct tok_state *tok, const char **p_start, const char **p_end)

  • Defined in tokenizer.c.
  • It only takes care of error handling (a decoding error turns the result into ERRORTOKEN).
  • The real work is done by tok_get(); the wrapper itself is sketched below.
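The whole function is essentially this thin wrapper (a sketch of the tokenizer.c code):

int
PyTokenizer_Get(struct tok_state *tok, const char **p_start, const char **p_end)
{
    int result = tok_get(tok, p_start, p_end);
    if (tok->decoding_erred) {       /* the only extra work: decoding errors */
        result = ERRORTOKEN;
        tok->done = E_DECODE;
    }
    return result;
}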

tok_get(struct tok_state *tok, const char **p_start, const char **p_end)

  • Reads the input one character at a time via tok_nextc().
  • Handles the indentation level (emitting INDENT/DEDENT tokens).
  • This is the core of token extraction. It is long, so it will be explained separately; a tiny preview fragment follows below.
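As a preview, here is a heavily simplified fragment in the style of tok_get(), showing how an identifier (NAME token) is picked out character by character. The real code additionally handles keywords such as async/await, string prefixes, and non-ASCII identifiers.

    /* tok_nextc() returns the next character and advances tok->cur;
       tok_backup() pushes one character back into the buffer. */
    c = tok_nextc(tok);
    tok->start = tok->cur - 1;          /* the token starts at the character just read */
    if (is_potential_identifier_start(c)) {
        while (is_potential_identifier_char(c)) {
            c = tok_nextc(tok);
        }
        tok_backup(tok, c);             /* the last character read is not part of the NAME */
        *p_start = tok->start;
        *p_end = tok->cur;
        return NAME;
    }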

Where is Parser *p created (malloc'ed)?

_PyPegen_run_parser_from_file_pointer(FILE *fp, int start_rule, PyObject *filename_ob, ...)

  • Defined in pegen.c.
  • struct tok_state *tok = PyTokenizer_FromFile(fp, enc, ps1, ps2); hands the interactive prompts and the input FILE pointer over to the tok_state.
  • The Parser is then created by calling the following:
Parser *
_PyPegen_Parser_New(struct tok_state *tok, int start_rule, int flags,
                    int feature_version, int *errcode, PyArena *arena)
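This is where the Parser is actually allocated. A condensed sketch of _PyPegen_Parser_New() (most field initializations and error checks omitted):

Parser *
_PyPegen_Parser_New(struct tok_state *tok, int start_rule, int flags,
                    int feature_version, int *errcode, PyArena *arena)
{
    Parser *p = PyMem_Malloc(sizeof(Parser));   /* <-- the Parser is malloc'ed here */
    if (p == NULL) {
        return (Parser *) PyErr_NoMemory();
    }
    p->tok = tok;                               /* the tok_state made by PyTokenizer_FromFile() */
    p->tokens = PyMem_Malloc(sizeof(Token *));  /* token buffer; grown as tokens are read */
    p->tokens[0] = PyMem_Calloc(1, sizeof(Token));
    p->mark = 0;
    p->fill = 0;
    p->size = 1;
    p->errcode = errcode;
    p->arena = arena;
    p->start_rule = start_rule;
    p->flags = flags;
    p->feature_version = feature_version;
    /* ...the remaining fields (keywords, error_indicator, level, the
       type_ignore_comments array, ...) are initialized here as well... */
    return p;
}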