feat(frontend): refactor the lexer

- Add a .gitignore file to ignore compiler-generated binaries
- Refactor lexer.c, improving keyword handling (see the keyword-table sketch after this list) and string handling
- Update the front end's parser and AST files to work with the new lexer
- Rework token-related definitions and functions, introducing new token types
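For context, the keyword table touched in the first hunk is generated with an X macro. KEYWORD_TABLE itself is not part of this commit, so the entries, token names, and the linear scan below are illustrative assumptions; only the struct shape, the X(...) expansion, and the `return -1` convention mirror the diff:

#include <string.h>

/* Hypothetical entries; the real KEYWORD_TABLE lives elsewhere in the repo.
 * The trailing 0 feeds the macro's `...` parameter. */
#define KEYWORD_TABLE \
    X(if,     CSTD_C89, TOKEN_IF,     0) \
    X(while,  CSTD_C89, TOKEN_WHILE,  0) \
    X(return, CSTD_C89, TOKEN_RETURN, 0)

enum CSTD_KEYWORD { CSTD_C89, CSTD_C99 };
typedef enum { TOKEN_IF, TOKEN_WHILE, TOKEN_RETURN } cc_tktype_t;

static const struct {
    const char* name;           // spelling, produced by #name
    enum CSTD_KEYWORD std_type; // which C standard introduced it
    cc_tktype_t tok;            // token kind handed to the parser
} keywords[] = {
#define X(name, std_type, tok, ...) { #name, std_type, tok },
    KEYWORD_TABLE
#undef X
};

/* Simple linear probe; the repo's keyword_cmp may well be smarter. */
static int keyword_cmp(const char* name, int len) {
    for (int i = 0; i < (int)(sizeof(keywords) / sizeof(keywords[0])); i++) {
        if ((int)strlen(keywords[i].name) == len &&
            strncmp(keywords[i].name, name, len) == 0)
            return i;
    }
    return -1; // Not a keyword.
}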
commit 2b4857001c (parent 05c637e594)
Author: ZZY
Date: 2025-03-23 12:13:16 +08:00

33 changed files with 532 additions and 624 deletions

lexer.c

@@ -34,7 +34,7 @@ David Hanson / drh@drhanson.net
 static const struct {
     const char* name;
     enum CSTD_KEYWORD std_type;
-    tok_type_t tok;
+    cc_tktype_t tok;
 } keywords[] = {
 #define X(name, std_type, tok, ...) { #name, std_type, tok },
     KEYWORD_TABLE
@@ -74,19 +74,17 @@ static inline int keyword_cmp(const char* name, int len) {
     return -1; // Not a keyword.
 }
 
-void init_lexer(lexer_t* lexer, const char* file_name, void* stream, lexer_sread_fn sread) {
-    init_lib_core();
-    lexer->cur_ptr = lexer->end_ptr = (unsigned char*)&(lexer->buffer);
-    lexer->index = 1;
-    lexer->line = 1;
+void init_lexer(lexer_t* lexer, const char* file_name, void* stream, lexer_sread_fn sread, strpool_t* strpool) {
+    lexer->strpool = strpool;
+    lexer->cur_ptr = lexer->end_ptr = (char*)&(lexer->buffer);
+    lexer->loc.fname = strpool_intern(lexer->strpool, file_name);
+    lexer->loc.line = 1;
+    lexer->loc.col = 1;
     lexer->stream = stream;
     lexer->sread = sread;
-    for (int i = 0; i < sizeof(lexer->buffer) / sizeof(lexer->buffer[0]); i++) {
-        lexer->buffer[i] = 0;
-    }
+    rt_memset(lexer->buffer, 0, sizeof(lexer->buffer));
 }
 
 static void flush_buffer(lexer_t* lexer) {
@@ -94,7 +92,7 @@ static void flush_buffer(lexer_t* lexer) {
     for (int i = 0; i < num; i++) {
         lexer->buffer[i] = lexer->cur_ptr[i];
     }
-    lexer->cur_ptr = (unsigned char*)lexer->buffer;
+    lexer->cur_ptr = lexer->buffer;
     int read_size = LEXER_BUFFER_SIZE - num;
     // TODO rt_size_t to int maybe lose precision
@@ -128,19 +126,20 @@ static void goto_block_comment(lexer_t* lexer) {
             flush_buffer(lexer);
         }
-        if (*lexer->cur_ptr == '\0') {
+        if (lexer->cur_ptr[0] == '\0') {
             break;
         } else if (lexer->cur_ptr[0] == '*' && lexer->cur_ptr[1] == '/') {
             lexer->cur_ptr += 2;
             break;
         } else {
+            if (lexer->cur_ptr[0] == '\n') lexer->loc.line++;
             lexer->cur_ptr++;
         }
     }
 }
 
 // TODO escape character not enough
-static char got_slash(unsigned char* peek) {
+static char got_slash(char* peek) {
     switch (*peek) {
     case '\\': return '\\';
     case '\'': return '\'';
@@ -162,7 +161,7 @@ static char got_slash(unsigned char* peek) {
 static void parse_char_literal(lexer_t* lexer, tok_t* token) {
     char val = 0;
-    unsigned char* peek = lexer->cur_ptr + 1;
+    char* peek = lexer->cur_ptr + 1;
     if (*peek == '\\') {
         peek++;
         val = got_slash(peek);
@@ -172,16 +171,14 @@ static void parse_char_literal(lexer_t* lexer, tok_t* token) {
     }
     if (*peek++ != '\'') LEX_ERROR("Unclosed character literal");
+    token->val.ch = val;
     lexer->cur_ptr = peek;
-    token->val.have = 1;
-    token->type = TOKEN_CHAR_LITERAL;
-    token->val.ch = val;
 }
 
 static void parse_string_literal(lexer_t* lexer, tok_t* token) {
-    unsigned char* peek = lexer->cur_ptr + 1;
+    char* peek = lexer->cur_ptr + 1;
     // TODO string literal size check
-    char* dest = token->val.str = rt._malloc(LEXER_MAX_TOKEN_SIZE + 1);
+    static char dest[LEXER_MAX_TOKEN_SIZE + 1];
     int len = 0;
     while (*peek != '"') {
@@ -196,14 +193,15 @@ static void parse_string_literal(lexer_t* lexer, tok_t* token) {
         dest[len++] = *peek++;
     }
     dest[len] = '\0';
-    lexer->cur_ptr = peek + 1;
-    token->val.have = 1;
-    token->type = TOKEN_STRING_LITERAL;
+    lexer->cur_ptr = peek + 1; // 1 is `"`
+    lexer->loc.len = len + 2;  // 2 is `"` `"`
+    token->val.str = strpool_intern(lexer->strpool, dest);
 }
 
 // FIXME it write by AI maybe error
 static void parse_number(lexer_t* lexer, tok_t* token) {
-    unsigned char* peek = lexer->cur_ptr;
+    char* peek = lexer->cur_ptr;
     int base = 10;
     int is_float = 0;
     long long int_val = 0;
@@ -278,14 +276,15 @@ static void parse_number(lexer_t* lexer, tok_t* token) {
     }
     // store the result
     // TODO
+    lexer->loc.len = peek - lexer->cur_ptr;
     lexer->cur_ptr = peek;
-    token->val.have = 1;
     if (is_float) {
-        token->val.d = float_val;
-        token->type = TOKEN_FLOAT_LITERAL;
+        token->val.f32 = float_val;
+        token->sub_type = TOKEN_FLOAT_LITERAL;
     } else {
-        token->val.ll = int_val;
-        token->type = TOKEN_INT_LITERAL;
+        token->val.i = int_val;
+        token->sub_type = TOKEN_INT_LITERAL;
     }
 }
@@ -296,160 +295,159 @@ void get_token(lexer_t* lexer, tok_t* token) {
     if (lexer->end_ptr - lexer->cur_ptr < GOT_ONE_TOKEN_BUF_SIZE) {
         flush_buffer(lexer);
     }
-    register unsigned char* peek = lexer->cur_ptr;
-    // quickly skip whitespace
-    while (*peek == ' ' || *peek == '\t') {
-        if (peek == lexer->end_ptr) {
-            break;
-        }
-        peek++;
-    }
-    if (peek != lexer->cur_ptr) {
-        // To TOKEN_FLUSH
-        lexer->cur_ptr = peek;
-        token->type = TOKEN_FLUSH;
-    }
-    tok_type_t tok = TOKEN_INIT;
-    tok_val_t constant;
-    constant.have = 0;
+    register char* peek = lexer->cur_ptr;
+    cc_tktype_t tk_type = TOKEN_INIT;
+    ctype_t literal = { 0 };
     // once step
     switch (*peek++) {
     case '=':
         switch (*peek++) {
-        case '=': tok = TOKEN_EQ; break;
-        default: peek--, tok = TOKEN_ASSIGN; break;
+        case '=': tk_type = TOKEN_EQ; break;
+        default: peek--, tk_type = TOKEN_ASSIGN; break;
         } break;
     case '+':
         switch (*peek++) {
-        case '+': tok = TOKEN_ADD_ADD; break;
-        case '=': tok = TOKEN_ASSIGN_ADD; break;
-        default: peek--, tok = TOKEN_ADD; break;
+        case '+': tk_type = TOKEN_ADD_ADD; break;
+        case '=': tk_type = TOKEN_ASSIGN_ADD; break;
+        default: peek--, tk_type = TOKEN_ADD; break;
         } break;
     case '-':
         switch (*peek++) {
-        case '-': tok = TOKEN_SUB_SUB; break;
-        case '=': tok = TOKEN_ASSIGN_SUB; break;
+        case '-': tk_type = TOKEN_SUB_SUB; break;
+        case '=': tk_type = TOKEN_ASSIGN_SUB; break;
-        case '>': tok = TOKEN_DEREF; break;
-        default: peek--, tok = TOKEN_SUB; break;
+        case '>': tk_type = TOKEN_DEREF; break;
+        default: peek--, tk_type = TOKEN_SUB; break;
         } break;
     case '*':
         switch (*peek++) {
-        case '=': tok = TOKEN_ASSIGN_MUL; break;
-        default: peek--, tok = TOKEN_MUL; break;
+        case '=': tk_type = TOKEN_ASSIGN_MUL; break;
+        default: peek--, tk_type = TOKEN_MUL; break;
         } break;
     case '/':
         switch (*peek++) {
-        case '=': tok = TOKEN_ASSIGN_DIV; break;
+        case '=': tk_type = TOKEN_ASSIGN_DIV; break;
        case '/': {
            // need get a new line to parse
            goto_newline(lexer);
-            tok = TOKEN_LINE_COMMENT;
+            tk_type = TOKEN_LINE_COMMENT;
            goto END;
        }
        case '*': {
            lexer->cur_ptr = peek;
            goto_block_comment(lexer);
-            tok = TOKEN_BLOCK_COMMENT;
+            tk_type = TOKEN_BLOCK_COMMENT;
            goto END;
        }
-        default: peek--, tok = TOKEN_DIV; break;
+        default: peek--, tk_type = TOKEN_DIV; break;
         } break;
     case '%':
         switch (*peek++) {
-        case '=': tok = TOKEN_ASSIGN_MOD; break;
-        default: peek--, tok = TOKEN_MOD; break;
+        case '=': tk_type = TOKEN_ASSIGN_MOD; break;
+        default: peek--, tk_type = TOKEN_MOD; break;
         } break;
     case '&':
         switch (*peek++) {
-        case '&': tok = TOKEN_AND_AND; break;
-        case '=': tok = TOKEN_ASSIGN_AND; break;
-        default: peek--, tok = TOKEN_AND; break;
+        case '&': tk_type = TOKEN_AND_AND; break;
+        case '=': tk_type = TOKEN_ASSIGN_AND; break;
+        default: peek--, tk_type = TOKEN_AND; break;
         } break;
     case '|':
         switch (*peek++) {
-        case '|': tok = TOKEN_OR_OR; break;
-        case '=': tok = TOKEN_ASSIGN_OR; break;
-        default: peek--, tok = TOKEN_OR; break;
+        case '|': tk_type = TOKEN_OR_OR; break;
+        case '=': tk_type = TOKEN_ASSIGN_OR; break;
+        default: peek--, tk_type = TOKEN_OR; break;
         } break;
     case '^':
         switch (*peek++) {
-        case '=': tok = TOKEN_ASSIGN_XOR; break;
-        default: peek--, tok = TOKEN_XOR; break;
+        case '=': tk_type = TOKEN_ASSIGN_XOR; break;
+        default: peek--, tk_type = TOKEN_XOR; break;
         } break;
     case '<':
         switch (*peek++) {
-        case '=': tok = TOKEN_LE; break;
-        case '<': tok = (*peek == '=') ? (peek++, TOKEN_ASSIGN_L_SH) : TOKEN_L_SH; break;
-        default: peek--, tok = TOKEN_LT; break;
+        case '=': tk_type = TOKEN_LE; break;
+        case '<': tk_type = (*peek == '=') ? (peek++, TOKEN_ASSIGN_L_SH) : TOKEN_L_SH; break;
+        default: peek--, tk_type = TOKEN_LT; break;
         } break;
     case '>':
         switch (*peek++) {
-        case '=': tok = TOKEN_GE; break;
-        case '>': tok = (*peek == '=') ? (peek++, TOKEN_ASSIGN_R_SH) : TOKEN_R_SH; break;
-        default: peek--, tok = TOKEN_GT; break;
+        case '=': tk_type = TOKEN_GE; break;
+        case '>': tk_type = (*peek == '=') ? (peek++, TOKEN_ASSIGN_R_SH) : TOKEN_R_SH; break;
+        default: peek--, tk_type = TOKEN_GT; break;
         } break;
     case '~':
-        tok = TOKEN_BIT_NOT; break;
+        tk_type = TOKEN_BIT_NOT; break;
     case '!':
         switch (*peek++) {
-        case '=': tok = TOKEN_NEQ; break;
-        default: peek--, tok = TOKEN_NOT; break;
+        case '=': tk_type = TOKEN_NEQ; break;
+        default: peek--, tk_type = TOKEN_NOT; break;
         } break;
     case '[':
-        tok = TOKEN_L_BRACKET; break;
+        tk_type = TOKEN_L_BRACKET; break;
     case ']':
-        tok = TOKEN_R_BRACKET; break;
+        tk_type = TOKEN_R_BRACKET; break;
     case '(':
-        tok = TOKEN_L_PAREN; break;
+        tk_type = TOKEN_L_PAREN; break;
     case ')':
-        tok = TOKEN_R_PAREN; break;
+        tk_type = TOKEN_R_PAREN; break;
     case '{':
-        tok = TOKEN_L_BRACE; break;
+        tk_type = TOKEN_L_BRACE; break;
     case '}':
-        tok = TOKEN_R_BRACE; break;
+        tk_type = TOKEN_R_BRACE; break;
     case ';':
-        tok = TOKEN_SEMICOLON; break;
+        tk_type = TOKEN_SEMICOLON; break;
     case ',':
-        tok = TOKEN_COMMA; break;
+        tk_type = TOKEN_COMMA; break;
     case ':':
-        tok = TOKEN_COLON; break;
+        tk_type = TOKEN_COLON; break;
     case '.':
         if (peek[0] == '.' && peek[1] == '.') {
             peek += 2;
-            tok = TOKEN_ELLIPSIS;
+            tk_type = TOKEN_ELLIPSIS;
         } else {
-            tok = TOKEN_DOT;
+            tk_type = TOKEN_DOT;
         }
         break;
     case '?':
-        tok = TOKEN_COND; break;
-    case '\v': case '\r': case '\f': // FIXME it parse as a blank character
-        tok = TOKEN_FLUSH; break;
+        tk_type = TOKEN_COND; break;
+    case '\v': case '\r': case '\f':
+    case ' ': case '\t':
+        tk_type = TOKEN_BLANK; break;
     case '\n':
         // you need to flush a newline or blank
-        lexer->line++;
-        tok = TOKEN_FLUSH; break;
+        lexer->loc.line += 1;
+        lexer->loc.col = -1;
+        lexer->loc.len = 1;
+        tk_type = TOKEN_BLANK;
+        break;
     case '#':
-        LEX_WARN("Marroc does not support in lexer rather in preprocessor, it will be ignored");
+        // TODO make line or file comment to change
+        LEX_WARN("Maroc does not support in lexer rather in preprocessor, it will be ignored");
         goto_newline(lexer);
-        tok = TOKEN_FLUSH;
+        tk_type = TOKEN_BLANK;
         goto END;
     case '\0':
         // EOF
-        tok = TOKEN_EOF;
+        tk_type = TOKEN_EOF;
         goto END;
     case '\'':
-        return parse_char_literal(lexer, token);
-        return;
+        parse_char_literal(lexer, token);
+        literal = token->val;
+        tk_type = TOKEN_CHAR_LITERAL;
+        goto END; break;
     case '"':
-        return parse_string_literal(lexer, token);
+        parse_string_literal(lexer, token);
+        literal = token->val;
+        tk_type = TOKEN_STRING_LITERAL;
+        goto END; break;
     case '0': case '1': case '2': case '3': case '4':
     case '5': case '6': case '7': case '8': case '9':
-        return parse_number(lexer, token);
+        parse_number(lexer, token);
+        // TODO Make it easy
+        literal = token->val;
+        tk_type = token->sub_type;
+        goto END; break;
     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
@@ -475,39 +473,53 @@ void get_token(lexer_t* lexer, tok_t* token) {
             break;
         }
-        int res = keyword_cmp((const char*)lexer->cur_ptr, peek - (lexer->cur_ptr));
+        int strlen = peek - lexer->cur_ptr;
+        int res = keyword_cmp((const char*)lexer->cur_ptr, strlen);
         if (res == -1) {
-            int strlen = peek - lexer->cur_ptr;
-            unsigned char* str = rt._malloc(strlen + 1);
-            for (int i = 0; i < strlen; i++) {
-                str[i] = lexer->cur_ptr[i];
-            }
-            str[strlen] = '\0';
-            constant.have = 1;
-            constant.str = (char*)str;
-            tok = TOKEN_IDENT; break;
+            char prev = lexer->cur_ptr[strlen];
+            lexer->cur_ptr[strlen] = '\0';
+            literal.str = strpool_intern(lexer->strpool, lexer->cur_ptr);
+            lexer->cur_ptr[strlen] = prev;
+            tk_type = TOKEN_IDENT; break;
         } else {
-            tok = keywords[res].tok; break;
+            tk_type = keywords[res].tok; break;
         }
     default:
         LEX_ERROR("unsupport char in sourse code `%c`", *(lexer->cur_ptr));
         break;
     }
+    lexer->loc.len = peek - lexer->cur_ptr;
     lexer->cur_ptr = peek;
 END:
-    token->val = constant;
-    token->type = tok;
-    LEX_DEBUG("get token `%s` (ch: %c, int: %d)", get_tok_name(token->type), token->val.ch, token->val.i);
+    lexer->loc.col += lexer->loc.len;
+    lexer->loc.len = 0;
+    token->val = literal;
+    token->sub_type = tk_type;
+    token->loc = lexer->loc;
+    static const tok_basic_type_t tok_type_map[] = {
+        // ordinary tokens use #str
+#define X(str, basic, tok) [tok] = basic,
+        TOKEN_TABLE
+#undef X
+        // keywords use #name
+#define X(name, std, tok) [tok] = TK_BASIC_KEYWORD,
+        KEYWORD_TABLE
+#undef X
+    };
+    token->type = tok_type_map[tk_type];
+    LEX_DEBUG("get token `%s` in %s:%d:%d", get_tok_name(tk_type),
+        token->loc.fname, token->loc.line, token->loc.col);
 }
 
 // get_token maybe got invalid (with parser)
 void get_valid_token(lexer_t* lexer, tok_t* token) {
-    tok_type_t type;
+    tok_basic_type_t type;
     do {
         get_token(lexer, token);
         type = token->type;
-    } while (type == TOKEN_FLUSH || type == TOKEN_LINE_COMMENT || type == TOKEN_BLOCK_COMMENT);
+        Assert(type != TK_BASIC_INVALID);
+    } while (type == TK_BASIC_WHITESPACE || type == TK_BASIC_COMMENT);
 }
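
Taken together, the new calling convention looks roughly like this. A minimal driver sketch: init_lexer, get_valid_token, sub_type, loc, and TOKEN_EOF come from the diff above, while the project header name, file_sread, strpool_init, and the exact lexer_sread_fn signature are assumptions for illustration:

#include <stdio.h>
#include "lexer.h" // assumed project header providing lexer_t, tok_t, strpool_t

/* Assumed read-callback shape: fill buf with up to size bytes, return bytes read. */
static int file_sread(void* stream, void* buf, int size) {
    return (int)fread(buf, 1, (size_t)size, (FILE*)stream);
}

int main(void) {
    strpool_t strpool;
    strpool_init(&strpool);              // assumed pool initializer
    FILE* fp = fopen("main.c", "r");
    if (!fp) return 1;

    lexer_t lexer;
    tok_t token;
    init_lexer(&lexer, "main.c", fp, (lexer_sread_fn)file_sread, &strpool);

    do {
        get_valid_token(&lexer, &token); // skips whitespace and comments
        printf("%s:%d:%d\n", token.loc.fname, token.loc.line, token.loc.col);
    } while (token.sub_type != TOKEN_EOF);

    fclose(fp);
    return 0;
}

Note the division of labor this commit establishes: get_token reports every lexeme (including TOKEN_BLANK and comments) with source locations, and get_valid_token is the filtered view the parser consumes.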