init basic

2025-03-05 15:45:19 +08:00
commit 09299e339c
42 changed files with 5752 additions and 0 deletions
--- a/ccompiler/frontend/lexer/README.md
+++ b/ccompiler/frontend/lexer/README.md
@ -0,0 +1,5 @@
+# 词法分析
+
+参考LCC的词法分析部分
+
+主要使用 LL(n) 硬编码查找token
--- a/ccompiler/frontend/lexer/lexer.c
+++ b/ccompiler/frontend/lexer/lexer.c
@ -0,0 +1,523 @@
+/**
+ * 仿照LCCompiler的词法分析部分
+ * 
+ * 如下为LCC的README in 2025.2
+This hierarchy is the distribution for lcc version 4.2.
+
+lcc version 3.x is described in the book "A Retargetable C Compiler:
+Design and Implementation" (Addison-Wesley, 1995, ISBN 0-8053-1670-1).
+There are significant differences between 3.x and 4.x, most notably in
+the intermediate code. For details, see
+https://drh.github.io/lcc/documents/interface4.pdf.
+
+VERSION 4.2 IS INCOMPATIBLE WITH EARLIER VERSIONS OF LCC. DO NOT
+UNLOAD THIS DISTRIBUTION ON TOP OF A 3.X DISTRIBUTION.
+
+LCC is a C89 ("ANSI C") compiler designed to be highly retargetable.
+
+LOG describes the changes since the last release.
+
+CPYRIGHT describes the conditions under you can use, copy, modify, and
+distribute lcc or works derived from lcc.
+
+doc/install.html is an HTML file that gives a complete description of
+the distribution and installation instructions.
+
+Chris Fraser / cwf@aya.yale.edu
+David Hanson / drh@drhanson.net
+ */
+#include "../frontend.h"
+#include "lexer.h"
+
+/*
+ * Keyword lookup table generated from KEYWORD_TABLE.
+ * Each entry holds the keyword spelling, the CSTD_KEYWORD tag it is listed
+ * under, and the token type emitted for it. keyword_cmp() binary-searches
+ * this array by spelling, so KEYWORD_TABLE has to stay sorted by name.
+ */
+static const struct {
+    const char* name;
+    enum CSTD_KEYWORD std_type;
+    enum TokenType tok;
+} keywords[] = {
+    #define X(name, std_type, tok, ...) { #name, std_type, tok },
+    KEYWORD_TABLE
+    #undef X
+};
+
+/**
+ * Binary-search the keyword table for a spelling.
+ *
+ * @param name  start of the candidate text (does not need to be terminated)
+ * @param len   number of bytes of `name` that make up the candidate
+ * @return index into keywords[] on an exact match, -1 when `name` is not a keyword
+ */
+static inline int keyword_cmp(const char* name, int len) {
+    int lo = 0;
+    int hi = sizeof(keywords) / sizeof(keywords[0]) - 1;
+
+    while (lo <= hi) {
+        int mid = (lo + hi) / 2;
+        const char *kw = keywords[mid].name;
+        int order = 0;
+        int pos = 0;
+
+        // Compare at most `len` bytes; stop at the first difference.
+        while (pos < len) {
+            if (name[pos] != kw[pos]) {
+                order = (unsigned char)name[pos] - (unsigned char)kw[pos];
+                break;
+            }
+            if (name[pos] == '\0') {
+                break; // both strings ended early
+            }
+            pos++;
+        }
+
+        if (order == 0) {
+            // Same prefix: it is a match only when the keyword ends here too.
+            if (kw[len] == '\0') {
+                return mid;
+            }
+            order = -1; // the keyword is longer, so the input sorts before it
+        }
+
+        if (order < 0) {
+            hi = mid - 1;
+        } else {
+            lo = mid + 1;
+        }
+    }
+    return -1; // Not a keyword.
+}
+
+/**
+ * Put a lexer into its initial state: empty zeroed buffer, line and index
+ * set to 1, and the input source remembered for later refills.
+ *
+ * @param lexer      lexer to initialise
+ * @param file_name  not used by this function at the moment
+ * @param stream     opaque handle passed back to `sread`
+ * @param sread      read callback used to refill the buffer
+ */
+void init_lexer(struct Lexer* lexer, const char* file_name, void* stream, lexer_sread_fn sread)
+{
+    unsigned char* start = (unsigned char*)&(lexer->buffer);
+    int remaining = sizeof(lexer->buffer) / sizeof(lexer->buffer[0]);
+
+    lexer->stream = stream;
+    lexer->sread = sread;
+
+    lexer->line = 1;
+    lexer->index = 1;
+
+    // Both pointers at the start of the buffer means "nothing buffered yet".
+    lexer->cur_ptr = start;
+    lexer->end_ptr = start;
+
+    while (remaining-- > 0) {
+        lexer->buffer[remaining] = 0;
+    }
+}
+
+/**
+ * Refill the lexer buffer.
+ *
+ * The unread bytes [cur_ptr, end_ptr) are moved to the start of the buffer,
+ * cur_ptr is reset to the buffer start, and the free space behind them is
+ * filled through the sread callback. When the callback delivers fewer bytes
+ * than requested the input is considered finished and a '\0' sentinel is
+ * appended after the data (and counted in end_ptr).
+ *
+ * Any pointer into the buffer held by the caller is invalidated; positions
+ * have to be re-derived relative to lexer->cur_ptr.
+ */
+static void flush_buffer(struct Lexer* lexer) {
+    unsigned char* base = (unsigned char*)lexer->buffer;
+    int num = lexer->end_ptr - lexer->cur_ptr;
+
+    // Forward copy is safe: the destination never lies behind the source.
+    for (int i = 0; i < num; i++) {
+        base[i] = lexer->cur_ptr[i];
+    }
+    lexer->cur_ptr = base;
+    // end_ptr must follow the compacted data; leaving it at its old position
+    // made every refill after the first one run past the end of the buffer.
+    lexer->end_ptr = base + num;
+
+    int read_size = LEXER_BUFFER_SIZE - num;
+    if (read_size <= 0) {
+        return; // buffer is already full, nothing to read
+    }
+
+    // TODO size_t to int maybe lose precision
+    int got_size = lexer->sread(base + num, read_size, 1, read_size, lexer->stream);
+    if (got_size < 0) {
+        error("lexer read error");
+        // If error() returns, still terminate the data so scanners stop.
+        lexer->end_ptr[0] = '\0';
+        lexer->end_ptr++;
+    } else if (got_size < read_size) {
+        lexer->end_ptr += got_size;
+        lexer->end_ptr[0] = '\0'; // EOF
+        lexer->end_ptr++;
+    } else if (got_size == read_size) {
+        lexer->end_ptr += got_size;
+    } else {
+        error("lexer read error imposible got_size > read_size maybe overflow?");
+    }
+}
+
+/**
+ * Advance cur_ptr to the next '\n' or to the '\0' end-of-input sentinel.
+ * The terminating character itself is not consumed, so the caller still sees
+ * the newline. The buffer is refilled whenever it runs dry, before the next
+ * byte is inspected, so a line that crosses the buffer boundary is skipped
+ * completely.
+ */
+static void goto_newline(struct Lexer* lexer) {
+    for (;;) {
+        if (lexer->cur_ptr >= lexer->end_ptr) {
+            flush_buffer(lexer);
+            if (lexer->cur_ptr >= lexer->end_ptr) {
+                return; // refill produced nothing readable
+            }
+        }
+        if (*lexer->cur_ptr == '\n' || *lexer->cur_ptr == '\0') {
+            return;
+        }
+        lexer->cur_ptr++;
+    }
+}
+
+/**
+ * Skip the body of a block comment. cur_ptr is expected to point just past
+ * the opening marker; on return it points just past the closing marker, or
+ * at the '\0' sentinel when the input ends inside the comment.
+ * Newlines inside the comment are counted so lexer->line stays accurate.
+ */
+static void goto_block_comment(struct Lexer* lexer) {
+    while (1) {
+        // Two bytes are inspected at once, so keep at least two buffered.
+        if (lexer->end_ptr - lexer->cur_ptr < 2) {
+            flush_buffer(lexer);
+        }
+
+        if (*lexer->cur_ptr == '\0') {
+            break; // end of input: unterminated comment
+        }
+        if (lexer->cur_ptr[0] == '*' && lexer->cur_ptr[1] == '/') {
+            lexer->cur_ptr += 2;
+            break;
+        }
+        if (*lexer->cur_ptr == '\n') {
+            lexer->line++;
+        }
+        lexer->cur_ptr++;
+    }
+}
+
+// TODO escape character not enough (octal \ooo, hex \xhh are not handled)
+/**
+ * Translate the character following a backslash into the value it denotes.
+ *
+ * @param peek  points at the character right after the backslash
+ * @return the escaped character; for an unknown escape an error is reported
+ *         and the character itself is returned
+ */
+static char got_slash(unsigned char* peek) {
+    switch (*peek) {
+        case '\\':  return '\\';
+        case '\'':  return '\'';
+        case '\"':  return '\"';
+        case '\?':  return '\?';
+        case '0':   return '\0';
+
+        case 'a': return '\a';
+        case 'b': return '\b';
+        case 'f': return '\f';
+        case 'n': return '\n';
+        case 'r': return '\r';
+        case 't': return '\t';
+        case 'v': return '\v';
+        default:
+            error("Unknown escape character");
+            // Always return a value: falling off the end of a non-void
+            // function is undefined when the caller uses the result.
+            return (char)*peek;
+    }
+}
+
+/**
+ * Lex a character literal. lexer->cur_ptr must point at the opening quote.
+ * On return the token is a TOKEN_CHAR_LITERAL carrying the character value
+ * and cur_ptr points past the closing quote (when present).
+ */
+static void parse_char_literal(struct Lexer* lexer, struct Token* token) {
+    char val = 0;
+    unsigned char* peek = lexer->cur_ptr + 1; // first byte after the opening quote
+
+    if (*peek == '\\') {
+        peek++;
+        val = got_slash(peek);
+    } else {
+        val = *peek;
+    }
+    peek++; // step over the character itself; now at the expected closing quote
+
+    if (*peek == '\'') {
+        peek++;
+    } else {
+        error("Unclosed character literal");
+    }
+
+    token->constant.ch = val;
+    lexer->cur_ptr = peek;
+    token->constant.have = 1;
+    token->type = TOKEN_CHAR_LITERAL;
+}
+
+/**
+ * Lex a string literal. lexer->cur_ptr must point at the opening quote.
+ * The decoded text (escapes resolved) is stored in a freshly allocated,
+ * '\0'-terminated buffer of LEXER_MAX_TOKEN_SIZE + 1 bytes referenced by
+ * token->constant.str. On return cur_ptr points past the closing quote, or
+ * at the position where scanning stopped after an error.
+ */
+static void parse_string_literal(struct Lexer* lexer, struct Token* token) {
+    unsigned char* peek = lexer->cur_ptr + 1;
+    // TODO string literal size check
+    char* dest = token->constant.str = xmalloc(LEXER_MAX_TOKEN_SIZE + 1);
+    int len = 0;
+
+    while (1) {
+        // Keep two readable bytes (backslash + escaped character). A refill
+        // moves the data, so the scan position is re-derived from cur_ptr.
+        if (lexer->end_ptr - peek < 2) {
+            int offset = peek - lexer->cur_ptr;
+            flush_buffer(lexer);
+            peek = lexer->cur_ptr + offset;
+        }
+
+        unsigned char ch = *peek;
+        if (ch == '"') {
+            peek++; // consume the closing quote
+            break;
+        }
+        if (ch == '\0') {
+            error("Unclosed string literal");
+            break;
+        }
+
+        if (ch == '\\') { // handle escape
+            peek++;
+            if (*peek == '\0') {
+                error("Unclosed string literal");
+                break;
+            }
+            ch = (unsigned char)got_slash(peek);
+        }
+
+        if (len >= LEXER_MAX_TOKEN_SIZE) {
+            error("String too long");
+            break; // never write past dest, even if error() returns
+        }
+        dest[len++] = (char)ch;
+        peek++;
+    }
+    dest[len] = '\0';
+    lexer->cur_ptr = peek;
+    token->constant.have = 1;
+    token->type = TOKEN_STRING_LITERAL;
+}
+
+/**
+ * Lex a numeric constant starting at lexer->cur_ptr (which must be a digit).
+ *
+ * Supported forms: decimal, octal (leading 0) and hexadecimal (0x / 0X)
+ * integers, and decimal floating constants with an optional fraction and
+ * an optional exponent. The result is a TOKEN_INT_LITERAL with the value in
+ * constant.ll, or a TOKEN_FLOAT_LITERAL with the value in constant.d.
+ *
+ * NOTE: type suffixes (u, l, f, ...) are not consumed here, and the exponent
+ * is applied by repeated multiplication, so the last bits of very large or
+ * very small floating constants may differ from a correctly rounded result.
+ */
+static void parse_number(struct Lexer* lexer, struct Token* token) {
+    unsigned char* peek = lexer->cur_ptr;
+    int base = 10;
+    int is_float = 0;
+    unsigned long long int_val = 0; // unsigned: overflow wraps instead of being undefined
+    double float_val = 0.0;
+
+    // Determine the base
+    if (peek[0] == '0') {
+        if (peek[1] == 'x' || peek[1] == 'X') {
+            base = 16;
+            peek += 2; // skip the "0x" prefix
+        } else {
+            // A leading 0 means octal unless the digit run is the integer
+            // part of a floating constant such as 0.5 or 01e2.
+            unsigned char* scan = peek;
+            while (*scan >= '0' && *scan <= '9') {
+                scan++;
+            }
+            if (*scan != '.' && *scan != 'e' && *scan != 'E') {
+                base = 8;
+            }
+        }
+    }
+
+    // Integer part
+    while (1) {
+        int digit = -1;
+        if (*peek >= '0' && *peek <= '9') {
+            digit = *peek - '0';
+        } else if (base == 16) {
+            if (*peek >= 'a' && *peek <= 'f') digit = *peek - 'a' + 10;
+            else if (*peek >= 'A' && *peek <= 'F') digit = *peek - 'A' + 10;
+        }
+
+        if (digit < 0 || digit >= base) break;
+
+        int_val = int_val * (unsigned long long)base + (unsigned long long)digit;
+        peek++;
+    }
+
+    // Fractional part
+    if (*peek == '.' && base == 10) {
+        double fraction = 1.0;
+        is_float = 1;
+        float_val = (double)int_val;
+        peek++;
+
+        while (*peek >= '0' && *peek <= '9') {
+            float_val = float_val * 10.0 + (*peek - '0');
+            fraction *= 10.0;
+            peek++;
+        }
+        float_val /= fraction;
+    }
+
+    // Exponent part: only accepted when at least one digit follows
+    if ((*peek == 'e' || *peek == 'E') && base == 10) {
+        unsigned char* exp_ptr = peek + 1;
+        int exp_sign = 1;
+
+        if (*exp_ptr == '+') {
+            exp_ptr++;
+        } else if (*exp_ptr == '-') {
+            exp_sign = -1;
+            exp_ptr++;
+        }
+
+        if (*exp_ptr >= '0' && *exp_ptr <= '9') {
+            int exponent = 0;
+
+            if (!is_float) {
+                // e.g. "1e5": the mantissa is the integer parsed so far
+                is_float = 1;
+                float_val = (double)int_val;
+            }
+
+            while (*exp_ptr >= '0' && *exp_ptr <= '9') {
+                if (exponent < 10000) { // cap: avoids int overflow, value is inf/0 anyway
+                    exponent = exponent * 10 + (*exp_ptr - '0');
+                }
+                exp_ptr++;
+            }
+
+            for (int i = 0; i < exponent && float_val != 0.0; i++) {
+                if (exp_sign > 0) {
+                    float_val *= 10.0;
+                } else {
+                    float_val /= 10.0;
+                }
+            }
+            peek = exp_ptr;
+        }
+    }
+
+    // Store the result
+    lexer->cur_ptr = peek;
+    token->constant.have = 1;
+    if (is_float) {
+        token->constant.d = float_val;
+        token->type = TOKEN_FLOAT_LITERAL;
+    } else {
+        token->constant.ll = (long long)int_val;
+        token->type = TOKEN_INT_LITERAL;
+    }
+}
+
+#define GOT_ONE_TOKEN_BUF_SIZE 64
+// /zh/c/language/operator_arithmetic.html
+/**
+ * Read the next raw token from the input.
+ *
+ * Leading spaces and tabs are skipped. Newlines, other blank characters and
+ * '#' lines are reported as TOKEN_FLUSH, comments as TOKEN_LINE_COMMENT /
+ * TOKEN_BLOCK_COMMENT, and the end of input as TOKEN_EOF; callers that only
+ * want meaningful tokens should use get_valid_token(). For identifiers the
+ * spelling is copied into a newly allocated string in token->constant.str.
+ *
+ * @param lexer  lexer state; cur_ptr is advanced past the token
+ * @param token  receives the token type and its constant payload
+ */
+void get_token(struct Lexer* lexer, struct Token* token) {
+    // Keep enough look-ahead buffered for one token
+    if (lexer->end_ptr - lexer->cur_ptr < GOT_ONE_TOKEN_BUF_SIZE) {
+        flush_buffer(lexer);
+    }
+    register unsigned char* peek = lexer->cur_ptr;
+
+    // Quickly skip blanks; test the bound before dereferencing
+    while (peek != lexer->end_ptr && (*peek == ' ' || *peek == '\t')) {
+        peek++;
+    }
+    // cur_ptr always marks the first character of the token being read
+    lexer->cur_ptr = peek;
+
+    enum TokenType tok = TOKEN_INIT;
+    struct TokenConstant constant = {0};
+
+    // once step
+    switch (*peek++) {
+    case '=':
+        switch (*peek++) {
+            case '=': tok = TOKEN_EQ; break;
+            default: peek--, tok = TOKEN_ASSIGN; break;
+        } break;
+    case '+':
+        switch (*peek++) {
+            case '+': tok = TOKEN_ADD_ADD; break;
+            case '=': tok = TOKEN_ASSIGN_ADD; break;
+            default: peek--, tok = TOKEN_ADD; break;
+        } break;
+    case '-':
+        switch (*peek++) {
+            case '-': tok = TOKEN_SUB_SUB; break;
+            case '=': tok = TOKEN_ASSIGN_SUB; break;
+
+            case '>': tok = TOKEN_DEREF; break;
+            default: peek--, tok = TOKEN_SUB; break;
+        } break;
+    case '*':
+        switch (*peek++) {
+            case '=': tok = TOKEN_ASSIGN_MUL; break;
+            default: peek--, tok = TOKEN_MUL; break;
+        } break;
+    case '/':
+        switch (*peek++) {
+            case '=': tok = TOKEN_ASSIGN_DIV; break;
+            case '/': {
+                // need get a new line to parse
+                goto_newline(lexer);
+                tok = TOKEN_LINE_COMMENT;
+                goto END;
+            }
+            case '*': {
+                lexer->cur_ptr = peek;
+                goto_block_comment(lexer);
+                tok = TOKEN_BLOCK_COMMENT;
+                goto END;
+            }
+            default: peek--, tok = TOKEN_DIV; break;
+        } break;
+    case '%':
+        switch (*peek++) {
+            case '=': tok = TOKEN_ASSIGN_MOD; break;
+            default: peek--, tok = TOKEN_MOD; break;
+        } break;
+    case '&':
+        switch (*peek++) {
+            case '&': tok = TOKEN_AND_AND; break;
+            case '=': tok = TOKEN_ASSIGN_AND; break;
+            default: peek--, tok = TOKEN_AND; break;
+        } break;
+    case '|':
+        switch (*peek++) {
+            case '|': tok = TOKEN_OR_OR; break;
+            case '=': tok = TOKEN_ASSIGN_OR; break;
+            default: peek--, tok = TOKEN_OR; break;
+        } break;
+    case '^':
+        switch (*peek++) {
+            case '=': tok = TOKEN_ASSIGN_XOR; break;
+            default: peek--, tok = TOKEN_XOR; break;
+        } break;
+    case '<':
+        switch (*peek++) {
+            case '=': tok = TOKEN_LE; break;
+            case '<': tok = (*peek == '=') ? (peek++, TOKEN_ASSIGN_L_SH) : TOKEN_L_SH; break;
+            default: peek--, tok = TOKEN_LT; break;
+        } break;
+    case '>':
+        switch (*peek++) {
+            case '=': tok = TOKEN_GE; break;
+            case '>': tok = (*peek == '=') ? (peek++, TOKEN_ASSIGN_R_SH) : TOKEN_R_SH; break;
+            default: peek--, tok = TOKEN_GT; break;
+        } break;
+    case '~':
+        tok = TOKEN_BIT_NOT; break;
+    case '!':
+        switch (*peek++) {
+            case '=': tok = TOKEN_NEQ; break;
+            default: peek--, tok = TOKEN_NOT; break;
+        } break; // without this break '!' fell through and became '['
+    case '[':
+        tok = TOKEN_L_BRACKET; break;
+    case ']':
+        tok = TOKEN_R_BRACKET; break;
+    case '(':
+        tok = TOKEN_L_PAREN; break;
+    case ')':
+        tok = TOKEN_R_PAREN; break;
+    case '{':
+        tok = TOKEN_L_BRACE; break;
+    case '}':
+        tok = TOKEN_R_BRACE; break;
+    case ';':
+        tok = TOKEN_SEMICOLON; break;
+    case ',':
+        tok = TOKEN_COMMA; break;
+    case ':':
+        tok = TOKEN_COLON; break;
+    case '.':
+        if (peek[0] == '.' && peek[1] == '.') {
+            peek += 2;
+            tok = TOKEN_ELLIPSIS;
+        } else {
+            tok = TOKEN_DOT;
+        }
+        break;
+    case '?':
+        tok = TOKEN_COND; break;
+    case '\v': case '\r': case '\f': // FIXME it parse as a blank character
+        tok = TOKEN_FLUSH; break;
+    case '\n':
+        // you need to flush a newline or blank
+        lexer->line++;
+        tok = TOKEN_FLUSH; break;
+    case '#':
+        warn("TODO: #define\n");
+        goto_newline(lexer);
+        tok = TOKEN_FLUSH;
+        goto END;
+    case '\0':
+        // EOF: cur_ptr stays on the sentinel so every later call reports EOF again
+        tok = TOKEN_EOF;
+        goto END;
+    case '\'':
+        parse_char_literal(lexer, token);
+        return;
+    case '"':
+        parse_string_literal(lexer, token);
+        return;
+    case '0': case '1': case '2': case '3': case '4':
+    case '5': case '6': case '7': case '8': case '9':
+        parse_number(lexer, token);
+        return;
+    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
+    case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
+    case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
+    case 's': case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z':
+    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
+    case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
+    case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
+    case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':case 'Y': case 'Z':
+    case '_': {
+        // TOKEN_IDENT
+        // An 'L' directly followed by a quote starts a wide literal.
+        if (*lexer->cur_ptr == 'L' && (*peek == '\'' || *peek == '"')) {
+            error("unsupport wide-character char literal by `L` format");
+        }
+        while (1) {
+            if (peek == lexer->end_ptr) {
+                error("unsupport outof 64 length identifier");
+                break; // do not scan past the buffered data if error() returns
+            }
+            if ((*peek >= 'a' && *peek <= 'z') || (*peek >= 'A' && *peek <= 'Z') ||
+            (*peek == '_') || (*peek >= '0' && *peek <= '9')) {
+                peek++;
+                continue;
+            }
+            break;
+        }
+
+        int ident_len = peek - lexer->cur_ptr;
+        int res = keyword_cmp((const char*)lexer->cur_ptr, ident_len);
+        if (res == -1) {
+            // Not a keyword: hand the spelling to the caller as a new string
+            char* str = xmalloc(ident_len + 1);
+            for (int i = 0; i < ident_len; i++) {
+                str[i] = (char)lexer->cur_ptr[i];
+            }
+            str[ident_len] = '\0';
+            constant.have = 1;
+            constant.str = str;
+            tok = TOKEN_IDENT;
+        } else {
+            tok = keywords[res].tok;
+        }
+        break;
+    }
+    default:
+        error("unsupport char in sourse code `%c`", *(lexer->cur_ptr));
+        break;
+    }
+
+    lexer->cur_ptr = peek;
+END:
+    token->constant = constant;
+    token->type = tok;
+}
+
+/**
+ * Read tokens until one is found that is neither a TOKEN_FLUSH nor a
+ * comment token, and leave that token in `token`.
+ */
+void get_valid_token(struct Lexer* lexer, struct Token* token) {
+    for (;;) {
+        get_token(lexer, token);
+        switch (token->type) {
+        case TOKEN_FLUSH:
+        case TOKEN_LINE_COMMENT:
+        case TOKEN_BLOCK_COMMENT:
+            continue; // filler token, fetch the next one
+        default:
+            return;
+        }
+    }
+}
+
+// Token type -> printable name map, indexed by enum TokenType.
+// Entries are produced by stringizing the first column of each table, so
+// for TOKEN_TABLE rows written as string literals the resulting name
+// includes the quote characters.
+static const char* token_strings[] = {
+    // Ordinary tokens use #str
+    #define X(str, tok) [tok] = #str,
+    TOKEN_TABLE
+    #undef X
+    
+    // Keywords use #name
+    #define X(name, std, tok) [tok] = #name,
+    KEYWORD_TABLE
+    #undef X
+};
+
+/**
+ * Return the printable name of a token type.
+ *
+ * @param type  token type to describe
+ * @return the name from token_strings, or "unknown" when `type` is outside
+ *         the table or has no entry (instead of reading out of bounds)
+ */
+const char* get_token_name(enum TokenType type) {
+    const int count = sizeof(token_strings) / sizeof(token_strings[0]);
+    int idx = (int)type;
+
+    if (idx < 0 || idx >= count || !token_strings[idx]) {
+        return "unknown";
+    }
+    return token_strings[idx];
+}
--- a/ccompiler/frontend/lexer/lexer.h
+++ b/ccompiler/frontend/lexer/lexer.h
@ -0,0 +1,40 @@
+#ifndef __LEXER_H__
+#define __LEXER_H__
+
+#include "token.h"
+// Maximum number of characters stored for a string literal token.
+#define LEXER_MAX_TOKEN_SIZE 63
+// Number of input bytes the lexer buffer holds (the array has one extra byte).
+#define LEXER_BUFFER_SIZE 4095
+
+// Read callback used to refill the lexer buffer. The lexer calls it as
+// sread(dst, n, 1, n, stream) and expects the number of bytes read:
+// a negative value is treated as a read error, a value smaller than the
+// requested count as end of input.
+typedef int (*lexer_sread_fn)(void *dst_buf, int dst_size,
+        int elem_size, int count, void *stream);
+
+// Lexer state: a fixed-size input buffer plus the callback used to refill it.
+struct Lexer {
+    int line;   // current line number, starts at 1
+    int index;  // initialised to 1 by init_lexer
+    // const char current_file_name[LEXER_BUFFER_SIZE+1];
+
+    unsigned char* cur_ptr; // character currently being looked at (not consumed yet)
+    unsigned char* end_ptr; // one past the last buffered character
+    char buffer[LEXER_BUFFER_SIZE+1];
+
+    lexer_sread_fn sread;   // read callback used to refill `buffer`
+    void* stream;           // opaque handle passed to `sread`
+};
+
+// One lexed token: its type plus an optional constant payload.
+struct Token {
+    enum TokenType type;
+    struct TokenConstant constant;
+};
+
+// Initialise `lexer` to read from `stream` through the `sread` callback.
+void init_lexer(struct Lexer* lexer, const char* file_name, void* stream,
+    lexer_sread_fn sread);
+
+// Read the next raw token, including TOKEN_FLUSH and comment tokens.
+void get_token(struct Lexer* lexer, struct Token* token);
+
+// get_token maybe got invalid (with parser as TOKEN_FLUSH)
+// This variant skips TOKEN_FLUSH and comment tokens.
+void get_valid_token(struct Lexer* lexer, struct Token* token);
+
+// Return the printable name of a token type.
+const char* get_token_name(enum TokenType token);
+
+#endif
--- a/ccompiler/frontend/lexer/tests/test_lexer.c
+++ b/ccompiler/frontend/lexer/tests/test_lexer.c
@ -0,0 +1,46 @@
+#include "../lexer.h"
+#include <stdio.h>
+// gcc -g ../lexer.c test_lexer.c -o test_lexer
+/*
+struct TokenConstant {
+   int have;
+   union {
+       char ch;
+       int i;
+       float f;
+       double d;
+       long long ll;
+       char* str;
+   };
+};
+*/
+
+int g_num;
+int g_num_arr[3];
+// Adapter that gives fread() the lexer_sread_fn signature. Casting a
+// library function with a different prototype to lexer_sread_fn and calling
+// it through that pointer is undefined behaviour.
+// Returns the number of elements read, or -1 on a read error.
+static int file_sread(void* dst_buf, int dst_size, int elem_size, int count, void* stream) {
+    FILE* file = stream;
+    if (elem_size <= 0 || count < 0 || dst_size < 0) {
+        return -1;
+    }
+    // Never read more than the destination can hold
+    if (count > dst_size / elem_size) {
+        count = dst_size / elem_size;
+    }
+    size_t got = fread(dst_buf, (size_t)elem_size, (size_t)count, file);
+    if (got < (size_t)count && ferror(file)) {
+        return -1;
+    }
+    return (int)got;
+}
+
+// Lexer smoke test: tokenises the file given as argv[1] (by default this
+// source file itself) and prints one line per token until TOKEN_EOF.
+// Returns 0 on success, 1 when the file cannot be opened.
+int main(int argc, char* argv[]) {
+    int num = 0; // NOTE(review): unused; presumably sample input for the self-lexing run
+    (void)num;
+
+    const char* file_name = "test_lexer.c";
+    if (argc == 2) {
+        file_name = argv[1];
+    }
+    FILE* fp = fopen(file_name, "r");
+    if (fp == NULL) {
+        perror("open file failed");
+        return 1;
+    }
+    printf("open file success\n");
+
+    struct Lexer lexer;
+    init_lexer(&lexer, file_name, fp, file_sread);
+    struct Token tok;
+
+    while (1) {
+        get_valid_token(&lexer, &tok);
+        if (tok.type == TOKEN_EOF) {
+            break;
+        }
+        printf("line: %d, column: %d, type: %3d, typename: %s\n",
+            lexer.line, lexer.index, tok.type, get_token_name(tok.type));
+    }
+
+    fclose(fp);
+    return 0;
+}
--- a/ccompiler/frontend/lexer/token.h
+++ b/ccompiler/frontend/lexer/token.h
@ -0,0 +1,250 @@
+#ifndef __TOKEN_H__
+#define __TOKEN_H__
+
+// Tag attached to every KEYWORD_TABLE entry; in that table `inline` and
+// `restrict` are tagged CSTD_C99, `asm` is tagged CEXT_ASM and all other
+// keywords CSTD_C89.
+enum CSTD_KEYWORD {
+    CSTD_C89,
+    CSTD_C99,
+    CEXT_ASM,
+};
+
+// Keyword X-macro table: X(spelling, CSTD_KEYWORD tag, token type).
+// The lexer looks keywords up with a binary search over this table, so the
+// entries must be kept sorted by spelling (byte order).
+#define KEYWORD_TABLE \
+    X(asm           , CEXT_ASM, TOKEN_ASM)          \
+    X(break         , CSTD_C89, TOKEN_BREAK)        \
+    X(case          , CSTD_C89, TOKEN_CASE)         \
+    X(char          , CSTD_C89, TOKEN_CHAR)         \
+    X(const         , CSTD_C89, TOKEN_CONST)        \
+    X(continue      , CSTD_C89, TOKEN_CONTINUE)     \
+    X(default       , CSTD_C89, TOKEN_DEFAULT)      \
+    X(do            , CSTD_C89, TOKEN_DO)           \
+    X(double        , CSTD_C89, TOKEN_DOUBLE)       \
+    X(else          , CSTD_C89, TOKEN_ELSE)         \
+    X(enum          , CSTD_C89, TOKEN_ENUM)         \
+    X(extern        , CSTD_C89, TOKEN_EXTERN)       \
+    X(float         , CSTD_C89, TOKEN_FLOAT)        \
+    X(for           , CSTD_C89, TOKEN_FOR)          \
+    X(goto          , CSTD_C89, TOKEN_GOTO)         \
+    X(if            , CSTD_C89, TOKEN_IF)           \
+    X(inline        , CSTD_C99, TOKEN_INLINE)       \
+    X(int           , CSTD_C89, TOKEN_INT)          \
+    X(long          , CSTD_C89, TOKEN_LONG)         \
+    X(register      , CSTD_C89, TOKEN_REGISTER)     \
+    X(restrict      , CSTD_C99, TOKEN_RESTRICT)     \
+    X(return        , CSTD_C89, TOKEN_RETURN)       \
+    X(short         , CSTD_C89, TOKEN_SHORT)        \
+    X(signed        , CSTD_C89, TOKEN_SIGNED)       \
+    X(sizeof        , CSTD_C89, TOKEN_SIZEOF)       \
+    X(static        , CSTD_C89, TOKEN_STATIC)       \
+    X(struct        , CSTD_C89, TOKEN_STRUCT)       \
+    X(switch        , CSTD_C89, TOKEN_SWITCH)       \
+    X(typedef       , CSTD_C89, TOKEN_TYPEDEF)      \
+    X(union         , CSTD_C89, TOKEN_UNION)        \
+    X(unsigned      , CSTD_C89, TOKEN_UNSIGNED)     \
+    X(void          , CSTD_C89, TOKEN_VOID)         \
+    X(volatile      , CSTD_C89, TOKEN_VOLATILE)     \
+    X(while         , CSTD_C89, TOKEN_WHILE)        \
+    // KEYWORD_TABLE
+
+// Non-keyword token X-macro table: X(display text, token type).
+// It is expanded once to declare the enum TokenType constants and once to
+// build the token name strings.
+#define TOKEN_TABLE \
+    X(EOF            , TOKEN_EOF)                           \
+    X(init           , TOKEN_INIT)                          \
+    X(flush          , TOKEN_FLUSH)                         \
+    X("=="           , TOKEN_EQ)                            \
+    X("="            , TOKEN_ASSIGN)                        \
+    X("++"           , TOKEN_ADD_ADD)                           \
+    X("+="           , TOKEN_ASSIGN_ADD)                    \
+    X("+"            , TOKEN_ADD)                           \
+    X("--"           , TOKEN_SUB_SUB)                           \
+    X("-="           , TOKEN_ASSIGN_SUB)                    \
+    X("->"           , TOKEN_DEREF)                         \
+    X("-"            , TOKEN_SUB)                           \
+    X("*="           , TOKEN_ASSIGN_MUL)                    \
+    X("*"            , TOKEN_MUL)                           \
+    X("/="           , TOKEN_ASSIGN_DIV)                    \
+    X("/"            , TOKEN_DIV)                           \
+    X("//"           , TOKEN_LINE_COMMENT)                  \
+    X("/* */"        , TOKEN_BLOCK_COMMENT)                 \
+    X("%="           , TOKEN_ASSIGN_MOD)                    \
+    X("%"            , TOKEN_MOD)                           \
+    X("&&"           , TOKEN_AND_AND)                       \
+    X("&="           , TOKEN_ASSIGN_AND)                    \
+    X("&"            , TOKEN_AND)                           \
+    X("||"           , TOKEN_OR_OR)                         \
+    X("|="           , TOKEN_ASSIGN_OR)                     \
+    X("|"            , TOKEN_OR)                            \
+    X("^="           , TOKEN_ASSIGN_XOR)                    \
+    X("^"            , TOKEN_XOR)                           \
+    X("<<="          , TOKEN_ASSIGN_L_SH)                   \
+    X("<<"           , TOKEN_L_SH)                          \
+    X("<="           , TOKEN_LE)                            \
+    X("<"            , TOKEN_LT)                            \
+    X(">>="          , TOKEN_ASSIGN_R_SH)                   \
+    X(">>"           , TOKEN_R_SH)                          \
+    X(">="           , TOKEN_GE)                            \
+    X(">"            , TOKEN_GT)                            \
+    X("!"            , TOKEN_NOT)                           \
+    X("!="           , TOKEN_NEQ)                           \
+    X("~"            , TOKEN_BIT_NOT)                       \
+    X("["            , TOKEN_L_BRACKET)                     \
+    X("]"            , TOKEN_R_BRACKET)                     \
+    X("("            , TOKEN_L_PAREN)                       \
+    X(")"            , TOKEN_R_PAREN)                       \
+    X("{"            , TOKEN_L_BRACE)                       \
+    X("}"            , TOKEN_R_BRACE)                       \
+    X(";"            , TOKEN_SEMICOLON)                     \
+    X(","            , TOKEN_COMMA)                         \
+    X(":"            , TOKEN_COLON)                         \
+    X("."            , TOKEN_DOT)                           \
+    X("..."          , TOKEN_ELLIPSIS)                      \
+    X("?"            , TOKEN_COND)                          \
+    X(identifier     , TOKEN_IDENT)                    \
+    X(int_literal    , TOKEN_INT_LITERAL)                   \
+    X(float_literal  , TOKEN_FLOAT_LITERAL)                 \
+    X(char_literal   , TOKEN_CHAR_LITERAL)                  \
+    X(string_literal , TOKEN_STRING_LITERAL)                \
+    // END
+
+// Token type enumeration: the TOKEN_TABLE constants come first (so TOKEN_EOF
+// is 0), followed by one constant per KEYWORD_TABLE entry.
+enum TokenType {
+    // Ordinary tokens
+    #define X(str, tok) tok,
+    TOKEN_TABLE
+    #undef X
+    
+    // Keywords (same table format as elsewhere)
+    #define X(name, std, tok) tok,
+    KEYWORD_TABLE
+    #undef X
+};
+
+// Constant payload attached to a token. `have` is non-zero when one of the
+// union members has been filled in; which member is valid depends on the
+// token type.
+struct TokenConstant {
+    int have;
+    union {
+        char ch;
+        int i;
+        float f;
+        double d;
+        long long ll;
+        char* str;
+    };
+};
+
+// "break"
+// "case"
+// "char"
+// "const"
+// "continue"
+// "default"
+// "do"
+// "double"
+// "else"
+// "enum"
+// "extern"
+// "float"
+// "for"
+// "goto"
+// "if"
+// "inline (C99)"
+// "int"
+// "long"
+// "register"
+// "restrict (C99)"
+// "return"
+// "short"
+// "signed"
+// "sizeof"
+// "static"
+// "struct"
+// "switch"
+// "typedef"
+// "union"
+// "unsigned"
+// "void"
+// "volatile"
+// "while"
+
+// alignas (C23)
+// alignof (C23)
+// auto
+// bool (C23)
+// constexpr (C23)
+// false (C23)
+// nullptr (C23)
+// static_assert (C23)
+// thread_local (C23)
+// true (C23)
+// typeof (C23)
+// typeof_unqual (C23)
+// _Alignas (C11)
+// _Alignof (C11)
+// _Atomic (C11)
+// _BitInt (C23)
+// _Bool (C99)
+// _Complex (C99)
+// _Decimal128 (C23)
+// _Decimal32 (C23)
+// _Decimal64 (C23)
+// _Generic (C11)
+// _Imaginary (C99)
+// _Noreturn (C11)
+// _Static_assert (C11)
+// _Thread_local (C11)
+
+// a = b
+// a += b
+// a -= b
+// a *= b
+// a /= b
+// a %= b
+// a &= b
+// a |= b
+// a ^= b
+// a <<= b
+// a >>= b
+
+// ++a
+// --a
+// a++
+// a--
+
+// +a
+// -a
+// a + b
+// a - b
+// a * b
+// a / b
+// a % b
+// ~a
+// a & b
+// a | b
+// a ^ b
+// a << b
+// a >> b
+
+// !a
+// a && b
+// a || b
+
+// a == b
+// a != b
+// a < b
+// a > b
+// a <= b
+// a >= b
+
+// a[b]
+// *a
+// &a
+// a->b
+// a.b
+
+// a(...)
+// a, b
+// (type) a
+// a ? b : c
+// sizeof
+
+// _Alignof
+// (C11)
+
+#endif