feat(build): 引入新的 Python 构建系统并移除旧 Makefile

新增基于 Python 的构建脚本 `cbuild.py`，支持包管理、依赖解析和模块化编译。同时添加 `.gitignore` 忽略 `build` 目录，并在 `justfile` 中更新构建命令。移除了原有的 `lib/Makefile` 和主目录下的相关 make 规则，统一使用新构建系统。
2025-11-20 10:44:59 +08:00
parent 8d97fe896c
commit e22811f2f5
140 changed files with 1996 additions and 10098 deletions
--- a/libs/lexer/src/lexer.c
+++ b/libs/lexer/src/lexer.c
@@ -0,0 +1,637 @@
+/**
+ * 仿照LCCompiler的词法分析部分
+ * 
+ * 如下为LCC的README in 2025.2
+This hierarchy is the distribution for lcc version 4.2.
+
+lcc version 3.x is described in the book "A Retargetable C Compiler:
+Design and Implementation" (Addison-Wesley, 1995, ISBN 0-8053-1670-1).
+There are significant differences between 3.x and 4.x, most notably in
+the intermediate code. For details, see
+https://drh.github.io/lcc/documents/interface4.pdf.
+
+VERSION 4.2 IS INCOMPATIBLE WITH EARLIER VERSIONS OF LCC. DO NOT
+UNLOAD THIS DISTRIBUTION ON TOP OF A 3.X DISTRIBUTION.
+
+LCC is a C89 ("ANSI C") compiler designed to be highly retargetable.
+
+LOG describes the changes since the last release.
+
+CPYRIGHT describes the conditions under you can use, copy, modify, and
+distribute lcc or works derived from lcc.
+
+doc/install.html is an HTML file that gives a complete description of
+the distribution and installation instructions.
+
+Chris Fraser / cwf@aya.yale.edu
+David Hanson / drh@drhanson.net
+ */
+#include <lexer_log.h>
+#include <lexer.h>
+// Keyword lookup table: one { spelling, std_type, tok } row per entry of the KEYWORD_TABLE X-macro.
+static const struct {
+    const char* name;       // keyword spelling, obtained by stringizing the X-macro `name` argument
+    ckeyword_t std_type;    // the `std_type` argument of the table entry
+    token_type_t tok;       // token type lexer_get_token() reports when this keyword is matched
+} keywords[] = {            // keyword_cmp() binary-searches this array, so rows must be ordered by name — NOTE(review): ordering comes from KEYWORD_TABLE, confirm
+    #define X(name, subtype, tok, std_type,...) { #name, std_type, tok },   // `subtype` and any trailing arguments are ignored here
+    KEYWORD_TABLE
+    #undef X
+};
+
+/**
+ * Look up an identifier spelling in the `keywords` table.
+ *
+ * A binary search is used, so `keywords` has to be ordered by name in
+ * ascending unsigned-byte order.
+ *
+ * @param name  candidate spelling; only its first `len` bytes are examined
+ * @param len   number of bytes in `name`
+ * @return index into `keywords` on an exact match, -1 when `name` is not a keyword
+ */
+static inline int keyword_cmp(const char* name, int len) {
+    int first = 0;
+    int last = sizeof(keywords) / sizeof(keywords[0]) - 1;
+
+    while (first <= last) {
+        const int probe = (first + last) / 2;
+        const char *candidate = keywords[probe].name;
+        int order = 0;
+
+        // Compare byte by byte over the first `len` bytes; stop at the first
+        // difference or at a NUL found inside `name`.
+        int idx = 0;
+        while (idx < len) {
+            if (name[idx] != candidate[idx]) {
+                order = (unsigned char)name[idx] - (unsigned char)candidate[idx];
+                break;
+            }
+            if (name[idx] == '\0') {
+                break;
+            }
+            idx++;
+        }
+
+        if (order == 0 && candidate[len] == '\0') {
+            return probe;   // same bytes and same length
+        }
+
+        // order == 0 at this point means the keyword is longer than the
+        // input, so the input sorts before it.
+        if (order <= 0) {
+            last = probe - 1;
+        } else {
+            first = probe + 1;
+        }
+    }
+    return -1; // Not a keyword.
+}
+
+/**
+ * Attach a lexer to its input stream and reset its position.
+ *
+ * The position starts at line 1, column 1, offset 0 and takes its file name
+ * (and name length) from the stream.
+ *
+ * @param lexer   lexer state to initialise
+ * @param stream  character stream the lexer will read from
+ */
+void lexer_init(smcc_lexer_t* lexer, lexer_stream_t* stream) {
+    const lexer_loc_t origin = {
+        .name = stream->name,
+        .name_len = stream->name_len,
+        .line = 1,
+        .column = 1,
+        .offset = 0,
+    };
+
+    lexer->stream = stream;
+    lexer->pos = origin;
+}
+// Shorthands for the stream callbacks and for advancing the lexer's position counters.
+#define stream_reset_char(stream)   ((stream)->reset_char(stream))  // calls the stream's reset_char callback; presumably rewinds the peek look-ahead — TODO confirm against lexer_stream_t
+#define stream_next_char(stream)    ((stream)->next_char(stream))   // calls next_char; callers treat the result as the consumed character or lexer_stream_eof
+#define stream_peek_char(stream)    ((stream)->peek_char(stream))   // calls peek_char; callers invoke it repeatedly to look further ahead — NOTE(review): exact cursor semantics not visible here
+#define lexer_next_pos(lexer)       ((lexer)->pos.column ++, (lexer)->pos.offset ++)    // one character consumed: bump column and offset
+#define lexer_next_line(lexer)      ((lexer)->pos.line ++, (lexer)->pos.column = 1)     // new line: bump line, restart column at 1 (offset is not touched)
+#define set_err_token(token)        ((token)->type = TOKEN_UNKNOWN)                     // mark the token as unrecognised
+
+/**
+ * Consume the remainder of the current line, including its '\n'.
+ *
+ * The token is tagged TOKEN_LINE_COMMENT. Every consumed character advances
+ * column/offset; the terminating newline additionally advances the line
+ * counter. Reaching end of input stops the scan without touching the position.
+ *
+ * @param lexer  lexer whose stream and position are advanced
+ * @param token  token whose type is set
+ */
+static void skip_newline(smcc_lexer_t* lexer, lexer_tok_t* token) {
+    lexer_stream_t* stream = lexer->stream;
+    int ch;
+
+    token->type = TOKEN_LINE_COMMENT;
+
+    while ((ch = stream_next_char(stream)) != lexer_stream_eof) {
+        lexer_next_pos(lexer);
+        if (ch == '\n') {
+            // End of line reached: move to the next line and stop.
+            lexer_next_line(lexer);
+            break;
+        }
+    }
+}
+
+/**
+ * Consume a block comment, from its opening marker through the closing one.
+ *
+ * The token is tagged TOKEN_BLOCK_COMMENT. The caller must have positioned the
+ * stream on the opening '/' (checked with Assert). Line/column/offset are
+ * updated for every consumed character. An unterminated comment produces a
+ * warning and leaves the stream at end of input.
+ *
+ * @param lexer  lexer whose stream and position are advanced
+ * @param token  token whose type is set
+ */
+static void skip_block_comment(smcc_lexer_t* lexer, lexer_tok_t* token) {
+    lexer_stream_t* stream = lexer->stream;
+    token->type = TOKEN_BLOCK_COMMENT;
+    int ch;
+
+    // Consume the two opening characters.
+    stream_reset_char(stream);
+    ch = stream_next_char(stream);
+    lexer_next_pos(lexer);
+    // FIXME Assertion
+    Assert (ch == '/');
+    ch = stream_next_char(stream);
+    lexer_next_pos(lexer);
+    Assert (ch == '*');
+
+    // Scan forward until the closing marker is found.
+    while (1) {
+        ch = stream_next_char(stream);
+
+        if (ch == lexer_stream_eof) {
+            // Unterminated comment. Nothing was consumed, so the position is
+            // left alone (same policy as skip_newline).
+            LEX_WARN("Unterminated block comment");
+            return;
+        }
+        lexer_next_pos(lexer);
+
+        if (ch == '\n') {
+            lexer_next_line(lexer);
+        } else if (ch == '*') {
+            // A '*' only ends the comment when directly followed by '/'.
+            int next_ch = stream_peek_char(stream);
+
+            if (next_ch == '/') {
+                // Consume the '/' and finish.
+                stream_next_char(stream);
+                lexer_next_pos(lexer);
+                return;
+            }
+        }
+    }
+}
+
+/**
+ * Translate the character following a backslash into the value it denotes.
+ *
+ * Handles the simple escapes \\ \' \" \? \0 \a \b \f \n \r \t \v.
+ * TODO: octal (\ooo beyond \0), hexadecimal (\xhh) and universal character
+ * names are not supported yet.
+ *
+ * @param peek  character that follows the backslash
+ * @return the escaped character value, or -1 when the escape is not recognised
+ */
+static inline int got_slash(int peek) {
+    switch (peek) {
+        case '\\':  return '\\';
+        case '\'':  return '\'';
+        case '\"':  return '\"';
+        case '\?':  return '\?';
+        case '0':   return '\0';
+
+        case 'a': return '\a';
+        case 'b': return '\b';
+        case 'f': return '\f';
+        case 'n': return '\n';
+        case 'r': return '\r';
+        case 't': return '\t';
+        case 'v': return '\v';
+        default: break;
+    }
+    return -1;
+}
+
+/**
+ * Lex a character literal such as 'a' or '\n'.
+ *
+ * On success the token is TOKEN_CHAR_LITERAL and token->value.ch holds the
+ * character value. On any malformed input the token type is set to
+ * TOKEN_UNKNOWN. The lexer position is advanced for every character that is
+ * taken from the stream, including the closing quote.
+ *
+ * @param lexer  lexer whose stream and position are advanced
+ * @param token  token to fill in
+ */
+static void parse_char(smcc_lexer_t* lexer, lexer_tok_t* token) {
+    token->loc = lexer->pos;
+    token->type = TOKEN_CHAR_LITERAL;
+    lexer_stream_t *stream = lexer->stream;
+    stream_reset_char(stream);
+    int ch = stream_peek_char(stream);
+
+    if (ch == lexer_stream_eof) {
+        LEX_WARN("Unexpected EOF at begin");
+        goto ERR;
+    } else if (ch != '\'') {
+        LEX_WARN("Unexpected character '%c' at begin", ch);
+        goto ERR;
+    }
+    // Consume the opening quote.
+    stream_next_char(stream);
+    lexer_next_pos(lexer);
+
+    // Read the literal's content.
+    ch = stream_next_char(stream);
+    lexer_next_pos(lexer);
+
+    if (ch == lexer_stream_eof) {
+        LEX_WARN("Unexpected EOF at middle");
+        goto ERR;
+    } else if (ch == '\'') {
+        // '' has no content; do not mistake the closing quote for the value.
+        LEX_ERROR("Empty character literal");
+        goto ERR;
+    } else if (ch == '\\') {
+        ch = stream_next_char(stream);
+        lexer_next_pos(lexer);
+        if ((ch = got_slash(ch)) == -1) {
+            LEX_ERROR("Invalid escape character");
+            // TODO handle the remaining escape forms
+            goto ERR;
+        }
+    }
+    token->value.ch = ch;
+
+    // Consume the closing quote. The position is advanced whether or not the
+    // character matches, because it has been taken from the stream either way.
+    ch = stream_next_char(stream);
+    lexer_next_pos(lexer);
+    if (ch != '\'') {
+        LEX_ERROR("Unclosed character literal '%c' at end, expect `'`", ch);
+        goto ERR;
+    }
+
+    return;
+ERR:
+    set_err_token(token);
+}
+
+/**
+ * Lex a double-quoted string literal.
+ *
+ * On return the token is TOKEN_STRING_LITERAL and token->value.cstr refers to
+ * the decoded bytes (escape sequences already translated by got_slash). If the
+ * first character is not '"' the token type is set to TOKEN_UNKNOWN.
+ *
+ * An unterminated literal (newline or end of input before the closing quote)
+ * is reported, and the text collected so far is still returned as a string
+ * token; the newline itself is left in the stream. An unknown escape is
+ * reported and the escaped character is kept literally.
+ *
+ * The buffer created with cstring_new() is handed to the token and is not
+ * released here.
+ *
+ * @param lexer  lexer whose stream and position are advanced
+ * @param token  token to fill in
+ */
+static void parse_string(smcc_lexer_t* lexer, lexer_tok_t* token) {
+    token->loc = lexer->pos;
+    token->type = TOKEN_STRING_LITERAL;
+    lexer_stream_t *stream = lexer->stream;
+    stream_reset_char(stream);
+    int ch = stream_peek_char(stream);
+
+    if (ch == lexer_stream_eof) {
+        LEX_WARN("Unexpected EOF at begin");
+        goto ERR;
+    } else if (ch != '"') {
+        LEX_WARN("Unexpected character '%c' at begin", ch);
+        goto ERR;
+    }
+    // Consume the opening quote.
+    stream_next_char(stream);
+    lexer_next_pos(lexer);
+
+    cstring_t str = cstring_new();
+    while (1) {
+        ch = stream_peek_char(stream);
+
+        if (ch == lexer_stream_eof) {
+            LEX_ERROR("Unexpected EOF at string literal");
+            break;
+        } else if (ch == '\n') {
+            LEX_ERROR("Unexpected newline at string literal");
+            break;
+        }
+
+        // Consume the character that was just inspected.
+        stream_next_char(stream);
+        lexer_next_pos(lexer);
+
+        if (ch == '"') {
+            break;  // closing quote
+        }
+
+        if (ch == '\\') {
+            // Read exactly one more character: the escape selector.
+            ch = stream_next_char(stream);
+            if (ch == lexer_stream_eof) {
+                LEX_ERROR("Unexpected EOF at string literal");
+                break;
+            }
+            lexer_next_pos(lexer);
+            // Two characters were consumed after a single peek; restart the
+            // look-ahead so the next peek begins at the unread input.
+            // NOTE(review): assumes reset_char rewinds look-ahead to the
+            // current read position, as its other uses in this file suggest.
+            stream_reset_char(stream);
+
+            int val = got_slash(ch);
+            if (val == -1) {
+                LEX_ERROR("Invalid escape character it is \\%c [%d]", ch, ch);
+                val = ch;   // keep the character itself
+            }
+            ch = val;
+        }
+
+        cstring_push(&str, ch);
+    }
+
+    token->value.cstr.data = (char*)cstring_as_cstr(&str);
+    token->value.cstr.len = cstring_len(&str);
+    return;
+ERR:
+    set_err_token(token);
+}
+
+/**
+ * Lex an integer literal and store its value in token->value.n.
+ *
+ * The base is chosen from the prefix: 0x/0X selects 16, 0b/0B selects 2,
+ * a leading 0 followed by an octal digit selects 8, anything else is 10.
+ * Scanning stops at the first character that is not a letter or digit, or at
+ * a letter/digit that is not valid in the chosen base (reported as an error).
+ * The token becomes TOKEN_INT_LITERAL unless the input is already at end of
+ * input, in which case it becomes TOKEN_UNKNOWN.
+ *
+ * Suffixes, floating-point forms and overflow are not handled.
+ *
+ * @param lexer  lexer whose stream and position are advanced
+ * @param token  token to fill in
+ */
+static void parse_number(smcc_lexer_t* lexer, lexer_tok_t* token) {
+    lexer_stream_t *stream = lexer->stream;
+
+    token->loc = lexer->pos;
+    stream_reset_char(stream);
+
+    const int first = stream_peek_char(stream);
+    if (first == lexer_stream_eof) {
+        LEX_WARN("Unexpected EOF at begin");
+        set_err_token(token);
+        return;
+    }
+
+    // Work out the base and how many prefix characters have to be dropped.
+    int base = 10;
+    int prefix_len = 0;
+    if (first == '0') {
+        switch (stream_peek_char(stream)) {
+        case 'x': case 'X':
+            base = 16;
+            prefix_len = 2;
+            break;
+        case 'b': case 'B':
+            // FIXME C23 external integer base
+            base = 2;
+            prefix_len = 2;
+            break;
+        case '0': case '1': case '2': case '3':
+        case '4': case '5': case '6': case '7':
+            base = 8;
+            prefix_len = 1;
+            break;
+        default:
+            break;
+        }
+    }
+    while (prefix_len-- > 0) {
+        stream_next_char(stream);
+        lexer_next_pos(lexer);
+    }
+
+    // Accumulate the digits.
+    stream_reset_char(stream);
+    token->value.n = 0;
+    for (;;) {
+        const int c = stream_peek_char(stream);
+        int digit;
+
+        if (c == lexer_stream_eof) {
+            break;
+        }
+        if (c >= '0' && c <= '9') {
+            digit = c - '0';
+        } else if (c >= 'a' && c <= 'z') {
+            digit = c - 'a' + 10;
+        } else if (c >= 'A' && c <= 'Z') {
+            digit = c - 'A' + 10;
+        } else {
+            break;
+        }
+
+        if (digit >= base) {
+            LOG_ERROR("Invalid digit");
+            break;
+        }
+
+        stream_next_char(stream);
+        lexer_next_pos(lexer);
+        token->value.n = token->value.n * base + digit;
+        // TODO number overflow
+    }
+
+    token->type = TOKEN_INT_LITERAL;
+}
+
+/**
+ * Skip blanks (' ' and '\t') in front of the next field of a directive.
+ *
+ * @return the first non-blank character, which is left unconsumed
+ */
+static int parse_line_skip_blanks(smcc_lexer_t* lexer) {
+    lexer_stream_t *stream = lexer->stream;
+    int ch;
+
+    stream_reset_char(stream);
+    while ((ch = stream_peek_char(stream)) == ' ' || ch == '\t') {
+        stream_next_char(stream);
+        lexer_next_pos(lexer);
+    }
+    return ch;
+}
+
+/**
+ * Handle a line that starts with '#'.
+ *
+ * Only `#line <number>` and `#line <number> "<file>"` are understood; the
+ * line number (and file name, when given) are written to token->loc. Every
+ * other directive belongs to the preprocessor and is skipped with a warning.
+ * In all cases exactly the directive's own line is consumed.
+ *
+ * On a recognised directive the token type is whatever skip_newline leaves
+ * (TOKEN_LINE_COMMENT); otherwise it is TOKEN_UNKNOWN.
+ *
+ * NOTE(review): only token->loc is updated, lexer->pos is not, so the
+ * directive does not yet influence the location of later tokens.
+ *
+ * @param lexer  lexer whose stream and position are advanced
+ * @param token  token to fill in
+ */
+static void parse_line(smcc_lexer_t* lexer, lexer_tok_t* token) {
+    token->loc = lexer->pos;
+    lexer_stream_t *stream = lexer->stream;
+    int line_no = 0;
+    stream_reset_char(stream);
+    int ch = stream_peek_char(stream);
+
+    if (ch == lexer_stream_eof) {
+        LEX_WARN("Unexpected EOF at begin");
+        goto ERR;
+    } else if (ch != '#') {
+        LEX_WARN("Unexpected character '%c' at begin", ch);
+        goto ERR;
+    }
+    // Consume the '#' itself so the keyword match starts on the next character.
+    stream_next_char(stream);
+    lexer_next_pos(lexer);
+
+    const char line[] = "line";
+
+    // Match the four letters only; sizeof also counts the terminating NUL.
+    for (int i = 0; i < (int)sizeof(line) - 1; i++) {
+        ch = stream_next_char(stream);
+        if (ch != lexer_stream_eof) {
+            lexer_next_pos(lexer);
+        }
+        if (ch != line[i]) {
+            LEX_WARN("Maroc does not support in lexer rather in preprocessor, it will be ignored");
+            if (ch == '\n') {
+                // The directive ended right here; its line is already consumed.
+                lexer_next_line(lexer);
+                goto ERR;
+            }
+            goto SKIP_LINE;
+        }
+    }
+
+    // Line number.
+    ch = parse_line_skip_blanks(lexer);
+    if (ch < '0' || ch > '9') {
+        LEX_ERROR("Invalid line number");
+        goto SKIP_LINE;
+    }
+    parse_number(lexer, token);
+    if (token->type != TOKEN_INT_LITERAL) {
+        LEX_ERROR("Invalid line number");
+        goto SKIP_LINE;
+    }
+    // Remember the number now: parsing the file name reuses token->value.
+    line_no = (int)token->value.n;
+
+    // Optional file name.
+    ch = parse_line_skip_blanks(lexer);
+    if (ch == '\n' || ch == lexer_stream_eof) {
+        skip_newline(lexer, token);
+        token->loc.line = line_no;
+        return;
+    }
+    if (ch != '"') {
+        LEX_ERROR("Invalid `#` line");
+        goto SKIP_LINE;
+    }
+    parse_string(lexer, token);
+    if (token->type != TOKEN_STRING_LITERAL) {
+        LEX_ERROR("Invalid filename");
+        goto SKIP_LINE;
+    }
+
+    skip_newline(lexer, token);
+    token->loc.line = line_no;
+    // FIXME memory leak: the string buffer allocated by parse_string is never released
+    token->loc.name = token->value.cstr.data;
+    token->loc.name_len = token->value.cstr.len;
+
+    return;
+SKIP_LINE:
+    skip_newline(lexer, token);
+ERR:
+    set_err_token(token);
+}
+
+/**
+ * Read the next raw token from the lexer's stream.
+ *
+ * The result may be whitespace (TOKEN_BLANK), a comment, or TOKEN_UNKNOWN;
+ * use lexer_get_valid_token() to skip those. token->loc is set to the lexer
+ * position at the start of the token.
+ *
+ * Operators are recognised with up to two characters of look-ahead and then
+ * consumed through the triple_char / double_char / once_char labels, each of
+ * which consumes one character and falls through to the next. Literals,
+ * comments and '#' lines are delegated to the parse_* / skip_* helpers, which
+ * consume their own input and jump straight to END.
+ *
+ * Operator reference: cppreference, /zh/c/language/operator_arithmetic.html
+ *
+ * @param lexer  lexer whose stream and position are advanced
+ * @param token  token to fill in
+ */
+void lexer_get_token(smcc_lexer_t* lexer, lexer_tok_t* token) {
+    token->loc = lexer->pos;
+    token->type = TOKEN_UNKNOWN;
+    lexer_stream_t *stream = lexer->stream;
+
+    stream_reset_char(stream);
+    token_type_t type = TOKEN_UNKNOWN;
+    int ch = stream_peek_char(stream);
+
+    // Dispatch on the first character.
+    switch (ch) {
+    case '=':
+        switch (stream_peek_char(stream)) {
+            case '=': type = TOKEN_EQ; goto double_char;
+            default: stream_reset_char(stream), type = TOKEN_ASSIGN; break;
+        } break;
+    case '+':
+        switch (stream_peek_char(stream)) {
+            case '+': type = TOKEN_ADD_ADD; goto double_char;
+            case '=': type = TOKEN_ASSIGN_ADD; goto double_char;
+            default: stream_reset_char(stream), type = TOKEN_ADD; break;
+        } break;
+    case '-':
+        switch (stream_peek_char(stream)) {
+            case '-': type = TOKEN_SUB_SUB; goto double_char;
+            case '=': type = TOKEN_ASSIGN_SUB; goto double_char;
+            case '>': type = TOKEN_DEREF; goto double_char;
+            default: stream_reset_char(stream), type = TOKEN_SUB; break;
+        } break;
+    case '*':
+        switch (stream_peek_char(stream)) {
+            case '=': type = TOKEN_ASSIGN_MUL; goto double_char;
+            default: stream_reset_char(stream), type = TOKEN_MUL; break;
+        } break;
+    case '/':
+        switch (stream_peek_char(stream)) {
+            case '=': type = TOKEN_ASSIGN_DIV; goto double_char;
+            case '/': skip_newline(lexer, token); goto END;
+            case '*': skip_block_comment(lexer, token); goto END;
+            default: stream_reset_char(stream), type = TOKEN_DIV; break;
+        } break;
+    case '%':
+        switch (stream_peek_char(stream)) {
+            case '=': type = TOKEN_ASSIGN_MOD; goto double_char;
+            default: stream_reset_char(stream), type = TOKEN_MOD; break;
+        } break;
+    case '&':
+        switch (stream_peek_char(stream)) {
+            case '&': type = TOKEN_AND_AND; goto double_char;
+            case '=': type = TOKEN_ASSIGN_AND; goto double_char;
+            default: stream_reset_char(stream), type = TOKEN_AND; break;
+        } break;
+    case '|':
+        switch (stream_peek_char(stream)) {
+            case '|': type = TOKEN_OR_OR; goto double_char;
+            case '=': type = TOKEN_ASSIGN_OR; goto double_char;
+            default: stream_reset_char(stream), type = TOKEN_OR; break;
+        } break;
+    case '^':
+        switch (stream_peek_char(stream)) {
+            case '=': type = TOKEN_ASSIGN_XOR; goto double_char;
+            default: stream_reset_char(stream), type = TOKEN_XOR; break;
+        } break;
+    case '<':
+        switch (stream_peek_char(stream)) {
+            case '=': type = TOKEN_LE; goto double_char;
+            case '<': {
+                if (stream_peek_char(stream) == '=') {
+                    type = TOKEN_ASSIGN_L_SH;
+                    goto triple_char;
+                } else {
+                    type = TOKEN_L_SH;
+                    goto double_char;
+                }
+                break;
+            }
+            default: stream_reset_char(stream), type = TOKEN_LT; break;
+        } break;
+    case '>':
+        switch (stream_peek_char(stream)) {
+            case '=': type = TOKEN_GE; goto double_char;
+            case '>': {
+                if (stream_peek_char(stream) == '=') {
+                    type = TOKEN_ASSIGN_R_SH;
+                    goto triple_char;
+                } else {
+                    type = TOKEN_R_SH;
+                    goto double_char;
+                }
+                break;
+            }
+            default: stream_reset_char(stream), type = TOKEN_GT; break;
+        } break;
+    case '~':
+        type = TOKEN_BIT_NOT; break;
+    case '!':
+        switch (stream_peek_char(stream)) {
+            case '=': type = TOKEN_NEQ; goto double_char;
+            default: stream_reset_char(stream), type = TOKEN_NOT; break;
+        } break;
+    case '[':
+        type = TOKEN_L_BRACKET; break;
+    case ']':
+        type = TOKEN_R_BRACKET; break;
+    case '(':
+        type = TOKEN_L_PAREN; break;
+    case ')':
+        type = TOKEN_R_PAREN; break;
+    case '{':
+        type = TOKEN_L_BRACE; break;
+    case '}':
+        type = TOKEN_R_BRACE; break;
+    case ';':
+        type = TOKEN_SEMICOLON; break;
+    case ',':
+        type = TOKEN_COMMA; break;
+    case ':':
+        type = TOKEN_COLON; break;
+    case '.':
+        if (stream_peek_char(stream) == '.' && stream_peek_char(stream) == '.') {
+            type = TOKEN_ELLIPSIS;
+            goto triple_char;
+        }
+        type = TOKEN_DOT; break;
+    case '?':
+        type = TOKEN_COND; break;
+    case '\v': case '\r': case '\f':
+    case ' ': case '\t':
+        type = TOKEN_BLANK; break;
+    case '\n':
+        // A newline is reported as a blank. It counts as one consumed
+        // character (offset), then the line counter moves on and the column
+        // restarts, matching what skip_newline and skip_block_comment do.
+        stream_next_char(stream);
+        lexer_next_pos(lexer);
+        lexer_next_line(lexer);
+        // FIXME some error
+        token->type = TOKEN_BLANK;
+        goto END;
+    case '#':
+        // '#' lines are consumed whole and reported as a blank.
+        parse_line(lexer, token);
+        token->type = TOKEN_BLANK;
+        goto END;
+    case '\0':
+    case lexer_stream_eof:
+        // EOF
+        type = TOKEN_EOF;
+        break;
+    case '\'':
+        parse_char(lexer, token);
+        goto END;
+    case '"':
+        parse_string(lexer, token);
+        goto END;
+    case '0': case '1': case '2': case '3': case '4':
+    case '5': case '6': case '7': case '8': case '9':
+        parse_number(lexer, token);
+        goto END;
+    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
+    case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
+    case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
+    case 's': case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z':
+    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
+    case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
+    case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
+    case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':case 'Y': case 'Z':
+    case '_': {
+        // Identifier or keyword. The braces give the declarations below their
+        // own scope: before C23 a case label may not be followed directly by
+        // a declaration.
+        // TODO
+        // if ((ch == 'L' && ch == '\'') || (ch == 'L' && ch == '"')) {
+        //     LEX_ERROR("unsupport wide-character char literal by `L` format");
+        // }
+        // NOTE(review): the first character was already returned by the peek
+        // before the switch and is never pushed into `str`; confirm against
+        // the stream's peek semantics that the collected spelling is complete.
+        cstring_t str = cstring_new();
+        while (1) {
+            ch = stream_peek_char(stream);
+            if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') ||
+                (ch == '_') || (ch >= '0' && ch <= '9')) {
+                stream_next_char(stream);
+                lexer_next_pos(lexer);
+                cstring_push(&str, ch);
+                continue;
+            }
+            break;
+        }
+
+        int res = keyword_cmp((const char*)str.data, str.len);
+        if (res == -1) {
+            // Plain identifier: the token takes over the spelling buffer.
+            token->value.cstr.data = (char*)cstring_as_cstr(&str);
+            token->value.cstr.len = cstring_len(&str);
+            type = TOKEN_IDENT;
+        } else {
+            // Keyword. NOTE(review): `str` is not released on this path.
+            type = keywords[res].tok;
+        }
+        break;
+    }
+    default:
+        LEX_ERROR("unsupport char in sourse code `%c`", ch);
+        break;
+    }
+    goto once_char;
+triple_char:
+    stream_next_char(stream);
+    lexer_next_pos(lexer);
+double_char:
+    stream_next_char(stream);
+    lexer_next_pos(lexer);
+once_char:
+    stream_next_char(stream);
+    lexer_next_pos(lexer);
+    token->type = type;
+END:
+    LEX_DEBUG("get token `%s` in %s:%d:%d", get_tok_name(token->type),
+        token->loc.name, token->loc.line, token->loc.column);
+}
+
+/**
+ * Read the next token that is meaningful to the parser.
+ *
+ * Calls lexer_get_token() repeatedly, discarding tokens whose subtype is
+ * whitespace or comment. A token whose subtype is TK_BASIC_INVALID trips the
+ * assertion.
+ *
+ * @param lexer  lexer to read from
+ * @param token  receives the first token that is neither whitespace nor comment
+ */
+void lexer_get_valid_token(smcc_lexer_t* lexer, lexer_tok_t* token) {
+    for (;;) {
+        lexer_get_token(lexer, token);
+
+        const token_subtype_t type = get_tok_subtype(token->type);
+        AssertFmt(type != TK_BASIC_INVALID, "Invalid token: `%s` at %s:%d:%d",
+            get_tok_name(token->type), token->loc.name, token->loc.line, token->loc.column);
+
+        if (type != TK_BASIC_EMPTYSPACE && type != TK_BASIC_COMMENT) {
+            return;
+        }
+    }
+}