feat(build): 引入新的 Python 构建系统并移除旧 Makefile

新增基于 Python 的构建脚本 `cbuild.py`,支持包管理、依赖解析和模块化编译。
同时添加 `.gitignore` 忽略 `build` 目录,并在 `justfile` 中更新构建命令。
移除了原有的 `lib/Makefile` 和主目录下的相关 make 规则,统一使用新构建系统。
This commit is contained in:
zzy
2025-11-20 10:44:59 +08:00
parent 8d97fe896c
commit e22811f2f5
140 changed files with 1996 additions and 10098 deletions

9
libs/README.md Normal file
View File

@@ -0,0 +1,9 @@
lexer 词法分析
parse 语法分析
ast 抽象语法树
sema 语义分析
ir 中间代码表示(IR)
opt 优化器
codegen 代码生成
target 目标平台支持

5
libs/lexer/README.md Normal file
View File

@@ -0,0 +1,5 @@
# 词法分析
参考 LCC 的词法分析部分
主要使用 LL(n) 硬编码查找token

6
libs/lexer/cbuild.toml Normal file
View File

@@ -0,0 +1,6 @@
[package]
name = "smcc_lex"
dependencies = [
{ name = "libcore", path = "../../runtime/libcore" },
]

View File

@@ -0,0 +1,62 @@
/**
* @file lexer.h
* @brief C语言词法分析器核心数据结构与接口
*/
#ifndef __SMCC_CC_LEXER_H__
#define __SMCC_CC_LEXER_H__
#include <libcore.h>
#include "lexer_stream.h"
#include "lexer_token.h"
typedef struct lexer_loc {
const char *name;
usize name_len;
usize line;
usize column;
usize offset;
} lexer_loc_t;
typedef struct lexer_token {
token_type_t type;
core_cvalue_t value;
lexer_loc_t loc;
} lexer_tok_t;
/**
* @brief 词法分析器核心结构体
*
* 封装词法分析所需的状态信息和缓冲区管理
*/
typedef struct cc_lexer {
lexer_stream_t* stream;
lexer_loc_t pos;
} smcc_lexer_t;
/**
* @brief 初始化词法分析器
* @param[out] lexer 要初始化的词法分析器实例
* @param[in] stream 输入流对象指针
*/
void lexer_init(smcc_lexer_t* lexer, lexer_stream_t* stream);
/**
* @brief 获取原始token
* @param[in] lexer 词法分析器实例
* @param[out] token 输出token存储位置
*
* 此函数会返回所有类型的token包括空白符等无效token
*/
void lexer_get_token(smcc_lexer_t* lexer, lexer_tok_t* token);
/**
* @brief 获取有效token
* @param[in] lexer 词法分析器实例
* @param[out] token 输出token存储位置
*
* 此函数会自动跳过空白符等无效token返回对语法分析有意义的token
*/
void lexer_get_valid_token(smcc_lexer_t* lexer, lexer_tok_t* token);
#endif

View File

@@ -0,0 +1,48 @@
#ifndef __SMCC_LEXER_LOG_H__
#define __SMCC_LEXER_LOG_H__
#include <libcore.h>
#ifndef LEX_LOG_LEVEL
#define LEX_LOG_LEVEL 4
#endif
#if LEX_LOG_LEVEL <= 1
#define LEX_NOTSET( fmt, ...) MLOG_NOTSET(&__smcc_lexer_log, fmt, ##__VA_ARGS__)
#else
#define LEX_NOTSET( fmt, ...)
#endif
#if LEX_LOG_LEVEL <= 2
#define LEX_DEBUG( fmt, ...) MLOG_DEBUG(&__smcc_lexer_log, fmt, ##__VA_ARGS__)
#else
#define LEX_DEBUG( fmt, ...)
#endif
#if LEX_LOG_LEVEL <= 3
#define LEX_INFO( fmt, ...) MLOG_INFO(&__smcc_lexer_log, fmt, ##__VA_ARGS__)
#else
#define LEX_INFO( fmt, ...)
#endif
#if LEX_LOG_LEVEL <= 4
#define LEX_WARN( fmt, ...) MLOG_WARN(&__smcc_lexer_log, fmt, ##__VA_ARGS__)
#else
#define LEX_WARN( fmt, ...)
#endif
#if LEX_LOG_LEVEL <= 5
#define LEX_ERROR( fmt, ...) MLOG_ERROR(&__smcc_lexer_log, fmt, ##__VA_ARGS__)
#else
#define LEX_ERROR( fmt, ...)
#endif
#if LEX_LOG_LEVEL <= 6
#define LEX_FATAL( fmt, ...) MLOG_FATAL(&__smcc_lexer_log, fmt, ##__VA_ARGS__)
#else
#define LEX_FATAL( fmt, ...)
#endif
extern logger_t __smcc_lexer_log;
#endif // __SMCC_LEXER_LOG_H__

View File

@@ -0,0 +1,37 @@
#include <core_type.h>
typedef struct lexer_stream lexer_stream_t;
#define lexer_stream_eof (-1)
struct lexer_stream {
const char* name;
usize name_len;
/// @brief 读取指定数量的字符到缓冲区
usize (*read_buf)(lexer_stream_t* stream, char* buffer, usize count);
/// @brief 获取下一个字符
int (*peek_char)(lexer_stream_t* stream);
/// @brief 重置字符流位置
void (*reset_char) (lexer_stream_t* stream);
/// @brief 读取并消费下一个字符(移动流位置)
int (*next_char)(lexer_stream_t* stream);
/// @brief 释放资源
void (*free_stream) (lexer_stream_t* steam);
};
#ifndef __SMCC_LEXER_NO_MEM_STREAM__
typedef struct lexer_mem_stream {
lexer_stream_t stream;
const char* data;
usize data_length;
usize curr_pos;
usize peek_pos;
cbool owned;
} lexer_mem_stream_t;
lexer_stream_t* lexer_mem_stream_init(lexer_mem_stream_t* stream, const char* data, usize length, cbool need_copy);
#endif

View File

@@ -0,0 +1,137 @@
#ifndef __SMCC_CC_TOKEN_H__
#define __SMCC_CC_TOKEN_H__
#include <libcore.h>
typedef enum ckeyword {
CSTD_C89,
CSTD_C99,
CEXT_ASM,
} ckeyword_t;
// Using Binary Search To Fast Find Keyword
#define KEYWORD_TABLE \
X(asm , TK_BASIC_KEYWORD , TOKEN_ASM , CEXT_ASM) \
X(break , TK_BASIC_KEYWORD , TOKEN_BREAK , CSTD_C89) \
X(case , TK_BASIC_KEYWORD , TOKEN_CASE , CSTD_C89) \
X(char , TK_BASIC_KEYWORD , TOKEN_CHAR , CSTD_C89) \
X(const , TK_BASIC_KEYWORD , TOKEN_CONST , CSTD_C89) \
X(continue , TK_BASIC_KEYWORD , TOKEN_CONTINUE , CSTD_C89) \
X(default , TK_BASIC_KEYWORD , TOKEN_DEFAULT , CSTD_C89) \
X(do , TK_BASIC_KEYWORD , TOKEN_DO , CSTD_C89) \
X(double , TK_BASIC_KEYWORD , TOKEN_DOUBLE , CSTD_C89) \
X(else , TK_BASIC_KEYWORD , TOKEN_ELSE , CSTD_C89) \
X(enum , TK_BASIC_KEYWORD , TOKEN_ENUM , CSTD_C89) \
X(extern , TK_BASIC_KEYWORD , TOKEN_EXTERN , CSTD_C89) \
X(float , TK_BASIC_KEYWORD , TOKEN_FLOAT , CSTD_C89) \
X(for , TK_BASIC_KEYWORD , TOKEN_FOR , CSTD_C89) \
X(goto , TK_BASIC_KEYWORD , TOKEN_GOTO , CSTD_C89) \
X(if , TK_BASIC_KEYWORD , TOKEN_IF , CSTD_C89) \
X(inline , TK_BASIC_KEYWORD , TOKEN_INLINE , CSTD_C99) \
X(int , TK_BASIC_KEYWORD , TOKEN_INT , CSTD_C89) \
X(long , TK_BASIC_KEYWORD , TOKEN_LONG , CSTD_C89) \
X(register , TK_BASIC_KEYWORD , TOKEN_REGISTER , CSTD_C89) \
X(restrict , TK_BASIC_KEYWORD , TOKEN_RESTRICT , CSTD_C99) \
X(return , TK_BASIC_KEYWORD , TOKEN_RETURN , CSTD_C89) \
X(short , TK_BASIC_KEYWORD , TOKEN_SHORT , CSTD_C89) \
X(signed , TK_BASIC_KEYWORD , TOKEN_SIGNED , CSTD_C89) \
X(sizeof , TK_BASIC_KEYWORD , TOKEN_SIZEOF , CSTD_C89) \
X(static , TK_BASIC_KEYWORD , TOKEN_STATIC , CSTD_C89) \
X(struct , TK_BASIC_KEYWORD , TOKEN_STRUCT , CSTD_C89) \
X(switch , TK_BASIC_KEYWORD , TOKEN_SWITCH , CSTD_C89) \
X(typedef , TK_BASIC_KEYWORD , TOKEN_TYPEDEF , CSTD_C89) \
X(union , TK_BASIC_KEYWORD , TOKEN_UNION , CSTD_C89) \
X(unsigned , TK_BASIC_KEYWORD , TOKEN_UNSIGNED , CSTD_C89) \
X(void , TK_BASIC_KEYWORD , TOKEN_VOID , CSTD_C89) \
X(volatile , TK_BASIC_KEYWORD , TOKEN_VOLATILE , CSTD_C89) \
X(while , TK_BASIC_KEYWORD , TOKEN_WHILE , CSTD_C89) \
// KEYWORD_TABLE
#define TOKEN_TABLE \
X(unknown , TK_BASIC_INVALID, TOKEN_UNKNOWN ) \
X(EOF , TK_BASIC_EOF, TOKEN_EOF ) \
X(blank , TK_BASIC_EMPTYSPACE, TOKEN_BLANK ) \
X("==" , TK_BASIC_OPERATOR, TOKEN_EQ ) \
X("=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN ) \
X("++" , TK_BASIC_OPERATOR, TOKEN_ADD_ADD ) \
X("+=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN_ADD ) \
X("+" , TK_BASIC_OPERATOR, TOKEN_ADD ) \
X("--" , TK_BASIC_OPERATOR, TOKEN_SUB_SUB ) \
X("-=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN_SUB ) \
X("->" , TK_BASIC_OPERATOR, TOKEN_DEREF ) \
X("-" , TK_BASIC_OPERATOR, TOKEN_SUB ) \
X("*=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN_MUL ) \
X("*" , TK_BASIC_OPERATOR, TOKEN_MUL ) \
X("/=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN_DIV ) \
X("/" , TK_BASIC_OPERATOR, TOKEN_DIV ) \
X("//" , TK_BASIC_COMMENT , TOKEN_LINE_COMMENT ) \
X("/* */" , TK_BASIC_COMMENT , TOKEN_BLOCK_COMMENT ) \
X("%=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN_MOD ) \
X("%" , TK_BASIC_OPERATOR, TOKEN_MOD ) \
X("&&" , TK_BASIC_OPERATOR, TOKEN_AND_AND ) \
X("&=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN_AND ) \
X("&" , TK_BASIC_OPERATOR, TOKEN_AND ) \
X("||" , TK_BASIC_OPERATOR, TOKEN_OR_OR ) \
X("|=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN_OR ) \
X("|" , TK_BASIC_OPERATOR, TOKEN_OR ) \
X("^=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN_XOR ) \
X("^" , TK_BASIC_OPERATOR, TOKEN_XOR ) \
X("<<=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN_L_SH ) \
X("<<" , TK_BASIC_OPERATOR, TOKEN_L_SH ) \
X("<=" , TK_BASIC_OPERATOR, TOKEN_LE ) \
X("<" , TK_BASIC_OPERATOR, TOKEN_LT ) \
X(">>=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN_R_SH ) \
X(">>" , TK_BASIC_OPERATOR, TOKEN_R_SH ) \
X(">=" , TK_BASIC_OPERATOR, TOKEN_GE ) \
X(">" , TK_BASIC_OPERATOR, TOKEN_GT ) \
X("!" , TK_BASIC_OPERATOR, TOKEN_NOT ) \
X("!=" , TK_BASIC_OPERATOR, TOKEN_NEQ ) \
X("~" , TK_BASIC_OPERATOR, TOKEN_BIT_NOT ) \
X("[" , TK_BASIC_OPERATOR, TOKEN_L_BRACKET ) \
X("]" , TK_BASIC_OPERATOR, TOKEN_R_BRACKET ) \
X("(" , TK_BASIC_OPERATOR, TOKEN_L_PAREN ) \
X(")" , TK_BASIC_OPERATOR, TOKEN_R_PAREN ) \
X("{" , TK_BASIC_OPERATOR, TOKEN_L_BRACE ) \
X("}" , TK_BASIC_OPERATOR, TOKEN_R_BRACE ) \
X(";" , TK_BASIC_OPERATOR, TOKEN_SEMICOLON ) \
X("," , TK_BASIC_OPERATOR, TOKEN_COMMA ) \
X(":" , TK_BASIC_OPERATOR, TOKEN_COLON ) \
X("." , TK_BASIC_OPERATOR, TOKEN_DOT ) \
X("..." , TK_BASIC_OPERATOR, TOKEN_ELLIPSIS ) \
X("?" , TK_BASIC_OPERATOR, TOKEN_COND ) \
X(ident , TK_BASIC_IDENTIFIER, TOKEN_IDENT ) \
X(int_literal , TK_BASIC_LITERAL, TOKEN_INT_LITERAL ) \
X(float_literal , TK_BASIC_LITERAL, TOKEN_FLOAT_LITERAL ) \
X(char_literal , TK_BASIC_LITERAL, TOKEN_CHAR_LITERAL ) \
X(string_literal , TK_BASIC_LITERAL, TOKEN_STRING_LITERAL ) \
// END
// 定义TokenType枚举
typedef enum cc_tktype {
// 处理普通token
#define X(str, subtype, tok) tok,
TOKEN_TABLE
#undef X
// 处理关键字(保持原有格式)
#define X(name, subtype, tok, std) tok,
KEYWORD_TABLE
#undef X
} token_type_t;
typedef enum token_subtype {
TK_BASIC_INVALID, // 错误占位
TK_BASIC_KEYWORD, // 关键字
TK_BASIC_OPERATOR, // 操作符
TK_BASIC_IDENTIFIER, // 标识符
TK_BASIC_LITERAL, // 字面量
TK_BASIC_EMPTYSPACE, // 空白
TK_BASIC_COMMENT, // 注释
TK_BASIC_EOF // 结束标记
} token_subtype_t;
token_subtype_t get_tok_subtype(token_type_t type);
const char* get_tok_name(token_type_t type);
#endif

637
libs/lexer/src/lexer.c Normal file
View File

@@ -0,0 +1,637 @@
/**
* 仿照LCCompiler的词法分析部分
*
* 如下为LCC的README in 2025.2
This hierarchy is the distribution for lcc version 4.2.
lcc version 3.x is described in the book "A Retargetable C Compiler:
Design and Implementation" (Addison-Wesley, 1995, ISBN 0-8053-1670-1).
There are significant differences between 3.x and 4.x, most notably in
the intermediate code. For details, see
https://drh.github.io/lcc/documents/interface4.pdf.
VERSION 4.2 IS INCOMPATIBLE WITH EARLIER VERSIONS OF LCC. DO NOT
UNLOAD THIS DISTRIBUTION ON TOP OF A 3.X DISTRIBUTION.
LCC is a C89 ("ANSI C") compiler designed to be highly retargetable.
LOG describes the changes since the last release.
CPYRIGHT describes the conditions under you can use, copy, modify, and
distribute lcc or works derived from lcc.
doc/install.html is an HTML file that gives a complete description of
the distribution and installation instructions.
Chris Fraser / cwf@aya.yale.edu
David Hanson / drh@drhanson.net
*/
#include <lexer_log.h>
#include <lexer.h>
static const struct {
const char* name;
ckeyword_t std_type;
token_type_t tok;
} keywords[] = {
#define X(name, subtype, tok, std_type,...) { #name, std_type, tok },
KEYWORD_TABLE
#undef X
};
// by using binary search to find the keyword
static inline int keyword_cmp(const char* name, int len) {
int low = 0;
int high = sizeof(keywords) / sizeof(keywords[0]) - 1;
while (low <= high) {
int mid = (low + high) / 2;
const char *key = keywords[mid].name;
int cmp = 0;
// 自定义字符串比较逻辑
for (int i = 0; i < len; i++) {
if (name[i] != key[i]) {
cmp = (unsigned char)name[i] - (unsigned char)key[i];
break;
}
if (name[i] == '\0') break; // 遇到终止符提前结束
}
if (cmp == 0) {
// 完全匹配检查(长度相同)
if (key[len] == '\0') return mid;
cmp = -1; // 当前关键词比输入长
}
if (cmp < 0) {
high = mid - 1;
} else {
low = mid + 1;
}
}
return -1; // Not a keyword.
}
void lexer_init(smcc_lexer_t* lexer, lexer_stream_t* stream) {
lexer->stream = stream;
lexer->pos = (lexer_loc_t) {
.name = stream->name,
.name_len = stream->name_len,
.line = 1,
.column = 1,
.offset = 0,
};
}
#define stream_reset_char(stream) ((stream)->reset_char(stream))
#define stream_next_char(stream) ((stream)->next_char(stream))
#define stream_peek_char(stream) ((stream)->peek_char(stream))
#define lexer_next_pos(lexer) ((lexer)->pos.column ++, (lexer)->pos.offset ++)
#define lexer_next_line(lexer) ((lexer)->pos.line ++, (lexer)->pos.column = 1)
#define set_err_token(token) ((token)->type = TOKEN_UNKNOWN)
static void skip_newline(smcc_lexer_t* lexer, lexer_tok_t* token) {
lexer_stream_t* stream = lexer->stream;
token->type = TOKEN_LINE_COMMENT;
// 循环直到遇到换行符或文件结束
while (1) {
int ch = stream_next_char(stream);
if (ch == lexer_stream_eof) {
// 到达文件末尾,直接返回
return;
}
// 更新位置信息
lexer_next_pos(lexer);
if (ch == '\n') {
// 遇到换行符,增加行号并重置列号
lexer_next_line(lexer);
return;
}
}
}
static void skip_block_comment(smcc_lexer_t* lexer, lexer_tok_t* token) {
lexer_stream_t* stream = lexer->stream;
token->type = TOKEN_BLOCK_COMMENT;
int ch;
stream_reset_char(stream);
ch = stream_next_char(stream);
lexer_next_pos(lexer);
// FIXME Assertion
Assert (ch == '/');
ch = stream_next_char(stream);
lexer_next_pos(lexer);
Assert (ch == '*');
// 我们已经识别了 "/*",现在需要找到 "*/"
while (1) {
ch = stream_next_char(stream);
lexer_next_pos(lexer);
if (ch == lexer_stream_eof) {
// 未闭合的块注释
LEX_WARN("Unterminated block comment");
return;
}
// LEX_ERROR("%c", ch);
// 更新位置信息
if (ch == '\n') {
lexer_next_line(lexer);
} else if (ch == '*') {
// 查看下一个字符是否是 '/'
int next_ch = stream_peek_char(stream);
if (next_ch == '/') {
// 消费 '/' 字符
stream_next_char(stream);
// 更新位置信息
lexer_next_pos(lexer);
// 成功找到注释结束标记
return;
}
}
}
}
// TODO escape character not enough
static inline int got_slash(int peek) {
switch (peek) {
case '\\': return '\\';
case '\'': return '\'';
case '\"': return '\"';
case '\?': return '\?';
case '0': return '\0';
case 'b': return '\b';
case 'f': return '\f';
case 'n': return '\n';
case 'r': return '\r';
case 't': return '\t';
case 'v': return '\v';
default: break;
}
return -1;
}
static void parse_char(smcc_lexer_t* lexer, lexer_tok_t* token) {
token->loc = lexer->pos;
token->type = TOKEN_CHAR_LITERAL;
lexer_stream_t *stream = lexer->stream;
stream_reset_char(stream);
int ch = stream_peek_char(stream);
if (ch == lexer_stream_eof) {
LEX_WARN("Unexpected EOF at begin");
goto ERR;
} else if (ch != '\'') {
LEX_WARN("Unexpected character '%c' at begin", ch);
goto ERR;
}
stream_next_char(stream);
lexer_next_pos(lexer);
ch = stream_next_char(stream);
lexer_next_pos(lexer);
if (ch == lexer_stream_eof) {
LEX_WARN("Unexpected EOF at middle");
goto ERR;
} else if (ch == '\\') {
ch = stream_next_char(stream);
lexer_next_pos(lexer);
if ((ch = got_slash(ch)) == -1) {
LEX_ERROR("Invalid escape character");
// TODO 特殊情况处理
goto ERR;
}
token->value.ch = ch;
} else {
token->value.ch = ch;
}
if ((ch = stream_next_char(stream)) != '\'') {
LEX_ERROR("Unclosed character literal '%c' at end, expect `'`", ch);
lexer_next_pos(lexer);
goto ERR;
}
return;
ERR:
set_err_token(token);
}
static void parse_string(smcc_lexer_t* lexer, lexer_tok_t* token) {
token->loc = lexer->pos;
token->type = TOKEN_STRING_LITERAL;
lexer_stream_t *stream = lexer->stream;
stream_reset_char(stream);
int ch = stream_peek_char(stream);
if (ch == lexer_stream_eof) {
LEX_WARN("Unexpected EOF at begin");
goto ERR;
} else if (ch != '"') {
LEX_WARN("Unexpected character '%c' at begin", ch);
goto ERR;
}
stream_next_char(stream);
lexer_next_pos(lexer);
int base = 0;
cstring_t str = cstring_new();
while (1) {
ch = stream_peek_char(stream);
if (ch == lexer_stream_eof) {
LEX_ERROR("Unexpected EOF at string literal");
break;
} else if (ch == '\n') {
LEX_ERROR("Unexpected newline at string literal");
break;
} else if (ch == '\\') {
// TODO bad practice and maybe bugs here
stream_next_char(stream);
ch = stream_next_char(stream);
int val = got_slash(ch);
if (val == -1) {
LEX_ERROR("Invalid escape character it is \\%c [%d]", ch, ch);
} else {
cstring_push(&str, val);
continue;
}
} else if (ch == '"') {
stream_next_char(stream);
lexer_next_pos(lexer);
break;
}
stream_next_char(stream);
lexer_next_pos(lexer);
cstring_push(&str, ch);
}
token->value.cstr.data = (char*)cstring_as_cstr(&str);
token->value.cstr.len = cstring_len(&str);
return;
ERR:
set_err_token(token);
}
static void parse_number(smcc_lexer_t* lexer, lexer_tok_t* token) {
token->loc = lexer->pos;
lexer_stream_t *stream = lexer->stream;
stream_reset_char(stream);
int ch = stream_peek_char(stream);
int base = 0;
if (ch == lexer_stream_eof) {
LEX_WARN("Unexpected EOF at begin");
goto ERR;
} else if (ch == '0') {
ch = stream_peek_char(stream);
if (ch == 'x' || ch == 'X') {
base = 16;
stream_next_char(stream);
lexer_next_pos(lexer);
stream_next_char(stream);
lexer_next_pos(lexer);
} else if (ch == 'b' || ch == 'B') {
// FIXME C23 external integer base
base = 2;
stream_next_char(stream);
lexer_next_pos(lexer);
stream_next_char(stream);
lexer_next_pos(lexer);
} else if (ch >= '0' && ch <= '7') {
base = 8;
stream_next_char(stream);
lexer_next_pos(lexer);
} else {
base = 10;
}
} else {
base = 10;
}
// 解析整数部分
stream_reset_char(stream);
int tmp = 0;
token->value.n = 0;
while (1) {
ch = stream_peek_char(stream);
if (ch == lexer_stream_eof) {
break;
} else if (ch >= 'a' && ch <= 'z') {
tmp = ch - 'a' + 10;
} else if (ch >= 'A' && ch <= 'Z') {
tmp = ch - 'A' + 10;
} else if (ch >= '0' && ch <= '9') {
tmp = ch - '0';
} else {
break;
}
if (tmp >= base) {
LOG_ERROR("Invalid digit");
break;
}
stream_next_char(stream);
lexer_next_pos(lexer);
token->value.n = token->value.n * base + tmp;
// TODO number overflow
}
token->type = TOKEN_INT_LITERAL;
return;
ERR:
set_err_token(token);
}
static void parse_line(smcc_lexer_t* lexer, lexer_tok_t* token) {
token->loc = lexer->pos;
lexer_stream_t *stream = lexer->stream;
stream_reset_char(stream);
int ch = stream_peek_char(stream);
if (ch == lexer_stream_eof) {
LEX_WARN("Unexpected EOF at begin");
goto ERR;
} else if (ch != '#') {
LEX_WARN("Unexpected character '%c' at begin", ch);
goto ERR;
}
const char line[] = "line";
for (int i = 0; i < sizeof(line); i++) {
ch = stream_next_char(stream);
lexer_next_pos(lexer);
if (ch != line[i]) {
LEX_WARN("Maroc does not support in lexer rather in preprocessor, it will be ignored");
skip_newline(lexer, token);
goto SKIP_LINE;
}
}
parse_number(lexer, token);
if (token->type != TOKEN_INT_LITERAL) {
LEX_ERROR("Invalid line number");
goto SKIP_LINE;
}
if (stream_next_char(stream) != ' ') {
skip_newline(lexer, token);
token->loc.line = token->value.n;
}
if (stream_peek_char(stream) != '"') {
LEX_ERROR("Invalid `#` line");
goto SKIP_LINE;
}
parse_string(lexer, token);
if (token->type != TOKEN_STRING_LITERAL) {
LEX_ERROR("Invalid filename");
goto SKIP_LINE;
}
skip_newline(lexer, token);
token->loc.line = token->value.n;
// FIXME memory leak
token->loc.name = cstring_as_cstr((const cstring_t *)&token->value.cstr);
token->loc.name_len = cstring_len((const cstring_t *)&token->value.cstr);
return;
SKIP_LINE:
skip_newline(lexer, token);
ERR:
set_err_token(token);
}
// /zh/c/language/operator_arithmetic.html
void lexer_get_token(smcc_lexer_t* lexer, lexer_tok_t* token) {
token->loc = lexer->pos;
token->type = TOKEN_UNKNOWN;
lexer_stream_t *stream = lexer->stream;
stream_reset_char(stream);
token_type_t type = TOKEN_UNKNOWN;
int ch = stream_peek_char(stream);
// once step
switch (ch) {
case '=':
switch (stream_peek_char(stream)) {
case '=': type = TOKEN_EQ; goto double_char;
default: stream_reset_char(stream), type = TOKEN_ASSIGN; break;
} break;
case '+':
switch (stream_peek_char(stream)) {
case '+': type = TOKEN_ADD_ADD; goto double_char;
case '=': type = TOKEN_ASSIGN_ADD; goto double_char;
default: stream_reset_char(stream), type = TOKEN_ADD; break;
} break;
case '-':
switch (stream_peek_char(stream)) {
case '-': type = TOKEN_SUB_SUB; goto double_char;
case '=': type = TOKEN_ASSIGN_SUB; goto double_char;
case '>': type = TOKEN_DEREF; goto double_char;
default: stream_reset_char(stream), type = TOKEN_SUB; break;
} break;
case '*':
switch (stream_peek_char(stream)) {
case '=': type = TOKEN_ASSIGN_MUL; goto double_char;
default: stream_reset_char(stream), type = TOKEN_MUL; break;
} break;
case '/':
switch (stream_peek_char(stream)) {
case '=': type = TOKEN_ASSIGN_DIV; goto double_char;
case '/': skip_newline(lexer, token); goto END;
case '*': skip_block_comment(lexer, token); goto END;
default: stream_reset_char(stream), type = TOKEN_DIV; break;
} break;
case '%':
switch (stream_peek_char(stream)) {
case '=': type = TOKEN_ASSIGN_MOD; goto double_char;
default: stream_reset_char(stream), type = TOKEN_MOD; break;
} break;
case '&':
switch (stream_peek_char(stream)) {
case '&': type = TOKEN_AND_AND; goto double_char;
case '=': type = TOKEN_ASSIGN_AND; goto double_char;
default: stream_reset_char(stream), type = TOKEN_AND; break;
} break;
case '|':
switch (stream_peek_char(stream)) {
case '|': type = TOKEN_OR_OR; goto double_char;
case '=': type = TOKEN_ASSIGN_OR; goto double_char;
default: stream_reset_char(stream), type = TOKEN_OR; break;
} break;
case '^':
switch (stream_peek_char(stream)) {
case '=': type = TOKEN_ASSIGN_XOR; goto double_char;
default: stream_reset_char(stream), type = TOKEN_XOR; break;
} break;
case '<':
switch (stream_peek_char(stream)) {
case '=': type = TOKEN_LE; goto double_char;
case '<': {
if (stream_peek_char(stream) == '=') {
type = TOKEN_ASSIGN_L_SH;
goto triple_char;
} else {
type = TOKEN_L_SH;
goto double_char;
}
break;
}
default: stream_reset_char(stream), type = TOKEN_LT; break;
} break;
case '>':
switch (stream_peek_char(stream)) {
case '=': type = TOKEN_GE; goto double_char;
case '>': {
if (stream_peek_char(stream) == '=') {
type = TOKEN_ASSIGN_R_SH;
goto triple_char;
} else {
type = TOKEN_R_SH;
goto double_char;
}
break;
}
default: stream_reset_char(stream), type = TOKEN_GT; break;
} break;
case '~':
type = TOKEN_BIT_NOT; break;
case '!':
switch (stream_peek_char(stream)) {
case '=': type = TOKEN_NEQ; goto double_char;
default: stream_reset_char(stream), type = TOKEN_NOT; break;
} break;
case '[':
type = TOKEN_L_BRACKET; break;
case ']':
type = TOKEN_R_BRACKET; break;
case '(':
type = TOKEN_L_PAREN; break;
case ')':
type = TOKEN_R_PAREN; break;
case '{':
type = TOKEN_L_BRACE; break;
case '}':
type = TOKEN_R_BRACE; break;
case ';':
type = TOKEN_SEMICOLON; break;
case ',':
type = TOKEN_COMMA; break;
case ':':
type = TOKEN_COLON; break;
case '.':
if (stream_peek_char(stream) == '.' && stream_peek_char(stream) == '.') {
type = TOKEN_ELLIPSIS;
goto triple_char;
}
type = TOKEN_DOT; break;
case '?':
type = TOKEN_COND; break;
case '\v': case '\r': case '\f':
case ' ': case '\t':
type = TOKEN_BLANK; break;
case '\n':
// you need to flush a newline or blank
stream_next_char(stream);
lexer_next_line(lexer);
// FIXME some error
token->type = TOKEN_BLANK;
goto END;
case '#':
parse_line(lexer, token);
token->type = TOKEN_BLANK;
goto END;
case '\0':
case lexer_stream_eof:
// EOF
type = TOKEN_EOF;
break;
case '\'':
parse_char(lexer, token);
goto END;
case '"':
parse_string(lexer, token);
goto END;
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
parse_number(lexer, token);
goto END;
case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
case 's': case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z':
case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':case 'Y': case 'Z':
case '_':
// TOKEN_IDENT
// TODO
// if ((ch == 'L' && ch == '\'') || (ch == 'L' && ch == '"')) {
// LEX_ERROR("unsupport wide-character char literal by `L` format");
// }
cstring_t str = cstring_new();
while (1) {
ch = stream_peek_char(stream);
if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') ||
(ch == '_') || (ch >= '0' && ch <= '9')) {
stream_next_char(stream);
lexer_next_pos(lexer);
cstring_push(&str, ch);
continue;
}
break;
}
int res = keyword_cmp((const char*)str.data, str.len);
if (res == -1) {
token->value.cstr.data = (char*)cstring_as_cstr(&str);
token->value.cstr.len = cstring_len(&str);
type = TOKEN_IDENT; break;
} else {
type = keywords[res].tok; break;
}
default:
LEX_ERROR("unsupport char in sourse code `%c`", ch);
break;
}
goto once_char;
triple_char:
stream_next_char(stream);
lexer_next_pos(lexer);
double_char:
stream_next_char(stream);
lexer_next_pos(lexer);
once_char:
stream_next_char(stream);
lexer_next_pos(lexer);
token->type = type;
END:
LEX_DEBUG("get token `%s` in %s:%d:%d", get_tok_name(token->type),
token->loc.name, token->loc.line, token->loc.column);
}
// lexer_get_token maybe got invalid (with parser)
void lexer_get_valid_token(smcc_lexer_t* lexer, lexer_tok_t* token) {
token_subtype_t type;
do {
lexer_get_token(lexer, token);
type = get_tok_subtype(token->type);
AssertFmt(type != TK_BASIC_INVALID, "Invalid token: `%s` at %s:%d:%d",
get_tok_name(token->type), token->loc.name, token->loc.line, token->loc.column);
} while (type == TK_BASIC_EMPTYSPACE || type == TK_BASIC_COMMENT);
}

View File

@@ -0,0 +1,7 @@
#include <lexer_log.h>
logger_t __smcc_lexer_log = {
.name = "lexer",
.level = LOG_LEVEL_ALL,
.handler = log_default_handler,
};

101
libs/lexer/src/mem_stream.c Normal file
View File

@@ -0,0 +1,101 @@
#include <lexer_stream.h>
#include <lexer_log.h>
#include <libcore.h>
// 内存流的具体实现结构
static usize read_buf(lexer_stream_t* _stream, char* buffer, usize count) {
Assert(buffer != null && buffer != null);
lexer_mem_stream_t* stream = (lexer_mem_stream_t*)_stream;
usize remaining = stream->data_length - stream->curr_pos;
usize to_read = (remaining < count) ? remaining : count;
if (to_read > 0) {
smcc_memcpy(buffer, stream->data + stream->curr_pos, to_read);
stream->curr_pos += to_read;
} else {
LEX_WARN("Reading past end of stream [maybe count is too large or negative?]");
}
return to_read;
}
static int peek_char(lexer_stream_t* _stream) {
Assert(_stream != null);
lexer_mem_stream_t* stream = (lexer_mem_stream_t*)_stream;
// 如果已经到达末尾返回EOF
if (stream->peek_pos >= stream->data_length) {
return lexer_stream_eof; // EOF
}
return (int)(unsigned char)stream->data[stream->peek_pos++];
}
static int next_char(lexer_stream_t* _stream) {
Assert(_stream != NULL);
lexer_mem_stream_t* stream = (lexer_mem_stream_t*)_stream;
// 如果已经到达末尾返回EOF
if (stream->curr_pos >= stream->data_length) {
return lexer_stream_eof; // EOF
}
unsigned char ch = stream->data[stream->curr_pos++];
if (stream->peek_pos < stream->curr_pos) {
stream->peek_pos = stream->curr_pos;
}
return (int)ch;
}
static void reset_char(lexer_stream_t* _stream) {
Assert(_stream != NULL);
lexer_mem_stream_t* stream = (lexer_mem_stream_t*)_stream;
stream->peek_pos = stream->curr_pos;
}
static void free_stream(lexer_stream_t* _stream) {
Assert(_stream != null);
lexer_mem_stream_t* stream = (lexer_mem_stream_t*)_stream;
if (stream->owned) {
smcc_free((void*)stream->data);
}
}
lexer_stream_t* lexer_mem_stream_init(lexer_mem_stream_t* stream, const char* data, usize length, cbool need_copy) {
if (stream == null || data == NULL || length == 0) {
LEX_ERROR("param error");
return null;
}
stream->owned = need_copy;
if (need_copy) {
char* buf = (char*)smcc_malloc(length);
if (buf == null) {
LEX_ERROR("malloc error");
return null;
}
smcc_memcpy(buf, data, length);
stream->data = buf;
} else {
stream->data = data;
}
stream->data_length = length;
stream->curr_pos = 0;
stream->peek_pos = 0;
static const char name[] = "mem_stream";
stream->stream.name = name;
stream->stream.name_len = sizeof(name) - 1;
stream->stream.read_buf = read_buf;
stream->stream.peek_char = peek_char;
stream->stream.next_char = next_char;
stream->stream.reset_char = reset_char;
stream->stream.free_stream = free_stream;
return (void*)stream;
}

30
libs/lexer/src/token.c Normal file
View File

@@ -0,0 +1,30 @@
#include <lexer_token.h>
// 生成字符串映射(根据需求选择#str或#name
static const char* token_strings[] = {
#define X(str, subtype, tok) [tok] = #str,
TOKEN_TABLE
#undef X
#define X(str, subtype, tok, std) [tok] = #str,
KEYWORD_TABLE
#undef X
};
static token_subtype_t token_subtypes[] = {
#define X(str, subtype, tok) [tok] = subtype,
TOKEN_TABLE
#undef X
#define X(str, subtype, tok, std) [tok] = subtype,
KEYWORD_TABLE
#undef X
};
token_subtype_t get_tok_subtype(token_type_t type) {
return token_subtypes[type];
}
const char* get_tok_name(token_type_t type) {
return token_strings[type];
}

View File

@@ -0,0 +1,4 @@
int main() {
}

View File

@@ -0,0 +1,83 @@
#include <lexer.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
/// gcc -g ../lexer.c ../token.c test_lexer.c -o test_lexer
/*
tok_tConstant {
int have;
union {
char ch;
int i;
float f;
double d;
long long ll;
char* str;
};
};
*/
int g_num;
int g_num_arr[3];
int main(int argc, char* argv[]) {
// int num = 0;
if (argc == 3 && strcmp(argv[2], "-nodebug") == 0) {
log_set_level(NULL, LOG_LEVEL_ALL);
}
const char* file_name = __FILE__;
if (argc == 2) {
file_name = argv[1];
}
FILE* fp = fopen(file_name, "rb");
if (fp == NULL) {
perror("open file failed");
return 1;
}
printf("open file success\n");
if (fseek(fp, 0, SEEK_END) != 0) {
perror("fseek failed");
return 1;
}
usize fsize = ftell(fp);
LOG_INFO("file size: %zu", fsize);
if (fseek(fp, 0, SEEK_SET)) {
perror("fseek failed");
return 1;
}
char* buffer = (char*) malloc(fsize);
usize read_ret = fread(buffer, 1, fsize, fp);
fclose(fp);
if (read_ret != fsize) {
LOG_FATAL("fread failed read_ret %u != fsize %u", read_ret, fsize);
free(buffer);
return 1;
}
smcc_lexer_t lexer;
lexer_mem_stream_t mem_stream = {0};
lexer_stream_t* stream = lexer_mem_stream_init(&mem_stream, buffer, fsize, false);
Assert(stream != null);
stream->name = __FILE__;
stream->name_len = strlen(__FILE__);
lexer_init(&lexer, stream);
lexer_tok_t tok;
while (1) {
lexer_get_valid_token(&lexer, &tok);
if (tok.type == TOKEN_EOF) {
break;
}
LOG_INFO("token `%s` at %s:%u:%u", get_tok_name(tok.type), tok.loc.name, tok.loc.line, tok.loc.column);
Assert(tok.loc.offset <= fsize);
// LOG_DEBUG("%s", tok.val.str);
// printf("line: %d, column: %d, type: %3d, typename: %s\n",
// lexer.line, lexer.index, tok.type, get_tok_name(tok.type));
}
free(buffer);
}