feat(lex_parser): 提取字符判断函数并增强解析器断言

将 `is_next_line` 内联函数重命名为 `lex_parse_is_endline` 并新增 `lex_parse_is_whitespace` 函数，统一用于词法解析中的字符分类。同时加强多个解析函数的输入参数断言，提升代码健壮性。此外，修正了 `lex_parse_skip_whitespace` 中的逻辑错误，并优化部分注释和控制流结构。

feat(pprocessor): 初始化预处理器模块并添加基础功能实现

新增预处理器模块 `pprocessor`，包括宏定义、条件编译状态管理以及基本的指令解析框架。实现了标识符解析、空白跳过、关键字查找等功能，并初步支持 `#define` 指令的对象类宏替换。该提交还引入了一组测试用例，覆盖多种宏展开场景及边界情况，确保预处理器的核心行为符合预期。
2025-11-24 22:44:08 +08:00
parent 871d031ceb
commit e6a76e7a86
58 changed files with 1429 additions and 9 deletions
--- a/libs/pprocessor/src/pprocessor.c
+++ b/libs/pprocessor/src/pprocessor.c
@@ -0,0 +1,427 @@
+/**
+ * @file pprocessor.c
+ * @brief C语言预处理器实现
+ */
+
+#include <lex_parser.h>
+#include <pp_token.h>
+#include <pprocessor.h>
+#define PPROCESSER_BUFFER_SIZE (1024)
+
+/**
+ * Hash callback installed on the macro table: hashes a macro name.
+ *
+ * The key is only read, so it is taken as a pointer to const, matching
+ * hash_cmp.
+ *
+ * @param string key to hash
+ * @return value of smcc_strhash32 over the key's C string
+ */
+static u32 hash_func(const cstring_t *string) {
+    return smcc_strhash32(cstring_as_cstr(string));
+}
+
+/**
+ * Key comparison callback installed on the macro table.
+ *
+ * Keys of different length are ordered by length; keys of equal length are
+ * ordered by smcc_strcmp on their C strings.
+ *
+ * @param str1 first key
+ * @param str2 second key
+ * @return negative, zero or positive, in the style of strcmp
+ */
+static int hash_cmp(const cstring_t *str1, const cstring_t *str2) {
+    if (str1->size != str2->size) {
+        // Compare instead of subtracting: the difference of two sizes does
+        // not reliably fit in an int (and wraps if the size type is unsigned).
+        return (str1->size < str2->size) ? -1 : 1;
+    }
+
+    return smcc_strcmp(cstring_as_cstr(str1), cstring_as_cstr(str2));
+}
+
+/**
+ * Registers a macro definition in the preprocessor's macro table.
+ *
+ * The name and both lists are copied by plain struct assignment (shallow
+ * copy), so the stored macro shares whatever storage the arguments refer to.
+ * The table entry is keyed by the name stored inside the new macro object.
+ *
+ * @param pp       preprocessor whose macro table receives the entry
+ * @param name     macro name
+ * @param replaces replacement list, or NULL for an empty list
+ * @param params   parameter list, or NULL for an empty list
+ * @param type     macro kind (e.g. MACRO_OBJECT)
+ */
+static void add_macro(smcc_pp_t *pp, const cstring_t *name,
+                      const macro_list_t *replaces, const macro_list_t *params,
+                      macro_type_t type) {
+    Assert(pp != null);
+    Assert(name != null);
+
+    smcc_macro_t *macro = smcc_malloc(sizeof(*macro));
+    if (macro == null) {
+        // Do not dereference a failed allocation.
+        LOG_FATAL("Failed to allocate memory for macro");
+        return;
+    }
+    macro->name = *name;
+    macro->type = type;
+
+    if (replaces) {
+        macro->replaces = *replaces;
+    } else {
+        vec_init(macro->replaces);
+    }
+
+    if (params) {
+        macro->params = *params;
+    } else {
+        vec_init(macro->params);
+    }
+
+    // NOTE(review): what happens to a previous entry with the same name
+    // depends on hashmap_set; redefinition is not handled here.
+    hashmap_set(&pp->macros, &macro->name, macro);
+}
+
+/**
+ * Looks up a macro definition by name.
+ *
+ * @param pp   preprocessor whose macro table is searched
+ * @param name macro name used as the lookup key
+ * @return whatever hashmap_get yields for the key (callers compare the result
+ *         against null to detect "not defined")
+ */
+static smcc_macro_t *find_macro(smcc_pp_t *pp, cstring_t *name) {
+    smcc_macro_t *found = hashmap_get(&pp->macros, name);
+    return found;
+}
+
+/**
+ * Conditional-compilation scaffold: opens a new #if level.
+ *
+ * Pushes an entry on the preprocessor's if-stack whose state and skip flag
+ * are derived from the condition value.
+ *
+ * @param pp        preprocessor whose if-stack is extended
+ * @param condition text of the condition; not evaluated yet
+ */
+static void handle_if(smcc_pp_t *pp, const char *condition) {
+    if_stack_item_t item;
+    // TODO: cond_value = evaluate_condition(pp, condition);
+    // Until evaluation exists the condition is treated as false, so the value
+    // is at least deterministic instead of an uninitialized read.
+    int cond_value = 0;
+    (void)condition;
+
+    item.state = cond_value ? IFState_TRUE : IFState_FALSE;
+    item.skip = !cond_value;
+    vec_push(pp->if_stack, item);
+}
+
+/**
+ * Handles #else for the innermost open conditional.
+ *
+ * Does nothing when the if-stack is empty or when the innermost entry is
+ * already in the ELSE state; otherwise inverts that entry's skip flag and
+ * marks it as ELSE.
+ *
+ * @param pp preprocessor whose if-stack is updated
+ */
+static void handle_else(smcc_pp_t *pp) {
+    if (pp->if_stack.size != 0) {
+        if_stack_item_t *current =
+            &vec_at(pp->if_stack, pp->if_stack.size - 1);
+        if (current->state != IFState_ELSE) {
+            current->skip = !current->skip;
+            current->state = IFState_ELSE;
+        }
+        // otherwise: error, #else appeared twice (not reported yet)
+    }
+    // otherwise: error, no matching #if (not reported yet)
+}
+
+/**
+ * Placeholder for #include handling; the body is empty and no argument is
+ * used yet.
+ *
+ * @param pp            preprocessor instance
+ * @param filename      name of the file to include
+ * @param system_header presumably selects the <...> form over the "..." form
+ *                      (TODO confirm once implemented)
+ */
+static void handle_include(smcc_pp_t *pp, const char *filename,
+                           int system_header) {
+    // Planned steps (not implemented):
+    //   - locate the file on the search path
+    //   - create a new input stream for it
+    //   - process the included file recursively
+}
+
+/**
+ * Reads an identifier ([A-Za-z_][A-Za-z0-9_]*) from the stream.
+ *
+ * The stream's peek state is reset first. If the next character cannot start
+ * an identifier, a warning is logged, nothing is consumed and the returned
+ * string is empty.
+ *
+ * @param stream input stream positioned at the candidate identifier
+ * @return newly created string holding the identifier (possibly empty)
+ */
+static cstring_t parse_identifier(core_stream_t *stream) {
+    cstring_t result = cstring_new();
+    core_stream_reset_char(stream);
+
+    int c = core_stream_peek_char(stream);
+    const int can_start =
+        (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_';
+    if (!can_start) {
+        LOG_WARN("Invalid identifier");
+        return result;
+    }
+
+    for (;;) {
+        cstring_push(&result, (char)c);
+        core_stream_next_char(stream); // consume the character just stored
+        c = core_stream_peek_char(stream);
+
+        const int is_letter = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
+        const int is_digit = (c >= '0' && c <= '9');
+        if (!is_letter && !is_digit && c != '_') {
+            break;
+        }
+    }
+
+    return result;
+}
+
+/**
+ * Consumes consecutive ' ' and '\t' characters from the stream.
+ *
+ * Stops at end of input or at the first character that is neither a space
+ * nor a tab; that character is left unconsumed.
+ *
+ * @param stream input stream to advance
+ */
+static void skip_whitespace(core_stream_t *stream) {
+    core_stream_reset_char(stream);
+    for (;;) {
+        const int c = core_stream_peek_char(stream);
+        if (c == core_stream_eof || (c != ' ' && c != '\t')) {
+            return;
+        }
+        core_stream_next_char(stream);
+    }
+}
+
+// Directive names as strings, one per PP_INST_TOKEN entry, in list order.
+#define X(name, type, tok) SMCC_STR(name),
+static const char *token_strings[] = {PP_INST_TOKEN};
+#undef X
+
+// Directive keyword table: pairs each directive name with its token, one
+// entry per PP_INST_TOKEN entry, in list order.
+// NOTE(review): keyword_cmp binary-searches this table, which only works if
+// PP_INST_TOKEN lists the names in ascending byte order - confirm in the
+// header that defines it.
+static const struct {
+    const char *name;
+    pp_token_t tok;
+} keywords[] = {
+#define X(name, type, tok) {#name, tok},
+    PP_INST_TOKEN
+#undef X
+};
+
+/**
+ * Looks up a directive name in the keywords table using binary search.
+ *
+ * At most `len` characters of `name` are considered; a NUL inside that range
+ * ends the name early. The table must be sorted by name for the search to be
+ * valid.
+ *
+ * @param name directive name (need not be NUL-terminated within `len`)
+ * @param len  number of characters of `name` to compare
+ * @return the token stored in the matching table entry, or -1 if the name is
+ *         not a keyword
+ */
+static inline int keyword_cmp(const char *name, int len) {
+    int low = 0;
+    int high = (int)(sizeof(keywords) / sizeof(keywords[0])) - 1;
+    while (low <= high) {
+        int mid = low + (high - low) / 2;
+        const char *key = keywords[mid].name;
+
+        // Advance over the common prefix. key[i] is only read while all
+        // earlier characters matched non-NUL characters of name, so the scan
+        // never runs past the key's terminator.
+        int i = 0;
+        while (i < len && name[i] != '\0' && name[i] == key[i]) {
+            i++;
+        }
+
+        int cmp;
+        if (i < len && name[i] != '\0') {
+            // First differing character (also covers a key that ends early).
+            cmp = (unsigned char)name[i] - (unsigned char)key[i];
+        } else if (key[i] == '\0') {
+            // Name exhausted and key ends at the same place: exact match.
+            // Return the token recorded in the table rather than the index.
+            return (int)keywords[mid].tok;
+        } else {
+            cmp = -1; // the keyword is longer than the input
+        }
+
+        if (cmp < 0) {
+            high = mid - 1;
+        } else {
+            low = mid + 1;
+        }
+    }
+    return -1; // Not a keyword.
+}
+
+/**
+ * Stream object handed out by the preprocessor.
+ *
+ * `stream` must remain the first member: pp_stream_init returns the object
+ * as a core_stream_t pointer and next_char casts that pointer back to
+ * pp_stream_t.
+ */
+typedef struct pp_stream {
+    core_stream_t stream; // stream interface exposed to the consumer
+    core_stream_t *input; // source stream the preprocessor reads from
+    smcc_pp_t *self;      // preprocessor whose macro table is consulted
+
+    usize size; // number of pending bytes stored in buffer
+    usize pos;  // index of the next byte of buffer to hand out
+    char buffer[PPROCESSER_BUFFER_SIZE]; // pending identifier / expansion text
+} pp_stream_t;
+
+/**
+ * Parses either a macro parameter list or a macro replacement list from the
+ * wrapped input stream into `list`.
+ *
+ * Parameter mode (`is_param` true): an opening '(' is skipped if present,
+ * whitespace is ignored, ',' ends one parameter and ')' ends the list. The
+ * ')' is left unconsumed for the caller. Reaching an end-of-line character
+ * first is an error.
+ *
+ * Replacement mode (`is_param` false): whitespace separates tokens and an
+ * end-of-line character ends the list (it is left unconsumed).
+ *
+ * In both modes the string being collected when the loop ends is appended,
+ * so a successful call always yields at least one (possibly empty) element.
+ *
+ * @param _stream  preprocessor stream; its `input` is read
+ * @param list     output list; initialized by this function
+ * @param is_param true for a parameter list, false for a replacement list
+ * @return true on success, false on a malformed parameter list. On failure
+ *         the elements already appended to `list` are left in place.
+ */
+static cbool parse_list(pp_stream_t *_stream, macro_list_t *list,
+                        cbool is_param) {
+    Assert(_stream != null);
+    Assert(list != null);
+    core_stream_t *stream = _stream->input;
+    Assert(stream != null);
+    core_stream_reset_char(stream);
+
+    vec_init(*list);
+    int ch;
+    cstring_t str = cstring_new();
+    core_pos_t pos = {0}; // scratch position for the lex_parse helpers
+
+    // The caller only peeks at '(' before calling; consume it here so it
+    // does not become part of the first parameter name.
+    if (is_param && core_stream_peek_char(stream) == '(') {
+        core_stream_next_char(stream);
+    }
+
+    while ((ch = core_stream_peek_char(stream)) != core_stream_eof) {
+        if (is_param) {
+            // ( param ) ( param, ... ) ( ... )
+            if (lex_parse_is_whitespace(ch)) {
+                // TODO #define ( A A , B ) need ERROR
+                lex_parse_skip_whitespace(stream, &pos);
+                core_stream_reset_char(stream);
+                // Re-peek: without this the stale whitespace character would
+                // be stored and the following character swallowed.
+                continue;
+            } else if (ch == ',') {
+                vec_push(*list, str);
+                str = cstring_new();
+                core_stream_next_char(stream);
+                continue;
+            } else if (ch == ')') {
+                break;
+            } else if (lex_parse_is_endline(ch)) {
+                LOG_ERROR("Invalid parameter list");
+                cstring_free(&str); // not yet handed over to the list
+                return false;
+            }
+        } else {
+            // replacement list
+            if (lex_parse_is_whitespace(ch)) {
+                lex_parse_skip_whitespace(stream, &pos);
+                vec_push(*list, str);
+                str = cstring_new();
+                core_stream_reset_char(stream);
+                continue;
+            } else if (lex_parse_is_endline(ch)) {
+                break;
+            }
+        }
+        core_stream_next_char(stream);
+        cstring_push(&str, (char)ch);
+    }
+    // Hand the last collected string to the list.
+    vec_push(*list, str);
+    return true;
+}
+
+/**
+ * Parses one preprocessor directive from the wrapped input stream.
+ *
+ * Expects the next character to be '#'. Reads the directive name, dispatches
+ * on it and finally skips the rest of the line. Only object-like `#define`
+ * is implemented: it registers the macro on the owning preprocessor. The
+ * other known directives hit TODO(), unknown names are reported with a
+ * warning.
+ *
+ * On a malformed directive (missing name, missing macro name, function-like
+ * macro) the temporary strings are released, the rest of the line is skipped
+ * and LOG_FATAL is raised.
+ *
+ * @param _stream preprocessor stream; its `input` is consumed and its `self`
+ *                receives new macro definitions
+ */
+static void parse_directive(pp_stream_t *_stream) {
+    Assert(_stream != null);
+    core_stream_t *stream = _stream->input;
+    Assert(stream != null);
+
+    core_pos_t pos = {0}; // scratch position for lex_parse_skip_line
+    core_stream_reset_char(stream);
+    // A directive starts with '#'.
+    if (core_stream_peek_char(stream) != '#') {
+        LOG_WARN("Invalid directive");
+        return;
+    }
+    core_stream_next_char(stream);
+
+    // TODO: accept the null directive ('#' followed by a newline), which has
+    // no effect.
+    skip_whitespace(stream);
+    // Directive name.
+    cstring_t directive = parse_identifier(stream);
+    if (cstring_is_empty(&directive)) {
+        LOG_ERROR("expected indentifier");
+        goto ERR;
+    }
+    skip_whitespace(stream);
+    core_stream_reset_char(stream);
+
+    pp_token_t token =
+        keyword_cmp(cstring_as_cstr(&directive), cstring_len(&directive));
+    switch (token) {
+    case PP_TOK_DEFINE: {
+        cstring_t name = parse_identifier(stream);
+        if (cstring_is_empty(&name)) {
+            LOG_ERROR("expected indentifier");
+            cstring_free(&name);
+            goto ERR;
+        }
+        skip_whitespace(stream);
+        core_stream_reset_char(stream);
+
+        if (core_stream_peek_char(stream) == '(') {
+            // Function-like macros are not supported yet: the parameter list
+            // is parsed and then the directive is rejected.
+            // TODO: validate the closing ')' and release `params`.
+            macro_list_t params;
+            parse_list(_stream, &params, true);
+            core_stream_next_char(stream);
+            cstring_free(&name); // never handed over to add_macro
+            goto ERR;
+        }
+        macro_list_t replacement;
+        parse_list(_stream, &replacement, false);
+        // add_macro keeps shallow copies of name and replacement, so they
+        // must not be freed here.
+        add_macro(_stream->self, &name, &replacement, NULL, MACRO_OBJECT);
+        break;
+    }
+    case PP_TOK_UNDEF:
+    case PP_TOK_INCLUDE:
+    case PP_TOK_IF:
+    case PP_TOK_IFDEF:
+    case PP_TOK_IFNDEF:
+    case PP_TOK_ELSE:
+    case PP_TOK_ELIF:
+    case PP_TOK_ELIFDEF:
+    case PP_TOK_ELIFNDEF:
+    case PP_TOK_ENDIF:
+    case PP_TOK_LINE:
+    case PP_TOK_EMBED:
+    case PP_TOK_ERROR:
+    case PP_TOK_WARNING:
+    case PP_TOK_PRAMA:
+        TODO();
+        break;
+    default:
+        LOG_WARN("Unknown preprocessor directive: %s",
+                 cstring_as_cstr(&directive));
+    }
+
+    // TODO: win \r\n linux \n mac \r => all need transport to \n
+    core_stream_reset_char(stream);
+    lex_parse_skip_line(stream, &pos);
+
+    cstring_free(&directive);
+    return;
+ERR:
+    // Release the directive name and drop the rest of the offending line
+    // before reporting, so nothing leaks if LOG_FATAL returns.
+    cstring_free(&directive);
+    core_stream_reset_char(stream);
+    lex_parse_skip_line(stream, &pos);
+    LOG_FATAL("Unhandled preprocessor directive");
+}
+
+/**
+ * Appends the contents of `str` to the stream's pending-output buffer.
+ *
+ * The bytes are written after the data already pending, and only the
+ * string's own length is copied.
+ *
+ * @param stream preprocessor stream whose buffer is extended
+ * @param str    string to append; must fit in the remaining buffer space
+ */
+static inline void stream_push_string(pp_stream_t *stream, cstring_t *str) {
+    const usize len = cstring_len(str);
+    // Check the bound before writing; the two-step form avoids unsigned
+    // wrap-around in the subtraction.
+    Assert(stream->size <= PPROCESSER_BUFFER_SIZE);
+    Assert(len <= PPROCESSER_BUFFER_SIZE - stream->size);
+    smcc_memcpy(stream->buffer + stream->size, cstring_as_cstr(str), len);
+    stream->size += len;
+}
+
+/**
+ * Appends a single character to the stream's pending-output buffer.
+ *
+ * @param stream preprocessor stream whose buffer is extended
+ * @param ch     character to append (stored as char)
+ */
+static inline void stream_push_char(pp_stream_t *stream, int ch) {
+    // Assert before the store so a full buffer is caught before writing one
+    // element past its end.
+    Assert(stream->size < PPROCESSER_BUFFER_SIZE);
+    stream->buffer[stream->size++] = (char)ch;
+}
+
+/**
+ * next_char callback of the preprocessor stream.
+ *
+ * Pending bytes in the internal buffer are returned first. Otherwise the
+ * input is inspected: a '#' triggers directive parsing, an identifier is
+ * either copied to the buffer unchanged (no macro of that name) or replaced
+ * by the macro's replacement list, and any other character is passed through
+ * from the input.
+ *
+ * @param _stream the core_stream_t embedded at the start of a pp_stream_t
+ * @return next output character, or whatever the input stream's next_char
+ *         returns when no buffered data or expansion applies
+ */
+static int next_char(core_stream_t *_stream) {
+    pp_stream_t *stream = (pp_stream_t *)_stream;
+    Assert(stream != null);
+READ_BUF:
+    // Drain buffered output; once it is exhausted reset the buffer and fall
+    // through to read fresh input.
+    if (stream->size != 0) {
+        if (stream->pos < stream->size) {
+            return stream->buffer[stream->pos++];
+        } else {
+            stream->size = 0;
+            stream->pos = 0;
+        }
+    }
+RETRY:
+    core_stream_reset_char(stream->input);
+    int ch = core_stream_peek_char(stream->input);
+    if (ch == '#') {
+        // Handle the directive, then look at the input again.
+        parse_directive(stream);
+        goto RETRY;
+    } else if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') ||
+               ch == '_') {
+        cstring_t identifier = parse_identifier(stream->input);
+        smcc_macro_t *macro = find_macro(stream->self, &identifier);
+        if (macro == null) {
+            // Not a macro: emit the identifier text as is.
+            stream_push_string(stream, &identifier);
+            cstring_free(&identifier);
+            goto READ_BUF;
+        } else {
+            cstring_free(&identifier);
+        }
+        if (macro->type == MACRO_OBJECT) {
+            // Emit the replacement tokens separated by single spaces.
+            for (usize i = 0; i < macro->replaces.size; ++i) {
+                stream_push_string(stream, &vec_at(macro->replaces, i));
+                // Compare with i + 1 rather than size - 1 so no unsigned
+                // subtraction is needed.
+                if (i + 1 < macro->replaces.size)
+                    stream_push_char(stream, ' ');
+            }
+            goto READ_BUF;
+        } else if (macro->type == MACRO_FUNCTION) {
+            // Function-like macro expansion is not implemented.
+            TODO();
+        }
+        UNREACHABLE();
+    }
+    // Any other character is forwarded from the input unchanged.
+    return core_stream_next_char(stream->input);
+}
+
+/**
+ * Allocates and initializes the stream object exposed by the preprocessor.
+ *
+ * The buffer starts empty. Only the next_char callback is installed; the
+ * free_stream, peek_char, reset_char and read_buf callbacks are left null.
+ *
+ * @param self  preprocessor that owns the stream
+ * @param input source stream to read from
+ * @return the new stream viewed as a core_stream_t, or null when an argument
+ *         is null or the allocation fails
+ */
+static core_stream_t *pp_stream_init(smcc_pp_t *self, core_stream_t *input) {
+    // Validate before allocating so a rejected call cannot leak the object.
+    if (self == null || input == null) {
+        return null;
+    }
+    pp_stream_t *stream = smcc_malloc(sizeof(*stream));
+    if (stream == null) {
+        LOG_FATAL("Failed to allocate memory for output stream");
+        return null;
+    }
+    stream->self = self;
+    stream->input = input;
+    stream->size = 0;
+    stream->pos = 0;
+
+    stream->stream.name = cstring_from_cstr("pipe_stream");
+    stream->stream.free_stream = null;
+    stream->stream.next_char = next_char;
+    stream->stream.peek_char = null;
+    stream->stream.reset_char = null;
+    stream->stream.read_buf = null;
+    return (core_stream_t *)stream;
+}
+
+/**
+ * Initializes a preprocessor over `input` and returns its output stream.
+ *
+ * Creates the preprocessor stream, initializes the conditional-compilation
+ * stack and the macro table, and installs the table's hash and comparison
+ * callbacks.
+ *
+ * @param pp    preprocessor state to initialize
+ * @param input source stream to preprocess
+ * @return stream that yields preprocessed characters, or null when an
+ *         argument is null
+ */
+core_stream_t *pp_init(smcc_pp_t *pp, core_stream_t *input) {
+    if (pp == null || input == null) {
+        return null;
+    }
+
+    // The stream object is allocated inside pp_stream_init; no other
+    // allocation is needed here.
+    pp->stream = pp_stream_init(pp, input);
+    Assert(pp->stream != null);
+
+    // handle_if / handle_else read and push to this stack, so it must start
+    // out empty.
+    vec_init(pp->if_stack);
+
+    hashmap_init(&pp->macros);
+    pp->macros.hash_func = (u32 (*)(const void *))hash_func;
+    pp->macros.key_cmp = (int (*)(const void *, const void *))hash_cmp;
+    return pp->stream;
+}
+
+/**
+ * Releases the resources held by a preprocessor.
+ *
+ * Drops the macro table and frees the name of the output stream. Passing
+ * NULL is a no-op.
+ *
+ * @param pp preprocessor to tear down (may be NULL)
+ */
+void pp_drop(smcc_pp_t *pp) {
+    if (pp == NULL)
+        return;
+
+    // Release the macro table.
+    // TODO: the stored macro objects need hashmap iteration to be released.
+    hashmap_drop(&pp->macros);
+
+    // TODO: release the string pool.
+    // strpool_destroy(&pp->strpool);
+
+    // TODO: release the conditional-compilation stack (and any per-entry
+    // resources).
+    // vec_free(pp->if_stack);
+
+    // Release the stream name; guard against a preprocessor whose stream was
+    // never created.
+    // TODO: the stream object itself is not released here.
+    if (pp->stream != NULL) {
+        cstring_free(&pp->stream->name);
+    }
+}