refactor(lexer): 重构词法分析器头文件结构并优化缓冲区管理

移除了旧的lexer_stream.c实现,引入新的环形缓冲区机制来替代原有的
动态数组缓冲区。更新了词法分析器的核心数据结构,修改了token获取
相关函数的实现以支持新的缓冲区管理方式。

BREAKING CHANGE: 移除了scc_lexer_stream_t相关的API,替换为基于
环形缓冲区的新接口scc_lexer_to_ring和相关函数。

feat(lexer_token): 添加词法分析结果内存泄漏警告注释

docs: 移除预处理器模块的测试文件和相关配置
This commit is contained in:
zzy
2026-02-16 21:21:23 +08:00
parent 0e7dec202a
commit b4929be6b8
72 changed files with 119 additions and 2474 deletions

View File

@@ -1,100 +0,0 @@
/**
 * @file lexer.h
 * @brief Core data structures and interfaces of the C lexer.
 *
 * NOTE(review): the include-guard name __SCC_LEXER_H__ uses a reserved
 * identifier (leading double underscore); consider SCC_LEXER_H.
 */
#ifndef __SCC_LEXER_H__
#define __SCC_LEXER_H__
#include "lexer_token.h"
#include <scc_core.h>
#include <scc_sstream.h>
/**
 * @brief Core lexer state.
 *
 * Bundles the state and buffer management needed for lexical analysis.
 */
typedef struct scc_lexer {
  scc_sstream_ring_t stream_ref; /* character source, held by value */
  int jump_macro;                /* presumably toggles macro skipping — TODO confirm */
} scc_lexer_t;
/**
 * @brief Initialize a lexer over a character ring stream.
 * @param[in,out] lexer      lexer instance to initialize
 * @param[in]     stream_ref character source (copied into the lexer)
 */
void scc_lexer_init(scc_lexer_t *lexer, scc_sstream_ring_t *stream_ref);
/**
 * @brief Fetch the next raw token.
 * @param[in]  lexer lexer instance
 * @param[out] token location where the token is stored
 *
 * Returns tokens of every kind, including "invalid" ones such as whitespace.
 */
void scc_lexer_get_token(scc_lexer_t *lexer, scc_lexer_tok_t *token);
/**
 * @brief Fetch the next meaningful token.
 * @param[in]  lexer lexer instance
 * @param[out] token location where the token is stored
 *
 * Automatically skips whitespace and other tokens that carry no meaning
 * for parsing, returning only tokens useful to syntax analysis.
 */
void scc_lexer_get_valid_token(scc_lexer_t *lexer, scc_lexer_tok_t *token);
typedef SCC_VEC(scc_lexer_tok_t) scc_lexer_tok_vec_t;
typedef struct scc_lexer_stream scc_lexer_stream_t;
/**
 * @brief Buffered, stream-style view over a lexer (vtable-based).
 */
struct scc_lexer_stream {
  scc_lexer_t *lexer;
  scc_lexer_tok_vec_t toks; /* circular token buffer */
  usize curr_pos;           /* current read position (logical, monotonic) */
  usize probe_pos;          /* filled-up-to position (logical, monotonic) */
  cbool need_comment;
  /// @brief Look ahead n tokens without consuming.
  const scc_lexer_tok_t *(*peek)(scc_lexer_stream_t *stream, usize n);
  /// @brief Advance the read pointer by offset tokens.
  void (*advance)(scc_lexer_stream_t *stream, usize offset);
  /// @brief Destroy the stream and release its resources.
  void (*drop)(scc_lexer_stream_t *stream);
};
/**
 * @brief Turn a lexer into a buffered token stream.
 * @param[in]  lexer        an initialized lexer instance
 * @param[out] stream       stream object to set up
 * @param[in]  need_comment whether comment tokens should be emitted
 */
void scc_lexer_to_stream(scc_lexer_t *lexer, scc_lexer_stream_t *stream,
                         cbool need_comment);
/* Convenience wrappers over the stream vtable.
 * NOTE(review): `return expr;` in a void function (consume/advance/drop
 * below) is a C constraint violation, though common compilers accept it. */
static inline const scc_lexer_tok_t *
scc_lexer_stream_current(scc_lexer_stream_t *stream) {
  Assert(stream != null);
  return stream->peek(stream, 0);
}
static inline const scc_lexer_tok_t *
scc_lexer_stream_peek(scc_lexer_stream_t *stream, usize n) {
  Assert(stream != null);
  return stream->peek(stream, n);
}
static inline void scc_lexer_stream_consume(scc_lexer_stream_t *stream) {
  Assert(stream != null);
  return stream->advance(stream, 1);
}
static inline void scc_lexer_stream_advance(scc_lexer_stream_t *stream,
                                            usize n) {
  Assert(stream != null);
  return stream->advance(stream, n);
}
static inline void scc_lexer_stream_drop(scc_lexer_stream_t *stream) {
  Assert(stream != null);
  return stream->drop(stream);
}
#endif /* __SCC_LEXER_H__ */

View File

@@ -144,6 +144,10 @@ typedef enum scc_tok_subtype {
scc_tok_subtype_t scc_get_tok_subtype(scc_tok_type_t type);
const char *scc_get_tok_name(scc_tok_type_t type);
/**
* @brief 词法分析结果
* @warning 需要手动释放lexeme否则会出现内存泄漏
*/
typedef struct scc_lexer_token {
scc_tok_type_t type;
scc_cstring_t lexeme;

View File

@@ -0,0 +1,54 @@
/**
 * @file lexer.h
 * @brief Core data structures and interfaces of the C lexer.
 *
 * NOTE(review): the include-guard name __SCC_LEXER_H__ uses a reserved
 * identifier (leading double underscore); consider SCC_LEXER_H.
 */
#ifndef __SCC_LEXER_H__
#define __SCC_LEXER_H__
#include "lexer_token.h"
#include <scc_core.h>
#include <scc_core_ring.h>
#include <scc_sstream.h>
/* Ring buffer and vector specializations for tokens. */
typedef SCC_RING(scc_lexer_tok_t) scc_lexer_tok_ring_t;
typedef SCC_VEC(scc_lexer_tok_t) scc_lexer_tok_vec_t;
/**
 * @brief Core lexer state.
 *
 * Bundles the state and buffer management needed for lexical analysis.
 */
typedef struct scc_lexer {
  scc_sstream_ring_t *stream_ref; /* borrowed character source */
  scc_lexer_tok_ring_t ring;      /* token ring handed out by scc_lexer_to_ring */
  int ring_ref_count;             /* outstanding ring references */
  int jump_macro;                 /* presumably toggles macro skipping — TODO confirm */
} scc_lexer_t;
/**
 * @brief Initialize a lexer over a character ring stream.
 * @param[in,out] lexer      lexer instance to initialize
 * @param[in]     stream_ref character source (borrowed, not copied)
 */
void scc_lexer_init(scc_lexer_t *lexer, scc_sstream_ring_t *stream_ref);
/**
 * @brief Fetch the next raw token.
 * @param[in]  lexer lexer instance
 * @param[out] token location where the token is stored
 *
 * Returns tokens of every kind, including "invalid" ones such as whitespace.
 */
void scc_lexer_get_token(scc_lexer_t *lexer, scc_lexer_tok_t *token);
/**
 * @brief Fetch the next meaningful token.
 * @param[in]  lexer lexer instance
 * @param[out] token location where the token is stored
 *
 * Automatically skips whitespace and other tokens that carry no meaning
 * for parsing, returning only tokens useful to syntax analysis.
 */
void scc_lexer_get_valid_token(scc_lexer_t *lexer, scc_lexer_tok_t *token);
/**
 * @brief Expose the lexer as a ring-buffered token stream.
 * @param[in] lexer        initialized lexer instance (owns the ring)
 * @param[in] ring_size    capacity of the token ring
 * @param[in] need_comment if true the ring yields raw tokens (including
 *                         whitespace/comments); otherwise only valid tokens
 * @return borrowed pointer to the lexer-owned ring; release with
 *         scc_lexer_drop_ring()
 */
scc_lexer_tok_ring_t *scc_lexer_to_ring(scc_lexer_t *lexer, int ring_size,
                                        cbool need_comment);
/** @brief Release one reference obtained from scc_lexer_to_ring(). */
void scc_lexer_drop_ring(scc_lexer_tok_ring_t *ring_ref);
/** @brief Destroy the lexer; all ring references must be dropped first. */
void scc_lexer_drop(scc_lexer_t *lexer);
#endif /* __SCC_LEXER_H__ */

View File

@@ -1,5 +1,6 @@
#include <lexer.h>
#include "scc_lexer.h"
#include <lexer_log.h>
#include <scc_lexer.h>
static const struct {
const char *name;
@@ -41,7 +42,8 @@ static int keyword_cmp(const char *name, int len) {
}
void scc_lexer_init(scc_lexer_t *lexer, scc_sstream_ring_t *stream_ref) {
lexer->stream_ref = *stream_ref;
lexer->stream_ref = stream_ref;
lexer->ring_ref_count = 0;
lexer->jump_macro = false;
}
@@ -68,7 +70,7 @@ static inline cbool is_hex_digit(int ch) {
/* 从环形缓冲区预览一个字符带EOF检测 */
static inline cbool peek_char(scc_lexer_t *lexer, scc_sstream_char_t *out) {
cbool ok;
scc_ring_peek(lexer->stream_ref, *out, ok);
scc_ring_peek(*lexer->stream_ref, *out, ok);
return ok;
}
@@ -76,7 +78,7 @@ static inline cbool peek_char(scc_lexer_t *lexer, scc_sstream_char_t *out) {
static inline cbool next_char(scc_lexer_t *lexer, scc_cstring_t *lexeme,
scc_sstream_char_t *out) {
cbool ok;
scc_ring_next(lexer->stream_ref, *out, ok);
scc_ring_next(*lexer->stream_ref, *out, ok);
if (!ok)
return false;
scc_cstring_append_ch(lexeme, out->character);
@@ -132,7 +134,7 @@ void scc_lexer_get_token(scc_lexer_t *lexer, scc_lexer_tok_t *token) {
next_char(lexer, &lex, &cur); // 消费 '/'
while (peek_char(lexer, &cur) && !is_newline(cur.character)) {
next_char(lexer, &lex, &cur);
scc_ring_consume(lexer->stream_ref);
scc_ring_consume(*lexer->stream_ref);
}
// 注释结束不包含换行符换行符单独成token
} else if (next.character == '*') {
@@ -150,7 +152,7 @@ void scc_lexer_get_token(scc_lexer_t *lexer, scc_lexer_tok_t *token) {
next_char(lexer, &lex, &cur); // 消费 '/'
break;
}
scc_ring_consume(lexer->stream_ref);
scc_ring_consume(*lexer->stream_ref);
}
} else {
// 只是除号 /
@@ -161,7 +163,7 @@ void scc_lexer_get_token(scc_lexer_t *lexer, scc_lexer_tok_t *token) {
token->type = SCC_TOK_IDENT; // 暂定
while (peek_char(lexer, &cur) && is_identifier_part(cur.character)) {
next_char(lexer, &lex, &cur);
scc_ring_consume(lexer->stream_ref);
scc_ring_consume(*lexer->stream_ref);
}
// 检查是否为关键字
int idx = keyword_cmp(scc_cstring_as_cstr(&lex), scc_cstring_len(&lex));
@@ -241,7 +243,7 @@ void scc_lexer_get_token(scc_lexer_t *lexer, scc_lexer_tok_t *token) {
} else {
next_char(lexer, &lex, &cur);
}
scc_ring_consume(lexer->stream_ref);
scc_ring_consume(*lexer->stream_ref);
}
} else {
scc_sstream_char_t next = {0};
@@ -447,7 +449,7 @@ void scc_lexer_get_token(scc_lexer_t *lexer, scc_lexer_tok_t *token) {
}
// 设置token
scc_ring_consume(lexer->stream_ref);
scc_ring_consume(*lexer->stream_ref);
token->type = token->type; // 上面已设
token->loc = start_loc;
token->lexeme = lex; // 转移所有权
@@ -469,3 +471,42 @@ void scc_lexer_get_valid_token(scc_lexer_t *lexer, scc_lexer_tok_t *token) {
} while (subtype == SCC_TOK_SUBTYPE_EMPTYSPACE ||
subtype == SCC_TOK_SUBTYPE_COMMENT);
}
/* Ring fill callback: emit the next raw token (whitespace/comments included). */
static int fill_token(scc_lexer_tok_t *out, void *userdata) {
  scc_lexer_get_token((scc_lexer_t *)userdata, out);
  return 0; /* always reports success */
}
/* Ring fill callback: emit the next meaningful token (skips whitespace etc.). */
static int fill_valid_token(scc_lexer_tok_t *out, void *userdata) {
  scc_lexer_get_valid_token((scc_lexer_t *)userdata, out);
  return 0; /* always reports success */
}
/**
 * @brief Expose the lexer as a ring-buffered token stream.
 * @param[in] lexer        initialized lexer instance (owns the ring storage)
 * @param[in] ring_size    capacity of the token ring
 * @param[in] need_comment if true the ring yields raw tokens (including
 *                         whitespace/comments); otherwise only valid tokens
 * @return borrowed pointer to the lexer-owned ring; release with
 *         scc_lexer_drop_ring()
 *
 * NOTE(review): each call re-initializes lexer->ring while also bumping
 * ring_ref_count — confirm that acquiring a second reference (which resets
 * the shared ring) is intended.
 */
scc_lexer_tok_ring_t *scc_lexer_to_ring(scc_lexer_t *lexer, int ring_size,
                                        cbool need_comment) {
  scc_ring_init(lexer->ring, ring_size,
                need_comment ? fill_token : fill_valid_token, lexer);
  lexer->ring_ref_count++;
  return &lexer->ring;
}
/**
 * @brief Release one reference to a lexer's token ring.
 * @param[in] ring_ref ring previously returned by scc_lexer_to_ring()
 *
 * Decrements the owning lexer's reference count; the ring storage itself is
 * freed later by scc_lexer_drop(). Dropping more times than the ring was
 * acquired logs a warning instead of corrupting the count.
 */
void scc_lexer_drop_ring(scc_lexer_tok_ring_t *ring_ref) {
  /* The ring's fill userdata is the owning lexer (set in scc_lexer_to_ring). */
  scc_lexer_t *lexer = ring_ref->userdata;
  if (lexer->ring_ref_count > 0) {
    lexer->ring_ref_count--;
  } else {
    /* Fixed message: this is the lexer token ring, not the sstream ring. */
    LOG_WARN("double drop of lexer token ring");
  }
}
/**
 * @brief Destroy a lexer and release the resources it owns.
 * @param[in] lexer lexer to destroy; must not be null
 *
 * All token-ring references handed out by scc_lexer_to_ring() must have been
 * released via scc_lexer_drop_ring() first; otherwise this is a fatal error.
 * Frees the token ring storage and drops the borrowed character-stream ring.
 */
void scc_lexer_drop(scc_lexer_t *lexer) {
  Assert(lexer != null);
  if (lexer->ring_ref_count) {
    /* Fixed message: previous text was garbled and referred to "sstream". */
    LOG_FATAL("cannot drop lexer: %d token ring reference(s) still held",
              lexer->ring_ref_count);
  }
  scc_ring_free(lexer->ring);
  scc_sstream_drop_ring(lexer->stream_ref);
}

View File

@@ -1,139 +0,0 @@
#include <lexer.h>
/**
 * @brief Ensure capacity for, and then fill, n more look-ahead tokens.
 * @param[in,out] stream stream whose circular buffer is extended
 * @param[in]     n      number of additional tokens to pull from the lexer
 *
 * Grows the circular buffer (rebasing logical positions to 0) when the live
 * span plus n would not fit, then reads n fresh tokens into the tail.
 */
static void lexer_stream_extend(scc_lexer_stream_t *stream, usize n) {
  Assert(stream != null);
  /* Grow when the live span [curr_pos, probe_pos) plus n would overflow cap. */
  if ((stream->probe_pos - stream->curr_pos + n) >= stream->toks.cap) {
    /* Double the capacity, or more if doubling still isn't enough. */
    usize new_cap = stream->toks.cap * 2;
    if (new_cap < stream->probe_pos - stream->curr_pos + n + 1) {
      new_cap = stream->probe_pos - stream->curr_pos + n + 1;
    }
    /* NOTE(review): scc_realloc(null, ...) is used as a plain allocation. */
    scc_lexer_tok_t *new_data =
        scc_realloc(null, new_cap * sizeof(scc_lexer_tok_t));
    if (!new_data) {
      LOG_FATAL("lexer_stream_extend: realloc failed\n");
      return;
    }
    /* Copy live tokens into the new buffer, preserving order. */
    usize data_count = stream->probe_pos - stream->curr_pos;
    for (usize i = 0; i < data_count; ++i) {
      usize old_idx = (stream->curr_pos + i) % stream->toks.cap;
      new_data[i] = stream->toks.data[old_idx];
    }
    /* Release the old buffer. */
    if (stream->toks.data) {
      scc_free(stream->toks.data);
    }
    /* Install the new buffer and rebase logical positions onto it. */
    stream->toks.data = new_data;
    stream->toks.cap = new_cap;
    stream->curr_pos = 0;
    stream->probe_pos = data_count;
  }
  /* Pull n fresh tokens into the tail slots of the ring.
     NOTE(review): if a slot still holds a previously consumed token, its
     lexeme is overwritten without being freed — possible leak; confirm
     token-lexeme ownership (lexer_token.h warns lexemes need manual free). */
  for (usize i = 0; i < n; ++i) {
    usize idx = (stream->probe_pos + i) % stream->toks.cap;
    if (stream->need_comment)
      scc_lexer_get_token(stream->lexer, &stream->toks.data[idx]);
    else
      scc_lexer_get_valid_token(stream->lexer, &stream->toks.data[idx]);
  }
  stream->probe_pos += n;
}
/**
 * @brief Look ahead n tokens without consuming any.
 * @param[in,out] stream token stream (filled lazily from the lexer)
 * @param[in]     n      look-ahead distance (0 = current token)
 * @return borrowed pointer to the requested token inside the buffer
 */
static const scc_lexer_tok_t *lexer_stream_peek(scc_lexer_stream_t *stream,
                                                usize n) {
  Assert(stream != null);
  /* Pull more tokens if the buffer doesn't yet reach position n. */
  const usize buffered = stream->probe_pos - stream->curr_pos;
  if (buffered <= n)
    lexer_stream_extend(stream, n + 1 - buffered);
  /* Map the logical position onto the circular buffer. */
  return &stream->toks.data[(stream->curr_pos + n) % stream->toks.cap];
}
/**
 * @brief Consume offset tokens, filling from the lexer if necessary.
 * @param[in,out] stream stream to advance
 * @param[in]     offset number of tokens to consume
 *
 * After advancing, the buffer is opportunistically compacted once the
 * logical read position has moved past 3/4 of the capacity.
 */
static void lexer_stream_advance(scc_lexer_stream_t *stream, usize offset) {
  Assert(stream != null);
  if (stream->curr_pos + offset > stream->probe_pos) {
    /* Not enough buffered tokens — pull the shortfall from the lexer. */
    usize need = stream->curr_pos + offset - stream->probe_pos;
    lexer_stream_extend(stream, need);
  }
  stream->curr_pos += offset;
  /* Optional: compact once many tokens have been consumed.
     NOTE(review): this compares the logical position against the physical
     capacity — confirm that is the intended trigger. */
  if (stream->curr_pos > stream->toks.cap * 3 / 4) {
    /* Compaction: move live tokens to the front of a fresh buffer.
       NOTE(review): if data_count is 0 this allocates 0 bytes and sets cap
       to 0, making the `% cap` in peek/extend a division by zero — confirm
       this case cannot occur. scc_realloc(null, ...) acts as malloc here. */
    usize data_count = stream->probe_pos - stream->curr_pos;
    scc_lexer_tok_t *temp =
        scc_realloc(null, data_count * sizeof(scc_lexer_tok_t));
    if (!temp)
      return; /* compaction is best-effort; failure is harmless */
    for (usize i = 0; i < data_count; ++i) {
      usize old_idx = (stream->curr_pos + i) % stream->toks.cap;
      temp[i] = stream->toks.data[old_idx];
    }
    scc_free(stream->toks.data);
    stream->toks.data = temp;
    stream->toks.cap = data_count;
    stream->curr_pos = 0;
    stream->probe_pos = data_count;
  }
}
/**
 * @brief Destroy the stream and release its buffered tokens.
 * @param[in,out] stream stream to tear down; fields are reset to null/0
 *
 * lexer_token.h warns that token lexemes must be freed manually or they
 * leak; the previous implementation only had an empty placeholder loop.
 * This frees the lexeme of every token that was filled but not yet
 * consumed ([curr_pos, probe_pos)), then releases the buffer itself.
 * NOTE(review): slots holding tokens that were consumed earlier and not
 * yet overwritten may still leak their lexemes — they are indistinguishable
 * from never-filled (uninitialized) slots here; confirm upstream ownership.
 */
static void lexer_stream_drop(scc_lexer_stream_t *stream) {
  Assert(stream != null);
  /* Free lexemes of tokens produced but never handed out for consumption. */
  for (usize i = stream->curr_pos; i < stream->probe_pos; ++i) {
    scc_cstring_free(&stream->toks.data[i % stream->toks.cap].lexeme);
  }
  scc_vec_free(stream->toks);
  stream->lexer = null;
  stream->curr_pos = 0;
  stream->probe_pos = 0;
  stream->need_comment = false;
  stream->peek = null;
  stream->advance = null;
  stream->drop = null;
}
/**
 * @brief Wrap a lexer in a buffered token stream.
 * @param[in]  lexer        initialized lexer instance
 * @param[out] stream       stream object to set up
 * @param[in]  need_comment whether comment tokens should be emitted
 */
void scc_lexer_to_stream(scc_lexer_t *lexer, scc_lexer_stream_t *stream,
                         cbool need_comment) {
  Assert(lexer != null && stream != null);
  /* Plain state first... */
  stream->lexer = lexer;
  stream->need_comment = need_comment;
  stream->curr_pos = 0;
  stream->probe_pos = 0;
  /* ...then the circular buffer (initial capacity: 8 tokens)... */
  scc_vec_init(stream->toks);
  scc_vec_realloc(stream->toks, 8);
  /* ...and finally the vtable. */
  stream->peek = lexer_stream_peek;
  stream->advance = lexer_stream_advance;
  stream->drop = lexer_stream_drop;
}

View File

@@ -1,5 +1,5 @@
#include <lexer.h>
#include <lexer_log.h>
#include <scc_lexer.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
@@ -53,10 +53,7 @@ int main(int argc, char *argv[]) {
scc_get_tok_name(token.type),
scc_cstring_as_cstr(&token.lexeme), token.loc.name,
token.loc.line, token.loc.col);
// LOG_DEBUG("%s", token.val.str);
// printf("line: %d, column: %d, type: %3d, typename: %s\n",
// lexer.line, lexer.index, token.type,
// scc_get_tok_name(token.type));
scc_cstring_free(&token.lexeme);
}
scc_sstream_drop_ring(ref);
scc_sstream_drop(&stream);

View File

@@ -1,5 +1,5 @@
// test_lexer.c
#include <lexer.h>
#include <scc_lexer.h>
#include <string.h>
#include <utest/acutest.h>