feat(lex_parser): 初始化词法解析器模块

新增词法解析器库 `smcc_lex_parser`，包含基础的词法规则解析功能：
- 支持字符、字符串、数字、标识符的解析
- 支持跳过注释、空白符、行尾等辅助函数
- 提供对应的单元测试用例，覆盖各类合法与非法输入情况

该模块依赖 `libcore`，并被 `smcc_lex` 模块引用以支持更上层的词法分析逻辑。
2025-11-23 22:53:46 +08:00
parent 67af0c6bf2
commit 871d031ceb
18 changed files with 996 additions and 392 deletions
--- a/libs/lex_parser/src/lex_parser.c
+++ b/libs/lex_parser/src/lex_parser.c
@@ -0,0 +1,406 @@
+#include <lex_parser.h>
+
+/* Tells whether `ch` is one of the two line-terminator bytes (LF or CR). */
+static inline cbool is_next_line(int ch) {
+    switch (ch) {
+    case '\n':
+    case '\r':
+        return true;
+    default:
+        return false;
+    }
+}
+
+/*
+ * Consume a single line ending ("\r\n", "\r" or "\n") from `input` and move
+ * `pos` to the next line. When the upcoming character is not a line ending,
+ * a warning is logged and nothing is consumed.
+ */
+void lex_parse_skip_endline(core_stream_t *input, core_pos_t *pos) {
+    core_stream_reset_char(input);
+    int first = core_stream_peek_char(input);
+
+    if (first != '\r' && first != '\n') {
+        LOG_WARN("not a newline character");
+        return;
+    }
+
+    core_stream_next_char(input);
+    /* A carriage return may be followed by a line feed: treat CRLF as one. */
+    if (first == '\r' && core_stream_peek_char(input) == '\n') {
+        core_stream_next_char(input);
+    }
+    core_pos_next_line(pos);
+}
+
+/**
+ * @brief Translate the character that follows a backslash into the value of
+ *        the matching simple escape sequence.
+ *
+ * @param ch character found right after the backslash
+ * @return int value of the escape sequence, or -1 when `ch` does not name a
+ *         simple escape sequence
+ *
+ * Reference: https://cppreference.cn/w/c/language/escape
+ * `\'`	single quote	byte 0x27 in ASCII
+ * `\"`	double quote	byte 0x22 in ASCII
+ * `\?`	question mark	byte 0x3f in ASCII
+ * `\\`	backslash	byte 0x5c in ASCII
+ * `\a`	audible bell	byte 0x07 in ASCII
+ * `\b`	backspace	byte 0x08 in ASCII
+ * `\f`	form feed - new page	byte 0x0c in ASCII
+ * `\n`	line feed - new line	byte 0x0a in ASCII
+ * `\r`	carriage return	byte 0x0d in ASCII
+ * `\t`	horizontal tab	byte 0x09 in ASCII
+ * `\v`	vertical tab	byte 0x0b in ASCII
+ */
+static inline int got_simple_escape(int ch) {
+    /* clang-format off */
+    switch (ch) {
+    case '\'':  return '\'';
+    case '\"':  return '\"';
+    case '\?':  return '\?';
+    case '\\':  return '\\';
+    case 'a':   return '\a';
+    case 'b':   return '\b';
+    case 'f':   return '\f';
+    case 'n':   return '\n';
+    case 'r':   return '\r';
+    case 't':   return '\t';
+    case 'v':   return '\v';
+    default:    return -1;
+    }
+    /* clang-format on */
+}
+
+/*
+ * Discard the rest of the current line, including its line ending.
+ * Stops silently when the end of the stream is reached first.
+ */
+void lex_parse_skip_line(core_stream_t *input, core_pos_t *pos) {
+    core_stream_t *stream = input;
+    Assert(stream != null);
+    core_stream_reset_char(stream);
+
+    for (int cur = core_stream_peek_char(stream); cur != core_stream_eof;
+         cur = core_stream_peek_char(stream)) {
+        // TODO endline
+        if (is_next_line(cur)) {
+            /* The line ending belongs to the skipped line: consume it too. */
+            lex_parse_skip_endline(stream, pos);
+            return;
+        }
+        core_stream_next_char(stream);
+        core_pos_next(pos);
+    }
+}
+
+/*
+ * Skip a block comment. The stream must be positioned on the opening
+ * slash-star pair, which is asserted. Line endings inside the comment go
+ * through lex_parse_skip_endline so `pos` keeps counting lines. An
+ * unterminated comment logs a warning and returns at end of stream.
+ */
+void lex_parse_skip_block_comment(core_stream_t *input, core_pos_t *pos) {
+    int ch;
+
+    core_stream_reset_char(input);
+    ch = core_stream_next_char(input);
+    core_pos_next(pos);
+    // FIXME Assertion
+    Assert(ch == '/');
+    ch = core_stream_next_char(input);
+    core_pos_next(pos);
+    Assert(ch == '*');
+
+    /* The opening delimiter has been consumed; look for the closing one. */
+    for (;;) {
+        core_stream_reset_char(input);
+        ch = core_stream_peek_char(input);
+
+        if (ch == core_stream_eof) {
+            LOG_WARN("Unterminated block comment");
+            return;
+        }
+        if (is_next_line(ch)) {
+            lex_parse_skip_endline(input, pos);
+            continue;
+        }
+
+        core_stream_next_char(input);
+        core_pos_next(pos);
+        if (ch != '*') {
+            continue;
+        }
+        /* A star only closes the comment when a slash follows immediately. */
+        if (core_stream_peek_char(input) != '/') {
+            continue;
+        }
+        core_stream_next_char(input);
+        core_pos_next(pos);
+        return;
+    }
+}
+
+/*
+ * Skip a run of horizontal whitespace (space, tab, vertical tab, form feed).
+ *
+ * Stops at the first character that is not one of those, leaving it
+ * unconsumed; end of stream also stops the scan. Line endings are left in
+ * place on purpose: they are handled by lex_parse_skip_endline, which also
+ * advances the line counter of `pos`.
+ *
+ * The previous implementation consumed every character up to end of stream
+ * without testing whether it was whitespace at all.
+ */
+void lex_parse_skip_whitespace(core_stream_t *input, core_pos_t *pos) {
+    core_stream_t *stream = input;
+    Assert(stream != null);
+    while (1) {
+        /* Re-sync the lookahead with the read position before each peek. */
+        core_stream_reset_char(stream);
+        int ch = core_stream_peek_char(stream);
+
+        if (ch != ' ' && ch != '\t' && ch != '\v' && ch != '\f') {
+            return;
+        }
+
+        core_stream_next_char(stream);
+        core_pos_next(pos);
+    }
+}
+
+/*
+ * Parse a run of digits in the given base (2, 8, 10 or 16) from `input`.
+ *
+ * Letters are accepted case-insensitively as digits 10 and above. Scanning
+ * stops at end of stream or at the first non-alphanumeric character, which
+ * is left unconsumed.
+ *
+ * Returns true and stores the value in `*output` when at least one digit was
+ * consumed. Returns false, leaving `*output` untouched, when:
+ *   - a required pointer is null,
+ *   - no digit was consumed,
+ *   - an alphanumeric character is not a valid digit for `base`,
+ *   - the value does not fit in usize.
+ * Digits consumed before a failure stay consumed.
+ */
+static inline cbool _lex_parse_uint(core_stream_t *input, core_pos_t *pos,
+                                    int base, usize *output) {
+    Assert(input != null && pos != null);
+    if (input == null || pos == null || output == null) {
+        return false;
+    }
+    Assert(base == 2 || base == 8 || base == 10 || base == 16);
+    core_stream_reset_char(input);
+    int ch, tmp;
+    usize n = 0;
+    usize offset = pos->offset;
+    /* Largest representable value; relies on usize being an unsigned type. */
+    const usize max_value = (usize)-1;
+    while (1) {
+        ch = core_stream_peek_char(input);
+
+        if (ch == core_stream_eof) {
+            break;
+        } else if (ch >= 'a' && ch <= 'z') {
+            tmp = ch - 'a' + 10;
+        } else if (ch >= 'A' && ch <= 'Z') {
+            tmp = ch - 'A' + 10;
+        } else if (ch >= '0' && ch <= '9') {
+            tmp = ch - '0';
+        } else {
+            break;
+        }
+
+        if (tmp >= base) {
+            LOG_ERROR("Invalid digit");
+            return false;
+        }
+
+        /* Refuse the digit if n * base + tmp would wrap around. */
+        if (n > (max_value - (usize)tmp) / (usize)base) {
+            LOG_ERROR("Number overflow");
+            return false;
+        }
+
+        core_stream_next_char(input);
+        core_pos_next(pos);
+        n = n * (usize)base + (usize)tmp;
+    }
+    if (offset == pos->offset) {
+        // None match any number
+        return false;
+    }
+    *output = n;
+    return true;
+}
+
+/**
+ * @brief Parse a character constant such as 'a', '\n' or '\101'.
+ *
+ * @param input stream positioned on the opening single quote
+ * @param pos   source position, advanced for the characters read
+ * @return int  value of the character, or core_stream_eof on error
+ *
+ * Simple escapes and octal escapes (one to three octal digits) are
+ * supported. Hexadecimal escapes and universal character names are not
+ * implemented yet and hit TODO().
+ *
+ * Reference: https://cppreference.cn/w/c/language/character_constant
+ */
+int lex_parse_char(core_stream_t *input, core_pos_t *pos) {
+    core_stream_t *stream = input;
+    core_stream_reset_char(stream);
+    int ch = core_stream_peek_char(stream);
+    int ret = core_stream_eof;
+
+    if (ch == core_stream_eof) {
+        LOG_WARN("Unexpected EOF at begin");
+        goto ERR;
+    } else if (ch != '\'') {
+        LOG_WARN("Unexpected character '%c' at begin", ch);
+        goto ERR;
+    }
+    core_stream_next_char(stream);
+    core_pos_next(pos);
+
+    ch = core_stream_next_char(stream);
+    core_pos_next(pos);
+
+    if (ch == core_stream_eof) {
+        LOG_WARN("Unexpected EOF at middle");
+        goto ERR;
+    } else if (ch == '\\') {
+        ch = core_stream_next_char(stream);
+        core_pos_next(pos);
+        if (ch >= '0' && ch <= '7') {
+            // Numeric escape sequence
+            // \nnn	arbitrary octal value	code unit nnn
+            // The value is accumulated in an int here. The old code cast
+            // &ret to usize*, which wrote past the end of the int.
+            ret = ch - '0';
+            for (int i = 0; i < 2; i++) {
+                /* Re-sync the lookahead with the read position. */
+                core_stream_reset_char(stream);
+                int digit = core_stream_peek_char(stream);
+                if (digit < '0' || digit > '7') {
+                    break;
+                }
+                core_stream_next_char(stream);
+                core_pos_next(pos);
+                ret = ret * 8 + (digit - '0');
+            }
+        } else if (ch == 'x') {
+            // TODO https://cppreference.cn/w/c/language/escape
+            // \xn...	arbitrary hexadecimal value	code unit n...
+            // (any number of hexadecimal digits)
+            TODO();
+        } else if (ch == 'u' || ch == 'U') {
+            // Universal character names
+            // \unnnn (since C99)	Unicode value in the allowed range;
+            // may produce several code units	code point U+nnnn
+            // \Unnnnnnnn (since C99)	Unicode value in the allowed range;
+            // may produce several code units	code point U+nnnnnnnn
+            TODO();
+        } else if ((ret = got_simple_escape(ch)) == -1) {
+            LOG_ERROR("Invalid escape character");
+            goto ERR;
+        }
+    } else {
+        ret = ch;
+    }
+
+    /* The closing quote is consumed either way, so always account for it. */
+    ch = core_stream_next_char(stream);
+    core_pos_next(pos);
+    if (ch != '\'') {
+        LOG_ERROR("Unclosed character literal '%c' at end, expect `'`", ch);
+        goto ERR;
+    }
+
+    return ret;
+ERR:
+    return core_stream_eof;
+}
+
+/**
+ * @brief Parse a double-quoted string literal and store its decoded content.
+ *
+ * @param input  stream positioned on the opening double quote
+ * @param pos    source position, advanced for every character consumed
+ * @param output receives the decoded string on success; asserted to be empty
+ *               on entry and left untouched on failure
+ * @return cbool true on success; false when the literal does not start with
+ *               a double quote, or when end of stream or a line ending is
+ *               reached before the closing quote
+ *
+ * Only simple escape sequences are decoded. An unrecognised escape is
+ * reported and the character after the backslash is kept as-is.
+ *
+ * Reference: https://cppreference.cn/w/c/language/string_literal
+ */
+cbool lex_parse_string(core_stream_t *input, core_pos_t *pos,
+                       cstring_t *output) {
+    core_stream_t *stream = input;
+    core_stream_reset_char(stream);
+    int ch = core_stream_peek_char(stream);
+
+    Assert(cstring_is_empty(output));
+    /*
+     * Nothing is allocated yet, so fail with a plain return. Jumping to ERR
+     * from here would free `str` before it is initialised.
+     */
+    if (ch == core_stream_eof) {
+        LOG_WARN("Unexpected EOF at begin");
+        return false;
+    } else if (ch != '"') {
+        LOG_WARN("Unexpected character '%c' at begin", ch);
+        return false;
+    }
+    core_stream_next_char(stream);
+    core_pos_next(pos);
+
+    cstring_t str = cstring_from_cstr("");
+    while (1) {
+        /* Re-sync the lookahead with the read position before each peek. */
+        core_stream_reset_char(stream);
+        ch = core_stream_peek_char(stream);
+
+        if (ch == core_stream_eof) {
+            LOG_ERROR("Unexpected EOF at string literal");
+            goto ERR;
+        } else if (is_next_line(ch)) {
+            LOG_ERROR("Unexpected newline at string literal");
+            goto ERR;
+        } else if (ch == '"') {
+            core_stream_next_char(stream);
+            core_pos_next(pos);
+            break;
+        }
+
+        core_stream_next_char(stream);
+        core_pos_next(pos);
+
+        if (ch == '\\') {
+            /* Consume the escaped character and account for it in `pos`. */
+            ch = core_stream_next_char(stream);
+            if (ch == core_stream_eof) {
+                LOG_ERROR("Unexpected EOF at string literal");
+                goto ERR;
+            }
+            core_pos_next(pos);
+
+            int val = got_simple_escape(ch);
+            if (val == -1) {
+                /* Unknown escape: report it and keep the raw character. */
+                LOG_ERROR("Invalid escape character it is \\%c [%d]", ch, ch);
+            } else {
+                ch = val;
+            }
+        }
+
+        cstring_push(&str, ch);
+    }
+
+    *output = str;
+    return true;
+ERR:
+    cstring_free(&str);
+    return false;
+}
+
+/**
+ * @brief Parse an unsigned integer constant with an optional base prefix.
+ *
+ * @param input  stream positioned on the first character of the number
+ * @param pos    source position, advanced for every character consumed
+ * @param output receives the parsed value on success
+ * @return cbool true on success, false on end of stream, on an invalid
+ *               digit, or when no digit could be read
+ *
+ * Recognised forms: `0x`/`0X` hexadecimal, `0b`/`0B` binary, a leading `0`
+ * followed by an octal digit for octal, anything else decimal.
+ *
+ * Reference: https://cppreference.cn/w/c/language/integer_constant
+ */
+cbool lex_parse_number(core_stream_t *input, core_pos_t *pos, usize *output) {
+    core_stream_t *stream = input;
+    core_stream_reset_char(stream);
+    int ch = core_stream_peek_char(stream);
+    int base = 10;
+
+    if (ch == core_stream_eof) {
+        LOG_WARN("Unexpected EOF at begin");
+        return false;
+    }
+
+    if (ch == '0') {
+        // NOTE(review): this second peek is expected to return the character
+        // after the '0', i.e. peeking advances a lookahead cursor - confirm
+        // against the core_stream implementation.
+        ch = core_stream_peek_char(stream);
+        if (ch == 'x' || ch == 'X') {
+            base = 16;
+            /* Drop both prefix characters. */
+            core_stream_next_char(stream);
+            core_pos_next(pos);
+            core_stream_next_char(stream);
+            core_pos_next(pos);
+        } else if (ch == 'b' || ch == 'B') {
+            // FIXME C23 external integer base
+            base = 2;
+            core_stream_next_char(stream);
+            core_pos_next(pos);
+            core_stream_next_char(stream);
+            core_pos_next(pos);
+        } else if (ch >= '0' && ch <= '7') {
+            base = 8;
+            /* Drop only the leading zero; the digit is parsed below. */
+            core_stream_next_char(stream);
+            core_pos_next(pos);
+        } else if (ch == '9' || ch == '8') {
+            /* Print the offending digit itself rather than its code. */
+            LOG_ERROR("Invalid digit '%c' in octal literal", ch);
+            return false;
+        }
+        /* Otherwise a lone "0": parsed as decimal below. */
+    }
+
+    // Parse the integer part
+    core_stream_reset_char(stream);
+    usize n = 0;
+    if (!_lex_parse_uint(stream, pos, base, &n)) {
+        return false;
+    }
+    *output = n;
+    return true;
+}
+
+/* True for characters that may start an identifier: a letter or '_'. */
+static inline int lex_ident_is_start(int c) {
+    return c == '_' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
+}
+
+/* True for characters that may continue an identifier. */
+static inline int lex_ident_is_part(int c) {
+    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c == '_') ||
+           (c >= '0' && c <= '9');
+}
+
+/**
+ * @brief Read an identifier made of ASCII letters, digits and underscores.
+ *
+ * @param input  stream positioned on the first character of the identifier
+ * @param pos    source position, advanced once per character consumed
+ * @param output string the identifier is appended to; asserted to be empty
+ *               on entry
+ * @return cbool true when an identifier was read, false when the stream is
+ *               at its end (a warning is logged) or the first character
+ *               cannot start an identifier
+ *
+ * Reference: https://cppreference.cn/w/c/language/identifier
+ */
+cbool lex_parse_identifier(core_stream_t *input, core_pos_t *pos,
+                           cstring_t *output) {
+    Assert(cstring_is_empty(output));
+    core_stream_reset_char(input);
+    int cur = core_stream_peek_char(input);
+
+    if (cur == core_stream_eof) {
+        LOG_WARN("Unexpected EOF at begin");
+        return false;
+    }
+    if (!lex_ident_is_start(cur)) {
+        return false;
+    }
+
+    /* The first character is already validated; keep going while it fits. */
+    do {
+        cstring_push(output, cur);
+        core_stream_next_char(input);
+        core_pos_next(pos);
+        cur = core_stream_peek_char(input);
+    } while (lex_ident_is_part(cur));
+
+    return true;
+}