feat(lex_parser): 初始化词法解析器模块

新增词法解析器库 `smcc_lex_parser`,包含基础的词法规则解析功能:
- 支持字符、字符串、数字、标识符的解析
- 支持跳过注释、空白符、行尾等辅助函数
- 提供对应的单元测试用例,覆盖各类合法与非法输入情况

该模块依赖 `libcore`,并被 `smcc_lex` 模块引用以支持更上层的词法分析逻辑。
This commit is contained in:
zzy
2025-11-23 22:53:46 +08:00
parent 67af0c6bf2
commit 871d031ceb
18 changed files with 996 additions and 392 deletions

View File

@@ -0,0 +1,406 @@
#include <lex_parser.h>
/**
 * @brief Tell whether a character is one of the line-terminator characters.
 *
 * @param ch character code to classify
 * @return cbool true when ch is carriage return or line feed
 */
static inline cbool is_next_line(int ch) {
    return ch == '\r' || ch == '\n';
}
/**
 * @brief Consume a single line terminator and move the position to the next
 *        line.
 *
 * Accepts "\r", "\n" and "\r\n". Anything else is left in the stream and a
 * warning is logged.
 *
 * @param input stream to read from
 * @param pos   position tracker, advanced by one line on success
 */
void lex_parse_skip_endline(core_stream_t *input, core_pos_t *pos) {
    core_stream_reset_char(input);
    switch (core_stream_peek_char(input)) {
    case '\r':
        core_stream_next_char(input);
        /* A line feed directly after the carriage return belongs to the
         * same terminator. */
        if (core_stream_peek_char(input) == '\n') {
            core_stream_next_char(input);
        }
        core_pos_next_line(pos);
        break;
    case '\n':
        core_stream_next_char(input);
        core_pos_next_line(pos);
        break;
    default:
        LOG_WARN("not a newline character");
        break;
    }
}
/**
 * @brief Map the character that follows a backslash to the value of the
 *        corresponding simple escape sequence.
 *
 * @param ch character following the backslash
 * @return int value of the escape sequence, or -1 when ch does not name a
 *         simple escape sequence
 *
 * https://cppreference.cn/w/c/language/escape
 * `\'` single quote     byte 0x27 in ASCII
 * `\"` double quote     byte 0x22 in ASCII
 * `\?` question mark    byte 0x3f in ASCII
 * `\\` backslash        byte 0x5c in ASCII
 * `\a` audible bell     byte 0x07 in ASCII
 * `\b` backspace        byte 0x08 in ASCII
 * `\f` form feed        byte 0x0c in ASCII
 * `\n` line feed        byte 0x0a in ASCII
 * `\r` carriage return  byte 0x0d in ASCII
 * `\t` horizontal tab   byte 0x09 in ASCII
 * `\v` vertical tab     byte 0x0b in ASCII
 */
static inline int got_simple_escape(int ch) {
    /* clang-format off */
    switch (ch) {
    case '\'': return '\'';
    case '\"': return '\"';
    case '\?': return '\?';
    case '\\': return '\\';
    case 'a':  return '\a';
    case 'b':  return '\b';
    case 'f':  return '\f';
    case 'n':  return '\n';
    case 'r':  return '\r';
    case 't':  return '\t';
    case 'v':  return '\v';
    default:   return -1;
    }
    /* clang-format on */
}
/**
 * @brief Discard the remainder of the current line, including its line
 *        terminator.
 *
 * Stops silently at end of input when no terminator is found.
 *
 * @param input stream to read from (must not be null)
 * @param pos   position tracker; advanced once per skipped character and
 *              moved to the next line when the terminator is consumed
 */
void lex_parse_skip_line(core_stream_t *input, core_pos_t *pos) {
    core_stream_t *stream = input;
    Assert(stream != null);
    core_stream_reset_char(stream);

    int ch;
    while ((ch = core_stream_peek_char(stream)) != core_stream_eof) {
        // TODO endline
        if (!is_next_line(ch)) {
            core_stream_next_char(stream);
            core_pos_next(pos);
            continue;
        }
        /* Reached the terminator: hand it to the end-of-line helper and
         * stop. */
        lex_parse_skip_endline(stream, pos);
        return;
    }
}
/**
 * @brief Skip a block comment; the stream must be positioned at its
 *        opening slash-star.
 *
 * Line terminators inside the comment are routed through
 * lex_parse_skip_endline so line tracking stays correct. If the input ends
 * before the closing star-slash a warning is logged and the function
 * returns.
 *
 * @param input stream to read from
 * @param pos   position tracker, advanced for every consumed character
 */
void lex_parse_skip_block_comment(core_stream_t *input, core_pos_t *pos) {
    core_stream_t *stream = input;
    int ch;

    core_stream_reset_char(stream);

    // FIXME Assertion
    ch = core_stream_next_char(stream);
    core_pos_next(pos);
    Assert(ch == '/');

    ch = core_stream_next_char(stream);
    core_pos_next(pos);
    Assert(ch == '*');

    /* The opening delimiter has been consumed; scan for the closing one. */
    for (;;) {
        core_stream_reset_char(stream);
        ch = core_stream_peek_char(stream);

        if (ch == core_stream_eof) {
            LOG_WARN("Unterminated block comment");
            return;
        }
        if (is_next_line(ch)) {
            lex_parse_skip_endline(stream, pos);
            continue;
        }

        core_stream_next_char(stream);
        core_pos_next(pos);

        if (ch != '*') {
            continue;
        }
        /* A star only closes the comment when a slash follows at once. */
        ch = core_stream_peek_char(stream);
        if (ch != '/') {
            continue;
        }
        core_stream_next_char(stream);
        core_pos_next(pos);
        return;
    }
}
/**
 * @brief Skip a run of horizontal white-space characters.
 *
 * Consumes space, horizontal tab, vertical tab and form feed. Stops at the
 * first other character (left unconsumed) or at end of input.
 *
 * Line terminators are deliberately NOT consumed here: they have to go
 * through lex_parse_skip_endline so that the line counter in pos is
 * updated instead of the column.
 *
 * @param input stream to read from (must not be null)
 * @param pos   position tracker, advanced once per skipped character
 */
void lex_parse_skip_whitespace(core_stream_t *input, core_pos_t *pos) {
    core_stream_t *stream = input;
    Assert(stream != null);
    core_stream_reset_char(stream);
    while (1) {
        int ch = core_stream_peek_char(stream);
        /* End of input is not a white-space character either, so it also
         * terminates the loop here. */
        if (ch != ' ' && ch != '\t' && ch != '\v' && ch != '\f') {
            return;
        }
        core_stream_next_char(stream);
        core_pos_next(pos);
    }
}
/**
 * @brief Parse an unsigned integer written in the given base.
 *
 * Reads alphanumeric characters as digits (letters are case-insensitive,
 * 'a' == 10) until a non-alphanumeric character or end of input.
 *
 * Fails when:
 *  - an argument is null,
 *  - an alphanumeric character is not a valid digit for base,
 *  - the value does not fit in usize,
 *  - no digit was consumed at all.
 * On failure *output is left untouched and the offending character is left
 * unconsumed in the stream.
 *
 * @param input  stream to read from
 * @param pos    position tracker, advanced once per consumed digit
 * @param base   numeric base; must be 2, 8, 10 or 16
 * @param output receives the parsed value on success
 * @return cbool true on success, false otherwise
 */
static inline cbool _lex_parse_uint(core_stream_t *input, core_pos_t *pos,
                                    int base, usize *output) {
    Assert(input != null && pos != null && output != null);
    if (input == null || pos == null || output == null) {
        return false;
    }
    Assert(base == 2 || base == 8 || base == 10 || base == 16);
    core_stream_reset_char(input);

    /* NOTE(review): assumes usize is an unsigned type, so (usize)-1 is its
     * largest value - confirm against libcore. */
    const usize max_value = (usize)-1;
    const usize radix = (usize)base;
    usize value = 0;
    usize digit_count = 0;

    while (1) {
        int ch = core_stream_peek_char(input);
        int digit;
        if (ch == core_stream_eof) {
            break;
        } else if (ch >= 'a' && ch <= 'z') {
            digit = ch - 'a' + 10;
        } else if (ch >= 'A' && ch <= 'Z') {
            digit = ch - 'A' + 10;
        } else if (ch >= '0' && ch <= '9') {
            digit = ch - '0';
        } else {
            break;
        }
        if (digit >= base) {
            LOG_ERROR("Invalid digit");
            return false;
        }
        /* Reject the digit before value * radix + digit would wrap. */
        if (value > (max_value - (usize)digit) / radix) {
            LOG_ERROR("Integer literal overflow");
            return false;
        }
        core_stream_next_char(input);
        core_pos_next(pos);
        value = value * radix + (usize)digit;
        digit_count++;
    }

    if (digit_count == 0) {
        // None match any number
        return false;
    }
    *output = value;
    return true;
}
/**
 * @brief Parse a character constant; the stream must be positioned at its
 *        opening single quote.
 *
 * Supported contents: a plain character, a simple escape sequence, or an
 * octal escape sequence of one to three octal digits. Hexadecimal (\x) and
 * universal (\u, \U) escapes are not implemented yet and hit TODO().
 *
 * @param input stream to read from
 * @param pos   position tracker, advanced for every consumed character,
 *              including the closing quote
 * @return int value of the character constant, or core_stream_eof on error
 *
 * https://cppreference.cn/w/c/language/character_constant
 */
int lex_parse_char(core_stream_t *input, core_pos_t *pos) {
    core_stream_t *stream = input;
    core_stream_reset_char(stream);
    int ch = core_stream_peek_char(stream);
    int ret = core_stream_eof;

    if (ch == core_stream_eof) {
        LOG_WARN("Unexpected EOF at begin");
        goto ERR;
    } else if (ch != '\'') {
        LOG_WARN("Unexpected character '%c' at begin", ch);
        goto ERR;
    }
    core_stream_next_char(stream);
    core_pos_next(pos);

    ch = core_stream_next_char(stream);
    core_pos_next(pos);

    if (ch == core_stream_eof) {
        LOG_WARN("Unexpected EOF at middle");
        goto ERR;
    } else if (ch == '\\') {
        ch = core_stream_next_char(stream);
        core_pos_next(pos);
        if (ch >= '0' && ch <= '7') {
            // Numeric escape sequence:
            // \nnn  arbitrary octal value, one to three octal digits.
            // Parsed directly into an int; the previous code passed &ret
            // cast to usize*, which writes past the int when usize is
            // wider.
            ret = ch - '0';
            core_stream_reset_char(stream);
            for (int extra = 0; extra < 2; extra++) {
                int digit = core_stream_peek_char(stream);
                if (digit < '0' || digit > '7') {
                    break;
                }
                core_stream_next_char(stream);
                core_pos_next(pos);
                ret = ret * 8 + (digit - '0');
            }
        } else if (ch == 'x') {
            // TODO https://cppreference.cn/w/c/language/escape
            // \xn...  arbitrary hexadecimal value (any number of hex digits)
            TODO();
        } else if (ch == 'u' || ch == 'U') {
            // Universal character names:
            // \unnnn      (since C99) code point U+nnnn
            // \Unnnnnnnn  (since C99) code point U+nnnnnnnn
            // Either may produce several code units.
            TODO();
        } else if ((ret = got_simple_escape(ch)) == -1) {
            LOG_ERROR("Invalid escape character");
            goto ERR;
        }
    } else {
        ret = ch;
    }

    /* The closing quote is consumed on both paths, so pos is advanced on
     * both paths as well (previously only the error path advanced it). */
    ch = core_stream_next_char(stream);
    core_pos_next(pos);
    if (ch != '\'') {
        LOG_ERROR("Unclosed character literal '%c' at end, expect `'`", ch);
        goto ERR;
    }
    return ret;

ERR:
    return core_stream_eof;
}
/**
 * @brief Parse a string literal; the stream must be positioned at its
 *        opening double quote.
 *
 * Simple escape sequences are translated. An unknown escape sequence is
 * reported through LOG_ERROR and the escaped character is kept literally.
 * End of input or a line terminator inside the literal is an error.
 *
 * @param input  stream to read from
 * @param pos    position tracker, advanced for every consumed character
 * @param output must be empty on entry; receives the literal's contents
 *               (without the quotes) on success and is left untouched on
 *               failure
 * @return cbool true on success, false otherwise
 *
 * https://cppreference.cn/w/c/language/string_literal
 */
cbool lex_parse_string(core_stream_t *input, core_pos_t *pos,
                       cstring_t *output) {
    core_stream_t *stream = input;
    core_stream_reset_char(stream);
    int ch = core_stream_peek_char(stream);
    Assert(cstring_is_empty(output));

    /* Nothing has been allocated yet, so fail with a plain return: jumping
     * to ERR from here would free the still-uninitialised buffer. */
    if (ch == core_stream_eof) {
        LOG_WARN("Unexpected EOF at begin");
        return false;
    } else if (ch != '"') {
        LOG_WARN("Unexpected character '%c' at begin", ch);
        return false;
    }
    core_stream_next_char(stream);
    core_pos_next(pos);

    cstring_t str = cstring_from_cstr("");
    while (1) {
        ch = core_stream_peek_char(stream);
        if (ch == core_stream_eof) {
            LOG_ERROR("Unexpected EOF at string literal");
            goto ERR;
        } else if (is_next_line(ch)) {
            LOG_ERROR("Unexpected newline at string literal");
            goto ERR;
        } else if (ch == '"') {
            core_stream_next_char(stream);
            core_pos_next(pos);
            break;
        }

        /* Consume the character that was just peeked. */
        core_stream_next_char(stream);
        core_pos_next(pos);
        if (ch != '\\') {
            cstring_push(&str, ch);
            continue;
        }

        /* Escape sequence: the backslash is consumed, read what it
         * escapes. */
        ch = core_stream_next_char(stream);
        if (ch == core_stream_eof) {
            LOG_ERROR("Unexpected EOF at string literal");
            goto ERR;
        }
        core_pos_next(pos);

        int val = got_simple_escape(ch);
        if (val == -1) {
            LOG_ERROR("Invalid escape character it is \\%c [%d]", ch, ch);
            /* Keep the escaped character itself; do not consume anything
             * further, otherwise the next character would be lost. */
            val = ch;
        }
        cstring_push(&str, val);
    }

    *output = str;
    return true;

ERR:
    cstring_free(&str);
    return false;
}
/**
 * @brief Parse an integer constant.
 *
 * Recognised forms: decimal, octal (leading 0), hexadecimal (0x / 0X) and
 * binary (0b / 0B). The prefix is consumed here; the digits are parsed by
 * _lex_parse_uint in the detected base.
 *
 * @param input  stream to read from
 * @param pos    position tracker, advanced for every consumed character
 * @param output receives the parsed value on success; untouched on failure
 * @return cbool true on success, false otherwise
 *
 * https://cppreference.cn/w/c/language/integer_constant
 */
cbool lex_parse_number(core_stream_t *input, core_pos_t *pos, usize *output) {
    core_stream_t *stream = input;
    core_stream_reset_char(stream);
    int ch = core_stream_peek_char(stream);
    if (ch == core_stream_eof) {
        LOG_WARN("Unexpected EOF at begin");
        return false;
    }

    int base = 10;
    if (ch == '0') {
        /* The second peek is expected to yield the character after the
         * leading '0'.
         * NOTE(review): this relies on successive peeks advancing the
         * lookahead - confirm against core_stream. */
        ch = core_stream_peek_char(stream);

        int prefix_len = 0; /* characters to drop before the digits */
        if (ch == 'x' || ch == 'X') {
            base = 16;
            prefix_len = 2;
        } else if (ch == 'b' || ch == 'B') {
            // FIXME C23 external integer base
            base = 2;
            prefix_len = 2;
        } else if (ch >= '0' && ch <= '7') {
            base = 8;
            prefix_len = 1;
        } else if (ch == '9' || ch == '8') {
            /* %c prints the offending digit; the former %d printed its
             * character code. */
            LOG_ERROR("Invalid digit '%c' in octal literal", ch);
            return false;
        }
        /* Otherwise a lone '0': leave it for the decimal parser below. */

        for (int i = 0; i < prefix_len; i++) {
            core_stream_next_char(stream);
            core_pos_next(pos);
        }
    }

    // Parse the integer part
    core_stream_reset_char(stream);
    usize n = 0;
    if (!_lex_parse_uint(stream, pos, base, &n)) {
        return false;
    }
    *output = n;
    return true;
}
/**
 * @brief Parse an identifier: a letter or underscore followed by any number
 *        of letters, digits or underscores (ASCII only).
 *
 * @param input  stream to read from
 * @param pos    position tracker, advanced once per consumed character
 * @param output must be empty on entry; the identifier's characters are
 *               appended to it
 * @return cbool true when an identifier was read, false when the input is
 *         at its end or does not start with an identifier character
 *
 * https://cppreference.cn/w/c/language/identifier
 */
cbool lex_parse_identifier(core_stream_t *input, core_pos_t *pos,
                           cstring_t *output) {
    Assert(cstring_is_empty(output));
    core_stream_t *stream = input;
    core_stream_reset_char(stream);

    int cur = core_stream_peek_char(stream);
    if (cur == core_stream_eof) {
        LOG_WARN("Unexpected EOF at begin");
        return false;
    }

    /* The first character may not be a digit. */
    int is_letter = (cur >= 'a' && cur <= 'z') || (cur >= 'A' && cur <= 'Z');
    if (cur != '_' && !is_letter) {
        return false;
    }

    int keep_going;
    do {
        cstring_push(output, cur);
        core_stream_next_char(stream);
        core_pos_next(pos);

        cur = core_stream_peek_char(stream);
        keep_going = (cur >= 'a' && cur <= 'z') ||
                     (cur >= 'A' && cur <= 'Z') ||
                     (cur >= '0' && cur <= '9') || cur == '_';
    } while (keep_going);

    return true;
}