feat(lex_parser): 提取字符判断函数并增强解析器断言

将 `is_next_line` 内联函数重命名为 `lex_parse_is_endline` 并新增 `lex_parse_is_whitespace` 函数,统一用于词法解析中的字符分类。同时加强多个解析函数的输入参数断言,提升代码健壮性。

此外,修正了 `lex_parse_skip_whitespace` 中的逻辑错误,并优化部分注释和控制流结构。

feat(pprocessor): 初始化预处理器模块并添加基础功能实现

新增预处理器模块 `pprocessor`,包括宏定义、条件编译状态管理以及基本的指令解析框架。实现了标识符解析、空白跳过、关键字查找等功能,并初步支持 `#define` 指令的对象类宏替换。

该提交还引入了一组测试用例,覆盖多种宏展开场景及边界情况,确保预处理器的核心行为符合预期。
This commit is contained in:
zzy
2025-11-24 22:44:08 +08:00
parent 871d031ceb
commit e6a76e7a86
58 changed files with 1429 additions and 9 deletions

View File

@@ -0,0 +1,427 @@
/**
* @file pprocessor.c
* @brief C语言预处理器实现
*/
#include <lex_parser.h>
#include <pp_token.h>
#include <pprocessor.h>
// Capacity, in bytes, of pp_stream_t::buffer — the staging area from which
// next_char() hands out identifier / macro-expansion text.
#define PPROCESSER_BUFFER_SIZE (1024)
/**
 * Hash callback installed on the macro table in pp_init().
 *
 * @param string key to hash
 * @return smcc_strhash32() of the key's character data
 */
static u32 hash_func(cstring_t *string) {
    const char *text = cstring_as_cstr(string);
    return smcc_strhash32(text);
}
/**
 * Key-comparison callback installed on the macro table in pp_init().
 *
 * Keys of different length are ordered by length; keys of equal length are
 * ordered by smcc_strcmp() on their character data.
 *
 * @param str1 left-hand key
 * @param str2 right-hand key
 * @return negative, zero or positive when str1 orders before, equal to or
 *         after str2
 */
static int hash_cmp(const cstring_t *str1, const cstring_t *str2) {
    if (str1->size != str2->size) {
        // Compare rather than subtract: the type of `size` is not visible
        // here, and if it is unsigned or wider than int a difference forced
        // into an int can wrap or truncate.
        return (str1->size < str2->size) ? -1 : 1;
    }
    return smcc_strcmp(cstring_as_cstr(str1), cstring_as_cstr(str2));
}
/**
 * Add a macro definition to the preprocessor's macro table.
 *
 * A new smcc_macro_t is allocated and filled with by-value (shallow) copies
 * of `name`, `replaces` and `params`, then registered in pp->macros keyed by
 * the copy of the name stored inside the macro.
 *
 * @param pp       preprocessor that owns the macro table
 * @param name     macro name
 * @param replaces replacement list, or NULL for an empty list
 * @param params   parameter list, or NULL for an empty list
 * @param type     macro kind (e.g. MACRO_OBJECT)
 */
static void add_macro(smcc_pp_t *pp, const cstring_t *name,
                      const macro_list_t *replaces, const macro_list_t *params,
                      macro_type_t type) {
    smcc_macro_t *macro = smcc_malloc(sizeof(*macro));
    if (!macro) {
        // Other call sites in this file treat smcc_malloc as fallible, so do
        // not dereference a failed allocation.
        LOG_FATAL("Failed to allocate memory for macro");
        return;
    }

    macro->name = *name;
    macro->type = type;

    if (replaces) {
        macro->replaces = *replaces;
    } else {
        vec_init(macro->replaces);
    }

    if (params) {
        macro->params = *params;
    } else {
        vec_init(macro->params);
    }

    // The key must point at storage that lives as long as the entry, hence
    // the macro's own copy of the name rather than the caller's.
    hashmap_set(&pp->macros, &macro->name, macro);
}
/**
 * Look up a macro definition by name.
 *
 * @param pp   preprocessor whose macro table is searched
 * @param name macro name used as the lookup key
 * @return whatever hashmap_get() yields for `name` in pp->macros
 *         (next_char() treats a null result as "not a macro")
 */
static smcc_macro_t *find_macro(smcc_pp_t *pp, cstring_t *name) {
    smcc_macro_t *found = hashmap_get(&pp->macros, name);
    return found;
}
// Conditional-compilation scaffolding.

/**
 * Handle `#if`: push a new entry onto pp->if_stack recording whether the
 * guarded region is active.
 *
 * Condition evaluation is not implemented yet, so every condition is
 * currently treated as false (state IFState_FALSE, skip set).
 *
 * @param pp        preprocessor whose conditional stack is updated
 * @param condition text of the controlling expression (currently unused)
 */
static void handle_if(smcc_pp_t *pp, const char *condition) {
    if_stack_item_t item;
    // Deterministic placeholder: this value used to be read uninitialized.
    int cond_value = 0;

    // TODO: cond_value = evaluate_condition(pp, condition);
    (void)condition;

    item.state = cond_value ? IFState_TRUE : IFState_FALSE;
    item.skip = !cond_value;
    vec_push(pp->if_stack, item);
}
/**
 * Handle `#else`: invert the skip flag of the innermost open conditional and
 * mark it as being in its else branch.
 *
 * Does nothing when no conditional is open or when the innermost one is
 * already in the IFState_ELSE state; neither error is reported yet.
 *
 * @param pp preprocessor whose conditional stack is updated
 */
static void handle_else(smcc_pp_t *pp) {
    if (pp->if_stack.size != 0) {
        if_stack_item_t *current =
            &vec_at(pp->if_stack, pp->if_stack.size - 1);
        if (current->state != IFState_ELSE) {
            current->skip = !current->skip;
            current->state = IFState_ELSE;
        }
        // otherwise: error — #else appears more than once
    }
    // otherwise: error — no matching #if
}
/**
 * Handle `#include` — not implemented yet; the function has no effect.
 *
 * Planned steps:
 *   1. resolve the file path,
 *   2. create a new input stream,
 *   3. process the included file recursively.
 *
 * @param pp            preprocessor (unused for now)
 * @param filename      header name (unused for now)
 * @param system_header flag describing the include form (unused for now)
 */
static void handle_include(smcc_pp_t *pp, const char *filename,
                           int system_header) {
    (void)pp;
    (void)filename;
    (void)system_header;
}
/**
 * Parse an identifier from the stream.
 *
 * The first character must be an ASCII letter or '_'; following characters
 * may also be ASCII digits. Every accepted character is consumed from the
 * stream. If the first character does not qualify, a warning is logged,
 * nothing is consumed and an empty string is returned.
 *
 * @param stream input to read from
 * @return the identifier text (empty when no identifier starts here)
 */
static cstring_t parse_identifier(core_stream_t *stream) {
    cstring_t result = cstring_new();

    core_stream_reset_char(stream);
    int c = core_stream_peek_char(stream);

    int is_start = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_';
    if (!is_start) {
        LOG_WARN("Invalid identifier");
        return result;
    }

    for (;;) {
        cstring_push(&result, (char)c);
        core_stream_next_char(stream); // consume the character just stored
        c = core_stream_peek_char(stream);

        int is_letter = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
        int is_digit = (c >= '0' && c <= '9');
        if (!(is_letter || is_digit || c == '_')) {
            break;
        }
    }
    return result;
}
/**
 * Skip horizontal whitespace: consumes consecutive ' ' and '\t' characters
 * and stops at the first other character or at end of stream.
 *
 * @param stream input to advance
 */
static void skip_whitespace(core_stream_t *stream) {
    core_stream_reset_char(stream);
    for (;;) {
        const int c = core_stream_peek_char(stream);
        if (c == core_stream_eof) {
            return;
        }
        if (c != ' ' && c != '\t') {
            return;
        }
        core_stream_next_char(stream);
    }
}
// Table of directive names: one entry per X(...) row of PP_INST_TOKEN, each
// produced by SMCC_STR(name) (presumably a stringifying macro — confirm in
// its definition).
#define X(name, type, tok) SMCC_STR(name),
static const char *token_strings[] = {PP_INST_TOKEN};
#undef X
// Table pairing each directive name (stringified with #name) with its token,
// one entry per X(...) row of PP_INST_TOKEN.
// NOTE(review): keyword_cmp() binary-searches this table, so the rows of
// PP_INST_TOKEN are presumably listed in ascending name order — confirm.
static const struct {
const char *name;
pp_token_t tok;
} keywords[] = {
#define X(name, type, tok) {#name, tok},
PP_INST_TOKEN
#undef X
};
/**
 * Binary-search the `keywords` table for a name.
 *
 * @param name candidate text
 * @param len  number of characters of `name` to compare
 * @return index of the matching row in `keywords`, or -1 if `name` is not a
 *         keyword
 */
static inline int keyword_cmp(const char *name, int len) {
    int lo = 0;
    int hi = sizeof(keywords) / sizeof(keywords[0]) - 1;

    while (lo <= hi) {
        const int probe = (lo + hi) / 2;
        const char *candidate = keywords[probe].name;
        int order = 0;

        // Hand-rolled comparison over at most `len` characters; stops at the
        // first difference or at a terminator shared by both strings.
        for (int k = 0; k < len && order == 0; k++) {
            if (name[k] != candidate[k]) {
                order = (unsigned char)name[k] - (unsigned char)candidate[k];
            } else if (name[k] == '\0') {
                break;
            }
        }

        // Equal prefix and the candidate ends exactly at `len`: full match.
        if (order == 0 && candidate[len] == '\0') {
            return probe;
        }

        // order == 0 at this point means the candidate is longer than the
        // input, so the input sorts before it.
        if (order <= 0) {
            hi = probe - 1;
        } else {
            lo = probe + 1;
        }
    }
    return -1; // Not a keyword.
}
// Preprocessing stream: a core_stream_t whose next_char callback reads from
// `input`, runs directives, expands macros and serves the resulting text.
typedef struct pp_stream {
// Base stream object; kept as the first member because next_char() casts the
// core_stream_t pointer it receives back to pp_stream_t.
core_stream_t stream;
// Underlying source stream the preprocessor reads from.
core_stream_t *input;
// Preprocessor this stream belongs to (used for macro lookup/definition).
smcc_pp_t *self;
// Number of valid bytes currently stored in `buffer`.
usize size;
// Index of the next byte of `buffer` that next_char() will return.
usize pos;
// Pending output text (identifiers and macro expansions).
char buffer[PPROCESSER_BUFFER_SIZE];
} pp_stream_t;
/**
 * Parse either a macro parameter list or a macro replacement list from the
 * stream's input into `list` (which is re-initialised first).
 *
 * Parameter mode (`is_param` true): expects `( a, b, ... )`. A leading '('
 * is consumed; names are split on ','; whitespace is skipped; parsing stops
 * in front of ')' without consuming it so the caller can verify it.
 *
 * Replacement mode (`is_param` false): splits the rest of the line on runs of
 * whitespace and stops in front of the end-of-line character.
 *
 * In both modes the text collected since the last separator is appended as
 * the final element, even when it is empty.
 *
 * @param _stream  preprocessing stream whose `input` is read
 * @param list     receives the parsed elements
 * @param is_param selects parameter mode (true) or replacement mode (false)
 * @return true on success; false when a parameter list runs into the end of
 *         the line (elements already pushed stay in `list`)
 */
static cbool parse_list(pp_stream_t *_stream, macro_list_t *list,
                        cbool is_param) {
    Assert(_stream != null);
    Assert(list != null);
    core_stream_t *stream = _stream->input;
    Assert(stream != null);

    core_stream_reset_char(stream);
    vec_init(*list);

    int ch;
    cstring_t str = cstring_new();
    // Zero-initialised so the lexer helpers never read indeterminate fields.
    core_pos_t pos = {0};

    if (is_param) {
        // The caller only peeks at '(' before calling; consume it here so it
        // does not become part of the first parameter name.
        if (core_stream_peek_char(stream) == '(') {
            core_stream_next_char(stream);
        }
        core_stream_reset_char(stream);
    }

    while ((ch = core_stream_peek_char(stream)) != core_stream_eof) {
        if (is_param) {
            // ( param ) ( param, ... ) ( ... )
            if (lex_parse_is_whitespace(ch)) {
                // TODO: `#define F( A A , B )` needs to be reported as an error
                lex_parse_skip_whitespace(stream, &pos);
                core_stream_reset_char(stream);
                // Re-peek: without this the code below would consume the next
                // real character while storing the stale whitespace in `ch`.
                continue;
            } else if (ch == ',') {
                vec_push(*list, str);
                str = cstring_new();
                core_stream_next_char(stream);
                continue;
            } else if (ch == ')') {
                break;
            } else if (ch == core_stream_eof || lex_parse_is_endline(ch)) {
                LOG_ERROR("Invalid parameter list");
                cstring_free(&str); // not handed to the list, release it here
                return false;
            }
        } else {
            // replacement list
            if (lex_parse_is_whitespace(ch)) {
                lex_parse_skip_whitespace(stream, &pos);
                vec_push(*list, str);
                str = cstring_new();
                core_stream_reset_char(stream);
                continue;
            } else if (lex_parse_is_endline(ch)) {
                break;
            }
        }
        core_stream_next_char(stream);
        cstring_push(&str, (char)ch);
    }

    // Ownership of the last element moves into the list.
    vec_push(*list, str);
    return true;
}
/**
 * Parse and execute one preprocessor directive.
 *
 * Expects the input to be positioned on '#'. Reads the directive name,
 * resolves it with keyword_cmp() and dispatches:
 *   - `#define NAME replacement...` registers an object-like macro;
 *   - `#define NAME(...)` (function-like) is not supported yet and is treated
 *     as an error;
 *   - the remaining known directives are still TODO;
 *   - unknown names only produce a warning.
 * On the normal path the rest of the line is skipped afterwards. On the error
 * path a fatal message is logged and the line is NOT skipped yet.
 *
 * @param _stream preprocessing stream whose `input` is read and whose `self`
 *                receives macro definitions
 */
static void parse_directive(pp_stream_t *_stream) {
    Assert(_stream != null);
    core_stream_t *stream = _stream->input;
    Assert(stream != null);
    // Zero-initialised so lex_parse_skip_line never reads indeterminate fields.
    core_pos_t pos = {0};

    core_stream_reset_char(stream);
    // Skip '#' and the whitespace that follows it.
    if (core_stream_peek_char(stream) != '#') {
        LOG_WARN("Invalid directive");
        return;
    }
    core_stream_next_char(stream);
    // TODO: allow the null directive ('#' followed by a newline), which has
    // no effect.
    skip_whitespace(stream);

    // Directive name.
    cstring_t directive = parse_identifier(stream);
    if (cstring_is_empty(&directive)) {
        LOG_ERROR("expected indentifier");
        goto ERR;
    }
    skip_whitespace(stream);
    core_stream_reset_char(stream);

    pp_token_t token =
        keyword_cmp(cstring_as_cstr(&directive), cstring_len(&directive));
    switch (token) {
    case PP_TOK_DEFINE: {
        cstring_t name = parse_identifier(stream);
        if (cstring_is_empty(&name)) {
            LOG_ERROR("expected indentifier");
            cstring_free(&name);
            goto ERR;
        }
        skip_whitespace(stream);
        core_stream_reset_char(stream);

        int ch = core_stream_peek_char(stream);
        if (ch == '(') {
            // Function-like macro: the parameter list and the character after
            // it are consumed, but the definition itself is not supported.
            // NOTE(review): the elements of `params` are not released here.
            macro_list_t params;
            parse_list(_stream, &params, true);
            (void)core_stream_next_char(stream);
            // `name` was never handed to add_macro, so release it here.
            cstring_free(&name);
            goto ERR;
        }

        macro_list_t replacement;
        parse_list(_stream, &replacement, false);
        // add_macro copies `name` and `replacement` by value into the macro,
        // which therefore takes over their storage.
        add_macro(_stream->self, &name, &replacement, NULL, MACRO_OBJECT);
        break;
    }
    case PP_TOK_UNDEF:
    case PP_TOK_INCLUDE:
    case PP_TOK_IF:
    case PP_TOK_IFDEF:
    case PP_TOK_IFNDEF:
    case PP_TOK_ELSE:
    case PP_TOK_ELIF:
    case PP_TOK_ELIFDEF:
    case PP_TOK_ELIFNDEF:
    case PP_TOK_ENDIF:
    case PP_TOK_LINE:
    case PP_TOK_EMBED:
    case PP_TOK_ERROR:
    case PP_TOK_WARNING:
    case PP_TOK_PRAMA:
        TODO();
        break;
    default:
        LOG_WARN("Unknown preprocessor directive: %s",
                 cstring_as_cstr(&directive));
    }

    // TODO: Windows "\r\n", Linux "\n" and classic Mac "\r" line endings all
    // need to be normalised to "\n".
    core_stream_reset_char(stream);
    lex_parse_skip_line(stream, &pos);
    cstring_free(&directive);
    return;

ERR:
    // TODO: skip the rest of the line
    cstring_free(&directive); // previously never released on this path
    LOG_FATAL("Unhandled preprocessor directive");
}
/**
 * Append the characters of `str` to the stream's pending-output buffer.
 *
 * The text is copied to the current end of the buffer (offset stream->size)
 * and stream->size grows by the length of `str`. Asserts that the result
 * still fits in PPROCESSER_BUFFER_SIZE.
 *
 * @param stream preprocessing stream whose buffer is appended to
 * @param str    text to append
 */
static inline void stream_push_string(pp_stream_t *stream, cstring_t *str) {
    const usize len = cstring_len(str);
    // Check before writing so an oversized push never touches memory past
    // the buffer.
    Assert(stream->size + len <= PPROCESSER_BUFFER_SIZE);
    // Copy exactly `len` bytes to the append position. Copying the running
    // total to buffer[0] would overwrite pending output and read past `str`.
    smcc_memcpy(stream->buffer + stream->size, cstring_as_cstr(str), len);
    stream->size += len;
}
/**
 * Append a single character to the stream's pending-output buffer.
 *
 * @param stream preprocessing stream whose buffer is appended to
 * @param ch     character to append (stored as char)
 */
static inline void stream_push_char(pp_stream_t *stream, int ch) {
    // There must be a free slot *before* the write; asserting afterwards
    // would only fire once buffer[PPROCESSER_BUFFER_SIZE] was already written.
    Assert(stream->size < PPROCESSER_BUFFER_SIZE);
    stream->buffer[stream->size++] = (char)ch;
}
/**
 * next_char callback of the preprocessing stream: produce the next character
 * of preprocessed output.
 *
 * Each round:
 *   1. If buffered text is pending, return its next byte; once the buffer is
 *      exhausted it is reset.
 *   2. While the input is positioned on '#', run parse_directive().
 *   3. If the input starts an identifier, read it. A name that is not a macro
 *      is buffered verbatim; an object-like macro has its replacement
 *      elements buffered, separated by single spaces. Either way start over
 *      at step 1. Function-like macros are not implemented.
 *   4. Otherwise consume and return one character from the input.
 *
 * @param _stream the core_stream_t embedded at the start of a pp_stream_t
 * @return next output character, or whatever the input stream's next-char
 *         call returns
 */
static int next_char(core_stream_t *_stream) {
    pp_stream_t *stream = (pp_stream_t *)_stream;
    Assert(stream != null);

    for (;;) {
        // Step 1: serve pending buffered output.
        if (stream->size != 0) {
            if (stream->pos < stream->size) {
                return stream->buffer[stream->pos++];
            }
            stream->size = 0;
            stream->pos = 0;
        }

        // Step 2: execute directives until the input no longer starts with '#'.
        int ch;
        for (;;) {
            core_stream_reset_char(stream->input);
            ch = core_stream_peek_char(stream->input);
            if (ch != '#') {
                break;
            }
            parse_directive(stream);
        }

        // Step 3: identifiers may name macros.
        const int starts_identifier =
            (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || ch == '_';
        if (starts_identifier) {
            cstring_t word = parse_identifier(stream->input);
            smcc_macro_t *macro = find_macro(stream->self, &word);

            if (macro == null) {
                // Not a macro: emit the identifier unchanged.
                stream_push_string(stream, &word);
                cstring_free(&word);
                continue;
            }
            cstring_free(&word);

            if (macro->type == MACRO_OBJECT) {
                const usize count = macro->replaces.size;
                for (usize idx = 0; idx < count; ++idx) {
                    stream_push_string(stream, &vec_at(macro->replaces, idx));
                    // `idx + 1 < count` avoids subtracting from an unsigned
                    // count.
                    if (idx + 1 < count) {
                        stream_push_char(stream, ' ');
                    }
                }
                continue;
            }
            if (macro->type == MACRO_FUNCTION) {
                TODO();
            }
            UNREACHABLE();
        }

        // Step 4: plain character, passed through from the input.
        return core_stream_next_char(stream->input);
    }
}
/**
 * Allocate and initialise the preprocessing stream that wraps `input`.
 *
 * Only the next_char callback is provided; free_stream, peek_char,
 * reset_char and read_buf are left null.
 *
 * @param self  preprocessor the stream belongs to
 * @param input source stream to read from
 * @return the new stream viewed as a core_stream_t, or null when `self` is
 *         null or the allocation fails
 */
static core_stream_t *pp_stream_init(smcc_pp_t *self, core_stream_t *input) {
    // Validate before allocating so a null `self` cannot leak the stream.
    if (self == null) {
        return null;
    }

    pp_stream_t *stream = smcc_malloc(sizeof(*stream));
    if (stream == null) {
        LOG_FATAL("Failed to allocate memory for output stream");
        return null;
    }

    stream->self = self;
    stream->input = input;
    stream->size = 0;
    stream->pos = 0;

    stream->stream.name = cstring_from_cstr("pipe_stream");
    stream->stream.free_stream = null;
    stream->stream.next_char = next_char;
    stream->stream.peek_char = null;
    stream->stream.reset_char = null;
    stream->stream.read_buf = null;

    return (core_stream_t *)stream;
}
/**
 * Initialise a preprocessor over `input`.
 *
 * Creates the preprocessing stream, stores it in pp->stream, and sets up the
 * macro table with this file's hash and key-comparison callbacks.
 *
 * @param pp    preprocessor state to initialise
 * @param input source stream to preprocess
 * @return the preprocessing stream (also stored in pp->stream), or null when
 *         `pp` or `input` is null
 */
core_stream_t *pp_init(smcc_pp_t *pp, core_stream_t *input) {
    if (pp == null || input == null) {
        return null;
    }

    // The only stream this function needs is the one pp_stream_init builds;
    // the extra core_mem_stream_t formerly allocated here was never used or
    // released.
    pp->stream = pp_stream_init(pp, input);
    Assert(pp->stream != null);

    hashmap_init(&pp->macros);
    pp->macros.hash_func = (u32 (*)(const void *))hash_func;
    pp->macros.key_cmp = (int (*)(const void *, const void *))hash_cmp;

    return pp->stream;
}
/**
 * Destroy a preprocessor: drops the macro table and releases the stream's
 * name. A NULL `pp` is ignored.
 *
 * Still to be done:
 *   - the smcc_macro_t objects created by add_macro are not released
 *     individually (needs hashmap iteration);
 *   - the conditional-compilation stack (pp->if_stack) is not released;
 *   - the stream object allocated in pp_stream_init is not released.
 *
 * @param pp preprocessor to tear down
 */
void pp_drop(smcc_pp_t *pp) {
    if (pp == NULL)
        return;

    // Drop the macro table.
    hashmap_drop(&pp->macros);

    // Release the stream name. pp->stream is checked first so a preprocessor
    // without a stream can be dropped without dereferencing a null pointer.
    if (pp->stream != NULL) {
        cstring_free(&pp->stream->name);
    }
}