移除了libs/format目录下的所有文件,包括:
- cbuild.toml构建配置文件
- include/scf.h头文件
- include/scf_impl.h实现头文件
- src/scf.c源文件
- tests/test_scf.c测试文件
- tests/test_scf_x64.c x64架构测试文件

这些文件包含了SCF(scc format)格式的完整实现,但现在不再需要。

feat(lexer): 添加布尔字面量数字生成函数

在lexer工具头文件中添加了两个内联函数用于生成布尔值的数字字面量:
- scc_lexer_gen_number_true: 将token类型设为整数字面量,值为"1"
- scc_lexer_gen_number_false: 将token类型设为整数字面量,值为"0"

refactor(lexer): 改进词法分析器错误处理

- 移除了多余的头文件包含
- 更新错误报告方式,使用SCC_ERROR宏替代LEX_ERROR,提供更准确的错误位置信息

refactor(pproc): 更新预处理器扩展器数据结构

- 将need_rescan字段类型从int改为cbool
- 添加need_parse_defined字段用于控制defined操作符解析
- 更新函数签名以支持defined操作符解析参数
526 lines
17 KiB
C
#include <lexer_log.h>
#include <scc_lexer.h>

#include <string.h>
/* Keyword lookup table, expanded from the X-macro SCC_CKEYWORD_TABLE.
 * keyword_cmp() below performs a binary search over this array, so the
 * entries must be emitted in sorted order by name — keep
 * SCC_CKEYWORD_TABLE sorted.  Each entry maps a keyword spelling to the
 * C-standard revision it belongs to and the token type the lexer emits
 * when the keyword is matched. */
static const struct {
  const char *name;        /* keyword spelling, e.g. "while" */
  scc_cstd_t std_type;     /* C standard the keyword belongs to */
  scc_tok_type_t tok_type; /* token type emitted on match */
} keywords[] = {
#define X(name, subtype, tok, std_type, ...) {#name, std_type, tok},
    SCC_CKEYWORD_TABLE
#undef X
};
// by using binary search to find the keyword
|
||
static int keyword_cmp(const char *name, int len) {
|
||
int low = 0;
|
||
int high = sizeof(keywords) / sizeof(keywords[0]) - 1;
|
||
while (low <= high) {
|
||
int mid = (low + high) / 2;
|
||
const char *key = keywords[mid].name;
|
||
int cmp = 0;
|
||
for (int i = 0; i < len; i++) {
|
||
if (name[i] != key[i]) {
|
||
cmp = (unsigned char)name[i] - (unsigned char)key[i];
|
||
break;
|
||
}
|
||
if (name[i] == '\0')
|
||
break;
|
||
}
|
||
if (cmp == 0) {
|
||
if (key[len] == '\0')
|
||
return mid;
|
||
cmp = -1;
|
||
}
|
||
if (cmp < 0)
|
||
high = mid - 1;
|
||
else
|
||
low = mid + 1;
|
||
}
|
||
return -1; // 不是关键字
|
||
}
|
||
|
||
/* Initialize a lexer over the given source-character ring.  No token
 * ring is attached yet (see scc_lexer_to_ring), and macro jumping is
 * disabled by default. */
void scc_lexer_init(scc_lexer_t *lexer, scc_sstream_ring_t *stream_ref) {
  lexer->jump_macro = false;
  lexer->ring_ref_count = 0;
  lexer->stream_ref = stream_ref;
}
static inline cbool is_whitespace(int ch) {
|
||
return ch == ' ' || ch == '\t' || ch == '\v' || ch == '\f';
|
||
}
|
||
static inline cbool is_newline(int ch) { return ch == '\n' || ch == '\r'; }
|
||
static inline cbool is_digit(int ch) { return ch >= '0' && ch <= '9'; }
|
||
static inline cbool is_alpha(int ch) {
|
||
return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
|
||
}
|
||
static inline cbool is_alnum(int ch) { return is_alpha(ch) || is_digit(ch); }
|
||
static inline cbool is_identifier_start(int ch) {
|
||
return is_alpha(ch) || ch == '_';
|
||
}
|
||
static inline cbool is_identifier_part(int ch) {
|
||
return is_alnum(ch) || ch == '_';
|
||
}
|
||
static inline cbool is_octal_digit(int ch) { return ch >= '0' && ch <= '7'; }
|
||
static inline cbool is_hex_digit(int ch) {
|
||
return is_digit(ch) || (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F');
|
||
}
|
||
|
||
/* Peek one character from the ring buffer without consuming it.
 * Returns true on success; false means the stream is exhausted (EOF). */
static inline cbool peek_char(scc_lexer_t *lexer, scc_sstream_char_t *out) {
  cbool ok;
  /* scc_ring_peek is a macro: fills `*out` and sets `ok`. */
  scc_ring_peek(*lexer->stream_ref, *out, ok);
  return ok;
}
/* Consume one character from the ring buffer and append it to `lexeme`.
 * Returns false (leaving `lexeme` untouched) when the stream is at EOF. */
static inline cbool next_char(scc_lexer_t *lexer, scc_cstring_t *lexeme,
                              scc_sstream_char_t *out) {
  cbool ok;
  /* scc_ring_next is a macro: advances the ring, fills `*out`, sets `ok`. */
  scc_ring_next(*lexer->stream_ref, *out, ok);
  if (!ok)
    return false;
  scc_cstring_append_ch(lexeme, out->character);
  return true;
}
/* Mark a token as erroneous by forcing its type to SCC_TOK_UNKNOWN.
 * NOTE(review): appears unused in this translation unit — confirm it is
 * needed by other files before removing. */
#define set_err_token(token) ((token)->type = SCC_TOK_UNKNOWN)
/* Produce the next raw token from the character stream into `token`.
 *
 * On EOF this emits an SCC_TOK_EOF token with a default location.
 * Otherwise the first peeked character selects a branch (whitespace run,
 * newline, comment / division, identifier / keyword, numeric literal,
 * char / string literal, or operator / punctuator); the matched
 * characters are accumulated into a fresh lexeme whose ownership is
 * transferred into `token->lexeme`.
 *
 * NOTE(review): some scanning loops call scc_ring_consume() after
 * next_char() (line comments, identifiers, string literals) while others
 * do not (numbers, char literals, block comments partially).  The exact
 * semantics of the ring macros are defined elsewhere — confirm this
 * asymmetry is intentional. */
void scc_lexer_get_token(scc_lexer_t *lexer, scc_lexer_tok_t *token) {
  scc_sstream_char_t cur = {0};
  scc_cstring_t lex = scc_cstring_create(); // scratch lexeme, handed to token

  // Try to peek the first character; failure means end of stream.
  if (!peek_char(lexer, &cur)) {
    token->type = SCC_TOK_EOF;
    token->loc = (scc_pos_t){0, 1, 1, 0}; // default position
    token->lexeme = lex;                  // empty string
    return;
  }

  // Remember where this token starts.
  scc_pos_t start_loc = cur.pos;
  int ch = cur.character;

  // Dispatch on the first character (one token per call).
  if (is_whitespace(ch)) {
    // Whitespace: collect the whole run into a single SCC_TOK_BLANK.
    token->type = SCC_TOK_BLANK;
    while (peek_char(lexer, &cur) && is_whitespace(cur.character)) {
      next_char(lexer, &lex, &cur);
    }
  } else if (is_newline(ch)) {
    // Newline: lone \r or \n, folding a \r\n pair into one token.
    token->type = SCC_TOK_ENDLINE;
    next_char(lexer, &lex, &cur); // consume the first character
    if (ch == '\r') {
      // Fold a directly following \n into the same end-of-line token.
      if (peek_char(lexer, &cur) && cur.character == '\n') {
        next_char(lexer, &lex, &cur);
      }
    }
  } else if (ch == '/') {
    // Could be a comment, '/=', or plain division.
    scc_sstream_char_t next = {0};
    next_char(lexer, &lex, &cur); // consume '/'
    peek_char(lexer, &next);
    if (next.character == '=') {
      token->type = SCC_TOK_ASSIGN_DIV;
      next_char(lexer, &lex, &cur);
    } else if (next.character == '/') {
      // Line comment: //
      token->type = SCC_TOK_LINE_COMMENT;
      next_char(lexer, &lex, &cur); // consume second '/'
      while (peek_char(lexer, &cur) && !is_newline(cur.character)) {
        next_char(lexer, &lex, &cur);
        scc_ring_consume(*lexer->stream_ref);
      }
      // Comment ends here; the newline becomes its own token.
    } else if (next.character == '*') {
      // Block comment: /*
      token->type = SCC_TOK_BLOCK_COMMENT;
      next_char(lexer, &lex, &cur); // consume '*'
      while (1) {
        if (!next_char(lexer, &lex, &cur)) {
          // Stream ended before the comment was closed.
          LOG_ERROR("Unterminated block comment");
          break;
        }
        if (cur.character == '*' && peek_char(lexer, &next) &&
            next.character == '/') {
          next_char(lexer, &lex, &cur); // consume closing '/'
          break;
        }
        scc_ring_consume(*lexer->stream_ref);
      }
    } else {
      // Plain division operator.
      token->type = SCC_TOK_DIV;
    }
  } else if (is_identifier_start(ch)) {
    // Identifier or keyword.
    token->type = SCC_TOK_IDENT; // provisional; may become a keyword below
    while (peek_char(lexer, &cur) && is_identifier_part(cur.character)) {
      next_char(lexer, &lex, &cur);
      scc_ring_consume(*lexer->stream_ref);
    }
    // Check whether the identifier is actually a keyword.
    int idx = keyword_cmp(scc_cstring_as_cstr(&lex), scc_cstring_len(&lex));
    if (idx != -1) {
      token->type = keywords[idx].tok_type;
    }
  } else if (is_digit(ch)) {
    // Numeric literal (integer or floating point).
    // NOTE(review): this scanner is a deliberate simplification: after a
    // 0x/0X prefix the hex digits a-f/A-F are not accepted (is_digit only
    // matches 0-9, so "0x1A" would split), any e/E/p/P forces float even
    // in an integer context, and an exponent sign ("1e+5") is not
    // consumed — confirm a later phase re-validates the literal.
    token->type = SCC_TOK_INT_LITERAL; // assume integer first
    cbool maybe_float = false;
    while (1) {
      next_char(lexer, &lex, &cur); // consume the current digit
      if (!peek_char(lexer, &cur))
        break;
      ch = cur.character;
      if (is_digit(ch) || (ch == '.' && !maybe_float)) {
        if (ch == '.')
          maybe_float = true;
        continue;
      }
      if (ch == 'e' || ch == 'E' || ch == 'p' || ch == 'P') {
        maybe_float = true;
        // A sign or digits may follow the exponent marker.
        continue;
      }
      if (ch == 'x' || ch == 'X') {
        // Hexadecimal prefix, would need special handling.
        // Simplified here: keep the whole sequence as an integer
        // (prefix retained in the lexeme).
        continue;
      }
      break;
    }
    if (maybe_float)
      token->type = SCC_TOK_FLOAT_LITERAL;
  } else if (ch == '\'') {
    // Character literal.
    token->type = SCC_TOK_CHAR_LITERAL;
    next_char(lexer, &lex, &cur); // opening '
    while (1) {
      if (!peek_char(lexer, &cur)) {
        LOG_ERROR("Unterminated character literal");
        break;
      }
      if (cur.character == '\'') {
        next_char(lexer, &lex, &cur); // closing quote
        break;
      }
      if (cur.character == '\\') {
        // Escape sequence: keep the backslash and the next character
        // verbatim in the lexeme.
        next_char(lexer, &lex, &cur);
        if (!peek_char(lexer, &cur))
          break;
        next_char(lexer, &lex, &cur);
      } else {
        next_char(lexer, &lex, &cur);
      }
    }
  } else if (ch == '"') {
    // String literal.
    token->type = SCC_TOK_STRING_LITERAL;
    next_char(lexer, &lex, &cur); // opening "
    while (1) {
      if (!peek_char(lexer, &cur)) {
        LOG_ERROR("Unterminated string literal");
        break;
      }
      if (cur.character == '"') {
        next_char(lexer, &lex, &cur); // closing quote
        break;
      }
      if (cur.character == '\\') {
        // Escape sequence, kept verbatim.
        next_char(lexer, &lex, &cur);
        if (!peek_char(lexer, &cur))
          break;
        next_char(lexer, &lex, &cur);
      } else {
        next_char(lexer, &lex, &cur);
      }
      scc_ring_consume(*lexer->stream_ref);
    }
  } else {
    // Operators and punctuation: consume the first char, then peek one
    // ahead to resolve multi-character operators.
    scc_sstream_char_t next = {0};
    next_char(lexer, &lex, &cur);
    peek_char(lexer, &next);
    switch (ch) {
    case '=':
      switch (next.character) {
      case '=':
        token->type = SCC_TOK_EQ;
        next_char(lexer, &lex, &cur);
        break;
      default:
        token->type = SCC_TOK_ASSIGN;
        break;
      }
      break;
    case '+':
      switch (next.character) {
      case '+':
        token->type = SCC_TOK_ADD_ADD;
        next_char(lexer, &lex, &cur);
        break;
      case '=':
        token->type = SCC_TOK_ASSIGN_ADD;
        next_char(lexer, &lex, &cur);
        break;
      default:
        token->type = SCC_TOK_ADD;
        break;
      }
      break;
    case '-':
      switch (next.character) {
      case '-':
        token->type = SCC_TOK_SUB_SUB;
        next_char(lexer, &lex, &cur);
        break;
      case '=':
        token->type = SCC_TOK_ASSIGN_SUB;
        next_char(lexer, &lex, &cur);
        break;
      case '>':
        token->type = SCC_TOK_DEREF;
        next_char(lexer, &lex, &cur);
        break;
      default:
        token->type = SCC_TOK_SUB;
        break;
      }
      break;
    case '*':
      switch (next.character) {
      case '=':
        token->type = SCC_TOK_ASSIGN_MUL;
        next_char(lexer, &lex, &cur);
        break;
      default:
        token->type = SCC_TOK_MUL;
        break;
      }
      break;
    case '%':
      switch (next.character) {
      case '=':
        token->type = SCC_TOK_ASSIGN_MOD;
        next_char(lexer, &lex, &cur);
        break;
      default:
        token->type = SCC_TOK_MOD;
        break;
      }
      break;
    case '&':
      switch (next.character) {
      case '&':
        token->type = SCC_TOK_AND_AND;
        next_char(lexer, &lex, &cur);
        break;
      case '=':
        token->type = SCC_TOK_ASSIGN_AND;
        next_char(lexer, &lex, &cur);
        break;
      default:
        token->type = SCC_TOK_AND;
        break;
      }
      break;
    case '|':
      switch (next.character) {
      case '|':
        token->type = SCC_TOK_OR_OR;
        next_char(lexer, &lex, &cur);
        break;
      case '=':
        token->type = SCC_TOK_ASSIGN_OR;
        next_char(lexer, &lex, &cur);
        break;
      default:
        token->type = SCC_TOK_OR;
        break;
      }
      break;
    case '^':
      switch (next.character) {
      case '=':
        token->type = SCC_TOK_ASSIGN_XOR;
        next_char(lexer, &lex, &cur);
        break;
      default:
        token->type = SCC_TOK_XOR;
        break;
      }
      break;
    case '<':
      switch (next.character) {
      case '=':
        token->type = SCC_TOK_LE;
        next_char(lexer, &lex, &cur);
        break;
      case '<': {
        // '<<' consumed; a trailing '=' makes it '<<='.
        next_char(lexer, &lex, &cur);
        if (peek_char(lexer, &next) && next.character == '=') {
          token->type = SCC_TOK_ASSIGN_L_SH;
          next_char(lexer, &lex, &cur);
        } else {
          token->type = SCC_TOK_L_SH;
        }
        break;
      }
      default:
        token->type = SCC_TOK_LT;
        break;
      }
      break;
    case '>':
      switch (next.character) {
      case '=':
        token->type = SCC_TOK_GE;
        next_char(lexer, &lex, &cur);
        break;
      case '>': {
        // '>>' consumed; a trailing '=' makes it '>>='.
        next_char(lexer, &lex, &cur);
        if (peek_char(lexer, &next) && next.character == '=') {
          token->type = SCC_TOK_ASSIGN_R_SH;
          next_char(lexer, &lex, &cur);
        } else {
          token->type = SCC_TOK_R_SH;
        }
        break;
      }
      default:
        token->type = SCC_TOK_GT;
        break;
      }
      break;
    case '~':
      token->type = SCC_TOK_BIT_NOT;
      break;
    case '!':
      switch (next.character) {
      case '=':
        token->type = SCC_TOK_NEQ;
        next_char(lexer, &lex, &cur);
        break;
      default:
        token->type = SCC_TOK_NOT;
        break;
      }
      break;
    /* clang-format off */
    case '[': token->type = SCC_TOK_L_BRACKET; break;
    case ']': token->type = SCC_TOK_R_BRACKET; break;
    case '(': token->type = SCC_TOK_L_PAREN; break;
    case ')': token->type = SCC_TOK_R_PAREN; break;
    case '{': token->type = SCC_TOK_L_BRACE; break;
    case '}': token->type = SCC_TOK_R_BRACE; break;
    case ';': token->type = SCC_TOK_SEMICOLON; break;
    case ',': token->type = SCC_TOK_COMMA; break;
    case ':': token->type = SCC_TOK_COLON; break;
    /* clang-format on */
    case '.':
      // NOTE(review): peek_char does not consume, so this second peek
      // re-reads the SAME second '.' — the third '.' is never actually
      // checked, and e.g. "..x" would be lexed as ELLIPSIS while
      // consuming 'x'.  Confirm and fix the three-dot detection.
      if (next.character == '.' && peek_char(lexer, &next) &&
          next.character == '.') {
        token->type = SCC_TOK_ELLIPSIS;
        next_char(lexer, &lex, &cur);
        next_char(lexer, &lex, &cur);
      } else {
        token->type = SCC_TOK_DOT;
      }
      break;
    case '?':
      token->type = SCC_TOK_COND;
      break;
    case '#':
      if (next.character == '#') {
        token->type = SCC_TOK_SHARP_SHARP;
        next_char(lexer, &lex, &cur);
      } else
        token->type = SCC_TOK_SHARP;
      break;
    default:
      token->type = SCC_TOK_UNKNOWN;
      SCC_ERROR(start_loc, "unsupported character '%c' (0x%x)", ch, ch);
      break;
    }
  }

  // Finalize the token.
  scc_ring_consume(*lexer->stream_ref);
  // NOTE(review): self-assignment below is a no-op; type was set above.
  token->type = token->type; // already set above
  token->loc = start_loc;
  token->lexeme = lex; // ownership of the lexeme moves into the token
  LEX_DEBUG("get token `%s` (%s) at %s:%d:%d", scc_get_tok_name(token->type),
            scc_cstring_as_cstr(&token->lexeme), token->loc.name,
            token->loc.line, token->loc.col);
}
// scc_lexer_get_token maybe got invalid (with parser)
|
||
void scc_lexer_get_valid_token(scc_lexer_t *lexer, scc_lexer_tok_t *token) {
|
||
scc_tok_subtype_t subtype;
|
||
while (1) {
|
||
scc_lexer_get_token(lexer, token);
|
||
subtype = scc_get_tok_subtype(token->type);
|
||
AssertFmt(subtype != SCC_TOK_SUBTYPE_INVALID,
|
||
"Invalid token: `%s` at %s:%d:%d",
|
||
scc_get_tok_name(token->type), token->loc.name,
|
||
token->loc.line, token->loc.col);
|
||
if (subtype == SCC_TOK_SUBTYPE_EMPTYSPACE ||
|
||
subtype == SCC_TOK_SUBTYPE_COMMENT) {
|
||
scc_lexer_tok_drop(token);
|
||
}
|
||
break;
|
||
};
|
||
}
|
||
|
||
/* Token-ring fill callback: produce the next raw token (whitespace and
 * comments included).  Returns false once EOF is reached so the ring
 * stops filling. */
static cbool fill_token(scc_lexer_tok_t *out, void *userdata) {
  scc_lexer_t *lexer = userdata;
  scc_lexer_get_token(lexer, out);
  return out->type != SCC_TOK_EOF;
}
/* Token-ring fill callback: produce the next parser-relevant token,
 * skipping whitespace and comments.  Returns false once EOF is reached
 * so the ring stops filling. */
static cbool fill_valid_token(scc_lexer_tok_t *out, void *userdata) {
  scc_lexer_t *lexer = userdata;
  scc_lexer_get_valid_token(lexer, out);
  return out->type != SCC_TOK_EOF;
}
/* Attach a token ring of `ring_size` entries to the lexer and return it.
 * When `fill_all` is true the ring is fed every token (whitespace and
 * comments included); otherwise only parser-relevant tokens are filled.
 * Increments the lexer's ring reference count; pair each call with
 * scc_lexer_drop_ring().
 *
 * NOTE(review): a second call re-initializes lexer->ring while earlier
 * references remain live — confirm single-ring usage is intended. */
scc_lexer_tok_ring_t *scc_lexer_to_ring(scc_lexer_t *lexer, int ring_size,
                                        cbool fill_all) {
  scc_ring_init(lexer->ring, ring_size,
                fill_all ? fill_token : fill_valid_token, lexer);
  lexer->ring_ref_count++;
  return &lexer->ring;
}
/* Release one reference to the lexer's token ring.  A drop with no
 * outstanding references is logged as a warning instead of letting the
 * counter go negative. */
void scc_lexer_drop_ring(scc_lexer_tok_ring_t *ring_ref) {
  scc_lexer_t *lexer = ring_ref->userdata;
  if (lexer->ring_ref_count == 0) {
    LOG_WARN("double drop sstream ring");
    return;
  }
  lexer->ring_ref_count--;
}
/* Destroy the lexer: frees its token ring and releases its reference to
 * the source-character stream ring.  All token-ring references must have
 * been released via scc_lexer_drop_ring() first; a non-zero reference
 * count is a fatal error. */
void scc_lexer_drop(scc_lexer_t *lexer) {
  Assert(lexer != null);
  if (lexer->ring_ref_count) {
    LOG_FATAL("drop sstream must be drop ring before ref [%d]",
              lexer->ring_ref_count);
  }
  scc_ring_free(lexer->ring);
  scc_sstream_drop_ring(lexer->stream_ref);
}