Files
scc/libs/lexer/src/lexer.c
zzy f56b13da2c refactor(format): 移除SCF格式相关文件
移除了libs/format目录下的所有文件,包括:
- cbuild.toml构建配置文件
- include/scf.h头文件
- include/scf_impl.h实现头文件
- src/scf.c源文件
- tests/test_scf.c测试文件
- tests/test_scf_x64.c x64架构测试文件

这些文件包含了SCF(scc format)格式的完整实现,但现在不再需要。

feat(lexer): 添加布尔字面量数字生成函数

在lexer工具头文件中添加了两个内联函数用于生成布尔值的数字字面量:
- scc_lexer_gen_number_true: 将token类型设为整数字面量,值为"1"
- scc_lexer_gen_number_false: 将token类型设为整数字面量,值为"0"

refactor(lexer): 改进词法分析器错误处理

- 移除了多余的头文件包含
- 更新错误报告方式,使用SCC_ERROR宏替代LEX_ERROR,提供更准确的错误位置信息

refactor(pproc): 更新预处理器扩展器数据结构

- 将need_rescan字段类型从int改为cbool
- 添加need_parse_defined字段用于控制defined操作符解析
- 更新函数签名以支持defined操作符解析参数
2026-02-26 10:25:45 +08:00

526 lines
17 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#include <lexer_log.h>
#include <scc_lexer.h>
// Keyword lookup table, expanded from the SCC_CKEYWORD_TABLE X-macro.
// NOTE(review): keyword_cmp() below binary-searches this array by `name`,
// so SCC_CKEYWORD_TABLE must list its entries in ascending lexicographic
// order — confirm at the macro's definition site.
static const struct {
    const char *name;        // keyword spelling (stringized X-macro name)
    scc_cstd_t std_type;     // presumably the C standard revision that introduced it — TODO confirm
    scc_tok_type_t tok_type; // token type emitted when this keyword is matched
} keywords[] = {
#define X(name, subtype, tok, std_type, ...) {#name, std_type, tok},
    SCC_CKEYWORD_TABLE
#undef X
};
// Look up an identifier in the sorted `keywords` table via binary search.
// `name` is the candidate spelling and `len` the number of significant
// characters. Returns the table index on a match, or -1 when the
// identifier is not a keyword.
static int keyword_cmp(const char *name, int len) {
    int lo = 0;
    int hi = (int)(sizeof(keywords) / sizeof(keywords[0])) - 1;
    while (lo <= hi) {
        int mid = lo + (hi - lo) / 2; // overflow-safe midpoint
        const char *entry = keywords[mid].name;
        int diff = 0;
        // Compare at most `len` characters of the candidate prefix.
        for (int i = 0; i < len; i++) {
            if (name[i] != entry[i]) {
                diff = (unsigned char)name[i] - (unsigned char)entry[i];
                break;
            }
            if (name[i] == '\0')
                break;
        }
        if (diff == 0) {
            if (entry[len] == '\0')
                return mid; // same length too — exact keyword match
            diff = -1;      // table entry is longer: search the lower half
        }
        if (diff < 0)
            hi = mid - 1;
        else
            lo = mid + 1;
    }
    return -1; // not a keyword
}
// Bind a lexer to its character-stream ring and reset all bookkeeping
// state (no ring consumers yet, macro jumping disabled).
void scc_lexer_init(scc_lexer_t *lexer, scc_sstream_ring_t *stream_ref) {
    lexer->jump_macro = false;
    lexer->ring_ref_count = 0;
    lexer->stream_ref = stream_ref;
}
/* Character-class predicates used by the tokenizer (ASCII only). */

// Horizontal whitespace — deliberately excludes '\r' and '\n', which form
// their own SCC_TOK_ENDLINE tokens.
static inline cbool is_whitespace(int ch) {
    switch (ch) {
    case ' ':
    case '\t':
    case '\v':
    case '\f':
        return true;
    default:
        return false;
    }
}
static inline cbool is_newline(int ch) { return ch == '\r' || ch == '\n'; }
static inline cbool is_digit(int ch) { return '0' <= ch && ch <= '9'; }
static inline cbool is_alpha(int ch) {
    return ('A' <= ch && ch <= 'Z') || ('a' <= ch && ch <= 'z');
}
static inline cbool is_alnum(int ch) { return is_digit(ch) || is_alpha(ch); }
static inline cbool is_identifier_start(int ch) {
    return ch == '_' || is_alpha(ch);
}
static inline cbool is_identifier_part(int ch) {
    return ch == '_' || is_alnum(ch);
}
static inline cbool is_octal_digit(int ch) { return '0' <= ch && ch <= '7'; }
static inline cbool is_hex_digit(int ch) {
    return is_digit(ch) || ('a' <= ch && ch <= 'f') || ('A' <= ch && ch <= 'F');
}
/* Peek one character from the ring buffer without consuming it.
 * Returns false when the stream has reached EOF. */
static inline cbool peek_char(scc_lexer_t *lexer, scc_sstream_char_t *out) {
    cbool alive;
    scc_ring_peek(*lexer->stream_ref, *out, alive);
    return alive;
}
/* Pop one character from the ring buffer and append it to `lexeme`.
 * On EOF nothing is appended and false is returned. */
static inline cbool next_char(scc_lexer_t *lexer, scc_cstring_t *lexeme,
                              scc_sstream_char_t *out) {
    cbool alive;
    scc_ring_next(*lexer->stream_ref, *out, alive);
    if (alive)
        scc_cstring_append_ch(lexeme, out->character);
    return alive;
}
/* Mark a token as the error/unknown token. */
#define set_err_token(token) ((token)->type = SCC_TOK_UNKNOWN)

/* Scan one raw token from the character stream into `token`.
 *
 * Whitespace runs, newlines and comments are emitted as first-class tokens
 * (SCC_TOK_BLANK / SCC_TOK_ENDLINE / *_COMMENT); callers that only want
 * parser-relevant tokens should use scc_lexer_get_valid_token(). On EOF an
 * SCC_TOK_EOF token with an empty lexeme is produced. The token takes
 * ownership of the lexeme string built here.
 *
 * NOTE(review): the branches mix next_char() with explicit
 * scc_ring_consume() calls inconsistently (line comments and identifiers
 * consume per character, block comments and char literals do not, and one
 * consume runs unconditionally at the end) — confirm the intended
 * peek/next/consume protocol of scc_ring before touching this code. */
void scc_lexer_get_token(scc_lexer_t *lexer, scc_lexer_tok_t *token) {
    scc_sstream_char_t cur = {0};
    scc_cstring_t lex = scc_cstring_create(); // temporary lexeme buffer
    // Try to peek the first character; an empty stream yields EOF.
    if (!peek_char(lexer, &cur)) {
        token->type = SCC_TOK_EOF;
        token->loc = (scc_pos_t){0, 1, 1, 0}; // default position
        token->lexeme = lex;                  // empty string
        return;
    }
    // Record the start position of this token.
    scc_pos_t start_loc = cur.pos;
    int ch = cur.character;
    // Single dispatch on the first character.
    if (is_whitespace(ch)) {
        // Whitespace: collect the whole consecutive run into one token.
        token->type = SCC_TOK_BLANK;
        while (peek_char(lexer, &cur) && is_whitespace(cur.character)) {
            next_char(lexer, &lex, &cur);
        }
    } else if (is_newline(ch)) {
        // Newline: handles a lone '\r' or '\n' as well as the "\r\n" pair.
        token->type = SCC_TOK_ENDLINE;
        next_char(lexer, &lex, &cur); // consume the first character
        if (ch == '\r') {
            // Try to consume a following '\n'.
            if (peek_char(lexer, &cur) && cur.character == '\n') {
                next_char(lexer, &lex, &cur);
            }
        }
    } else if (ch == '/') {
        // Could be a comment or a division operator.
        scc_sstream_char_t next = {0};
        next_char(lexer, &lex, &cur); // consume '/'
        peek_char(lexer, &next);
        if (next.character == '=') {
            token->type = SCC_TOK_ASSIGN_DIV;
            next_char(lexer, &lex, &cur);
        } else if (next.character == '/') {
            // Line comment: //
            token->type = SCC_TOK_LINE_COMMENT;
            next_char(lexer, &lex, &cur); // consume '/'
            while (peek_char(lexer, &cur) && !is_newline(cur.character)) {
                next_char(lexer, &lex, &cur);
                scc_ring_consume(*lexer->stream_ref);
            }
            // The comment does not include the newline; the newline
            // becomes its own token.
        } else if (next.character == '*') {
            // Block comment: /*
            token->type = SCC_TOK_BLOCK_COMMENT;
            next_char(lexer, &lex, &cur); // consume '*'
            while (1) {
                if (!next_char(lexer, &lex, &cur)) {
                    // End of file: the comment was never closed.
                    LOG_ERROR("Unterminated block comment");
                    break;
                }
                if (cur.character == '*' && peek_char(lexer, &next) &&
                    next.character == '/') {
                    next_char(lexer, &lex, &cur); // consume '/'
                    break;
                }
                scc_ring_consume(*lexer->stream_ref);
            }
        } else {
            // Just a division operator.
            token->type = SCC_TOK_DIV;
        }
    } else if (is_identifier_start(ch)) {
        // Identifier or keyword.
        token->type = SCC_TOK_IDENT; // tentative; may be refined below
        while (peek_char(lexer, &cur) && is_identifier_part(cur.character)) {
            next_char(lexer, &lex, &cur);
            scc_ring_consume(*lexer->stream_ref);
        }
        // Check whether the spelling is a keyword.
        int idx = keyword_cmp(scc_cstring_as_cstr(&lex), scc_cstring_len(&lex));
        if (idx != -1) {
            token->type = keywords[idx].tok_type;
        }
    } else if (is_digit(ch)) {
        // Numeric literal (integer or floating point).
        token->type = SCC_TOK_INT_LITERAL; // assume integer first
        cbool maybe_float = false;
        while (1) {
            next_char(lexer, &lex, &cur); // consume the current digit
            if (!peek_char(lexer, &cur))
                break;
            ch = cur.character;
            if (is_digit(ch) || (ch == '.' && !maybe_float)) {
                if (ch == '.')
                    maybe_float = true;
                continue;
            }
            if (ch == 'e' || ch == 'E' || ch == 'p' || ch == 'P') {
                maybe_float = true;
                // An exponent may be followed by a sign or digits.
                // NOTE(review): a sign after e/E/p/P is NOT consumed here —
                // "1e+5" stops at '+'; confirm whether a later phase glues
                // the pieces back together.
                continue;
            }
            if (ch == 'x' || ch == 'X') {
                // Hex prefix would need special handling; simplified here:
                // keep the whole sequence as an integer (prefix retained).
                continue;
            }
            break;
        }
        if (maybe_float)
            token->type = SCC_TOK_FLOAT_LITERAL;
    } else if (ch == '\'') {
        // Character literal.
        token->type = SCC_TOK_CHAR_LITERAL;
        next_char(lexer, &lex, &cur); // opening '
        while (1) {
            if (!peek_char(lexer, &cur)) {
                LOG_ERROR("Unterminated character literal");
                break;
            }
            if (cur.character == '\'') {
                next_char(lexer, &lex, &cur); // closing quote
                break;
            }
            if (cur.character == '\\') {
                // Escape sequence: keep the backslash and the next
                // character verbatim in the lexeme.
                next_char(lexer, &lex, &cur);
                if (!peek_char(lexer, &cur))
                    break;
                next_char(lexer, &lex, &cur);
            } else {
                next_char(lexer, &lex, &cur);
            }
        }
    } else if (ch == '"') {
        // String literal.
        token->type = SCC_TOK_STRING_LITERAL;
        next_char(lexer, &lex, &cur); // opening "
        while (1) {
            if (!peek_char(lexer, &cur)) {
                LOG_ERROR("Unterminated string literal");
                break;
            }
            if (cur.character == '"') {
                next_char(lexer, &lex, &cur); // closing quote
                break;
            }
            if (cur.character == '\\') {
                // Escape sequence: keep backslash + next char verbatim.
                next_char(lexer, &lex, &cur);
                if (!peek_char(lexer, &cur))
                    break;
                next_char(lexer, &lex, &cur);
            } else {
                next_char(lexer, &lex, &cur);
            }
            scc_ring_consume(*lexer->stream_ref);
        }
    } else {
        // Operators and punctuation: consume the first char, then peek one
        // ahead to resolve multi-character operators.
        scc_sstream_char_t next = {0};
        next_char(lexer, &lex, &cur);
        peek_char(lexer, &next);
        switch (ch) {
        case '=':
            switch (next.character) {
            case '=':
                token->type = SCC_TOK_EQ;
                next_char(lexer, &lex, &cur);
                break;
            default:
                token->type = SCC_TOK_ASSIGN;
                break;
            }
            break;
        case '+':
            switch (next.character) {
            case '+':
                token->type = SCC_TOK_ADD_ADD;
                next_char(lexer, &lex, &cur);
                break;
            case '=':
                token->type = SCC_TOK_ASSIGN_ADD;
                next_char(lexer, &lex, &cur);
                break;
            default:
                token->type = SCC_TOK_ADD;
                break;
            }
            break;
        case '-':
            switch (next.character) {
            case '-':
                token->type = SCC_TOK_SUB_SUB;
                next_char(lexer, &lex, &cur);
                break;
            case '=':
                token->type = SCC_TOK_ASSIGN_SUB;
                next_char(lexer, &lex, &cur);
                break;
            case '>':
                token->type = SCC_TOK_DEREF; // "->" member access
                next_char(lexer, &lex, &cur);
                break;
            default:
                token->type = SCC_TOK_SUB;
                break;
            }
            break;
        case '*':
            switch (next.character) {
            case '=':
                token->type = SCC_TOK_ASSIGN_MUL;
                next_char(lexer, &lex, &cur);
                break;
            default:
                token->type = SCC_TOK_MUL;
                break;
            }
            break;
        case '%':
            switch (next.character) {
            case '=':
                token->type = SCC_TOK_ASSIGN_MOD;
                next_char(lexer, &lex, &cur);
                break;
            default:
                token->type = SCC_TOK_MOD;
                break;
            }
            break;
        case '&':
            switch (next.character) {
            case '&':
                token->type = SCC_TOK_AND_AND;
                next_char(lexer, &lex, &cur);
                break;
            case '=':
                token->type = SCC_TOK_ASSIGN_AND;
                next_char(lexer, &lex, &cur);
                break;
            default:
                token->type = SCC_TOK_AND;
                break;
            }
            break;
        case '|':
            switch (next.character) {
            case '|':
                token->type = SCC_TOK_OR_OR;
                next_char(lexer, &lex, &cur);
                break;
            case '=':
                token->type = SCC_TOK_ASSIGN_OR;
                next_char(lexer, &lex, &cur);
                break;
            default:
                token->type = SCC_TOK_OR;
                break;
            }
            break;
        case '^':
            switch (next.character) {
            case '=':
                token->type = SCC_TOK_ASSIGN_XOR;
                next_char(lexer, &lex, &cur);
                break;
            default:
                token->type = SCC_TOK_XOR;
                break;
            }
            break;
        case '<':
            switch (next.character) {
            case '=':
                token->type = SCC_TOK_LE;
                next_char(lexer, &lex, &cur);
                break;
            case '<': {
                // "<<" consumed; check for "<<=".
                next_char(lexer, &lex, &cur);
                if (peek_char(lexer, &next) && next.character == '=') {
                    token->type = SCC_TOK_ASSIGN_L_SH;
                    next_char(lexer, &lex, &cur);
                } else {
                    token->type = SCC_TOK_L_SH;
                }
                break;
            }
            default:
                token->type = SCC_TOK_LT;
                break;
            }
            break;
        case '>':
            switch (next.character) {
            case '=':
                token->type = SCC_TOK_GE;
                next_char(lexer, &lex, &cur);
                break;
            case '>': {
                // ">>" consumed; check for ">>=".
                next_char(lexer, &lex, &cur);
                if (peek_char(lexer, &next) && next.character == '=') {
                    token->type = SCC_TOK_ASSIGN_R_SH;
                    next_char(lexer, &lex, &cur);
                } else {
                    token->type = SCC_TOK_R_SH;
                }
                break;
            }
            default:
                token->type = SCC_TOK_GT;
                break;
            }
            break;
        case '~':
            token->type = SCC_TOK_BIT_NOT;
            break;
        case '!':
            switch (next.character) {
            case '=':
                token->type = SCC_TOK_NEQ;
                next_char(lexer, &lex, &cur);
                break;
            default:
                token->type = SCC_TOK_NOT;
                break;
            }
            break;
        /* clang-format off */
        case '[': token->type = SCC_TOK_L_BRACKET; break;
        case ']': token->type = SCC_TOK_R_BRACKET; break;
        case '(': token->type = SCC_TOK_L_PAREN; break;
        case ')': token->type = SCC_TOK_R_PAREN; break;
        case '{': token->type = SCC_TOK_L_BRACE; break;
        case '}': token->type = SCC_TOK_R_BRACE; break;
        case ';': token->type = SCC_TOK_SEMICOLON; break;
        case ',': token->type = SCC_TOK_COMMA; break;
        case ':': token->type = SCC_TOK_COLON; break;
        /* clang-format on */
        case '.':
            // NOTE(review): peek_char() appears non-advancing elsewhere in
            // this file, so the second peek here re-reads the SAME character
            // as `next` — ".." would then be classified as "..." and consume
            // an extra character. Confirm the peek semantics.
            if (next.character == '.' && peek_char(lexer, &next) &&
                next.character == '.') {
                token->type = SCC_TOK_ELLIPSIS;
                next_char(lexer, &lex, &cur);
                next_char(lexer, &lex, &cur);
            } else {
                token->type = SCC_TOK_DOT;
            }
            break;
        case '?':
            token->type = SCC_TOK_COND;
            break;
        case '#':
            if (next.character == '#') {
                token->type = SCC_TOK_SHARP_SHARP; // token-paste "##"
                next_char(lexer, &lex, &cur);
            } else
                token->type = SCC_TOK_SHARP;
            break;
        default:
            token->type = SCC_TOK_UNKNOWN;
            SCC_ERROR(start_loc, "unsupported character '%c' (0x%x)", ch, ch);
            break;
        }
    }
    // Finalize the token fields.
    scc_ring_consume(*lexer->stream_ref);
    token->type = token->type; // NOTE(review): self-assignment is a no-op (type was set above); candidate for removal
    token->loc = start_loc;
    token->lexeme = lex; // ownership of the lexeme transfers to the token
    LEX_DEBUG("get token `%s` (%s) at %s:%d:%d", scc_get_tok_name(token->type),
              scc_cstring_as_cstr(&token->lexeme), token->loc.name,
              token->loc.line, token->loc.col);
}
// Fetch the next parser-relevant token, skipping whitespace/newline and
// comment tokens (their lexemes are dropped as they are skipped).
// Asserts (fatal) on tokens whose subtype is SCC_TOK_SUBTYPE_INVALID.
//
// Bug fix: the original loop dropped an empty-space/comment token and then
// fell through to an unconditional `break`, returning the already-dropped
// token to the caller. It now `continue`s and fetches the next token, which
// is what the enclosing while(1) loop was clearly written for.
void scc_lexer_get_valid_token(scc_lexer_t *lexer, scc_lexer_tok_t *token) {
    scc_tok_subtype_t subtype;
    while (1) {
        scc_lexer_get_token(lexer, token);
        subtype = scc_get_tok_subtype(token->type);
        AssertFmt(subtype != SCC_TOK_SUBTYPE_INVALID,
                  "Invalid token: `%s` at %s:%d:%d",
                  scc_get_tok_name(token->type), token->loc.name,
                  token->loc.line, token->loc.col);
        if (subtype == SCC_TOK_SUBTYPE_EMPTYSPACE ||
            subtype == SCC_TOK_SUBTYPE_COMMENT) {
            scc_lexer_tok_drop(token); // discard lexeme of the skipped token
            continue;                  // keep scanning for a meaningful token
        }
        break;
    }
}
// Token-ring refill callback producing every raw token (including
// whitespace and comments). Returns false at EOF so the ring stops filling.
static cbool fill_token(scc_lexer_tok_t *out, void *userdata) {
    scc_lexer_t *self = userdata;
    scc_lexer_get_token(self, out);
    return out->type != SCC_TOK_EOF;
}
// Token-ring refill callback producing only parser-relevant tokens
// (whitespace/comments skipped). Returns false at EOF.
static cbool fill_valid_token(scc_lexer_tok_t *out, void *userdata) {
    scc_lexer_t *self = userdata;
    scc_lexer_get_valid_token(self, out);
    return out->type != SCC_TOK_EOF;
}
// Wrap the lexer in a token ring of `ring_size` entries. With `fill_all`
// the ring carries every raw token; otherwise only parser-relevant ones.
// Each call bumps the ring reference count; pair with scc_lexer_drop_ring().
scc_lexer_tok_ring_t *scc_lexer_to_ring(scc_lexer_t *lexer, int ring_size,
                                        cbool fill_all) {
    if (fill_all) {
        scc_ring_init(lexer->ring, ring_size, fill_token, lexer);
    } else {
        scc_ring_init(lexer->ring, ring_size, fill_valid_token, lexer);
    }
    lexer->ring_ref_count++;
    return &lexer->ring;
}
// Release one reference to the lexer's token ring; warns (and does
// nothing else) when the count is already zero.
void scc_lexer_drop_ring(scc_lexer_tok_ring_t *ring_ref) {
    scc_lexer_t *owner = ring_ref->userdata;
    if (owner->ring_ref_count == 0) {
        LOG_WARN("double drop sstream ring");
        return;
    }
    owner->ring_ref_count--;
}
// Destroy the lexer: frees its token ring and releases the character
// stream. Fatal if any token-ring references are still outstanding.
void scc_lexer_drop(scc_lexer_t *lexer) {
    Assert(lexer != null);
    if (lexer->ring_ref_count != 0) {
        LOG_FATAL("drop sstream must be drop ring before ref [%d]",
                  lexer->ring_ref_count);
    }
    scc_ring_free(lexer->ring);
    scc_sstream_drop_ring(lexer->stream_ref);
}