refactor(lex_parser): 移除旧的词法解析器实现并更新依赖

移除了 libs/lex_parser 目录下的所有头文件和源文件,包括:
- lex_parser.h 和 lex_parser.c 核心解析功能
- 所有测试文件(test_char.c, test_identifier.c, test_number.c,
  test_skip_block_comment.c, test_skip_line.c, test_string.c)

更新了 lexer 模块的依赖配置,将 lex_parser 替换为 sstream,
同时更新了 lexer.h 中的相关包含头文件和数据结构定义,
简化了 scc_lexer_t 结构体的字段。
This commit is contained in:
zzy
2026-02-16 16:56:40 +08:00
parent 088050c903
commit 0e7dec202a
30 changed files with 1840 additions and 1979 deletions

View File

@@ -1,4 +1,3 @@
#include <lex_parser.h>
#include <lexer.h>
#include <lexer_log.h>
@@ -13,442 +12,460 @@ static const struct {
};
// Binary-search NAME (LEN bytes, not necessarily NUL-terminated) in the
// sorted `keywords` table. Returns the matching table index, or -1 when the
// input is not a keyword.
// NOTE: this span previously contained both the pre- and post-refactor
// variants interleaved by the diff rendering; only the post-refactor
// (non-inline) definition is kept here.
static int keyword_cmp(const char *name, int len) {
  int low = 0;
  int high = sizeof(keywords) / sizeof(keywords[0]) - 1;
  while (low <= high) {
    int mid = (low + high) / 2;
    const char *key = keywords[mid].name;
    int cmp = 0;
    // Compare at most LEN bytes of NAME against the candidate keyword.
    for (int i = 0; i < len; i++) {
      if (name[i] != key[i]) {
        cmp = (unsigned char)name[i] - (unsigned char)key[i];
        break;
      }
      if (name[i] == '\0')
        break; // embedded terminator: stop early
    }
    if (cmp == 0) {
      // All LEN bytes matched; it is a true keyword only if the table entry
      // also ends exactly at LEN.
      if (key[len] == '\0')
        return mid;
      cmp = -1; // keyword is longer than the input -> search lower half
    }
    if (cmp < 0)
      high = mid - 1;
    else
      low = mid + 1;
  }
  return -1; // not a keyword
}
void scc_lexer_init(scc_lexer_t *lexer, scc_probe_stream_t *stream) {
lexer->stream = stream;
lexer->pos = scc_pos_create();
// FIXME
lexer->pos.name = scc_cstring_copy(&stream->name);
void scc_lexer_init(scc_lexer_t *lexer, scc_sstream_ring_t *stream_ref) {
lexer->stream_ref = *stream_ref;
lexer->jump_macro = false;
}
// True for horizontal whitespace: space, tab, vertical tab, form feed.
// Line endings are classified separately by is_newline().
static inline cbool is_whitespace(int ch) {
  switch (ch) {
  case ' ':
  case '\t':
  case '\v':
  case '\f':
    return 1;
  default:
    return 0;
  }
}
// True for line-ending characters '\n' and '\r'.
static inline cbool is_newline(int ch) {
  switch (ch) {
  case '\n':
  case '\r':
    return 1;
  default:
    return 0;
  }
}
// True for decimal digits '0'..'9'.
static inline cbool is_digit(int ch) { return '0' <= ch && ch <= '9'; }
// True for ASCII letters (either case).
static inline cbool is_alpha(int ch) {
  return ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z');
}
// True for ASCII letters and decimal digits.
static inline cbool is_alnum(int ch) { return is_digit(ch) || is_alpha(ch); }
// True for characters that may begin a C identifier: '_' or a letter.
static inline cbool is_identifier_start(int ch) {
  return ch == '_' || is_alpha(ch);
}
// True for characters that may continue a C identifier: '_', letter, digit.
static inline cbool is_identifier_part(int ch) {
  return ch == '_' || is_alnum(ch);
}
// True for octal digits '0'..'7'.
static inline cbool is_octal_digit(int ch) { return '0' <= ch && ch <= '7'; }
// True for hexadecimal digits: 0-9, a-f, A-F.
static inline cbool is_hex_digit(int ch) {
  return is_digit(ch) || ('a' <= ch && ch <= 'f') || ('A' <= ch && ch <= 'F');
}
/* Peek the next character from the ring buffer without consuming it.
 * The character (and its source position) is written into *out.
 * Returns false at end of input. */
static inline cbool peek_char(scc_lexer_t *lexer, scc_sstream_char_t *out) {
  cbool has_more;
  scc_ring_peek(lexer->stream_ref, *out, has_more);
  return has_more;
}
/* Consume the next character from the ring buffer; on success it is appended
 * to LEXEME and true is returned. Returns false at end of input, in which
 * case LEXEME is left untouched. */
static inline cbool next_char(scc_lexer_t *lexer, scc_cstring_t *lexeme,
                              scc_sstream_char_t *out) {
  cbool got_one;
  scc_ring_next(lexer->stream_ref, *out, got_one);
  if (got_one)
    scc_cstring_append_ch(lexeme, out->character);
  return got_one;
}
/* Mark TOKEN as a lexing error (unknown token type). */
#define set_err_token(token) ((token)->type = SCC_TOK_UNKNOWN)

/* Parse a `# line <num> "<file>"` line-control directive from the probe
 * stream, updating TOKEN's location with the new line number and file name.
 * On any malformed input the rest of the physical line is skipped and TOKEN
 * is marked as an error via set_err_token(). */
static void parse_line(scc_lexer_t *lexer, scc_lexer_tok_t *token) {
  token->loc = lexer->pos;
  scc_probe_stream_t *stream = lexer->stream;
  scc_probe_stream_reset(stream);
  int ch = scc_probe_stream_next(stream);
  usize n;
  scc_cstring_t str = scc_cstring_create();
  if (ch == scc_stream_eof) {
    LEX_WARN("Unexpected EOF at begin");
    goto ERR;
  } else if (ch != '#') {
    LEX_WARN("Unexpected character '%c' at begin", ch);
    goto ERR;
  }
  const char line[] = "line";
  // NOTE(review): sizeof(line) is 5 (it includes the NUL terminator), so this
  // loop runs one extra iteration and compares the character *after* "line"
  // against '\0'. That looks like an off-by-one (`sizeof(line) - 1`
  // expected) — TODO confirm against the probe-stream semantics.
  for (int i = 0; i < (int)sizeof(line); i++) {
    ch = scc_probe_stream_consume(stream);
    scc_pos_next(&lexer->pos);
    if (ch != line[i]) {
      // NOTE(review): "Maroc" is a typo for "Macro" in this message.
      LEX_WARN("Maroc does not support in lexer rather in preprocessor, "
               "it will be ignored");
      goto SKIP_LINE;
    }
  }
  // Parse the numeric line argument of the directive.
  if (scc_lex_parse_number(stream, &lexer->pos, &n) == false) {
    LEX_ERROR("Invalid line number");
    goto SKIP_LINE;
  }
  if (scc_probe_stream_consume(stream) != ' ') {
    // NOTE(review): token->value.u is never written in this function, so this
    // reads an indeterminate value — should this be `n`? Also there is no
    // sync/return here, so control falls through to the '"' check below and
    // will normally take the SKIP_LINE error path — verify intent.
    scc_lex_parse_skip_line(stream, &lexer->pos);
    token->loc.line = token->value.u;
  }
  // Expect the quoted file name after the line number.
  if (scc_probe_stream_next(stream) != '"') {
    LEX_ERROR("Invalid `#` line");
    goto SKIP_LINE;
  }
  if (scc_lex_parse_string(stream, &lexer->pos, &str) == false) {
    LEX_ERROR("Invalid filename");
    goto SKIP_LINE;
  }
  // Discard the rest of the line and commit the probed characters.
  scc_lex_parse_skip_line(stream, &lexer->pos);
  scc_probe_stream_sync(stream);
  token->loc.line = n;
  // FIXME memory leak (pre-existing note: the copied name is never freed)
  token->loc.name = scc_cstring_copy(&str);
  scc_cstring_free(&str);
  return;
SKIP_LINE:
  scc_lex_parse_skip_line(stream, &lexer->pos);
  scc_probe_stream_sync(stream);
ERR:
  set_err_token(token);
  scc_cstring_free(&str);
}
// /zh/c/language/operator_arithmetic.html
void scc_lexer_get_token(scc_lexer_t *lexer, scc_lexer_tok_t *token) {
token->loc = lexer->pos;
token->type = SCC_TOK_UNKNOWN;
scc_probe_stream_t *stream = lexer->stream;
scc_sstream_char_t cur;
scc_cstring_t lex = scc_cstring_create(); // 临时lexeme
scc_probe_stream_reset(stream);
scc_tok_type_t type = SCC_TOK_UNKNOWN;
int ch = scc_probe_stream_next(stream);
// 尝试预览第一个字符
if (!peek_char(lexer, &cur)) {
token->type = SCC_TOK_EOF;
token->loc = (scc_pos_t){0, 1, 1, 0}; // 默认位置
token->lexeme = lex; // 空字符串
return;
}
// 记录起始位置
scc_pos_t start_loc = cur.pos;
int ch = cur.character;
// once step
switch (ch) {
case '=':
switch (scc_probe_stream_next(stream)) {
case '=':
type = SCC_TOK_EQ;
goto double_char;
default:
scc_probe_stream_reset(stream), type = SCC_TOK_ASSIGN;
break;
if (is_whitespace(ch)) {
// 空白符: 连续收集
token->type = SCC_TOK_BLANK;
while (peek_char(lexer, &cur) && is_whitespace(cur.character)) {
next_char(lexer, &lex, &cur);
}
break;
case '+':
switch (scc_probe_stream_next(stream)) {
case '+':
type = SCC_TOK_ADD_ADD;
goto double_char;
case '=':
type = SCC_TOK_ASSIGN_ADD;
goto double_char;
default:
scc_probe_stream_reset(stream), type = SCC_TOK_ADD;
break;
}
break;
case '-':
switch (scc_probe_stream_next(stream)) {
case '-':
type = SCC_TOK_SUB_SUB;
goto double_char;
case '=':
type = SCC_TOK_ASSIGN_SUB;
goto double_char;
case '>':
type = SCC_TOK_DEREF;
goto double_char;
default:
scc_probe_stream_reset(stream), type = SCC_TOK_SUB;
break;
}
break;
case '*':
switch (scc_probe_stream_next(stream)) {
case '=':
type = SCC_TOK_ASSIGN_MUL;
goto double_char;
default:
scc_probe_stream_reset(stream), type = SCC_TOK_MUL;
break;
}
break;
case '/':
switch (scc_probe_stream_next(stream)) {
case '=':
type = SCC_TOK_ASSIGN_DIV;
goto double_char;
case '/':
scc_probe_stream_reset(stream);
scc_lex_parse_skip_line(stream, &lexer->pos);
scc_probe_stream_sync(stream);
token->type = SCC_TOK_LINE_COMMENT;
goto END;
case '*':
scc_probe_stream_reset(stream);
scc_lex_parse_skip_block_comment(stream, &lexer->pos);
scc_probe_stream_sync(stream);
token->type = SCC_TOK_BLOCK_COMMENT;
goto END;
default:
scc_probe_stream_reset(stream), type = SCC_TOK_DIV;
break;
}
break;
case '%':
switch (scc_probe_stream_next(stream)) {
case '=':
type = SCC_TOK_ASSIGN_MOD;
goto double_char;
default:
scc_probe_stream_reset(stream), type = SCC_TOK_MOD;
break;
}
break;
case '&':
switch (scc_probe_stream_next(stream)) {
case '&':
type = SCC_TOK_AND_AND;
goto double_char;
case '=':
type = SCC_TOK_ASSIGN_AND;
goto double_char;
default:
scc_probe_stream_reset(stream), type = SCC_TOK_AND;
break;
}
break;
case '|':
switch (scc_probe_stream_next(stream)) {
case '|':
type = SCC_TOK_OR_OR;
goto double_char;
case '=':
type = SCC_TOK_ASSIGN_OR;
goto double_char;
default:
scc_probe_stream_reset(stream), type = SCC_TOK_OR;
break;
}
break;
case '^':
switch (scc_probe_stream_next(stream)) {
case '=':
type = SCC_TOK_ASSIGN_XOR;
goto double_char;
default:
scc_probe_stream_reset(stream), type = SCC_TOK_XOR;
break;
}
break;
case '<':
switch (scc_probe_stream_next(stream)) {
case '=':
type = SCC_TOK_LE;
goto double_char;
case '<': {
if (scc_probe_stream_next(stream) == '=') {
type = SCC_TOK_ASSIGN_L_SH;
goto triple_char;
} else {
type = SCC_TOK_L_SH;
goto double_char;
}
break;
}
default:
scc_probe_stream_reset(stream), type = SCC_TOK_LT;
break;
}
break;
case '>':
switch (scc_probe_stream_next(stream)) {
case '=':
type = SCC_TOK_GE;
goto double_char;
case '>': {
if (scc_probe_stream_next(stream) == '=') {
type = SCC_TOK_ASSIGN_R_SH;
goto triple_char;
} else {
type = SCC_TOK_R_SH;
goto double_char;
}
break;
}
default:
scc_probe_stream_reset(stream), type = SCC_TOK_GT;
break;
}
break;
case '~':
type = SCC_TOK_BIT_NOT;
break;
case '!':
switch (scc_probe_stream_next(stream)) {
case '=':
type = SCC_TOK_NEQ;
goto double_char;
default:
scc_probe_stream_reset(stream), type = SCC_TOK_NOT;
break;
}
break;
/* clang-format off */
case '[': type = SCC_TOK_L_BRACKET; break;
case ']': type = SCC_TOK_R_BRACKET; break;
case '(': type = SCC_TOK_L_PAREN; break;
case ')': type = SCC_TOK_R_PAREN; break;
case '{': type = SCC_TOK_L_BRACE; break;
case '}': type = SCC_TOK_R_BRACE; break;
case ';': type = SCC_TOK_SEMICOLON; break;
case ',': type = SCC_TOK_COMMA; break;
case ':': type = SCC_TOK_COLON; break;
/* clang-format on */
case '.':
if (scc_probe_stream_next(stream) == '.' &&
scc_probe_stream_next(stream) == '.') {
type = SCC_TOK_ELLIPSIS;
goto triple_char;
}
type = SCC_TOK_DOT;
break;
case '?':
type = SCC_TOK_COND;
break;
case '\v':
case '\f':
case ' ':
case '\t':
type = SCC_TOK_BLANK;
break;
case '\r':
case '\n':
scc_probe_stream_back(stream);
scc_lex_parse_skip_endline(stream, &lexer->pos);
scc_probe_stream_sync(stream);
} else if (is_newline(ch)) {
// 换行符:处理 \r 或 \n以及 \r\n 组合
token->type = SCC_TOK_ENDLINE;
goto END;
case '#':
parse_line(lexer, token);
token->type = SCC_TOK_SHARP;
goto END;
case '\0':
case scc_stream_eof:
// EOF
type = SCC_TOK_EOF;
break;
case '\'': {
token->loc = lexer->pos;
next_char(lexer, &lex, &cur); // 消费第一个字符
if (ch == '\r') {
// 尝试消费后面的 \n
if (peek_char(lexer, &cur) && cur.character == '\n') {
next_char(lexer, &lex, &cur);
}
}
} else if (ch == '/') {
// 可能为注释或除号
scc_sstream_char_t next = {0};
next_char(lexer, &lex, &cur); // 消费 '/'
peek_char(lexer, &next);
if (next.character == '=') {
token->type = SCC_TOK_ASSIGN_DIV;
next_char(lexer, &lex, &cur);
} else if (next.character == '/') {
// 行注释 //
token->type = SCC_TOK_LINE_COMMENT;
next_char(lexer, &lex, &cur); // 消费 '/'
while (peek_char(lexer, &cur) && !is_newline(cur.character)) {
next_char(lexer, &lex, &cur);
scc_ring_consume(lexer->stream_ref);
}
// 注释结束不包含换行符换行符单独成token
} else if (next.character == '*') {
// 块注释 /*
token->type = SCC_TOK_BLOCK_COMMENT;
next_char(lexer, &lex, &cur); // 消费 '*'
while (1) {
if (!next_char(lexer, &lex, &cur)) {
// 文件结束,注释未闭合
LOG_ERROR("Unterminated block comment");
break;
}
if (cur.character == '*' && peek_char(lexer, &next) &&
next.character == '/') {
next_char(lexer, &lex, &cur); // 消费 '/'
break;
}
scc_ring_consume(lexer->stream_ref);
}
} else {
// 只是除号 /
token->type = SCC_TOK_DIV;
}
} else if (is_identifier_start(ch)) {
// 标识符或关键字
token->type = SCC_TOK_IDENT; // 暂定
while (peek_char(lexer, &cur) && is_identifier_part(cur.character)) {
next_char(lexer, &lex, &cur);
scc_ring_consume(lexer->stream_ref);
}
// 检查是否为关键字
int idx = keyword_cmp(scc_cstring_as_cstr(&lex), scc_cstring_len(&lex));
if (idx != -1) {
token->type = keywords[idx].tok;
}
} else if (is_digit(ch)) {
// 数字字面量(整数/浮点)
token->type = SCC_TOK_INT_LITERAL; // 先假定整数
cbool maybe_float = false;
while (1) {
next_char(lexer, &lex, &cur); // 消费当前数字
if (!peek_char(lexer, &cur))
break;
ch = cur.character;
if (is_digit(ch) || (ch == '.' && !maybe_float)) {
if (ch == '.')
maybe_float = true;
continue;
}
if (ch == 'e' || ch == 'E' || ch == 'p' || ch == 'P') {
maybe_float = true;
// 后面可能跟符号或数字
continue;
}
if (ch == 'x' || ch == 'X') {
// 十六进制前缀,需特殊处理
// 这里简化:将整个序列作为整数(保留前缀)
continue;
}
break;
}
if (maybe_float)
token->type = SCC_TOK_FLOAT_LITERAL;
} else if (ch == '\'') {
// 字符字面量
token->type = SCC_TOK_CHAR_LITERAL;
scc_probe_stream_reset(stream);
int ch = scc_lex_parse_char(stream, &lexer->pos);
scc_probe_stream_sync(stream);
if (ch == scc_stream_eof) {
LEX_ERROR("Unexpected character literal");
token->type = SCC_TOK_UNKNOWN;
} else {
token->value.ch = ch;
next_char(lexer, &lex, &cur); // 开头的 '
while (1) {
if (!peek_char(lexer, &cur)) {
LOG_ERROR("Unterminated character literal");
break;
}
if (cur.character == '\'') {
next_char(lexer, &lex, &cur); // 闭引号
break;
}
if (cur.character == '\\') {
// 转义序列:原样保存反斜杠和下一个字符
next_char(lexer, &lex, &cur);
if (!peek_char(lexer, &cur))
break;
next_char(lexer, &lex, &cur);
} else {
next_char(lexer, &lex, &cur);
}
}
goto END;
}
case '"': {
token->loc = lexer->pos;
} else if (ch == '"') {
// 字符串字面量
token->type = SCC_TOK_STRING_LITERAL;
scc_cstring_t output = scc_cstring_create();
scc_probe_stream_reset(stream);
if (scc_lex_parse_string(stream, &lexer->pos, &output) == true) {
scc_probe_stream_sync(stream);
token->value.cstr.data = scc_cstring_as_cstr(&output);
token->value.cstr.len = scc_cstring_len(&output);
} else {
LEX_ERROR("Unexpected string literal");
next_char(lexer, &lex, &cur); // 开头的 "
while (1) {
if (!peek_char(lexer, &cur)) {
LOG_ERROR("Unterminated string literal");
break;
}
if (cur.character == '"') {
next_char(lexer, &lex, &cur); // 闭引号
break;
}
if (cur.character == '\\') {
// 转义序列
next_char(lexer, &lex, &cur);
if (!peek_char(lexer, &cur))
break;
next_char(lexer, &lex, &cur);
} else {
next_char(lexer, &lex, &cur);
}
scc_ring_consume(lexer->stream_ref);
}
} else {
scc_sstream_char_t next = {0};
next_char(lexer, &lex, &cur);
peek_char(lexer, &next);
switch (ch) {
case '=':
switch (next.character) {
case '=':
token->type = SCC_TOK_EQ;
next_char(lexer, &lex, &cur);
break;
default:
token->type = SCC_TOK_ASSIGN;
break;
}
break;
case '+':
switch (next.character) {
case '+':
token->type = SCC_TOK_ADD_ADD;
next_char(lexer, &lex, &cur);
break;
case '=':
token->type = SCC_TOK_ASSIGN_ADD;
next_char(lexer, &lex, &cur);
break;
default:
token->type = SCC_TOK_ADD;
break;
}
break;
case '-':
switch (next.character) {
case '-':
token->type = SCC_TOK_SUB_SUB;
next_char(lexer, &lex, &cur);
break;
case '=':
token->type = SCC_TOK_ASSIGN_SUB;
next_char(lexer, &lex, &cur);
break;
case '>':
token->type = SCC_TOK_DEREF;
next_char(lexer, &lex, &cur);
break;
default:
token->type = SCC_TOK_SUB;
break;
}
break;
case '*':
switch (next.character) {
case '=':
token->type = SCC_TOK_ASSIGN_MUL;
next_char(lexer, &lex, &cur);
break;
default:
token->type = SCC_TOK_MUL;
break;
}
break;
case '%':
switch (next.character) {
case '=':
token->type = SCC_TOK_ASSIGN_MOD;
next_char(lexer, &lex, &cur);
break;
default:
token->type = SCC_TOK_MOD;
break;
}
break;
case '&':
switch (next.character) {
case '&':
token->type = SCC_TOK_AND_AND;
next_char(lexer, &lex, &cur);
break;
case '=':
token->type = SCC_TOK_ASSIGN_AND;
next_char(lexer, &lex, &cur);
break;
default:
token->type = SCC_TOK_AND;
break;
}
break;
case '|':
switch (next.character) {
case '|':
token->type = SCC_TOK_OR_OR;
next_char(lexer, &lex, &cur);
break;
case '=':
token->type = SCC_TOK_ASSIGN_OR;
next_char(lexer, &lex, &cur);
break;
default:
token->type = SCC_TOK_OR;
break;
}
break;
case '^':
switch (next.character) {
case '=':
token->type = SCC_TOK_ASSIGN_XOR;
next_char(lexer, &lex, &cur);
break;
default:
token->type = SCC_TOK_XOR;
break;
}
break;
case '<':
switch (next.character) {
case '=':
token->type = SCC_TOK_LE;
next_char(lexer, &lex, &cur);
break;
case '<': {
next_char(lexer, &lex, &cur);
if (peek_char(lexer, &next) && next.character == '=') {
token->type = SCC_TOK_ASSIGN_L_SH;
next_char(lexer, &lex, &cur);
} else {
token->type = SCC_TOK_L_SH;
}
break;
}
default:
token->type = SCC_TOK_LT;
break;
}
break;
case '>':
switch (next.character) {
case '=':
token->type = SCC_TOK_GE;
next_char(lexer, &lex, &cur);
break;
case '>': {
next_char(lexer, &lex, &cur);
if (peek_char(lexer, &next) && next.character == '=') {
token->type = SCC_TOK_ASSIGN_R_SH;
next_char(lexer, &lex, &cur);
} else {
token->type = SCC_TOK_R_SH;
}
break;
}
default:
token->type = SCC_TOK_GT;
break;
}
break;
case '~':
token->type = SCC_TOK_BIT_NOT;
break;
case '!':
switch (next.character) {
case '=':
token->type = SCC_TOK_NEQ;
next_char(lexer, &lex, &cur);
break;
default:
token->type = SCC_TOK_NOT;
break;
}
break;
/* clang-format off */
case '[': token->type = SCC_TOK_L_BRACKET; break;
case ']': token->type = SCC_TOK_R_BRACKET; break;
case '(': token->type = SCC_TOK_L_PAREN; break;
case ')': token->type = SCC_TOK_R_PAREN; break;
case '{': token->type = SCC_TOK_L_BRACE; break;
case '}': token->type = SCC_TOK_R_BRACE; break;
case ';': token->type = SCC_TOK_SEMICOLON; break;
case ',': token->type = SCC_TOK_COMMA; break;
case ':': token->type = SCC_TOK_COLON; break;
/* clang-format on */
case '.':
if (next.character == '.' && peek_char(lexer, &next) &&
next.character == '.') {
token->type = SCC_TOK_ELLIPSIS;
next_char(lexer, &lex, &cur);
next_char(lexer, &lex, &cur);
} else {
token->type = SCC_TOK_DOT;
}
break;
case '?':
token->type = SCC_TOK_COND;
break;
case '#':
token->type = SCC_TOK_SHARP;
break;
default:
token->type = SCC_TOK_UNKNOWN;
LEX_ERROR("unsupport char in sourse code `%c`:0x%x", ch, ch);
break;
}
goto END;
}
/* clang-format off */
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
/* clang-format on */
token->loc = lexer->pos;
token->type = SCC_TOK_INT_LITERAL;
usize output;
scc_probe_stream_reset(stream);
if (scc_lex_parse_number(stream, &lexer->pos, &output) == true) {
scc_probe_stream_sync(stream);
token->value.u = output;
} else {
LEX_ERROR("Unexpected number literal");
token->type = SCC_TOK_UNKNOWN;
}
goto END;
/* clang-format off */
case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
case 'v': case 'w': case 'x': case 'y': case 'z':
case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
case 'V': case 'W': case 'X': case 'Y': case 'Z': case '_':
/* clang-format on */
scc_cstring_t str = scc_cstring_create();
scc_probe_stream_reset(stream);
cbool ret = scc_lex_parse_identifier(stream, &lexer->pos, &str);
scc_probe_stream_sync(stream);
Assert(ret == true);
int res = keyword_cmp(scc_cstring_as_cstr(&str), scc_cstring_len(&str));
if (res == -1) {
token->value.cstr.data = (char *)scc_cstring_as_cstr(&str);
token->value.cstr.len = scc_cstring_len(&str);
type = SCC_TOK_IDENT;
} else {
scc_cstring_free(&str);
type = keywords[res].tok;
}
token->type = type;
goto END;
default:
LEX_ERROR("unsupport char in sourse code `%c`:0x%x", ch, ch);
break;
}
goto once_char;
triple_char:
scc_probe_stream_consume(stream);
scc_pos_next(&lexer->pos);
double_char:
scc_probe_stream_consume(stream);
scc_pos_next(&lexer->pos);
once_char:
scc_probe_stream_consume(stream);
scc_pos_next(&lexer->pos);
token->type = type;
END:
LEX_DEBUG("get token `%s` in %s:%d:%d", scc_get_tok_name(token->type),
token->loc.name, token->loc.line, token->loc.column);
// 设置token
scc_ring_consume(lexer->stream_ref);
token->type = token->type; // 上面已设
token->loc = start_loc;
token->lexeme = lex; // 转移所有权
LEX_DEBUG("get token `%s` (%s) at %s:%d:%d", scc_get_tok_name(token->type),
scc_cstring_as_cstr(&token->lexeme), token->loc.name,
token->loc.line, token->loc.col);
}
// Fetch the next *significant* token: wraps scc_lexer_get_token and skips
// whitespace and comment tokens (which the parser has no use for), asserting
// that every produced token has a valid subtype.
// NOTE: the diff rendering had interleaved the pre-refactor variant (local
// named `type`) with the post-refactor one; only the post-refactor
// definition is kept here.
void scc_lexer_get_valid_token(scc_lexer_t *lexer, scc_lexer_tok_t *token) {
  scc_tok_subtype_t subtype;
  do {
    scc_lexer_get_token(lexer, token);
    subtype = scc_get_tok_subtype(token->type);
    AssertFmt(subtype != SCC_TOK_SUBTYPE_INVALID,
              "Invalid token: `%s` at %s:%d:%d",
              scc_get_tok_name(token->type), token->loc.name,
              token->loc.line, token->loc.col);
  } while (subtype == SCC_TOK_SUBTYPE_EMPTYSPACE ||
           subtype == SCC_TOK_SUBTYPE_COMMENT);
}

66
libs/lexer/src/main.c Normal file
View File

@@ -0,0 +1,66 @@
#include <lexer.h>
#include <lexer_log.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/// gcc -g ../lexer.c ../token.c test_lexer.c -o test_lexer
/*
tok_tConstant {
int have;
union {
char ch;
int i;
float f;
double d;
long long ll;
char* str;
};
};
*/
// File-scope declarations. This file is its own default lexer input (see
// the __FILE__ default in main), so presumably these exist to give the
// self-lexing test some global-variable tokens to scan —
// NOTE(review): they are otherwise unused; confirm before removing.
int g_num;
int g_num_arr[3];
/* Self-test driver: lex a file (argv[1], defaulting to this source file
 * itself) and print every significant token until EOF. Pass a second
 * argument `--debug` to enable verbose logging. */
int main(int argc, char *argv[]) {
  // Verbose logging only when invoked as: lexer <file> --debug
  if (argc == 3 && strcmp(argv[2], "--debug") == 0) {
    log_set_level(NULL, LOG_LEVEL_ALL);
  } else {
    // FIXME it is a hack lexer_logger
    log_set_level(&__scc_lexer_log, LOG_LEVEL_NOTSET);
    log_set_level(NULL, LOG_LEVEL_INFO | LOG_LEVEL_WARN | LOG_LEVEL_ERROR |
                            LOG_LEVEL_FATAL);
  }
  // Default input: lex this very source file.
  const char *input_path = __FILE__;
  if (argc == 2)
    input_path = argv[1];
  scc_sstream_t source;
  scc_sstream_init(&source, input_path, 16);
  scc_sstream_ring_t *ring = scc_sstream_ref_ring(&source);
  scc_lexer_t lx;
  scc_lexer_init(&lx, ring);
  for (;;) {
    scc_lexer_tok_t tok;
    scc_lexer_get_valid_token(&lx, &tok);
    if (tok.type == SCC_TOK_EOF)
      break;
    LOG_DEBUG("get token [%-8s] `%s` at %s:%d:%d",
              scc_get_tok_name(tok.type),
              scc_cstring_as_cstr(&tok.lexeme), tok.loc.name,
              tok.loc.line, tok.loc.col);
  }
  // Release the ring reference before dropping the stream itself.
  scc_sstream_drop_ring(ring);
  scc_sstream_drop(&source);
  LOG_INFO("Lexer is Ok...");
  return 0;
}