feat(frontend): refactor the lexer

- Add a .gitignore file to ignore compiler-generated binaries
- Refactor lexer.c, improving keyword handling (see the keyword-table sketch after this list) and string handling
- Update the front end's parser and AST files to work with the new lexer
- Rework token-related definitions and functions, introducing new token types
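For context, the keyword table touched in the first hunk is generated with an X macro. KEYWORD_TABLE itself is not part of this commit, so the entries, token names, and the linear scan below are illustrative assumptions; only the struct shape, the X(...) expansion, and the `return -1` convention mirror the diff:

#include <string.h>

/* Hypothetical entries; the real KEYWORD_TABLE lives elsewhere in the repo.
 * The trailing 0 feeds the macro's `...` parameter. */
#define KEYWORD_TABLE \
    X(if,     CSTD_C89, TOKEN_IF,     0) \
    X(while,  CSTD_C89, TOKEN_WHILE,  0) \
    X(return, CSTD_C89, TOKEN_RETURN, 0)

enum CSTD_KEYWORD { CSTD_C89, CSTD_C99 };
typedef enum { TOKEN_IF, TOKEN_WHILE, TOKEN_RETURN } cc_tktype_t;

static const struct {
    const char* name;           // spelling, produced by #name
    enum CSTD_KEYWORD std_type; // which C standard introduced it
    cc_tktype_t tok;            // token kind handed to the parser
} keywords[] = {
#define X(name, std_type, tok, ...) { #name, std_type, tok },
    KEYWORD_TABLE
#undef X
};

/* Simple linear probe; the repo's keyword_cmp may well be smarter. */
static int keyword_cmp(const char* name, int len) {
    for (int i = 0; i < (int)(sizeof(keywords) / sizeof(keywords[0])); i++) {
        if ((int)strlen(keywords[i].name) == len &&
            strncmp(keywords[i].name, name, len) == 0)
            return i;
    }
    return -1; // Not a keyword.
}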
commit 2b4857001c (parent 05c637e594)
Author: ZZY
Date: 2025-03-23 12:13:16 +08:00

33 changed files with 532 additions and 624 deletions

lexer.c

@@ -34,7 +34,7 @@ David Hanson / drh@drhanson.net
 static const struct {
     const char* name;
     enum CSTD_KEYWORD std_type;
-    tok_type_t tok;
+    cc_tktype_t tok;
 } keywords[] = {
 #define X(name, std_type, tok, ...) { #name, std_type, tok },
     KEYWORD_TABLE
@@ -74,19 +74,17 @@ static inline int keyword_cmp(const char* name, int len) {
     return -1; // Not a keyword.
 }
 
-void init_lexer(lexer_t* lexer, const char* file_name, void* stream, lexer_sread_fn sread) {
-    init_lib_core();
-    lexer->cur_ptr = lexer->end_ptr = (unsigned char*)&(lexer->buffer);
-    lexer->index = 1;
-    lexer->line = 1;
+void init_lexer(lexer_t* lexer, const char* file_name, void* stream, lexer_sread_fn sread, strpool_t* strpool) {
+    lexer->strpool = strpool;
+    lexer->cur_ptr = lexer->end_ptr = (char*)&(lexer->buffer);
+    lexer->loc.fname = strpool_intern(lexer->strpool, file_name);
+    lexer->loc.line = 1;
+    lexer->loc.col = 1;
     lexer->stream = stream;
     lexer->sread = sread;
-    for (int i = 0; i < sizeof(lexer->buffer) / sizeof(lexer->buffer[0]); i++) {
-        lexer->buffer[i] = 0;
-    }
+    rt_memset(lexer->buffer, 0, sizeof(lexer->buffer));
 }
 
 static void flush_buffer(lexer_t* lexer) {
@@ -94,7 +92,7 @@ static void flush_buffer(lexer_t* lexer) {
     for (int i = 0; i < num; i++) {
         lexer->buffer[i] = lexer->cur_ptr[i];
     }
-    lexer->cur_ptr = (unsigned char*)lexer->buffer;
+    lexer->cur_ptr = lexer->buffer;
     int read_size = LEXER_BUFFER_SIZE - num;
     // TODO rt_size_t to int maybe lose precision
@@ -128,19 +126,20 @@ static void goto_block_comment(lexer_t* lexer) {
             flush_buffer(lexer);
         }
-        if (*lexer->cur_ptr == '\0') {
+        if (lexer->cur_ptr[0] == '\0') {
             break;
         } else if (lexer->cur_ptr[0] == '*' && lexer->cur_ptr[1] == '/') {
             lexer->cur_ptr += 2;
             break;
         } else {
+            if (lexer->cur_ptr[0] == '\n') lexer->loc.line++;
             lexer->cur_ptr++;
         }
     }
 }
 
 // TODO escape character not enough
-static char got_slash(unsigned char* peek) {
+static char got_slash(char* peek) {
     switch (*peek) {
     case '\\': return '\\';
     case '\'': return '\'';
@@ -162,7 +161,7 @@ static char got_slash(unsigned char* peek) {
 static void parse_char_literal(lexer_t* lexer, tok_t* token) {
     char val = 0;
-    unsigned char* peek = lexer->cur_ptr + 1;
+    char* peek = lexer->cur_ptr + 1;
     if (*peek == '\\') {
         peek++;
         val = got_slash(peek);
@@ -172,16 +171,14 @@ static void parse_char_literal(lexer_t* lexer, tok_t* token) {
     }
     if (*peek++ != '\'') LEX_ERROR("Unclosed character literal");
+    token->val.ch = val;
     lexer->cur_ptr = peek;
-    token->val.have = 1;
-    token->type = TOKEN_CHAR_LITERAL;
-    token->val.ch = val;
 }
 
 static void parse_string_literal(lexer_t* lexer, tok_t* token) {
-    unsigned char* peek = lexer->cur_ptr + 1;
+    char* peek = lexer->cur_ptr + 1;
     // TODO string literal size check
-    char* dest = token->val.str = rt._malloc(LEXER_MAX_TOKEN_SIZE + 1);
+    static char dest[LEXER_MAX_TOKEN_SIZE + 1];
     int len = 0;
     while (*peek != '"') {
@@ -196,14 +193,15 @@ static void parse_string_literal(lexer_t* lexer, tok_t* token) {
         dest[len++] = *peek++;
     }
     dest[len] = '\0';
-    lexer->cur_ptr = peek + 1;
-    token->val.have = 1;
-    token->type = TOKEN_STRING_LITERAL;
+    lexer->cur_ptr = peek + 1; // 1 is `"`
+    lexer->loc.len = len + 2;  // 2 is `"` `"`
+    token->val.str = strpool_intern(lexer->strpool, dest);
 }
 
 // FIXME it write by AI maybe error
 static void parse_number(lexer_t* lexer, tok_t* token) {
-    unsigned char* peek = lexer->cur_ptr;
+    char* peek = lexer->cur_ptr;
     int base = 10;
     int is_float = 0;
     long long int_val = 0;
@@ -278,14 +276,15 @@ static void parse_number(lexer_t* lexer, tok_t* token) {
     }
     // store the result
     // TODO
+    lexer->loc.len = peek - lexer->cur_ptr;
     lexer->cur_ptr = peek;
-    token->val.have = 1;
     if (is_float) {
-        token->val.d = float_val;
-        token->type = TOKEN_FLOAT_LITERAL;
+        token->val.f32 = float_val;
+        token->sub_type = TOKEN_FLOAT_LITERAL;
     } else {
-        token->val.ll = int_val;
-        token->type = TOKEN_INT_LITERAL;
+        token->val.i = int_val;
+        token->sub_type = TOKEN_INT_LITERAL;
     }
 }
@@ -296,160 +295,159 @@ void get_token(lexer_t* lexer, tok_t* token) {
     if (lexer->end_ptr - lexer->cur_ptr < GOT_ONE_TOKEN_BUF_SIZE) {
         flush_buffer(lexer);
     }
-    register unsigned char* peek = lexer->cur_ptr;
-    // quickly skip whitespace
-    while (*peek == ' ' || *peek == '\t') {
-        if (peek == lexer->end_ptr) {
-            break;
-        }
-        peek++;
-    }
-    if (peek != lexer->cur_ptr) {
-        // To TOKEN_FLUSH
-        lexer->cur_ptr = peek;
-        token->type = TOKEN_FLUSH;
-    }
-    tok_type_t tok = TOKEN_INIT;
-    tok_val_t constant;
-    constant.have = 0;
+    register char* peek = lexer->cur_ptr;
+    cc_tktype_t tk_type = TOKEN_INIT;
+    ctype_t literal = { 0 };
     // once step
     switch (*peek++) {
     case '=':
         switch (*peek++) {
-        case '=': tok = TOKEN_EQ; break;
-        default: peek--, tok = TOKEN_ASSIGN; break;
+        case '=': tk_type = TOKEN_EQ; break;
+        default: peek--, tk_type = TOKEN_ASSIGN; break;
         } break;
     case '+':
         switch (*peek++) {
-        case '+': tok = TOKEN_ADD_ADD; break;
-        case '=': tok = TOKEN_ASSIGN_ADD; break;
-        default: peek--, tok = TOKEN_ADD; break;
+        case '+': tk_type = TOKEN_ADD_ADD; break;
+        case '=': tk_type = TOKEN_ASSIGN_ADD; break;
+        default: peek--, tk_type = TOKEN_ADD; break;
         } break;
     case '-':
         switch (*peek++) {
-        case '-': tok = TOKEN_SUB_SUB; break;
-        case '=': tok = TOKEN_ASSIGN_SUB; break;
+        case '-': tk_type = TOKEN_SUB_SUB; break;
+        case '=': tk_type = TOKEN_ASSIGN_SUB; break;
-        case '>': tok = TOKEN_DEREF; break;
-        default: peek--, tok = TOKEN_SUB; break;
+        case '>': tk_type = TOKEN_DEREF; break;
+        default: peek--, tk_type = TOKEN_SUB; break;
         } break;
     case '*':
         switch (*peek++) {
-        case '=': tok = TOKEN_ASSIGN_MUL; break;
-        default: peek--, tok = TOKEN_MUL; break;
+        case '=': tk_type = TOKEN_ASSIGN_MUL; break;
+        default: peek--, tk_type = TOKEN_MUL; break;
         } break;
     case '/':
         switch (*peek++) {
-        case '=': tok = TOKEN_ASSIGN_DIV; break;
+        case '=': tk_type = TOKEN_ASSIGN_DIV; break;
        case '/': {
            // need get a new line to parse
            goto_newline(lexer);
-            tok = TOKEN_LINE_COMMENT;
+            tk_type = TOKEN_LINE_COMMENT;
            goto END;
        }
        case '*': {
            lexer->cur_ptr = peek;
            goto_block_comment(lexer);
-            tok = TOKEN_BLOCK_COMMENT;
+            tk_type = TOKEN_BLOCK_COMMENT;
            goto END;
        }
-        default: peek--, tok = TOKEN_DIV; break;
+        default: peek--, tk_type = TOKEN_DIV; break;
         } break;
     case '%':
         switch (*peek++) {
-        case '=': tok = TOKEN_ASSIGN_MOD; break;
-        default: peek--, tok = TOKEN_MOD; break;
+        case '=': tk_type = TOKEN_ASSIGN_MOD; break;
+        default: peek--, tk_type = TOKEN_MOD; break;
         } break;
     case '&':
         switch (*peek++) {
-        case '&': tok = TOKEN_AND_AND; break;
-        case '=': tok = TOKEN_ASSIGN_AND; break;
-        default: peek--, tok = TOKEN_AND; break;
+        case '&': tk_type = TOKEN_AND_AND; break;
+        case '=': tk_type = TOKEN_ASSIGN_AND; break;
+        default: peek--, tk_type = TOKEN_AND; break;
         } break;
     case '|':
         switch (*peek++) {
-        case '|': tok = TOKEN_OR_OR; break;
-        case '=': tok = TOKEN_ASSIGN_OR; break;
-        default: peek--, tok = TOKEN_OR; break;
+        case '|': tk_type = TOKEN_OR_OR; break;
+        case '=': tk_type = TOKEN_ASSIGN_OR; break;
+        default: peek--, tk_type = TOKEN_OR; break;
         } break;
     case '^':
         switch (*peek++) {
-        case '=': tok = TOKEN_ASSIGN_XOR; break;
-        default: peek--, tok = TOKEN_XOR; break;
+        case '=': tk_type = TOKEN_ASSIGN_XOR; break;
+        default: peek--, tk_type = TOKEN_XOR; break;
         } break;
     case '<':
         switch (*peek++) {
-        case '=': tok = TOKEN_LE; break;
-        case '<': tok = (*peek == '=') ? (peek++, TOKEN_ASSIGN_L_SH) : TOKEN_L_SH; break;
-        default: peek--, tok = TOKEN_LT; break;
+        case '=': tk_type = TOKEN_LE; break;
+        case '<': tk_type = (*peek == '=') ? (peek++, TOKEN_ASSIGN_L_SH) : TOKEN_L_SH; break;
+        default: peek--, tk_type = TOKEN_LT; break;
         } break;
     case '>':
         switch (*peek++) {
-        case '=': tok = TOKEN_GE; break;
-        case '>': tok = (*peek == '=') ? (peek++, TOKEN_ASSIGN_R_SH) : TOKEN_R_SH; break;
-        default: peek--, tok = TOKEN_GT; break;
+        case '=': tk_type = TOKEN_GE; break;
+        case '>': tk_type = (*peek == '=') ? (peek++, TOKEN_ASSIGN_R_SH) : TOKEN_R_SH; break;
+        default: peek--, tk_type = TOKEN_GT; break;
         } break;
     case '~':
-        tok = TOKEN_BIT_NOT; break;
+        tk_type = TOKEN_BIT_NOT; break;
     case '!':
         switch (*peek++) {
-        case '=': tok = TOKEN_NEQ; break;
-        default: peek--, tok = TOKEN_NOT; break;
+        case '=': tk_type = TOKEN_NEQ; break;
+        default: peek--, tk_type = TOKEN_NOT; break;
         } break;
     case '[':
-        tok = TOKEN_L_BRACKET; break;
+        tk_type = TOKEN_L_BRACKET; break;
     case ']':
-        tok = TOKEN_R_BRACKET; break;
+        tk_type = TOKEN_R_BRACKET; break;
     case '(':
-        tok = TOKEN_L_PAREN; break;
+        tk_type = TOKEN_L_PAREN; break;
     case ')':
-        tok = TOKEN_R_PAREN; break;
+        tk_type = TOKEN_R_PAREN; break;
     case '{':
-        tok = TOKEN_L_BRACE; break;
+        tk_type = TOKEN_L_BRACE; break;
     case '}':
-        tok = TOKEN_R_BRACE; break;
+        tk_type = TOKEN_R_BRACE; break;
     case ';':
-        tok = TOKEN_SEMICOLON; break;
+        tk_type = TOKEN_SEMICOLON; break;
     case ',':
-        tok = TOKEN_COMMA; break;
+        tk_type = TOKEN_COMMA; break;
     case ':':
-        tok = TOKEN_COLON; break;
+        tk_type = TOKEN_COLON; break;
     case '.':
         if (peek[0] == '.' && peek[1] == '.') {
             peek += 2;
-            tok = TOKEN_ELLIPSIS;
+            tk_type = TOKEN_ELLIPSIS;
         } else {
-            tok = TOKEN_DOT;
+            tk_type = TOKEN_DOT;
         }
         break;
     case '?':
-        tok = TOKEN_COND; break;
-    case '\v': case '\r': case '\f': // FIXME it parse as a blank character
-        tok = TOKEN_FLUSH; break;
+        tk_type = TOKEN_COND; break;
+    case '\v': case '\r': case '\f':
+    case ' ': case '\t':
+        tk_type = TOKEN_BLANK; break;
     case '\n':
         // you need to flush a newline or blank
-        lexer->line++;
-        tok = TOKEN_FLUSH; break;
+        lexer->loc.line += 1;
+        lexer->loc.col = -1;
+        lexer->loc.len = 1;
+        tk_type = TOKEN_BLANK;
+        break;
     case '#':
-        LEX_WARN("Marroc does not support in lexer rather in preprocessor, it will be ignored");
+        // TODO make line or file comment to change
+        LEX_WARN("Maroc does not support in lexer rather in preprocessor, it will be ignored");
         goto_newline(lexer);
-        tok = TOKEN_FLUSH;
+        tk_type = TOKEN_BLANK;
         goto END;
     case '\0':
         // EOF
-        tok = TOKEN_EOF;
+        tk_type = TOKEN_EOF;
         goto END;
     case '\'':
-        return parse_char_literal(lexer, token);
-        return;
+        parse_char_literal(lexer, token);
+        literal = token->val;
+        tk_type = TOKEN_CHAR_LITERAL;
+        goto END; break;
     case '"':
-        return parse_string_literal(lexer, token);
+        parse_string_literal(lexer, token);
+        literal = token->val;
+        tk_type = TOKEN_STRING_LITERAL;
+        goto END; break;
     case '0': case '1': case '2': case '3': case '4':
     case '5': case '6': case '7': case '8': case '9':
-        return parse_number(lexer, token);
+        parse_number(lexer, token);
+        // TODO Make it easy
+        literal = token->val;
+        tk_type = token->sub_type;
+        goto END; break;
     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
@@ -475,39 +473,53 @@ void get_token(lexer_t* lexer, tok_t* token) {
             break;
         }
-        int res = keyword_cmp((const char*)lexer->cur_ptr, peek - (lexer->cur_ptr));
+        int strlen = peek - lexer->cur_ptr;
+        int res = keyword_cmp((const char*)lexer->cur_ptr, strlen);
         if (res == -1) {
-            int strlen = peek - lexer->cur_ptr;
-            unsigned char* str = rt._malloc(strlen + 1);
-            for (int i = 0; i < strlen; i++) {
-                str[i] = lexer->cur_ptr[i];
-            }
-            str[strlen] = '\0';
-            constant.have = 1;
-            constant.str = (char*)str;
-            tok = TOKEN_IDENT; break;
+            char prev = lexer->cur_ptr[strlen];
+            lexer->cur_ptr[strlen] = '\0';
+            literal.str = strpool_intern(lexer->strpool, lexer->cur_ptr);
+            lexer->cur_ptr[strlen] = prev;
+            tk_type = TOKEN_IDENT; break;
         } else {
-            tok = keywords[res].tok; break;
+            tk_type = keywords[res].tok; break;
         }
     default:
         LEX_ERROR("unsupport char in sourse code `%c`", *(lexer->cur_ptr));
         break;
     }
+    lexer->loc.len = peek - lexer->cur_ptr;
     lexer->cur_ptr = peek;
 END:
-    token->val = constant;
-    token->type = tok;
-    LEX_DEBUG("get token `%s` (ch: %c, int: %d)", get_tok_name(token->type), token->val.ch, token->val.i);
+    lexer->loc.col += lexer->loc.len;
+    lexer->loc.len = 0;
+    token->val = literal;
+    token->sub_type = tk_type;
+    token->loc = lexer->loc;
+    static const tok_basic_type_t tok_type_map[] = {
+        // ordinary tokens use #str
+#define X(str, basic, tok) [tok] = basic,
+        TOKEN_TABLE
+#undef X
+        // keywords use #name
+#define X(name, std, tok) [tok] = TK_BASIC_KEYWORD,
+        KEYWORD_TABLE
+#undef X
+    };
+    token->type = tok_type_map[tk_type];
+    LEX_DEBUG("get token `%s` in %s:%d:%d", get_tok_name(tk_type),
+        token->loc.fname, token->loc.line, token->loc.col);
 }
 
 // get_token maybe got invalid (with parser)
 void get_valid_token(lexer_t* lexer, tok_t* token) {
-    tok_type_t type;
+    tok_basic_type_t type;
     do {
         get_token(lexer, token);
         type = token->type;
-    } while (type == TOKEN_FLUSH || type == TOKEN_LINE_COMMENT || type == TOKEN_BLOCK_COMMENT);
+        Assert(type != TK_BASIC_INVALID);
+    } while (type == TK_BASIC_WHITESPACE || type == TK_BASIC_COMMENT);
 }
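
Taken together, the new calling convention looks roughly like this. A minimal driver sketch: init_lexer, get_valid_token, sub_type, loc, and TOKEN_EOF come from the diff above, while the project header name, file_sread, strpool_init, and the exact lexer_sread_fn signature are assumptions for illustration:

#include <stdio.h>
#include "lexer.h" // assumed project header providing lexer_t, tok_t, strpool_t

/* Assumed read-callback shape: fill buf with up to size bytes, return bytes read. */
static int file_sread(void* stream, void* buf, int size) {
    return (int)fread(buf, 1, (size_t)size, (FILE*)stream);
}

int main(void) {
    strpool_t strpool;
    strpool_init(&strpool);              // assumed pool initializer
    FILE* fp = fopen("main.c", "r");
    if (!fp) return 1;

    lexer_t lexer;
    tok_t token;
    init_lexer(&lexer, "main.c", fp, (lexer_sread_fn)file_sread, &strpool);

    do {
        get_valid_token(&lexer, &token); // skips whitespace and comments
        printf("%s:%d:%d\n", token.loc.fname, token.loc.line, token.loc.col);
    } while (token.sub_type != TOKEN_EOF);

    fclose(fp);
    return 0;
}

Note the division of labor this commit establishes: get_token reports every lexeme (including TOKEN_BLANK and comments) with source locations, and get_valid_token is the filtered view the parser consumes.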