- Add a .gitignore file to ignore compiler-generated binaries
- Refactor lexer.c, improving keyword handling and string handling
- Update the front end, parser, and AST files to work with the new lexer
- Clean up the token definitions and functions, introducing new token types

/**
 * Modeled after the lexical-analysis part of the LCC compiler.
 *
 * Below is LCC's README as of 2025.2:
 *
 * This hierarchy is the distribution for lcc version 4.2.
 *
 * lcc version 3.x is described in the book "A Retargetable C Compiler:
 * Design and Implementation" (Addison-Wesley, 1995, ISBN 0-8053-1670-1).
 * There are significant differences between 3.x and 4.x, most notably in
 * the intermediate code. For details, see
 * https://drh.github.io/lcc/documents/interface4.pdf.
 *
 * VERSION 4.2 IS INCOMPATIBLE WITH EARLIER VERSIONS OF LCC. DO NOT
 * UNLOAD THIS DISTRIBUTION ON TOP OF A 3.X DISTRIBUTION.
 *
 * LCC is a C89 ("ANSI C") compiler designed to be highly retargetable.
 *
 * LOG describes the changes since the last release.
 *
 * CPYRIGHT describes the conditions under which you can use, copy,
 * modify, and distribute lcc or works derived from lcc.
 *
 * doc/install.html is an HTML file that gives a complete description of
 * the distribution and installation instructions.
 *
 * Chris Fraser / cwf@aya.yale.edu
 * David Hanson / drh@drhanson.net
 */
#include <lib/core.h>
#include "lexer_log.h"
#include "token.h"
#include "lexer.h"
static const struct {
    const char* name;
    enum CSTD_KEYWORD std_type;
    cc_tktype_t tok;
} keywords[] = {
#define X(name, std_type, tok, ...) { #name, std_type, tok },
    KEYWORD_TABLE
#undef X
};
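
/*
 * Illustrative expansion (the actual entries depend on KEYWORD_TABLE): a
 * hypothetical entry X(int, CSTD_C89, TOKEN_INT) would expand to
 * { "int", CSTD_C89, TOKEN_INT }. keyword_cmp() below performs a binary
 * search, so KEYWORD_TABLE must list keywords in ascending name order.
 */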

// Binary-search the sorted keyword table; returns the index of the
// matching keyword, or -1 if `name` is not a keyword.
static inline int keyword_cmp(const char* name, int len) {
    int low = 0;
    int high = sizeof(keywords) / sizeof(keywords[0]) - 1;
    while (low <= high) {
        int mid = (low + high) / 2;
        const char *key = keywords[mid].name;
        int cmp = 0;

        // custom comparison: `name` is length-delimited, not NUL-terminated
        for (int i = 0; i < len; i++) {
            if (name[i] != key[i]) {
                cmp = (unsigned char)name[i] - (unsigned char)key[i];
                break;
            }
            if (name[i] == '\0') break; // stop early at a terminator
        }

        if (cmp == 0) {
            // exact match only if the lengths also match
            if (key[len] == '\0') return mid;
            cmp = -1; // the keyword is longer than the input
        }

        if (cmp < 0) {
            high = mid - 1;
        } else {
            low = mid + 1;
        }
    }
    return -1; // Not a keyword.
}
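
/*
 * Example (illustrative): with a table containing "return",
 * keyword_cmp("return", 6) yields that entry's index, while
 * keyword_cmp("returns", 7) yields -1 because the table entry ends
 * before the input does.
 */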

void init_lexer(lexer_t* lexer, const char* file_name, void* stream, lexer_sread_fn sread, strpool_t* strpool) {
    lexer->strpool = strpool;
    lexer->cur_ptr = lexer->end_ptr = (char*)&(lexer->buffer);
    lexer->loc.fname = strpool_intern(lexer->strpool, file_name);
    lexer->loc.line = 1;
    lexer->loc.col = 1;

    lexer->stream = stream;
    lexer->sread = sread;

    rt_memset(lexer->buffer, 0, sizeof(lexer->buffer));
}
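
/*
 * Typical setup (a sketch; the stream type and read callback are up to
 * the caller, and `file_sread` here is hypothetical):
 *
 *   lexer_t lx;
 *   init_lexer(&lx, "main.c", my_stream, file_sread, &pool);
 *   // file_sread must behave like fread: fill the destination buffer
 *   // from `stream` and return the number of bytes actually read.
 */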

static void flush_buffer(lexer_t* lexer) {
    int num = lexer->end_ptr - lexer->cur_ptr;
    for (int i = 0; i < num; i++) {
        lexer->buffer[i] = lexer->cur_ptr[i];
    }
    lexer->cur_ptr = lexer->buffer;
    lexer->end_ptr = lexer->buffer + num; // rebase: the unread bytes now live at the buffer head

    int read_size = LEXER_BUFFER_SIZE - num;
    // TODO: narrowing rt_size_t to int may lose precision
    int got_size = lexer->sread(lexer->buffer + num, read_size, 1, read_size, lexer->stream);
    if (got_size < 0) {
        LEX_ERROR("lexer read error");
    } else if (got_size < read_size) {
        lexer->end_ptr += got_size;
        lexer->end_ptr[0] = '\0'; // EOF
        lexer->end_ptr++;
    } else if (got_size == read_size) {
        lexer->end_ptr += got_size;
    } else {
        LEX_ERROR("lexer read error: impossible, got_size > read_size; maybe overflow?");
    }
}
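
/*
 * Buffer invariant: [cur_ptr, end_ptr) holds the unread input.
 * flush_buffer() slides that window to the head of `buffer` and refills
 * the tail from the stream; on a short read a '\0' sentinel is appended
 * to mark EOF. get_token() flushes whenever fewer than
 * GOT_ONE_TOKEN_BUF_SIZE bytes remain readable.
 */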

static void goto_newline(lexer_t* lexer) {
    do {
        if (lexer->cur_ptr == lexer->end_ptr) {
            flush_buffer(lexer);
            // flush_buffer() rebased cur_ptr to the buffer head; step back
            // one so the loop's increment lands on the first new byte
            lexer->cur_ptr--;
        }
        lexer->cur_ptr++;
    } while (*lexer->cur_ptr != '\n' && *lexer->cur_ptr != '\0');
}

static void goto_block_comment(lexer_t* lexer) {
    while (1) {
        if (lexer->end_ptr - lexer->cur_ptr < 2) {
            flush_buffer(lexer);
        }

        if (lexer->cur_ptr[0] == '\0') {
            break; // EOF inside an unterminated block comment
        } else if (lexer->cur_ptr[0] == '*' && lexer->cur_ptr[1] == '/') {
            lexer->cur_ptr += 2;
            break;
        } else {
            if (lexer->cur_ptr[0] == '\n') lexer->loc.line++;
            lexer->cur_ptr++;
        }
    }
}

// TODO: escape-sequence coverage is incomplete (no hex/octal escapes, etc.)
static char got_slash(char* peek) {
    switch (*peek) {
        case '\\': return '\\';
        case '\'': return '\'';
        case '\"': return '\"';
        case '\?': return '\?';
        case '0': return '\0';

        case 'b': return '\b';
        case 'f': return '\f';
        case 'n': return '\n';
        case 'r': return '\r';
        case 't': return '\t';
        case 'v': return '\v';
        default: break;
    }
    LEX_ERROR("Unknown escape character");
    return -1;
}

static void parse_char_literal(lexer_t* lexer, tok_t* token) {
    char val = 0;
    char* peek = lexer->cur_ptr + 1;
    if (*peek == '\\') {
        peek++;
        val = got_slash(peek);
        peek++;
    } else {
        val = *peek++;
    }

    if (*peek++ != '\'') LEX_ERROR("Unclosed character literal");
    lexer->cur_ptr = peek;
    token->val.ch = val;
}
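
/*
 * Examples (illustrative): given input 'a', val is 'a'; given '\n',
 * got_slash() maps the escape and val is a newline. Multi-character
 * literals such as 'ab' are rejected by the closing-quote check.
 */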

static void parse_string_literal(lexer_t* lexer, tok_t* token) {
    char* peek = lexer->cur_ptr + 1;
    // TODO: string-literal size check
    static char dest[LEXER_MAX_TOKEN_SIZE + 1];
    int len = 0;

    while (*peek != '"') {
        if (peek >= lexer->end_ptr) {
            // flush_buffer() rebases cur_ptr, so peek must be rebased too
            int off = peek - lexer->cur_ptr;
            flush_buffer(lexer);
            peek = lexer->cur_ptr + off;
        }

        if (*peek == '\\') { // handle escape sequences
            peek++;
            *peek = got_slash(peek);
        }

        if (len >= LEXER_MAX_TOKEN_SIZE) LEX_ERROR("String too long");
        dest[len++] = *peek++;
    }
    dest[len] = '\0';
    lexer->cur_ptr = peek + 1; // 1 is `"`
    lexer->loc.len = len + 2;  // 2 is `"` `"`

    token->val.str = strpool_intern(lexer->strpool, dest);
}
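
/*
 * Example (illustrative): for the input "hi\n", dest ends up holding
 * { 'h', 'i', '\n', '\0' } and the token carries the interned copy, so
 * identical string literals share one pooled instance.
 */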

// FIXME: originally drafted with AI assistance; review carefully
static void parse_number(lexer_t* lexer, tok_t* token) {
    char* peek = lexer->cur_ptr;
    int base = 10;
    int is_float = 0;
    long long int_val = 0;
    double float_val = 0.0;
    double fraction = 1.0;

    // determine the base from the prefix
    if (*peek == '0') {
        peek++;
        switch (*peek) {
            case 'x':
            case 'X':
                base = 16;
                peek++; // skip the 'x'/'X'
                break;
            default:
                base = 8;
                break;
        }
    }

    // parse the integer part
    while (1) {
        int digit = -1;
        if (*peek >= '0' && *peek <= '9') {
            digit = *peek - '0';
        } else if (base == 16) {
            if (*peek >= 'a' && *peek <= 'f') digit = *peek - 'a' + 10;
            else if (*peek >= 'A' && *peek <= 'F') digit = *peek - 'A' + 10;
        }

        if (digit < 0 || digit >= base) break;

        if (!is_float) {
            int_val = int_val * base + digit;
        } else {
            float_val = float_val * base + digit;
            fraction *= base;
        }
        peek++;
    }

    // parse the fractional part (decimal only)
    if (*peek == '.' && base == 10) {
        is_float = 1;
        float_val = int_val;
        peek++;

        while (*peek >= '0' && *peek <= '9') {
            float_val = float_val * 10.0 + (*peek - '0');
            fraction *= 10.0;
            peek++;
        }
        float_val /= fraction;
    }

    // parse scientific notation
    if ((*peek == 'e' || *peek == 'E') && base == 10) {
        if (!is_float) float_val = int_val; // e.g. "2e3" has no '.' part
        is_float = 1;
        peek++;
        // int exp_sign = 1;
        int exponent = 0;

        if (*peek == '+') peek++;
        else if (*peek == '-') {
            // exp_sign = -1;
            peek++;
        }

        while (*peek >= '0' && *peek <= '9') {
            exponent = exponent * 10 + (*peek - '0');
            peek++;
        }
        // TODO: apply the exponent, e.g. float_val *= pow(10.0, exp_sign * exponent);
    }

    // store the result
    lexer->loc.len = peek - lexer->cur_ptr;
    lexer->cur_ptr = peek;
    if (is_float) {
        token->val.f32 = float_val;
        token->sub_type = TOKEN_FLOAT_LITERAL;
    } else {
        token->val.i = int_val;
        token->sub_type = TOKEN_INT_LITERAL;
    }
}
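
/*
 * Worked examples (illustrative): "42" -> int 42; "052" -> int 42
 * (octal); "0x2A" -> int 42 (hex); "3.5" -> float 3.5. "2e3" currently
 * yields float 2.0, because the exponent is consumed but not yet applied
 * (see the TODO above).
 */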

#define GOT_ONE_TOKEN_BUF_SIZE 64
// /zh/c/language/operator_arithmetic.html
void get_token(lexer_t* lexer, tok_t* token) {
    // the buffer must always keep enough readable bytes ahead
    if (lexer->end_ptr - lexer->cur_ptr < GOT_ONE_TOKEN_BUF_SIZE) {
        flush_buffer(lexer);
    }
    register char* peek = lexer->cur_ptr;

    cc_tktype_t tk_type = TOKEN_INIT;
    ctype_t literal = { 0 };

    // dispatch on the first character (consumed here by peek++)
    switch (*peek++) {
    case '=':
        switch (*peek++) {
            case '=': tk_type = TOKEN_EQ; break;
            default: peek--, tk_type = TOKEN_ASSIGN; break;
        } break;
    case '+':
        switch (*peek++) {
            case '+': tk_type = TOKEN_ADD_ADD; break;
            case '=': tk_type = TOKEN_ASSIGN_ADD; break;
            default: peek--, tk_type = TOKEN_ADD; break;
        } break;
    case '-':
        switch (*peek++) {
            case '-': tk_type = TOKEN_SUB_SUB; break;
            case '=': tk_type = TOKEN_ASSIGN_SUB; break;
            case '>': tk_type = TOKEN_DEREF; break;
            default: peek--, tk_type = TOKEN_SUB; break;
        } break;
    case '*':
        switch (*peek++) {
            case '=': tk_type = TOKEN_ASSIGN_MUL; break;
            default: peek--, tk_type = TOKEN_MUL; break;
        } break;
    case '/':
        switch (*peek++) {
            case '=': tk_type = TOKEN_ASSIGN_DIV; break;
            case '/': {
                goto_newline(lexer);
                tk_type = TOKEN_LINE_COMMENT;
                goto END;
            }
            case '*': {
                lexer->cur_ptr = peek;
                goto_block_comment(lexer);
                tk_type = TOKEN_BLOCK_COMMENT;
                goto END;
            }
            default: peek--, tk_type = TOKEN_DIV; break;
        } break;
    case '%':
        switch (*peek++) {
            case '=': tk_type = TOKEN_ASSIGN_MOD; break;
            default: peek--, tk_type = TOKEN_MOD; break;
        } break;
    case '&':
        switch (*peek++) {
            case '&': tk_type = TOKEN_AND_AND; break;
            case '=': tk_type = TOKEN_ASSIGN_AND; break;
            default: peek--, tk_type = TOKEN_AND; break;
        } break;
    case '|':
        switch (*peek++) {
            case '|': tk_type = TOKEN_OR_OR; break;
            case '=': tk_type = TOKEN_ASSIGN_OR; break;
            default: peek--, tk_type = TOKEN_OR; break;
        } break;
    case '^':
        switch (*peek++) {
            case '=': tk_type = TOKEN_ASSIGN_XOR; break;
            default: peek--, tk_type = TOKEN_XOR; break;
        } break;
    case '<':
        switch (*peek++) {
            case '=': tk_type = TOKEN_LE; break;
            case '<': tk_type = (*peek == '=') ? (peek++, TOKEN_ASSIGN_L_SH) : TOKEN_L_SH; break;
            default: peek--, tk_type = TOKEN_LT; break;
        } break;
    case '>':
        switch (*peek++) {
            case '=': tk_type = TOKEN_GE; break;
            case '>': tk_type = (*peek == '=') ? (peek++, TOKEN_ASSIGN_R_SH) : TOKEN_R_SH; break;
            default: peek--, tk_type = TOKEN_GT; break;
        } break;
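    /*
     * Note the maximal-munch handling above: for `<<=` and `>>=` the inner
     * switch consumes the second shift character and the ternary peeks one
     * further, so `a >>= b` lexes as TOKEN_ASSIGN_R_SH rather than as
     * TOKEN_GT followed by TOKEN_GE.
     */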
    case '~':
        tk_type = TOKEN_BIT_NOT; break;
    case '!':
        switch (*peek++) {
            case '=': tk_type = TOKEN_NEQ; break;
            default: peek--, tk_type = TOKEN_NOT; break;
        } break;
    case '[':
        tk_type = TOKEN_L_BRACKET; break;
    case ']':
        tk_type = TOKEN_R_BRACKET; break;
    case '(':
        tk_type = TOKEN_L_PAREN; break;
    case ')':
        tk_type = TOKEN_R_PAREN; break;
    case '{':
        tk_type = TOKEN_L_BRACE; break;
    case '}':
        tk_type = TOKEN_R_BRACE; break;
    case ';':
        tk_type = TOKEN_SEMICOLON; break;
    case ',':
        tk_type = TOKEN_COMMA; break;
    case ':':
        tk_type = TOKEN_COLON; break;
    case '.':
        if (peek[0] == '.' && peek[1] == '.') {
            peek += 2;
            tk_type = TOKEN_ELLIPSIS;
        } else {
            tk_type = TOKEN_DOT;
        }
        break;
    case '?':
        tk_type = TOKEN_COND; break;
    case '\v': case '\r': case '\f':
    case ' ': case '\t':
        tk_type = TOKEN_BLANK; break;
    case '\n':
        // newline: advance the line counter and reset the column
        lexer->loc.line += 1;
        lexer->loc.col = -1;
        lexer->loc.len = 1;
        tk_type = TOKEN_BLANK;
        break;
    case '#':
        // TODO: handle #line / file markers properly
        LEX_WARN("Macros are not handled by the lexer but by the preprocessor; this line will be ignored");
        goto_newline(lexer);
        tk_type = TOKEN_BLANK;
        goto END;
    case '\0':
        // EOF
        tk_type = TOKEN_EOF;
        goto END;
    case '\'':
        parse_char_literal(lexer, token);
        literal = token->val;
        tk_type = TOKEN_CHAR_LITERAL;
        goto END;
    case '"':
        parse_string_literal(lexer, token);
        literal = token->val;
        tk_type = TOKEN_STRING_LITERAL;
        goto END;
    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
        parse_number(lexer, token);
        // TODO: simplify this hand-off
        literal = token->val;
        tk_type = token->sub_type;
        goto END;
    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
    case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
    case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
    case 's': case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z':
    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
    case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
    case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
    case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z':
    case '_':
        // TOKEN_IDENT; the first identifier character was consumed by the
        // outer switch, so it is peek[-1] here
        if (peek[-1] == 'L' && (*peek == '\'' || *peek == '"')) {
            LEX_ERROR("unsupported wide-character literal (`L` prefix)");
        }
        while (1) {
            if (peek == lexer->end_ptr) {
                LEX_ERROR("identifiers longer than the lookahead buffer (64 bytes) are unsupported");
            }
            if ((*peek >= 'a' && *peek <= 'z') || (*peek >= 'A' && *peek <= 'Z') ||
                (*peek == '_') || (*peek >= '0' && *peek <= '9')) {
                peek++;
                continue;
            }
            break;
        }

        int strlen = peek - lexer->cur_ptr;
        int res = keyword_cmp((const char*)lexer->cur_ptr, strlen);
        if (res == -1) {
            // temporarily NUL-terminate in place so the name can be interned
            char prev = lexer->cur_ptr[strlen];
            lexer->cur_ptr[strlen] = '\0';
            literal.str = strpool_intern(lexer->strpool, lexer->cur_ptr);
            lexer->cur_ptr[strlen] = prev;
            tk_type = TOKEN_IDENT; break;
        } else {
            tk_type = keywords[res].tok; break;
        }
    default:
        LEX_ERROR("unsupported character in source code `%c`", *(lexer->cur_ptr));
        break;
    }

    lexer->loc.len = peek - lexer->cur_ptr;
    lexer->cur_ptr = peek;
END:
    lexer->loc.col += lexer->loc.len;
    lexer->loc.len = 0;

    token->val = literal;
    token->sub_type = tk_type;
    token->loc = lexer->loc;
    static const tok_basic_type_t tok_type_map[] = {
        // ordinary tokens take their basic type from TOKEN_TABLE (#str)
#define X(str, basic, tok) [tok] = basic,
        TOKEN_TABLE
#undef X

        // keywords all map to TK_BASIC_KEYWORD (#name)
#define X(name, std, tok) [tok] = TK_BASIC_KEYWORD,
        KEYWORD_TABLE
#undef X
    };
    token->type = tok_type_map[tk_type];
    LEX_DEBUG("get token `%s` in %s:%d:%d", get_tok_name(tk_type),
              token->loc.fname, token->loc.line, token->loc.col);
}

// get_token() may emit tokens the parser never consumes (whitespace and
// comments); skip those here.
void get_valid_token(lexer_t* lexer, tok_t* token) {
    tok_basic_type_t type;
    do {
        get_token(lexer, token);
        type = token->type;
        Assert(type != TK_BASIC_INVALID);
    } while (type == TK_BASIC_WHITESPACE || type == TK_BASIC_COMMENT);
}
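
/*
 * Driver sketch (illustrative, assuming a lexer `lx` initialized as in
 * the init_lexer() sketch above): pull tokens until end of file.
 *
 *   tok_t tk;
 *   do {
 *       get_valid_token(&lx, &tk);
 *       // ...feed tk to the parser...
 *   } while (tk.sub_type != TOKEN_EOF);
 */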