ZZY 05c637e594 refactor: 重构前端代码并添加日志功能
- 重命名和重构了多个文件,包括 lexer、parser 和 AST 相关代码
- 添加了日志功能,使用 LOG_* 宏替代原有的 error 和 warn 函数
- 优化了错误处理和内存分配方式
- 调整了代码结构,提高了模块化和可读性
2025-03-19 12:22:55 +08:00

514 lines
15 KiB
C

/**
* 仿照LCCompiler的词法分析部分
*
* 如下为LCC的README in 2025.2
This hierarchy is the distribution for lcc version 4.2.
lcc version 3.x is described in the book "A Retargetable C Compiler:
Design and Implementation" (Addison-Wesley, 1995, ISBN 0-8053-1670-1).
There are significant differences between 3.x and 4.x, most notably in
the intermediate code. For details, see
https://drh.github.io/lcc/documents/interface4.pdf.
VERSION 4.2 IS INCOMPATIBLE WITH EARLIER VERSIONS OF LCC. DO NOT
UNLOAD THIS DISTRIBUTION ON TOP OF A 3.X DISTRIBUTION.
LCC is a C89 ("ANSI C") compiler designed to be highly retargetable.
LOG describes the changes since the last release.
CPYRIGHT describes the conditions under which you can use, copy, modify, and
distribute lcc or works derived from lcc.
doc/install.html is an HTML file that gives a complete description of
the distribution and installation instructions.
Chris Fraser / cwf@aya.yale.edu
David Hanson / drh@drhanson.net
*/
#include <lib/core.h>
#include "lexer_log.h"
#include "token.h"
#include "lexer.h"
// Keyword lookup table, expanded from the project's KEYWORD_TABLE X-macro.
// Each entry maps the keyword's spelling to its C-standard classification
// and its token type.
// NOTE(review): keyword_cmp() binary-searches this array, so KEYWORD_TABLE
// must list keywords in ascending ASCII order — confirm in token.h.
static const struct {
const char* name;
enum CSTD_KEYWORD std_type;
tok_type_t tok;
} keywords[] = {
#define X(name, std_type, tok, ...) { #name, std_type, tok },
KEYWORD_TABLE
#undef X
};
// by using binary search to find the keyword
// Binary-search `keywords` for the identifier name[0..len).
// `name` need not be NUL-terminated; the table must be sorted by spelling.
// Returns the table index of the keyword, or -1 if it is not a keyword.
static inline int keyword_cmp(const char* name, int len) {
    int lo = 0;
    int hi = (int)(sizeof(keywords) / sizeof(keywords[0])) - 1;
    while (lo <= hi) {
        int mid = lo + (hi - lo) / 2;
        const char* entry = keywords[mid].name;
        int diff = 0;
        // Compare at most `len` characters of the (unterminated) input
        // against the NUL-terminated table entry.
        for (int i = 0; i < len; i++) {
            if (name[i] != entry[i]) {
                diff = (unsigned char)name[i] - (unsigned char)entry[i];
                break;
            }
            if (name[i] == '\0') break; // defensive: stop at a terminator
        }
        if (diff == 0) {
            if (entry[len] == '\0') return mid; // same length: exact match
            diff = -1; // table entry is longer, so the input sorts before it
        }
        if (diff < 0) {
            hi = mid - 1;
        } else {
            lo = mid + 1;
        }
    }
    return -1; // not a keyword
}
// Prepare a lexer over `stream`, reading through the `sread` callback.
// `file_name` is currently unused (kept for interface stability).
// Both buffer pointers start at the (zeroed) buffer, so the first
// flush_buffer() call performs the initial fill.
void init_lexer(lexer_t* lexer, const char* file_name, void* stream, lexer_sread_fn sread) {
    init_lib_core();
    lexer->cur_ptr = (unsigned char*)lexer->buffer;
    lexer->end_ptr = lexer->cur_ptr; // empty: nothing buffered yet
    lexer->index = 1;
    lexer->line = 1;
    lexer->stream = stream;
    lexer->sread = sread;
    // Zero the whole read buffer.
    int capacity = sizeof(lexer->buffer) / sizeof(lexer->buffer[0]);
    for (int i = 0; i < capacity; i++) {
        lexer->buffer[i] = 0;
    }
}
/**
 * Slide the unread bytes [cur_ptr, end_ptr) to the front of the buffer and
 * refill the remainder from the input stream.  On a short read (EOF) a
 * '\0' sentinel is appended so scanners can detect end-of-input by value.
 *
 * WARNING: any raw pointer into the old buffer contents (e.g. a saved
 * `peek`) is invalidated by this call; callers must rebase by offset.
 *
 * BUGFIX: the old code did `end_ptr += got_size` without first rebasing
 * end_ptr to the compacted data (buffer + num).  Whenever cur_ptr had
 * advanced past the buffer start, end_ptr drifted beyond the real data —
 * and could even point past the end of the buffer itself.
 */
static void flush_buffer(lexer_t* lexer) {
    int num = lexer->end_ptr - lexer->cur_ptr;
    // Move the not-yet-consumed tail to the start of the buffer.
    for (int i = 0; i < num; i++) {
        lexer->buffer[i] = lexer->cur_ptr[i];
    }
    lexer->cur_ptr = (unsigned char*)lexer->buffer;
    // Rebase end_ptr onto the compacted data before appending new bytes.
    lexer->end_ptr = (unsigned char*)lexer->buffer + num;
    int read_size = LEXER_BUFFER_SIZE - num;
    // TODO rt_size_t to int maybe lose precision
    int got_size = lexer->sread(lexer->buffer + num, read_size, 1, read_size, lexer->stream);
    if (got_size < 0) {
        LEX_ERROR("lexer read error");
    } else if (got_size < read_size) {
        // Short read means end of input: append the '\0' sentinel.
        lexer->end_ptr += got_size;
        lexer->end_ptr[0] = '\0'; // EOF
        lexer->end_ptr++;
    } else if (got_size == read_size) {
        lexer->end_ptr += got_size;
    } else {
        LEX_ERROR("lexer read error imposible got_size > read_size maybe overflow?");
    }
}
/**
 * Advance cur_ptr until it sits on the next '\n' or on the EOF sentinel
 * ('\0').  Used to skip line comments and preprocessor lines.
 *
 * BUGFIX: the old code did `cur_ptr--; cur_ptr++;` around flush_buffer(),
 * momentarily forming a pointer one before the start of the buffer, which
 * is undefined behavior.  After a flush, cur_ptr already points at the
 * next unread byte, so simply skip the increment for that iteration.
 */
static void goto_newline(lexer_t* lexer) {
    do {
        if (lexer->cur_ptr == lexer->end_ptr) {
            // Refill; cur_ptr now points at the first new byte.
            flush_buffer(lexer);
        } else {
            lexer->cur_ptr++;
        }
    } while (*lexer->cur_ptr != '\n' && *lexer->cur_ptr != '\0');
}
// Skip forward past the closing "*/" of a block comment.  cur_ptr points
// just after the opening "/*".  Stops silently on the EOF sentinel ('\0')
// if the comment is never closed.
static void goto_block_comment(lexer_t* lexer) {
    for (;;) {
        // Guarantee two readable bytes for the "*/" lookahead.
        if (lexer->end_ptr - lexer->cur_ptr < 2) {
            flush_buffer(lexer);
        }
        unsigned char head = lexer->cur_ptr[0];
        if (head == '\0') {
            break; // end of input inside the comment
        }
        if (head == '*' && lexer->cur_ptr[1] == '/') {
            lexer->cur_ptr += 2; // consume the terminator
            break;
        }
        lexer->cur_ptr++;
    }
}
// Decode the character that follows a backslash in a char/string literal.
// `peek` points at the escape character itself (the byte after '\\').
// Returns the decoded value, or -1 after reporting an unknown escape.
// TODO numeric escapes (\ooo octal, \xhh hex) beyond the lone '0' are
// still not handled.
static char got_slash(unsigned char* peek) {
    switch (*peek) {
    case '\\': return '\\';
    case '\'': return '\'';
    case '\"': return '\"';
    case '\?': return '\?';
    case '0': return '\0';
    case 'a': return '\a'; // BUGFIX: ISO C alert escape was missing
    case 'b': return '\b';
    case 'f': return '\f';
    case 'n': return '\n';
    case 'r': return '\r';
    case 't': return '\t';
    case 'v': return '\v';
    default: break;
    }
    LEX_ERROR("Unknown escape character");
    return -1;
}
// Scan a character literal ('x' or '\n'); cur_ptr sits on the opening
// quote.  Stores the decoded character in token->val.ch and leaves
// cur_ptr just past the closing quote.
static void parse_char_literal(lexer_t* lexer, tok_t* token) {
    unsigned char* p = lexer->cur_ptr + 1; // skip the opening quote
    char value;
    if (*p != '\\') {
        value = *p;
        p++;
    } else {
        p++;                        // skip the backslash
        value = got_slash(p);       // decode the escape character
        p++;
    }
    if (*p != '\'') LEX_ERROR("Unclosed character literal");
    p++; // consume the closing quote
    token->val.ch = value;
    lexer->cur_ptr = p;
    token->val.have = 1;
    token->type = TOKEN_CHAR_LITERAL;
}
/**
 * Scan a string literal; cur_ptr sits on the opening '"'.  Copies the
 * (escape-decoded) contents into a freshly allocated buffer stored in
 * token->val.str and leaves cur_ptr just past the closing quote.
 *
 * BUGFIXES vs the old version:
 *  - flush_buffer() compacts the buffer and invalidates raw pointers; the
 *    old code kept using a stale `peek`.  We now rebase it by offset from
 *    cur_ptr (which flush preserves relative to the unread data).
 *  - `*peek` was dereferenced before the bounds check.
 *  - an unterminated string used to spin copying '\0' bytes until the
 *    "String too long" limit; now it reports the real problem.
 */
static void parse_string_literal(lexer_t* lexer, tok_t* token) {
    unsigned char* peek = lexer->cur_ptr + 1;
    // TODO string literal size check
    char* dest = token->val.str = rt._malloc(LEXER_MAX_TOKEN_SIZE + 1);
    int len = 0;
    for (;;) {
        // Refill before dereferencing; rebase peek across the flush.
        if (peek >= lexer->end_ptr) {
            int off = peek - lexer->cur_ptr;
            flush_buffer(lexer);
            peek = lexer->cur_ptr + off;
        }
        unsigned char c = *peek;
        if (c == '"') break;
        if (c == '\0') LEX_ERROR("Unclosed string literal");
        if (c == '\\') { // decode escape sequence
            peek++;
            if (peek >= lexer->end_ptr) { // escape may straddle a refill
                int off = peek - lexer->cur_ptr;
                flush_buffer(lexer);
                peek = lexer->cur_ptr + off;
            }
            c = (unsigned char)got_slash(peek);
        }
        if (len >= LEXER_MAX_TOKEN_SIZE) LEX_ERROR("String too long");
        dest[len++] = (char)c;
        peek++;
    }
    dest[len] = '\0';
    lexer->cur_ptr = peek + 1; // step past the closing quote
    token->val.have = 1;
    token->type = TOKEN_STRING_LITERAL;
}
/**
 * Parse an integer or floating-point literal starting at lexer->cur_ptr.
 * Supports decimal, octal (leading 0) and hex (0x/0X) integers, plus
 * decimal floats with optional fraction and e/E exponent.  On return,
 * cur_ptr points just past the literal; token->val holds .ll for
 * TOKEN_INT_LITERAL or .d for TOKEN_FLOAT_LITERAL.
 *
 * BUGFIXES vs the old version:
 *  - the base-detection switch fell through 'x'/'X' into `default:
 *    base = 8`, so hex literals were never recognized (and the 'x' was
 *    never consumed);
 *  - "0.5" / "0e1" were forced to base 8, skipping the fraction/exponent
 *    branches (they require base == 10);
 *  - the exponent was parsed but silently discarded, and `float_val` was
 *    never seeded from `int_val` for forms like "1e3".
 */
static void parse_number(lexer_t* lexer, tok_t* token) {
    unsigned char* peek = lexer->cur_ptr;
    int base = 10;
    int is_float = 0;
    long long int_val = 0;
    double float_val = 0.0;
    // Detect the base from the prefix.  A lone "0", "0.5" or "0e1" stays
    // decimal so the fraction/exponent parsing below still applies.
    if (*peek == '0') {
        peek++;
        if (*peek == 'x' || *peek == 'X') {
            base = 16;
            peek++; // consume the 'x'/'X'
        } else if (*peek >= '0' && *peek <= '7') {
            base = 8;
        }
    }
    // Integer part.
    while (1) {
        int digit = -1;
        if (*peek >= '0' && *peek <= '9') {
            digit = *peek - '0';
        } else if (base == 16) {
            if (*peek >= 'a' && *peek <= 'f') digit = *peek - 'a' + 10;
            else if (*peek >= 'A' && *peek <= 'F') digit = *peek - 'A' + 10;
        }
        if (digit < 0 || digit >= base) break;
        int_val = int_val * base + digit;
        peek++;
    }
    // Fraction part (decimal literals only).
    if (*peek == '.' && base == 10) {
        is_float = 1;
        float_val = (double)int_val;
        double fraction = 1.0;
        peek++;
        while (*peek >= '0' && *peek <= '9') {
            float_val = float_val * 10.0 + (*peek - '0');
            fraction *= 10.0;
            peek++;
        }
        float_val /= fraction;
    }
    // Exponent part (decimal only).  "1e3" is a float even without '.'.
    if ((*peek == 'e' || *peek == 'E') && base == 10) {
        if (!is_float) {
            is_float = 1;
            float_val = (double)int_val;
        }
        peek++;
        int exp_sign = 1;
        int exponent = 0;
        if (*peek == '+') peek++;
        else if (*peek == '-') {
            exp_sign = -1;
            peek++;
        }
        while (*peek >= '0' && *peek <= '9') {
            exponent = exponent * 10 + (*peek - '0');
            peek++;
        }
        // Apply the exponent without pulling in libm's pow().
        double scale = 1.0;
        while (exponent-- > 0) scale *= 10.0;
        if (exp_sign < 0) float_val /= scale;
        else float_val *= scale;
    }
    // Store the result.
    lexer->cur_ptr = peek;
    token->val.have = 1;
    if (is_float) {
        token->val.d = float_val;
        token->type = TOKEN_FLOAT_LITERAL;
    } else {
        token->val.ll = int_val;
        token->type = TOKEN_INT_LITERAL;
    }
}
#define GOT_ONE_TOKEN_BUF_SIZE 64
// /zh/c/language/operator_arithmetic.html
/**
 * Scan one raw token starting at lexer->cur_ptr and fill in `token`.
 * May produce pseudo-tokens (TOKEN_FLUSH, TOKEN_LINE_COMMENT,
 * TOKEN_BLOCK_COMMENT) that callers skip via get_valid_token().
 *
 * BUGFIXES vs the old version:
 *  - the wide-char guard compared *peek to two different values at once
 *    ((*peek=='L' && *peek=='\'')) and was always false;
 *  - a dead `return;` followed `return parse_char_literal(...)`;
 *  - `constant.have`/`constant.str` were assigned twice in the
 *    identifier path;
 *  - a local named `strlen` shadowed the stdlib function name.
 */
void get_token(lexer_t* lexer, tok_t* token) {
    // Keep at least GOT_ONE_TOKEN_BUF_SIZE readable bytes so the scanner
    // below can look ahead without per-character bounds checks.
    if (lexer->end_ptr - lexer->cur_ptr < GOT_ONE_TOKEN_BUF_SIZE) {
        flush_buffer(lexer);
    }
    register unsigned char* peek = lexer->cur_ptr;
    // Fast-skip blanks (spaces and tabs).
    while (*peek == ' ' || *peek == '\t') {
        if (peek == lexer->end_ptr) {
            break;
        }
        peek++;
    }
    if (peek != lexer->cur_ptr) {
        // Report the skipped run as TOKEN_FLUSH.
        lexer->cur_ptr = peek;
        token->type = TOKEN_FLUSH;
    }
    tok_type_t tok = TOKEN_INIT;
    tok_val_t constant;
    constant.have = 0;
    // Dispatch on the first character (consumed here by *peek++).
    switch (*peek++) {
    case '=':
        switch (*peek++) {
        case '=': tok = TOKEN_EQ; break;
        default: peek--, tok = TOKEN_ASSIGN; break;
        } break;
    case '+':
        switch (*peek++) {
        case '+': tok = TOKEN_ADD_ADD; break;
        case '=': tok = TOKEN_ASSIGN_ADD; break;
        default: peek--, tok = TOKEN_ADD; break;
        } break;
    case '-':
        switch (*peek++) {
        case '-': tok = TOKEN_SUB_SUB; break;
        case '=': tok = TOKEN_ASSIGN_SUB; break;
        case '>': tok = TOKEN_DEREF; break;
        default: peek--, tok = TOKEN_SUB; break;
        } break;
    case '*':
        switch (*peek++) {
        case '=': tok = TOKEN_ASSIGN_MUL; break;
        default: peek--, tok = TOKEN_MUL; break;
        } break;
    case '/':
        switch (*peek++) {
        case '=': tok = TOKEN_ASSIGN_DIV; break;
        case '/': {
            // Line comment: consume through to the next newline.
            goto_newline(lexer);
            tok = TOKEN_LINE_COMMENT;
            goto END;
        }
        case '*': {
            // Block comment: consume through the closing "*/".
            lexer->cur_ptr = peek;
            goto_block_comment(lexer);
            tok = TOKEN_BLOCK_COMMENT;
            goto END;
        }
        default: peek--, tok = TOKEN_DIV; break;
        } break;
    case '%':
        switch (*peek++) {
        case '=': tok = TOKEN_ASSIGN_MOD; break;
        default: peek--, tok = TOKEN_MOD; break;
        } break;
    case '&':
        switch (*peek++) {
        case '&': tok = TOKEN_AND_AND; break;
        case '=': tok = TOKEN_ASSIGN_AND; break;
        default: peek--, tok = TOKEN_AND; break;
        } break;
    case '|':
        switch (*peek++) {
        case '|': tok = TOKEN_OR_OR; break;
        case '=': tok = TOKEN_ASSIGN_OR; break;
        default: peek--, tok = TOKEN_OR; break;
        } break;
    case '^':
        switch (*peek++) {
        case '=': tok = TOKEN_ASSIGN_XOR; break;
        default: peek--, tok = TOKEN_XOR; break;
        } break;
    case '<':
        switch (*peek++) {
        case '=': tok = TOKEN_LE; break;
        case '<': tok = (*peek == '=') ? (peek++, TOKEN_ASSIGN_L_SH) : TOKEN_L_SH; break;
        default: peek--, tok = TOKEN_LT; break;
        } break;
    case '>':
        switch (*peek++) {
        case '=': tok = TOKEN_GE; break;
        case '>': tok = (*peek == '=') ? (peek++, TOKEN_ASSIGN_R_SH) : TOKEN_R_SH; break;
        default: peek--, tok = TOKEN_GT; break;
        } break;
    case '~':
        tok = TOKEN_BIT_NOT; break;
    case '!':
        switch (*peek++) {
        case '=': tok = TOKEN_NEQ; break;
        default: peek--, tok = TOKEN_NOT; break;
        } break;
    case '[':
        tok = TOKEN_L_BRACKET; break;
    case ']':
        tok = TOKEN_R_BRACKET; break;
    case '(':
        tok = TOKEN_L_PAREN; break;
    case ')':
        tok = TOKEN_R_PAREN; break;
    case '{':
        tok = TOKEN_L_BRACE; break;
    case '}':
        tok = TOKEN_R_BRACE; break;
    case ';':
        tok = TOKEN_SEMICOLON; break;
    case ',':
        tok = TOKEN_COMMA; break;
    case ':':
        tok = TOKEN_COLON; break;
    case '.':
        if (peek[0] == '.' && peek[1] == '.') {
            peek += 2;
            tok = TOKEN_ELLIPSIS;
        } else {
            tok = TOKEN_DOT;
        }
        break;
    case '?':
        tok = TOKEN_COND; break;
    case '\v': case '\r': case '\f': // FIXME it parse as a blank character
        tok = TOKEN_FLUSH; break;
    case '\n':
        // Newlines are consumed as flush tokens so the line count stays right.
        lexer->line++;
        tok = TOKEN_FLUSH; break;
    case '#':
        LEX_WARN("Marroc does not support in lexer rather in preprocessor, it will be ignored");
        goto_newline(lexer);
        tok = TOKEN_FLUSH;
        goto END;
    case '\0':
        // EOF sentinel; cur_ptr is deliberately not advanced, so repeated
        // calls keep returning TOKEN_EOF.
        tok = TOKEN_EOF;
        goto END;
    case '\'':
        return parse_char_literal(lexer, token);
    case '"':
        return parse_string_literal(lexer, token);
    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
        return parse_number(lexer, token);
    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
    case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
    case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
    case 's': case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z':
    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
    case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
    case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
    case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':case 'Y': case 'Z':
    case '_':
        // TOKEN_IDENT.  Wide literals (L'x' / L"s") are unsupported: the
        // first identifier char is at cur_ptr, peek points one past it.
        if (lexer->cur_ptr[0] == 'L' && (*peek == '\'' || *peek == '"')) {
            LEX_ERROR("unsupport wide-character char literal by `L` format");
        }
        // Consume the remaining identifier characters.
        while (1) {
            if (peek == lexer->end_ptr) {
                LEX_ERROR("unsupport outof 64 length identifier");
            }
            if ((*peek >= 'a' && *peek <= 'z') || (*peek >= 'A' && *peek <= 'Z') ||
                (*peek == '_') || (*peek >= '0' && *peek <= '9')) {
                peek++;
                continue;
            }
            break;
        }
        int res = keyword_cmp((const char*)lexer->cur_ptr, peek - (lexer->cur_ptr));
        if (res == -1) {
            // Not a keyword: copy the identifier text out of the buffer.
            int id_len = peek - lexer->cur_ptr;
            unsigned char* str = rt._malloc(id_len + 1);
            for (int i = 0; i < id_len; i++) {
                str[i] = lexer->cur_ptr[i];
            }
            str[id_len] = '\0';
            constant.have = 1;
            constant.str = (char*)str;
            tok = TOKEN_IDENT; break;
        } else {
            tok = keywords[res].tok; break;
        }
    default:
        LEX_ERROR("unsupport char in sourse code `%c`", *(lexer->cur_ptr));
        break;
    }
    lexer->cur_ptr = peek;
END:
    token->val = constant;
    token->type = tok;
    LEX_DEBUG("get token `%s` (ch: %c, int: %d)", get_tok_name(token->type), token->val.ch, token->val.i);
}
// Return the next token the parser cares about, silently skipping the
// pseudo-tokens produced by get_token (whitespace flushes and comments).
void get_valid_token(lexer_t* lexer, tok_t* token) {
    for (;;) {
        get_token(lexer, token);
        tok_type_t t = token->type;
        if (t != TOKEN_FLUSH && t != TOKEN_LINE_COMMENT && t != TOKEN_BLOCK_COMMENT) {
            break;
        }
    }
}