Updated type names from `core_*` to `scc_*` across lex_parser and stream modules to maintain naming consistency within the SCC codebase. This includes changes to function signatures and internal usage of types like `core_probe_stream_t`, `core_pos_t`, and `cstring_t` to their `scc_*` counterparts.
483 lines
14 KiB
C
483 lines
14 KiB
C
/**
|
|
* 仿照LCCompiler的词法分析部分
|
|
*
|
|
* 如下为LCC的README in 2025.2
|
|
This hierarchy is the distribution for lcc version 4.2.
|
|
|
|
lcc version 3.x is described in the book "A Retargetable C Compiler:
|
|
Design and Implementation" (Addison-Wesley, 1995, ISBN 0-8053-1670-1).
|
|
There are significant differences between 3.x and 4.x, most notably in
|
|
the intermediate code. For details, see
|
|
https://drh.github.io/lcc/documents/interface4.pdf.
|
|
|
|
VERSION 4.2 IS INCOMPATIBLE WITH EARLIER VERSIONS OF LCC. DO NOT
|
|
UNLOAD THIS DISTRIBUTION ON TOP OF A 3.X DISTRIBUTION.
|
|
|
|
LCC is a C89 ("ANSI C") compiler designed to be highly retargetable.
|
|
|
|
LOG describes the changes since the last release.
|
|
|
|
CPYRIGHT describes the conditions under you can use, copy, modify, and
|
|
distribute lcc or works derived from lcc.
|
|
|
|
doc/install.html is an HTML file that gives a complete description of
|
|
the distribution and installation instructions.
|
|
|
|
Chris Fraser / cwf@aya.yale.edu
|
|
David Hanson / drh@drhanson.net
|
|
*/
|
|
#include <lex_parser.h>
|
|
#include <lexer.h>
|
|
#include <lexer_log.h>
|
|
|
|
static const struct {
|
|
const char *name;
|
|
scc_cstd_t std_type;
|
|
scc_tok_type_t tok;
|
|
} keywords[] = {
|
|
#define X(name, subtype, tok, std_type, ...) {#name, std_type, tok},
|
|
SCC_CKEYWORD_TABLE
|
|
#undef X
|
|
};
|
|
|
|
// by using binary search to find the keyword
|
|
static inline int keyword_cmp(const char *name, int len) {
|
|
int low = 0;
|
|
int high = sizeof(keywords) / sizeof(keywords[0]) - 1;
|
|
while (low <= high) {
|
|
int mid = (low + high) / 2;
|
|
const char *key = keywords[mid].name;
|
|
int cmp = 0;
|
|
|
|
// 自定义字符串比较逻辑
|
|
for (int i = 0; i < len; i++) {
|
|
if (name[i] != key[i]) {
|
|
cmp = (unsigned char)name[i] - (unsigned char)key[i];
|
|
break;
|
|
}
|
|
if (name[i] == '\0')
|
|
break; // 遇到终止符提前结束
|
|
}
|
|
|
|
if (cmp == 0) {
|
|
// 完全匹配检查(长度相同)
|
|
if (key[len] == '\0')
|
|
return mid;
|
|
cmp = -1; // 当前关键词比输入长
|
|
}
|
|
|
|
if (cmp < 0) {
|
|
high = mid - 1;
|
|
} else {
|
|
low = mid + 1;
|
|
}
|
|
}
|
|
return -1; // Not a keyword.
|
|
}
|
|
|
|
/**
 * Prepare a lexer to read tokens from `stream`.
 *
 * The position is reset with scc_pos_init() and its `name` field is built
 * from the stream's name.
 *
 * @param lexer  lexer state to initialize
 * @param stream probe stream that the lexer will read from
 */
void scc_lexer_init(scc_lexer_t *lexer, scc_probe_stream_t *stream) {
    // Start from a fresh position, then label it with the stream's name.
    lexer->pos = scc_pos_init();
    // FIXME
    lexer->pos.name =
        scc_cstring_from_cstr(scc_cstring_as_cstr(&stream->name));

    lexer->stream = stream;
}
|
|
|
|
#define set_err_token(token) ((token)->type = SCC_TOK_UNKNOWN)
|
|
|
|
static void parse_line(scc_lexer_t *lexer, lexer_tok_t *token) {
|
|
token->loc = lexer->pos;
|
|
scc_probe_stream_t *stream = lexer->stream;
|
|
scc_probe_stream_reset(stream);
|
|
int ch = scc_probe_stream_next(stream);
|
|
|
|
usize n;
|
|
scc_cstring_t str = scc_cstring_new();
|
|
|
|
if (ch == core_stream_eof) {
|
|
LEX_WARN("Unexpected EOF at begin");
|
|
goto ERR;
|
|
} else if (ch != '#') {
|
|
LEX_WARN("Unexpected character '%c' at begin", ch);
|
|
goto ERR;
|
|
}
|
|
|
|
const char line[] = "line";
|
|
|
|
for (int i = 0; i < (int)sizeof(line); i++) {
|
|
ch = scc_probe_stream_consume(stream);
|
|
core_pos_next(&lexer->pos);
|
|
if (ch != line[i]) {
|
|
LEX_WARN("Maroc does not support in lexer rather in preprocessor, "
|
|
"it will be ignored");
|
|
goto SKIP_LINE;
|
|
}
|
|
}
|
|
|
|
if (lex_parse_number(lexer->stream, &lexer->pos, &n) == false) {
|
|
LEX_ERROR("Invalid line number");
|
|
goto SKIP_LINE;
|
|
}
|
|
|
|
if (scc_probe_stream_consume(stream) != ' ') {
|
|
lex_parse_skip_line(lexer->stream, &lexer->pos);
|
|
token->loc.line = token->value.n;
|
|
}
|
|
|
|
if (scc_probe_stream_next(stream) != '"') {
|
|
LEX_ERROR("Invalid `#` line");
|
|
goto SKIP_LINE;
|
|
}
|
|
if (lex_parse_string(lexer->stream, &lexer->pos, &str) == false) {
|
|
LEX_ERROR("Invalid filename");
|
|
goto SKIP_LINE;
|
|
}
|
|
|
|
lex_parse_skip_line(lexer->stream, &lexer->pos);
|
|
token->loc.line = n;
|
|
// FIXME memory leak
|
|
token->loc.name = scc_cstring_from_cstr(scc_cstring_as_cstr(&str));
|
|
scc_cstring_free(&str);
|
|
return;
|
|
SKIP_LINE:
|
|
lex_parse_skip_line(lexer->stream, &lexer->pos);
|
|
ERR:
|
|
set_err_token(token);
|
|
scc_cstring_free(&str);
|
|
}
|
|
|
|
// /zh/c/language/operator_arithmetic.html
|
|
void scc_lexer_get_token(scc_lexer_t *lexer, lexer_tok_t *token) {
|
|
token->loc = lexer->pos;
|
|
token->type = SCC_TOK_UNKNOWN;
|
|
scc_probe_stream_t *stream = lexer->stream;
|
|
|
|
scc_probe_stream_reset(stream);
|
|
scc_tok_type_t type = SCC_TOK_UNKNOWN;
|
|
int ch = scc_probe_stream_next(stream);
|
|
|
|
// once step
|
|
switch (ch) {
|
|
case '=':
|
|
switch (scc_probe_stream_next(stream)) {
|
|
case '=':
|
|
type = SCC_TOK_EQ;
|
|
goto double_char;
|
|
default:
|
|
scc_probe_stream_reset(stream), type = SCC_TOK_ASSIGN;
|
|
break;
|
|
}
|
|
break;
|
|
case '+':
|
|
switch (scc_probe_stream_next(stream)) {
|
|
case '+':
|
|
type = SCC_TOK_ADD_ADD;
|
|
goto double_char;
|
|
case '=':
|
|
type = SCC_TOK_ASSIGN_ADD;
|
|
goto double_char;
|
|
default:
|
|
scc_probe_stream_reset(stream), type = SCC_TOK_ADD;
|
|
break;
|
|
}
|
|
break;
|
|
case '-':
|
|
switch (scc_probe_stream_next(stream)) {
|
|
case '-':
|
|
type = SCC_TOK_SUB_SUB;
|
|
goto double_char;
|
|
case '=':
|
|
type = SCC_TOK_ASSIGN_SUB;
|
|
goto double_char;
|
|
case '>':
|
|
type = SCC_TOK_DEREF;
|
|
goto double_char;
|
|
default:
|
|
scc_probe_stream_reset(stream), type = SCC_TOK_SUB;
|
|
break;
|
|
}
|
|
break;
|
|
case '*':
|
|
switch (scc_probe_stream_next(stream)) {
|
|
case '=':
|
|
type = SCC_TOK_ASSIGN_MUL;
|
|
goto double_char;
|
|
default:
|
|
scc_probe_stream_reset(stream), type = SCC_TOK_MUL;
|
|
break;
|
|
}
|
|
break;
|
|
case '/':
|
|
switch (scc_probe_stream_next(stream)) {
|
|
case '=':
|
|
type = SCC_TOK_ASSIGN_DIV;
|
|
goto double_char;
|
|
case '/':
|
|
lex_parse_skip_line(lexer->stream, &lexer->pos);
|
|
token->type = SCC_TOK_LINE_COMMENT;
|
|
goto END;
|
|
case '*':
|
|
lex_parse_skip_block_comment(lexer->stream, &lexer->pos);
|
|
token->type = SCC_TOK_BLOCK_COMMENT;
|
|
goto END;
|
|
default:
|
|
scc_probe_stream_reset(stream), type = SCC_TOK_DIV;
|
|
break;
|
|
}
|
|
break;
|
|
case '%':
|
|
switch (scc_probe_stream_next(stream)) {
|
|
case '=':
|
|
type = SCC_TOK_ASSIGN_MOD;
|
|
goto double_char;
|
|
default:
|
|
scc_probe_stream_reset(stream), type = SCC_TOK_MOD;
|
|
break;
|
|
}
|
|
break;
|
|
case '&':
|
|
switch (scc_probe_stream_next(stream)) {
|
|
case '&':
|
|
type = SCC_TOK_AND_AND;
|
|
goto double_char;
|
|
case '=':
|
|
type = SCC_TOK_ASSIGN_AND;
|
|
goto double_char;
|
|
default:
|
|
scc_probe_stream_reset(stream), type = SCC_TOK_AND;
|
|
break;
|
|
}
|
|
break;
|
|
case '|':
|
|
switch (scc_probe_stream_next(stream)) {
|
|
case '|':
|
|
type = SCC_TOK_OR_OR;
|
|
goto double_char;
|
|
case '=':
|
|
type = SCC_TOK_ASSIGN_OR;
|
|
goto double_char;
|
|
default:
|
|
scc_probe_stream_reset(stream), type = SCC_TOK_OR;
|
|
break;
|
|
}
|
|
break;
|
|
case '^':
|
|
switch (scc_probe_stream_next(stream)) {
|
|
case '=':
|
|
type = SCC_TOK_ASSIGN_XOR;
|
|
goto double_char;
|
|
default:
|
|
scc_probe_stream_reset(stream), type = SCC_TOK_XOR;
|
|
break;
|
|
}
|
|
break;
|
|
case '<':
|
|
switch (scc_probe_stream_next(stream)) {
|
|
case '=':
|
|
type = SCC_TOK_LE;
|
|
goto double_char;
|
|
case '<': {
|
|
if (scc_probe_stream_next(stream) == '=') {
|
|
type = SCC_TOK_ASSIGN_L_SH;
|
|
goto triple_char;
|
|
} else {
|
|
type = SCC_TOK_L_SH;
|
|
goto double_char;
|
|
}
|
|
break;
|
|
}
|
|
default:
|
|
scc_probe_stream_reset(stream), type = SCC_TOK_LT;
|
|
break;
|
|
}
|
|
break;
|
|
case '>':
|
|
switch (scc_probe_stream_next(stream)) {
|
|
case '=':
|
|
type = SCC_TOK_GE;
|
|
goto double_char;
|
|
case '>': {
|
|
if (scc_probe_stream_next(stream) == '=') {
|
|
type = SCC_TOK_ASSIGN_R_SH;
|
|
goto triple_char;
|
|
} else {
|
|
type = SCC_TOK_R_SH;
|
|
goto double_char;
|
|
}
|
|
break;
|
|
}
|
|
default:
|
|
scc_probe_stream_reset(stream), type = SCC_TOK_GT;
|
|
break;
|
|
}
|
|
break;
|
|
case '~':
|
|
type = SCC_TOK_BIT_NOT;
|
|
break;
|
|
case '!':
|
|
switch (scc_probe_stream_next(stream)) {
|
|
case '=':
|
|
type = SCC_TOK_NEQ;
|
|
goto double_char;
|
|
default:
|
|
scc_probe_stream_reset(stream), type = SCC_TOK_NOT;
|
|
break;
|
|
}
|
|
break;
|
|
case '[':
|
|
type = SCC_TOK_L_BRACKET;
|
|
break;
|
|
case ']':
|
|
type = SCC_TOK_R_BRACKET;
|
|
break;
|
|
case '(':
|
|
type = SCC_TOK_L_PAREN;
|
|
break;
|
|
case ')':
|
|
type = SCC_TOK_R_PAREN;
|
|
break;
|
|
case '{':
|
|
type = SCC_TOK_L_BRACE;
|
|
break;
|
|
case '}':
|
|
type = SCC_TOK_R_BRACE;
|
|
break;
|
|
case ';':
|
|
type = SCC_TOK_SEMICOLON;
|
|
break;
|
|
case ',':
|
|
type = SCC_TOK_COMMA;
|
|
break;
|
|
case ':':
|
|
type = SCC_TOK_COLON;
|
|
break;
|
|
case '.':
|
|
if (scc_probe_stream_next(stream) == '.' &&
|
|
scc_probe_stream_next(stream) == '.') {
|
|
type = SCC_TOK_ELLIPSIS;
|
|
goto triple_char;
|
|
}
|
|
type = SCC_TOK_DOT;
|
|
break;
|
|
case '?':
|
|
type = SCC_TOK_COND;
|
|
break;
|
|
case '\v':
|
|
case '\f':
|
|
case ' ':
|
|
case '\t':
|
|
type = SCC_TOK_BLANK;
|
|
break;
|
|
case '\r':
|
|
case '\n':
|
|
lex_parse_skip_endline(lexer->stream, &lexer->pos);
|
|
token->type = SCC_TOK_BLANK;
|
|
goto END;
|
|
case '#':
|
|
parse_line(lexer, token);
|
|
token->type = SCC_TOK_BLANK;
|
|
goto END;
|
|
case '\0':
|
|
case core_stream_eof:
|
|
// EOF
|
|
type = SCC_TOK_EOF;
|
|
break;
|
|
case '\'': {
|
|
token->loc = lexer->pos;
|
|
token->type = SCC_TOK_CHAR_LITERAL;
|
|
int ch = lex_parse_char(lexer->stream, &lexer->pos);
|
|
if (ch == core_stream_eof) {
|
|
LEX_ERROR("Unexpected character literal");
|
|
token->type = SCC_TOK_UNKNOWN;
|
|
} else {
|
|
token->value.ch = ch;
|
|
}
|
|
goto END;
|
|
}
|
|
case '"': {
|
|
token->loc = lexer->pos;
|
|
token->type = SCC_TOK_STRING_LITERAL;
|
|
scc_cstring_t output = scc_cstring_new();
|
|
if (lex_parse_string(lexer->stream, &lexer->pos, &output) == true) {
|
|
token->value.cstr.data = scc_cstring_as_cstr(&output);
|
|
token->value.cstr.len = scc_cstring_len(&output);
|
|
} else {
|
|
LEX_ERROR("Unexpected string literal");
|
|
token->type = SCC_TOK_UNKNOWN;
|
|
}
|
|
|
|
goto END;
|
|
}
|
|
/* clang-format off */
|
|
case '0': case '1': case '2': case '3': case '4':
|
|
case '5': case '6': case '7': case '8': case '9':
|
|
/* clang-format on */
|
|
token->loc = lexer->pos;
|
|
token->type = SCC_TOK_INT_LITERAL;
|
|
usize output;
|
|
if (lex_parse_number(lexer->stream, &lexer->pos, &output) == true) {
|
|
token->value.n = output;
|
|
} else {
|
|
LEX_ERROR("Unexpected number literal");
|
|
token->type = SCC_TOK_UNKNOWN;
|
|
}
|
|
goto END;
|
|
/* clang-format off */
|
|
case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
|
|
case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
|
|
case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
|
|
case 'v': case 'w': case 'x': case 'y': case 'z':
|
|
case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
|
|
case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
|
|
case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
|
|
case 'V': case 'W': case 'X': case 'Y': case 'Z': case '_':
|
|
/* clang-format on */
|
|
scc_cstring_t str = scc_cstring_new();
|
|
cbool ret = lex_parse_identifier(lexer->stream, &lexer->pos, &str);
|
|
Assert(ret == true);
|
|
|
|
int res = keyword_cmp(scc_cstring_as_cstr(&str), scc_cstring_len(&str));
|
|
if (res == -1) {
|
|
token->value.cstr.data = (char *)scc_cstring_as_cstr(&str);
|
|
token->value.cstr.len = scc_cstring_len(&str);
|
|
type = SCC_TOK_IDENT;
|
|
} else {
|
|
scc_cstring_free(&str);
|
|
type = keywords[res].tok;
|
|
}
|
|
token->type = type;
|
|
goto END;
|
|
default:
|
|
LEX_ERROR("unsupport char in sourse code `%c`", ch);
|
|
break;
|
|
}
|
|
goto once_char;
|
|
triple_char:
|
|
scc_probe_stream_consume(stream);
|
|
core_pos_next(&lexer->pos);
|
|
double_char:
|
|
scc_probe_stream_consume(stream);
|
|
core_pos_next(&lexer->pos);
|
|
once_char:
|
|
scc_probe_stream_consume(stream);
|
|
core_pos_next(&lexer->pos);
|
|
token->type = type;
|
|
END:
|
|
LEX_DEBUG("get token `%s` in %s:%d:%d", scc_get_tok_name(token->type),
|
|
token->loc.name, token->loc.line, token->loc.column);
|
|
}
|
|
|
|
// scc_lexer_get_token maybe got invalid (with parser)
|
|
void scc_lexer_get_valid_token(scc_lexer_t *lexer, lexer_tok_t *token) {
|
|
scc_tok_subtype_t type;
|
|
do {
|
|
scc_lexer_get_token(lexer, token);
|
|
type = scc_get_tok_subtype(token->type);
|
|
AssertFmt(type != SCC_TOK_SUBTYPE_INVALID,
|
|
"Invalid token: `%s` at %s:%d:%d",
|
|
scc_get_tok_name(token->type), token->loc.name,
|
|
token->loc.line, token->loc.col);
|
|
Assert(type != SCC_TOK_SUBTYPE_INVALID);
|
|
} while (type == SCC_TOK_SUBTYPE_EMPTYSPACE ||
|
|
type == SCC_TOK_SUBTYPE_COMMENT);
|
|
}
|