/** * 仿照LCCompiler的词法分析部分 * * 如下为LCC的README in 2025.2 This hierarchy is the distribution for lcc version 4.2. lcc version 3.x is described in the book "A Retargetable C Compiler: Design and Implementation" (Addison-Wesley, 1995, ISBN 0-8053-1670-1). There are significant differences between 3.x and 4.x, most notably in the intermediate code. For details, see https://drh.github.io/lcc/documents/interface4.pdf. VERSION 4.2 IS INCOMPATIBLE WITH EARLIER VERSIONS OF LCC. DO NOT UNLOAD THIS DISTRIBUTION ON TOP OF A 3.X DISTRIBUTION. LCC is a C89 ("ANSI C") compiler designed to be highly retargetable. LOG describes the changes since the last release. CPYRIGHT describes the conditions under you can use, copy, modify, and distribute lcc or works derived from lcc. doc/install.html is an HTML file that gives a complete description of the distribution and installation instructions. Chris Fraser / cwf@aya.yale.edu David Hanson / drh@drhanson.net */ #include #include #include static const struct { const char *name; scc_cstd_t std_type; scc_tok_type_t tok; } keywords[] = { #define X(name, subtype, tok, std_type, ...) {#name, std_type, tok}, SCC_CKEYWORD_TABLE #undef X }; // by using binary search to find the keyword static inline int keyword_cmp(const char *name, int len) { int low = 0; int high = sizeof(keywords) / sizeof(keywords[0]) - 1; while (low <= high) { int mid = (low + high) / 2; const char *key = keywords[mid].name; int cmp = 0; // 自定义字符串比较逻辑 for (int i = 0; i < len; i++) { if (name[i] != key[i]) { cmp = (unsigned char)name[i] - (unsigned char)key[i]; break; } if (name[i] == '\0') break; // 遇到终止符提前结束 } if (cmp == 0) { // 完全匹配检查(长度相同) if (key[len] == '\0') return mid; cmp = -1; // 当前关键词比输入长 } if (cmp < 0) { high = mid - 1; } else { low = mid + 1; } } return -1; // Not a keyword. } void scc_lexer_init(scc_lexer_t *lexer, scc_probe_stream_t *stream) { lexer->stream = stream; lexer->pos = scc_pos_init(); // FIXME lexer->pos.name = scc_cstring_from_cstr(scc_cstring_as_cstr(&stream->name)); } #define set_err_token(token) ((token)->type = SCC_TOK_UNKNOWN) static void parse_line(scc_lexer_t *lexer, lexer_tok_t *token) { token->loc = lexer->pos; scc_probe_stream_t *stream = lexer->stream; scc_probe_stream_reset(stream); int ch = scc_probe_stream_next(stream); usize n; scc_cstring_t str = scc_cstring_new(); if (ch == core_stream_eof) { LEX_WARN("Unexpected EOF at begin"); goto ERR; } else if (ch != '#') { LEX_WARN("Unexpected character '%c' at begin", ch); goto ERR; } const char line[] = "line"; for (int i = 0; i < (int)sizeof(line); i++) { ch = scc_probe_stream_consume(stream); core_pos_next(&lexer->pos); if (ch != line[i]) { LEX_WARN("Maroc does not support in lexer rather in preprocessor, " "it will be ignored"); goto SKIP_LINE; } } if (lex_parse_number(lexer->stream, &lexer->pos, &n) == false) { LEX_ERROR("Invalid line number"); goto SKIP_LINE; } if (scc_probe_stream_consume(stream) != ' ') { lex_parse_skip_line(lexer->stream, &lexer->pos); token->loc.line = token->value.n; } if (scc_probe_stream_next(stream) != '"') { LEX_ERROR("Invalid `#` line"); goto SKIP_LINE; } if (lex_parse_string(lexer->stream, &lexer->pos, &str) == false) { LEX_ERROR("Invalid filename"); goto SKIP_LINE; } lex_parse_skip_line(lexer->stream, &lexer->pos); token->loc.line = n; // FIXME memory leak token->loc.name = scc_cstring_from_cstr(scc_cstring_as_cstr(&str)); scc_cstring_free(&str); return; SKIP_LINE: lex_parse_skip_line(lexer->stream, &lexer->pos); ERR: set_err_token(token); scc_cstring_free(&str); } // /zh/c/language/operator_arithmetic.html void scc_lexer_get_token(scc_lexer_t *lexer, lexer_tok_t *token) { token->loc = lexer->pos; token->type = SCC_TOK_UNKNOWN; scc_probe_stream_t *stream = lexer->stream; scc_probe_stream_reset(stream); scc_tok_type_t type = SCC_TOK_UNKNOWN; int ch = scc_probe_stream_next(stream); // once step switch (ch) { case '=': switch (scc_probe_stream_next(stream)) { case '=': type = SCC_TOK_EQ; goto double_char; default: scc_probe_stream_reset(stream), type = SCC_TOK_ASSIGN; break; } break; case '+': switch (scc_probe_stream_next(stream)) { case '+': type = SCC_TOK_ADD_ADD; goto double_char; case '=': type = SCC_TOK_ASSIGN_ADD; goto double_char; default: scc_probe_stream_reset(stream), type = SCC_TOK_ADD; break; } break; case '-': switch (scc_probe_stream_next(stream)) { case '-': type = SCC_TOK_SUB_SUB; goto double_char; case '=': type = SCC_TOK_ASSIGN_SUB; goto double_char; case '>': type = SCC_TOK_DEREF; goto double_char; default: scc_probe_stream_reset(stream), type = SCC_TOK_SUB; break; } break; case '*': switch (scc_probe_stream_next(stream)) { case '=': type = SCC_TOK_ASSIGN_MUL; goto double_char; default: scc_probe_stream_reset(stream), type = SCC_TOK_MUL; break; } break; case '/': switch (scc_probe_stream_next(stream)) { case '=': type = SCC_TOK_ASSIGN_DIV; goto double_char; case '/': lex_parse_skip_line(lexer->stream, &lexer->pos); token->type = SCC_TOK_LINE_COMMENT; goto END; case '*': lex_parse_skip_block_comment(lexer->stream, &lexer->pos); token->type = SCC_TOK_BLOCK_COMMENT; goto END; default: scc_probe_stream_reset(stream), type = SCC_TOK_DIV; break; } break; case '%': switch (scc_probe_stream_next(stream)) { case '=': type = SCC_TOK_ASSIGN_MOD; goto double_char; default: scc_probe_stream_reset(stream), type = SCC_TOK_MOD; break; } break; case '&': switch (scc_probe_stream_next(stream)) { case '&': type = SCC_TOK_AND_AND; goto double_char; case '=': type = SCC_TOK_ASSIGN_AND; goto double_char; default: scc_probe_stream_reset(stream), type = SCC_TOK_AND; break; } break; case '|': switch (scc_probe_stream_next(stream)) { case '|': type = SCC_TOK_OR_OR; goto double_char; case '=': type = SCC_TOK_ASSIGN_OR; goto double_char; default: scc_probe_stream_reset(stream), type = SCC_TOK_OR; break; } break; case '^': switch (scc_probe_stream_next(stream)) { case '=': type = SCC_TOK_ASSIGN_XOR; goto double_char; default: scc_probe_stream_reset(stream), type = SCC_TOK_XOR; break; } break; case '<': switch (scc_probe_stream_next(stream)) { case '=': type = SCC_TOK_LE; goto double_char; case '<': { if (scc_probe_stream_next(stream) == '=') { type = SCC_TOK_ASSIGN_L_SH; goto triple_char; } else { type = SCC_TOK_L_SH; goto double_char; } break; } default: scc_probe_stream_reset(stream), type = SCC_TOK_LT; break; } break; case '>': switch (scc_probe_stream_next(stream)) { case '=': type = SCC_TOK_GE; goto double_char; case '>': { if (scc_probe_stream_next(stream) == '=') { type = SCC_TOK_ASSIGN_R_SH; goto triple_char; } else { type = SCC_TOK_R_SH; goto double_char; } break; } default: scc_probe_stream_reset(stream), type = SCC_TOK_GT; break; } break; case '~': type = SCC_TOK_BIT_NOT; break; case '!': switch (scc_probe_stream_next(stream)) { case '=': type = SCC_TOK_NEQ; goto double_char; default: scc_probe_stream_reset(stream), type = SCC_TOK_NOT; break; } break; case '[': type = SCC_TOK_L_BRACKET; break; case ']': type = SCC_TOK_R_BRACKET; break; case '(': type = SCC_TOK_L_PAREN; break; case ')': type = SCC_TOK_R_PAREN; break; case '{': type = SCC_TOK_L_BRACE; break; case '}': type = SCC_TOK_R_BRACE; break; case ';': type = SCC_TOK_SEMICOLON; break; case ',': type = SCC_TOK_COMMA; break; case ':': type = SCC_TOK_COLON; break; case '.': if (scc_probe_stream_next(stream) == '.' && scc_probe_stream_next(stream) == '.') { type = SCC_TOK_ELLIPSIS; goto triple_char; } type = SCC_TOK_DOT; break; case '?': type = SCC_TOK_COND; break; case '\v': case '\f': case ' ': case '\t': type = SCC_TOK_BLANK; break; case '\r': case '\n': lex_parse_skip_endline(lexer->stream, &lexer->pos); token->type = SCC_TOK_BLANK; goto END; case '#': parse_line(lexer, token); token->type = SCC_TOK_BLANK; goto END; case '\0': case core_stream_eof: // EOF type = SCC_TOK_EOF; break; case '\'': { token->loc = lexer->pos; token->type = SCC_TOK_CHAR_LITERAL; int ch = lex_parse_char(lexer->stream, &lexer->pos); if (ch == core_stream_eof) { LEX_ERROR("Unexpected character literal"); token->type = SCC_TOK_UNKNOWN; } else { token->value.ch = ch; } goto END; } case '"': { token->loc = lexer->pos; token->type = SCC_TOK_STRING_LITERAL; scc_cstring_t output = scc_cstring_new(); if (lex_parse_string(lexer->stream, &lexer->pos, &output) == true) { token->value.cstr.data = scc_cstring_as_cstr(&output); token->value.cstr.len = scc_cstring_len(&output); } else { LEX_ERROR("Unexpected string literal"); token->type = SCC_TOK_UNKNOWN; } goto END; } /* clang-format off */ case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': /* clang-format on */ token->loc = lexer->pos; token->type = SCC_TOK_INT_LITERAL; usize output; if (lex_parse_number(lexer->stream, &lexer->pos, &output) == true) { token->value.n = output; } else { LEX_ERROR("Unexpected number literal"); token->type = SCC_TOK_UNKNOWN; } goto END; /* clang-format off */ case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z': case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z': case '_': /* clang-format on */ scc_cstring_t str = scc_cstring_new(); cbool ret = lex_parse_identifier(lexer->stream, &lexer->pos, &str); Assert(ret == true); int res = keyword_cmp(scc_cstring_as_cstr(&str), scc_cstring_len(&str)); if (res == -1) { token->value.cstr.data = (char *)scc_cstring_as_cstr(&str); token->value.cstr.len = scc_cstring_len(&str); type = SCC_TOK_IDENT; } else { scc_cstring_free(&str); type = keywords[res].tok; } token->type = type; goto END; default: LEX_ERROR("unsupport char in sourse code `%c`", ch); break; } goto once_char; triple_char: scc_probe_stream_consume(stream); core_pos_next(&lexer->pos); double_char: scc_probe_stream_consume(stream); core_pos_next(&lexer->pos); once_char: scc_probe_stream_consume(stream); core_pos_next(&lexer->pos); token->type = type; END: LEX_DEBUG("get token `%s` in %s:%d:%d", scc_get_tok_name(token->type), token->loc.name, token->loc.line, token->loc.column); } // scc_lexer_get_token maybe got invalid (with parser) void scc_lexer_get_valid_token(scc_lexer_t *lexer, lexer_tok_t *token) { scc_tok_subtype_t type; do { scc_lexer_get_token(lexer, token); type = scc_get_tok_subtype(token->type); AssertFmt(type != SCC_TOK_SUBTYPE_INVALID, "Invalid token: `%s` at %s:%d:%d", scc_get_tok_name(token->type), token->loc.name, token->loc.line, token->loc.col); Assert(type != SCC_TOK_SUBTYPE_INVALID); } while (type == SCC_TOK_SUBTYPE_EMPTYSPACE || type == SCC_TOK_SUBTYPE_COMMENT); }