#include #include static const struct { const char *name; scc_cstd_t std_type; scc_tok_type_t tok_type; } keywords[] = { #define X(name, subtype, tok, std_type, ...) {#name, std_type, tok}, SCC_CKEYWORD_TABLE #undef X }; // by using binary search to find the keyword static int keyword_cmp(const char *name, int len) { int low = 0; int high = sizeof(keywords) / sizeof(keywords[0]) - 1; while (low <= high) { int mid = (low + high) / 2; const char *key = keywords[mid].name; int cmp = 0; for (int i = 0; i < len; i++) { if (name[i] != key[i]) { cmp = (unsigned char)name[i] - (unsigned char)key[i]; break; } if (name[i] == '\0') break; } if (cmp == 0) { if (key[len] == '\0') return mid; cmp = -1; } if (cmp < 0) high = mid - 1; else low = mid + 1; } return -1; // 不是关键字 } void scc_lexer_init(scc_lexer_t *lexer, scc_sstream_ring_t *stream_ref) { lexer->stream_ref = stream_ref; lexer->ring_ref_count = 0; lexer->jump_macro = false; } static inline cbool is_whitespace(int ch) { return ch == ' ' || ch == '\t' || ch == '\v' || ch == '\f'; } static inline cbool is_newline(int ch) { return ch == '\n' || ch == '\r'; } static inline cbool is_digit(int ch) { return ch >= '0' && ch <= '9'; } static inline cbool is_alpha(int ch) { return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z'); } static inline cbool is_alnum(int ch) { return is_alpha(ch) || is_digit(ch); } static inline cbool is_identifier_start(int ch) { return is_alpha(ch) || ch == '_'; } static inline cbool is_identifier_part(int ch) { return is_alnum(ch) || ch == '_'; } static inline cbool is_octal_digit(int ch) { return ch >= '0' && ch <= '7'; } static inline cbool is_hex_digit(int ch) { return is_digit(ch) || (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F'); } /* 从环形缓冲区预览一个字符(带EOF检测) */ static inline cbool peek_char(scc_lexer_t *lexer, scc_sstream_char_t *out) { cbool ok; scc_ring_peek(*lexer->stream_ref, *out, ok); return ok; } /* 从环形缓冲区消费一个字符,并将它追加到lexeme中 */ static inline cbool next_char(scc_lexer_t *lexer, scc_cstring_t *lexeme, scc_sstream_char_t *out) { cbool ok; scc_ring_next(*lexer->stream_ref, *out, ok); if (!ok) return false; scc_cstring_append_ch(lexeme, out->character); return true; } #define set_err_token(token) ((token)->type = SCC_TOK_UNKNOWN) void scc_lexer_get_token(scc_lexer_t *lexer, scc_lexer_tok_t *token) { scc_sstream_char_t cur = {0}; scc_cstring_t lex = scc_cstring_create(); // 临时lexeme // 尝试预览第一个字符 if (!peek_char(lexer, &cur)) { token->type = SCC_TOK_EOF; token->loc = (scc_pos_t){0, 1, 1, 0}; // 默认位置 token->lexeme = lex; // 空字符串 return; } // 记录起始位置 scc_pos_t start_loc = cur.pos; int ch = cur.character; // once step if (is_whitespace(ch)) { // 空白符: 连续收集 token->type = SCC_TOK_BLANK; while (peek_char(lexer, &cur) && is_whitespace(cur.character)) { next_char(lexer, &lex, &cur); } } else if (is_newline(ch)) { // 换行符:处理 \r 或 \n,以及 \r\n 组合 token->type = SCC_TOK_ENDLINE; next_char(lexer, &lex, &cur); // 消费第一个字符 if (ch == '\r') { // 尝试消费后面的 \n if (peek_char(lexer, &cur) && cur.character == '\n') { next_char(lexer, &lex, &cur); } } } else if (ch == '/') { // 可能为注释或除号 scc_sstream_char_t next = {0}; next_char(lexer, &lex, &cur); // 消费 '/' peek_char(lexer, &next); if (next.character == '=') { token->type = SCC_TOK_ASSIGN_DIV; next_char(lexer, &lex, &cur); } else if (next.character == '/') { // 行注释 // token->type = SCC_TOK_LINE_COMMENT; next_char(lexer, &lex, &cur); // 消费 '/' while (peek_char(lexer, &cur) && !is_newline(cur.character)) { next_char(lexer, &lex, &cur); scc_ring_consume(*lexer->stream_ref); } // 注释结束,不包含换行符(换行符单独成token) } else if (next.character == '*') { // 块注释 /* token->type = SCC_TOK_BLOCK_COMMENT; next_char(lexer, &lex, &cur); // 消费 '*' while (1) { if (!next_char(lexer, &lex, &cur)) { // 文件结束,注释未闭合 LOG_ERROR("Unterminated block comment"); break; } if (cur.character == '*' && peek_char(lexer, &next) && next.character == '/') { next_char(lexer, &lex, &cur); // 消费 '/' break; } scc_ring_consume(*lexer->stream_ref); } } else { // 只是除号 / token->type = SCC_TOK_DIV; } } else if (is_identifier_start(ch)) { // 标识符或关键字 token->type = SCC_TOK_IDENT; // 暂定 while (peek_char(lexer, &cur) && is_identifier_part(cur.character)) { next_char(lexer, &lex, &cur); scc_ring_consume(*lexer->stream_ref); } // 检查是否为关键字 int idx = keyword_cmp(scc_cstring_as_cstr(&lex), scc_cstring_len(&lex)); if (idx != -1) { token->type = keywords[idx].tok_type; } } else if (is_digit(ch)) { // 数字字面量(整数/浮点) token->type = SCC_TOK_INT_LITERAL; // 先假定整数 cbool maybe_float = false; while (1) { next_char(lexer, &lex, &cur); // 消费当前数字 if (!peek_char(lexer, &cur)) break; ch = cur.character; if (is_digit(ch) || (ch == '.' && !maybe_float)) { if (ch == '.') maybe_float = true; continue; } if (ch == 'e' || ch == 'E' || ch == 'p' || ch == 'P') { maybe_float = true; // 后面可能跟符号或数字 continue; } if (ch == 'x' || ch == 'X') { // 十六进制前缀,需特殊处理 // 这里简化:将整个序列作为整数(保留前缀) continue; } break; } if (maybe_float) token->type = SCC_TOK_FLOAT_LITERAL; } else if (ch == '\'') { // 字符字面量 token->type = SCC_TOK_CHAR_LITERAL; next_char(lexer, &lex, &cur); // 开头的 ' while (1) { if (!peek_char(lexer, &cur)) { LOG_ERROR("Unterminated character literal"); break; } if (cur.character == '\'') { next_char(lexer, &lex, &cur); // 闭引号 break; } if (cur.character == '\\') { // 转义序列:原样保存反斜杠和下一个字符 next_char(lexer, &lex, &cur); if (!peek_char(lexer, &cur)) break; next_char(lexer, &lex, &cur); } else { next_char(lexer, &lex, &cur); } } } else if (ch == '"') { // 字符串字面量 token->type = SCC_TOK_STRING_LITERAL; next_char(lexer, &lex, &cur); // 开头的 " while (1) { if (!peek_char(lexer, &cur)) { LOG_ERROR("Unterminated string literal"); break; } if (cur.character == '"') { next_char(lexer, &lex, &cur); // 闭引号 break; } if (cur.character == '\\') { // 转义序列 next_char(lexer, &lex, &cur); if (!peek_char(lexer, &cur)) break; next_char(lexer, &lex, &cur); } else { next_char(lexer, &lex, &cur); } scc_ring_consume(*lexer->stream_ref); } } else { scc_sstream_char_t next = {0}; next_char(lexer, &lex, &cur); peek_char(lexer, &next); switch (ch) { case '=': switch (next.character) { case '=': token->type = SCC_TOK_EQ; next_char(lexer, &lex, &cur); break; default: token->type = SCC_TOK_ASSIGN; break; } break; case '+': switch (next.character) { case '+': token->type = SCC_TOK_ADD_ADD; next_char(lexer, &lex, &cur); break; case '=': token->type = SCC_TOK_ASSIGN_ADD; next_char(lexer, &lex, &cur); break; default: token->type = SCC_TOK_ADD; break; } break; case '-': switch (next.character) { case '-': token->type = SCC_TOK_SUB_SUB; next_char(lexer, &lex, &cur); break; case '=': token->type = SCC_TOK_ASSIGN_SUB; next_char(lexer, &lex, &cur); break; case '>': token->type = SCC_TOK_DEREF; next_char(lexer, &lex, &cur); break; default: token->type = SCC_TOK_SUB; break; } break; case '*': switch (next.character) { case '=': token->type = SCC_TOK_ASSIGN_MUL; next_char(lexer, &lex, &cur); break; default: token->type = SCC_TOK_MUL; break; } break; case '%': switch (next.character) { case '=': token->type = SCC_TOK_ASSIGN_MOD; next_char(lexer, &lex, &cur); break; default: token->type = SCC_TOK_MOD; break; } break; case '&': switch (next.character) { case '&': token->type = SCC_TOK_AND_AND; next_char(lexer, &lex, &cur); break; case '=': token->type = SCC_TOK_ASSIGN_AND; next_char(lexer, &lex, &cur); break; default: token->type = SCC_TOK_AND; break; } break; case '|': switch (next.character) { case '|': token->type = SCC_TOK_OR_OR; next_char(lexer, &lex, &cur); break; case '=': token->type = SCC_TOK_ASSIGN_OR; next_char(lexer, &lex, &cur); break; default: token->type = SCC_TOK_OR; break; } break; case '^': switch (next.character) { case '=': token->type = SCC_TOK_ASSIGN_XOR; next_char(lexer, &lex, &cur); break; default: token->type = SCC_TOK_XOR; break; } break; case '<': switch (next.character) { case '=': token->type = SCC_TOK_LE; next_char(lexer, &lex, &cur); break; case '<': { next_char(lexer, &lex, &cur); if (peek_char(lexer, &next) && next.character == '=') { token->type = SCC_TOK_ASSIGN_L_SH; next_char(lexer, &lex, &cur); } else { token->type = SCC_TOK_L_SH; } break; } default: token->type = SCC_TOK_LT; break; } break; case '>': switch (next.character) { case '=': token->type = SCC_TOK_GE; next_char(lexer, &lex, &cur); break; case '>': { next_char(lexer, &lex, &cur); if (peek_char(lexer, &next) && next.character == '=') { token->type = SCC_TOK_ASSIGN_R_SH; next_char(lexer, &lex, &cur); } else { token->type = SCC_TOK_R_SH; } break; } default: token->type = SCC_TOK_GT; break; } break; case '~': token->type = SCC_TOK_BIT_NOT; break; case '!': switch (next.character) { case '=': token->type = SCC_TOK_NEQ; next_char(lexer, &lex, &cur); break; default: token->type = SCC_TOK_NOT; break; } break; /* clang-format off */ case '[': token->type = SCC_TOK_L_BRACKET; break; case ']': token->type = SCC_TOK_R_BRACKET; break; case '(': token->type = SCC_TOK_L_PAREN; break; case ')': token->type = SCC_TOK_R_PAREN; break; case '{': token->type = SCC_TOK_L_BRACE; break; case '}': token->type = SCC_TOK_R_BRACE; break; case ';': token->type = SCC_TOK_SEMICOLON; break; case ',': token->type = SCC_TOK_COMMA; break; case ':': token->type = SCC_TOK_COLON; break; /* clang-format on */ case '.': if (next.character == '.' && peek_char(lexer, &next) && next.character == '.') { token->type = SCC_TOK_ELLIPSIS; next_char(lexer, &lex, &cur); next_char(lexer, &lex, &cur); } else { token->type = SCC_TOK_DOT; } break; case '?': token->type = SCC_TOK_COND; break; case '#': if (next.character == '#') { token->type = SCC_TOK_SHARP_SHARP; next_char(lexer, &lex, &cur); } else token->type = SCC_TOK_SHARP; break; default: token->type = SCC_TOK_UNKNOWN; SCC_ERROR(start_loc, "unsupported character '%c' (0x%x)", ch, ch); break; } } // 设置token scc_ring_consume(*lexer->stream_ref); token->type = token->type; // 上面已设 token->loc = start_loc; token->lexeme = lex; // 转移所有权 LEX_DEBUG("get token `%s` (%s) at %s:%d:%d", scc_get_tok_name(token->type), scc_cstring_as_cstr(&token->lexeme), token->loc.name, token->loc.line, token->loc.col); } // scc_lexer_get_token maybe got invalid (with parser) void scc_lexer_get_valid_token(scc_lexer_t *lexer, scc_lexer_tok_t *token) { scc_tok_subtype_t subtype; while (1) { scc_lexer_get_token(lexer, token); subtype = scc_get_tok_subtype(token->type); AssertFmt(subtype != SCC_TOK_SUBTYPE_INVALID, "Invalid token: `%s` at %s:%d:%d", scc_get_tok_name(token->type), token->loc.name, token->loc.line, token->loc.col); if (subtype == SCC_TOK_SUBTYPE_EMPTYSPACE || subtype == SCC_TOK_SUBTYPE_COMMENT) { scc_lexer_tok_drop(token); } break; }; } static cbool fill_token(scc_lexer_tok_t *out, void *userdata) { scc_lexer_t *lexer = userdata; scc_lexer_get_token(lexer, out); if (out->type == SCC_TOK_EOF) { return false; } return true; } static cbool fill_valid_token(scc_lexer_tok_t *out, void *userdata) { scc_lexer_t *lexer = userdata; scc_lexer_get_valid_token(lexer, out); if (out->type == SCC_TOK_EOF) { return false; } return true; } scc_lexer_tok_ring_t *scc_lexer_to_ring(scc_lexer_t *lexer, int ring_size, cbool fill_all) { scc_ring_init(lexer->ring, ring_size, fill_all ? fill_token : fill_valid_token, lexer); lexer->ring_ref_count++; return &lexer->ring; } void scc_lexer_drop_ring(scc_lexer_tok_ring_t *ring_ref) { scc_lexer_t *lexer = ring_ref->userdata; if (lexer->ring_ref_count > 0) { lexer->ring_ref_count--; } else { LOG_WARN("double drop sstream ring"); } } void scc_lexer_drop(scc_lexer_t *lexer) { Assert(lexer != null); if (lexer->ring_ref_count) { LOG_FATAL("drop sstream must be drop ring before ref [%d]", lexer->ring_ref_count); } scc_ring_free(lexer->ring); scc_sstream_drop_ring(lexer->stream_ref); }