/**
 * Lexical analysis modeled on the LCC compiler's lexer.
 *
 * Below is LCC's README as of 2025.2:
 *
 * This hierarchy is the distribution for lcc version 4.2.
 *
 * lcc version 3.x is described in the book "A Retargetable C Compiler:
 * Design and Implementation" (Addison-Wesley, 1995, ISBN 0-8053-1670-1).
 * There are significant differences between 3.x and 4.x, most notably in
 * the intermediate code. For details, see
 * https://drh.github.io/lcc/documents/interface4.pdf.
 *
 * VERSION 4.2 IS INCOMPATIBLE WITH EARLIER VERSIONS OF LCC. DO NOT
 * UNLOAD THIS DISTRIBUTION ON TOP OF A 3.X DISTRIBUTION.
 *
 * LCC is a C89 ("ANSI C") compiler designed to be highly retargetable.
 *
 * LOG describes the changes since the last release.
 *
 * CPYRIGHT describes the conditions under which you can use, copy,
 * modify, and distribute lcc or works derived from lcc.
 *
 * doc/install.html is an HTML file that gives a complete description of
 * the distribution and installation instructions.
 *
 * Chris Fraser / cwf@aya.yale.edu
 * David Hanson / drh@drhanson.net
 */

#include <rtthread.h> /* rt_memset, rt_size_t */

#include "lexer_log.h"
#include "token.h"
#include "lexer.h"

static const struct {
    const char*       name;
    enum CSTD_KEYWORD std_type;
    cc_tktype_t       tok;
} keywords[] = {
#define X(name, std_type, tok, ...) { #name, std_type, tok },
    KEYWORD_TABLE
#undef X
};

// Binary search over the keyword table (KEYWORD_TABLE must be sorted).
// `name` need not be NUL-terminated; only `len` characters are compared.
static inline int keyword_cmp(const char* name, int len) {
    int low = 0;
    int high = sizeof(keywords) / sizeof(keywords[0]) - 1;

    while (low <= high) {
        int mid = (low + high) / 2;
        const char* key = keywords[mid].name;
        int cmp = 0;

        // Compare at most `len` characters against the keyword.
        for (int i = 0; i < len; i++) {
            if (name[i] != key[i]) {
                cmp = (unsigned char)name[i] - (unsigned char)key[i];
                break;
            }
            if (name[i] == '\0') break; // stop early at a terminator
        }

        if (cmp == 0) {
            // All `len` characters matched; it is a full match only if
            // the keyword ends here too.
            if (key[len] == '\0') return mid;
            cmp = -1; // the keyword is longer than the input
        }

        if (cmp < 0) {
            high = mid - 1;
        } else {
            low = mid + 1;
        }
    }
    return -1; // not a keyword
}
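/*
 * Usage sketch for keyword_cmp (illustrative only, compiled out). It
 * assumes KEYWORD_TABLE lists the C89 keywords in ascending ASCII order,
 * which the binary search above requires. Because only `len` characters
 * are compared, get_token can pass an unterminated slice of the lexer
 * buffer directly.
 */
#if 0
static void keyword_cmp_example(void) {
    int idx  = keyword_cmp("int", 3);     /* idx >= 0: "int" is a keyword */
    int miss = keyword_cmp("integer", 7); /* miss == -1: key[len] != '\0' */
    (void)idx; (void)miss;
}
#endif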
void init_lexer(lexer_t* lexer, const char* file_name, void* stream,
                lexer_sread_fn sread, strpool_t* strpool) {
    lexer->strpool = strpool;
    lexer->cur_ptr = lexer->end_ptr = (char*)lexer->buffer;
    lexer->loc.fname = strpool_intern(lexer->strpool, file_name);
    lexer->loc.line = 1;
    lexer->loc.col = 1;
    lexer->stream = stream;
    lexer->sread = sread;
    rt_memset(lexer->buffer, 0, sizeof(lexer->buffer));
}

// Slide the unread tail of the buffer to the front and refill the rest
// from the input stream. A '\0' sentinel is appended at EOF.
static void flush_buffer(lexer_t* lexer) {
    int num = lexer->end_ptr - lexer->cur_ptr;
    for (int i = 0; i < num; i++) {
        lexer->buffer[i] = lexer->cur_ptr[i];
    }
    lexer->cur_ptr = lexer->buffer;
    lexer->end_ptr = lexer->buffer + num; // rebase before appending new data

    int read_size = LEXER_BUFFER_SIZE - num;
    // TODO: narrowing rt_size_t to int may lose precision
    int got_size = lexer->sread(lexer->buffer + num, read_size, 1, read_size, lexer->stream);
    if (got_size < 0) {
        LEX_ERROR("lexer read error");
    } else if (got_size < read_size) {
        lexer->end_ptr += got_size;
        lexer->end_ptr[0] = '\0'; // EOF sentinel
        lexer->end_ptr++;
    } else if (got_size == read_size) {
        lexer->end_ptr += got_size;
    } else {
        LEX_ERROR("lexer read error: got_size > read_size should be impossible (overflow?)");
    }
}

// Skip ahead to the next '\n' (or the EOF sentinel), refilling the buffer
// as needed. The newline itself is not consumed.
static void goto_newline(lexer_t* lexer) {
    do {
        if (lexer->cur_ptr == lexer->end_ptr) {
            flush_buffer(lexer);
            lexer->cur_ptr--; // compensate for the increment below
        }
        lexer->cur_ptr++;
    } while (*lexer->cur_ptr != '\n' && *lexer->cur_ptr != '\0');
}

// Skip a block comment, counting newlines for the location info.
static void goto_block_comment(lexer_t* lexer) {
    while (1) {
        if (lexer->end_ptr - lexer->cur_ptr < 2) {
            flush_buffer(lexer);
        }

        if (lexer->cur_ptr[0] == '\0') {
            break; // EOF inside a block comment
        } else if (lexer->cur_ptr[0] == '*' && lexer->cur_ptr[1] == '/') {
            lexer->cur_ptr += 2;
            break;
        } else {
            if (lexer->cur_ptr[0] == '\n') lexer->loc.line++;
            lexer->cur_ptr++;
        }
    }
}

// TODO: escape coverage is incomplete (no hex, octal, or universal escapes)
static char got_slash(char* peek) {
    switch (*peek) {
    case '\\': return '\\';
    case '\'': return '\'';
    case '\"': return '\"';
    case '\?': return '\?';
    case '0':  return '\0';
    case 'b':  return '\b';
    case 'f':  return '\f';
    case 'n':  return '\n';
    case 'r':  return '\r';
    case 't':  return '\t';
    case 'v':  return '\v';
    default:   break;
    }
    LEX_ERROR("Unknown escape character");
    return -1;
}

static void parse_char_literal(lexer_t* lexer, tok_t* token) {
    char val = 0;
    char* peek = lexer->cur_ptr + 1; // skip the opening '\''

    if (*peek == '\\') { // handle an escape sequence
        peek++;
        val = got_slash(peek);
        peek++;
    } else {
        val = *peek++;
    }

    if (*peek++ != '\'') LEX_ERROR("Unclosed character literal");
    lexer->loc.len = peek - lexer->cur_ptr;
    lexer->cur_ptr = peek;
    token->val.ch = val;
}

static void parse_string_literal(lexer_t* lexer, tok_t* token) {
    char* peek = lexer->cur_ptr + 1; // skip the opening '"'
    // TODO: string literal size check
    static char dest[LEXER_MAX_TOKEN_SIZE + 1];
    int len = 0;

    while (*peek != '"') {
        if (peek >= lexer->end_ptr) {
            // flush_buffer moves the unread bytes, so rebase peek on the
            // new cur_ptr before continuing.
            long offset = peek - lexer->cur_ptr;
            flush_buffer(lexer);
            peek = lexer->cur_ptr + offset;
        }
        if (*peek == '\0') LEX_ERROR("Unclosed string literal");
        if (*peek == '\\') { // translate the escape in place
            peek++;
            *peek = got_slash(peek);
        }
        if (len >= LEXER_MAX_TOKEN_SIZE) LEX_ERROR("String too long");
        dest[len++] = *peek++;
    }
    dest[len] = '\0';

    lexer->cur_ptr = peek + 1; // + 1 for the closing '"'
    lexer->loc.len = len + 2;  // + 2 for the two '"'
    token->val.str = strpool_intern(lexer->strpool, dest);
}

// FIXME: written with AI assistance; may still contain errors
static void parse_number(lexer_t* lexer, tok_t* token) {
    char* peek = lexer->cur_ptr;
    int base = 10;
    int is_float = 0;
    long long int_val = 0;
    double float_val = 0.0;
    double fraction = 1.0;

    // Detect the base from the prefix: 0x/0X is hex, a bare leading 0 is octal.
    if (*peek == '0') {
        peek++;
        if (*peek == 'x' || *peek == 'X') {
            base = 16;
            peek++;
        } else {
            base = 8;
        }
    }

    // Parse the integer part.
    while (1) {
        int digit = -1;
        if (*peek >= '0' && *peek <= '9') {
            digit = *peek - '0';
        } else if (base == 16) {
            if (*peek >= 'a' && *peek <= 'f')      digit = *peek - 'a' + 10;
            else if (*peek >= 'A' && *peek <= 'F') digit = *peek - 'A' + 10;
        }
        if (digit < 0 || digit >= base) break;
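        /*
         * Positional accumulation, one digit per iteration:
         * value = value * base + digit. E.g. for "0x1A" (base 16) the
         * loop sees 1 then 10: 0*16+1 = 1, then 1*16+10 = 26.
         * No overflow check is performed here (see the FIXME above).
         */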
        int_val = int_val * base + digit;
        peek++;
    }

    // Parse the fractional part (decimal only).
    if (*peek == '.' && base == 10) {
        is_float = 1;
        float_val = int_val;
        peek++;
        while (*peek >= '0' && *peek <= '9') {
            float_val = float_val * 10.0 + (*peek - '0');
            fraction *= 10.0;
            peek++;
        }
        float_val /= fraction;
    }

    // Parse scientific notation (decimal only).
    if ((*peek == 'e' || *peek == 'E') && base == 10) {
        if (!is_float) float_val = int_val; // e.g. "1e3" has no '.' part
        is_float = 1;
        peek++;
        int exp_sign = 1;
        int exponent = 0;
        if (*peek == '+') {
            peek++;
        } else if (*peek == '-') {
            exp_sign = -1;
            peek++;
        }
        while (*peek >= '0' && *peek <= '9') {
            exponent = exponent * 10 + (*peek - '0');
            peek++;
        }
        // Apply the decimal exponent without pulling in libm's pow().
        while (exponent-- > 0) {
            if (exp_sign > 0) float_val *= 10.0;
            else              float_val /= 10.0;
        }
    }

    // Store the result.
    // TODO: overflow checks and literal suffixes (u, l, f, ...)
    lexer->loc.len = peek - lexer->cur_ptr;
    lexer->cur_ptr = peek;
    if (is_float) {
        token->val.f32 = float_val;
        token->sub_type = TOKEN_FLOAT_LITERAL;
    } else {
        token->val.i = int_val;
        token->sub_type = TOKEN_INT_LITERAL;
    }
}

#define GOT_ONE_TOKEN_BUF_SIZE 64

// Operator reference: cppreference /zh/c/language/operator_arithmetic.html
void get_token(lexer_t* lexer, tok_t* token) {
    // Keep at least one full token's worth of bytes readable.
    if (lexer->end_ptr - lexer->cur_ptr < GOT_ONE_TOKEN_BUF_SIZE) {
        flush_buffer(lexer);
    }

    register char* peek = lexer->cur_ptr;
    cc_tktype_t tk_type = TOKEN_INIT;
    ctype_t literal = { 0 };

    // Dispatch on the first character; `peek` has already stepped past it.
    switch (*peek++) {
    case '=':
        switch (*peek++) {
        case '=': tk_type = TOKEN_EQ; break;
        default:  peek--, tk_type = TOKEN_ASSIGN; break;
        }
        break;
    case '+':
        switch (*peek++) {
        case '+': tk_type = TOKEN_ADD_ADD; break;
        case '=': tk_type = TOKEN_ASSIGN_ADD; break;
        default:  peek--, tk_type = TOKEN_ADD; break;
        }
        break;
    case '-':
        switch (*peek++) {
        case '-': tk_type = TOKEN_SUB_SUB; break;
        case '=': tk_type = TOKEN_ASSIGN_SUB; break;
        case '>': tk_type = TOKEN_DEREF; break;
        default:  peek--, tk_type = TOKEN_SUB; break;
        }
        break;
    case '*':
        switch (*peek++) {
        case '=': tk_type = TOKEN_ASSIGN_MUL; break;
        default:  peek--, tk_type = TOKEN_MUL; break;
        }
        break;
    case '/':
        switch (*peek++) {
        case '=': tk_type = TOKEN_ASSIGN_DIV; break;
        case '/': {
            goto_newline(lexer);
            tk_type = TOKEN_LINE_COMMENT;
            goto END;
        }
        case '*': {
            lexer->cur_ptr = peek;
            goto_block_comment(lexer);
            tk_type = TOKEN_BLOCK_COMMENT;
            goto END;
        }
        default:  peek--, tk_type = TOKEN_DIV; break;
        }
        break;
    case '%':
        switch (*peek++) {
        case '=': tk_type = TOKEN_ASSIGN_MOD; break;
        default:  peek--, tk_type = TOKEN_MOD; break;
        }
        break;
    case '&':
        switch (*peek++) {
        case '&': tk_type = TOKEN_AND_AND; break;
        case '=': tk_type = TOKEN_ASSIGN_AND; break;
        default:  peek--, tk_type = TOKEN_AND; break;
        }
        break;
    case '|':
        switch (*peek++) {
        case '|': tk_type = TOKEN_OR_OR; break;
        case '=': tk_type = TOKEN_ASSIGN_OR; break;
        default:  peek--, tk_type = TOKEN_OR; break;
        }
        break;
    case '^':
        switch (*peek++) {
        case '=': tk_type = TOKEN_ASSIGN_XOR; break;
        default:  peek--, tk_type = TOKEN_XOR; break;
        }
        break;
    case '<':
        switch (*peek++) {
        case '=': tk_type = TOKEN_LE; break;
        case '<': tk_type = (*peek == '=') ? (peek++, TOKEN_ASSIGN_L_SH) : TOKEN_L_SH; break;
        default:  peek--, tk_type = TOKEN_LT; break;
        }
        break;
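    /*
     * Three-way lookahead, same shape as the '<' case above: '>' alone is
     * TOKEN_GT, ">=" is TOKEN_GE, ">>" is TOKEN_R_SH and ">>=" is
     * TOKEN_ASSIGN_R_SH. The `peek--` in each default arm undoes the
     * speculative `*peek++`, so single-character operators do not swallow
     * the following character (maximal munch with one-byte backtracking).
     */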
    case '>':
        switch (*peek++) {
        case '=': tk_type = TOKEN_GE; break;
        case '>': tk_type = (*peek == '=') ? (peek++, TOKEN_ASSIGN_R_SH) : TOKEN_R_SH; break;
        default:  peek--, tk_type = TOKEN_GT; break;
        }
        break;
    case '~': tk_type = TOKEN_BIT_NOT; break;
    case '!':
        switch (*peek++) {
        case '=': tk_type = TOKEN_NEQ; break;
        default:  peek--, tk_type = TOKEN_NOT; break;
        }
        break;
    case '[': tk_type = TOKEN_L_BRACKET; break;
    case ']': tk_type = TOKEN_R_BRACKET; break;
    case '(': tk_type = TOKEN_L_PAREN; break;
    case ')': tk_type = TOKEN_R_PAREN; break;
    case '{': tk_type = TOKEN_L_BRACE; break;
    case '}': tk_type = TOKEN_R_BRACE; break;
    case ';': tk_type = TOKEN_SEMICOLON; break;
    case ',': tk_type = TOKEN_COMMA; break;
    case ':': tk_type = TOKEN_COLON; break;
    case '.':
        if (peek[0] == '.' && peek[1] == '.') {
            peek += 2;
            tk_type = TOKEN_ELLIPSIS;
        } else {
            tk_type = TOKEN_DOT;
        }
        break;
    case '?': tk_type = TOKEN_COND; break;
    case '\v': case '\r': case '\f': case ' ': case '\t':
        tk_type = TOKEN_BLANK;
        break;
    case '\n':
        // A newline must flush the location: bump the line, reset the column.
        lexer->loc.line += 1;
        lexer->loc.col = -1;
        lexer->loc.len = 1;
        tk_type = TOKEN_BLANK;
        break;
    case '#':
        // TODO: handle preprocessor line/file markers
        LEX_WARN("Macros are not handled by the lexer but by the preprocessor; the line is ignored");
        goto_newline(lexer);
        tk_type = TOKEN_BLANK;
        goto END;
    case '\0': // EOF
        tk_type = TOKEN_EOF;
        goto END;
    case '\'':
        parse_char_literal(lexer, token);
        literal = token->val;
        tk_type = TOKEN_CHAR_LITERAL;
        goto END;
    case '"':
        parse_string_literal(lexer, token);
        literal = token->val;
        tk_type = TOKEN_STRING_LITERAL;
        goto END;
    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
        parse_number(lexer, token); // TODO: simplify
        literal = token->val;
        tk_type = token->sub_type;
        goto END;
    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
    case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
    case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
    case 'v': case 'w': case 'x': case 'y': case 'z':
    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
    case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
    case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
    case 'V': case 'W': case 'X': case 'Y': case 'Z':
    case '_':
        // TOKEN_IDENT
        if (lexer->cur_ptr[0] == 'L' && (*peek == '\'' || *peek == '"')) {
            LEX_ERROR("wide-character literals (L prefix) are unsupported");
        }
        while (1) {
            if (peek == lexer->end_ptr) {
                LEX_ERROR("identifiers longer than %d are unsupported", GOT_ONE_TOKEN_BUF_SIZE);
            }
            if ((*peek >= 'a' && *peek <= 'z') || (*peek >= 'A' && *peek <= 'Z') ||
                (*peek == '_') || (*peek >= '0' && *peek <= '9')) {
                peek++;
                continue;
            }
            break;
        }

        int len = peek - lexer->cur_ptr;
        int res = keyword_cmp((const char*)lexer->cur_ptr, len);
        if (res == -1) {
            // Not a keyword: intern the identifier, temporarily
            // NUL-terminating it in place for strpool_intern.
            char prev = lexer->cur_ptr[len];
            lexer->cur_ptr[len] = '\0';
            literal.str = strpool_intern(lexer->strpool, lexer->cur_ptr);
            lexer->cur_ptr[len] = prev;
            tk_type = TOKEN_IDENT;
            break;
        } else {
            tk_type = keywords[res].tok;
            break;
        }
    default:
        LEX_ERROR("unsupported character in source code `%c`", *(lexer->cur_ptr));
        break;
    }

    lexer->loc.len = peek - lexer->cur_ptr;
    lexer->cur_ptr = peek;

END:
    lexer->loc.col += lexer->loc.len;
    lexer->loc.len = 0;
    token->val = literal;
    token->sub_type = tk_type;
    token->loc = lexer->loc;

    static const tok_basic_type_t tok_type_map[] = {
        // ordinary tokens use #str
#define X(str, basic, tok) [tok] = basic,
        TOKEN_TABLE
#undef X
        // keywords use #name
#define X(name, std, tok) [tok] = TK_BASIC_KEYWORD,
        KEYWORD_TABLE
#undef X
    };
    token->type = tok_type_map[tk_type];
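    /*
     * tok_type_map above is generated with X-macros. A sketch of the
     * shapes token.h presumably provides (hypothetical entries, not the
     * real definitions):
     *
     *   #define TOKEN_TABLE \
     *       X("+", TK_BASIC_OPERATOR,   TOKEN_ADD) \
     *       X(" ", TK_BASIC_WHITESPACE, TOKEN_BLANK) ...
     *   #define KEYWORD_TABLE \
     *       X(int, CSTD_C89, TOKEN_INT) ...
     *
     * Each expansion of X picks the columns it needs; here only the
     * tok -> basic-type pair is used, to index the map by token id.
     */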
    LEX_DEBUG("get token `%s` in %s:%d:%d", get_tok_name(tk_type),
              token->loc.fname, token->loc.line, token->loc.col);
}

// get_token may return tokens the parser has no use for (whitespace and
// comments); this wrapper skips them.
void get_valid_token(lexer_t* lexer, tok_t* token) {
    tok_basic_type_t type;
    do {
        get_token(lexer, token);
        type = token->type;
        Assert(type != TK_BASIC_INVALID);
    } while (type == TK_BASIC_WHITESPACE || type == TK_BASIC_COMMENT);
}
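/*
 * Minimal driver sketch (compiled out; illustrative only). It shows the
 * intended call sequence: init_lexer once, then get_valid_token until
 * TOKEN_EOF. The fread-shaped callback mirrors how flush_buffer invokes
 * lexer->sread (buf, size, nmemb, max, stream); the exact prototype lives
 * in lexer.h, and file_sread / lex_file are hypothetical names, not part
 * of this module.
 */
#if 0
#include <stdio.h>

static int file_sread(void* buf, int size, int nmemb, int max, void* stream) {
    (void)nmemb; (void)max;
    // Return the number of bytes read, as flush_buffer expects.
    return (int)fread(buf, 1, (size_t)size, (FILE*)stream);
}

void lex_file(const char* path, strpool_t* pool) {
    FILE* fp = fopen(path, "rb");
    if (fp == NULL) return;

    lexer_t lexer;
    tok_t tok;
    init_lexer(&lexer, path, fp, file_sread, pool);
    do {
        get_valid_token(&lexer, &tok); // skips blanks and comments
    } while (tok.sub_type != TOKEN_EOF);
    fclose(fp);
}
#endif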