diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..37f8f4d --- /dev/null +++ b/.gitignore @@ -0,0 +1,22 @@ +.vscode/ + +# smcc compiler generated files +*.bin + +# linux binary files +*.o +*.a +*.so +*.out + +# windows binary files +*.obj +*.lib +*.dll +*.exe + +# developed notes +note.md + +# python +.venv diff --git a/ccompiler/backend/riscv32/rv32.c b/ccompiler/backend/riscv32/rv32.c index 94fb86a..9784628 100644 --- a/ccompiler/backend/riscv32/rv32.c +++ b/ccompiler/backend/riscv32/rv32.c @@ -7,7 +7,7 @@ // 指令编码联合体(自动处理小端序) typedef union rv32code { uint32_t code; - u8_t bytes[4]; + uint8_t bytes[4]; } rv32code_t; #include "../../frontend/frontend.h" diff --git a/ccompiler/frontend/frontend.c b/ccompiler/frontend/frontend.c index ff7f59f..60bc6b7 100644 --- a/ccompiler/frontend/frontend.c +++ b/ccompiler/frontend/frontend.c @@ -4,14 +4,16 @@ ast_node_t* frontend(const char* file, void* stream, sread_fn sread) { init_lib_core(); + strpool_t strpool; + init_strpool(&strpool); lexer_t lexer; - init_lexer(&lexer, file, stream, sread); + init_lexer(&lexer, file, stream, sread, &strpool); symtab_t symtab; init_symtab(&symtab); - parser_t parser; + parser_t parser; init_parser(&parser, &lexer, &symtab); parse_prog(&parser); diff --git a/ccompiler/frontend/lexer/lexer.c b/ccompiler/frontend/lexer/lexer.c index 7cbf768..f327f8b 100644 --- a/ccompiler/frontend/lexer/lexer.c +++ b/ccompiler/frontend/lexer/lexer.c @@ -34,7 +34,7 @@ David Hanson / drh@drhanson.net static const struct { const char* name; enum CSTD_KEYWORD std_type; - tok_type_t tok; + cc_tktype_t tok; } keywords[] = { #define X(name, std_type, tok, ...) { #name, std_type, tok }, KEYWORD_TABLE @@ -74,19 +74,17 @@ static inline int keyword_cmp(const char* name, int len) { return -1; // Not a keyword. 
} -void init_lexer(lexer_t* lexer, const char* file_name, void* stream, lexer_sread_fn sread) { - init_lib_core(); - - lexer->cur_ptr = lexer->end_ptr = (unsigned char*)&(lexer->buffer); - lexer->index = 1; - lexer->line = 1; +void init_lexer(lexer_t* lexer, const char* file_name, void* stream, lexer_sread_fn sread, strpool_t* strpool) { + lexer->strpool = strpool; + lexer->cur_ptr = lexer->end_ptr = (char*)&(lexer->buffer); + lexer->loc.fname = strpool_intern(lexer->strpool, file_name); + lexer->loc.line = 1; + lexer->loc.col = 1; lexer->stream = stream; lexer->sread = sread; - for (int i = 0; i < sizeof(lexer->buffer) / sizeof(lexer->buffer[0]); i++) { - lexer->buffer[i] = 0; - } + rt_memset(lexer->buffer, 0, sizeof(lexer->buffer)); } static void flush_buffer(lexer_t* lexer) { @@ -94,7 +92,7 @@ static void flush_buffer(lexer_t* lexer) { for (int i = 0; i < num; i++) { lexer->buffer[i] = lexer->cur_ptr[i]; } - lexer->cur_ptr = (unsigned char*)lexer->buffer; + lexer->cur_ptr = lexer->buffer; int read_size = LEXER_BUFFER_SIZE - num; // TODO rt_size_t to int maybe lose precision @@ -128,19 +126,20 @@ static void goto_block_comment(lexer_t* lexer) { flush_buffer(lexer); } - if (*lexer->cur_ptr == '\0') { + if (lexer->cur_ptr[0] == '\0') { break; } else if (lexer->cur_ptr[0] == '*' && lexer->cur_ptr[1] == '/') { lexer->cur_ptr += 2; break; } else { + if (lexer->cur_ptr[0] == '\n') lexer->loc.line++; lexer->cur_ptr++; } } } // TODO escape character not enough -static char got_slash(unsigned char* peek) { +static char got_slash(char* peek) { switch (*peek) { case '\\': return '\\'; case '\'': return '\''; @@ -162,7 +161,7 @@ static char got_slash(unsigned char* peek) { static void parse_char_literal(lexer_t* lexer, tok_t* token) { char val = 0; - unsigned char* peek = lexer->cur_ptr + 1; + char* peek = lexer->cur_ptr + 1; if (*peek == '\\') { peek++; val = got_slash(peek); @@ -172,16 +171,14 @@ static void parse_char_literal(lexer_t* lexer, tok_t* token) { } if (*peek++ 
!= '\'') LEX_ERROR("Unclosed character literal"); - token->val.ch = val; lexer->cur_ptr = peek; - token->val.have = 1; - token->type = TOKEN_CHAR_LITERAL; + token->val.ch = val; } static void parse_string_literal(lexer_t* lexer, tok_t* token) { - unsigned char* peek = lexer->cur_ptr + 1; + char* peek = lexer->cur_ptr + 1; // TODO string literal size check - char* dest = token->val.str = rt._malloc(LEXER_MAX_TOKEN_SIZE + 1); + static char dest[LEXER_MAX_TOKEN_SIZE + 1]; int len = 0; while (*peek != '"') { @@ -196,14 +193,15 @@ static void parse_string_literal(lexer_t* lexer, tok_t* token) { dest[len++] = *peek++; } dest[len] = '\0'; - lexer->cur_ptr = peek + 1; - token->val.have = 1; - token->type = TOKEN_STRING_LITERAL; + lexer->cur_ptr = peek + 1; // 1 is `"` + lexer->loc.len = len + 2; // 2 is `"` `"` + + token->val.str = strpool_intern(lexer->strpool, dest); } // FIXME it write by AI maybe error static void parse_number(lexer_t* lexer, tok_t* token) { - unsigned char* peek = lexer->cur_ptr; + char* peek = lexer->cur_ptr; int base = 10; int is_float = 0; long long int_val = 0; @@ -278,14 +276,15 @@ static void parse_number(lexer_t* lexer, tok_t* token) { } // 存储结果 + // TODO + lexer->loc.len = peek - lexer->cur_ptr; lexer->cur_ptr = peek; - token->val.have = 1; if (is_float) { - token->val.d = float_val; - token->type = TOKEN_FLOAT_LITERAL; + token->val.f32 = float_val; + token->sub_type = TOKEN_FLOAT_LITERAL; } else { - token->val.ll = int_val; - token->type = TOKEN_INT_LITERAL; + token->val.i = int_val; + token->sub_type = TOKEN_INT_LITERAL; } } @@ -296,160 +295,159 @@ void get_token(lexer_t* lexer, tok_t* token) { if (lexer->end_ptr - lexer->cur_ptr < GOT_ONE_TOKEN_BUF_SIZE) { flush_buffer(lexer); } - register unsigned char* peek = lexer->cur_ptr; - - // 快速跳过空白符 - while (*peek == ' ' || *peek == '\t') { - if (peek == lexer->end_ptr) { - break; - } - peek++; - } - if (peek != lexer->cur_ptr) { - // To TOKEN_FLUSH - lexer->cur_ptr = peek; - token->type = 
TOKEN_FLUSH; - } - - tok_type_t tok = TOKEN_INIT; - tok_val_t constant; - constant.have = 0; - + register char* peek = lexer->cur_ptr; + + cc_tktype_t tk_type = TOKEN_INIT; + ctype_t literal = { 0 }; + // once step switch (*peek++) { - case '=': + case '=': switch (*peek++) { - case '=': tok = TOKEN_EQ; break; - default: peek--, tok = TOKEN_ASSIGN; break; + case '=': tk_type = TOKEN_EQ; break; + default: peek--, tk_type = TOKEN_ASSIGN; break; } break; case '+': switch (*peek++) { - case '+': tok = TOKEN_ADD_ADD; break; - case '=': tok = TOKEN_ASSIGN_ADD; break; - default: peek--, tok = TOKEN_ADD; break; + case '+': tk_type = TOKEN_ADD_ADD; break; + case '=': tk_type = TOKEN_ASSIGN_ADD; break; + default: peek--, tk_type = TOKEN_ADD; break; } break; case '-': switch (*peek++) { - case '-': tok = TOKEN_SUB_SUB; break; - case '=': tok = TOKEN_ASSIGN_SUB; break; + case '-': tk_type = TOKEN_SUB_SUB; break; + case '=': tk_type = TOKEN_ASSIGN_SUB; break; - case '>': tok = TOKEN_DEREF; break; - default: peek--, tok = TOKEN_SUB; break; + case '>': tk_type = TOKEN_DEREF; break; + default: peek--, tk_type = TOKEN_SUB; break; } break; case '*': switch (*peek++) { - case '=': tok = TOKEN_ASSIGN_MUL; break; - default: peek--, tok = TOKEN_MUL; break; + case '=': tk_type = TOKEN_ASSIGN_MUL; break; + default: peek--, tk_type = TOKEN_MUL; break; } break; case '/': switch (*peek++) { - case '=': tok = TOKEN_ASSIGN_DIV; break; + case '=': tk_type = TOKEN_ASSIGN_DIV; break; case '/': { - // need get a new line to parse goto_newline(lexer); - tok = TOKEN_LINE_COMMENT; + tk_type = TOKEN_LINE_COMMENT; goto END; } case '*': { lexer->cur_ptr = peek; goto_block_comment(lexer); - tok = TOKEN_BLOCK_COMMENT; + tk_type = TOKEN_BLOCK_COMMENT; goto END; } - default: peek--, tok = TOKEN_DIV; break; + default: peek--, tk_type = TOKEN_DIV; break; } break; case '%': switch (*peek++) { - case '=': tok = TOKEN_ASSIGN_MOD; break; - default: peek--, tok = TOKEN_MOD; break; + case '=': tk_type = 
TOKEN_ASSIGN_MOD; break; + default: peek--, tk_type = TOKEN_MOD; break; } break; case '&': switch (*peek++) { - case '&': tok = TOKEN_AND_AND; break; - case '=': tok = TOKEN_ASSIGN_AND; break; - default: peek--, tok = TOKEN_AND; break; + case '&': tk_type = TOKEN_AND_AND; break; + case '=': tk_type = TOKEN_ASSIGN_AND; break; + default: peek--, tk_type = TOKEN_AND; break; } break; case '|': switch (*peek++) { - case '|': tok = TOKEN_OR_OR; break; - case '=': tok = TOKEN_ASSIGN_OR; break; - default: peek--, tok = TOKEN_OR; break; + case '|': tk_type = TOKEN_OR_OR; break; + case '=': tk_type = TOKEN_ASSIGN_OR; break; + default: peek--, tk_type = TOKEN_OR; break; } break; case '^': switch (*peek++) { - case '=': tok = TOKEN_ASSIGN_XOR; break; - default: peek--, tok = TOKEN_XOR; break; + case '=': tk_type = TOKEN_ASSIGN_XOR; break; + default: peek--, tk_type = TOKEN_XOR; break; } break; case '<': switch (*peek++) { - case '=': tok = TOKEN_LE; break; - case '<': tok = (*peek == '=') ? (peek++, TOKEN_ASSIGN_L_SH) : TOKEN_L_SH; break; - default: peek--, tok = TOKEN_LT; break; + case '=': tk_type = TOKEN_LE; break; + case '<': tk_type = (*peek == '=') ? (peek++, TOKEN_ASSIGN_L_SH) : TOKEN_L_SH; break; + default: peek--, tk_type = TOKEN_LT; break; } break; case '>': switch (*peek++) { - case '=': tok = TOKEN_GE; break; - case '>': tok = (*peek == '=') ? (peek++, TOKEN_ASSIGN_R_SH) : TOKEN_R_SH; break; - default: peek--, tok = TOKEN_GT; break; + case '=': tk_type = TOKEN_GE; break; + case '>': tk_type = (*peek == '=') ? 
(peek++, TOKEN_ASSIGN_R_SH) : TOKEN_R_SH; break; + default: peek--, tk_type = TOKEN_GT; break; } break; case '~': - tok = TOKEN_BIT_NOT; break; + tk_type = TOKEN_BIT_NOT; break; case '!': switch (*peek++) { - case '=': tok = TOKEN_NEQ; break; - default: peek--, tok = TOKEN_NOT; break; + case '=': tk_type = TOKEN_NEQ; break; + default: peek--, tk_type = TOKEN_NOT; break; } break; case '[': - tok = TOKEN_L_BRACKET; break; + tk_type = TOKEN_L_BRACKET; break; case ']': - tok = TOKEN_R_BRACKET; break; + tk_type = TOKEN_R_BRACKET; break; case '(': - tok = TOKEN_L_PAREN; break; + tk_type = TOKEN_L_PAREN; break; case ')': - tok = TOKEN_R_PAREN; break; + tk_type = TOKEN_R_PAREN; break; case '{': - tok = TOKEN_L_BRACE; break; + tk_type = TOKEN_L_BRACE; break; case '}': - tok = TOKEN_R_BRACE; break; + tk_type = TOKEN_R_BRACE; break; case ';': - tok = TOKEN_SEMICOLON; break; + tk_type = TOKEN_SEMICOLON; break; case ',': - tok = TOKEN_COMMA; break; + tk_type = TOKEN_COMMA; break; case ':': - tok = TOKEN_COLON; break; + tk_type = TOKEN_COLON; break; case '.': if (peek[0] == '.' 
&& peek[1] == '.') { peek += 2; - tok = TOKEN_ELLIPSIS; + tk_type = TOKEN_ELLIPSIS; } else { - tok = TOKEN_DOT; + tk_type = TOKEN_DOT; } break; case '?': - tok = TOKEN_COND; break; - case '\v': case '\r': case '\f': // FIXME it parse as a blank character - tok = TOKEN_FLUSH; break; - case '\n': + tk_type = TOKEN_COND; break; + case '\v': case '\r': case '\f': + case ' ': case '\t': + tk_type = TOKEN_BLANK; break; + case '\n': // you need to flush a newline or blank - lexer->line++; - tok = TOKEN_FLUSH; break; + lexer->loc.line += 1; + lexer->loc.col = -1; + lexer->loc.len = 1; + tk_type = TOKEN_BLANK; + break; case '#': - LEX_WARN("Marroc does not support in lexer rather in preprocessor, it will be ignored"); + // TODO make line or file comment to change + LEX_WARN("Maroc does not support in lexer rather in preprocessor, it will be ignored"); goto_newline(lexer); - tok = TOKEN_FLUSH; + tk_type = TOKEN_BLANK; goto END; case '\0': // EOF - tok = TOKEN_EOF; + tk_type = TOKEN_EOF; goto END; case '\'': - return parse_char_literal(lexer, token); - return; + parse_char_literal(lexer, token); + literal = token->val; + tk_type = TOKEN_CHAR_LITERAL; + goto END; break; case '"': - return parse_string_literal(lexer, token); + parse_string_literal(lexer, token); + literal = token->val; + tk_type = TOKEN_STRING_LITERAL; + goto END; break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': - return parse_number(lexer, token); + parse_number(lexer, token); + // TODO Make it easy + literal = token->val; + tk_type = token->sub_type; + goto END; break; case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': @@ -475,39 +473,53 @@ void get_token(lexer_t* lexer, tok_t* token) { break; } - int res = keyword_cmp((const char*)lexer->cur_ptr, peek - (lexer->cur_ptr)); + int strlen = peek - lexer->cur_ptr; + int res = 
keyword_cmp((const char*)lexer->cur_ptr, strlen); if (res == -1) { - int strlen = peek - lexer->cur_ptr; - unsigned char* str = rt._malloc(strlen + 1); - constant.have = 1; - constant.str = (char*)str; - for (int i = 0; i < strlen; i++) { - str[i] = lexer->cur_ptr[i]; - } - str[strlen] = '\0'; - constant.have = 1; - constant.str = (char*)str; - tok = TOKEN_IDENT; break; + char prev = lexer->cur_ptr[strlen]; + lexer->cur_ptr[strlen] = '\0'; + literal.str = strpool_intern(lexer->strpool, lexer->cur_ptr); + lexer->cur_ptr[strlen] = prev; + tk_type = TOKEN_IDENT; break; } else { - tok = keywords[res].tok; break; + tk_type = keywords[res].tok; break; } default: LEX_ERROR("unsupport char in sourse code `%c`", *(lexer->cur_ptr)); break; } + lexer->loc.len = peek - lexer->cur_ptr; lexer->cur_ptr = peek; END: - token->val = constant; - token->type = tok; - LEX_DEBUG("get token `%s` (ch: %c, int: %d)", get_tok_name(token->type), token->val.ch, token->val.i); + lexer->loc.col += lexer->loc.len; + lexer->loc.len = 0; + + token->val = literal; + token->sub_type = tk_type; + token->loc = lexer->loc; +static const tok_basic_type_t tok_type_map[] = { + // 普通token使用#str + #define X(str, basic, tok) [tok] = basic, + TOKEN_TABLE + #undef X + + // 关键字使用#name + #define X(name, std, tok) [tok] = TK_BASIC_KEYWORD, + KEYWORD_TABLE + #undef X +}; + token->type = tok_type_map[tk_type]; + LEX_DEBUG("get token `%s` in %s:%d:%d", get_tok_name(tk_type), + token->loc.fname, token->loc.line, token->loc.col); } // get_token maybe got invalid (with parser) void get_valid_token(lexer_t* lexer, tok_t* token) { - tok_type_t type; + tok_basic_type_t type; do { get_token(lexer, token); type = token->type; - } while (type == TOKEN_FLUSH || type == TOKEN_LINE_COMMENT || type == TOKEN_BLOCK_COMMENT); + Assert(type != TK_BASIC_INVALID); + } while (type == TK_BASIC_WHITESPACE || type == TK_BASIC_COMMENT); } diff --git a/ccompiler/frontend/lexer/lexer.h b/ccompiler/frontend/lexer/lexer.h index 
3ae0b34..8f19b6f 100644 --- a/ccompiler/frontend/lexer/lexer.h +++ b/ccompiler/frontend/lexer/lexer.h @@ -1,5 +1,5 @@ -#ifndef __SMCC_LEXER_H__ -#define __SMCC_LEXER_H__ +#ifndef __SMCC_CC_LEXER_H__ +#define __SMCC_CC_LEXER_H__ #include #include "token.h" @@ -14,25 +14,25 @@ typedef int (*lexer_sread_fn)(void *dst_buf, int dst_size, int elem_size, int count, void *stream); typedef struct lexer { - int line; - int index; - // const char current_file_name[LEXER_BUFFER_SIZE+1]; + loc_t loc; - unsigned char* cur_ptr; // 当前扫描的字符,但是还没有开始扫描 - unsigned char* end_ptr; // 缓冲区最后一个字符的下一个位置 + char* cur_ptr; // 当前扫描的字符,但是还没有开始扫描 + char* end_ptr; // 缓冲区最后一个字符的下一个位置 char buffer[LEXER_BUFFER_SIZE+1]; lexer_sread_fn sread; void* stream; + + strpool_t* strpool; } lexer_t; void init_lexer(lexer_t* lexer, const char* file_name, void* stream, - lexer_sread_fn sread); + lexer_sread_fn sread, strpool_t* strpool); -// pure token getter it will included empty token like TOKEN_FLUSH +// pure token getter it will included empty token like TOKEN_BLANK void get_token(lexer_t* lexer, tok_t* token); -// get_token maybe got invalid (with parser as TOKEN_FLUSH) +// get_token maybe got invalid (with parser as TOKEN_BLANK) void get_valid_token(lexer_t* lexer, tok_t* token); #endif diff --git a/ccompiler/frontend/lexer/lexer_log.h b/ccompiler/frontend/lexer/lexer_log.h index c7b9587..21c7aa0 100644 --- a/ccompiler/frontend/lexer/lexer_log.h +++ b/ccompiler/frontend/lexer/lexer_log.h @@ -3,11 +3,44 @@ #include -#define LEX_NOTSET( fmt, ...) LOG_NOTSET("LEXER: " fmt, ##__VA_ARGS__) -#define LEX_DEBUG( fmt, ...) LOG_DEBUG("LEXER: " fmt, ##__VA_ARGS__) -#define LEX_INFO( fmt, ...) LOG_INFO("LEXER: " fmt, ##__VA_ARGS__) -#define LEX_WARN( fmt, ...) LOG_WARN("LEXER: " fmt, ##__VA_ARGS__) -#define LEX_ERROR( fmt, ...) LOG_ERROR("LEXER: " fmt, ##__VA_ARGS__) -#define LEX_FATAL( fmt, ...) 
LOG_FATAL("LEXER: " fmt, ##__VA_ARGS__) +#ifndef LEX_LOG_LEVEL +#define LEX_LOG_LEVEL 4 +#endif + +#if LEX_LOG_LEVEL <= 1 +#define LEX_NOTSET( fmt, ...) LOG_NOTSET("LEXER: " fmt, ##__VA_ARGS__) +#else +#define LEX_NOTSET( fmt, ...) +#endif + +#if LEX_LOG_LEVEL <= 2 +#define LEX_DEBUG( fmt, ...) LOG_DEBUG( "LEXER: " fmt, ##__VA_ARGS__) +#else +#define LEX_DEBUG( fmt, ...) +#endif + +#if LEX_LOG_LEVEL <= 3 +#define LEX_INFO( fmt, ...) LOG_INFO( "LEXER: " fmt, ##__VA_ARGS__) +#else +#define LEX_INFO( fmt, ...) +#endif + +#if LEX_LOG_LEVEL <= 4 +#define LEX_WARN( fmt, ...) LOG_WARN( "LEXER: " fmt, ##__VA_ARGS__) +#else +#define LEX_WARN( fmt, ...) +#endif + +#if LEX_LOG_LEVEL <= 5 +#define LEX_ERROR( fmt, ...) LOG_ERROR("LEXER: " fmt, ##__VA_ARGS__) +#else +#define LEX_ERROR( fmt, ...) +#endif + +#if LEX_LOG_LEVEL <= 6 +#define LEX_FATAL( fmt, ...) LOG_FATAL("LEXER: " fmt, ##__VA_ARGS__) +#else +#define LEX_FATAL( fmt, ...) +#endif #endif // __SMCC_LEXER_LOG_H__ diff --git a/ccompiler/frontend/lexer/tests/Makefile b/ccompiler/frontend/lexer/tests/Makefile index d6330b0..a918500 100644 --- a/ccompiler/frontend/lexer/tests/Makefile +++ b/ccompiler/frontend/lexer/tests/Makefile @@ -1,5 +1,5 @@ CC = gcc -CFLAGS = -g -Wall -I../../../.. +CFLAGS = -g -Wall -I../../../.. 
-DLEX_LOG_LEVEL=4 SRC = ../lexer.c ../token.c LIB = -L../../../../lib -lcore diff --git a/ccompiler/frontend/lexer/tests/run.c b/ccompiler/frontend/lexer/tests/run.c index fac6811..c165602 100644 --- a/ccompiler/frontend/lexer/tests/run.c +++ b/ccompiler/frontend/lexer/tests/run.c @@ -38,14 +38,18 @@ int main(int argc, char* argv[]) { printf("open file success\n"); lexer_t lexer; - init_lexer(&lexer, file_name, fp, (lexer_sread_fn)fread_s); + strpool_t strpool; + init_strpool(&strpool); + init_lexer(&lexer, file_name, fp, (lexer_sread_fn)fread_s, &strpool); tok_t tok; while (1) { get_valid_token(&lexer, &tok); - if (tok.type == TOKEN_EOF) { + if (tok.sub_type == TOKEN_EOF) { break; } + LOG_DEBUG("tk type `%s` in %s:%d:%d", get_tok_name(tok.sub_type), tok.loc.fname, tok.loc.line, tok.loc.col); + // LOG_DEBUG("%s", tok.val.str); // printf("line: %d, column: %d, type: %3d, typename: %s\n", // lexer.line, lexer.index, tok.type, get_tok_name(tok.type)); } diff --git a/ccompiler/frontend/lexer/tests/test.c b/ccompiler/frontend/lexer/tests/test.c index 5a8142a..6362c97 100644 --- a/ccompiler/frontend/lexer/tests/test.c +++ b/ccompiler/frontend/lexer/tests/test.c @@ -1,5 +1,5 @@ // test_lexer.c -#include "../../../../libcore/acutest.h" +#include #include "../lexer.h" #include @@ -13,7 +13,7 @@ int test_read(void *dst_buf, int dst_size, int elem_size, int count, void *strea } // 测试辅助函数 -static inline void test_lexer_string(const char* input, tok_type_t expected_type) { +static inline void test_lexer_string(const char* input, cc_tktype_t expected_type) { lexer_t lexer; tok_t token; diff --git a/ccompiler/frontend/lexer/token.c b/ccompiler/frontend/lexer/token.c index e55fcb0..08cc5b1 100644 --- a/ccompiler/frontend/lexer/token.c +++ b/ccompiler/frontend/lexer/token.c @@ -52,14 +52,14 @@ tok_t *peek_tok(tok_stream_t *tokbuf) { return &(tokbuf->buf[idx]); } -tok_type_t peek_tok_type(tok_stream_t* tokbuf) { - return peek_tok(tokbuf)->type; +cc_tktype_t 
peek_tok_type(tok_stream_t* tokbuf) { + return peek_tok(tokbuf)->sub_type; } -int expect_pop_tok(tok_stream_t* tokbuf, tok_type_t type) { +int expect_pop_tok(tok_stream_t* tokbuf, cc_tktype_t type) { flush_peek_tok(tokbuf); tok_t* tok = peek_tok(tokbuf); - if (tok->type != type) { + if (tok->sub_type != type) { LEX_ERROR("expected tok `%s` but got `%s`", get_tok_name(type), get_tok_name(tok->type)); return 0; } else { @@ -71,7 +71,7 @@ int expect_pop_tok(tok_stream_t* tokbuf, tok_type_t type) { // 生成字符串映射(根据需求选择#str或#name) static const char* token_strings[] = { // 普通token使用#str - #define X(str, tok) [tok] = #str, + #define X(str, basic, tok) [tok] = #str, TOKEN_TABLE #undef X @@ -81,6 +81,6 @@ static const char* token_strings[] = { #undef X }; -const char* get_tok_name(tok_type_t type) { +const char* get_tok_name(cc_tktype_t type) { return token_strings[type]; } diff --git a/ccompiler/frontend/lexer/token.h b/ccompiler/frontend/lexer/token.h index b4cbe9a..1811c43 100644 --- a/ccompiler/frontend/lexer/token.h +++ b/ccompiler/frontend/lexer/token.h @@ -1,5 +1,7 @@ -#ifndef __TOKEN_H__ -#define __TOKEN_H__ +#ifndef __SMCC_CC_TOKEN_H__ +#define __SMCC_CC_TOKEN_H__ + +#include enum CSTD_KEYWORD { CSTD_C89, @@ -46,68 +48,68 @@ enum CSTD_KEYWORD { // KEYWORD_TABLE #define TOKEN_TABLE \ - X(EOF , TOKEN_EOF) \ - X(init , TOKEN_INIT) \ - X(flush , TOKEN_FLUSH) \ - X("==" , TOKEN_EQ) \ - X("=" , TOKEN_ASSIGN) \ - X("++" , TOKEN_ADD_ADD) \ - X("+=" , TOKEN_ASSIGN_ADD) \ - X("+" , TOKEN_ADD) \ - X("--" , TOKEN_SUB_SUB) \ - X("-=" , TOKEN_ASSIGN_SUB) \ - X("->" , TOKEN_DEREF) \ - X("-" , TOKEN_SUB) \ - X("*=" , TOKEN_ASSIGN_MUL) \ - X("*" , TOKEN_MUL) \ - X("/=" , TOKEN_ASSIGN_DIV) \ - X("/" , TOKEN_DIV) \ - X("//" , TOKEN_LINE_COMMENT) \ - X("/* */" , TOKEN_BLOCK_COMMENT) \ - X("%=" , TOKEN_ASSIGN_MOD) \ - X("%" , TOKEN_MOD) \ - X("&&" , TOKEN_AND_AND) \ - X("&=" , TOKEN_ASSIGN_AND) \ - X("&" , TOKEN_AND) \ - X("||" , TOKEN_OR_OR) \ - X("|=" , TOKEN_ASSIGN_OR) \ - X("|" , 
TOKEN_OR) \ - X("^=" , TOKEN_ASSIGN_XOR) \ - X("^" , TOKEN_XOR) \ - X("<<=" , TOKEN_ASSIGN_L_SH) \ - X("<<" , TOKEN_L_SH) \ - X("<=" , TOKEN_LE) \ - X("<" , TOKEN_LT) \ - X(">>=" , TOKEN_ASSIGN_R_SH) \ - X(">>" , TOKEN_R_SH) \ - X(">=" , TOKEN_GE) \ - X(">" , TOKEN_GT) \ - X("!" , TOKEN_NOT) \ - X("!=" , TOKEN_NEQ) \ - X("~" , TOKEN_BIT_NOT) \ - X("[" , TOKEN_L_BRACKET) \ - X("]" , TOKEN_R_BRACKET) \ - X("(" , TOKEN_L_PAREN) \ - X(")" , TOKEN_R_PAREN) \ - X("{" , TOKEN_L_BRACE) \ - X("}" , TOKEN_R_BRACE) \ - X(";" , TOKEN_SEMICOLON) \ - X("," , TOKEN_COMMA) \ - X(":" , TOKEN_COLON) \ - X("." , TOKEN_DOT) \ - X("..." , TOKEN_ELLIPSIS) \ - X("?" , TOKEN_COND) \ - X(identifier , TOKEN_IDENT) \ - X(int_literal , TOKEN_INT_LITERAL) \ - X(float_literal , TOKEN_FLOAT_LITERAL) \ - X(char_literal , TOKEN_CHAR_LITERAL) \ - X(string_literal , TOKEN_STRING_LITERAL) \ + X(init , TK_BASIC_INVALID, TOKEN_INIT) \ + X(EOF , TK_BASIC_EOF, TOKEN_EOF) \ + X(blank , TK_BASIC_WHITESPACE, TOKEN_BLANK) \ + X("==" , TK_BASIC_OPERATOR, TOKEN_EQ) \ + X("=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN) \ + X("++" , TK_BASIC_OPERATOR, TOKEN_ADD_ADD) \ + X("+=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN_ADD) \ + X("+" , TK_BASIC_OPERATOR, TOKEN_ADD) \ + X("--" , TK_BASIC_OPERATOR, TOKEN_SUB_SUB) \ + X("-=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN_SUB) \ + X("->" , TK_BASIC_OPERATOR, TOKEN_DEREF) \ + X("-" , TK_BASIC_OPERATOR, TOKEN_SUB) \ + X("*=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN_MUL) \ + X("*" , TK_BASIC_OPERATOR, TOKEN_MUL) \ + X("/=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN_DIV) \ + X("/" , TK_BASIC_OPERATOR, TOKEN_DIV) \ + X("//" , TK_BASIC_COMMENT , TOKEN_LINE_COMMENT) \ + X("/* */" , TK_BASIC_COMMENT , TOKEN_BLOCK_COMMENT) \ + X("%=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN_MOD) \ + X("%" , TK_BASIC_OPERATOR, TOKEN_MOD) \ + X("&&" , TK_BASIC_OPERATOR, TOKEN_AND_AND) \ + X("&=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN_AND) \ + X("&" , TK_BASIC_OPERATOR, TOKEN_AND) \ + X("||" , TK_BASIC_OPERATOR, TOKEN_OR_OR) \ + X("|=" , 
TK_BASIC_OPERATOR, TOKEN_ASSIGN_OR) \ + X("|" , TK_BASIC_OPERATOR, TOKEN_OR) \ + X("^=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN_XOR) \ + X("^" , TK_BASIC_OPERATOR, TOKEN_XOR) \ + X("<<=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN_L_SH) \ + X("<<" , TK_BASIC_OPERATOR, TOKEN_L_SH) \ + X("<=" , TK_BASIC_OPERATOR, TOKEN_LE) \ + X("<" , TK_BASIC_OPERATOR, TOKEN_LT) \ + X(">>=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN_R_SH) \ + X(">>" , TK_BASIC_OPERATOR, TOKEN_R_SH) \ + X(">=" , TK_BASIC_OPERATOR, TOKEN_GE) \ + X(">" , TK_BASIC_OPERATOR, TOKEN_GT) \ + X("!" , TK_BASIC_OPERATOR, TOKEN_NOT) \ + X("!=" , TK_BASIC_OPERATOR, TOKEN_NEQ) \ + X("~" , TK_BASIC_OPERATOR, TOKEN_BIT_NOT) \ + X("[" , TK_BASIC_OPERATOR, TOKEN_L_BRACKET) \ + X("]" , TK_BASIC_OPERATOR, TOKEN_R_BRACKET) \ + X("(" , TK_BASIC_OPERATOR, TOKEN_L_PAREN) \ + X(")" , TK_BASIC_OPERATOR, TOKEN_R_PAREN) \ + X("{" , TK_BASIC_OPERATOR, TOKEN_L_BRACE) \ + X("}" , TK_BASIC_OPERATOR, TOKEN_R_BRACE) \ + X(";" , TK_BASIC_OPERATOR, TOKEN_SEMICOLON) \ + X("," , TK_BASIC_OPERATOR, TOKEN_COMMA) \ + X(":" , TK_BASIC_OPERATOR, TOKEN_COLON) \ + X("." , TK_BASIC_OPERATOR, TOKEN_DOT) \ + X("..." , TK_BASIC_OPERATOR, TOKEN_ELLIPSIS) \ + X("?" 
, TK_BASIC_OPERATOR, TOKEN_COND) \ + X(ident , TK_BASIC_IDENTIFIER, TOKEN_IDENT) \ + X(int_literal , TK_BASIC_LITERAL, TOKEN_INT_LITERAL) \ + X(float_literal , TK_BASIC_LITERAL, TOKEN_FLOAT_LITERAL) \ + X(char_literal , TK_BASIC_LITERAL, TOKEN_CHAR_LITERAL) \ + X(string_literal , TK_BASIC_LITERAL, TOKEN_STRING_LITERAL) \ // END // 定义TokenType枚举 -typedef enum tok_type { +typedef enum cc_tktype { // 处理普通token - #define X(str, tok) tok, + #define X(str, basic, tok) tok, TOKEN_TABLE #undef X @@ -115,24 +117,7 @@ typedef enum tok_type { #define X(name, std, tok) tok, KEYWORD_TABLE #undef X -} tok_type_t; - -typedef struct tok_val { - int have; - union { - char ch; - int i; - float f; - double d; - long long ll; - char* str; - }; -} tok_val_t; - -typedef struct tok { - tok_type_t type; - tok_val_t val; -} tok_t; +} cc_tktype_t; typedef struct tok_stream { int cur; @@ -150,8 +135,8 @@ void init_tokbuf(tok_stream_t* tokbuf, void* stream, tok_stream_get_func gettok) tok_t* peek_tok(tok_stream_t* tokbuf); tok_t* pop_tok(tok_stream_t* tokbuf); void flush_peek_tok(tok_stream_t* tokbuf); -tok_type_t peek_tok_type(tok_stream_t* tokbuf); -int expect_pop_tok(tok_stream_t* tokbuf, tok_type_t type); -const char* get_tok_name(tok_type_t type); +cc_tktype_t peek_tok_type(tok_stream_t* tokbuf); +int expect_pop_tok(tok_stream_t* tokbuf, cc_tktype_t type); +const char* get_tok_name(cc_tktype_t type); #endif diff --git a/ccompiler/frontend/parser/ast/block.c b/ccompiler/frontend/parser/ast/block.c index 53ab003..d0850e5 100644 --- a/ccompiler/frontend/parser/ast/block.c +++ b/ccompiler/frontend/parser/ast/block.c @@ -19,7 +19,7 @@ ast_node_t* parse_block(parser_t* parser) { symtab_enter_scope(parser->symtab); tok_stream_t *tokbuf = &parser->tokbuf; flush_peek_tok(tokbuf); - tok_type_t ttype; + cc_tktype_t ttype; ast_node_t* node = new_ast_node_block(); expect_pop_tok(tokbuf, TOKEN_L_BRACE); diff --git a/ccompiler/frontend/parser/ast/decl.c b/ccompiler/frontend/parser/ast/decl.c index 
3962728..66b362f 100644 --- a/ccompiler/frontend/parser/ast/decl.c +++ b/ccompiler/frontend/parser/ast/decl.c @@ -37,7 +37,7 @@ int peek_decl(tok_stream_t* tokbuf) { ast_node_t* parse_decl_val(parser_t* parser) { tok_stream_t* tokbuf = &parser->tokbuf; - tok_type_t ttype; + cc_tktype_t ttype; flush_peek_tok(tokbuf); ast_node_t* node; @@ -69,7 +69,7 @@ ast_node_t* parse_decl_val(parser_t* parser) { ast_node_t* parse_decl(parser_t* parser) { tok_stream_t* tokbuf = &parser->tokbuf; flush_peek_tok(tokbuf); - tok_type_t ttype; + cc_tktype_t ttype; ast_node_t* node; if (peek_decl(tokbuf) == 0) { diff --git a/ccompiler/frontend/parser/ast/expr.c b/ccompiler/frontend/parser/ast/expr.c index a5a7685..ea1c6ac 100644 --- a/ccompiler/frontend/parser/ast/expr.c +++ b/ccompiler/frontend/parser/ast/expr.c @@ -82,7 +82,7 @@ static ast_node_t* parse_comma(tok_stream_t* tokbuf, symtab_t *symtab, ast_node_ static ast_node_t* parse_assign(tok_stream_t* tokbuf, symtab_t *symtab, ast_node_t* left) { flush_peek_tok(tokbuf); - tok_type_t ttype = peek_tok_type(tokbuf); + cc_tktype_t ttype = peek_tok_type(tokbuf); pop_tok(tokbuf); ast_node_t* node = new_ast_node(); node->type = NT_ASSIGN; @@ -133,7 +133,7 @@ static ast_node_t* parse_assign(tok_stream_t* tokbuf, symtab_t *symtab, ast_node static ast_node_t* parse_cmp(tok_stream_t* tokbuf, symtab_t *symtab, ast_node_t* left) { flush_peek_tok(tokbuf); - tok_type_t ttype = peek_tok_type(tokbuf); + cc_tktype_t ttype = peek_tok_type(tokbuf); pop_tok(tokbuf); ast_node_t* node = new_ast_node(); // saved left @@ -171,7 +171,7 @@ static ast_node_t* parse_cmp(tok_stream_t* tokbuf, symtab_t *symtab, ast_node_t* static ast_node_t* parse_cal(tok_stream_t* tokbuf, symtab_t *symtab, ast_node_t* left) { flush_peek_tok(tokbuf); - tok_type_t ttype = peek_tok_type(tokbuf); + cc_tktype_t ttype = peek_tok_type(tokbuf); pop_tok(tokbuf); ast_node_t* node = new_ast_node(); node->expr.left = left; @@ -238,7 +238,7 @@ static ast_node_t* parse_call(tok_stream_t* 
tokbuf, symtab_t *symtab, ast_node_t vector_init(node->call.params->params.params); pop_tok(tokbuf); // 跳过 '(' - tok_type_t ttype; + cc_tktype_t ttype; while (1) { flush_peek_tok(tokbuf); ttype = peek_tok_type(tokbuf); @@ -330,7 +330,7 @@ static ast_node_t *parse_primary_expression(tok_stream_t* tokbuf, symtab_t *symt node->type = NT_TERM_VAL; node->syms.tok = *tok; - switch (tok->type) { + switch (tok->sub_type) { case TOKEN_INT_LITERAL: // node->data.data_type = TYPE_INT; break; @@ -344,7 +344,7 @@ static ast_node_t *parse_primary_expression(tok_stream_t* tokbuf, symtab_t *symt // node->data.data_type = TYPE_POINTER; case TOKEN_IDENT: node = expect_pop_ident(tokbuf); - tok_type_t ttype = peek_tok_type(tokbuf); + cc_tktype_t ttype = peek_tok_type(tokbuf); if (ttype == TOKEN_L_PAREN) { node = parse_call(tokbuf, symtab, node); } else { @@ -365,7 +365,7 @@ END: } static ast_node_t *parse_subexpression(tok_stream_t* tokbuf, symtab_t *symtab, enum Precedence prec) { - tok_type_t ttype; + cc_tktype_t ttype; struct expr_prec_table_t* work; ast_node_t* left; @@ -400,7 +400,7 @@ ast_node_t* parse_expr(parser_t* parser) { tok_stream_t* tokbuf = &(parser->tokbuf); symtab_t *symtab = parser->symtab; flush_peek_tok(tokbuf); - tok_type_t ttype = peek_tok_type(tokbuf); + cc_tktype_t ttype = peek_tok_type(tokbuf); switch (ttype) { case TOKEN_NOT: case TOKEN_AND: diff --git a/ccompiler/frontend/parser/ast/func.c b/ccompiler/frontend/parser/ast/func.c index afc0e09..96e2ef2 100644 --- a/ccompiler/frontend/parser/ast/func.c +++ b/ccompiler/frontend/parser/ast/func.c @@ -9,7 +9,7 @@ // TODO 语义分析压入符号表 static void parse_params(parser_t* parser, tok_stream_t* cache, ast_node_t* node) { flush_peek_tok(cache); - tok_type_t ttype; + cc_tktype_t ttype; ast_node_t *params = new_ast_node(); node->decl_func.params = params; vector_init(params->params.params); @@ -79,7 +79,7 @@ ast_type_t check_is_func_decl(tok_stream_t* tokbuf, tok_stream_t* cache) { LOG_ERROR("function parameter list too 
long"); } cache->buf[cache->size++] = *tok; - switch (tok->type) { + switch (tok->sub_type) { case TOKEN_L_PAREN: depth++; break; diff --git a/ccompiler/frontend/parser/ast/stmt.c b/ccompiler/frontend/parser/ast/stmt.c index 8106ce1..3280526 100644 --- a/ccompiler/frontend/parser/ast/stmt.c +++ b/ccompiler/frontend/parser/ast/stmt.c @@ -4,7 +4,7 @@ ast_node_t* parse_stmt(parser_t* parser) { tok_stream_t* tokbuf = &parser->tokbuf; flush_peek_tok(tokbuf); - tok_type_t ttype = peek_tok_type(tokbuf); + cc_tktype_t ttype = peek_tok_type(tokbuf); ast_node_t* node = new_ast_node(); switch (ttype) { case TOKEN_IF: { diff --git a/ccompiler/frontend/parser/ast/term.c b/ccompiler/frontend/parser/ast/term.c index fccb248..ac782a7 100644 --- a/ccompiler/frontend/parser/ast/term.c +++ b/ccompiler/frontend/parser/ast/term.c @@ -3,8 +3,8 @@ #include "../type.h" ast_node_t* new_ast_ident_node(tok_t* tok) { - if (tok->type != TOKEN_IDENT) { - LOG_ERROR("syntax error: want identifier but got %d", tok->type); + if (tok->sub_type != TOKEN_IDENT) { + LOG_ERROR("syntax error: want identifier but got %d", tok->sub_type); } ast_node_t* node = new_ast_node(); node->type = NT_TERM_IDENT; @@ -24,7 +24,7 @@ ast_node_t* expect_pop_ident(tok_stream_t* tokbuf) { ast_node_t* parse_type(parser_t* parser) { tok_stream_t* tokbuf = &parser->tokbuf; flush_peek_tok(tokbuf); - tok_type_t ttype = peek_tok_type(tokbuf); + cc_tktype_t ttype = peek_tok_type(tokbuf); data_type_t dtype; switch(ttype) { case TOKEN_VOID: dtype = TYPE_VOID; break; diff --git a/ccompiler/frontend/parser/symtab/hashmap.c b/ccompiler/frontend/parser/symtab/hashmap.c deleted file mode 100644 index d45593b..0000000 --- a/ccompiler/frontend/parser/symtab/hashmap.c +++ /dev/null @@ -1,53 +0,0 @@ -// hashmap.c -#include "hashmap.h" -#include -#include - -// DJB2哈希算法 -static unsigned long hash(const char* str) { - unsigned long hash = 5381; - int c; - while ((c = *str++)) - hash = ((hash << 5) + hash) + c; /* hash * 33 + c */ - return 
hash % HMAP_SIZE; -} - -void hmap_init(HashMap* map) { - memset(map->buckets, 0, sizeof(map->buckets)); -} - -void hmap_put(HashMap* map, const char* key, void* value) { - unsigned long idx = hash(key); - HashMapEntry* entry = malloc(sizeof(HashMapEntry)); - entry->key = strdup(key); - entry->value = value; - entry->next = map->buckets[idx]; - map->buckets[idx] = entry; -} - -void* hmap_get(HashMap* map, const char* key) { - unsigned long idx = hash(key); - HashMapEntry* entry = map->buckets[idx]; - while (entry) { - if (strcmp(entry->key, key) == 0) - return entry->value; - entry = entry->next; - } - return NULL; -} - -int hmap_contains(HashMap* map, const char* key) { - return hmap_get(map, key) != NULL; -} - -void hmap_destroy(HashMap* map) { - for (int i = 0; i < HMAP_SIZE; i++) { - HashMapEntry* entry = map->buckets[i]; - while (entry) { - HashMapEntry* next = entry->next; - free(entry->key); - free(entry); - entry = next; - } - } -} diff --git a/ccompiler/frontend/parser/symtab/hashmap.h b/ccompiler/frontend/parser/symtab/hashmap.h deleted file mode 100644 index b680b43..0000000 --- a/ccompiler/frontend/parser/symtab/hashmap.h +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef HASHMAP_H -#define HASHMAP_H - -#define HMAP_SIZE 64 - -typedef struct HashMapEntry { - char* key; - void* value; - struct HashMapEntry* next; -} HashMapEntry; - -typedef struct { - HashMapEntry* buckets[HMAP_SIZE]; -} HashMap; - -// 初始化哈希表 -void hmap_init(HashMap* map); - -// 插入键值对 -void hmap_put(HashMap* map, const char* key, void* value); - -// 查找键值 -void* hmap_get(HashMap* map, const char* key); - -// 检查键是否存在 -int hmap_contains(HashMap* map, const char* key); - -// 释放哈希表内存(不释放value) -void hmap_destroy(HashMap* map); - -#endif \ No newline at end of file diff --git a/ccompiler/frontend/parser/symtab/scope.c b/ccompiler/frontend/parser/symtab/scope.c deleted file mode 100644 index 1f1fcc5..0000000 --- a/ccompiler/frontend/parser/symtab/scope.c +++ /dev/null @@ -1,43 +0,0 @@ -// scope.c -#include 
"scope.h" -#include -#include - -typedef struct Scope Scope; - -Scope* scope_create(Scope* parent) { - Scope* scope = malloc(sizeof(Scope)); - hmap_init(&scope->symbols); - scope->parent = parent; - scope->base_offset = 0; - scope->cur_offset = 0; - return scope; -} - -void scope_destroy(Scope* scope) { - hmap_destroy(&scope->symbols); - free(scope); -} - -void scope_insert(Scope* scope, const char* name, void* symbol) { - if (hmap_contains(&scope->symbols, name)) { - // 处理重复定义错误 - fprintf(stderr, "Error: Symbol '%s' already defined\n", name); - exit(EXIT_FAILURE); - } - hmap_put(&scope->symbols, name, symbol); -} - -void* scope_lookup(Scope* scope, const char* name) { - void* symbol = NULL; - while (scope) { - symbol = hmap_get(&scope->symbols, name); - if (symbol) break; - scope = scope->parent; - } - return symbol; -} - -void* scope_lookup_current(Scope* scope, const char* name) { - return hmap_get(&scope->symbols, name); -} diff --git a/ccompiler/frontend/parser/symtab/scope.h b/ccompiler/frontend/parser/symtab/scope.h deleted file mode 100644 index 718e9f9..0000000 --- a/ccompiler/frontend/parser/symtab/scope.h +++ /dev/null @@ -1,28 +0,0 @@ -#ifndef SCOPE_H -#define SCOPE_H - -#include "hashmap.h" - -struct Scope { - HashMap symbols; // 当前作用域符号表 - struct Scope* parent; // 上层作用域 - int base_offset; - int cur_offset; -}; - -// 创建新作用域(父作用域可为NULL) -struct Scope* scope_create(struct Scope* parent); - -// 销毁作用域 -void scope_destroy(struct Scope* scope); - -// 在当前作用域插入符号 -void scope_insert(struct Scope* scope, const char* name, void* symbol); - -// 逐级查找符号 -void* scope_lookup(struct Scope* scope, const char* name); - -// 仅在当前作用域查找 -void* scope_lookup_current(struct Scope* scope, const char* name); - -#endif diff --git a/ccompiler/frontend/parser/symtab/symtab.c b/ccompiler/frontend/parser/symtab/symtab.c deleted file mode 100644 index 8f14ce8..0000000 --- a/ccompiler/frontend/parser/symtab/symtab.c +++ /dev/null @@ -1,50 +0,0 @@ -// symtab.c -#include 
"../../frontend.h" -#include -#include "scope.h" -#include "symtab.h" - -typedef symtab_t symtab_t; -typedef struct Scope Scope; - -void init_symtab(symtab_t* symtab) { - symtab->global_scope = scope_create(NULL); - symtab->cur_scope = symtab->global_scope; -} - -void del_symtab(symtab_t* symtab) { - scope_destroy(symtab->global_scope); -} - -void symtab_enter_scope(symtab_t* symtab) { - struct Scope* scope = scope_create(symtab->cur_scope); - scope->base_offset = symtab->cur_scope->base_offset + symtab->cur_scope->cur_offset; - symtab->cur_scope = scope; -} - -void symtab_leave_scope(symtab_t* symtab) { - Scope * scope = symtab->cur_scope; - if (scope == NULL) { - LOG_ERROR("cannot leave NULL scope or global scope"); - } - symtab->cur_scope = symtab->cur_scope->parent; - scope_destroy(scope); -} - -void* symtab_add_symbol(symtab_t* symtab, const char* name, void* ast_node, int can_duplicate) { - struct Scope* scope = symtab->cur_scope; - void* node = scope_lookup_current(scope, name); - if (node != NULL) { - if (!can_duplicate) { - LOG_ERROR("duplicate symbol %s", name); - } - return node; - } - - scope_insert(scope, name, ast_node); - return node; -} - -void* symtab_lookup_symbol(symtab_t* symtab, const char* name) { - return scope_lookup(symtab->cur_scope, name); -} diff --git a/ccompiler/frontend/parser/symtab/symtab.h b/ccompiler/frontend/parser/symtab/symtab.h deleted file mode 100644 index 97d7c00..0000000 --- a/ccompiler/frontend/parser/symtab/symtab.h +++ /dev/null @@ -1,18 +0,0 @@ -// symtab.h -#ifndef __SYMTAB_H__ -#define __SYMTAB_H__ - -typedef struct symtab { - struct Scope* cur_scope; - struct Scope* global_scope; -} symtab_t; - -void init_symtab(symtab_t* symtab); -void del_symtab(symtab_t* symtab); - -void symtab_enter_scope(symtab_t* symtab); -void symtab_leave_scope(symtab_t* symtab); -void* symtab_add_symbol(symtab_t* symtab, const char* name, void* ast_node, int can_duplicate); -void* symtab_lookup_symbol(symtab_t* symtab, const char* name); - 
-#endif diff --git a/ccompiler/frontend/parser/tests/test_parser.c b/ccompiler/frontend/parser/tests/test_parser.c index b03ddcc..e552814 100644 --- a/ccompiler/frontend/parser/tests/test_parser.c +++ b/ccompiler/frontend/parser/tests/test_parser.c @@ -6,6 +6,7 @@ // gcc -g ../parser.c ../../lexer/lexer.c ../ast/ast.c ../ast/block.c ../ast/decl.c ../ast/expr.c ../ast/func.c ../ast/program.c ../ast/stmt.c ../ast/term.c ../symtab/hashmap.c ../symtab/scope.c ../symtab/symtab.c test_parser.c -o test_parser // gcc -g test_parser.c -L../.. -lfrontend -o test_parser int main(int argc, char** argv) { + init_lib_core(); const char* file_name = "test_file.c"; if (argc == 2) { file_name = argv[1]; @@ -17,8 +18,10 @@ int main(int argc, char** argv) { } printf("open file success\n"); - struct Lexer lexer; - init_lexer(&lexer, file_name, fp, (lexer_sread_fn)fread_s); + lexer_t lexer; + strpool_t strpool; + init_strpool(&strpool); + init_lexer(&lexer, file_name, fp, (lexer_sread_fn)fread_s, &strpool); struct SymbolTable symtab; init_symtab(&symtab); diff --git a/lib/Makefile b/lib/Makefile index 1e5ab6d..1b6cc55 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -7,14 +7,27 @@ CFLAGS = -g -Wall -I.. 
RT_DIR = ./rt LOG_DIR = ./rt/log -# 源文件列表 +# basic rt lib SRCS = \ $(RT_DIR)/std/rt_std.c \ ./core.c \ $(RT_DIR)/rt.c \ $(RT_DIR)/rt_alloc.c \ + $(RT_DIR)/rt_string.c \ $(LOG_DIR)/log.c +# utils lib +UTILS_DIR = ./utils +DS_DIR = $(UTILS_DIR)/ds +STRPOOL_DIR = $(UTILS_DIR)/strpool +SYMTAB_DIR = $(UTILS_DIR)/symtab +TOKBUF_DIR = $(UTILS_DIR)/tokbuf +SRCS += \ + $(DS_DIR)/hashtable.c \ + $(STRPOOL_DIR)/strpool.c + # $(SYMTAB_DIR)/symtab.c + # $(TOKBUF_DIR)/tokbuf.c + # 生成目标文件列表 OBJS = $(SRCS:.c=.o) diff --git a/lib/utils/ds/hashtable.c b/lib/utils/ds/hashtable.c index c61a120..79a456f 100644 --- a/lib/utils/ds/hashtable.c +++ b/lib/utils/ds/hashtable.c @@ -1,142 +1,129 @@ #include "hashtable.h" -#define LOAD_FACTOR 0.75f -// 素数表用于桶扩容(最后一个元素为最大允许容量) -static const int PRIME_CAPACITIES[] = { - 11, 23, 47, 97, 193, 389, 769, 1543, 3079, - 6151, 12289, 24593, 49157, 98317, 196613, 393241, - 786433, 1572869, 3145739, 6291469, 12582917, 25165843 -}; +#define INIT_HASH_TABLE_SIZE (32) -// 私有函数声明 -static u32_t calc_hash(const char* str, int len); -static void rehash(hash_table_t* ht); - -hash_table_t* new_hash_table(int init_size, int max_cap) { - hash_table_t* ht = salloc_alloc(sizeof(hash_table_t)); - hash_table_init(ht, init_size, max_cap); - return ht; +void hashtable_init(hash_table_t* ht) { + vector_init(ht->entries); + ht->count = 0; + ht->tombstone_count = 0; + Assert(ht->key_cmp != NULL && ht->hash_func != NULL); } -static inline get_real_size(int size) { - // 查找第一个不小于size的素数容量 - int cap_idx = 0; - if (size < 0) { - return PRIME_CAPACITIES[SMCC_ARRLEN(PRIME_CAPACITIES)-1]; - } - while (PRIME_CAPACITIES[cap_idx] < size && cap_idx < SMCC_ARRLEN(PRIME_CAPACITIES)-1) { - cap_idx++; - } - return PRIME_CAPACITIES[cap_idx]; +static int next_power_of_two(int n) { + n--; + n |= n >> 1; + n |= n >> 2; + n |= n >> 4; + n |= n >> 8; + n |= n >> 16; + return n + 1; } -void hash_table_init(hash_table_t* ht, int init_size, int max_cap) { - // 限制最大容量索引 - ht->max_cap = 
get_real_size(max_cap); - // 应用实际容量 - ht->cap = get_real_size(init_size); - ht->size = 0; - ht->buckets = NULL; - ht->buckets = salloc_realloc(ht->buckets, sizeof(hash_node_t*) * ht->cap); -} +static hash_entry_t* find_entry(hash_table_t* ht, const void* key, u32_t hash) { + if (ht->entries.cap == 0) return NULL; + + u32_t index = hash & (ht->entries.cap - 1); // 容量是2的幂 + u32_t probe = 0; -void hash_table_insert(hash_table_t* ht, const char* str, int len) { - // 自动扩容检查 - if (ht->size >= ht->cap * LOAD_FACTOR && ht->cap < ht->max_cap) { - rehash(ht); - } - - if (ht->size >= ht->cap) { - LOG_TRACE("Hash table size exceeds maximum capacity. Consider increasing max_capacity."); - } - - // 计算哈希值 - u32_t hash = calc_hash(str, len); - int bucket_idx = hash % ht->cap; - - // 检查重复 - hash_node_t* node = ht->buckets[bucket_idx]; - while (node) { - if (node->hash == hash && - node->len == len && - memcmp(node->str, str, len) == 0) { - return; // 已存在 + hash_entry_t* tombstone = NULL; + + while (1) { + hash_entry_t* entry = &vector_at(ht->entries, index); + if (entry->state == ENTRY_EMPTY) { + return tombstone ? 
tombstone : entry; } - node = node->next; - } - - // 创建新节点 - hash_node_t* new_node = salloc_alloc(sizeof(hash_node_t)); - new_node->str = str; - new_node->len = len; - new_node->hash = hash; - new_node->next = ht->buckets[bucket_idx]; - ht->buckets[bucket_idx] = new_node; - ht->size++; -} - -hash_node_t* hash_table_find(hash_table_t* ht, const char* str, int len) { - u32_t hash = calc_hash(str, len); - int bucket_idx = hash % ht->cap; - - hash_node_t* node = ht->buckets[bucket_idx]; - while (node) { - if (node->hash == hash && - node->len == len && - memcmp(node->str, str, len) == 0) { - return node; + + if (entry->state == ENTRY_TOMBSTONE) { + if (!tombstone) tombstone = entry; + } else if (entry->hash == hash && ht->key_cmp(entry->key, key) == 0) { + return entry; } - node = node->next; + + // Linear probing + index = (index + 1) & (ht->entries.cap - 1); + probe++; + if (probe >= ht->entries.cap) break; } + LOG_ERROR("find_entry: hash table is full"); return NULL; } -static void rehash(hash_table_t* ht) { - int old_cap = ht->cap; - hash_node_t** old_buckets = ht->buckets; +static void adjust_capacity(hash_table_t* ht, int new_cap) { + new_cap = next_power_of_two(new_cap); + Assert(new_cap >= ht->entries.cap); - // 查找下一个素数容量 - int new_cap_idx = 0; - while (PRIME_CAPACITIES[new_cap_idx] <= old_cap && - new_cap_idx < ht->max_cap) { - new_cap_idx++; - } - ht->cap = PRIME_CAPACITIES[new_cap_idx]; + vector_header(old_entries, hash_entry_t); + old_entries.data = ht->entries.data; + old_entries.cap = ht->entries.cap; - // 分配新桶数组 - ht->buckets = salloc_alloc(sizeof(hash_node_t*) * ht->cap); - memset(ht->buckets, 0, sizeof(hash_node_t*) * ht->cap); + // Not used size but for gdb python extension debug + ht->entries.size = new_cap; + ht->entries.cap = new_cap; + ht->entries.data = salloc_realloc(NULL, new_cap * sizeof(hash_entry_t)); + rt_memset(ht->entries.data, 0, new_cap * sizeof(hash_entry_t)); - // 重新哈希所有节点 - for (int i = 0; i < old_cap; i++) { - hash_node_t* node = 
old_buckets[i]; - while (node) { - hash_node_t* next = node->next; - int new_bucket = node->hash % ht->cap; - node->next = ht->buckets[new_bucket]; - ht->buckets[new_bucket] = node; - node = next; + // rehash the all of the old data + for (rt_size_t i = 0; i < old_entries.cap; i++) { + hash_entry_t* entry = &vector_at(old_entries, i); + if (entry->state == ENTRY_ACTIVE) { + hash_entry_t* dest = find_entry(ht, entry->key, entry->hash); + *dest = *entry; } } - salloc_free(old_buckets); + vector_free(old_entries); + ht->tombstone_count = 0; } -static u32_t calc_hash(const char* str, int len) { - // 使用与HASH_FNV_1A宏一致的算法 - rt_strhash(str); -} - -void hash_table_destroy(hash_table_t* ht) { - for (int i = 0; i < ht->cap; i++) { - hash_node_t* node = ht->buckets[i]; - while (node) { - hash_node_t* next = node->next; - salloc_free(node); - node = next; - } +void* hashtable_set(hash_table_t* ht, const void* key, void* value) { + if (ht->count + ht->tombstone_count >= ht->entries.cap * 0.75) { + int new_cap = ht->entries.cap < INIT_HASH_TABLE_SIZE ? INIT_HASH_TABLE_SIZE : ht->entries.cap * 2; + adjust_capacity(ht, new_cap); } - salloc_free(ht->buckets); - ht->buckets = NULL; - ht->size = ht->cap = 0; -} \ No newline at end of file + + u32_t hash = ht->hash_func(key); + hash_entry_t* entry = find_entry(ht, key, hash); + + void* old_value = NULL; + if (entry->state == ENTRY_ACTIVE) { + old_value = entry->value; + } else { + if (entry->state == ENTRY_TOMBSTONE) ht->tombstone_count--; + ht->count++; + } + + entry->key = key; + entry->value = value; + entry->hash = hash; + entry->state = ENTRY_ACTIVE; + return old_value; +} + +void* hashtable_get(hash_table_t* ht, const void* key) { + if (ht->entries.cap == 0) return NULL; + + u32_t hash = ht->hash_func(key); + hash_entry_t* entry = find_entry(ht, key, hash); + return (entry && entry->state == ENTRY_ACTIVE) ? 
entry->value : NULL; +} + +void* hashtable_del(hash_table_t* ht, const void* key) { + if (ht->entries.cap == 0) return NULL; + + u32_t hash = ht->hash_func(key); + hash_entry_t* entry = find_entry(ht, key, hash); + + if (entry == NULL || entry->state != ENTRY_ACTIVE) return NULL; + + void* value = entry->value; + entry->state = ENTRY_TOMBSTONE; + ht->count--; + ht->tombstone_count++; + return value; +} + +void hashtable_destory(hash_table_t* ht) { + vector_free(ht->entries); + ht->count = 0; + ht->tombstone_count = 0; +} diff --git a/lib/utils/ds/hashtable.h b/lib/utils/ds/hashtable.h index 6c771f4..7bb21c8 100644 --- a/lib/utils/ds/hashtable.h +++ b/lib/utils/ds/hashtable.h @@ -1,27 +1,39 @@ #ifndef __SMCC_HASHTABLE_H__ #define __SMCC_HASHTABLE_H__ -#include +#include +#include "vector.h" -typedef struct hash_node { - const char* str; - int len; - u32_t hash; - struct hash_node* next; -} hash_node_t; +// 哈希表条目状态标记 +typedef enum hash_table_entry_state { + ENTRY_EMPTY, + ENTRY_ACTIVE, + ENTRY_TOMBSTONE +} ht_entry_state_t; +// 哈希表条目结构(不管理key/value内存) +typedef struct hash_entry { + const void* key; // 由调用者管理 + void* value; // 由调用者管理 + u32_t hash; // 预计算哈希值 + ht_entry_state_t state; // 条目状态 +} hash_entry_t; + +// 哈希表主体结构 typedef struct hash_table { - hash_node_t** buckets; - int size; - int cap; - int max_cap; + vector_header(entries, hash_entry_t); // 使用vector管理条目 + u32_t count; // 有效条目数(不含墓碑) + u32_t tombstone_count; // 墓碑数量 + u32_t (*hash_func)(const void* key); + int(*key_cmp)(const void* key1, const void* key2); } hash_table_t; -hash_table_t* new_hash_table(int init_size, int max_cap); -void hash_table_init(hash_table_t* ht, int init_size, int max_cap); -void hash_table_destroy(hash_table_t* ht); +// WARN you need set hash_func and key_cmp before use +void hashtable_init(hash_table_t* ht) ; -void hash_table_insert(hash_table_t* ht, const char* str, int len); -hash_node_t* hash_table_find(hash_table_t* ht, const char* str, int len); +void* 
hashtable_set(hash_table_t* ht, const void* key, void* value); +void* hashtable_get(hash_table_t* ht, const void* key); +void* hashtable_del(hash_table_t* ht, const void* key); +void hashtable_destory(hash_table_t* ht); #endif // __SMCC_HASHTABLE_H__ diff --git a/lib/utils/strpool/strpool.c b/lib/utils/strpool/strpool.c index e69de29..9b8690a 100644 --- a/lib/utils/strpool/strpool.c +++ b/lib/utils/strpool/strpool.c @@ -0,0 +1,32 @@ +#include "strpool.h" + +void init_strpool(strpool_t* pool) { + lalloc_init(&pool->stralloc); + + pool->ht.hash_func = (u32_t(*)(const void*))rt_strhash; + pool->ht.key_cmp = (int(*)(const void*, const void*))rt_strcmp; + hashtable_init(&pool->ht); +} + +const char* strpool_intern(strpool_t* pool, const char* str) { + void* existing = hashtable_get(&pool->ht, str); + if (existing) { + return existing; + } + + rt_size_t len = rt_strlen(str) + 1; + char* new_str = lalloc_alloc(&pool->stralloc, len); + if (!new_str) { + LOG_ERROR("strpool: Failed to allocate memory for string"); + return NULL; + } + rt_memcpy(new_str, str, len); + + hashtable_set(&pool->ht, new_str, new_str); + return new_str; +} + +void strpool_destroy(strpool_t* pool) { + hashtable_destory(&pool->ht); + lalloc_destroy(&pool->stralloc); +} diff --git a/lib/utils/strpool/strpool.h b/lib/utils/strpool/strpool.h index f0bf4cb..b24abad 100644 --- a/lib/utils/strpool/strpool.h +++ b/lib/utils/strpool/strpool.h @@ -2,11 +2,16 @@ #define __SMCC_STRPOOL_H__ #include -#include "../ds/hash.h" -typedef struct strpool { - long_alloc_t *long_alloc; -} strpool_t; +#include +#include -void new_strpool(); +typedef struct strpool { + hash_table_t ht; // 用于快速查找字符串 + long_alloc_t stralloc; // 专门用于字符串存储的分配器 +} strpool_t; + +void init_strpool(strpool_t* pool); +const char* strpool_intern(strpool_t* pool, const char* str); +void strpool_destroy(strpool_t* pool); #endif // __SMCC_STRPOOL_H__ diff --git a/lib/utils/symtab/symtab.h b/lib/utils/symtab/symtab.h index e69de29..07e4bf9 100644 --- 
a/lib/utils/symtab/symtab.h +++ b/lib/utils/symtab/symtab.h @@ -0,0 +1,6 @@ +#ifndef __SMCC_SYMTABL_H__ +#define __SMCC_SYMTABL_H__ + + + +#endif diff --git a/lib/utils/tokbuf/token.c b/lib/utils/tokbuf/tokbuf.c similarity index 100% rename from lib/utils/tokbuf/token.c rename to lib/utils/tokbuf/tokbuf.c diff --git a/lib/utils/tokbuf/token.h b/lib/utils/tokbuf/tokbuf.h similarity index 86% rename from lib/utils/tokbuf/token.h rename to lib/utils/tokbuf/tokbuf.h index e250850..af055d1 100644 --- a/lib/utils/tokbuf/token.h +++ b/lib/utils/tokbuf/tokbuf.h @@ -7,18 +7,20 @@ typedef struct loc { const char *fname; int line; int col; - short len; + int len; } loc_t; -typedef enum tok_type { +typedef enum tok_basic_type { TK_BASIC_INVALID, // 错误占位 TK_BASIC_KEYWORD, // 关键字 TK_BASIC_OPERATOR, // 操作符 TK_BASIC_IDENTIFIER, // 标识符 TK_BASIC_LITERAL, // 字面量 - TK_BASIC_PUNCTUATOR, // 标点符号 + + TK_BASIC_WHITESPACE, // 空白 + TK_BASIC_COMMENT, // 注释 TK_BASIC_EOF // 结束标记 -} tok_type_t; +} tok_basic_type_t; typedef union ctype { u8_t u8; @@ -34,10 +36,15 @@ typedef union ctype { iptr_t iptr; uptr_t uptr; void* ptr; + char ch; + int i; + + // MUST BE strpool ptr + const char* str; } ctype_t; typedef struct tok { - tok_type_t type; + tok_basic_type_t type; int sub_type; loc_t loc; ctype_t val; diff --git a/lib/utils/utils.h b/lib/utils/utils.h new file mode 100644 index 0000000..8d10737 --- /dev/null +++ b/lib/utils/utils.h @@ -0,0 +1,8 @@ +#ifndef __SMCC_LIB_UTILS_H__ +#define __SMCC_LIB_UTILS_H__ + +#include "strpool/strpool.h" +#include "symtab/symtab.h" +#include "tokbuf/tokbuf.h" + +#endif