feat(frontend): refactor the lexer

- Add a .gitignore file to ignore compiler-generated binaries
- Refactor lexer.c, improving keyword and string-literal handling
- Update the frontend entry, parser, and AST files to work with the new lexer
- Clean up token-related definitions and functions, introducing new token types
ZZY committed 2025-03-23 12:13:16 +08:00
parent 05c637e594, commit 2b4857001c
33 changed files with 532 additions and 624 deletions
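For reviewers, the visible API change is that callers now create a strpool_t and pass it to init_lexer. A minimal driver sketch of the new flow (file_name and the stream/sread pair are placeholders, as in the tests below):

    strpool_t strpool;
    init_strpool(&strpool);

    lexer_t lexer;
    init_lexer(&lexer, file_name, stream, sread, &strpool);

    tok_t tok;
    do {
        get_valid_token(&lexer, &tok);
    } while (tok.sub_type != TOKEN_EOF);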

.gitignore (vendored, new file, 22 lines)
View File

@ -0,0 +1,22 @@
.vscode/
# smcc compiler generated files
*.bin
# linux binary files
*.o
*.a
*.so
*.out
# windows binary files
*.obj
*.lib
*.dll
*.exe
# developed notes
note.md
# python
.venv

View File

@ -7,7 +7,7 @@
// instruction encoding union (handles little-endian byte order automatically)
typedef union rv32code {
uint32_t code;
u8_t bytes[4];
uint8_t bytes[4];
} rv32code_t;
#include "../../frontend/frontend.h"

View File

@ -4,14 +4,16 @@
ast_node_t* frontend(const char* file, void* stream, sread_fn sread) {
init_lib_core();
strpool_t strpool;
init_strpool(&strpool);
lexer_t lexer;
init_lexer(&lexer, file, stream, sread);
init_lexer(&lexer, file, stream, sread, &strpool);
symtab_t symtab;
init_symtab(&symtab);
parser_t parser;
init_parser(&parser, &lexer, &symtab);
parse_prog(&parser);

View File

@ -34,7 +34,7 @@ David Hanson / drh@drhanson.net
static const struct {
const char* name;
enum CSTD_KEYWORD std_type;
tok_type_t tok;
cc_tktype_t tok;
} keywords[] = {
#define X(name, std_type, tok, ...) { #name, std_type, tok },
KEYWORD_TABLE
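// Expansion note: each KEYWORD_TABLE row feeds the X above. A hypothetical
// row X(int, CSTD_C89, TOKEN_INT) (the real table contents are not part of
// this diff) would expand to the initializer:
//     { "int", CSTD_C89, TOKEN_INT },
// which keyword_cmp below matches identifiers against by name and length.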
@ -74,19 +74,17 @@ static inline int keyword_cmp(const char* name, int len) {
return -1; // Not a keyword.
}
void init_lexer(lexer_t* lexer, const char* file_name, void* stream, lexer_sread_fn sread) {
init_lib_core();
lexer->cur_ptr = lexer->end_ptr = (unsigned char*)&(lexer->buffer);
lexer->index = 1;
lexer->line = 1;
void init_lexer(lexer_t* lexer, const char* file_name, void* stream, lexer_sread_fn sread, strpool_t* strpool) {
lexer->strpool = strpool;
lexer->cur_ptr = lexer->end_ptr = (char*)&(lexer->buffer);
lexer->loc.fname = strpool_intern(lexer->strpool, file_name);
lexer->loc.line = 1;
lexer->loc.col = 1;
lexer->stream = stream;
lexer->sread = sread;
for (int i = 0; i < sizeof(lexer->buffer) / sizeof(lexer->buffer[0]); i++) {
lexer->buffer[i] = 0;
}
rt_memset(lexer->buffer, 0, sizeof(lexer->buffer));
}
static void flush_buffer(lexer_t* lexer) {
@ -94,7 +92,7 @@ static void flush_buffer(lexer_t* lexer) {
for (int i = 0; i < num; i++) {
lexer->buffer[i] = lexer->cur_ptr[i];
}
lexer->cur_ptr = (unsigned char*)lexer->buffer;
lexer->cur_ptr = lexer->buffer;
int read_size = LEXER_BUFFER_SIZE - num;
// TODO: converting rt_size_t to int may lose precision
@ -128,19 +126,20 @@ static void goto_block_comment(lexer_t* lexer) {
flush_buffer(lexer);
}
if (*lexer->cur_ptr == '\0') {
if (lexer->cur_ptr[0] == '\0') {
break;
} else if (lexer->cur_ptr[0] == '*' && lexer->cur_ptr[1] == '/') {
lexer->cur_ptr += 2;
break;
} else {
if (lexer->cur_ptr[0] == '\n') lexer->loc.line++;
lexer->cur_ptr++;
}
}
}
// TODO: escape-sequence handling is incomplete
static char got_slash(unsigned char* peek) {
static char got_slash(char* peek) {
switch (*peek) {
case '\\': return '\\';
case '\'': return '\'';
@ -162,7 +161,7 @@ static char got_slash(unsigned char* peek) {
static void parse_char_literal(lexer_t* lexer, tok_t* token) {
char val = 0;
unsigned char* peek = lexer->cur_ptr + 1;
char* peek = lexer->cur_ptr + 1;
if (*peek == '\\') {
peek++;
val = got_slash(peek);
@ -172,16 +171,14 @@ static void parse_char_literal(lexer_t* lexer, tok_t* token) {
}
if (*peek++ != '\'') LEX_ERROR("Unclosed character literal");
token->val.ch = val;
lexer->cur_ptr = peek;
token->val.have = 1;
token->type = TOKEN_CHAR_LITERAL;
token->val.ch = val;
}
static void parse_string_literal(lexer_t* lexer, tok_t* token) {
unsigned char* peek = lexer->cur_ptr + 1;
char* peek = lexer->cur_ptr + 1;
// TODO string literal size check
char* dest = token->val.str = rt._malloc(LEXER_MAX_TOKEN_SIZE + 1);
static char dest[LEXER_MAX_TOKEN_SIZE + 1];
int len = 0;
while (*peek != '"') {
@ -196,14 +193,15 @@ static void parse_string_literal(lexer_t* lexer, tok_t* token) {
dest[len++] = *peek++;
}
dest[len] = '\0';
lexer->cur_ptr = peek + 1;
token->val.have = 1;
token->type = TOKEN_STRING_LITERAL;
lexer->cur_ptr = peek + 1; // +1 for the closing `"`
lexer->loc.len = len + 2; // +2 for the opening and closing `"`
token->val.str = strpool_intern(lexer->strpool, dest);
}
// FIXME: written by AI; may contain errors
static void parse_number(lexer_t* lexer, tok_t* token) {
unsigned char* peek = lexer->cur_ptr;
char* peek = lexer->cur_ptr;
int base = 10;
int is_float = 0;
long long int_val = 0;
@ -278,14 +276,15 @@ static void parse_number(lexer_t* lexer, tok_t* token) {
}
// store the result
// TODO
lexer->loc.len = peek - lexer->cur_ptr;
lexer->cur_ptr = peek;
token->val.have = 1;
if (is_float) {
token->val.d = float_val;
token->type = TOKEN_FLOAT_LITERAL;
token->val.f32 = float_val;
token->sub_type = TOKEN_FLOAT_LITERAL;
} else {
token->val.ll = int_val;
token->type = TOKEN_INT_LITERAL;
token->val.i = int_val;
token->sub_type = TOKEN_INT_LITERAL;
}
}
@ -296,160 +295,159 @@ void get_token(lexer_t* lexer, tok_t* token) {
if (lexer->end_ptr - lexer->cur_ptr < GOT_ONE_TOKEN_BUF_SIZE) {
flush_buffer(lexer);
}
register unsigned char* peek = lexer->cur_ptr;
// fast path: skip spaces and tabs
while (*peek == ' ' || *peek == '\t') {
if (peek == lexer->end_ptr) {
break;
}
peek++;
}
if (peek != lexer->cur_ptr) {
// To TOKEN_FLUSH
lexer->cur_ptr = peek;
token->type = TOKEN_FLUSH;
}
tok_type_t tok = TOKEN_INIT;
tok_val_t constant;
constant.have = 0;
register char* peek = lexer->cur_ptr;
cc_tktype_t tk_type = TOKEN_INIT;
ctype_t literal = { 0 };
// scan one character at a time
switch (*peek++) {
case '=':
switch (*peek++) {
case '=': tok = TOKEN_EQ; break;
default: peek--, tok = TOKEN_ASSIGN; break;
case '=': tk_type = TOKEN_EQ; break;
default: peek--, tk_type = TOKEN_ASSIGN; break;
} break;
case '+':
switch (*peek++) {
case '+': tok = TOKEN_ADD_ADD; break;
case '=': tok = TOKEN_ASSIGN_ADD; break;
default: peek--, tok = TOKEN_ADD; break;
case '+': tk_type = TOKEN_ADD_ADD; break;
case '=': tk_type = TOKEN_ASSIGN_ADD; break;
default: peek--, tk_type = TOKEN_ADD; break;
} break;
case '-':
switch (*peek++) {
case '-': tok = TOKEN_SUB_SUB; break;
case '=': tok = TOKEN_ASSIGN_SUB; break;
case '-': tk_type = TOKEN_SUB_SUB; break;
case '=': tk_type = TOKEN_ASSIGN_SUB; break;
case '>': tok = TOKEN_DEREF; break;
default: peek--, tok = TOKEN_SUB; break;
case '>': tk_type = TOKEN_DEREF; break;
default: peek--, tk_type = TOKEN_SUB; break;
} break;
case '*':
switch (*peek++) {
case '=': tok = TOKEN_ASSIGN_MUL; break;
default: peek--, tok = TOKEN_MUL; break;
case '=': tk_type = TOKEN_ASSIGN_MUL; break;
default: peek--, tk_type = TOKEN_MUL; break;
} break;
case '/':
switch (*peek++) {
case '=': tok = TOKEN_ASSIGN_DIV; break;
case '=': tk_type = TOKEN_ASSIGN_DIV; break;
case '/': {
// consume the rest of the line
goto_newline(lexer);
tok = TOKEN_LINE_COMMENT;
tk_type = TOKEN_LINE_COMMENT;
goto END;
}
case '*': {
lexer->cur_ptr = peek;
goto_block_comment(lexer);
tok = TOKEN_BLOCK_COMMENT;
tk_type = TOKEN_BLOCK_COMMENT;
goto END;
}
default: peek--, tok = TOKEN_DIV; break;
default: peek--, tk_type = TOKEN_DIV; break;
} break;
case '%':
switch (*peek++) {
case '=': tok = TOKEN_ASSIGN_MOD; break;
default: peek--, tok = TOKEN_MOD; break;
case '=': tk_type = TOKEN_ASSIGN_MOD; break;
default: peek--, tk_type = TOKEN_MOD; break;
} break;
case '&':
switch (*peek++) {
case '&': tok = TOKEN_AND_AND; break;
case '=': tok = TOKEN_ASSIGN_AND; break;
default: peek--, tok = TOKEN_AND; break;
case '&': tk_type = TOKEN_AND_AND; break;
case '=': tk_type = TOKEN_ASSIGN_AND; break;
default: peek--, tk_type = TOKEN_AND; break;
} break;
case '|':
switch (*peek++) {
case '|': tok = TOKEN_OR_OR; break;
case '=': tok = TOKEN_ASSIGN_OR; break;
default: peek--, tok = TOKEN_OR; break;
case '|': tk_type = TOKEN_OR_OR; break;
case '=': tk_type = TOKEN_ASSIGN_OR; break;
default: peek--, tk_type = TOKEN_OR; break;
} break;
case '^':
switch (*peek++) {
case '=': tok = TOKEN_ASSIGN_XOR; break;
default: peek--, tok = TOKEN_XOR; break;
case '=': tk_type = TOKEN_ASSIGN_XOR; break;
default: peek--, tk_type = TOKEN_XOR; break;
} break;
case '<':
switch (*peek++) {
case '=': tok = TOKEN_LE; break;
case '<': tok = (*peek == '=') ? (peek++, TOKEN_ASSIGN_L_SH) : TOKEN_L_SH; break;
default: peek--, tok = TOKEN_LT; break;
case '=': tk_type = TOKEN_LE; break;
case '<': tk_type = (*peek == '=') ? (peek++, TOKEN_ASSIGN_L_SH) : TOKEN_L_SH; break;
default: peek--, tk_type = TOKEN_LT; break;
} break;
case '>':
switch (*peek++) {
case '=': tok = TOKEN_GE; break;
case '>': tok = (*peek == '=') ? (peek++, TOKEN_ASSIGN_R_SH) : TOKEN_R_SH; break;
default: peek--, tok = TOKEN_GT; break;
case '=': tk_type = TOKEN_GE; break;
case '>': tk_type = (*peek == '=') ? (peek++, TOKEN_ASSIGN_R_SH) : TOKEN_R_SH; break;
default: peek--, tk_type = TOKEN_GT; break;
} break;
case '~':
tok = TOKEN_BIT_NOT; break;
tk_type = TOKEN_BIT_NOT; break;
case '!':
switch (*peek++) {
case '=': tok = TOKEN_NEQ; break;
default: peek--, tok = TOKEN_NOT; break;
case '=': tk_type = TOKEN_NEQ; break;
default: peek--, tk_type = TOKEN_NOT; break;
} break;
case '[':
tok = TOKEN_L_BRACKET; break;
tk_type = TOKEN_L_BRACKET; break;
case ']':
tok = TOKEN_R_BRACKET; break;
tk_type = TOKEN_R_BRACKET; break;
case '(':
tok = TOKEN_L_PAREN; break;
tk_type = TOKEN_L_PAREN; break;
case ')':
tok = TOKEN_R_PAREN; break;
tk_type = TOKEN_R_PAREN; break;
case '{':
tok = TOKEN_L_BRACE; break;
tk_type = TOKEN_L_BRACE; break;
case '}':
tok = TOKEN_R_BRACE; break;
tk_type = TOKEN_R_BRACE; break;
case ';':
tok = TOKEN_SEMICOLON; break;
tk_type = TOKEN_SEMICOLON; break;
case ',':
tok = TOKEN_COMMA; break;
tk_type = TOKEN_COMMA; break;
case ':':
tok = TOKEN_COLON; break;
tk_type = TOKEN_COLON; break;
case '.':
if (peek[0] == '.' && peek[1] == '.') {
peek += 2;
tok = TOKEN_ELLIPSIS;
tk_type = TOKEN_ELLIPSIS;
} else {
tok = TOKEN_DOT;
tk_type = TOKEN_DOT;
}
break;
case '?':
tok = TOKEN_COND; break;
case '\v': case '\r': case '\f': // FIXME: parsed as blank characters
tok = TOKEN_FLUSH; break;
case '\n':
tk_type = TOKEN_COND; break;
case '\v': case '\r': case '\f':
case ' ': case '\t':
tk_type = TOKEN_BLANK; break;
case '\n':
// a newline is still emitted as a blank token, but the location must be updated
lexer->line++;
tok = TOKEN_FLUSH; break;
lexer->loc.line += 1;
lexer->loc.col = -1;
lexer->loc.len = 1;
tk_type = TOKEN_BLANK;
break;
case '#':
LEX_WARN("Marroc does not support in lexer rather in preprocessor, it will be ignored");
// TODO make line or file comment to change
LEX_WARN("Maroc does not support in lexer rather in preprocessor, it will be ignored");
goto_newline(lexer);
tok = TOKEN_FLUSH;
tk_type = TOKEN_BLANK;
goto END;
case '\0':
// EOF
tok = TOKEN_EOF;
tk_type = TOKEN_EOF;
goto END;
case '\'':
return parse_char_literal(lexer, token);
return;
parse_char_literal(lexer, token);
literal = token->val;
tk_type = TOKEN_CHAR_LITERAL;
goto END; break;
case '"':
return parse_string_literal(lexer, token);
parse_string_literal(lexer, token);
literal = token->val;
tk_type = TOKEN_STRING_LITERAL;
goto END; break;
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
return parse_number(lexer, token);
parse_number(lexer, token);
// TODO: simplify this
literal = token->val;
tk_type = token->sub_type;
goto END; break;
case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
@ -475,39 +473,53 @@ void get_token(lexer_t* lexer, tok_t* token) {
break;
}
int res = keyword_cmp((const char*)lexer->cur_ptr, peek - (lexer->cur_ptr));
int strlen = peek - lexer->cur_ptr;
int res = keyword_cmp((const char*)lexer->cur_ptr, strlen);
if (res == -1) {
int strlen = peek - lexer->cur_ptr;
unsigned char* str = rt._malloc(strlen + 1);
constant.have = 1;
constant.str = (char*)str;
for (int i = 0; i < strlen; i++) {
str[i] = lexer->cur_ptr[i];
}
str[strlen] = '\0';
constant.have = 1;
constant.str = (char*)str;
tok = TOKEN_IDENT; break;
char prev = lexer->cur_ptr[strlen];
lexer->cur_ptr[strlen] = '\0';
literal.str = strpool_intern(lexer->strpool, lexer->cur_ptr);
lexer->cur_ptr[strlen] = prev;
tk_type = TOKEN_IDENT; break;
} else {
tok = keywords[res].tok; break;
tk_type = keywords[res].tok; break;
}
default:
LEX_ERROR("unsupport char in sourse code `%c`", *(lexer->cur_ptr));
break;
}
lexer->loc.len = peek - lexer->cur_ptr;
lexer->cur_ptr = peek;
END:
token->val = constant;
token->type = tok;
LEX_DEBUG("get token `%s` (ch: %c, int: %d)", get_tok_name(token->type), token->val.ch, token->val.i);
lexer->loc.col += lexer->loc.len;
lexer->loc.len = 0;
token->val = literal;
token->sub_type = tk_type;
token->loc = lexer->loc;
static const tok_basic_type_t tok_type_map[] = {
// ordinary tokens use #str
#define X(str, basic, tok) [tok] = basic,
TOKEN_TABLE
#undef X
// keywords use #name
#define X(name, std, tok) [tok] = TK_BASIC_KEYWORD,
KEYWORD_TABLE
#undef X
};
token->type = tok_type_map[tk_type];
LEX_DEBUG("get token `%s` in %s:%d:%d", get_tok_name(tk_type),
token->loc.fname, token->loc.line, token->loc.col);
}
// get_token may return tokens the parser treats as invalid
void get_valid_token(lexer_t* lexer, tok_t* token) {
tok_type_t type;
tok_basic_type_t type;
do {
get_token(lexer, token);
type = token->type;
} while (type == TOKEN_FLUSH || type == TOKEN_LINE_COMMENT || type == TOKEN_BLOCK_COMMENT);
Assert(type != TK_BASIC_INVALID);
} while (type == TK_BASIC_WHITESPACE || type == TK_BASIC_COMMENT);
}

View File

@ -1,5 +1,5 @@
#ifndef __SMCC_LEXER_H__
#define __SMCC_LEXER_H__
#ifndef __SMCC_CC_LEXER_H__
#define __SMCC_CC_LEXER_H__
#include <lib/core.h>
#include "token.h"
@ -14,25 +14,25 @@ typedef int (*lexer_sread_fn)(void *dst_buf, int dst_size,
int elem_size, int count, void *stream);
typedef struct lexer {
int line;
int index;
// const char current_file_name[LEXER_BUFFER_SIZE+1];
loc_t loc;
unsigned char* cur_ptr; // current character to scan (not yet consumed)
unsigned char* end_ptr; // one past the last character in the buffer
char* cur_ptr; // current character to scan (not yet consumed)
char* end_ptr; // one past the last character in the buffer
char buffer[LEXER_BUFFER_SIZE+1];
lexer_sread_fn sread;
void* stream;
strpool_t* strpool;
} lexer_t;
void init_lexer(lexer_t* lexer, const char* file_name, void* stream,
lexer_sread_fn sread);
lexer_sread_fn sread, strpool_t* strpool);
// pure token getter; it will include empty tokens like TOKEN_FLUSH
// pure token getter; it will include empty tokens like TOKEN_BLANK
void get_token(lexer_t* lexer, tok_t* token);
// get_token may return tokens the parser treats as invalid (as TOKEN_FLUSH)
// get_token may return tokens the parser treats as invalid (as TOKEN_BLANK)
void get_valid_token(lexer_t* lexer, tok_t* token);
#endif

View File

@ -3,11 +3,44 @@
#include <lib/rt/rt.h>
#define LEX_NOTSET( fmt, ...) LOG_NOTSET("LEXER: " fmt, ##__VA_ARGS__)
#define LEX_DEBUG( fmt, ...) LOG_DEBUG("LEXER: " fmt, ##__VA_ARGS__)
#define LEX_INFO( fmt, ...) LOG_INFO("LEXER: " fmt, ##__VA_ARGS__)
#define LEX_WARN( fmt, ...) LOG_WARN("LEXER: " fmt, ##__VA_ARGS__)
#define LEX_ERROR( fmt, ...) LOG_ERROR("LEXER: " fmt, ##__VA_ARGS__)
#define LEX_FATAL( fmt, ...) LOG_FATAL("LEXER: " fmt, ##__VA_ARGS__)
#ifndef LEX_LOG_LEVEL
#define LEX_LOG_LEVEL 4
#endif
#if LEX_LOG_LEVEL <= 1
#define LEX_NOTSET( fmt, ...) LOG_NOTSET("LEXER: " fmt, ##__VA_ARGS__)
#else
#define LEX_NOTSET( fmt, ...)
#endif
#if LEX_LOG_LEVEL <= 2
#define LEX_DEBUG( fmt, ...) LOG_DEBUG( "LEXER: " fmt, ##__VA_ARGS__)
#else
#define LEX_DEBUG( fmt, ...)
#endif
#if LEX_LOG_LEVEL <= 3
#define LEX_INFO( fmt, ...) LOG_INFO( "LEXER: " fmt, ##__VA_ARGS__)
#else
#define LEX_INFO( fmt, ...)
#endif
#if LEX_LOG_LEVEL <= 4
#define LEX_WARN( fmt, ...) LOG_WARN( "LEXER: " fmt, ##__VA_ARGS__)
#else
#define LEX_WARN( fmt, ...)
#endif
#if LEX_LOG_LEVEL <= 5
#define LEX_ERROR( fmt, ...) LOG_ERROR("LEXER: " fmt, ##__VA_ARGS__)
#else
#define LEX_ERROR( fmt, ...)
#endif
#if LEX_LOG_LEVEL <= 6
#define LEX_FATAL( fmt, ...) LOG_FATAL("LEXER: " fmt, ##__VA_ARGS__)
#else
#define LEX_FATAL( fmt, ...)
#endif
#endif // __SMCC_LEXER_LOG_H__
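With the log macros now gated at compile time, the Makefile's -DLEX_LOG_LEVEL=4 (below) keeps warnings and above while the cheaper levels compile away entirely. For instance:

    // with -DLEX_LOG_LEVEL=4:
    LEX_DEBUG("got token");   // expands to nothing, zero runtime cost
    LEX_WARN("bad char");     // still expands to LOG_WARN("LEXER: bad char")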

View File

@ -1,5 +1,5 @@
CC = gcc
CFLAGS = -g -Wall -I../../../..
CFLAGS = -g -Wall -I../../../.. -DLEX_LOG_LEVEL=4
SRC = ../lexer.c ../token.c
LIB = -L../../../../lib -lcore

View File

@ -38,14 +38,18 @@ int main(int argc, char* argv[]) {
printf("open file success\n");
lexer_t lexer;
init_lexer(&lexer, file_name, fp, (lexer_sread_fn)fread_s);
strpool_t strpool;
init_strpool(&strpool);
init_lexer(&lexer, file_name, fp, (lexer_sread_fn)fread_s, &strpool);
tok_t tok;
while (1) {
get_valid_token(&lexer, &tok);
if (tok.type == TOKEN_EOF) {
if (tok.sub_type == TOKEN_EOF) {
break;
}
LOG_DEBUG("tk type `%s` in %s:%d:%d", get_tok_name(tok.sub_type), tok.loc.fname, tok.loc.line, tok.loc.col);
// LOG_DEBUG("%s", tok.val.str);
// printf("line: %d, column: %d, type: %3d, typename: %s\n",
// lexer.line, lexer.index, tok.type, get_tok_name(tok.type));
}

View File

@ -1,5 +1,5 @@
// test_lexer.c
#include "../../../../libcore/acutest.h"
#include <lib/acutest.h>
#include "../lexer.h"
#include <string.h>
@ -13,7 +13,7 @@ int test_read(void *dst_buf, int dst_size, int elem_size, int count, void *strea
}
// test helper functions
static inline void test_lexer_string(const char* input, tok_type_t expected_type) {
static inline void test_lexer_string(const char* input, cc_tktype_t expected_type) {
lexer_t lexer;
tok_t token;

View File

@ -52,14 +52,14 @@ tok_t *peek_tok(tok_stream_t *tokbuf) {
return &(tokbuf->buf[idx]);
}
tok_type_t peek_tok_type(tok_stream_t* tokbuf) {
return peek_tok(tokbuf)->type;
cc_tktype_t peek_tok_type(tok_stream_t* tokbuf) {
return peek_tok(tokbuf)->sub_type;
}
int expect_pop_tok(tok_stream_t* tokbuf, tok_type_t type) {
int expect_pop_tok(tok_stream_t* tokbuf, cc_tktype_t type) {
flush_peek_tok(tokbuf);
tok_t* tok = peek_tok(tokbuf);
if (tok->type != type) {
if (tok->sub_type != type) {
LEX_ERROR("expected tok `%s` but got `%s`", get_tok_name(type), get_tok_name(tok->type));
return 0;
} else {
@ -71,7 +71,7 @@ int expect_pop_tok(tok_stream_t* tokbuf, tok_type_t type) {
// generate the string map (choose #str or #name as needed)
static const char* token_strings[] = {
// ordinary tokens use #str
#define X(str, tok) [tok] = #str,
#define X(str, basic, tok) [tok] = #str,
TOKEN_TABLE
#undef X
@ -81,6 +81,6 @@ static const char* token_strings[] = {
#undef X
};
const char* get_tok_name(tok_type_t type) {
const char* get_tok_name(cc_tktype_t type) {
return token_strings[type];
}

View File

@ -1,5 +1,7 @@
#ifndef __TOKEN_H__
#define __TOKEN_H__
#ifndef __SMCC_CC_TOKEN_H__
#define __SMCC_CC_TOKEN_H__
#include <lib/utils/utils.h>
enum CSTD_KEYWORD {
CSTD_C89,
@ -46,68 +48,68 @@ enum CSTD_KEYWORD {
// KEYWORD_TABLE
#define TOKEN_TABLE \
X(EOF , TOKEN_EOF) \
X(init , TOKEN_INIT) \
X(flush , TOKEN_FLUSH) \
X("==" , TOKEN_EQ) \
X("=" , TOKEN_ASSIGN) \
X("++" , TOKEN_ADD_ADD) \
X("+=" , TOKEN_ASSIGN_ADD) \
X("+" , TOKEN_ADD) \
X("--" , TOKEN_SUB_SUB) \
X("-=" , TOKEN_ASSIGN_SUB) \
X("->" , TOKEN_DEREF) \
X("-" , TOKEN_SUB) \
X("*=" , TOKEN_ASSIGN_MUL) \
X("*" , TOKEN_MUL) \
X("/=" , TOKEN_ASSIGN_DIV) \
X("/" , TOKEN_DIV) \
X("//" , TOKEN_LINE_COMMENT) \
X("/* */" , TOKEN_BLOCK_COMMENT) \
X("%=" , TOKEN_ASSIGN_MOD) \
X("%" , TOKEN_MOD) \
X("&&" , TOKEN_AND_AND) \
X("&=" , TOKEN_ASSIGN_AND) \
X("&" , TOKEN_AND) \
X("||" , TOKEN_OR_OR) \
X("|=" , TOKEN_ASSIGN_OR) \
X("|" , TOKEN_OR) \
X("^=" , TOKEN_ASSIGN_XOR) \
X("^" , TOKEN_XOR) \
X("<<=" , TOKEN_ASSIGN_L_SH) \
X("<<" , TOKEN_L_SH) \
X("<=" , TOKEN_LE) \
X("<" , TOKEN_LT) \
X(">>=" , TOKEN_ASSIGN_R_SH) \
X(">>" , TOKEN_R_SH) \
X(">=" , TOKEN_GE) \
X(">" , TOKEN_GT) \
X("!" , TOKEN_NOT) \
X("!=" , TOKEN_NEQ) \
X("~" , TOKEN_BIT_NOT) \
X("[" , TOKEN_L_BRACKET) \
X("]" , TOKEN_R_BRACKET) \
X("(" , TOKEN_L_PAREN) \
X(")" , TOKEN_R_PAREN) \
X("{" , TOKEN_L_BRACE) \
X("}" , TOKEN_R_BRACE) \
X(";" , TOKEN_SEMICOLON) \
X("," , TOKEN_COMMA) \
X(":" , TOKEN_COLON) \
X("." , TOKEN_DOT) \
X("..." , TOKEN_ELLIPSIS) \
X("?" , TOKEN_COND) \
X(identifier , TOKEN_IDENT) \
X(int_literal , TOKEN_INT_LITERAL) \
X(float_literal , TOKEN_FLOAT_LITERAL) \
X(char_literal , TOKEN_CHAR_LITERAL) \
X(string_literal , TOKEN_STRING_LITERAL) \
X(init , TK_BASIC_INVALID, TOKEN_INIT) \
X(EOF , TK_BASIC_EOF, TOKEN_EOF) \
X(blank , TK_BASIC_WHITESPACE, TOKEN_BLANK) \
X("==" , TK_BASIC_OPERATOR, TOKEN_EQ) \
X("=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN) \
X("++" , TK_BASIC_OPERATOR, TOKEN_ADD_ADD) \
X("+=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN_ADD) \
X("+" , TK_BASIC_OPERATOR, TOKEN_ADD) \
X("--" , TK_BASIC_OPERATOR, TOKEN_SUB_SUB) \
X("-=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN_SUB) \
X("->" , TK_BASIC_OPERATOR, TOKEN_DEREF) \
X("-" , TK_BASIC_OPERATOR, TOKEN_SUB) \
X("*=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN_MUL) \
X("*" , TK_BASIC_OPERATOR, TOKEN_MUL) \
X("/=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN_DIV) \
X("/" , TK_BASIC_OPERATOR, TOKEN_DIV) \
X("//" , TK_BASIC_COMMENT , TOKEN_LINE_COMMENT) \
X("/* */" , TK_BASIC_COMMENT , TOKEN_BLOCK_COMMENT) \
X("%=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN_MOD) \
X("%" , TK_BASIC_OPERATOR, TOKEN_MOD) \
X("&&" , TK_BASIC_OPERATOR, TOKEN_AND_AND) \
X("&=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN_AND) \
X("&" , TK_BASIC_OPERATOR, TOKEN_AND) \
X("||" , TK_BASIC_OPERATOR, TOKEN_OR_OR) \
X("|=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN_OR) \
X("|" , TK_BASIC_OPERATOR, TOKEN_OR) \
X("^=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN_XOR) \
X("^" , TK_BASIC_OPERATOR, TOKEN_XOR) \
X("<<=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN_L_SH) \
X("<<" , TK_BASIC_OPERATOR, TOKEN_L_SH) \
X("<=" , TK_BASIC_OPERATOR, TOKEN_LE) \
X("<" , TK_BASIC_OPERATOR, TOKEN_LT) \
X(">>=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN_R_SH) \
X(">>" , TK_BASIC_OPERATOR, TOKEN_R_SH) \
X(">=" , TK_BASIC_OPERATOR, TOKEN_GE) \
X(">" , TK_BASIC_OPERATOR, TOKEN_GT) \
X("!" , TK_BASIC_OPERATOR, TOKEN_NOT) \
X("!=" , TK_BASIC_OPERATOR, TOKEN_NEQ) \
X("~" , TK_BASIC_OPERATOR, TOKEN_BIT_NOT) \
X("[" , TK_BASIC_OPERATOR, TOKEN_L_BRACKET) \
X("]" , TK_BASIC_OPERATOR, TOKEN_R_BRACKET) \
X("(" , TK_BASIC_OPERATOR, TOKEN_L_PAREN) \
X(")" , TK_BASIC_OPERATOR, TOKEN_R_PAREN) \
X("{" , TK_BASIC_OPERATOR, TOKEN_L_BRACE) \
X("}" , TK_BASIC_OPERATOR, TOKEN_R_BRACE) \
X(";" , TK_BASIC_OPERATOR, TOKEN_SEMICOLON) \
X("," , TK_BASIC_OPERATOR, TOKEN_COMMA) \
X(":" , TK_BASIC_OPERATOR, TOKEN_COLON) \
X("." , TK_BASIC_OPERATOR, TOKEN_DOT) \
X("..." , TK_BASIC_OPERATOR, TOKEN_ELLIPSIS) \
X("?" , TK_BASIC_OPERATOR, TOKEN_COND) \
X(ident , TK_BASIC_IDENTIFIER, TOKEN_IDENT) \
X(int_literal , TK_BASIC_LITERAL, TOKEN_INT_LITERAL) \
X(float_literal , TK_BASIC_LITERAL, TOKEN_FLOAT_LITERAL) \
X(char_literal , TK_BASIC_LITERAL, TOKEN_CHAR_LITERAL) \
X(string_literal , TK_BASIC_LITERAL, TOKEN_STRING_LITERAL) \
// END
// define the token-type enum
typedef enum tok_type {
typedef enum cc_tktype {
// ordinary tokens
#define X(str, tok) tok,
#define X(str, basic, tok) tok,
TOKEN_TABLE
#undef X
@ -115,24 +117,7 @@ typedef enum tok_type {
#define X(name, std, tok) tok,
KEYWORD_TABLE
#undef X
} tok_type_t;
typedef struct tok_val {
int have;
union {
char ch;
int i;
float f;
double d;
long long ll;
char* str;
};
} tok_val_t;
typedef struct tok {
tok_type_t type;
tok_val_t val;
} tok_t;
} cc_tktype_t;
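// Expansion sketch for one TOKEN_TABLE row, X("==", TK_BASIC_OPERATOR, TOKEN_EQ):
//   cc_tktype_t (here):        ... TOKEN_EQ, ...
//   token_strings[] (token.c): ... [TOKEN_EQ] = "\"==\"", ...   (via #str)
//   tok_type_map[] (lexer.c):  ... [TOKEN_EQ] = TK_BASIC_OPERATOR, ...
// One row keeps the enum value, the printable name, and the basic type in sync.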
typedef struct tok_stream {
int cur;
@ -150,8 +135,8 @@ void init_tokbuf(tok_stream_t* tokbuf, void* stream, tok_stream_get_func gettok)
tok_t* peek_tok(tok_stream_t* tokbuf);
tok_t* pop_tok(tok_stream_t* tokbuf);
void flush_peek_tok(tok_stream_t* tokbuf);
tok_type_t peek_tok_type(tok_stream_t* tokbuf);
int expect_pop_tok(tok_stream_t* tokbuf, tok_type_t type);
const char* get_tok_name(tok_type_t type);
cc_tktype_t peek_tok_type(tok_stream_t* tokbuf);
int expect_pop_tok(tok_stream_t* tokbuf, cc_tktype_t type);
const char* get_tok_name(cc_tktype_t type);
#endif

View File

@ -19,7 +19,7 @@ ast_node_t* parse_block(parser_t* parser) {
symtab_enter_scope(parser->symtab);
tok_stream_t *tokbuf = &parser->tokbuf;
flush_peek_tok(tokbuf);
tok_type_t ttype;
cc_tktype_t ttype;
ast_node_t* node = new_ast_node_block();
expect_pop_tok(tokbuf, TOKEN_L_BRACE);

View File

@ -37,7 +37,7 @@ int peek_decl(tok_stream_t* tokbuf) {
ast_node_t* parse_decl_val(parser_t* parser) {
tok_stream_t* tokbuf = &parser->tokbuf;
tok_type_t ttype;
cc_tktype_t ttype;
flush_peek_tok(tokbuf);
ast_node_t* node;
@ -69,7 +69,7 @@ ast_node_t* parse_decl_val(parser_t* parser) {
ast_node_t* parse_decl(parser_t* parser) {
tok_stream_t* tokbuf = &parser->tokbuf;
flush_peek_tok(tokbuf);
tok_type_t ttype;
cc_tktype_t ttype;
ast_node_t* node;
if (peek_decl(tokbuf) == 0) {

View File

@ -82,7 +82,7 @@ static ast_node_t* parse_comma(tok_stream_t* tokbuf, symtab_t *symtab, ast_node_
static ast_node_t* parse_assign(tok_stream_t* tokbuf, symtab_t *symtab, ast_node_t* left) {
flush_peek_tok(tokbuf);
tok_type_t ttype = peek_tok_type(tokbuf);
cc_tktype_t ttype = peek_tok_type(tokbuf);
pop_tok(tokbuf);
ast_node_t* node = new_ast_node();
node->type = NT_ASSIGN;
@ -133,7 +133,7 @@ static ast_node_t* parse_assign(tok_stream_t* tokbuf, symtab_t *symtab, ast_node
static ast_node_t* parse_cmp(tok_stream_t* tokbuf, symtab_t *symtab, ast_node_t* left) {
flush_peek_tok(tokbuf);
tok_type_t ttype = peek_tok_type(tokbuf);
cc_tktype_t ttype = peek_tok_type(tokbuf);
pop_tok(tokbuf);
ast_node_t* node = new_ast_node();
// saved left
@ -171,7 +171,7 @@ static ast_node_t* parse_cmp(tok_stream_t* tokbuf, symtab_t *symtab, ast_node_t*
static ast_node_t* parse_cal(tok_stream_t* tokbuf, symtab_t *symtab, ast_node_t* left) {
flush_peek_tok(tokbuf);
tok_type_t ttype = peek_tok_type(tokbuf);
cc_tktype_t ttype = peek_tok_type(tokbuf);
pop_tok(tokbuf);
ast_node_t* node = new_ast_node();
node->expr.left = left;
@ -238,7 +238,7 @@ static ast_node_t* parse_call(tok_stream_t* tokbuf, symtab_t *symtab, ast_node_t
vector_init(node->call.params->params.params);
pop_tok(tokbuf); // skip '('
tok_type_t ttype;
cc_tktype_t ttype;
while (1) {
flush_peek_tok(tokbuf);
ttype = peek_tok_type(tokbuf);
@ -330,7 +330,7 @@ static ast_node_t *parse_primary_expression(tok_stream_t* tokbuf, symtab_t *symt
node->type = NT_TERM_VAL;
node->syms.tok = *tok;
switch (tok->type) {
switch (tok->sub_type) {
case TOKEN_INT_LITERAL:
// node->data.data_type = TYPE_INT;
break;
@ -344,7 +344,7 @@ static ast_node_t *parse_primary_expression(tok_stream_t* tokbuf, symtab_t *symt
// node->data.data_type = TYPE_POINTER;
case TOKEN_IDENT:
node = expect_pop_ident(tokbuf);
tok_type_t ttype = peek_tok_type(tokbuf);
cc_tktype_t ttype = peek_tok_type(tokbuf);
if (ttype == TOKEN_L_PAREN) {
node = parse_call(tokbuf, symtab, node);
} else {
@ -365,7 +365,7 @@ END:
}
static ast_node_t *parse_subexpression(tok_stream_t* tokbuf, symtab_t *symtab, enum Precedence prec) {
tok_type_t ttype;
cc_tktype_t ttype;
struct expr_prec_table_t* work;
ast_node_t* left;
@ -400,7 +400,7 @@ ast_node_t* parse_expr(parser_t* parser) {
tok_stream_t* tokbuf = &(parser->tokbuf);
symtab_t *symtab = parser->symtab;
flush_peek_tok(tokbuf);
tok_type_t ttype = peek_tok_type(tokbuf);
cc_tktype_t ttype = peek_tok_type(tokbuf);
switch (ttype) {
case TOKEN_NOT:
case TOKEN_AND:

View File

@ -9,7 +9,7 @@
// TODO: semantic analysis should push these into the symbol table
static void parse_params(parser_t* parser, tok_stream_t* cache, ast_node_t* node) {
flush_peek_tok(cache);
tok_type_t ttype;
cc_tktype_t ttype;
ast_node_t *params = new_ast_node();
node->decl_func.params = params;
vector_init(params->params.params);
@ -79,7 +79,7 @@ ast_type_t check_is_func_decl(tok_stream_t* tokbuf, tok_stream_t* cache) {
LOG_ERROR("function parameter list too long");
}
cache->buf[cache->size++] = *tok;
switch (tok->type) {
switch (tok->sub_type) {
case TOKEN_L_PAREN:
depth++;
break;

View File

@ -4,7 +4,7 @@
ast_node_t* parse_stmt(parser_t* parser) {
tok_stream_t* tokbuf = &parser->tokbuf;
flush_peek_tok(tokbuf);
tok_type_t ttype = peek_tok_type(tokbuf);
cc_tktype_t ttype = peek_tok_type(tokbuf);
ast_node_t* node = new_ast_node();
switch (ttype) {
case TOKEN_IF: {

View File

@ -3,8 +3,8 @@
#include "../type.h"
ast_node_t* new_ast_ident_node(tok_t* tok) {
if (tok->type != TOKEN_IDENT) {
LOG_ERROR("syntax error: want identifier but got %d", tok->type);
if (tok->sub_type != TOKEN_IDENT) {
LOG_ERROR("syntax error: want identifier but got %d", tok->sub_type);
}
ast_node_t* node = new_ast_node();
node->type = NT_TERM_IDENT;
@ -24,7 +24,7 @@ ast_node_t* expect_pop_ident(tok_stream_t* tokbuf) {
ast_node_t* parse_type(parser_t* parser) {
tok_stream_t* tokbuf = &parser->tokbuf;
flush_peek_tok(tokbuf);
tok_type_t ttype = peek_tok_type(tokbuf);
cc_tktype_t ttype = peek_tok_type(tokbuf);
data_type_t dtype;
switch(ttype) {
case TOKEN_VOID: dtype = TYPE_VOID; break;

View File

@ -1,53 +0,0 @@
// hashmap.c
#include "hashmap.h"
#include <stdlib.h>
#include <string.h>
// DJB2 hash algorithm
static unsigned long hash(const char* str) {
unsigned long hash = 5381;
int c;
while ((c = *str++))
hash = ((hash << 5) + hash) + c; /* hash * 33 + c */
return hash % HMAP_SIZE;
}
void hmap_init(HashMap* map) {
memset(map->buckets, 0, sizeof(map->buckets));
}
void hmap_put(HashMap* map, const char* key, void* value) {
unsigned long idx = hash(key);
HashMapEntry* entry = malloc(sizeof(HashMapEntry));
entry->key = strdup(key);
entry->value = value;
entry->next = map->buckets[idx];
map->buckets[idx] = entry;
}
void* hmap_get(HashMap* map, const char* key) {
unsigned long idx = hash(key);
HashMapEntry* entry = map->buckets[idx];
while (entry) {
if (strcmp(entry->key, key) == 0)
return entry->value;
entry = entry->next;
}
return NULL;
}
int hmap_contains(HashMap* map, const char* key) {
return hmap_get(map, key) != NULL;
}
void hmap_destroy(HashMap* map) {
for (int i = 0; i < HMAP_SIZE; i++) {
HashMapEntry* entry = map->buckets[i];
while (entry) {
HashMapEntry* next = entry->next;
free(entry->key);
free(entry);
entry = next;
}
}
}

View File

@ -1,31 +0,0 @@
#ifndef HASHMAP_H
#define HASHMAP_H
#define HMAP_SIZE 64
typedef struct HashMapEntry {
char* key;
void* value;
struct HashMapEntry* next;
} HashMapEntry;
typedef struct {
HashMapEntry* buckets[HMAP_SIZE];
} HashMap;
// initialize the hash map
void hmap_init(HashMap* map);
// insert a key/value pair
void hmap_put(HashMap* map, const char* key, void* value);
// look up a value by key
void* hmap_get(HashMap* map, const char* key);
// check whether a key exists
int hmap_contains(HashMap* map, const char* key);
// free the hash map's memory (does not free the values)
void hmap_destroy(HashMap* map);
#endif

View File

@ -1,43 +0,0 @@
// scope.c
#include "scope.h"
#include <stdio.h>
#include <stdlib.h>
typedef struct Scope Scope;
Scope* scope_create(Scope* parent) {
Scope* scope = malloc(sizeof(Scope));
hmap_init(&scope->symbols);
scope->parent = parent;
scope->base_offset = 0;
scope->cur_offset = 0;
return scope;
}
void scope_destroy(Scope* scope) {
hmap_destroy(&scope->symbols);
free(scope);
}
void scope_insert(Scope* scope, const char* name, void* symbol) {
if (hmap_contains(&scope->symbols, name)) {
// report a duplicate-definition error
fprintf(stderr, "Error: Symbol '%s' already defined\n", name);
exit(EXIT_FAILURE);
}
hmap_put(&scope->symbols, name, symbol);
}
void* scope_lookup(Scope* scope, const char* name) {
void* symbol = NULL;
while (scope) {
symbol = hmap_get(&scope->symbols, name);
if (symbol) break;
scope = scope->parent;
}
return symbol;
}
void* scope_lookup_current(Scope* scope, const char* name) {
return hmap_get(&scope->symbols, name);
}

View File

@ -1,28 +0,0 @@
#ifndef SCOPE_H
#define SCOPE_H
#include "hashmap.h"
struct Scope {
HashMap symbols; // symbol table of the current scope
struct Scope* parent; // enclosing scope
int base_offset;
int cur_offset;
};
// create a new scope (the parent scope may be NULL)
struct Scope* scope_create(struct Scope* parent);
// destroy a scope
void scope_destroy(struct Scope* scope);
// insert a symbol into the current scope
void scope_insert(struct Scope* scope, const char* name, void* symbol);
// look up a symbol, walking outward through enclosing scopes
void* scope_lookup(struct Scope* scope, const char* name);
// look up only in the current scope
void* scope_lookup_current(struct Scope* scope, const char* name);
#endif

View File

@ -1,50 +0,0 @@
// symtab.c
#include "../../frontend.h"
#include <lib/core.h>
#include "scope.h"
#include "symtab.h"
typedef symtab_t symtab_t;
typedef struct Scope Scope;
void init_symtab(symtab_t* symtab) {
symtab->global_scope = scope_create(NULL);
symtab->cur_scope = symtab->global_scope;
}
void del_symtab(symtab_t* symtab) {
scope_destroy(symtab->global_scope);
}
void symtab_enter_scope(symtab_t* symtab) {
struct Scope* scope = scope_create(symtab->cur_scope);
scope->base_offset = symtab->cur_scope->base_offset + symtab->cur_scope->cur_offset;
symtab->cur_scope = scope;
}
void symtab_leave_scope(symtab_t* symtab) {
Scope * scope = symtab->cur_scope;
if (scope == NULL) {
LOG_ERROR("cannot leave NULL scope or global scope");
}
symtab->cur_scope = symtab->cur_scope->parent;
scope_destroy(scope);
}
void* symtab_add_symbol(symtab_t* symtab, const char* name, void* ast_node, int can_duplicate) {
struct Scope* scope = symtab->cur_scope;
void* node = scope_lookup_current(scope, name);
if (node != NULL) {
if (!can_duplicate) {
LOG_ERROR("duplicate symbol %s", name);
}
return node;
}
scope_insert(scope, name, ast_node);
return node;
}
void* symtab_lookup_symbol(symtab_t* symtab, const char* name) {
return scope_lookup(symtab->cur_scope, name);
}

View File

@ -1,18 +0,0 @@
// symtab.h
#ifndef __SYMTAB_H__
#define __SYMTAB_H__
typedef struct symtab {
struct Scope* cur_scope;
struct Scope* global_scope;
} symtab_t;
void init_symtab(symtab_t* symtab);
void del_symtab(symtab_t* symtab);
void symtab_enter_scope(symtab_t* symtab);
void symtab_leave_scope(symtab_t* symtab);
void* symtab_add_symbol(symtab_t* symtab, const char* name, void* ast_node, int can_duplicate);
void* symtab_lookup_symbol(symtab_t* symtab, const char* name);
#endif

View File

@ -6,6 +6,7 @@
// gcc -g ../parser.c ../../lexer/lexer.c ../ast/ast.c ../ast/block.c ../ast/decl.c ../ast/expr.c ../ast/func.c ../ast/program.c ../ast/stmt.c ../ast/term.c ../symtab/hashmap.c ../symtab/scope.c ../symtab/symtab.c test_parser.c -o test_parser
// gcc -g test_parser.c -L../.. -lfrontend -o test_parser
int main(int argc, char** argv) {
init_lib_core();
const char* file_name = "test_file.c";
if (argc == 2) {
file_name = argv[1];
@ -17,8 +18,10 @@ int main(int argc, char** argv) {
}
printf("open file success\n");
struct Lexer lexer;
init_lexer(&lexer, file_name, fp, (lexer_sread_fn)fread_s);
lexer_t lexer;
strpool_t strpool;
init_strpool(&strpool);
init_lexer(&lexer, file_name, fp, (lexer_sread_fn)fread_s, &strpool);
struct SymbolTable symtab;
init_symtab(&symtab);

View File

@ -7,14 +7,27 @@ CFLAGS = -g -Wall -I..
RT_DIR = ./rt
LOG_DIR = ./rt/log
# source file list
# basic rt lib
SRCS = \
$(RT_DIR)/std/rt_std.c \
./core.c \
$(RT_DIR)/rt.c \
$(RT_DIR)/rt_alloc.c \
$(RT_DIR)/rt_string.c \
$(LOG_DIR)/log.c
# utils lib
UTILS_DIR = ./utils
DS_DIR = $(UTILS_DIR)/ds
STRPOOL_DIR = $(UTILS_DIR)/strpool
SYMTAB_DIR = $(UTILS_DIR)/symtab
TOKBUF_DIR = $(UTILS_DIR)/tokbuf
SRCS += \
$(DS_DIR)/hashtable.c \
$(STRPOOL_DIR)/strpool.c \
# $(SYMTAB_DIR)/symtab.c \
# $(TOKBUF_DIR)/tokbuf.c
# generated object file list
OBJS = $(SRCS:.c=.o)

View File

@ -1,142 +1,129 @@
#include "hashtable.h"
#define LOAD_FACTOR 0.75f
// prime table for bucket growth (the last element is the maximum allowed capacity)
static const int PRIME_CAPACITIES[] = {
11, 23, 47, 97, 193, 389, 769, 1543, 3079,
6151, 12289, 24593, 49157, 98317, 196613, 393241,
786433, 1572869, 3145739, 6291469, 12582917, 25165843
};
#define INIT_HASH_TABLE_SIZE (32)
// private function declarations
static u32_t calc_hash(const char* str, int len);
static void rehash(hash_table_t* ht);
hash_table_t* new_hash_table(int init_size, int max_cap) {
hash_table_t* ht = salloc_alloc(sizeof(hash_table_t));
hash_table_init(ht, init_size, max_cap);
return ht;
void hashtable_init(hash_table_t* ht) {
vector_init(ht->entries);
ht->count = 0;
ht->tombstone_count = 0;
Assert(ht->key_cmp != NULL && ht->hash_func != NULL);
}
static inline int get_real_size(int size) {
// find the first prime capacity not smaller than size
int cap_idx = 0;
if (size < 0) {
return PRIME_CAPACITIES[SMCC_ARRLEN(PRIME_CAPACITIES)-1];
}
while (PRIME_CAPACITIES[cap_idx] < size && cap_idx < SMCC_ARRLEN(PRIME_CAPACITIES)-1) {
cap_idx++;
}
return PRIME_CAPACITIES[cap_idx];
static int next_power_of_two(int n) {
n--;
n |= n >> 1;
n |= n >> 2;
n |= n >> 4;
n |= n >> 8;
n |= n >> 16;
return n + 1;
}
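// e.g. next_power_of_two(33) == 64 and next_power_of_two(16) == 16; keeping
// capacities a power of two lets find_entry mask with (cap - 1) instead of
// taking a modulo.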
void hash_table_init(hash_table_t* ht, int init_size, int max_cap) {
// clamp to the maximum capacity
ht->max_cap = get_real_size(max_cap);
// apply the actual initial capacity
ht->cap = get_real_size(init_size);
ht->size = 0;
ht->buckets = NULL;
ht->buckets = salloc_realloc(ht->buckets, sizeof(hash_node_t*) * ht->cap);
}
static hash_entry_t* find_entry(hash_table_t* ht, const void* key, u32_t hash) {
if (ht->entries.cap == 0) return NULL;
u32_t index = hash & (ht->entries.cap - 1); // capacity is a power of two
u32_t probe = 0;
void hash_table_insert(hash_table_t* ht, const char* str, int len) {
// automatic growth check
if (ht->size >= ht->cap * LOAD_FACTOR && ht->cap < ht->max_cap) {
rehash(ht);
}
if (ht->size >= ht->cap) {
LOG_TRACE("Hash table size exceeds maximum capacity. Consider increasing max_capacity.");
}
// compute the hash
u32_t hash = calc_hash(str, len);
int bucket_idx = hash % ht->cap;
// check for duplicates
hash_node_t* node = ht->buckets[bucket_idx];
while (node) {
if (node->hash == hash &&
node->len == len &&
memcmp(node->str, str, len) == 0) {
return; // 已存在
hash_entry_t* tombstone = NULL;
while (1) {
hash_entry_t* entry = &vector_at(ht->entries, index);
if (entry->state == ENTRY_EMPTY) {
return tombstone ? tombstone : entry;
}
node = node->next;
}
// create a new node
hash_node_t* new_node = salloc_alloc(sizeof(hash_node_t));
new_node->str = str;
new_node->len = len;
new_node->hash = hash;
new_node->next = ht->buckets[bucket_idx];
ht->buckets[bucket_idx] = new_node;
ht->size++;
}
hash_node_t* hash_table_find(hash_table_t* ht, const char* str, int len) {
u32_t hash = calc_hash(str, len);
int bucket_idx = hash % ht->cap;
hash_node_t* node = ht->buckets[bucket_idx];
while (node) {
if (node->hash == hash &&
node->len == len &&
memcmp(node->str, str, len) == 0) {
return node;
if (entry->state == ENTRY_TOMBSTONE) {
if (!tombstone) tombstone = entry;
} else if (entry->hash == hash && ht->key_cmp(entry->key, key) == 0) {
return entry;
}
node = node->next;
// linear probing
index = (index + 1) & (ht->entries.cap - 1);
probe++;
if (probe >= ht->entries.cap) break;
}
LOG_ERROR("hashset_find: hash table is full");
return NULL;
}
static void rehash(hash_table_t* ht) {
int old_cap = ht->cap;
hash_node_t** old_buckets = ht->buckets;
static void adjust_capacity(hash_table_t* ht, int new_cap) {
new_cap = next_power_of_two(new_cap);
Assert(new_cap >= ht->entries.cap);
// find the next prime capacity
int new_cap_idx = 0;
while (PRIME_CAPACITIES[new_cap_idx] <= old_cap &&
new_cap_idx < ht->max_cap) {
new_cap_idx++;
}
ht->cap = PRIME_CAPACITIES[new_cap_idx];
vector_header(old_entries, hash_entry_t);
old_entries.data = ht->entries.data;
old_entries.cap = ht->entries.cap;
// allocate the new bucket array
ht->buckets = salloc_alloc(sizeof(hash_node_t*) * ht->cap);
memset(ht->buckets, 0, sizeof(hash_node_t*) * ht->cap);
// entries.size is unused here; kept so the gdb Python extension can inspect it
ht->entries.size = new_cap;
ht->entries.cap = new_cap;
ht->entries.data = salloc_realloc(NULL, new_cap * sizeof(hash_entry_t));
rt_memset(ht->entries.data, 0, new_cap * sizeof(hash_entry_t));
// rehash all nodes
for (int i = 0; i < old_cap; i++) {
hash_node_t* node = old_buckets[i];
while (node) {
hash_node_t* next = node->next;
int new_bucket = node->hash % ht->cap;
node->next = ht->buckets[new_bucket];
ht->buckets[new_bucket] = node;
node = next;
// rehash all of the old data
for (rt_size_t i = 0; i < old_entries.cap; i++) {
hash_entry_t* entry = &vector_at(old_entries, i);
if (entry->state == ENTRY_ACTIVE) {
hash_entry_t* dest = find_entry(ht, entry->key, entry->hash);
*dest = *entry;
}
}
salloc_free(old_buckets);
vector_free(old_entries);
ht->tombstone_count = 0;
}
static u32_t calc_hash(const char* str, int len) {
// uses the same algorithm as the HASH_FNV_1A macro
return rt_strhash(str);
}
void hash_table_destroy(hash_table_t* ht) {
for (int i = 0; i < ht->cap; i++) {
hash_node_t* node = ht->buckets[i];
while (node) {
hash_node_t* next = node->next;
salloc_free(node);
node = next;
}
void* hashtable_set(hash_table_t* ht, const void* key, void* value) {
if (ht->count + ht->tombstone_count >= ht->entries.cap * 0.75) {
int new_cap = ht->entries.cap < INIT_HASH_TABLE_SIZE ? INIT_HASH_TABLE_SIZE : ht->entries.cap * 2;
adjust_capacity(ht, new_cap);
}
salloc_free(ht->buckets);
ht->buckets = NULL;
ht->size = ht->cap = 0;
}
u32_t hash = ht->hash_func(key);
hash_entry_t* entry = find_entry(ht, key, hash);
void* old_value = NULL;
if (entry->state == ENTRY_ACTIVE) {
old_value = entry->value;
} else {
if (entry->state == ENTRY_TOMBSTONE) ht->tombstone_count--;
ht->count++;
}
entry->key = key;
entry->value = value;
entry->hash = hash;
entry->state = ENTRY_ACTIVE;
return old_value;
}
void* hashtable_get(hash_table_t* ht, const void* key) {
if (ht->entries.cap == 0) return NULL;
u32_t hash = ht->hash_func(key);
hash_entry_t* entry = find_entry(ht, key, hash);
return (entry && entry->state == ENTRY_ACTIVE) ? entry->value : NULL;
}
void* hashtable_del(hash_table_t* ht, const void* key) {
if (ht->entries.cap == 0) return NULL;
u32_t hash = ht->hash_func(key);
hash_entry_t* entry = find_entry(ht, key, hash);
if (entry == NULL || entry->state != ENTRY_ACTIVE) return NULL;
void* value = entry->value;
entry->state = ENTRY_TOMBSTONE;
ht->count--;
ht->tombstone_count++;
return value;
}
void hashtable_destory(hash_table_t* ht) {
vector_free(ht->entries);
ht->count = 0;
ht->tombstone_count = 0;
}

View File

@ -1,27 +1,39 @@
#ifndef __SMCC_HASHTABLE_H__
#define __SMCC_HASHTABLE_H__
#include <lib/rt/rt.h>
#include <lib/rt/rt_alloc.h>
#include "vector.h"
typedef struct hash_node {
const char* str;
int len;
u32_t hash;
struct hash_node* next;
} hash_node_t;
// hash table entry state tags
typedef enum hash_table_entry_state {
ENTRY_EMPTY,
ENTRY_ACTIVE,
ENTRY_TOMBSTONE
} ht_entry_state_t;
// hash table entry (key/value memory is not managed by the table)
typedef struct hash_entry {
const void* key; // owned by the caller
void* value; // owned by the caller
u32_t hash; // precomputed hash value
ht_entry_state_t state; // entry state
} hash_entry_t;
// hash table body
typedef struct hash_table {
hash_node_t** buckets;
int size;
int cap;
int max_cap;
vector_header(entries, hash_entry_t); // entries managed via the vector helper
u32_t count; // live entry count (excluding tombstones)
u32_t tombstone_count; // number of tombstones
u32_t (*hash_func)(const void* key);
int(*key_cmp)(const void* key1, const void* key2);
} hash_table_t;
hash_table_t* new_hash_table(int init_size, int max_cap);
void hash_table_init(hash_table_t* ht, int init_size, int max_cap);
void hash_table_destroy(hash_table_t* ht);
// WARN: you must set hash_func and key_cmp before use
void hashtable_init(hash_table_t* ht) ;
void hash_table_insert(hash_table_t* ht, const char* str, int len);
hash_node_t* hash_table_find(hash_table_t* ht, const char* str, int len);
void* hashtable_set(hash_table_t* ht, const void* key, void* value);
void* hashtable_get(hash_table_t* ht, const void* key);
void* hashtable_del(hash_table_t* ht, const void* key);
void hashtable_destory(hash_table_t* ht);
#endif // __SMCC_HASHTABLE_H__
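A minimal usage sketch for the new open-addressing API; the callback wiring mirrors what init_strpool does in this commit, and the variable names are illustrative only (the table owns neither keys nor values):

    hash_table_t ht;
    ht.hash_func = (u32_t(*)(const void*))rt_strhash;
    ht.key_cmp = (int(*)(const void*, const void*))rt_strcmp;
    hashtable_init(&ht); // asserts both callbacks are set

    static int answer = 42;
    hashtable_set(&ht, "answer", &answer);
    int* got = hashtable_get(&ht, "answer"); // -> &answer
    hashtable_del(&ht, "answer"); // marks the slot as a tombstone
    hashtable_destory(&ht);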

View File

@ -0,0 +1,32 @@
#include "strpool.h"
void init_strpool(strpool_t* pool) {
lalloc_init(&pool->stralloc);
pool->ht.hash_func = (u32_t(*)(const void*))rt_strhash;
pool->ht.key_cmp = (int(*)(const void*, const void*))rt_strcmp;
hashtable_init(&pool->ht);
}
const char* strpool_intern(strpool_t* pool, const char* str) {
void* existing = hashtable_get(&pool->ht, str);
if (existing) {
return existing;
}
rt_size_t len = rt_strlen(str) + 1;
char* new_str = lalloc_alloc(&pool->stralloc, len);
if (!new_str) {
LOG_ERROR("strpool: Failed to allocate memory for string");
return NULL;
}
rt_memcpy(new_str, str, len);
hashtable_set(&pool->ht, new_str, new_str);
return new_str;
}
void strpool_destroy(strpool_t* pool) {
hashtable_destory(&pool->ht);
lalloc_destroy(&pool->stralloc);
}
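The payoff of interning: equal strings always intern to the same pointer, so identifier and file-name comparisons downstream reduce to pointer compares. A minimal sketch:

    strpool_t pool;
    init_strpool(&pool);

    const char* a = strpool_intern(&pool, "main");
    const char* b = strpool_intern(&pool, "main");
    Assert(a == b); // equal contents, same interned pointer

    strpool_destroy(&pool);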

View File

@ -2,11 +2,16 @@
#define __SMCC_STRPOOL_H__
#include <lib/core.h>
#include "../ds/hash.h"
typedef struct strpool {
long_alloc_t *long_alloc;
} strpool_t;
#include <lib/rt/rt_alloc.h>
#include <lib/utils/ds/hashtable.h>
void new_strpool();
typedef struct strpool {
hash_table_t ht; // fast string lookup
long_alloc_t stralloc; // dedicated allocator for string storage
} strpool_t;
void init_strpool(strpool_t* pool);
const char* strpool_intern(strpool_t* pool, const char* str);
void strpool_destroy(strpool_t* pool);
#endif // __SMCC_STRPOOL_H__

View File

@ -0,0 +1,6 @@
#ifndef __SMCC_SYMTABL_H__
#define __SMCC_SYMTABL_H__
#endif

View File

@ -7,18 +7,20 @@ typedef struct loc {
const char *fname;
int line;
int col;
short len;
int len;
} loc_t;
typedef enum tok_type {
typedef enum tok_basic_type {
TK_BASIC_INVALID, // error placeholder
TK_BASIC_KEYWORD, // keyword
TK_BASIC_OPERATOR, // operator
TK_BASIC_IDENTIFIER, // identifier
TK_BASIC_LITERAL, // literal
TK_BASIC_PUNCTUATOR, // punctuation
TK_BASIC_WHITESPACE, // whitespace
TK_BASIC_COMMENT, // comment
TK_BASIC_EOF // end-of-input marker
} tok_type_t;
} tok_basic_type_t;
typedef union ctype {
u8_t u8;
@ -34,10 +36,15 @@ typedef union ctype {
iptr_t iptr;
uptr_t uptr;
void* ptr;
char ch;
int i;
// MUST be a strpool-interned pointer
const char* str;
} ctype_t;
typedef struct tok {
tok_type_t type;
tok_basic_type_t type;
int sub_type;
loc_t loc;
ctype_t val;

lib/utils/utils.h (new file, 8 lines)
View File

@ -0,0 +1,8 @@
#ifndef __SMCC_LIB_UTILS_H__
#define __SMCC_LIB_UTILS_H__
#include "strpool/strpool.h"
#include "symtab/symtab.h"
#include "tokbuf/tokbuf.h"
#endif