feat(frontend): refactor the lexer

- Add a .gitignore file to ignore compiler-generated binaries
- Refactor lexer.c, improving keyword and string-literal handling
- Update the frontend entry, parser, and AST files to work with the new lexer
- Clean up token-related definitions and functions, introducing new token types
ZZY committed 2025-03-23 12:13:16 +08:00
parent 05c637e594, commit 2b4857001c
33 changed files with 532 additions and 624 deletions
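For reviewers, the visible API change is that callers now create a strpool_t and pass it to init_lexer. A minimal driver sketch of the new flow (file_name and the stream/sread pair are placeholders, as in the tests below):

    strpool_t strpool;
    init_strpool(&strpool);

    lexer_t lexer;
    init_lexer(&lexer, file_name, stream, sread, &strpool);

    tok_t tok;
    do {
        get_valid_token(&lexer, &tok);
    } while (tok.sub_type != TOKEN_EOF);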

.gitignore (vendored, new file, 22 lines)
View File

@ -0,0 +1,22 @@
.vscode/
# smcc compiler generated files
*.bin
# linux binary files
*.o
*.a
*.so
*.out
# windows binary files
*.obj
*.lib
*.dll
*.exe
# developed notes
note.md
# python
.venv

View File

@ -7,7 +7,7 @@
// instruction encoding union (handles little-endian byte order automatically)
typedef union rv32code {
uint32_t code;
u8_t bytes[4];
uint8_t bytes[4];
} rv32code_t;
#include "../../frontend/frontend.h"

View File

@ -4,14 +4,16 @@
ast_node_t* frontend(const char* file, void* stream, sread_fn sread) {
init_lib_core();
strpool_t strpool;
init_strpool(&strpool);
lexer_t lexer;
init_lexer(&lexer, file, stream, sread);
init_lexer(&lexer, file, stream, sread, &strpool);
symtab_t symtab;
init_symtab(&symtab);
parser_t parser;
init_parser(&parser, &lexer, &symtab);
parse_prog(&parser);

View File

@ -34,7 +34,7 @@ David Hanson / drh@drhanson.net
static const struct {
const char* name;
enum CSTD_KEYWORD std_type;
tok_type_t tok;
cc_tktype_t tok;
} keywords[] = {
#define X(name, std_type, tok, ...) { #name, std_type, tok },
KEYWORD_TABLE
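// Expansion note: each KEYWORD_TABLE row feeds the X above. A hypothetical
// row X(int, CSTD_C89, TOKEN_INT) (the real table contents are not part of
// this diff) would expand to the initializer:
//     { "int", CSTD_C89, TOKEN_INT },
// which keyword_cmp below matches identifiers against by name and length.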
@ -74,19 +74,17 @@ static inline int keyword_cmp(const char* name, int len) {
return -1; // Not a keyword.
}
void init_lexer(lexer_t* lexer, const char* file_name, void* stream, lexer_sread_fn sread) {
init_lib_core();
lexer->cur_ptr = lexer->end_ptr = (unsigned char*)&(lexer->buffer);
lexer->index = 1;
lexer->line = 1;
void init_lexer(lexer_t* lexer, const char* file_name, void* stream, lexer_sread_fn sread, strpool_t* strpool) {
lexer->strpool = strpool;
lexer->cur_ptr = lexer->end_ptr = (char*)&(lexer->buffer);
lexer->loc.fname = strpool_intern(lexer->strpool, file_name);
lexer->loc.line = 1;
lexer->loc.col = 1;
lexer->stream = stream;
lexer->sread = sread;
for (int i = 0; i < sizeof(lexer->buffer) / sizeof(lexer->buffer[0]); i++) {
lexer->buffer[i] = 0;
}
rt_memset(lexer->buffer, 0, sizeof(lexer->buffer));
}
static void flush_buffer(lexer_t* lexer) {
@ -94,7 +92,7 @@ static void flush_buffer(lexer_t* lexer) {
for (int i = 0; i < num; i++) {
lexer->buffer[i] = lexer->cur_ptr[i];
}
lexer->cur_ptr = (unsigned char*)lexer->buffer;
lexer->cur_ptr = lexer->buffer;
int read_size = LEXER_BUFFER_SIZE - num;
// TODO: converting rt_size_t to int may lose precision
@ -128,19 +126,20 @@ static void goto_block_comment(lexer_t* lexer) {
flush_buffer(lexer);
}
if (*lexer->cur_ptr == '\0') {
if (lexer->cur_ptr[0] == '\0') {
break;
} else if (lexer->cur_ptr[0] == '*' && lexer->cur_ptr[1] == '/') {
lexer->cur_ptr += 2;
break;
} else {
if (lexer->cur_ptr[0] == '\n') lexer->loc.line++;
lexer->cur_ptr++;
}
}
}
// TODO: escape-sequence handling is incomplete
static char got_slash(unsigned char* peek) {
static char got_slash(char* peek) {
switch (*peek) {
case '\\': return '\\';
case '\'': return '\'';
@ -162,7 +161,7 @@ static char got_slash(unsigned char* peek) {
static void parse_char_literal(lexer_t* lexer, tok_t* token) {
char val = 0;
unsigned char* peek = lexer->cur_ptr + 1;
char* peek = lexer->cur_ptr + 1;
if (*peek == '\\') {
peek++;
val = got_slash(peek);
@ -172,16 +171,14 @@ static void parse_char_literal(lexer_t* lexer, tok_t* token) {
}
if (*peek++ != '\'') LEX_ERROR("Unclosed character literal");
token->val.ch = val;
lexer->cur_ptr = peek;
token->val.have = 1;
token->type = TOKEN_CHAR_LITERAL;
token->val.ch = val;
}
static void parse_string_literal(lexer_t* lexer, tok_t* token) {
unsigned char* peek = lexer->cur_ptr + 1;
char* peek = lexer->cur_ptr + 1;
// TODO string literal size check
char* dest = token->val.str = rt._malloc(LEXER_MAX_TOKEN_SIZE + 1);
static char dest[LEXER_MAX_TOKEN_SIZE + 1];
int len = 0;
while (*peek != '"') {
@ -196,14 +193,15 @@ static void parse_string_literal(lexer_t* lexer, tok_t* token) {
dest[len++] = *peek++;
}
dest[len] = '\0';
lexer->cur_ptr = peek + 1;
token->val.have = 1;
token->type = TOKEN_STRING_LITERAL;
lexer->cur_ptr = peek + 1; // +1 for the closing `"`
lexer->loc.len = len + 2; // +2 for the opening and closing `"`
token->val.str = strpool_intern(lexer->strpool, dest);
}
// FIXME: written by AI; may contain errors
static void parse_number(lexer_t* lexer, tok_t* token) {
unsigned char* peek = lexer->cur_ptr;
char* peek = lexer->cur_ptr;
int base = 10;
int is_float = 0;
long long int_val = 0;
@ -278,14 +276,15 @@ static void parse_number(lexer_t* lexer, tok_t* token) {
}
// store the result
// TODO
lexer->loc.len = peek - lexer->cur_ptr;
lexer->cur_ptr = peek;
token->val.have = 1;
if (is_float) {
token->val.d = float_val;
token->type = TOKEN_FLOAT_LITERAL;
token->val.f32 = float_val;
token->sub_type = TOKEN_FLOAT_LITERAL;
} else {
token->val.ll = int_val;
token->type = TOKEN_INT_LITERAL;
token->val.i = int_val;
token->sub_type = TOKEN_INT_LITERAL;
}
}
@ -296,160 +295,159 @@ void get_token(lexer_t* lexer, tok_t* token) {
if (lexer->end_ptr - lexer->cur_ptr < GOT_ONE_TOKEN_BUF_SIZE) {
flush_buffer(lexer);
}
register unsigned char* peek = lexer->cur_ptr;
// fast path: skip spaces and tabs
while (*peek == ' ' || *peek == '\t') {
if (peek == lexer->end_ptr) {
break;
}
peek++;
}
if (peek != lexer->cur_ptr) {
// To TOKEN_FLUSH
lexer->cur_ptr = peek;
token->type = TOKEN_FLUSH;
}
tok_type_t tok = TOKEN_INIT;
tok_val_t constant;
constant.have = 0;
register char* peek = lexer->cur_ptr;
cc_tktype_t tk_type = TOKEN_INIT;
ctype_t literal = { 0 };
// scan one character at a time
switch (*peek++) {
case '=':
switch (*peek++) {
case '=': tok = TOKEN_EQ; break;
default: peek--, tok = TOKEN_ASSIGN; break;
case '=': tk_type = TOKEN_EQ; break;
default: peek--, tk_type = TOKEN_ASSIGN; break;
} break;
case '+':
switch (*peek++) {
case '+': tok = TOKEN_ADD_ADD; break;
case '=': tok = TOKEN_ASSIGN_ADD; break;
default: peek--, tok = TOKEN_ADD; break;
case '+': tk_type = TOKEN_ADD_ADD; break;
case '=': tk_type = TOKEN_ASSIGN_ADD; break;
default: peek--, tk_type = TOKEN_ADD; break;
} break;
case '-':
switch (*peek++) {
case '-': tok = TOKEN_SUB_SUB; break;
case '=': tok = TOKEN_ASSIGN_SUB; break;
case '-': tk_type = TOKEN_SUB_SUB; break;
case '=': tk_type = TOKEN_ASSIGN_SUB; break;
case '>': tok = TOKEN_DEREF; break;
default: peek--, tok = TOKEN_SUB; break;
case '>': tk_type = TOKEN_DEREF; break;
default: peek--, tk_type = TOKEN_SUB; break;
} break;
case '*':
switch (*peek++) {
case '=': tok = TOKEN_ASSIGN_MUL; break;
default: peek--, tok = TOKEN_MUL; break;
case '=': tk_type = TOKEN_ASSIGN_MUL; break;
default: peek--, tk_type = TOKEN_MUL; break;
} break;
case '/':
switch (*peek++) {
case '=': tok = TOKEN_ASSIGN_DIV; break;
case '=': tk_type = TOKEN_ASSIGN_DIV; break;
case '/': {
// consume the rest of the line
goto_newline(lexer);
tok = TOKEN_LINE_COMMENT;
tk_type = TOKEN_LINE_COMMENT;
goto END;
}
case '*': {
lexer->cur_ptr = peek;
goto_block_comment(lexer);
tok = TOKEN_BLOCK_COMMENT;
tk_type = TOKEN_BLOCK_COMMENT;
goto END;
}
default: peek--, tok = TOKEN_DIV; break;
default: peek--, tk_type = TOKEN_DIV; break;
} break;
case '%':
switch (*peek++) {
case '=': tok = TOKEN_ASSIGN_MOD; break;
default: peek--, tok = TOKEN_MOD; break;
case '=': tk_type = TOKEN_ASSIGN_MOD; break;
default: peek--, tk_type = TOKEN_MOD; break;
} break;
case '&':
switch (*peek++) {
case '&': tok = TOKEN_AND_AND; break;
case '=': tok = TOKEN_ASSIGN_AND; break;
default: peek--, tok = TOKEN_AND; break;
case '&': tk_type = TOKEN_AND_AND; break;
case '=': tk_type = TOKEN_ASSIGN_AND; break;
default: peek--, tk_type = TOKEN_AND; break;
} break;
case '|':
switch (*peek++) {
case '|': tok = TOKEN_OR_OR; break;
case '=': tok = TOKEN_ASSIGN_OR; break;
default: peek--, tok = TOKEN_OR; break;
case '|': tk_type = TOKEN_OR_OR; break;
case '=': tk_type = TOKEN_ASSIGN_OR; break;
default: peek--, tk_type = TOKEN_OR; break;
} break;
case '^':
switch (*peek++) {
case '=': tok = TOKEN_ASSIGN_XOR; break;
default: peek--, tok = TOKEN_XOR; break;
case '=': tk_type = TOKEN_ASSIGN_XOR; break;
default: peek--, tk_type = TOKEN_XOR; break;
} break;
case '<':
switch (*peek++) {
case '=': tok = TOKEN_LE; break;
case '<': tok = (*peek == '=') ? (peek++, TOKEN_ASSIGN_L_SH) : TOKEN_L_SH; break;
default: peek--, tok = TOKEN_LT; break;
case '=': tk_type = TOKEN_LE; break;
case '<': tk_type = (*peek == '=') ? (peek++, TOKEN_ASSIGN_L_SH) : TOKEN_L_SH; break;
default: peek--, tk_type = TOKEN_LT; break;
} break;
case '>':
switch (*peek++) {
case '=': tok = TOKEN_GE; break;
case '>': tok = (*peek == '=') ? (peek++, TOKEN_ASSIGN_R_SH) : TOKEN_R_SH; break;
default: peek--, tok = TOKEN_GT; break;
case '=': tk_type = TOKEN_GE; break;
case '>': tk_type = (*peek == '=') ? (peek++, TOKEN_ASSIGN_R_SH) : TOKEN_R_SH; break;
default: peek--, tk_type = TOKEN_GT; break;
} break;
case '~':
tok = TOKEN_BIT_NOT; break;
tk_type = TOKEN_BIT_NOT; break;
case '!':
switch (*peek++) {
case '=': tok = TOKEN_NEQ; break;
default: peek--, tok = TOKEN_NOT; break;
case '=': tk_type = TOKEN_NEQ; break;
default: peek--, tk_type = TOKEN_NOT; break;
} break;
case '[':
tok = TOKEN_L_BRACKET; break;
tk_type = TOKEN_L_BRACKET; break;
case ']':
tok = TOKEN_R_BRACKET; break;
tk_type = TOKEN_R_BRACKET; break;
case '(':
tok = TOKEN_L_PAREN; break;
tk_type = TOKEN_L_PAREN; break;
case ')':
tok = TOKEN_R_PAREN; break;
tk_type = TOKEN_R_PAREN; break;
case '{':
tok = TOKEN_L_BRACE; break;
tk_type = TOKEN_L_BRACE; break;
case '}':
tok = TOKEN_R_BRACE; break;
tk_type = TOKEN_R_BRACE; break;
case ';':
tok = TOKEN_SEMICOLON; break;
tk_type = TOKEN_SEMICOLON; break;
case ',':
tok = TOKEN_COMMA; break;
tk_type = TOKEN_COMMA; break;
case ':':
tok = TOKEN_COLON; break;
tk_type = TOKEN_COLON; break;
case '.':
if (peek[0] == '.' && peek[1] == '.') {
peek += 2;
tok = TOKEN_ELLIPSIS;
tk_type = TOKEN_ELLIPSIS;
} else {
tok = TOKEN_DOT;
tk_type = TOKEN_DOT;
}
break;
case '?':
tok = TOKEN_COND; break;
case '\v': case '\r': case '\f': // FIXME: parsed as blank characters
tok = TOKEN_FLUSH; break;
case '\n':
tk_type = TOKEN_COND; break;
case '\v': case '\r': case '\f':
case ' ': case '\t':
tk_type = TOKEN_BLANK; break;
case '\n':
// a newline is still emitted as a blank token, but the location must be updated
lexer->line++;
tok = TOKEN_FLUSH; break;
lexer->loc.line += 1;
lexer->loc.col = -1;
lexer->loc.len = 1;
tk_type = TOKEN_BLANK;
break;
case '#':
LEX_WARN("Marroc does not support in lexer rather in preprocessor, it will be ignored");
// TODO make line or file comment to change
LEX_WARN("Maroc does not support in lexer rather in preprocessor, it will be ignored");
goto_newline(lexer);
tok = TOKEN_FLUSH;
tk_type = TOKEN_BLANK;
goto END;
case '\0':
// EOF
tok = TOKEN_EOF;
tk_type = TOKEN_EOF;
goto END;
case '\'':
return parse_char_literal(lexer, token);
return;
parse_char_literal(lexer, token);
literal = token->val;
tk_type = TOKEN_CHAR_LITERAL;
goto END; break;
case '"':
return parse_string_literal(lexer, token);
parse_string_literal(lexer, token);
literal = token->val;
tk_type = TOKEN_STRING_LITERAL;
goto END; break;
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
return parse_number(lexer, token);
parse_number(lexer, token);
// TODO: simplify this
literal = token->val;
tk_type = token->sub_type;
goto END; break;
case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
@ -475,39 +473,53 @@ void get_token(lexer_t* lexer, tok_t* token) {
break;
}
int res = keyword_cmp((const char*)lexer->cur_ptr, peek - (lexer->cur_ptr));
int strlen = peek - lexer->cur_ptr;
int res = keyword_cmp((const char*)lexer->cur_ptr, strlen);
if (res == -1) {
int strlen = peek - lexer->cur_ptr;
unsigned char* str = rt._malloc(strlen + 1);
constant.have = 1;
constant.str = (char*)str;
for (int i = 0; i < strlen; i++) {
str[i] = lexer->cur_ptr[i];
}
str[strlen] = '\0';
constant.have = 1;
constant.str = (char*)str;
tok = TOKEN_IDENT; break;
char prev = lexer->cur_ptr[strlen];
lexer->cur_ptr[strlen] = '\0';
literal.str = strpool_intern(lexer->strpool, lexer->cur_ptr);
lexer->cur_ptr[strlen] = prev;
tk_type = TOKEN_IDENT; break;
} else {
tok = keywords[res].tok; break;
tk_type = keywords[res].tok; break;
}
default:
LEX_ERROR("unsupport char in sourse code `%c`", *(lexer->cur_ptr));
break;
}
lexer->loc.len = peek - lexer->cur_ptr;
lexer->cur_ptr = peek;
END:
token->val = constant;
token->type = tok;
LEX_DEBUG("get token `%s` (ch: %c, int: %d)", get_tok_name(token->type), token->val.ch, token->val.i);
lexer->loc.col += lexer->loc.len;
lexer->loc.len = 0;
token->val = literal;
token->sub_type = tk_type;
token->loc = lexer->loc;
static const tok_basic_type_t tok_type_map[] = {
// ordinary tokens use #str
#define X(str, basic, tok) [tok] = basic,
TOKEN_TABLE
#undef X
// keywords use #name
#define X(name, std, tok) [tok] = TK_BASIC_KEYWORD,
KEYWORD_TABLE
#undef X
};
token->type = tok_type_map[tk_type];
LEX_DEBUG("get token `%s` in %s:%d:%d", get_tok_name(tk_type),
token->loc.fname, token->loc.line, token->loc.col);
}
// get_token may return tokens the parser treats as invalid
void get_valid_token(lexer_t* lexer, tok_t* token) {
tok_type_t type;
tok_basic_type_t type;
do {
get_token(lexer, token);
type = token->type;
} while (type == TOKEN_FLUSH || type == TOKEN_LINE_COMMENT || type == TOKEN_BLOCK_COMMENT);
Assert(type != TK_BASIC_INVALID);
} while (type == TK_BASIC_WHITESPACE || type == TK_BASIC_COMMENT);
}

View File

@ -1,5 +1,5 @@
#ifndef __SMCC_LEXER_H__
#define __SMCC_LEXER_H__
#ifndef __SMCC_CC_LEXER_H__
#define __SMCC_CC_LEXER_H__
#include <lib/core.h>
#include "token.h"
@ -14,25 +14,25 @@ typedef int (*lexer_sread_fn)(void *dst_buf, int dst_size,
int elem_size, int count, void *stream);
typedef struct lexer {
int line;
int index;
// const char current_file_name[LEXER_BUFFER_SIZE+1];
loc_t loc;
unsigned char* cur_ptr; // current character to scan (not yet consumed)
unsigned char* end_ptr; // one past the last character in the buffer
char* cur_ptr; // current character to scan (not yet consumed)
char* end_ptr; // one past the last character in the buffer
char buffer[LEXER_BUFFER_SIZE+1];
lexer_sread_fn sread;
void* stream;
strpool_t* strpool;
} lexer_t;
void init_lexer(lexer_t* lexer, const char* file_name, void* stream,
lexer_sread_fn sread);
lexer_sread_fn sread, strpool_t* strpool);
// pure token getter; it will include empty tokens like TOKEN_FLUSH
// pure token getter; it will include empty tokens like TOKEN_BLANK
void get_token(lexer_t* lexer, tok_t* token);
// get_token may return tokens the parser treats as invalid (as TOKEN_FLUSH)
// get_token may return tokens the parser treats as invalid (as TOKEN_BLANK)
void get_valid_token(lexer_t* lexer, tok_t* token);
#endif

View File

@ -3,11 +3,44 @@
#include <lib/rt/rt.h>
#define LEX_NOTSET( fmt, ...) LOG_NOTSET("LEXER: " fmt, ##__VA_ARGS__)
#define LEX_DEBUG( fmt, ...) LOG_DEBUG("LEXER: " fmt, ##__VA_ARGS__)
#define LEX_INFO( fmt, ...) LOG_INFO("LEXER: " fmt, ##__VA_ARGS__)
#define LEX_WARN( fmt, ...) LOG_WARN("LEXER: " fmt, ##__VA_ARGS__)
#define LEX_ERROR( fmt, ...) LOG_ERROR("LEXER: " fmt, ##__VA_ARGS__)
#define LEX_FATAL( fmt, ...) LOG_FATAL("LEXER: " fmt, ##__VA_ARGS__)
#ifndef LEX_LOG_LEVEL
#define LEX_LOG_LEVEL 4
#endif
#if LEX_LOG_LEVEL <= 1
#define LEX_NOTSET( fmt, ...) LOG_NOTSET("LEXER: " fmt, ##__VA_ARGS__)
#else
#define LEX_NOTSET( fmt, ...)
#endif
#if LEX_LOG_LEVEL <= 2
#define LEX_DEBUG( fmt, ...) LOG_DEBUG( "LEXER: " fmt, ##__VA_ARGS__)
#else
#define LEX_DEBUG( fmt, ...)
#endif
#if LEX_LOG_LEVEL <= 3
#define LEX_INFO( fmt, ...) LOG_INFO( "LEXER: " fmt, ##__VA_ARGS__)
#else
#define LEX_INFO( fmt, ...)
#endif
#if LEX_LOG_LEVEL <= 4
#define LEX_WARN( fmt, ...) LOG_WARN( "LEXER: " fmt, ##__VA_ARGS__)
#else
#define LEX_WARN( fmt, ...)
#endif
#if LEX_LOG_LEVEL <= 5
#define LEX_ERROR( fmt, ...) LOG_ERROR("LEXER: " fmt, ##__VA_ARGS__)
#else
#define LEX_ERROR( fmt, ...)
#endif
#if LEX_LOG_LEVEL <= 6
#define LEX_FATAL( fmt, ...) LOG_FATAL("LEXER: " fmt, ##__VA_ARGS__)
#else
#define LEX_FATAL( fmt, ...)
#endif
#endif // __SMCC_LEXER_LOG_H__
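With the log macros now gated at compile time, the Makefile's -DLEX_LOG_LEVEL=4 (below) keeps warnings and above while the cheaper levels compile away entirely. For instance:

    // with -DLEX_LOG_LEVEL=4:
    LEX_DEBUG("got token");   // expands to nothing, zero runtime cost
    LEX_WARN("bad char");     // still expands to LOG_WARN("LEXER: bad char")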

View File

@ -1,5 +1,5 @@
CC = gcc
CFLAGS = -g -Wall -I../../../..
CFLAGS = -g -Wall -I../../../.. -DLEX_LOG_LEVEL=4
SRC = ../lexer.c ../token.c
LIB = -L../../../../lib -lcore

View File

@ -38,14 +38,18 @@ int main(int argc, char* argv[]) {
printf("open file success\n");
lexer_t lexer;
init_lexer(&lexer, file_name, fp, (lexer_sread_fn)fread_s);
strpool_t strpool;
init_strpool(&strpool);
init_lexer(&lexer, file_name, fp, (lexer_sread_fn)fread_s, &strpool);
tok_t tok;
while (1) {
get_valid_token(&lexer, &tok);
if (tok.type == TOKEN_EOF) {
if (tok.sub_type == TOKEN_EOF) {
break;
}
LOG_DEBUG("tk type `%s` in %s:%d:%d", get_tok_name(tok.sub_type), tok.loc.fname, tok.loc.line, tok.loc.col);
// LOG_DEBUG("%s", tok.val.str);
// printf("line: %d, column: %d, type: %3d, typename: %s\n",
// lexer.line, lexer.index, tok.type, get_tok_name(tok.type));
}

View File

@ -1,5 +1,5 @@
// test_lexer.c
#include "../../../../libcore/acutest.h"
#include <lib/acutest.h>
#include "../lexer.h"
#include <string.h>
@ -13,7 +13,7 @@ int test_read(void *dst_buf, int dst_size, int elem_size, int count, void *strea
}
// test helper functions
static inline void test_lexer_string(const char* input, tok_type_t expected_type) {
static inline void test_lexer_string(const char* input, cc_tktype_t expected_type) {
lexer_t lexer;
tok_t token;

View File

@ -52,14 +52,14 @@ tok_t *peek_tok(tok_stream_t *tokbuf) {
return &(tokbuf->buf[idx]);
}
tok_type_t peek_tok_type(tok_stream_t* tokbuf) {
return peek_tok(tokbuf)->type;
cc_tktype_t peek_tok_type(tok_stream_t* tokbuf) {
return peek_tok(tokbuf)->sub_type;
}
int expect_pop_tok(tok_stream_t* tokbuf, tok_type_t type) {
int expect_pop_tok(tok_stream_t* tokbuf, cc_tktype_t type) {
flush_peek_tok(tokbuf);
tok_t* tok = peek_tok(tokbuf);
if (tok->type != type) {
if (tok->sub_type != type) {
LEX_ERROR("expected tok `%s` but got `%s`", get_tok_name(type), get_tok_name(tok->type));
return 0;
} else {
@ -71,7 +71,7 @@ int expect_pop_tok(tok_stream_t* tokbuf, tok_type_t type) {
// generate the string map (choose #str or #name as needed)
static const char* token_strings[] = {
// ordinary tokens use #str
#define X(str, tok) [tok] = #str,
#define X(str, basic, tok) [tok] = #str,
TOKEN_TABLE
#undef X
@ -81,6 +81,6 @@ static const char* token_strings[] = {
#undef X
};
const char* get_tok_name(tok_type_t type) {
const char* get_tok_name(cc_tktype_t type) {
return token_strings[type];
}

View File

@ -1,5 +1,7 @@
#ifndef __TOKEN_H__
#define __TOKEN_H__
#ifndef __SMCC_CC_TOKEN_H__
#define __SMCC_CC_TOKEN_H__
#include <lib/utils/utils.h>
enum CSTD_KEYWORD {
CSTD_C89,
@ -46,68 +48,68 @@ enum CSTD_KEYWORD {
// KEYWORD_TABLE
#define TOKEN_TABLE \
X(EOF , TOKEN_EOF) \
X(init , TOKEN_INIT) \
X(flush , TOKEN_FLUSH) \
X("==" , TOKEN_EQ) \
X("=" , TOKEN_ASSIGN) \
X("++" , TOKEN_ADD_ADD) \
X("+=" , TOKEN_ASSIGN_ADD) \
X("+" , TOKEN_ADD) \
X("--" , TOKEN_SUB_SUB) \
X("-=" , TOKEN_ASSIGN_SUB) \
X("->" , TOKEN_DEREF) \
X("-" , TOKEN_SUB) \
X("*=" , TOKEN_ASSIGN_MUL) \
X("*" , TOKEN_MUL) \
X("/=" , TOKEN_ASSIGN_DIV) \
X("/" , TOKEN_DIV) \
X("//" , TOKEN_LINE_COMMENT) \
X("/* */" , TOKEN_BLOCK_COMMENT) \
X("%=" , TOKEN_ASSIGN_MOD) \
X("%" , TOKEN_MOD) \
X("&&" , TOKEN_AND_AND) \
X("&=" , TOKEN_ASSIGN_AND) \
X("&" , TOKEN_AND) \
X("||" , TOKEN_OR_OR) \
X("|=" , TOKEN_ASSIGN_OR) \
X("|" , TOKEN_OR) \
X("^=" , TOKEN_ASSIGN_XOR) \
X("^" , TOKEN_XOR) \
X("<<=" , TOKEN_ASSIGN_L_SH) \
X("<<" , TOKEN_L_SH) \
X("<=" , TOKEN_LE) \
X("<" , TOKEN_LT) \
X(">>=" , TOKEN_ASSIGN_R_SH) \
X(">>" , TOKEN_R_SH) \
X(">=" , TOKEN_GE) \
X(">" , TOKEN_GT) \
X("!" , TOKEN_NOT) \
X("!=" , TOKEN_NEQ) \
X("~" , TOKEN_BIT_NOT) \
X("[" , TOKEN_L_BRACKET) \
X("]" , TOKEN_R_BRACKET) \
X("(" , TOKEN_L_PAREN) \
X(")" , TOKEN_R_PAREN) \
X("{" , TOKEN_L_BRACE) \
X("}" , TOKEN_R_BRACE) \
X(";" , TOKEN_SEMICOLON) \
X("," , TOKEN_COMMA) \
X(":" , TOKEN_COLON) \
X("." , TOKEN_DOT) \
X("..." , TOKEN_ELLIPSIS) \
X("?" , TOKEN_COND) \
X(identifier , TOKEN_IDENT) \
X(int_literal , TOKEN_INT_LITERAL) \
X(float_literal , TOKEN_FLOAT_LITERAL) \
X(char_literal , TOKEN_CHAR_LITERAL) \
X(string_literal , TOKEN_STRING_LITERAL) \
X(init , TK_BASIC_INVALID, TOKEN_INIT) \
X(EOF , TK_BASIC_EOF, TOKEN_EOF) \
X(blank , TK_BASIC_WHITESPACE, TOKEN_BLANK) \
X("==" , TK_BASIC_OPERATOR, TOKEN_EQ) \
X("=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN) \
X("++" , TK_BASIC_OPERATOR, TOKEN_ADD_ADD) \
X("+=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN_ADD) \
X("+" , TK_BASIC_OPERATOR, TOKEN_ADD) \
X("--" , TK_BASIC_OPERATOR, TOKEN_SUB_SUB) \
X("-=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN_SUB) \
X("->" , TK_BASIC_OPERATOR, TOKEN_DEREF) \
X("-" , TK_BASIC_OPERATOR, TOKEN_SUB) \
X("*=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN_MUL) \
X("*" , TK_BASIC_OPERATOR, TOKEN_MUL) \
X("/=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN_DIV) \
X("/" , TK_BASIC_OPERATOR, TOKEN_DIV) \
X("//" , TK_BASIC_COMMENT , TOKEN_LINE_COMMENT) \
X("/* */" , TK_BASIC_COMMENT , TOKEN_BLOCK_COMMENT) \
X("%=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN_MOD) \
X("%" , TK_BASIC_OPERATOR, TOKEN_MOD) \
X("&&" , TK_BASIC_OPERATOR, TOKEN_AND_AND) \
X("&=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN_AND) \
X("&" , TK_BASIC_OPERATOR, TOKEN_AND) \
X("||" , TK_BASIC_OPERATOR, TOKEN_OR_OR) \
X("|=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN_OR) \
X("|" , TK_BASIC_OPERATOR, TOKEN_OR) \
X("^=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN_XOR) \
X("^" , TK_BASIC_OPERATOR, TOKEN_XOR) \
X("<<=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN_L_SH) \
X("<<" , TK_BASIC_OPERATOR, TOKEN_L_SH) \
X("<=" , TK_BASIC_OPERATOR, TOKEN_LE) \
X("<" , TK_BASIC_OPERATOR, TOKEN_LT) \
X(">>=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN_R_SH) \
X(">>" , TK_BASIC_OPERATOR, TOKEN_R_SH) \
X(">=" , TK_BASIC_OPERATOR, TOKEN_GE) \
X(">" , TK_BASIC_OPERATOR, TOKEN_GT) \
X("!" , TK_BASIC_OPERATOR, TOKEN_NOT) \
X("!=" , TK_BASIC_OPERATOR, TOKEN_NEQ) \
X("~" , TK_BASIC_OPERATOR, TOKEN_BIT_NOT) \
X("[" , TK_BASIC_OPERATOR, TOKEN_L_BRACKET) \
X("]" , TK_BASIC_OPERATOR, TOKEN_R_BRACKET) \
X("(" , TK_BASIC_OPERATOR, TOKEN_L_PAREN) \
X(")" , TK_BASIC_OPERATOR, TOKEN_R_PAREN) \
X("{" , TK_BASIC_OPERATOR, TOKEN_L_BRACE) \
X("}" , TK_BASIC_OPERATOR, TOKEN_R_BRACE) \
X(";" , TK_BASIC_OPERATOR, TOKEN_SEMICOLON) \
X("," , TK_BASIC_OPERATOR, TOKEN_COMMA) \
X(":" , TK_BASIC_OPERATOR, TOKEN_COLON) \
X("." , TK_BASIC_OPERATOR, TOKEN_DOT) \
X("..." , TK_BASIC_OPERATOR, TOKEN_ELLIPSIS) \
X("?" , TK_BASIC_OPERATOR, TOKEN_COND) \
X(ident , TK_BASIC_IDENTIFIER, TOKEN_IDENT) \
X(int_literal , TK_BASIC_LITERAL, TOKEN_INT_LITERAL) \
X(float_literal , TK_BASIC_LITERAL, TOKEN_FLOAT_LITERAL) \
X(char_literal , TK_BASIC_LITERAL, TOKEN_CHAR_LITERAL) \
X(string_literal , TK_BASIC_LITERAL, TOKEN_STRING_LITERAL) \
// END
// define the token-type enum
typedef enum tok_type {
typedef enum cc_tktype {
// ordinary tokens
#define X(str, tok) tok,
#define X(str, basic, tok) tok,
TOKEN_TABLE
#undef X
@ -115,24 +117,7 @@ typedef enum tok_type {
#define X(name, std, tok) tok,
KEYWORD_TABLE
#undef X
} tok_type_t;
typedef struct tok_val {
int have;
union {
char ch;
int i;
float f;
double d;
long long ll;
char* str;
};
} tok_val_t;
typedef struct tok {
tok_type_t type;
tok_val_t val;
} tok_t;
} cc_tktype_t;
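// Expansion sketch for one TOKEN_TABLE row, X("==", TK_BASIC_OPERATOR, TOKEN_EQ):
//   cc_tktype_t (here):        ... TOKEN_EQ, ...
//   token_strings[] (token.c): ... [TOKEN_EQ] = "\"==\"", ...   (via #str)
//   tok_type_map[] (lexer.c):  ... [TOKEN_EQ] = TK_BASIC_OPERATOR, ...
// One row keeps the enum value, the printable name, and the basic type in sync.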
typedef struct tok_stream {
int cur;
@ -150,8 +135,8 @@ void init_tokbuf(tok_stream_t* tokbuf, void* stream, tok_stream_get_func gettok)
tok_t* peek_tok(tok_stream_t* tokbuf);
tok_t* pop_tok(tok_stream_t* tokbuf);
void flush_peek_tok(tok_stream_t* tokbuf);
tok_type_t peek_tok_type(tok_stream_t* tokbuf);
int expect_pop_tok(tok_stream_t* tokbuf, tok_type_t type);
const char* get_tok_name(tok_type_t type);
cc_tktype_t peek_tok_type(tok_stream_t* tokbuf);
int expect_pop_tok(tok_stream_t* tokbuf, cc_tktype_t type);
const char* get_tok_name(cc_tktype_t type);
#endif

View File

@ -19,7 +19,7 @@ ast_node_t* parse_block(parser_t* parser) {
symtab_enter_scope(parser->symtab);
tok_stream_t *tokbuf = &parser->tokbuf;
flush_peek_tok(tokbuf);
tok_type_t ttype;
cc_tktype_t ttype;
ast_node_t* node = new_ast_node_block();
expect_pop_tok(tokbuf, TOKEN_L_BRACE);

View File

@ -37,7 +37,7 @@ int peek_decl(tok_stream_t* tokbuf) {
ast_node_t* parse_decl_val(parser_t* parser) {
tok_stream_t* tokbuf = &parser->tokbuf;
tok_type_t ttype;
cc_tktype_t ttype;
flush_peek_tok(tokbuf);
ast_node_t* node;
@ -69,7 +69,7 @@ ast_node_t* parse_decl_val(parser_t* parser) {
ast_node_t* parse_decl(parser_t* parser) {
tok_stream_t* tokbuf = &parser->tokbuf;
flush_peek_tok(tokbuf);
tok_type_t ttype;
cc_tktype_t ttype;
ast_node_t* node;
if (peek_decl(tokbuf) == 0) {

View File

@ -82,7 +82,7 @@ static ast_node_t* parse_comma(tok_stream_t* tokbuf, symtab_t *symtab, ast_node_
static ast_node_t* parse_assign(tok_stream_t* tokbuf, symtab_t *symtab, ast_node_t* left) {
flush_peek_tok(tokbuf);
tok_type_t ttype = peek_tok_type(tokbuf);
cc_tktype_t ttype = peek_tok_type(tokbuf);
pop_tok(tokbuf);
ast_node_t* node = new_ast_node();
node->type = NT_ASSIGN;
@ -133,7 +133,7 @@ static ast_node_t* parse_assign(tok_stream_t* tokbuf, symtab_t *symtab, ast_node
static ast_node_t* parse_cmp(tok_stream_t* tokbuf, symtab_t *symtab, ast_node_t* left) {
flush_peek_tok(tokbuf);
tok_type_t ttype = peek_tok_type(tokbuf);
cc_tktype_t ttype = peek_tok_type(tokbuf);
pop_tok(tokbuf);
ast_node_t* node = new_ast_node();
// saved left
@ -171,7 +171,7 @@ static ast_node_t* parse_cmp(tok_stream_t* tokbuf, symtab_t *symtab, ast_node_t*
static ast_node_t* parse_cal(tok_stream_t* tokbuf, symtab_t *symtab, ast_node_t* left) {
flush_peek_tok(tokbuf);
tok_type_t ttype = peek_tok_type(tokbuf);
cc_tktype_t ttype = peek_tok_type(tokbuf);
pop_tok(tokbuf);
ast_node_t* node = new_ast_node();
node->expr.left = left;
@ -238,7 +238,7 @@ static ast_node_t* parse_call(tok_stream_t* tokbuf, symtab_t *symtab, ast_node_t
vector_init(node->call.params->params.params);
pop_tok(tokbuf); // skip '('
tok_type_t ttype;
cc_tktype_t ttype;
while (1) {
flush_peek_tok(tokbuf);
ttype = peek_tok_type(tokbuf);
@ -330,7 +330,7 @@ static ast_node_t *parse_primary_expression(tok_stream_t* tokbuf, symtab_t *symt
node->type = NT_TERM_VAL;
node->syms.tok = *tok;
switch (tok->type) {
switch (tok->sub_type) {
case TOKEN_INT_LITERAL:
// node->data.data_type = TYPE_INT;
break;
@ -344,7 +344,7 @@ static ast_node_t *parse_primary_expression(tok_stream_t* tokbuf, symtab_t *symt
// node->data.data_type = TYPE_POINTER;
case TOKEN_IDENT:
node = expect_pop_ident(tokbuf);
tok_type_t ttype = peek_tok_type(tokbuf);
cc_tktype_t ttype = peek_tok_type(tokbuf);
if (ttype == TOKEN_L_PAREN) {
node = parse_call(tokbuf, symtab, node);
} else {
@ -365,7 +365,7 @@ END:
}
static ast_node_t *parse_subexpression(tok_stream_t* tokbuf, symtab_t *symtab, enum Precedence prec) {
tok_type_t ttype;
cc_tktype_t ttype;
struct expr_prec_table_t* work;
ast_node_t* left;
@ -400,7 +400,7 @@ ast_node_t* parse_expr(parser_t* parser) {
tok_stream_t* tokbuf = &(parser->tokbuf);
symtab_t *symtab = parser->symtab;
flush_peek_tok(tokbuf);
tok_type_t ttype = peek_tok_type(tokbuf);
cc_tktype_t ttype = peek_tok_type(tokbuf);
switch (ttype) {
case TOKEN_NOT:
case TOKEN_AND:

View File

@ -9,7 +9,7 @@
// TODO: semantic analysis should push these into the symbol table
static void parse_params(parser_t* parser, tok_stream_t* cache, ast_node_t* node) {
flush_peek_tok(cache);
tok_type_t ttype;
cc_tktype_t ttype;
ast_node_t *params = new_ast_node();
node->decl_func.params = params;
vector_init(params->params.params);
@ -79,7 +79,7 @@ ast_type_t check_is_func_decl(tok_stream_t* tokbuf, tok_stream_t* cache) {
LOG_ERROR("function parameter list too long");
}
cache->buf[cache->size++] = *tok;
switch (tok->type) {
switch (tok->sub_type) {
case TOKEN_L_PAREN:
depth++;
break;

View File

@ -4,7 +4,7 @@
ast_node_t* parse_stmt(parser_t* parser) {
tok_stream_t* tokbuf = &parser->tokbuf;
flush_peek_tok(tokbuf);
tok_type_t ttype = peek_tok_type(tokbuf);
cc_tktype_t ttype = peek_tok_type(tokbuf);
ast_node_t* node = new_ast_node();
switch (ttype) {
case TOKEN_IF: {

View File

@ -3,8 +3,8 @@
#include "../type.h"
ast_node_t* new_ast_ident_node(tok_t* tok) {
if (tok->type != TOKEN_IDENT) {
LOG_ERROR("syntax error: want identifier but got %d", tok->type);
if (tok->sub_type != TOKEN_IDENT) {
LOG_ERROR("syntax error: want identifier but got %d", tok->sub_type);
}
ast_node_t* node = new_ast_node();
node->type = NT_TERM_IDENT;
@ -24,7 +24,7 @@ ast_node_t* expect_pop_ident(tok_stream_t* tokbuf) {
ast_node_t* parse_type(parser_t* parser) {
tok_stream_t* tokbuf = &parser->tokbuf;
flush_peek_tok(tokbuf);
tok_type_t ttype = peek_tok_type(tokbuf);
cc_tktype_t ttype = peek_tok_type(tokbuf);
data_type_t dtype;
switch(ttype) {
case TOKEN_VOID: dtype = TYPE_VOID; break;

View File

@ -1,53 +0,0 @@
// hashmap.c
#include "hashmap.h"
#include <stdlib.h>
#include <string.h>
// DJB2 hash algorithm
static unsigned long hash(const char* str) {
unsigned long hash = 5381;
int c;
while ((c = *str++))
hash = ((hash << 5) + hash) + c; /* hash * 33 + c */
return hash % HMAP_SIZE;
}
void hmap_init(HashMap* map) {
memset(map->buckets, 0, sizeof(map->buckets));
}
void hmap_put(HashMap* map, const char* key, void* value) {
unsigned long idx = hash(key);
HashMapEntry* entry = malloc(sizeof(HashMapEntry));
entry->key = strdup(key);
entry->value = value;
entry->next = map->buckets[idx];
map->buckets[idx] = entry;
}
void* hmap_get(HashMap* map, const char* key) {
unsigned long idx = hash(key);
HashMapEntry* entry = map->buckets[idx];
while (entry) {
if (strcmp(entry->key, key) == 0)
return entry->value;
entry = entry->next;
}
return NULL;
}
int hmap_contains(HashMap* map, const char* key) {
return hmap_get(map, key) != NULL;
}
void hmap_destroy(HashMap* map) {
for (int i = 0; i < HMAP_SIZE; i++) {
HashMapEntry* entry = map->buckets[i];
while (entry) {
HashMapEntry* next = entry->next;
free(entry->key);
free(entry);
entry = next;
}
}
}

View File

@ -1,31 +0,0 @@
#ifndef HASHMAP_H
#define HASHMAP_H
#define HMAP_SIZE 64
typedef struct HashMapEntry {
char* key;
void* value;
struct HashMapEntry* next;
} HashMapEntry;
typedef struct {
HashMapEntry* buckets[HMAP_SIZE];
} HashMap;
// initialize the hash map
void hmap_init(HashMap* map);
// insert a key/value pair
void hmap_put(HashMap* map, const char* key, void* value);
// look up a value by key
void* hmap_get(HashMap* map, const char* key);
// check whether a key exists
int hmap_contains(HashMap* map, const char* key);
// free the hash map's memory (does not free the values)
void hmap_destroy(HashMap* map);
#endif

View File

@ -1,43 +0,0 @@
// scope.c
#include "scope.h"
#include <stdio.h>
#include <stdlib.h>
typedef struct Scope Scope;
Scope* scope_create(Scope* parent) {
Scope* scope = malloc(sizeof(Scope));
hmap_init(&scope->symbols);
scope->parent = parent;
scope->base_offset = 0;
scope->cur_offset = 0;
return scope;
}
void scope_destroy(Scope* scope) {
hmap_destroy(&scope->symbols);
free(scope);
}
void scope_insert(Scope* scope, const char* name, void* symbol) {
if (hmap_contains(&scope->symbols, name)) {
// report a duplicate-definition error
fprintf(stderr, "Error: Symbol '%s' already defined\n", name);
exit(EXIT_FAILURE);
}
hmap_put(&scope->symbols, name, symbol);
}
void* scope_lookup(Scope* scope, const char* name) {
void* symbol = NULL;
while (scope) {
symbol = hmap_get(&scope->symbols, name);
if (symbol) break;
scope = scope->parent;
}
return symbol;
}
void* scope_lookup_current(Scope* scope, const char* name) {
return hmap_get(&scope->symbols, name);
}

View File

@ -1,28 +0,0 @@
#ifndef SCOPE_H
#define SCOPE_H
#include "hashmap.h"
struct Scope {
HashMap symbols; // symbol table of the current scope
struct Scope* parent; // enclosing scope
int base_offset;
int cur_offset;
};
// create a new scope (the parent scope may be NULL)
struct Scope* scope_create(struct Scope* parent);
// destroy a scope
void scope_destroy(struct Scope* scope);
// insert a symbol into the current scope
void scope_insert(struct Scope* scope, const char* name, void* symbol);
// look up a symbol, walking outward through enclosing scopes
void* scope_lookup(struct Scope* scope, const char* name);
// look up only in the current scope
void* scope_lookup_current(struct Scope* scope, const char* name);
#endif

View File

@ -1,50 +0,0 @@
// symtab.c
#include "../../frontend.h"
#include <lib/core.h>
#include "scope.h"
#include "symtab.h"
typedef symtab_t symtab_t;
typedef struct Scope Scope;
void init_symtab(symtab_t* symtab) {
symtab->global_scope = scope_create(NULL);
symtab->cur_scope = symtab->global_scope;
}
void del_symtab(symtab_t* symtab) {
scope_destroy(symtab->global_scope);
}
void symtab_enter_scope(symtab_t* symtab) {
struct Scope* scope = scope_create(symtab->cur_scope);
scope->base_offset = symtab->cur_scope->base_offset + symtab->cur_scope->cur_offset;
symtab->cur_scope = scope;
}
void symtab_leave_scope(symtab_t* symtab) {
Scope * scope = symtab->cur_scope;
if (scope == NULL) {
LOG_ERROR("cannot leave NULL scope or global scope");
}
symtab->cur_scope = symtab->cur_scope->parent;
scope_destroy(scope);
}
void* symtab_add_symbol(symtab_t* symtab, const char* name, void* ast_node, int can_duplicate) {
struct Scope* scope = symtab->cur_scope;
void* node = scope_lookup_current(scope, name);
if (node != NULL) {
if (!can_duplicate) {
LOG_ERROR("duplicate symbol %s", name);
}
return node;
}
scope_insert(scope, name, ast_node);
return node;
}
void* symtab_lookup_symbol(symtab_t* symtab, const char* name) {
return scope_lookup(symtab->cur_scope, name);
}

View File

@ -1,18 +0,0 @@
// symtab.h
#ifndef __SYMTAB_H__
#define __SYMTAB_H__
typedef struct symtab {
struct Scope* cur_scope;
struct Scope* global_scope;
} symtab_t;
void init_symtab(symtab_t* symtab);
void del_symtab(symtab_t* symtab);
void symtab_enter_scope(symtab_t* symtab);
void symtab_leave_scope(symtab_t* symtab);
void* symtab_add_symbol(symtab_t* symtab, const char* name, void* ast_node, int can_duplicate);
void* symtab_lookup_symbol(symtab_t* symtab, const char* name);
#endif

View File

@ -6,6 +6,7 @@
// gcc -g ../parser.c ../../lexer/lexer.c ../ast/ast.c ../ast/block.c ../ast/decl.c ../ast/expr.c ../ast/func.c ../ast/program.c ../ast/stmt.c ../ast/term.c ../symtab/hashmap.c ../symtab/scope.c ../symtab/symtab.c test_parser.c -o test_parser
// gcc -g test_parser.c -L../.. -lfrontend -o test_parser
int main(int argc, char** argv) {
init_lib_core();
const char* file_name = "test_file.c";
if (argc == 2) {
file_name = argv[1];
@ -17,8 +18,10 @@ int main(int argc, char** argv) {
}
printf("open file success\n");
struct Lexer lexer;
init_lexer(&lexer, file_name, fp, (lexer_sread_fn)fread_s);
lexer_t lexer;
strpool_t strpool;
init_strpool(&strpool);
init_lexer(&lexer, file_name, fp, (lexer_sread_fn)fread_s, &strpool);
struct SymbolTable symtab;
init_symtab(&symtab);

View File

@ -7,14 +7,27 @@ CFLAGS = -g -Wall -I..
RT_DIR = ./rt
LOG_DIR = ./rt/log
# source file list
# basic rt lib
SRCS = \
$(RT_DIR)/std/rt_std.c \
./core.c \
$(RT_DIR)/rt.c \
$(RT_DIR)/rt_alloc.c \
$(RT_DIR)/rt_string.c \
$(LOG_DIR)/log.c
# utils lib
UTILS_DIR = ./utils
DS_DIR = $(UTILS_DIR)/ds
STRPOOL_DIR = $(UTILS_DIR)/strpool
SYMTAB_DIR = $(UTILS_DIR)/symtab
TOKBUF_DIR = $(UTILS_DIR)/tokbuf
SRCS += \
$(DS_DIR)/hashtable.c \
$(STRPOOL_DIR)/strpool.c \
# $(SYMTAB_DIR)/symtab.c \
# $(TOKBUF_DIR)/tokbuf.c
# generated object file list
OBJS = $(SRCS:.c=.o)

View File

@ -1,142 +1,129 @@
#include "hashtable.h"
#define LOAD_FACTOR 0.75f
// prime table for bucket growth (the last element is the maximum allowed capacity)
static const int PRIME_CAPACITIES[] = {
11, 23, 47, 97, 193, 389, 769, 1543, 3079,
6151, 12289, 24593, 49157, 98317, 196613, 393241,
786433, 1572869, 3145739, 6291469, 12582917, 25165843
};
#define INIT_HASH_TABLE_SIZE (32)
// private function declarations
static u32_t calc_hash(const char* str, int len);
static void rehash(hash_table_t* ht);
hash_table_t* new_hash_table(int init_size, int max_cap) {
hash_table_t* ht = salloc_alloc(sizeof(hash_table_t));
hash_table_init(ht, init_size, max_cap);
return ht;
void hashtable_init(hash_table_t* ht) {
vector_init(ht->entries);
ht->count = 0;
ht->tombstone_count = 0;
Assert(ht->key_cmp != NULL && ht->hash_func != NULL);
}
static inline int get_real_size(int size) {
// find the first prime capacity not smaller than size
int cap_idx = 0;
if (size < 0) {
return PRIME_CAPACITIES[SMCC_ARRLEN(PRIME_CAPACITIES)-1];
}
while (PRIME_CAPACITIES[cap_idx] < size && cap_idx < SMCC_ARRLEN(PRIME_CAPACITIES)-1) {
cap_idx++;
}
return PRIME_CAPACITIES[cap_idx];
static int next_power_of_two(int n) {
n--;
n |= n >> 1;
n |= n >> 2;
n |= n >> 4;
n |= n >> 8;
n |= n >> 16;
return n + 1;
}
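// e.g. next_power_of_two(33) == 64 and next_power_of_two(16) == 16; keeping
// capacities a power of two lets find_entry mask with (cap - 1) instead of
// taking a modulo.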
void hash_table_init(hash_table_t* ht, int init_size, int max_cap) {
// clamp to the maximum capacity
ht->max_cap = get_real_size(max_cap);
// apply the actual initial capacity
ht->cap = get_real_size(init_size);
ht->size = 0;
ht->buckets = NULL;
ht->buckets = salloc_realloc(ht->buckets, sizeof(hash_node_t*) * ht->cap);
}
static hash_entry_t* find_entry(hash_table_t* ht, const void* key, u32_t hash) {
if (ht->entries.cap == 0) return NULL;
u32_t index = hash & (ht->entries.cap - 1); // capacity is a power of two
u32_t probe = 0;
void hash_table_insert(hash_table_t* ht, const char* str, int len) {
// automatic growth check
if (ht->size >= ht->cap * LOAD_FACTOR && ht->cap < ht->max_cap) {
rehash(ht);
}
if (ht->size >= ht->cap) {
LOG_TRACE("Hash table size exceeds maximum capacity. Consider increasing max_capacity.");
}
// compute the hash
u32_t hash = calc_hash(str, len);
int bucket_idx = hash % ht->cap;
// check for duplicates
hash_node_t* node = ht->buckets[bucket_idx];
while (node) {
if (node->hash == hash &&
node->len == len &&
memcmp(node->str, str, len) == 0) {
return; // 已存在
hash_entry_t* tombstone = NULL;
while (1) {
hash_entry_t* entry = &vector_at(ht->entries, index);
if (entry->state == ENTRY_EMPTY) {
return tombstone ? tombstone : entry;
}
node = node->next;
}
// create a new node
hash_node_t* new_node = salloc_alloc(sizeof(hash_node_t));
new_node->str = str;
new_node->len = len;
new_node->hash = hash;
new_node->next = ht->buckets[bucket_idx];
ht->buckets[bucket_idx] = new_node;
ht->size++;
}
hash_node_t* hash_table_find(hash_table_t* ht, const char* str, int len) {
u32_t hash = calc_hash(str, len);
int bucket_idx = hash % ht->cap;
hash_node_t* node = ht->buckets[bucket_idx];
while (node) {
if (node->hash == hash &&
node->len == len &&
memcmp(node->str, str, len) == 0) {
return node;
if (entry->state == ENTRY_TOMBSTONE) {
if (!tombstone) tombstone = entry;
} else if (entry->hash == hash && ht->key_cmp(entry->key, key) == 0) {
return entry;
}
node = node->next;
// linear probing
index = (index + 1) & (ht->entries.cap - 1);
probe++;
if (probe >= ht->entries.cap) break;
}
LOG_ERROR("hashset_find: hash table is full");
return NULL;
}
static void rehash(hash_table_t* ht) {
int old_cap = ht->cap;
hash_node_t** old_buckets = ht->buckets;
static void adjust_capacity(hash_table_t* ht, int new_cap) {
new_cap = next_power_of_two(new_cap);
Assert(new_cap >= ht->entries.cap);
// find the next prime capacity
int new_cap_idx = 0;
while (PRIME_CAPACITIES[new_cap_idx] <= old_cap &&
new_cap_idx < ht->max_cap) {
new_cap_idx++;
}
ht->cap = PRIME_CAPACITIES[new_cap_idx];
vector_header(old_entries, hash_entry_t);
old_entries.data = ht->entries.data;
old_entries.cap = ht->entries.cap;
// allocate the new bucket array
ht->buckets = salloc_alloc(sizeof(hash_node_t*) * ht->cap);
memset(ht->buckets, 0, sizeof(hash_node_t*) * ht->cap);
// entries.size is unused here; kept so the gdb Python extension can inspect it
ht->entries.size = new_cap;
ht->entries.cap = new_cap;
ht->entries.data = salloc_realloc(NULL, new_cap * sizeof(hash_entry_t));
rt_memset(ht->entries.data, 0, new_cap * sizeof(hash_entry_t));
// rehash all nodes
for (int i = 0; i < old_cap; i++) {
hash_node_t* node = old_buckets[i];
while (node) {
hash_node_t* next = node->next;
int new_bucket = node->hash % ht->cap;
node->next = ht->buckets[new_bucket];
ht->buckets[new_bucket] = node;
node = next;
// rehash all of the old data
for (rt_size_t i = 0; i < old_entries.cap; i++) {
hash_entry_t* entry = &vector_at(old_entries, i);
if (entry->state == ENTRY_ACTIVE) {
hash_entry_t* dest = find_entry(ht, entry->key, entry->hash);
*dest = *entry;
}
}
salloc_free(old_buckets);
vector_free(old_entries);
ht->tombstone_count = 0;
}
static u32_t calc_hash(const char* str, int len) {
// uses the same algorithm as the HASH_FNV_1A macro
return rt_strhash(str);
}
void hash_table_destroy(hash_table_t* ht) {
for (int i = 0; i < ht->cap; i++) {
hash_node_t* node = ht->buckets[i];
while (node) {
hash_node_t* next = node->next;
salloc_free(node);
node = next;
}
void* hashtable_set(hash_table_t* ht, const void* key, void* value) {
if (ht->count + ht->tombstone_count >= ht->entries.cap * 0.75) {
int new_cap = ht->entries.cap < INIT_HASH_TABLE_SIZE ? INIT_HASH_TABLE_SIZE : ht->entries.cap * 2;
adjust_capacity(ht, new_cap);
}
salloc_free(ht->buckets);
ht->buckets = NULL;
ht->size = ht->cap = 0;
}
u32_t hash = ht->hash_func(key);
hash_entry_t* entry = find_entry(ht, key, hash);
void* old_value = NULL;
if (entry->state == ENTRY_ACTIVE) {
old_value = entry->value;
} else {
if (entry->state == ENTRY_TOMBSTONE) ht->tombstone_count--;
ht->count++;
}
entry->key = key;
entry->value = value;
entry->hash = hash;
entry->state = ENTRY_ACTIVE;
return old_value;
}
void* hashtable_get(hash_table_t* ht, const void* key) {
if (ht->entries.cap == 0) return NULL;
u32_t hash = ht->hash_func(key);
hash_entry_t* entry = find_entry(ht, key, hash);
return (entry && entry->state == ENTRY_ACTIVE) ? entry->value : NULL;
}
void* hashtable_del(hash_table_t* ht, const void* key) {
if (ht->entries.cap == 0) return NULL;
u32_t hash = ht->hash_func(key);
hash_entry_t* entry = find_entry(ht, key, hash);
if (entry == NULL || entry->state != ENTRY_ACTIVE) return NULL;
void* value = entry->value;
entry->state = ENTRY_TOMBSTONE;
ht->count--;
ht->tombstone_count++;
return value;
}
void hashtable_destory(hash_table_t* ht) {
vector_free(ht->entries);
ht->count = 0;
ht->tombstone_count = 0;
}

View File

@ -1,27 +1,39 @@
#ifndef __SMCC_HASHTABLE_H__
#define __SMCC_HASHTABLE_H__
#include <lib/rt/rt.h>
#include <lib/rt/rt_alloc.h>
#include "vector.h"
typedef struct hash_node {
const char* str;
int len;
u32_t hash;
struct hash_node* next;
} hash_node_t;
// hash table entry state tags
typedef enum hash_table_entry_state {
ENTRY_EMPTY,
ENTRY_ACTIVE,
ENTRY_TOMBSTONE
} ht_entry_state_t;
// hash table entry (key/value memory is not managed by the table)
typedef struct hash_entry {
const void* key; // owned by the caller
void* value; // owned by the caller
u32_t hash; // precomputed hash value
ht_entry_state_t state; // entry state
} hash_entry_t;
// hash table body
typedef struct hash_table {
hash_node_t** buckets;
int size;
int cap;
int max_cap;
vector_header(entries, hash_entry_t); // entries managed via the vector helper
u32_t count; // live entry count (excluding tombstones)
u32_t tombstone_count; // number of tombstones
u32_t (*hash_func)(const void* key);
int(*key_cmp)(const void* key1, const void* key2);
} hash_table_t;
hash_table_t* new_hash_table(int init_size, int max_cap);
void hash_table_init(hash_table_t* ht, int init_size, int max_cap);
void hash_table_destroy(hash_table_t* ht);
// WARN: you must set hash_func and key_cmp before use
void hashtable_init(hash_table_t* ht) ;
void hash_table_insert(hash_table_t* ht, const char* str, int len);
hash_node_t* hash_table_find(hash_table_t* ht, const char* str, int len);
void* hashtable_set(hash_table_t* ht, const void* key, void* value);
void* hashtable_get(hash_table_t* ht, const void* key);
void* hashtable_del(hash_table_t* ht, const void* key);
void hashtable_destory(hash_table_t* ht);
#endif // __SMCC_HASHTABLE_H__
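A minimal usage sketch for the new open-addressing API; the callback wiring mirrors what init_strpool does in this commit, and the variable names are illustrative only (the table owns neither keys nor values):

    hash_table_t ht;
    ht.hash_func = (u32_t(*)(const void*))rt_strhash;
    ht.key_cmp = (int(*)(const void*, const void*))rt_strcmp;
    hashtable_init(&ht); // asserts both callbacks are set

    static int answer = 42;
    hashtable_set(&ht, "answer", &answer);
    int* got = hashtable_get(&ht, "answer"); // -> &answer
    hashtable_del(&ht, "answer"); // marks the slot as a tombstone
    hashtable_destory(&ht);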

View File

@ -0,0 +1,32 @@
#include "strpool.h"
void init_strpool(strpool_t* pool) {
lalloc_init(&pool->stralloc);
pool->ht.hash_func = (u32_t(*)(const void*))rt_strhash;
pool->ht.key_cmp = (int(*)(const void*, const void*))rt_strcmp;
hashtable_init(&pool->ht);
}
const char* strpool_intern(strpool_t* pool, const char* str) {
void* existing = hashtable_get(&pool->ht, str);
if (existing) {
return existing;
}
rt_size_t len = rt_strlen(str) + 1;
char* new_str = lalloc_alloc(&pool->stralloc, len);
if (!new_str) {
LOG_ERROR("strpool: Failed to allocate memory for string");
return NULL;
}
rt_memcpy(new_str, str, len);
hashtable_set(&pool->ht, new_str, new_str);
return new_str;
}
void strpool_destroy(strpool_t* pool) {
hashtable_destory(&pool->ht);
lalloc_destroy(&pool->stralloc);
}
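The payoff of interning: equal strings always intern to the same pointer, so identifier and file-name comparisons downstream reduce to pointer compares. A minimal sketch:

    strpool_t pool;
    init_strpool(&pool);

    const char* a = strpool_intern(&pool, "main");
    const char* b = strpool_intern(&pool, "main");
    Assert(a == b); // equal contents, same interned pointer

    strpool_destroy(&pool);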

View File

@ -2,11 +2,16 @@
#define __SMCC_STRPOOL_H__
#include <lib/core.h>
#include "../ds/hash.h"
typedef struct strpool {
long_alloc_t *long_alloc;
} strpool_t;
#include <lib/rt/rt_alloc.h>
#include <lib/utils/ds/hashtable.h>
void new_strpool();
typedef struct strpool {
hash_table_t ht; // fast string lookup
long_alloc_t stralloc; // dedicated allocator for string storage
} strpool_t;
void init_strpool(strpool_t* pool);
const char* strpool_intern(strpool_t* pool, const char* str);
void strpool_destroy(strpool_t* pool);
#endif // __SMCC_STRPOOL_H__

View File

@ -0,0 +1,6 @@
#ifndef __SMCC_SYMTABL_H__
#define __SMCC_SYMTABL_H__
#endif

View File

@ -7,18 +7,20 @@ typedef struct loc {
const char *fname;
int line;
int col;
short len;
int len;
} loc_t;
typedef enum tok_type {
typedef enum tok_basic_type {
TK_BASIC_INVALID, // error placeholder
TK_BASIC_KEYWORD, // keyword
TK_BASIC_OPERATOR, // operator
TK_BASIC_IDENTIFIER, // identifier
TK_BASIC_LITERAL, // literal
TK_BASIC_PUNCTUATOR, // punctuation
TK_BASIC_WHITESPACE, // whitespace
TK_BASIC_COMMENT, // comment
TK_BASIC_EOF // end-of-input marker
} tok_type_t;
} tok_basic_type_t;
typedef union ctype {
u8_t u8;
@ -34,10 +36,15 @@ typedef union ctype {
iptr_t iptr;
uptr_t uptr;
void* ptr;
char ch;
int i;
// MUST be a strpool-interned pointer
const char* str;
} ctype_t;
typedef struct tok {
tok_type_t type;
tok_basic_type_t type;
int sub_type;
loc_t loc;
ctype_t val;

lib/utils/utils.h (new file, 8 lines)
View File

@ -0,0 +1,8 @@
#ifndef __SMCC_LIB_UTILS_H__
#define __SMCC_LIB_UTILS_H__
#include "strpool/strpool.h"
#include "symtab/symtab.h"
#include "tokbuf/tokbuf.h"
#endif