feat(build): 引入新的 Python 构建系统并移除旧 Makefile

新增基于 Python 的构建脚本 `cbuild.py`,支持包管理、依赖解析和模块化编译。
同时添加 `.gitignore` 忽略 `build` 目录,并在 `justfile` 中更新构建命令。
移除了原有的 `lib/Makefile` 和主目录下的相关 make 规则,统一使用新构建系统。
This commit is contained in:
zzy
2025-11-20 10:44:59 +08:00
parent 8d97fe896c
commit e22811f2f5
140 changed files with 1996 additions and 10098 deletions

9
libs/README.md Normal file
View File

@@ -0,0 +1,9 @@
lexer 词法分析
parse 语法分析
ast 抽象语法树
sema 语义分析
ir 中间代码表示(IR)
opt 优化器
codegen 代码生成
target 目标平台支持

5
libs/lexer/README.md Normal file
View File

@@ -0,0 +1,5 @@
# 词法分析
参考 LCC 的词法分析部分
主要使用 LL(n) 硬编码查找token

6
libs/lexer/cbuild.toml Normal file
View File

@@ -0,0 +1,6 @@
[package]
name = "smcc_lex"
dependencies = [
{ name = "libcore", path = "../../runtime/libcore" },
]

View File

@@ -0,0 +1,62 @@
/**
* @file lexer.h
* @brief C语言词法分析器核心数据结构与接口
*/
#ifndef __SMCC_CC_LEXER_H__
#define __SMCC_CC_LEXER_H__
#include <libcore.h>
#include "lexer_stream.h"
#include "lexer_token.h"
typedef struct lexer_loc {
const char *name;
usize name_len;
usize line;
usize column;
usize offset;
} lexer_loc_t;
typedef struct lexer_token {
token_type_t type;
core_cvalue_t value;
lexer_loc_t loc;
} lexer_tok_t;
/**
* @brief 词法分析器核心结构体
*
* 封装词法分析所需的状态信息和缓冲区管理
*/
typedef struct cc_lexer {
lexer_stream_t* stream;
lexer_loc_t pos;
} smcc_lexer_t;
/**
* @brief 初始化词法分析器
* @param[out] lexer 要初始化的词法分析器实例
* @param[in] stream 输入流对象指针
*/
void lexer_init(smcc_lexer_t* lexer, lexer_stream_t* stream);
/**
* @brief 获取原始token
* @param[in] lexer 词法分析器实例
* @param[out] token 输出token存储位置
*
* 此函数会返回所有类型的token包括空白符等无效token
*/
void lexer_get_token(smcc_lexer_t* lexer, lexer_tok_t* token);
/**
* @brief 获取有效token
* @param[in] lexer 词法分析器实例
* @param[out] token 输出token存储位置
*
* 此函数会自动跳过空白符等无效token返回对语法分析有意义的token
*/
void lexer_get_valid_token(smcc_lexer_t* lexer, lexer_tok_t* token);
#endif

View File

@@ -0,0 +1,48 @@
#ifndef __SMCC_LEXER_LOG_H__
#define __SMCC_LEXER_LOG_H__
#include <libcore.h>
#ifndef LEX_LOG_LEVEL
#define LEX_LOG_LEVEL 4
#endif
#if LEX_LOG_LEVEL <= 1
#define LEX_NOTSET( fmt, ...) MLOG_NOTSET(&__smcc_lexer_log, fmt, ##__VA_ARGS__)
#else
#define LEX_NOTSET( fmt, ...)
#endif
#if LEX_LOG_LEVEL <= 2
#define LEX_DEBUG( fmt, ...) MLOG_DEBUG(&__smcc_lexer_log, fmt, ##__VA_ARGS__)
#else
#define LEX_DEBUG( fmt, ...)
#endif
#if LEX_LOG_LEVEL <= 3
#define LEX_INFO( fmt, ...) MLOG_INFO(&__smcc_lexer_log, fmt, ##__VA_ARGS__)
#else
#define LEX_INFO( fmt, ...)
#endif
#if LEX_LOG_LEVEL <= 4
#define LEX_WARN( fmt, ...) MLOG_WARN(&__smcc_lexer_log, fmt, ##__VA_ARGS__)
#else
#define LEX_WARN( fmt, ...)
#endif
#if LEX_LOG_LEVEL <= 5
#define LEX_ERROR( fmt, ...) MLOG_ERROR(&__smcc_lexer_log, fmt, ##__VA_ARGS__)
#else
#define LEX_ERROR( fmt, ...)
#endif
#if LEX_LOG_LEVEL <= 6
#define LEX_FATAL( fmt, ...) MLOG_FATAL(&__smcc_lexer_log, fmt, ##__VA_ARGS__)
#else
#define LEX_FATAL( fmt, ...)
#endif
extern logger_t __smcc_lexer_log;
#endif // __SMCC_LEXER_LOG_H__

View File

@@ -0,0 +1,37 @@
#include <core_type.h>
typedef struct lexer_stream lexer_stream_t;
#define lexer_stream_eof (-1)
struct lexer_stream {
const char* name;
usize name_len;
/// @brief 读取指定数量的字符到缓冲区
usize (*read_buf)(lexer_stream_t* stream, char* buffer, usize count);
/// @brief 获取下一个字符
int (*peek_char)(lexer_stream_t* stream);
/// @brief 重置字符流位置
void (*reset_char) (lexer_stream_t* stream);
/// @brief 读取并消费下一个字符(移动流位置)
int (*next_char)(lexer_stream_t* stream);
/// @brief 释放资源
void (*free_stream) (lexer_stream_t* steam);
};
#ifndef __SMCC_LEXER_NO_MEM_STREAM__
typedef struct lexer_mem_stream {
lexer_stream_t stream;
const char* data;
usize data_length;
usize curr_pos;
usize peek_pos;
cbool owned;
} lexer_mem_stream_t;
lexer_stream_t* lexer_mem_stream_init(lexer_mem_stream_t* stream, const char* data, usize length, cbool need_copy);
#endif

View File

@@ -0,0 +1,137 @@
#ifndef __SMCC_CC_TOKEN_H__
#define __SMCC_CC_TOKEN_H__
#include <libcore.h>
typedef enum ckeyword {
CSTD_C89,
CSTD_C99,
CEXT_ASM,
} ckeyword_t;
// Using Binary Search To Fast Find Keyword
#define KEYWORD_TABLE \
X(asm , TK_BASIC_KEYWORD , TOKEN_ASM , CEXT_ASM) \
X(break , TK_BASIC_KEYWORD , TOKEN_BREAK , CSTD_C89) \
X(case , TK_BASIC_KEYWORD , TOKEN_CASE , CSTD_C89) \
X(char , TK_BASIC_KEYWORD , TOKEN_CHAR , CSTD_C89) \
X(const , TK_BASIC_KEYWORD , TOKEN_CONST , CSTD_C89) \
X(continue , TK_BASIC_KEYWORD , TOKEN_CONTINUE , CSTD_C89) \
X(default , TK_BASIC_KEYWORD , TOKEN_DEFAULT , CSTD_C89) \
X(do , TK_BASIC_KEYWORD , TOKEN_DO , CSTD_C89) \
X(double , TK_BASIC_KEYWORD , TOKEN_DOUBLE , CSTD_C89) \
X(else , TK_BASIC_KEYWORD , TOKEN_ELSE , CSTD_C89) \
X(enum , TK_BASIC_KEYWORD , TOKEN_ENUM , CSTD_C89) \
X(extern , TK_BASIC_KEYWORD , TOKEN_EXTERN , CSTD_C89) \
X(float , TK_BASIC_KEYWORD , TOKEN_FLOAT , CSTD_C89) \
X(for , TK_BASIC_KEYWORD , TOKEN_FOR , CSTD_C89) \
X(goto , TK_BASIC_KEYWORD , TOKEN_GOTO , CSTD_C89) \
X(if , TK_BASIC_KEYWORD , TOKEN_IF , CSTD_C89) \
X(inline , TK_BASIC_KEYWORD , TOKEN_INLINE , CSTD_C99) \
X(int , TK_BASIC_KEYWORD , TOKEN_INT , CSTD_C89) \
X(long , TK_BASIC_KEYWORD , TOKEN_LONG , CSTD_C89) \
X(register , TK_BASIC_KEYWORD , TOKEN_REGISTER , CSTD_C89) \
X(restrict , TK_BASIC_KEYWORD , TOKEN_RESTRICT , CSTD_C99) \
X(return , TK_BASIC_KEYWORD , TOKEN_RETURN , CSTD_C89) \
X(short , TK_BASIC_KEYWORD , TOKEN_SHORT , CSTD_C89) \
X(signed , TK_BASIC_KEYWORD , TOKEN_SIGNED , CSTD_C89) \
X(sizeof , TK_BASIC_KEYWORD , TOKEN_SIZEOF , CSTD_C89) \
X(static , TK_BASIC_KEYWORD , TOKEN_STATIC , CSTD_C89) \
X(struct , TK_BASIC_KEYWORD , TOKEN_STRUCT , CSTD_C89) \
X(switch , TK_BASIC_KEYWORD , TOKEN_SWITCH , CSTD_C89) \
X(typedef , TK_BASIC_KEYWORD , TOKEN_TYPEDEF , CSTD_C89) \
X(union , TK_BASIC_KEYWORD , TOKEN_UNION , CSTD_C89) \
X(unsigned , TK_BASIC_KEYWORD , TOKEN_UNSIGNED , CSTD_C89) \
X(void , TK_BASIC_KEYWORD , TOKEN_VOID , CSTD_C89) \
X(volatile , TK_BASIC_KEYWORD , TOKEN_VOLATILE , CSTD_C89) \
X(while , TK_BASIC_KEYWORD , TOKEN_WHILE , CSTD_C89) \
// KEYWORD_TABLE
#define TOKEN_TABLE \
X(unknown , TK_BASIC_INVALID, TOKEN_UNKNOWN ) \
X(EOF , TK_BASIC_EOF, TOKEN_EOF ) \
X(blank , TK_BASIC_EMPTYSPACE, TOKEN_BLANK ) \
X("==" , TK_BASIC_OPERATOR, TOKEN_EQ ) \
X("=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN ) \
X("++" , TK_BASIC_OPERATOR, TOKEN_ADD_ADD ) \
X("+=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN_ADD ) \
X("+" , TK_BASIC_OPERATOR, TOKEN_ADD ) \
X("--" , TK_BASIC_OPERATOR, TOKEN_SUB_SUB ) \
X("-=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN_SUB ) \
X("->" , TK_BASIC_OPERATOR, TOKEN_DEREF ) \
X("-" , TK_BASIC_OPERATOR, TOKEN_SUB ) \
X("*=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN_MUL ) \
X("*" , TK_BASIC_OPERATOR, TOKEN_MUL ) \
X("/=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN_DIV ) \
X("/" , TK_BASIC_OPERATOR, TOKEN_DIV ) \
X("//" , TK_BASIC_COMMENT , TOKEN_LINE_COMMENT ) \
X("/* */" , TK_BASIC_COMMENT , TOKEN_BLOCK_COMMENT ) \
X("%=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN_MOD ) \
X("%" , TK_BASIC_OPERATOR, TOKEN_MOD ) \
X("&&" , TK_BASIC_OPERATOR, TOKEN_AND_AND ) \
X("&=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN_AND ) \
X("&" , TK_BASIC_OPERATOR, TOKEN_AND ) \
X("||" , TK_BASIC_OPERATOR, TOKEN_OR_OR ) \
X("|=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN_OR ) \
X("|" , TK_BASIC_OPERATOR, TOKEN_OR ) \
X("^=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN_XOR ) \
X("^" , TK_BASIC_OPERATOR, TOKEN_XOR ) \
X("<<=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN_L_SH ) \
X("<<" , TK_BASIC_OPERATOR, TOKEN_L_SH ) \
X("<=" , TK_BASIC_OPERATOR, TOKEN_LE ) \
X("<" , TK_BASIC_OPERATOR, TOKEN_LT ) \
X(">>=" , TK_BASIC_OPERATOR, TOKEN_ASSIGN_R_SH ) \
X(">>" , TK_BASIC_OPERATOR, TOKEN_R_SH ) \
X(">=" , TK_BASIC_OPERATOR, TOKEN_GE ) \
X(">" , TK_BASIC_OPERATOR, TOKEN_GT ) \
X("!" , TK_BASIC_OPERATOR, TOKEN_NOT ) \
X("!=" , TK_BASIC_OPERATOR, TOKEN_NEQ ) \
X("~" , TK_BASIC_OPERATOR, TOKEN_BIT_NOT ) \
X("[" , TK_BASIC_OPERATOR, TOKEN_L_BRACKET ) \
X("]" , TK_BASIC_OPERATOR, TOKEN_R_BRACKET ) \
X("(" , TK_BASIC_OPERATOR, TOKEN_L_PAREN ) \
X(")" , TK_BASIC_OPERATOR, TOKEN_R_PAREN ) \
X("{" , TK_BASIC_OPERATOR, TOKEN_L_BRACE ) \
X("}" , TK_BASIC_OPERATOR, TOKEN_R_BRACE ) \
X(";" , TK_BASIC_OPERATOR, TOKEN_SEMICOLON ) \
X("," , TK_BASIC_OPERATOR, TOKEN_COMMA ) \
X(":" , TK_BASIC_OPERATOR, TOKEN_COLON ) \
X("." , TK_BASIC_OPERATOR, TOKEN_DOT ) \
X("..." , TK_BASIC_OPERATOR, TOKEN_ELLIPSIS ) \
X("?" , TK_BASIC_OPERATOR, TOKEN_COND ) \
X(ident , TK_BASIC_IDENTIFIER, TOKEN_IDENT ) \
X(int_literal , TK_BASIC_LITERAL, TOKEN_INT_LITERAL ) \
X(float_literal , TK_BASIC_LITERAL, TOKEN_FLOAT_LITERAL ) \
X(char_literal , TK_BASIC_LITERAL, TOKEN_CHAR_LITERAL ) \
X(string_literal , TK_BASIC_LITERAL, TOKEN_STRING_LITERAL ) \
// END
// 定义TokenType枚举
typedef enum cc_tktype {
// 处理普通token
#define X(str, subtype, tok) tok,
TOKEN_TABLE
#undef X
// 处理关键字(保持原有格式)
#define X(name, subtype, tok, std) tok,
KEYWORD_TABLE
#undef X
} token_type_t;
typedef enum token_subtype {
TK_BASIC_INVALID, // 错误占位
TK_BASIC_KEYWORD, // 关键字
TK_BASIC_OPERATOR, // 操作符
TK_BASIC_IDENTIFIER, // 标识符
TK_BASIC_LITERAL, // 字面量
TK_BASIC_EMPTYSPACE, // 空白
TK_BASIC_COMMENT, // 注释
TK_BASIC_EOF // 结束标记
} token_subtype_t;
token_subtype_t get_tok_subtype(token_type_t type);
const char* get_tok_name(token_type_t type);
#endif

637
libs/lexer/src/lexer.c Normal file
View File

@@ -0,0 +1,637 @@
/**
* 仿照LCCompiler的词法分析部分
*
* 如下为LCC的README in 2025.2
This hierarchy is the distribution for lcc version 4.2.
lcc version 3.x is described in the book "A Retargetable C Compiler:
Design and Implementation" (Addison-Wesley, 1995, ISBN 0-8053-1670-1).
There are significant differences between 3.x and 4.x, most notably in
the intermediate code. For details, see
https://drh.github.io/lcc/documents/interface4.pdf.
VERSION 4.2 IS INCOMPATIBLE WITH EARLIER VERSIONS OF LCC. DO NOT
UNLOAD THIS DISTRIBUTION ON TOP OF A 3.X DISTRIBUTION.
LCC is a C89 ("ANSI C") compiler designed to be highly retargetable.
LOG describes the changes since the last release.
CPYRIGHT describes the conditions under you can use, copy, modify, and
distribute lcc or works derived from lcc.
doc/install.html is an HTML file that gives a complete description of
the distribution and installation instructions.
Chris Fraser / cwf@aya.yale.edu
David Hanson / drh@drhanson.net
*/
#include <lexer_log.h>
#include <lexer.h>
static const struct {
const char* name;
ckeyword_t std_type;
token_type_t tok;
} keywords[] = {
#define X(name, subtype, tok, std_type,...) { #name, std_type, tok },
KEYWORD_TABLE
#undef X
};
// by using binary search to find the keyword
static inline int keyword_cmp(const char* name, int len) {
int low = 0;
int high = sizeof(keywords) / sizeof(keywords[0]) - 1;
while (low <= high) {
int mid = (low + high) / 2;
const char *key = keywords[mid].name;
int cmp = 0;
// 自定义字符串比较逻辑
for (int i = 0; i < len; i++) {
if (name[i] != key[i]) {
cmp = (unsigned char)name[i] - (unsigned char)key[i];
break;
}
if (name[i] == '\0') break; // 遇到终止符提前结束
}
if (cmp == 0) {
// 完全匹配检查(长度相同)
if (key[len] == '\0') return mid;
cmp = -1; // 当前关键词比输入长
}
if (cmp < 0) {
high = mid - 1;
} else {
low = mid + 1;
}
}
return -1; // Not a keyword.
}
void lexer_init(smcc_lexer_t* lexer, lexer_stream_t* stream) {
lexer->stream = stream;
lexer->pos = (lexer_loc_t) {
.name = stream->name,
.name_len = stream->name_len,
.line = 1,
.column = 1,
.offset = 0,
};
}
#define stream_reset_char(stream) ((stream)->reset_char(stream))
#define stream_next_char(stream) ((stream)->next_char(stream))
#define stream_peek_char(stream) ((stream)->peek_char(stream))
#define lexer_next_pos(lexer) ((lexer)->pos.column ++, (lexer)->pos.offset ++)
#define lexer_next_line(lexer) ((lexer)->pos.line ++, (lexer)->pos.column = 1)
#define set_err_token(token) ((token)->type = TOKEN_UNKNOWN)
static void skip_newline(smcc_lexer_t* lexer, lexer_tok_t* token) {
lexer_stream_t* stream = lexer->stream;
token->type = TOKEN_LINE_COMMENT;
// 循环直到遇到换行符或文件结束
while (1) {
int ch = stream_next_char(stream);
if (ch == lexer_stream_eof) {
// 到达文件末尾,直接返回
return;
}
// 更新位置信息
lexer_next_pos(lexer);
if (ch == '\n') {
// 遇到换行符,增加行号并重置列号
lexer_next_line(lexer);
return;
}
}
}
static void skip_block_comment(smcc_lexer_t* lexer, lexer_tok_t* token) {
lexer_stream_t* stream = lexer->stream;
token->type = TOKEN_BLOCK_COMMENT;
int ch;
stream_reset_char(stream);
ch = stream_next_char(stream);
lexer_next_pos(lexer);
// FIXME Assertion
Assert (ch == '/');
ch = stream_next_char(stream);
lexer_next_pos(lexer);
Assert (ch == '*');
// 我们已经识别了 "/*",现在需要找到 "*/"
while (1) {
ch = stream_next_char(stream);
lexer_next_pos(lexer);
if (ch == lexer_stream_eof) {
// 未闭合的块注释
LEX_WARN("Unterminated block comment");
return;
}
// LEX_ERROR("%c", ch);
// 更新位置信息
if (ch == '\n') {
lexer_next_line(lexer);
} else if (ch == '*') {
// 查看下一个字符是否是 '/'
int next_ch = stream_peek_char(stream);
if (next_ch == '/') {
// 消费 '/' 字符
stream_next_char(stream);
// 更新位置信息
lexer_next_pos(lexer);
// 成功找到注释结束标记
return;
}
}
}
}
// TODO escape character not enough
static inline int got_slash(int peek) {
switch (peek) {
case '\\': return '\\';
case '\'': return '\'';
case '\"': return '\"';
case '\?': return '\?';
case '0': return '\0';
case 'b': return '\b';
case 'f': return '\f';
case 'n': return '\n';
case 'r': return '\r';
case 't': return '\t';
case 'v': return '\v';
default: break;
}
return -1;
}
static void parse_char(smcc_lexer_t* lexer, lexer_tok_t* token) {
token->loc = lexer->pos;
token->type = TOKEN_CHAR_LITERAL;
lexer_stream_t *stream = lexer->stream;
stream_reset_char(stream);
int ch = stream_peek_char(stream);
if (ch == lexer_stream_eof) {
LEX_WARN("Unexpected EOF at begin");
goto ERR;
} else if (ch != '\'') {
LEX_WARN("Unexpected character '%c' at begin", ch);
goto ERR;
}
stream_next_char(stream);
lexer_next_pos(lexer);
ch = stream_next_char(stream);
lexer_next_pos(lexer);
if (ch == lexer_stream_eof) {
LEX_WARN("Unexpected EOF at middle");
goto ERR;
} else if (ch == '\\') {
ch = stream_next_char(stream);
lexer_next_pos(lexer);
if ((ch = got_slash(ch)) == -1) {
LEX_ERROR("Invalid escape character");
// TODO 特殊情况处理
goto ERR;
}
token->value.ch = ch;
} else {
token->value.ch = ch;
}
if ((ch = stream_next_char(stream)) != '\'') {
LEX_ERROR("Unclosed character literal '%c' at end, expect `'`", ch);
lexer_next_pos(lexer);
goto ERR;
}
return;
ERR:
set_err_token(token);
}
static void parse_string(smcc_lexer_t* lexer, lexer_tok_t* token) {
token->loc = lexer->pos;
token->type = TOKEN_STRING_LITERAL;
lexer_stream_t *stream = lexer->stream;
stream_reset_char(stream);
int ch = stream_peek_char(stream);
if (ch == lexer_stream_eof) {
LEX_WARN("Unexpected EOF at begin");
goto ERR;
} else if (ch != '"') {
LEX_WARN("Unexpected character '%c' at begin", ch);
goto ERR;
}
stream_next_char(stream);
lexer_next_pos(lexer);
int base = 0;
cstring_t str = cstring_new();
while (1) {
ch = stream_peek_char(stream);
if (ch == lexer_stream_eof) {
LEX_ERROR("Unexpected EOF at string literal");
break;
} else if (ch == '\n') {
LEX_ERROR("Unexpected newline at string literal");
break;
} else if (ch == '\\') {
// TODO bad practice and maybe bugs here
stream_next_char(stream);
ch = stream_next_char(stream);
int val = got_slash(ch);
if (val == -1) {
LEX_ERROR("Invalid escape character it is \\%c [%d]", ch, ch);
} else {
cstring_push(&str, val);
continue;
}
} else if (ch == '"') {
stream_next_char(stream);
lexer_next_pos(lexer);
break;
}
stream_next_char(stream);
lexer_next_pos(lexer);
cstring_push(&str, ch);
}
token->value.cstr.data = (char*)cstring_as_cstr(&str);
token->value.cstr.len = cstring_len(&str);
return;
ERR:
set_err_token(token);
}
static void parse_number(smcc_lexer_t* lexer, lexer_tok_t* token) {
token->loc = lexer->pos;
lexer_stream_t *stream = lexer->stream;
stream_reset_char(stream);
int ch = stream_peek_char(stream);
int base = 0;
if (ch == lexer_stream_eof) {
LEX_WARN("Unexpected EOF at begin");
goto ERR;
} else if (ch == '0') {
ch = stream_peek_char(stream);
if (ch == 'x' || ch == 'X') {
base = 16;
stream_next_char(stream);
lexer_next_pos(lexer);
stream_next_char(stream);
lexer_next_pos(lexer);
} else if (ch == 'b' || ch == 'B') {
// FIXME C23 external integer base
base = 2;
stream_next_char(stream);
lexer_next_pos(lexer);
stream_next_char(stream);
lexer_next_pos(lexer);
} else if (ch >= '0' && ch <= '7') {
base = 8;
stream_next_char(stream);
lexer_next_pos(lexer);
} else {
base = 10;
}
} else {
base = 10;
}
// 解析整数部分
stream_reset_char(stream);
int tmp = 0;
token->value.n = 0;
while (1) {
ch = stream_peek_char(stream);
if (ch == lexer_stream_eof) {
break;
} else if (ch >= 'a' && ch <= 'z') {
tmp = ch - 'a' + 10;
} else if (ch >= 'A' && ch <= 'Z') {
tmp = ch - 'A' + 10;
} else if (ch >= '0' && ch <= '9') {
tmp = ch - '0';
} else {
break;
}
if (tmp >= base) {
LOG_ERROR("Invalid digit");
break;
}
stream_next_char(stream);
lexer_next_pos(lexer);
token->value.n = token->value.n * base + tmp;
// TODO number overflow
}
token->type = TOKEN_INT_LITERAL;
return;
ERR:
set_err_token(token);
}
static void parse_line(smcc_lexer_t* lexer, lexer_tok_t* token) {
token->loc = lexer->pos;
lexer_stream_t *stream = lexer->stream;
stream_reset_char(stream);
int ch = stream_peek_char(stream);
if (ch == lexer_stream_eof) {
LEX_WARN("Unexpected EOF at begin");
goto ERR;
} else if (ch != '#') {
LEX_WARN("Unexpected character '%c' at begin", ch);
goto ERR;
}
const char line[] = "line";
for (int i = 0; i < sizeof(line); i++) {
ch = stream_next_char(stream);
lexer_next_pos(lexer);
if (ch != line[i]) {
LEX_WARN("Maroc does not support in lexer rather in preprocessor, it will be ignored");
skip_newline(lexer, token);
goto SKIP_LINE;
}
}
parse_number(lexer, token);
if (token->type != TOKEN_INT_LITERAL) {
LEX_ERROR("Invalid line number");
goto SKIP_LINE;
}
if (stream_next_char(stream) != ' ') {
skip_newline(lexer, token);
token->loc.line = token->value.n;
}
if (stream_peek_char(stream) != '"') {
LEX_ERROR("Invalid `#` line");
goto SKIP_LINE;
}
parse_string(lexer, token);
if (token->type != TOKEN_STRING_LITERAL) {
LEX_ERROR("Invalid filename");
goto SKIP_LINE;
}
skip_newline(lexer, token);
token->loc.line = token->value.n;
// FIXME memory leak
token->loc.name = cstring_as_cstr((const cstring_t *)&token->value.cstr);
token->loc.name_len = cstring_len((const cstring_t *)&token->value.cstr);
return;
SKIP_LINE:
skip_newline(lexer, token);
ERR:
set_err_token(token);
}
// /zh/c/language/operator_arithmetic.html
void lexer_get_token(smcc_lexer_t* lexer, lexer_tok_t* token) {
token->loc = lexer->pos;
token->type = TOKEN_UNKNOWN;
lexer_stream_t *stream = lexer->stream;
stream_reset_char(stream);
token_type_t type = TOKEN_UNKNOWN;
int ch = stream_peek_char(stream);
// once step
switch (ch) {
case '=':
switch (stream_peek_char(stream)) {
case '=': type = TOKEN_EQ; goto double_char;
default: stream_reset_char(stream), type = TOKEN_ASSIGN; break;
} break;
case '+':
switch (stream_peek_char(stream)) {
case '+': type = TOKEN_ADD_ADD; goto double_char;
case '=': type = TOKEN_ASSIGN_ADD; goto double_char;
default: stream_reset_char(stream), type = TOKEN_ADD; break;
} break;
case '-':
switch (stream_peek_char(stream)) {
case '-': type = TOKEN_SUB_SUB; goto double_char;
case '=': type = TOKEN_ASSIGN_SUB; goto double_char;
case '>': type = TOKEN_DEREF; goto double_char;
default: stream_reset_char(stream), type = TOKEN_SUB; break;
} break;
case '*':
switch (stream_peek_char(stream)) {
case '=': type = TOKEN_ASSIGN_MUL; goto double_char;
default: stream_reset_char(stream), type = TOKEN_MUL; break;
} break;
case '/':
switch (stream_peek_char(stream)) {
case '=': type = TOKEN_ASSIGN_DIV; goto double_char;
case '/': skip_newline(lexer, token); goto END;
case '*': skip_block_comment(lexer, token); goto END;
default: stream_reset_char(stream), type = TOKEN_DIV; break;
} break;
case '%':
switch (stream_peek_char(stream)) {
case '=': type = TOKEN_ASSIGN_MOD; goto double_char;
default: stream_reset_char(stream), type = TOKEN_MOD; break;
} break;
case '&':
switch (stream_peek_char(stream)) {
case '&': type = TOKEN_AND_AND; goto double_char;
case '=': type = TOKEN_ASSIGN_AND; goto double_char;
default: stream_reset_char(stream), type = TOKEN_AND; break;
} break;
case '|':
switch (stream_peek_char(stream)) {
case '|': type = TOKEN_OR_OR; goto double_char;
case '=': type = TOKEN_ASSIGN_OR; goto double_char;
default: stream_reset_char(stream), type = TOKEN_OR; break;
} break;
case '^':
switch (stream_peek_char(stream)) {
case '=': type = TOKEN_ASSIGN_XOR; goto double_char;
default: stream_reset_char(stream), type = TOKEN_XOR; break;
} break;
case '<':
switch (stream_peek_char(stream)) {
case '=': type = TOKEN_LE; goto double_char;
case '<': {
if (stream_peek_char(stream) == '=') {
type = TOKEN_ASSIGN_L_SH;
goto triple_char;
} else {
type = TOKEN_L_SH;
goto double_char;
}
break;
}
default: stream_reset_char(stream), type = TOKEN_LT; break;
} break;
case '>':
switch (stream_peek_char(stream)) {
case '=': type = TOKEN_GE; goto double_char;
case '>': {
if (stream_peek_char(stream) == '=') {
type = TOKEN_ASSIGN_R_SH;
goto triple_char;
} else {
type = TOKEN_R_SH;
goto double_char;
}
break;
}
default: stream_reset_char(stream), type = TOKEN_GT; break;
} break;
case '~':
type = TOKEN_BIT_NOT; break;
case '!':
switch (stream_peek_char(stream)) {
case '=': type = TOKEN_NEQ; goto double_char;
default: stream_reset_char(stream), type = TOKEN_NOT; break;
} break;
case '[':
type = TOKEN_L_BRACKET; break;
case ']':
type = TOKEN_R_BRACKET; break;
case '(':
type = TOKEN_L_PAREN; break;
case ')':
type = TOKEN_R_PAREN; break;
case '{':
type = TOKEN_L_BRACE; break;
case '}':
type = TOKEN_R_BRACE; break;
case ';':
type = TOKEN_SEMICOLON; break;
case ',':
type = TOKEN_COMMA; break;
case ':':
type = TOKEN_COLON; break;
case '.':
if (stream_peek_char(stream) == '.' && stream_peek_char(stream) == '.') {
type = TOKEN_ELLIPSIS;
goto triple_char;
}
type = TOKEN_DOT; break;
case '?':
type = TOKEN_COND; break;
case '\v': case '\r': case '\f':
case ' ': case '\t':
type = TOKEN_BLANK; break;
case '\n':
// you need to flush a newline or blank
stream_next_char(stream);
lexer_next_line(lexer);
// FIXME some error
token->type = TOKEN_BLANK;
goto END;
case '#':
parse_line(lexer, token);
token->type = TOKEN_BLANK;
goto END;
case '\0':
case lexer_stream_eof:
// EOF
type = TOKEN_EOF;
break;
case '\'':
parse_char(lexer, token);
goto END;
case '"':
parse_string(lexer, token);
goto END;
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
parse_number(lexer, token);
goto END;
case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
case 's': case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z':
case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':case 'Y': case 'Z':
case '_':
// TOKEN_IDENT
// TODO
// if ((ch == 'L' && ch == '\'') || (ch == 'L' && ch == '"')) {
// LEX_ERROR("unsupport wide-character char literal by `L` format");
// }
cstring_t str = cstring_new();
while (1) {
ch = stream_peek_char(stream);
if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') ||
(ch == '_') || (ch >= '0' && ch <= '9')) {
stream_next_char(stream);
lexer_next_pos(lexer);
cstring_push(&str, ch);
continue;
}
break;
}
int res = keyword_cmp((const char*)str.data, str.len);
if (res == -1) {
token->value.cstr.data = (char*)cstring_as_cstr(&str);
token->value.cstr.len = cstring_len(&str);
type = TOKEN_IDENT; break;
} else {
type = keywords[res].tok; break;
}
default:
LEX_ERROR("unsupport char in sourse code `%c`", ch);
break;
}
goto once_char;
triple_char:
stream_next_char(stream);
lexer_next_pos(lexer);
double_char:
stream_next_char(stream);
lexer_next_pos(lexer);
once_char:
stream_next_char(stream);
lexer_next_pos(lexer);
token->type = type;
END:
LEX_DEBUG("get token `%s` in %s:%d:%d", get_tok_name(token->type),
token->loc.name, token->loc.line, token->loc.column);
}
// lexer_get_token maybe got invalid (with parser)
void lexer_get_valid_token(smcc_lexer_t* lexer, lexer_tok_t* token) {
token_subtype_t type;
do {
lexer_get_token(lexer, token);
type = get_tok_subtype(token->type);
AssertFmt(type != TK_BASIC_INVALID, "Invalid token: `%s` at %s:%d:%d",
get_tok_name(token->type), token->loc.name, token->loc.line, token->loc.column);
} while (type == TK_BASIC_EMPTYSPACE || type == TK_BASIC_COMMENT);
}

View File

@@ -0,0 +1,7 @@
#include <lexer_log.h>
logger_t __smcc_lexer_log = {
.name = "lexer",
.level = LOG_LEVEL_ALL,
.handler = log_default_handler,
};

101
libs/lexer/src/mem_stream.c Normal file
View File

@@ -0,0 +1,101 @@
#include <lexer_stream.h>
#include <lexer_log.h>
#include <libcore.h>
// 内存流的具体实现结构
static usize read_buf(lexer_stream_t* _stream, char* buffer, usize count) {
Assert(buffer != null && buffer != null);
lexer_mem_stream_t* stream = (lexer_mem_stream_t*)_stream;
usize remaining = stream->data_length - stream->curr_pos;
usize to_read = (remaining < count) ? remaining : count;
if (to_read > 0) {
smcc_memcpy(buffer, stream->data + stream->curr_pos, to_read);
stream->curr_pos += to_read;
} else {
LEX_WARN("Reading past end of stream [maybe count is too large or negative?]");
}
return to_read;
}
static int peek_char(lexer_stream_t* _stream) {
Assert(_stream != null);
lexer_mem_stream_t* stream = (lexer_mem_stream_t*)_stream;
// 如果已经到达末尾返回EOF
if (stream->peek_pos >= stream->data_length) {
return lexer_stream_eof; // EOF
}
return (int)(unsigned char)stream->data[stream->peek_pos++];
}
static int next_char(lexer_stream_t* _stream) {
Assert(_stream != NULL);
lexer_mem_stream_t* stream = (lexer_mem_stream_t*)_stream;
// 如果已经到达末尾返回EOF
if (stream->curr_pos >= stream->data_length) {
return lexer_stream_eof; // EOF
}
unsigned char ch = stream->data[stream->curr_pos++];
if (stream->peek_pos < stream->curr_pos) {
stream->peek_pos = stream->curr_pos;
}
return (int)ch;
}
static void reset_char(lexer_stream_t* _stream) {
Assert(_stream != NULL);
lexer_mem_stream_t* stream = (lexer_mem_stream_t*)_stream;
stream->peek_pos = stream->curr_pos;
}
static void free_stream(lexer_stream_t* _stream) {
Assert(_stream != null);
lexer_mem_stream_t* stream = (lexer_mem_stream_t*)_stream;
if (stream->owned) {
smcc_free((void*)stream->data);
}
}
lexer_stream_t* lexer_mem_stream_init(lexer_mem_stream_t* stream, const char* data, usize length, cbool need_copy) {
if (stream == null || data == NULL || length == 0) {
LEX_ERROR("param error");
return null;
}
stream->owned = need_copy;
if (need_copy) {
char* buf = (char*)smcc_malloc(length);
if (buf == null) {
LEX_ERROR("malloc error");
return null;
}
smcc_memcpy(buf, data, length);
stream->data = buf;
} else {
stream->data = data;
}
stream->data_length = length;
stream->curr_pos = 0;
stream->peek_pos = 0;
static const char name[] = "mem_stream";
stream->stream.name = name;
stream->stream.name_len = sizeof(name) - 1;
stream->stream.read_buf = read_buf;
stream->stream.peek_char = peek_char;
stream->stream.next_char = next_char;
stream->stream.reset_char = reset_char;
stream->stream.free_stream = free_stream;
return (void*)stream;
}

30
libs/lexer/src/token.c Normal file
View File

@@ -0,0 +1,30 @@
#include <lexer_token.h>
// 生成字符串映射(根据需求选择#str或#name
static const char* token_strings[] = {
#define X(str, subtype, tok) [tok] = #str,
TOKEN_TABLE
#undef X
#define X(str, subtype, tok, std) [tok] = #str,
KEYWORD_TABLE
#undef X
};
static token_subtype_t token_subtypes[] = {
#define X(str, subtype, tok) [tok] = subtype,
TOKEN_TABLE
#undef X
#define X(str, subtype, tok, std) [tok] = subtype,
KEYWORD_TABLE
#undef X
};
token_subtype_t get_tok_subtype(token_type_t type) {
return token_subtypes[type];
}
const char* get_tok_name(token_type_t type) {
return token_strings[type];
}

View File

@@ -0,0 +1,4 @@
int main() {
}

View File

@@ -0,0 +1,83 @@
#include <lexer.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
/// gcc -g ../lexer.c ../token.c test_lexer.c -o test_lexer
/*
tok_tConstant {
int have;
union {
char ch;
int i;
float f;
double d;
long long ll;
char* str;
};
};
*/
int g_num;
int g_num_arr[3];
int main(int argc, char* argv[]) {
// int num = 0;
if (argc == 3 && strcmp(argv[2], "-nodebug") == 0) {
log_set_level(NULL, LOG_LEVEL_ALL);
}
const char* file_name = __FILE__;
if (argc == 2) {
file_name = argv[1];
}
FILE* fp = fopen(file_name, "rb");
if (fp == NULL) {
perror("open file failed");
return 1;
}
printf("open file success\n");
if (fseek(fp, 0, SEEK_END) != 0) {
perror("fseek failed");
return 1;
}
usize fsize = ftell(fp);
LOG_INFO("file size: %zu", fsize);
if (fseek(fp, 0, SEEK_SET)) {
perror("fseek failed");
return 1;
}
char* buffer = (char*) malloc(fsize);
usize read_ret = fread(buffer, 1, fsize, fp);
fclose(fp);
if (read_ret != fsize) {
LOG_FATAL("fread failed read_ret %u != fsize %u", read_ret, fsize);
free(buffer);
return 1;
}
smcc_lexer_t lexer;
lexer_mem_stream_t mem_stream = {0};
lexer_stream_t* stream = lexer_mem_stream_init(&mem_stream, buffer, fsize, false);
Assert(stream != null);
stream->name = __FILE__;
stream->name_len = strlen(__FILE__);
lexer_init(&lexer, stream);
lexer_tok_t tok;
while (1) {
lexer_get_valid_token(&lexer, &tok);
if (tok.type == TOKEN_EOF) {
break;
}
LOG_INFO("token `%s` at %s:%u:%u", get_tok_name(tok.type), tok.loc.name, tok.loc.line, tok.loc.column);
Assert(tok.loc.offset <= fsize);
// LOG_DEBUG("%s", tok.val.str);
// printf("line: %d, column: %d, type: %3d, typename: %s\n",
// lexer.line, lexer.index, tok.type, get_tok_name(tok.type));
}
free(buffer);
}