Files
scc/libs/lexer/tests/test_lexer.c
zzy 2e331ee016 feat(ast): 添加内置类型定义和AST节点初始化函数
添加了完整的内置类型支持,包括整数、浮点数、字符、布尔等基本类型,
以及它们的有符号/无符号变体。同时添加了大量的AST节点初始化函数,
简化了AST节点的创建过程。

BREAKING CHANGE: 重构了AST表达式和声明结构,移除了冗余字段,
统一了命名规范,并修改了函数调用和成员访问的表示方式。
2026-03-10 13:56:32 +08:00

414 lines
17 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// test_lexer.c
#include <scc_lexer.h>
#include <string.h>
#include <utest/acutest.h>
/* Helper: release the heap-owned lexeme string attached to a token. */
static void free_token(scc_lexer_tok_t *tok) {
    scc_cstring_free(&tok->lexeme);
}
/* Single-token test macro (checks the token type).
 *
 * Lexes `input` and asserts that the FIRST token produced has
 * `expected_type`; the TEST_MSG lines give context when the check fails.
 * All macro arguments are parenthesized at each use site so the macro
 * stays hygienic even if a caller passes a non-trivial expression. */
#define TEST_TOKEN(input, expected_type)                                      \
    do {                                                                      \
        scc_lexer_t lexer;                                                    \
        scc_lexer_tok_t token;                                                \
        scc_sstream_t stream;                                                 \
        scc_sstream_init_by_buffer(&stream, (input), strlen((input)), 0, 16); \
        scc_sstream_ring_t *ref = scc_sstream_to_ring(&stream);               \
        scc_lexer_init(&lexer, ref);                                          \
        scc_lexer_get_token(&lexer, &token);                                  \
                                                                              \
        TEST_CHECK(token.type == (expected_type));                            \
        TEST_MSG("Input: '%s'", (input));                                     \
        TEST_MSG("Expected: %s", scc_get_tok_name((expected_type)));          \
        TEST_MSG("Got: %s", scc_get_tok_name(token.type));                    \
                                                                              \
        /* Release the token's lexeme, then the ring and its stream. */       \
        free_token(&token);                                                   \
        scc_sstream_drop_ring(ref);                                           \
        scc_sstream_drop(&stream);                                            \
    } while (0)
/* Multi-token sequence test macro (takes a variadic list of token types).
 *
 * Lexes `input` and asserts that the token types produced match the
 * __VA_ARGS__ list, in order. Each token's lexeme is freed as soon as it
 * has been checked. Macro arguments are parenthesized at each use site
 * for hygiene. */
#define TEST_SEQUENCE(input, ...)                                             \
    do {                                                                      \
        scc_lexer_t lexer;                                                    \
        scc_lexer_tok_t token;                                                \
        scc_sstream_t stream;                                                 \
        scc_sstream_init_by_buffer(&stream, (input), strlen((input)), 0, 16); \
        scc_sstream_ring_t *ref = scc_sstream_to_ring(&stream);               \
        scc_lexer_init(&lexer, ref);                                          \
                                                                              \
        scc_tok_type_t expected[] = {__VA_ARGS__};                            \
        size_t count = sizeof(expected) / sizeof(expected[0]);                \
        for (size_t i = 0; i < count; i++) {                                  \
            scc_lexer_get_token(&lexer, &token);                              \
            TEST_CHECK(token.type == expected[i]);                            \
            TEST_MSG("Token %zu: input '%s'", i, (input));                    \
            TEST_MSG("Expected: %s", scc_get_tok_name(expected[i]));          \
            TEST_MSG("Got: %s", scc_get_tok_name(token.type));                \
            free_token(&token);                                               \
        }                                                                     \
                                                                              \
        scc_sstream_drop_ring(ref);                                           \
        scc_sstream_drop(&stream);                                            \
    } while (0)
// ============================ Test cases ============================
/* Operator lexing: arithmetic, bitwise, comparison operators and
 * punctuation/special symbols, each fed to the lexer in isolation. */
void test_operators() {
TEST_CASE("Arithmetic operators");
TEST_TOKEN("+", SCC_TOK_ADD);
TEST_TOKEN("++", SCC_TOK_ADD_ADD);
TEST_TOKEN("+=", SCC_TOK_ASSIGN_ADD);
TEST_TOKEN("-", SCC_TOK_SUB);
TEST_TOKEN("--", SCC_TOK_SUB_SUB);
TEST_TOKEN("-=", SCC_TOK_ASSIGN_SUB);
TEST_TOKEN("*", SCC_TOK_MUL);
TEST_TOKEN("*=", SCC_TOK_ASSIGN_MUL);
TEST_TOKEN("/", SCC_TOK_DIV);
TEST_TOKEN("/=", SCC_TOK_ASSIGN_DIV);
TEST_TOKEN("%", SCC_TOK_MOD);
TEST_TOKEN("%=", SCC_TOK_ASSIGN_MOD);
TEST_CASE("Bitwise operators");
TEST_TOKEN("&", SCC_TOK_AND);
TEST_TOKEN("&&", SCC_TOK_AND_AND);
TEST_TOKEN("&=", SCC_TOK_ASSIGN_AND);
TEST_TOKEN("|", SCC_TOK_OR);
TEST_TOKEN("||", SCC_TOK_OR_OR);
TEST_TOKEN("|=", SCC_TOK_ASSIGN_OR);
TEST_TOKEN("^", SCC_TOK_XOR);
TEST_TOKEN("^=", SCC_TOK_ASSIGN_XOR);
TEST_TOKEN("~", SCC_TOK_BIT_NOT);
TEST_TOKEN("<<", SCC_TOK_L_SH);
TEST_TOKEN("<<=", SCC_TOK_ASSIGN_L_SH);
TEST_TOKEN(">>", SCC_TOK_R_SH);
TEST_TOKEN(">>=", SCC_TOK_ASSIGN_R_SH);
TEST_CASE("Comparison operators");
TEST_TOKEN("==", SCC_TOK_EQ);
TEST_TOKEN("!=", SCC_TOK_NEQ);
TEST_TOKEN("<", SCC_TOK_LT);
TEST_TOKEN("<=", SCC_TOK_LE);
TEST_TOKEN(">", SCC_TOK_GT);
TEST_TOKEN(">=", SCC_TOK_GE);
TEST_CASE("Special symbols");
TEST_TOKEN("(", SCC_TOK_L_PAREN);
TEST_TOKEN(")", SCC_TOK_R_PAREN);
TEST_TOKEN("[", SCC_TOK_L_BRACKET);
TEST_TOKEN("]", SCC_TOK_R_BRACKET);
TEST_TOKEN("{", SCC_TOK_L_BRACE);
TEST_TOKEN("}", SCC_TOK_R_BRACE);
TEST_TOKEN(";", SCC_TOK_SEMICOLON);
TEST_TOKEN(",", SCC_TOK_COMMA);
TEST_TOKEN(":", SCC_TOK_COLON);
TEST_TOKEN(".", SCC_TOK_DOT);
TEST_TOKEN("...", SCC_TOK_ELLIPSIS);
TEST_TOKEN("->", SCC_TOK_DEREF);
TEST_TOKEN("?", SCC_TOK_COND);
}
/* Keyword recognition: C89 keywords, C99 additions, and SCC extension
 * keywords, each expected to map to its dedicated token type. */
void test_keywords() {
TEST_CASE("C89 keywords");
TEST_TOKEN("while", SCC_TOK_WHILE);
TEST_TOKEN("sizeof", SCC_TOK_SIZEOF);
TEST_TOKEN("if", SCC_TOK_IF);
TEST_TOKEN("else", SCC_TOK_ELSE);
TEST_TOKEN("for", SCC_TOK_FOR);
TEST_TOKEN("do", SCC_TOK_DO);
TEST_TOKEN("switch", SCC_TOK_SWITCH);
TEST_TOKEN("case", SCC_TOK_CASE);
TEST_TOKEN("default", SCC_TOK_DEFAULT);
TEST_TOKEN("break", SCC_TOK_BREAK);
TEST_TOKEN("continue", SCC_TOK_CONTINUE);
TEST_TOKEN("return", SCC_TOK_RETURN);
TEST_TOKEN("goto", SCC_TOK_GOTO);
TEST_TOKEN("auto", SCC_TOK_AUTO);
TEST_TOKEN("register", SCC_TOK_REGISTER);
TEST_TOKEN("static", SCC_TOK_STATIC);
TEST_TOKEN("extern", SCC_TOK_EXTERN);
TEST_TOKEN("typedef", SCC_TOK_TYPEDEF);
TEST_TOKEN("const", SCC_TOK_CONST);
TEST_TOKEN("volatile", SCC_TOK_VOLATILE);
TEST_TOKEN("signed", SCC_TOK_SIGNED);
TEST_TOKEN("unsigned", SCC_TOK_UNSIGNED);
TEST_TOKEN("short", SCC_TOK_SHORT);
TEST_TOKEN("long", SCC_TOK_LONG);
TEST_TOKEN("int", SCC_TOK_INT);
TEST_TOKEN("char", SCC_TOK_CHAR);
TEST_TOKEN("float", SCC_TOK_FLOAT);
TEST_TOKEN("double", SCC_TOK_DOUBLE);
TEST_TOKEN("void", SCC_TOK_VOID);
TEST_TOKEN("struct", SCC_TOK_STRUCT);
TEST_TOKEN("union", SCC_TOK_UNION);
TEST_TOKEN("enum", SCC_TOK_ENUM);
TEST_CASE("C99 keywords");
TEST_TOKEN("inline", SCC_TOK_INLINE);
TEST_TOKEN("restrict", SCC_TOK_RESTRICT);
// _Bool, _Complex, _Imaginary can be added here as needed
TEST_CASE("SCC extensions (if enabled)");
TEST_TOKEN("asm", SCC_TOK_ASM);
TEST_TOKEN("atomic", SCC_TOK_ATOMIC);
TEST_TOKEN("bool", SCC_TOK_BOOL);
TEST_TOKEN("complex", SCC_TOK_COMPLEX);
}
/* Literal lexing: integer literals in every supported base (with suffixes),
 * floating literals (decimal, scientific, C99 hex, suffixes), character
 * literals (plain, escapes, multi-byte), and string literals (plain,
 * escapes, C11 wide/UTF prefixes). */
void test_literals() {
TEST_CASE("Integer literals - decimal");
TEST_TOKEN("0", SCC_TOK_INT_LITERAL);
TEST_TOKEN("123", SCC_TOK_INT_LITERAL);
TEST_TOKEN("2147483647", SCC_TOK_INT_LITERAL);
TEST_TOKEN("4294967295", SCC_TOK_INT_LITERAL);
TEST_CASE("Integer literals - hexadecimal");
TEST_TOKEN("0x0", SCC_TOK_INT_LITERAL);
TEST_TOKEN("0x1A3F", SCC_TOK_INT_LITERAL);
TEST_TOKEN("0XABCDEF", SCC_TOK_INT_LITERAL);
TEST_TOKEN("0x123abc", SCC_TOK_INT_LITERAL);
TEST_TOKEN("0XFF", SCC_TOK_INT_LITERAL);
TEST_CASE("Integer literals - octal");
TEST_TOKEN("0123", SCC_TOK_INT_LITERAL);
TEST_TOKEN("0777", SCC_TOK_INT_LITERAL);
TEST_TOKEN("0", SCC_TOK_INT_LITERAL); // 0 is both decimal and octal
TEST_CASE("Integer literals - binary (C23 extension)");
TEST_TOKEN("0b1010", SCC_TOK_INT_LITERAL);
TEST_TOKEN("0B1100", SCC_TOK_INT_LITERAL);
TEST_TOKEN("0b0", SCC_TOK_INT_LITERAL);
TEST_CASE("Integer literals with suffixes");
TEST_TOKEN("123U", SCC_TOK_INT_LITERAL);
TEST_TOKEN("456L", SCC_TOK_INT_LITERAL);
TEST_TOKEN("789UL", SCC_TOK_INT_LITERAL);
TEST_TOKEN("0x1FFLL", SCC_TOK_INT_LITERAL);
TEST_TOKEN("0b1010ULL", SCC_TOK_INT_LITERAL);
TEST_CASE("Floating literals - decimal");
TEST_TOKEN("0.0", SCC_TOK_FLOAT_LITERAL);
TEST_TOKEN("3.14", SCC_TOK_FLOAT_LITERAL);
TEST_TOKEN(".5", SCC_TOK_FLOAT_LITERAL);
TEST_TOKEN("0.", SCC_TOK_FLOAT_LITERAL);
TEST_CASE("Floating literals - scientific");
TEST_TOKEN("1e10", SCC_TOK_FLOAT_LITERAL);
TEST_TOKEN("1E-5", SCC_TOK_FLOAT_LITERAL);
TEST_TOKEN("2.5e+3", SCC_TOK_FLOAT_LITERAL);
TEST_TOKEN(".1e2", SCC_TOK_FLOAT_LITERAL);
TEST_TOKEN("1.e3", SCC_TOK_FLOAT_LITERAL);
TEST_TOKEN("123.456e-7", SCC_TOK_FLOAT_LITERAL);
TEST_CASE("Floating literals - hexadecimal (C99)");
TEST_TOKEN("0x1.2p3", SCC_TOK_FLOAT_LITERAL);
TEST_TOKEN("0x1p-2", SCC_TOK_FLOAT_LITERAL);
TEST_TOKEN("0x0.1p10", SCC_TOK_FLOAT_LITERAL);
TEST_TOKEN("0X1.2P3", SCC_TOK_FLOAT_LITERAL);
TEST_CASE("Floating literals with suffixes");
TEST_TOKEN("1.0f", SCC_TOK_FLOAT_LITERAL);
TEST_TOKEN("2.0F", SCC_TOK_FLOAT_LITERAL);
TEST_TOKEN("3.0l", SCC_TOK_FLOAT_LITERAL);
TEST_TOKEN("4.0L", SCC_TOK_FLOAT_LITERAL);
TEST_CASE("Character literals - simple");
TEST_TOKEN("'a'", SCC_TOK_CHAR_LITERAL);
TEST_TOKEN("'0'", SCC_TOK_CHAR_LITERAL);
TEST_TOKEN("' '", SCC_TOK_CHAR_LITERAL);
TEST_TOKEN("'\t'", SCC_TOK_CHAR_LITERAL); // a literal tab character inside single quotes
TEST_CASE("Character literals - escape sequences");
TEST_TOKEN("'\\n'", SCC_TOK_CHAR_LITERAL);
TEST_TOKEN("'\\t'", SCC_TOK_CHAR_LITERAL);
TEST_TOKEN("'\\\\'", SCC_TOK_CHAR_LITERAL);
TEST_TOKEN("'\\''", SCC_TOK_CHAR_LITERAL);
TEST_TOKEN("'\\\"'", SCC_TOK_CHAR_LITERAL);
TEST_TOKEN("'\\?'", SCC_TOK_CHAR_LITERAL);
TEST_TOKEN("'\\0'", SCC_TOK_CHAR_LITERAL);
TEST_TOKEN("'\\123'", SCC_TOK_CHAR_LITERAL); // octal escape
TEST_TOKEN("'\\xAB'", SCC_TOK_CHAR_LITERAL); // hexadecimal escape
TEST_CASE("Character literals - multi-byte (implementation defined)");
TEST_TOKEN("'ab'", SCC_TOK_CHAR_LITERAL);
TEST_TOKEN("'\\x41\\x42'", SCC_TOK_CHAR_LITERAL); // multiple escapes
TEST_CASE("String literals - basic");
TEST_TOKEN("\"hello\"", SCC_TOK_STRING_LITERAL);
TEST_TOKEN("\"\"", SCC_TOK_STRING_LITERAL);
TEST_TOKEN("\"a b c\"", SCC_TOK_STRING_LITERAL);
TEST_CASE("String literals - escape sequences");
TEST_TOKEN("\"a\\nb\\tc\"", SCC_TOK_STRING_LITERAL);
TEST_TOKEN("\"\\\\ \\\" \\' \\?\"", SCC_TOK_STRING_LITERAL);
TEST_TOKEN("\"\\123\\xAB\"", SCC_TOK_STRING_LITERAL);
TEST_CASE("String literals - wide and UTF-8 prefixes (C11)");
TEST_TOKEN("L\"wide\"", SCC_TOK_STRING_LITERAL);
TEST_TOKEN("u\"utf16\"", SCC_TOK_STRING_LITERAL);
TEST_TOKEN("U\"utf32\"", SCC_TOK_STRING_LITERAL);
TEST_TOKEN("u8\"utf8\"", SCC_TOK_STRING_LITERAL);
}
/* Whitespace lexing: each blank character — and a run of them — is
 * expected to produce a single SCC_TOK_BLANK token. */
void test_whitespace() {
TEST_CASE("Whitespace characters");
TEST_TOKEN(" ", SCC_TOK_BLANK);
TEST_TOKEN("\t", SCC_TOK_BLANK);
TEST_TOKEN("\v", SCC_TOK_BLANK);
TEST_TOKEN("\f", SCC_TOK_BLANK);
TEST_TOKEN(" \t\v\f", SCC_TOK_BLANK); // consecutive blanks should collapse into one token
}
/* Newline lexing: \n, \r and the \r\n pair each yield one ENDLINE token. */
void test_newlines() {
TEST_CASE("Newline characters");
TEST_TOKEN("\n", SCC_TOK_ENDLINE);
TEST_TOKEN("\r", SCC_TOK_ENDLINE);
TEST_TOKEN("\r\n", SCC_TOK_ENDLINE); // CRLF should be treated as a single newline
}
/* Comment lexing: line comments and block comments, including the empty
 * block comment and a "nested" opener (C block comments do not nest). */
void test_comments() {
TEST_CASE("Line comments");
TEST_TOKEN("// single line comment", SCC_TOK_LINE_COMMENT);
TEST_TOKEN("// comment with // inside", SCC_TOK_LINE_COMMENT);
TEST_TOKEN("// comment at end", SCC_TOK_LINE_COMMENT);
TEST_CASE("Block comments");
TEST_TOKEN("/* simple */", SCC_TOK_BLOCK_COMMENT);
TEST_TOKEN("/* multi\nline */", SCC_TOK_BLOCK_COMMENT);
TEST_TOKEN("/**/", SCC_TOK_BLOCK_COMMENT); // empty comment
TEST_TOKEN("/* with * inside */", SCC_TOK_BLOCK_COMMENT);
TEST_TOKEN("/* nested /* not allowed in C */",
SCC_TOK_BLOCK_COMMENT); // lexically, block comments never nest
}
/* Identifier lexing: plain names, leading underscores, digits, a lone
 * underscore, double-underscore names, and a very long identifier. */
void test_identifiers() {
TEST_CASE("Valid identifiers");
TEST_TOKEN("foo", SCC_TOK_IDENT);
TEST_TOKEN("_foo", SCC_TOK_IDENT);
TEST_TOKEN("foo123", SCC_TOK_IDENT);
TEST_TOKEN("foo_bar", SCC_TOK_IDENT);
TEST_TOKEN("FOO", SCC_TOK_IDENT);
TEST_TOKEN("_", SCC_TOK_IDENT);
TEST_TOKEN("__LINE__", SCC_TOK_IDENT); // preprocessor macro names are identifiers too
// Very long identifier (assumes the lexer's buffer is large enough).
// Standard memset is used here for consistency with the direct strlen()
// calls elsewhere in this file (<string.h> is already included).
char long_id[1024];
memset(long_id, 'a', sizeof(long_id) - 1);
long_id[sizeof(long_id) - 1] = '\0';
TEST_TOKEN(long_id, SCC_TOK_IDENT);
}
/* Preprocessor-related lexing: the # and ## tokens on their own, then
 * whole directive lines checked token-by-token via TEST_SEQUENCE. */
void test_preprocessor() {
TEST_CASE("Preprocessor directives - just the # token");
TEST_TOKEN("#", SCC_TOK_SHARP);
TEST_TOKEN("##", SCC_TOK_SHARP_SHARP);
// Multi-token sequence tests for #include and friends
TEST_SEQUENCE("#include <stdio.h>", SCC_TOK_SHARP, SCC_TOK_IDENT,
SCC_TOK_BLANK, SCC_TOK_LT, SCC_TOK_IDENT, SCC_TOK_DOT,
SCC_TOK_IDENT, SCC_TOK_GT);
TEST_SEQUENCE("#define FOO 123", SCC_TOK_SHARP, SCC_TOK_IDENT,
SCC_TOK_BLANK, SCC_TOK_IDENT, SCC_TOK_BLANK,
SCC_TOK_INT_LITERAL);
TEST_SEQUENCE("#define FOO(x) x + 1", SCC_TOK_SHARP, SCC_TOK_IDENT,
SCC_TOK_BLANK, SCC_TOK_IDENT, SCC_TOK_L_PAREN, SCC_TOK_IDENT,
SCC_TOK_R_PAREN, SCC_TOK_BLANK, SCC_TOK_IDENT, SCC_TOK_BLANK,
SCC_TOK_ADD, SCC_TOK_BLANK, SCC_TOK_INT_LITERAL);
TEST_SEQUENCE("#undef FOO", SCC_TOK_SHARP, SCC_TOK_IDENT, SCC_TOK_BLANK,
SCC_TOK_IDENT);
TEST_SEQUENCE("#error \"This is an error\"", SCC_TOK_SHARP, SCC_TOK_IDENT,
SCC_TOK_BLANK, SCC_TOK_STRING_LITERAL);
TEST_SEQUENCE("#warning \"This is an warning\"\n", SCC_TOK_SHARP,
SCC_TOK_IDENT, SCC_TOK_BLANK, SCC_TOK_STRING_LITERAL,
SCC_TOK_ENDLINE);
}
/* Edge cases: characters invalid in C, empty input (immediate EOF),
 * pure whitespace, and numbers immediately followed by letters. */
void test_edge_cases() {
TEST_CASE("Invalid characters");
TEST_TOKEN("@", SCC_TOK_UNKNOWN);
TEST_TOKEN("`", SCC_TOK_UNKNOWN);
TEST_TOKEN("$", SCC_TOK_UNKNOWN); // not an identifier character in C
TEST_CASE("Empty input");
TEST_TOKEN("", SCC_TOK_EOF); // immediate EOF
TEST_CASE("Only whitespace");
TEST_TOKEN(" \t", SCC_TOK_BLANK);
// EOF should follow, but the single-token test only inspects the first token
TEST_CASE("Numbers followed by letters (no suffix)");
// Lexically these should split into a number token and an identifier
TEST_SEQUENCE("123abc", SCC_TOK_INT_LITERAL, SCC_TOK_IDENT);
TEST_SEQUENCE("0x123xyz", SCC_TOK_INT_LITERAL, SCC_TOK_IDENT);
}
/* Realistic token streams: expressions, a call statement, maximal-munch
 * multi-character operators, interleaved comments/whitespace, and
 * prefixed/escaped literals checked as sequences. */
void test_sequences() {
TEST_CASE("Simple expression");
TEST_SEQUENCE("a + b * c", SCC_TOK_IDENT, SCC_TOK_BLANK, SCC_TOK_ADD,
SCC_TOK_BLANK, SCC_TOK_IDENT, SCC_TOK_BLANK, SCC_TOK_MUL,
SCC_TOK_BLANK, SCC_TOK_IDENT);
TEST_CASE("Function call");
TEST_SEQUENCE("func(1, 2);", SCC_TOK_IDENT, SCC_TOK_L_PAREN,
SCC_TOK_INT_LITERAL, SCC_TOK_COMMA, SCC_TOK_BLANK,
SCC_TOK_INT_LITERAL, SCC_TOK_R_PAREN, SCC_TOK_SEMICOLON);
TEST_CASE("Multi-character operators");
TEST_SEQUENCE(">>=", SCC_TOK_ASSIGN_R_SH);
TEST_SEQUENCE("<<=", SCC_TOK_ASSIGN_L_SH);
TEST_SEQUENCE("...", SCC_TOK_ELLIPSIS);
TEST_SEQUENCE("->", SCC_TOK_DEREF);
TEST_SEQUENCE("##", SCC_TOK_SHARP_SHARP); // token-paste operator, lexed as one token
TEST_CASE("Comments and whitespace interleaved");
TEST_SEQUENCE("/* comment */ a // line comment\n b", SCC_TOK_BLOCK_COMMENT,
SCC_TOK_BLANK, SCC_TOK_IDENT, SCC_TOK_BLANK,
SCC_TOK_LINE_COMMENT, SCC_TOK_ENDLINE, SCC_TOK_BLANK,
SCC_TOK_IDENT);
TEST_CASE("String literals with escapes");
TEST_SEQUENCE("\"hello\\nworld\"", SCC_TOK_STRING_LITERAL);
TEST_SEQUENCE(
"L\"wide\"",
SCC_TOK_STRING_LITERAL); // prefix as a separate identifier? No — the whole thing is one string literal
TEST_CASE("Character literals with escapes");
TEST_SEQUENCE("'\\x41'", SCC_TOK_CHAR_LITERAL);
TEST_SEQUENCE("'\\123'", SCC_TOK_CHAR_LITERAL);
}
/* Error-recovery tests. Currently all cases are disabled (commented out)
 * because the expected behavior for unterminated literals/comments is not
 * yet decided — see the notes below. The function body is intentionally
 * empty so the test still registers and passes. */
void test_error_recovery() {
// Unterminated character literal: the lexer may keep going until a
// newline or EOF. It might produce a SCC_TOK_CHAR_LITERAL spanning to the
// end, but since unterminated literals are errors in standard C it could
// also return UNKNOWN.
// TEST_CASE("Unterminated character literal");
// TEST_TOKEN("'a", SCC_TOK_UNKNOWN); // implementation-defined; could be CHAR_LITERAL
// // A more reliable test: what is the next token in the sequence?
// TEST_SEQUENCE("'a b", SCC_TOK_UNKNOWN,
// SCC_TOK_IDENT); // assumes the first token is an error
// TEST_CASE("Unterminated string literal");
// TEST_TOKEN("\"hello", SCC_TOK_UNKNOWN); // same reasoning as above
// TEST_CASE("Unterminated block comment");
// TEST_SEQUENCE("/* comment",
// SCC_TOK_BLOCK_COMMENT); // runs to EOF; may still count as a comment
}
// ============================ Main test list ============================
/* acutest registration table: maps each test name to its function.
 * The {NULL, NULL} entry terminates the list (required by acutest). */
TEST_LIST = {
{"operators", test_operators},
{"keywords", test_keywords},
{"literals", test_literals},
{"whitespace", test_whitespace},
{"newlines", test_newlines},
{"comments", test_comments},
{"identifiers", test_identifiers},
{"preprocessor", test_preprocessor},
{"edge_cases", test_edge_cases},
{"sequences", test_sequences},
{"error_recovery", test_error_recovery},
{NULL, NULL},
};