fix(lexer): 修复词法分析器中的关键字比较与字符串处理逻辑

修正了关键字表的注释，明确要求其必须按字典序排列以确保二分查找正确性。在词法分析过程中，修复标识符解析时对 `cstring` 的使用问题，并调整 token 类型赋值顺序，避免潜在的未定义行为。同时新增测试文件用于验证操作符、关键字及各类字面量的识别准确性，并更新测试运行器的日志级别控制参数。
2025-11-20 22:49:22 +08:00
parent f29fd92fdf
commit 164bab0f13
5 changed files with 181 additions and 7 deletions
--- a/libs/lexer/include/lexer_token.h
+++ b/libs/lexer/include/lexer_token.h
@@ -9,8 +9,9 @@ typedef enum ckeyword {
    CEXT_ASM,
 } ckeyword_t;
 // Using Binary Search To Fast Find Keyword
 /* clang-format off */
 // WARNING: keywords are looked up with a binary search, so the entries
 // below MUST be kept in lexicographic order.
 #define KEYWORD_TABLE \
    X(asm           , TK_BASIC_KEYWORD  , TOKEN_ASM       , CEXT_ASM) \
    X(break         , TK_BASIC_KEYWORD  , TOKEN_BREAK     , CSTD_C89) \
--- a/libs/lexer/src/lexer.c
+++ b/libs/lexer/src/lexer.c
@@ -693,6 +693,8 @@ void lexer_get_token(smcc_lexer_t *lexer, lexer_tok_t *token) {
        //     LEX_ERROR("unsupport wide-character char literal by `L` format");
        // }
        cstring_t str = cstring_new();
        cstring_push(&str, stream_next_char(stream));
        lexer_next_pos(lexer);
        while (1) {
            ch = stream_peek_char(stream);
            if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') ||
@@ -705,16 +707,17 @@ void lexer_get_token(smcc_lexer_t *lexer, lexer_tok_t *token) {
            break;
        }
-        int res = keyword_cmp((const char *)str.data, str.size - 1);
+        int res = keyword_cmp(cstring_as_cstr(&str), cstring_len(&str));
        if (res == -1) {
            token->value.cstr.data = (char *)cstring_as_cstr(&str);
            token->value.cstr.len = cstring_len(&str);
            type = TOKEN_IDENT;
            break;
        } else {
            cstring_free(&str);
            type = keywords[res].tok;
            break;
        }
        token->type = type;
        goto END;
    default:
        LEX_ERROR("unsupport char in sourse code `%c`", ch);
        break;
--- a/libs/lexer/tests/test_number.c
+++ b/libs/lexer/tests/test_number.c
@@ -1,2 +0,0 @@
 int main() {}
--- a/libs/lexer/tests/test_parse.c
+++ b/libs/lexer/tests/test_parse.c
@@ -0,0 +1,170 @@
 // test_lexer.c
 #include <lexer.h>
 #include <string.h>
 #include <utest/acutest.h>
 // Test helper: lexes the first token of `input` from an in-memory stream and
 // checks that its type equals `expected_type`. The two TEST_MSG lines record
 // the expected and the actual token name for the preceding check.
 static inline void test_lexer_string(const char *input,
                                     token_type_t expected_type) {
    core_mem_stream_t mem_stream;
    smcc_lexer_t lex;
    lexer_tok_t token;
    const size_t input_len = strlen(input);
    // NOTE(review): the meaning of the trailing `false` flag is defined by
    // core_mem_stream_init and is not visible here -- confirm before changing.
    lexer_init(&lex,
               core_mem_stream_init(&mem_stream, input, input_len, false));
    // Only the first token is inspected; the rest of the input is ignored.
    lexer_get_token(&lex, &token);
    TEST_CHECK(token.type == expected_type);
    TEST_MSG("Expected: %s", get_tok_name(expected_type));
    TEST_MSG("Got: %s", get_tok_name(token.type));
 }
 // 基础运算符测试
 void test_operators() {
    TEST_CASE("Arithmetic operators");
    {
        test_lexer_string("+", TOKEN_ADD);
        test_lexer_string("++", TOKEN_ADD_ADD);
        test_lexer_string("+=", TOKEN_ASSIGN_ADD);
        test_lexer_string("-", TOKEN_SUB);
        test_lexer_string("--", TOKEN_SUB_SUB);
        test_lexer_string("-=", TOKEN_ASSIGN_SUB);
        test_lexer_string("*", TOKEN_MUL);
        test_lexer_string("*=", TOKEN_ASSIGN_MUL);
        test_lexer_string("/", TOKEN_DIV);
        test_lexer_string("/=", TOKEN_ASSIGN_DIV);
        test_lexer_string("%", TOKEN_MOD);
        test_lexer_string("%=", TOKEN_ASSIGN_MOD);
    }
    TEST_CASE("Bitwise operators");
    {
        test_lexer_string("&", TOKEN_AND);
        test_lexer_string("&&", TOKEN_AND_AND);
        test_lexer_string("&=", TOKEN_ASSIGN_AND);
        test_lexer_string("|", TOKEN_OR);
        test_lexer_string("||", TOKEN_OR_OR);
        test_lexer_string("|=", TOKEN_ASSIGN_OR);
        test_lexer_string("^", TOKEN_XOR);
        test_lexer_string("^=", TOKEN_ASSIGN_XOR);
        test_lexer_string("~", TOKEN_BIT_NOT);
        test_lexer_string("<<", TOKEN_L_SH);
        test_lexer_string("<<=", TOKEN_ASSIGN_L_SH);
        test_lexer_string(">>", TOKEN_R_SH);
        test_lexer_string(">>=", TOKEN_ASSIGN_R_SH);
    }
    TEST_CASE("Comparison operators");
    {
        test_lexer_string("==", TOKEN_EQ);
        test_lexer_string("!=", TOKEN_NEQ);
        test_lexer_string("<", TOKEN_LT);
        test_lexer_string("<=", TOKEN_LE);
        test_lexer_string(">", TOKEN_GT);
        test_lexer_string(">=", TOKEN_GE);
    }
    TEST_CASE("Special symbols");
    {
        test_lexer_string("(", TOKEN_L_PAREN);
        test_lexer_string(")", TOKEN_R_PAREN);
        test_lexer_string("[", TOKEN_L_BRACKET);
        test_lexer_string("]", TOKEN_R_BRACKET);
        test_lexer_string("{", TOKEN_L_BRACE);
        test_lexer_string("}", TOKEN_R_BRACE);
        test_lexer_string(";", TOKEN_SEMICOLON);
        test_lexer_string(",", TOKEN_COMMA);
        test_lexer_string(":", TOKEN_COLON);
        test_lexer_string(".", TOKEN_DOT);
        test_lexer_string("...", TOKEN_ELLIPSIS);
        test_lexer_string("->", TOKEN_DEREF);
        test_lexer_string("?", TOKEN_COND);
    }
 }
 // 关键字测试
 void test_keywords() {
    TEST_CASE("C89 keywords");
    test_lexer_string("while", TOKEN_WHILE);
    test_lexer_string("sizeof", TOKEN_SIZEOF);
    TEST_CASE("C99 keywords");
    test_lexer_string("restrict", TOKEN_RESTRICT);
    // test_lexer_string("_Bool", TOKEN_INT); // 需确认你的类型定义
 }
 // 字面量测试
 void test_literals() {
    TEST_CASE("Integer literals");
    {
        // 十进制
        test_lexer_string("0", TOKEN_INT_LITERAL);
        test_lexer_string("123", TOKEN_INT_LITERAL);
        test_lexer_string("2147483647", TOKEN_INT_LITERAL);
        // 十六进制
        test_lexer_string("0x0", TOKEN_INT_LITERAL);
        test_lexer_string("0x1A3F", TOKEN_INT_LITERAL);
        test_lexer_string("0XABCDEF", TOKEN_INT_LITERAL);
        // 八进制
        test_lexer_string("0123", TOKEN_INT_LITERAL);
        test_lexer_string("0777", TOKEN_INT_LITERAL);
        // 边界值测试
        test_lexer_string("2147483647", TOKEN_INT_LITERAL); // INT_MAX
        test_lexer_string("4294967295", TOKEN_INT_LITERAL); // UINT_MAX
    }
    TEST_CASE("Character literals");
    {
        test_lexer_string("'a'", TOKEN_CHAR_LITERAL);
        test_lexer_string("'\\n'", TOKEN_CHAR_LITERAL);
        test_lexer_string("'\\t'", TOKEN_CHAR_LITERAL);
        test_lexer_string("'\\\\'", TOKEN_CHAR_LITERAL);
        test_lexer_string("'\\0'", TOKEN_CHAR_LITERAL);
    }
    TEST_CASE("String literals");
    {
        test_lexer_string("\"hello\"", TOKEN_STRING_LITERAL);
        test_lexer_string("\"multi-line\\nstring\"", TOKEN_STRING_LITERAL);
        test_lexer_string("\"escape\\\"quote\"", TOKEN_STRING_LITERAL);
    }
    // TEST_CASE("Floating literals");
    // test_lexer_string("3.14e-5", TOKEN_FLOAT_LITERAL);
 }
 // Edge-case tests. The body is currently empty: both scenarios below
 // (an identifier longer than LEXER_MAX_TOKEN_SIZE and input spanning
 // LEXER_BUFFER_SIZE) are kept only as disabled sketches, so this test
 // performs no checks when run.
 void test_edge_cases() {
    // TEST_CASE("Long identifiers");
    // char long_id[LEXER_MAX_TOKEN_SIZE+2] = {0};
    // memset(long_id, 'a', LEXER_MAX_TOKEN_SIZE+1);
    // test_lexer_string(long_id, TOKEN_IDENT);
    // TEST_CASE("Buffer boundary");
    // char boundary[LEXER_BUFFER_SIZE*2] = {0};
    // memset(boundary, '+', LEXER_BUFFER_SIZE*2-1);
    // test_lexer_string(boundary, TOKEN_ADD);
 }
 // 错误处理测试
 // void test_error_handling() {
 //     TEST_CASE("Invalid characters");
 //     cc_lexer_t lexer;
 //     tok_t token;
 //     init_lexer(&lexer, "test.c", NULL, test_read);
 //     get_valid_token(&lexer, &token);
 //     TEST_CHECK(token.type == TOKEN_EOF); // 应触发错误处理
 // }
 // Test list: maps each test name to its test function. The {NULL, NULL}
 // entry is the last element of the table. The error_handling entry stays
 // commented out together with its disabled test function.
 TEST_LIST = {{"operators", test_operators},
             {"keywords", test_keywords},
             {"literals", test_literals},
             {"edge_cases", test_edge_cases},
             //  {"error_handling", test_error_handling},
             {NULL, NULL}};
--- a/libs/lexer/tests/test_run.c
+++ b/libs/lexer/tests/test_run.c
@@ -22,7 +22,9 @@ int g_num;
 int g_num_arr[3];
 int main(int argc, char *argv[]) {
    // int num = 0;
-    if (argc == 3 && strcmp(argv[2], "-nodebug") == 0) {
+    if (argc == 3 && strcmp(argv[2], "--debug") == 0) {
        log_set_level(NULL, LOG_LEVEL_ALL);
    } else {
        log_set_level(NULL, LOG_LEVEL_INFO | LOG_LEVEL_WARN | LOG_LEVEL_ERROR);
    }