Files
scc/libs/lexer/src/lexer.c
zzy f56b13da2c refactor(format): 移除SCF格式相关文件
移除了libs/format目录下的所有文件,包括:
- cbuild.toml构建配置文件
- include/scf.h头文件
- include/scf_impl.h实现头文件
- src/scf.c源文件
- tests/test_scf.c测试文件
- tests/test_scf_x64.c x64架构测试文件

这些文件包含了SCF(scc format)格式的完整实现,但现在不再需要。

feat(lexer): 添加布尔字面量数字生成函数

在lexer工具头文件中添加了两个内联函数用于生成布尔值的数字字面量:
- scc_lexer_gen_number_true: 将token类型设为整数字面量,值为"1"
- scc_lexer_gen_number_false: 将token类型设为整数字面量,值为"0"

refactor(lexer): 改进词法分析器错误处理

- 移除了多余的头文件包含
- 更新错误报告方式,使用SCC_ERROR宏替代LEX_ERROR,提供更准确的错误位置信息

refactor(pproc): 更新预处理器扩展器数据结构

- 将need_rescan字段类型从int改为cbool
- 添加need_parse_defined字段用于控制defined操作符解析
- 更新函数签名以支持defined操作符解析参数
2026-02-26 10:25:45 +08:00

526 lines
17 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#include <lexer_log.h>
#include <scc_lexer.h>
// Keyword lookup table, expanded from the SCC_CKEYWORD_TABLE X-macro.
// NOTE(review): keyword_cmp() below binary-searches this array by `name`,
// so SCC_CKEYWORD_TABLE must list its entries in ascending lexicographic
// order — confirm at the macro's definition site.
static const struct {
    const char *name;        // keyword spelling (stringized X-macro name)
    scc_cstd_t std_type;     // presumably the C standard revision that introduced it — TODO confirm
    scc_tok_type_t tok_type; // token type emitted when this keyword is matched
} keywords[] = {
#define X(name, subtype, tok, std_type, ...) {#name, std_type, tok},
    SCC_CKEYWORD_TABLE
#undef X
};
// Look up an identifier in the sorted `keywords` table via binary search.
// `name` is the candidate spelling and `len` the number of significant
// characters. Returns the table index on a match, or -1 when the
// identifier is not a keyword.
static int keyword_cmp(const char *name, int len) {
    int lo = 0;
    int hi = (int)(sizeof(keywords) / sizeof(keywords[0])) - 1;
    while (lo <= hi) {
        int mid = lo + (hi - lo) / 2; // overflow-safe midpoint
        const char *entry = keywords[mid].name;
        int diff = 0;
        // Compare at most `len` characters of the candidate prefix.
        for (int i = 0; i < len; i++) {
            if (name[i] != entry[i]) {
                diff = (unsigned char)name[i] - (unsigned char)entry[i];
                break;
            }
            if (name[i] == '\0')
                break;
        }
        if (diff == 0) {
            if (entry[len] == '\0')
                return mid; // same length too — exact keyword match
            diff = -1;      // table entry is longer: search the lower half
        }
        if (diff < 0)
            hi = mid - 1;
        else
            lo = mid + 1;
    }
    return -1; // not a keyword
}
// Bind a lexer to its character-stream ring and reset all bookkeeping
// state (no ring consumers yet, macro jumping disabled).
void scc_lexer_init(scc_lexer_t *lexer, scc_sstream_ring_t *stream_ref) {
    lexer->jump_macro = false;
    lexer->ring_ref_count = 0;
    lexer->stream_ref = stream_ref;
}
/* Character-class predicates used by the tokenizer (ASCII only). */

// Horizontal whitespace — deliberately excludes '\r' and '\n', which form
// their own SCC_TOK_ENDLINE tokens.
static inline cbool is_whitespace(int ch) {
    switch (ch) {
    case ' ':
    case '\t':
    case '\v':
    case '\f':
        return true;
    default:
        return false;
    }
}
static inline cbool is_newline(int ch) { return ch == '\r' || ch == '\n'; }
static inline cbool is_digit(int ch) { return '0' <= ch && ch <= '9'; }
static inline cbool is_alpha(int ch) {
    return ('A' <= ch && ch <= 'Z') || ('a' <= ch && ch <= 'z');
}
static inline cbool is_alnum(int ch) { return is_digit(ch) || is_alpha(ch); }
static inline cbool is_identifier_start(int ch) {
    return ch == '_' || is_alpha(ch);
}
static inline cbool is_identifier_part(int ch) {
    return ch == '_' || is_alnum(ch);
}
static inline cbool is_octal_digit(int ch) { return '0' <= ch && ch <= '7'; }
static inline cbool is_hex_digit(int ch) {
    return is_digit(ch) || ('a' <= ch && ch <= 'f') || ('A' <= ch && ch <= 'F');
}
/* Peek one character from the ring buffer without consuming it.
 * Returns false when the stream has reached EOF. */
static inline cbool peek_char(scc_lexer_t *lexer, scc_sstream_char_t *out) {
    cbool alive;
    scc_ring_peek(*lexer->stream_ref, *out, alive);
    return alive;
}
/* Pop one character from the ring buffer and append it to `lexeme`.
 * On EOF nothing is appended and false is returned. */
static inline cbool next_char(scc_lexer_t *lexer, scc_cstring_t *lexeme,
                              scc_sstream_char_t *out) {
    cbool alive;
    scc_ring_next(*lexer->stream_ref, *out, alive);
    if (alive)
        scc_cstring_append_ch(lexeme, out->character);
    return alive;
}
/* Mark a token as the error/unknown token. */
#define set_err_token(token) ((token)->type = SCC_TOK_UNKNOWN)

/* Scan one raw token from the character stream into `token`.
 *
 * Whitespace runs, newlines and comments are emitted as first-class tokens
 * (SCC_TOK_BLANK / SCC_TOK_ENDLINE / *_COMMENT); callers that only want
 * parser-relevant tokens should use scc_lexer_get_valid_token(). On EOF an
 * SCC_TOK_EOF token with an empty lexeme is produced. The token takes
 * ownership of the lexeme string built here.
 *
 * NOTE(review): the branches mix next_char() with explicit
 * scc_ring_consume() calls inconsistently (line comments and identifiers
 * consume per character, block comments and char literals do not, and one
 * consume runs unconditionally at the end) — confirm the intended
 * peek/next/consume protocol of scc_ring before touching this code. */
void scc_lexer_get_token(scc_lexer_t *lexer, scc_lexer_tok_t *token) {
    scc_sstream_char_t cur = {0};
    scc_cstring_t lex = scc_cstring_create(); // temporary lexeme buffer
    // Try to peek the first character; an empty stream yields EOF.
    if (!peek_char(lexer, &cur)) {
        token->type = SCC_TOK_EOF;
        token->loc = (scc_pos_t){0, 1, 1, 0}; // default position
        token->lexeme = lex;                  // empty string
        return;
    }
    // Record the start position of this token.
    scc_pos_t start_loc = cur.pos;
    int ch = cur.character;
    // Single dispatch on the first character.
    if (is_whitespace(ch)) {
        // Whitespace: collect the whole consecutive run into one token.
        token->type = SCC_TOK_BLANK;
        while (peek_char(lexer, &cur) && is_whitespace(cur.character)) {
            next_char(lexer, &lex, &cur);
        }
    } else if (is_newline(ch)) {
        // Newline: handles a lone '\r' or '\n' as well as the "\r\n" pair.
        token->type = SCC_TOK_ENDLINE;
        next_char(lexer, &lex, &cur); // consume the first character
        if (ch == '\r') {
            // Try to consume a following '\n'.
            if (peek_char(lexer, &cur) && cur.character == '\n') {
                next_char(lexer, &lex, &cur);
            }
        }
    } else if (ch == '/') {
        // Could be a comment or a division operator.
        scc_sstream_char_t next = {0};
        next_char(lexer, &lex, &cur); // consume '/'
        peek_char(lexer, &next);
        if (next.character == '=') {
            token->type = SCC_TOK_ASSIGN_DIV;
            next_char(lexer, &lex, &cur);
        } else if (next.character == '/') {
            // Line comment: //
            token->type = SCC_TOK_LINE_COMMENT;
            next_char(lexer, &lex, &cur); // consume '/'
            while (peek_char(lexer, &cur) && !is_newline(cur.character)) {
                next_char(lexer, &lex, &cur);
                scc_ring_consume(*lexer->stream_ref);
            }
            // The comment does not include the newline; the newline
            // becomes its own token.
        } else if (next.character == '*') {
            // Block comment: /*
            token->type = SCC_TOK_BLOCK_COMMENT;
            next_char(lexer, &lex, &cur); // consume '*'
            while (1) {
                if (!next_char(lexer, &lex, &cur)) {
                    // End of file: the comment was never closed.
                    LOG_ERROR("Unterminated block comment");
                    break;
                }
                if (cur.character == '*' && peek_char(lexer, &next) &&
                    next.character == '/') {
                    next_char(lexer, &lex, &cur); // consume '/'
                    break;
                }
                scc_ring_consume(*lexer->stream_ref);
            }
        } else {
            // Just a division operator.
            token->type = SCC_TOK_DIV;
        }
    } else if (is_identifier_start(ch)) {
        // Identifier or keyword.
        token->type = SCC_TOK_IDENT; // tentative; may be refined below
        while (peek_char(lexer, &cur) && is_identifier_part(cur.character)) {
            next_char(lexer, &lex, &cur);
            scc_ring_consume(*lexer->stream_ref);
        }
        // Check whether the spelling is a keyword.
        int idx = keyword_cmp(scc_cstring_as_cstr(&lex), scc_cstring_len(&lex));
        if (idx != -1) {
            token->type = keywords[idx].tok_type;
        }
    } else if (is_digit(ch)) {
        // Numeric literal (integer or floating point).
        token->type = SCC_TOK_INT_LITERAL; // assume integer first
        cbool maybe_float = false;
        while (1) {
            next_char(lexer, &lex, &cur); // consume the current digit
            if (!peek_char(lexer, &cur))
                break;
            ch = cur.character;
            if (is_digit(ch) || (ch == '.' && !maybe_float)) {
                if (ch == '.')
                    maybe_float = true;
                continue;
            }
            if (ch == 'e' || ch == 'E' || ch == 'p' || ch == 'P') {
                maybe_float = true;
                // An exponent may be followed by a sign or digits.
                // NOTE(review): a sign after e/E/p/P is NOT consumed here —
                // "1e+5" stops at '+'; confirm whether a later phase glues
                // the pieces back together.
                continue;
            }
            if (ch == 'x' || ch == 'X') {
                // Hex prefix would need special handling; simplified here:
                // keep the whole sequence as an integer (prefix retained).
                continue;
            }
            break;
        }
        if (maybe_float)
            token->type = SCC_TOK_FLOAT_LITERAL;
    } else if (ch == '\'') {
        // Character literal.
        token->type = SCC_TOK_CHAR_LITERAL;
        next_char(lexer, &lex, &cur); // opening '
        while (1) {
            if (!peek_char(lexer, &cur)) {
                LOG_ERROR("Unterminated character literal");
                break;
            }
            if (cur.character == '\'') {
                next_char(lexer, &lex, &cur); // closing quote
                break;
            }
            if (cur.character == '\\') {
                // Escape sequence: keep the backslash and the next
                // character verbatim in the lexeme.
                next_char(lexer, &lex, &cur);
                if (!peek_char(lexer, &cur))
                    break;
                next_char(lexer, &lex, &cur);
            } else {
                next_char(lexer, &lex, &cur);
            }
        }
    } else if (ch == '"') {
        // String literal.
        token->type = SCC_TOK_STRING_LITERAL;
        next_char(lexer, &lex, &cur); // opening "
        while (1) {
            if (!peek_char(lexer, &cur)) {
                LOG_ERROR("Unterminated string literal");
                break;
            }
            if (cur.character == '"') {
                next_char(lexer, &lex, &cur); // closing quote
                break;
            }
            if (cur.character == '\\') {
                // Escape sequence: keep backslash + next char verbatim.
                next_char(lexer, &lex, &cur);
                if (!peek_char(lexer, &cur))
                    break;
                next_char(lexer, &lex, &cur);
            } else {
                next_char(lexer, &lex, &cur);
            }
            scc_ring_consume(*lexer->stream_ref);
        }
    } else {
        // Operators and punctuation: consume the first char, then peek one
        // ahead to resolve multi-character operators.
        scc_sstream_char_t next = {0};
        next_char(lexer, &lex, &cur);
        peek_char(lexer, &next);
        switch (ch) {
        case '=':
            switch (next.character) {
            case '=':
                token->type = SCC_TOK_EQ;
                next_char(lexer, &lex, &cur);
                break;
            default:
                token->type = SCC_TOK_ASSIGN;
                break;
            }
            break;
        case '+':
            switch (next.character) {
            case '+':
                token->type = SCC_TOK_ADD_ADD;
                next_char(lexer, &lex, &cur);
                break;
            case '=':
                token->type = SCC_TOK_ASSIGN_ADD;
                next_char(lexer, &lex, &cur);
                break;
            default:
                token->type = SCC_TOK_ADD;
                break;
            }
            break;
        case '-':
            switch (next.character) {
            case '-':
                token->type = SCC_TOK_SUB_SUB;
                next_char(lexer, &lex, &cur);
                break;
            case '=':
                token->type = SCC_TOK_ASSIGN_SUB;
                next_char(lexer, &lex, &cur);
                break;
            case '>':
                token->type = SCC_TOK_DEREF; // "->" member access
                next_char(lexer, &lex, &cur);
                break;
            default:
                token->type = SCC_TOK_SUB;
                break;
            }
            break;
        case '*':
            switch (next.character) {
            case '=':
                token->type = SCC_TOK_ASSIGN_MUL;
                next_char(lexer, &lex, &cur);
                break;
            default:
                token->type = SCC_TOK_MUL;
                break;
            }
            break;
        case '%':
            switch (next.character) {
            case '=':
                token->type = SCC_TOK_ASSIGN_MOD;
                next_char(lexer, &lex, &cur);
                break;
            default:
                token->type = SCC_TOK_MOD;
                break;
            }
            break;
        case '&':
            switch (next.character) {
            case '&':
                token->type = SCC_TOK_AND_AND;
                next_char(lexer, &lex, &cur);
                break;
            case '=':
                token->type = SCC_TOK_ASSIGN_AND;
                next_char(lexer, &lex, &cur);
                break;
            default:
                token->type = SCC_TOK_AND;
                break;
            }
            break;
        case '|':
            switch (next.character) {
            case '|':
                token->type = SCC_TOK_OR_OR;
                next_char(lexer, &lex, &cur);
                break;
            case '=':
                token->type = SCC_TOK_ASSIGN_OR;
                next_char(lexer, &lex, &cur);
                break;
            default:
                token->type = SCC_TOK_OR;
                break;
            }
            break;
        case '^':
            switch (next.character) {
            case '=':
                token->type = SCC_TOK_ASSIGN_XOR;
                next_char(lexer, &lex, &cur);
                break;
            default:
                token->type = SCC_TOK_XOR;
                break;
            }
            break;
        case '<':
            switch (next.character) {
            case '=':
                token->type = SCC_TOK_LE;
                next_char(lexer, &lex, &cur);
                break;
            case '<': {
                // "<<" consumed; check for "<<=".
                next_char(lexer, &lex, &cur);
                if (peek_char(lexer, &next) && next.character == '=') {
                    token->type = SCC_TOK_ASSIGN_L_SH;
                    next_char(lexer, &lex, &cur);
                } else {
                    token->type = SCC_TOK_L_SH;
                }
                break;
            }
            default:
                token->type = SCC_TOK_LT;
                break;
            }
            break;
        case '>':
            switch (next.character) {
            case '=':
                token->type = SCC_TOK_GE;
                next_char(lexer, &lex, &cur);
                break;
            case '>': {
                // ">>" consumed; check for ">>=".
                next_char(lexer, &lex, &cur);
                if (peek_char(lexer, &next) && next.character == '=') {
                    token->type = SCC_TOK_ASSIGN_R_SH;
                    next_char(lexer, &lex, &cur);
                } else {
                    token->type = SCC_TOK_R_SH;
                }
                break;
            }
            default:
                token->type = SCC_TOK_GT;
                break;
            }
            break;
        case '~':
            token->type = SCC_TOK_BIT_NOT;
            break;
        case '!':
            switch (next.character) {
            case '=':
                token->type = SCC_TOK_NEQ;
                next_char(lexer, &lex, &cur);
                break;
            default:
                token->type = SCC_TOK_NOT;
                break;
            }
            break;
        /* clang-format off */
        case '[': token->type = SCC_TOK_L_BRACKET; break;
        case ']': token->type = SCC_TOK_R_BRACKET; break;
        case '(': token->type = SCC_TOK_L_PAREN; break;
        case ')': token->type = SCC_TOK_R_PAREN; break;
        case '{': token->type = SCC_TOK_L_BRACE; break;
        case '}': token->type = SCC_TOK_R_BRACE; break;
        case ';': token->type = SCC_TOK_SEMICOLON; break;
        case ',': token->type = SCC_TOK_COMMA; break;
        case ':': token->type = SCC_TOK_COLON; break;
        /* clang-format on */
        case '.':
            // NOTE(review): peek_char() appears non-advancing elsewhere in
            // this file, so the second peek here re-reads the SAME character
            // as `next` — ".." would then be classified as "..." and consume
            // an extra character. Confirm the peek semantics.
            if (next.character == '.' && peek_char(lexer, &next) &&
                next.character == '.') {
                token->type = SCC_TOK_ELLIPSIS;
                next_char(lexer, &lex, &cur);
                next_char(lexer, &lex, &cur);
            } else {
                token->type = SCC_TOK_DOT;
            }
            break;
        case '?':
            token->type = SCC_TOK_COND;
            break;
        case '#':
            if (next.character == '#') {
                token->type = SCC_TOK_SHARP_SHARP; // token-paste "##"
                next_char(lexer, &lex, &cur);
            } else
                token->type = SCC_TOK_SHARP;
            break;
        default:
            token->type = SCC_TOK_UNKNOWN;
            SCC_ERROR(start_loc, "unsupported character '%c' (0x%x)", ch, ch);
            break;
        }
    }
    // Finalize the token fields.
    scc_ring_consume(*lexer->stream_ref);
    token->type = token->type; // NOTE(review): self-assignment is a no-op (type was set above); candidate for removal
    token->loc = start_loc;
    token->lexeme = lex; // ownership of the lexeme transfers to the token
    LEX_DEBUG("get token `%s` (%s) at %s:%d:%d", scc_get_tok_name(token->type),
              scc_cstring_as_cstr(&token->lexeme), token->loc.name,
              token->loc.line, token->loc.col);
}
// Fetch the next parser-relevant token, skipping whitespace/newline and
// comment tokens (their lexemes are dropped as they are skipped).
// Asserts (fatal) on tokens whose subtype is SCC_TOK_SUBTYPE_INVALID.
//
// Bug fix: the original loop dropped an empty-space/comment token and then
// fell through to an unconditional `break`, returning the already-dropped
// token to the caller. It now `continue`s and fetches the next token, which
// is what the enclosing while(1) loop was clearly written for.
void scc_lexer_get_valid_token(scc_lexer_t *lexer, scc_lexer_tok_t *token) {
    scc_tok_subtype_t subtype;
    while (1) {
        scc_lexer_get_token(lexer, token);
        subtype = scc_get_tok_subtype(token->type);
        AssertFmt(subtype != SCC_TOK_SUBTYPE_INVALID,
                  "Invalid token: `%s` at %s:%d:%d",
                  scc_get_tok_name(token->type), token->loc.name,
                  token->loc.line, token->loc.col);
        if (subtype == SCC_TOK_SUBTYPE_EMPTYSPACE ||
            subtype == SCC_TOK_SUBTYPE_COMMENT) {
            scc_lexer_tok_drop(token); // discard lexeme of the skipped token
            continue;                  // keep scanning for a meaningful token
        }
        break;
    }
}
// Token-ring refill callback producing every raw token (including
// whitespace and comments). Returns false at EOF so the ring stops filling.
static cbool fill_token(scc_lexer_tok_t *out, void *userdata) {
    scc_lexer_t *self = userdata;
    scc_lexer_get_token(self, out);
    return out->type != SCC_TOK_EOF;
}
// Token-ring refill callback producing only parser-relevant tokens
// (whitespace/comments skipped). Returns false at EOF.
static cbool fill_valid_token(scc_lexer_tok_t *out, void *userdata) {
    scc_lexer_t *self = userdata;
    scc_lexer_get_valid_token(self, out);
    return out->type != SCC_TOK_EOF;
}
// Wrap the lexer in a token ring of `ring_size` entries. With `fill_all`
// the ring carries every raw token; otherwise only parser-relevant ones.
// Each call bumps the ring reference count; pair with scc_lexer_drop_ring().
scc_lexer_tok_ring_t *scc_lexer_to_ring(scc_lexer_t *lexer, int ring_size,
                                        cbool fill_all) {
    if (fill_all) {
        scc_ring_init(lexer->ring, ring_size, fill_token, lexer);
    } else {
        scc_ring_init(lexer->ring, ring_size, fill_valid_token, lexer);
    }
    lexer->ring_ref_count++;
    return &lexer->ring;
}
// Release one reference to the lexer's token ring; warns (and does
// nothing else) when the count is already zero.
void scc_lexer_drop_ring(scc_lexer_tok_ring_t *ring_ref) {
    scc_lexer_t *owner = ring_ref->userdata;
    if (owner->ring_ref_count == 0) {
        LOG_WARN("double drop sstream ring");
        return;
    }
    owner->ring_ref_count--;
}
// Destroy the lexer: frees its token ring and releases the character
// stream. Fatal if any token-ring references are still outstanding.
void scc_lexer_drop(scc_lexer_t *lexer) {
    Assert(lexer != null);
    if (lexer->ring_ref_count != 0) {
        LOG_FATAL("drop sstream must be drop ring before ref [%d]",
                  lexer->ring_ref_count);
    }
    scc_ring_free(lexer->ring);
    scc_sstream_drop_ring(lexer->stream_ref);
}