scc/libs/lexer/src/lexer.c
zzy d88fa3b8d3 feat: rename core types to scc prefix for consistency
Updated type names from `core_*` to `scc_*` across lex_parser and stream modules to maintain naming consistency within the SCC codebase. This includes changes to function signatures and internal usage of types like `core_probe_stream_t`, `core_pos_t`, and `cstring_t` to their `scc_*` counterparts.
2025-12-11 13:00:29 +08:00


/**
* Modeled on the lexical-analysis part of LCC (LCCompiler).
*
* The following is the LCC README as of 2025.2:
This hierarchy is the distribution for lcc version 4.2.
lcc version 3.x is described in the book "A Retargetable C Compiler:
Design and Implementation" (Addison-Wesley, 1995, ISBN 0-8053-1670-1).
There are significant differences between 3.x and 4.x, most notably in
the intermediate code. For details, see
https://drh.github.io/lcc/documents/interface4.pdf.
VERSION 4.2 IS INCOMPATIBLE WITH EARLIER VERSIONS OF LCC. DO NOT
UNLOAD THIS DISTRIBUTION ON TOP OF A 3.X DISTRIBUTION.
LCC is a C89 ("ANSI C") compiler designed to be highly retargetable.
LOG describes the changes since the last release.
CPYRIGHT describes the conditions under which you can use, copy, modify, and
distribute lcc or works derived from lcc.
doc/install.html is an HTML file that gives a complete description of
the distribution and installation instructions.
Chris Fraser / cwf@aya.yale.edu
David Hanson / drh@drhanson.net
*/
#include <lex_parser.h>
#include <lexer.h>
#include <lexer_log.h>
static const struct {
const char *name;
scc_cstd_t std_type;
scc_tok_type_t tok;
} keywords[] = {
#define X(name, subtype, tok, std_type, ...) {#name, std_type, tok},
SCC_CKEYWORD_TABLE
#undef X
};
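// The table is generated by the SCC_CKEYWORD_TABLE X-macro. As a sketch, a
// hypothetical entry such as X(int, KEYWORD, SCC_TOK_INT, SCC_CSTD_C89) would
// expand to {"int", SCC_CSTD_C89, SCC_TOK_INT}; the real entries and their
// subtype/std arguments are defined in the keyword table header.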
// Binary search over the keyword table; returns the index of the keyword, or -1.
static inline int keyword_cmp(const char *name, int len) {
int low = 0;
int high = sizeof(keywords) / sizeof(keywords[0]) - 1;
while (low <= high) {
int mid = (low + high) / 2;
const char *key = keywords[mid].name;
int cmp = 0;
// Compare the first `len` characters of `name` against the keyword.
for (int i = 0; i < len; i++) {
if (name[i] != key[i]) {
cmp = (unsigned char)name[i] - (unsigned char)key[i];
break;
}
if (name[i] == '\0')
break; // stop early at the terminator
}
if (cmp == 0) {
// exact match only if the keyword has the same length
if (key[len] == '\0')
return mid;
cmp = -1; // the keyword is longer than the input
}
if (cmp < 0) {
high = mid - 1;
} else {
low = mid + 1;
}
}
return -1; // Not a keyword.
}
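// The binary search requires keywords[] to be sorted by name in strcmp order,
// which the SCC_CKEYWORD_TABLE expansion is assumed to guarantee. Expected
// behaviour (keyword names are illustrative):
//   keyword_cmp("while", 5)  -> index of "while", if it is in the table
//   keyword_cmp("whileX", 6) -> -1 (not a keyword)
//   keyword_cmp("whi", 3)    -> -1 (shorter than any matching keyword)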
void scc_lexer_init(scc_lexer_t *lexer, scc_probe_stream_t *stream) {
lexer->stream = stream;
lexer->pos = scc_pos_init();
// FIXME
lexer->pos.name = scc_cstring_from_cstr(scc_cstring_as_cstr(&stream->name));
}
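// Typical driver loop (a sketch; constructing the scc_probe_stream_t is left
// to the stream module):
//   scc_lexer_t lexer;
//   lexer_tok_t tok;
//   scc_lexer_init(&lexer, stream);
//   do {
//     scc_lexer_get_valid_token(&lexer, &tok);
//   } while (tok.type != SCC_TOK_EOF);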
#define set_err_token(token) ((token)->type = SCC_TOK_UNKNOWN)
static void parse_line(scc_lexer_t *lexer, lexer_tok_t *token) {
token->loc = lexer->pos;
scc_probe_stream_t *stream = lexer->stream;
scc_probe_stream_reset(stream);
int ch = scc_probe_stream_next(stream);
usize n;
scc_cstring_t str = scc_cstring_new();
if (ch == core_stream_eof) {
LEX_WARN("Unexpected EOF at begin");
goto ERR;
} else if (ch != '#') {
LEX_WARN("Unexpected character '%c' at begin", ch);
goto ERR;
}
// Expect the literal "#line"; anything else after '#' is treated as an
// unsupported directive and skipped.
const char line[] = "#line";
for (int i = 0; i < (int)sizeof(line) - 1; i++) {
ch = scc_probe_stream_consume(stream);
core_pos_next(&lexer->pos);
if (ch != line[i]) {
LEX_WARN("Maroc does not support in lexer rather in preprocessor, "
"it will be ignored");
goto SKIP_LINE;
}
}
if (lex_parse_number(lexer->stream, &lexer->pos, &n) == false) {
LEX_ERROR("Invalid line number");
goto SKIP_LINE;
}
if (scc_probe_stream_consume(stream) != ' ') {
// `#line N` with no filename: keep the current file name and update only
// the line number.
lex_parse_skip_line(lexer->stream, &lexer->pos);
token->loc.line = n;
scc_cstring_free(&str);
return;
}
if (scc_probe_stream_next(stream) != '"') {
LEX_ERROR("Invalid `#` line");
goto SKIP_LINE;
}
if (lex_parse_string(lexer->stream, &lexer->pos, &str) == false) {
LEX_ERROR("Invalid filename");
goto SKIP_LINE;
}
lex_parse_skip_line(lexer->stream, &lexer->pos);
token->loc.line = n;
// FIXME memory leak
token->loc.name = scc_cstring_from_cstr(scc_cstring_as_cstr(&str));
scc_cstring_free(&str);
return;
SKIP_LINE:
lex_parse_skip_line(lexer->stream, &lexer->pos);
ERR:
set_err_token(token);
scc_cstring_free(&str);
}
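// parse_line handles line-control directives, e.g. (illustrative input):
//   #line 42 "foo.c"   -> update both the line number and the file name
//   #line 42           -> update only the line number
// Any other `#...` line is skipped with a warning.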
// /zh/c/language/operator_arithmetic.html
void scc_lexer_get_token(scc_lexer_t *lexer, lexer_tok_t *token) {
token->loc = lexer->pos;
token->type = SCC_TOK_UNKNOWN;
scc_probe_stream_t *stream = lexer->stream;
scc_probe_stream_reset(stream);
scc_tok_type_t type = SCC_TOK_UNKNOWN;
int ch = scc_probe_stream_next(stream);
// Dispatch on the first character; multi-character operators jump to the
// *_char labels below to commit the extra characters.
switch (ch) {
case '=':
switch (scc_probe_stream_next(stream)) {
case '=':
type = SCC_TOK_EQ;
goto double_char;
default:
scc_probe_stream_reset(stream), type = SCC_TOK_ASSIGN;
break;
}
break;
case '+':
switch (scc_probe_stream_next(stream)) {
case '+':
type = SCC_TOK_ADD_ADD;
goto double_char;
case '=':
type = SCC_TOK_ASSIGN_ADD;
goto double_char;
default:
scc_probe_stream_reset(stream), type = SCC_TOK_ADD;
break;
}
break;
case '-':
switch (scc_probe_stream_next(stream)) {
case '-':
type = SCC_TOK_SUB_SUB;
goto double_char;
case '=':
type = SCC_TOK_ASSIGN_SUB;
goto double_char;
case '>':
type = SCC_TOK_DEREF;
goto double_char;
default:
scc_probe_stream_reset(stream), type = SCC_TOK_SUB;
break;
}
break;
case '*':
switch (scc_probe_stream_next(stream)) {
case '=':
type = SCC_TOK_ASSIGN_MUL;
goto double_char;
default:
scc_probe_stream_reset(stream), type = SCC_TOK_MUL;
break;
}
break;
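// '/' is the only character that may start a comment: "//" skips to the end
// of the line and "/*" skips to the matching "*/". Both are reported as
// comment tokens and later filtered out by scc_lexer_get_valid_token.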
case '/':
switch (scc_probe_stream_next(stream)) {
case '=':
type = SCC_TOK_ASSIGN_DIV;
goto double_char;
case '/':
lex_parse_skip_line(lexer->stream, &lexer->pos);
token->type = SCC_TOK_LINE_COMMENT;
goto END;
case '*':
lex_parse_skip_block_comment(lexer->stream, &lexer->pos);
token->type = SCC_TOK_BLOCK_COMMENT;
goto END;
default:
scc_probe_stream_reset(stream), type = SCC_TOK_DIV;
break;
}
break;
case '%':
switch (scc_probe_stream_next(stream)) {
case '=':
type = SCC_TOK_ASSIGN_MOD;
goto double_char;
default:
scc_probe_stream_reset(stream), type = SCC_TOK_MOD;
break;
}
break;
case '&':
switch (scc_probe_stream_next(stream)) {
case '&':
type = SCC_TOK_AND_AND;
goto double_char;
case '=':
type = SCC_TOK_ASSIGN_AND;
goto double_char;
default:
scc_probe_stream_reset(stream), type = SCC_TOK_AND;
break;
}
break;
case '|':
switch (scc_probe_stream_next(stream)) {
case '|':
type = SCC_TOK_OR_OR;
goto double_char;
case '=':
type = SCC_TOK_ASSIGN_OR;
goto double_char;
default:
scc_probe_stream_reset(stream), type = SCC_TOK_OR;
break;
}
break;
case '^':
switch (scc_probe_stream_next(stream)) {
case '=':
type = SCC_TOK_ASSIGN_XOR;
goto double_char;
default:
scc_probe_stream_reset(stream), type = SCC_TOK_XOR;
break;
}
break;
case '<':
switch (scc_probe_stream_next(stream)) {
case '=':
type = SCC_TOK_LE;
goto double_char;
case '<': {
if (scc_probe_stream_next(stream) == '=') {
type = SCC_TOK_ASSIGN_L_SH;
goto triple_char;
} else {
type = SCC_TOK_L_SH;
goto double_char;
}
break;
}
default:
scc_probe_stream_reset(stream), type = SCC_TOK_LT;
break;
}
break;
case '>':
switch (scc_probe_stream_next(stream)) {
case '=':
type = SCC_TOK_GE;
goto double_char;
case '>': {
if (scc_probe_stream_next(stream) == '=') {
type = SCC_TOK_ASSIGN_R_SH;
goto triple_char;
} else {
type = SCC_TOK_R_SH;
goto double_char;
}
break;
}
default:
scc_probe_stream_reset(stream), type = SCC_TOK_GT;
break;
}
break;
case '~':
type = SCC_TOK_BIT_NOT;
break;
case '!':
switch (scc_probe_stream_next(stream)) {
case '=':
type = SCC_TOK_NEQ;
goto double_char;
default:
scc_probe_stream_reset(stream), type = SCC_TOK_NOT;
break;
}
break;
case '[':
type = SCC_TOK_L_BRACKET;
break;
case ']':
type = SCC_TOK_R_BRACKET;
break;
case '(':
type = SCC_TOK_L_PAREN;
break;
case ')':
type = SCC_TOK_R_PAREN;
break;
case '{':
type = SCC_TOK_L_BRACE;
break;
case '}':
type = SCC_TOK_R_BRACE;
break;
case ';':
type = SCC_TOK_SEMICOLON;
break;
case ',':
type = SCC_TOK_COMMA;
break;
case ':':
type = SCC_TOK_COLON;
break;
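// '.' needs two characters of lookahead: "..." is the ellipsis, anything else
// is the member-access dot. ".." is not a token, so only the first '.' is
// consumed and the second is re-examined on the next call.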
case '.':
if (scc_probe_stream_next(stream) == '.' &&
scc_probe_stream_next(stream) == '.') {
type = SCC_TOK_ELLIPSIS;
goto triple_char;
}
type = SCC_TOK_DOT;
break;
case '?':
type = SCC_TOK_COND;
break;
case '\v':
case '\f':
case ' ':
case '\t':
type = SCC_TOK_BLANK;
break;
case '\r':
case '\n':
lex_parse_skip_endline(lexer->stream, &lexer->pos);
token->type = SCC_TOK_BLANK;
goto END;
case '#':
parse_line(lexer, token);
token->type = SCC_TOK_BLANK;
goto END;
case '\0':
case core_stream_eof:
// EOF
type = SCC_TOK_EOF;
break;
case '\'': {
token->loc = lexer->pos;
token->type = SCC_TOK_CHAR_LITERAL;
int ch = lex_parse_char(lexer->stream, &lexer->pos);
if (ch == core_stream_eof) {
LEX_ERROR("Unexpected character literal");
token->type = SCC_TOK_UNKNOWN;
} else {
token->value.ch = ch;
}
goto END;
}
case '"': {
token->loc = lexer->pos;
token->type = SCC_TOK_STRING_LITERAL;
scc_cstring_t output = scc_cstring_new();
if (lex_parse_string(lexer->stream, &lexer->pos, &output) == true) {
token->value.cstr.data = scc_cstring_as_cstr(&output);
token->value.cstr.len = scc_cstring_len(&output);
} else {
LEX_ERROR("Unexpected string literal");
token->type = SCC_TOK_UNKNOWN;
}
goto END;
}
/* clang-format off */
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
/* clang-format on */
token->loc = lexer->pos;
token->type = SCC_TOK_INT_LITERAL;
usize output;
if (lex_parse_number(lexer->stream, &lexer->pos, &output) == true) {
token->value.n = output;
} else {
LEX_ERROR("Unexpected number literal");
token->type = SCC_TOK_UNKNOWN;
}
goto END;
/* clang-format off */
case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
case 'v': case 'w': case 'x': case 'y': case 'z':
case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
case 'V': case 'W': case 'X': case 'Y': case 'Z': case '_':
/* clang-format on */
{ // braces: a declaration may not directly follow a case label before C23
scc_cstring_t str = scc_cstring_new();
cbool ret = lex_parse_identifier(lexer->stream, &lexer->pos, &str);
Assert(ret == true);
int res = keyword_cmp(scc_cstring_as_cstr(&str), scc_cstring_len(&str));
if (res == -1) {
token->value.cstr.data = (char *)scc_cstring_as_cstr(&str);
token->value.cstr.len = scc_cstring_len(&str);
type = SCC_TOK_IDENT;
} else {
scc_cstring_free(&str);
type = keywords[res].tok;
}
token->type = type;
goto END;
}
default:
LEX_ERROR("unsupport char in sourse code `%c`", ch);
break;
}
goto once_char;
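// Fall-through consume ladder: triple_char commits three characters,
// double_char two, once_char one, each advancing the source position.
// For example "<<=" probes '<', '<', '=' above, sets SCC_TOK_ASSIGN_L_SH and
// jumps to triple_char, which consumes all three characters.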
triple_char:
scc_probe_stream_consume(stream);
core_pos_next(&lexer->pos);
double_char:
scc_probe_stream_consume(stream);
core_pos_next(&lexer->pos);
once_char:
scc_probe_stream_consume(stream);
core_pos_next(&lexer->pos);
token->type = type;
END:
LEX_DEBUG("get token `%s` in %s:%d:%d", scc_get_tok_name(token->type),
token->loc.name, token->loc.line, token->loc.column);
}
// scc_lexer_get_token may return tokens the parser has no use for (blanks and
// comments); loop here until a meaningful token is found.
void scc_lexer_get_valid_token(scc_lexer_t *lexer, lexer_tok_t *token) {
scc_tok_subtype_t type;
do {
scc_lexer_get_token(lexer, token);
type = scc_get_tok_subtype(token->type);
AssertFmt(type != SCC_TOK_SUBTYPE_INVALID,
"Invalid token: `%s` at %s:%d:%d",
scc_get_tok_name(token->type), token->loc.name,
token->loc.line, token->loc.col);
Assert(type != SCC_TOK_SUBTYPE_INVALID);
} while (type == SCC_TOK_SUBTYPE_EMPTYSPACE ||
type == SCC_TOK_SUBTYPE_COMMENT);
}
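// Example (a sketch): for the input `a /* note */ + b`, scc_lexer_get_token
// also reports the blank and comment tokens, while scc_lexer_get_valid_token
// yields only SCC_TOK_IDENT, SCC_TOK_ADD, SCC_TOK_IDENT (assuming blanks and
// comments map to the EMPTYSPACE and COMMENT subtypes).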