ZZY 2b4857001c feat(frontend): refactor the lexer
- Add a .gitignore to exclude compiler-generated binaries
- Rework lexer.c with improved keyword and string handling
- Update the frontend, parser, and AST files to match the new lexer
- Clean up token definitions and helpers; introduce new token types
2025-03-23 12:13:16 +08:00

526 lines
16 KiB
C

/**
* Lexical analysis modeled on LCCompiler.
*
* Below is LCC's README as of 2025.2:
This hierarchy is the distribution for lcc version 4.2.
lcc version 3.x is described in the book "A Retargetable C Compiler:
Design and Implementation" (Addison-Wesley, 1995, ISBN 0-8053-1670-1).
There are significant differences between 3.x and 4.x, most notably in
the intermediate code. For details, see
https://drh.github.io/lcc/documents/interface4.pdf.
VERSION 4.2 IS INCOMPATIBLE WITH EARLIER VERSIONS OF LCC. DO NOT
UNLOAD THIS DISTRIBUTION ON TOP OF A 3.X DISTRIBUTION.
LCC is a C89 ("ANSI C") compiler designed to be highly retargetable.
LOG describes the changes since the last release.
CPYRIGHT describes the conditions under which you can use, copy, modify, and
distribute lcc or works derived from lcc.
doc/install.html is an HTML file that gives a complete description of
the distribution and installation instructions.
Chris Fraser / cwf@aya.yale.edu
David Hanson / drh@drhanson.net
*/
#include <lib/core.h>
#include "lexer_log.h"
#include "token.h"
#include "lexer.h"
static const struct {
const char* name;
enum CSTD_KEYWORD std_type;
cc_tktype_t tok;
} keywords[] = {
#define X(name, std_type, tok, ...) { #name, std_type, tok },
KEYWORD_TABLE
#undef X
};
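/* For illustration (names hypothetical): a KEYWORD_TABLE row such as
 *     X(while, CSTD_C89, TOKEN_WHILE)
 * expands through the X macro above to
 *     { "while", CSTD_C89, TOKEN_WHILE },
 * so keyword spellings, standard levels, and token types are defined in one
 * place. Note that keyword_cmp below requires the rows sorted by name. */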
// Binary search the sorted keyword table; returns the index or -1.
static inline int keyword_cmp(const char* name, int len) {
int low = 0;
int high = sizeof(keywords) / sizeof(keywords[0]) - 1;
while (low <= high) {
int mid = (low + high) / 2;
const char *key = keywords[mid].name;
int cmp = 0;
// compare the first len bytes of name against the keyword
for (int i = 0; i < len; i++) {
if (name[i] != key[i]) {
cmp = (unsigned char)name[i] - (unsigned char)key[i];
break;
}
if (name[i] == '\0') break; // stop early at a terminator
}
if (cmp == 0) {
// the first len bytes match; it is an exact match only if key ends here
if (key[len] == '\0') return mid;
cmp = -1; // the keyword is longer than the input
}
if (cmp < 0) {
high = mid - 1;
} else {
low = mid + 1;
}
}
return -1; // Not a keyword.
}
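/* Minimal usage sketch for keyword_cmp (identifiers arrive as unterminated
 * buffer slices, hence the explicit length):
 *
 *   int hit  = keyword_cmp("while", 5);  // >= 0: index into keywords[]
 *   int miss = keyword_cmp("whilst", 6); // -1: plain identifier
 */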
void init_lexer(lexer_t* lexer, const char* file_name, void* stream, lexer_sread_fn sread, strpool_t* strpool) {
lexer->strpool = strpool;
lexer->cur_ptr = lexer->end_ptr = (char*)&(lexer->buffer);
lexer->loc.fname = strpool_intern(lexer->strpool, file_name);
lexer->loc.line = 1;
lexer->loc.col = 1;
lexer->stream = stream;
lexer->sread = sread;
rt_memset(lexer->buffer, 0, sizeof(lexer->buffer));
}
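/* Wiring sketch (assumes lexer_sread_fn has the fread-like shape used in
 * flush_buffer below; `my_read`, `src`, and `pool` are hypothetical):
 *
 *   lexer_t lx;
 *   tok_t tok;
 *   init_lexer(&lx, "main.c", src, my_read, &pool);
 *   get_valid_token(&lx, &tok); // first non-blank, non-comment token
 */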
static void flush_buffer(lexer_t* lexer) {
int num = lexer->end_ptr - lexer->cur_ptr;
for (int i = 0; i < num; i++) {
lexer->buffer[i] = lexer->cur_ptr[i];
}
lexer->cur_ptr = lexer->buffer;
int read_size = LEXER_BUFFER_SIZE - num;
// TODO: narrowing rt_size_t to int may lose precision
int got_size = lexer->sread(lexer->buffer + num, read_size, 1, read_size, lexer->stream);
if (got_size < 0) {
LEX_ERROR("lexer read error");
} else if (got_size < read_size) {
lexer->end_ptr += got_size;
lexer->end_ptr[0] = '\0'; // '\0' sentinel marks EOF for the scanning loops
lexer->end_ptr++;
} else if (got_size == read_size) {
lexer->end_ptr += got_size;
} else {
LEX_ERROR("lexer read error imposible got_size > read_size maybe overflow?");
}
}
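/* flush_buffer in pictures: unread bytes slide to the front and the tail is
 * refilled from the stream; on a short read a single '\0' sentinel is
 * appended so every scanning loop can stop on it.
 *
 *   before: buffer[ ...consumed... | cur_ptr ... end_ptr )
 *   after:  buffer[ cur_ptr=buffer ... | fresh bytes ... end_ptr (+'\0' on EOF) )
 */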
static void goto_newline(lexer_t* lexer) {
do {
if (lexer->cur_ptr == lexer->end_ptr) {
flush_buffer(lexer);
lexer->cur_ptr--; // flush_buffer reset cur_ptr; back up so the ++ below lands on it
}
lexer->cur_ptr++;
} while (*lexer->cur_ptr != '\n' && *lexer->cur_ptr != '\0');
}
static void goto_block_comment(lexer_t* lexer) {
while (1) {
if (lexer->end_ptr - lexer->cur_ptr < 2) {
flush_buffer(lexer);
}
if (lexer->cur_ptr[0] == '\0') {
break;
} else if (lexer->cur_ptr[0] == '*' && lexer->cur_ptr[1] == '/') {
lexer->cur_ptr += 2;
break;
} else {
if (lexer->cur_ptr[0] == '\n') lexer->loc.line++;
lexer->cur_ptr++;
}
}
}
// TODO: escape coverage is incomplete (e.g. no hex or octal escapes)
static char got_slash(char* peek) {
switch (*peek) {
case '\\': return '\\';
case '\'': return '\'';
case '\"': return '\"';
case '\?': return '\?';
case '0': return '\0';
case 'b': return '\b';
case 'f': return '\f';
case 'n': return '\n';
case 'r': return '\r';
case 't': return '\t';
case 'v': return '\v';
default: break;
}
LEX_ERROR("Unknown escape character");
return -1;
}
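/* Example: for the source text `\n`, the caller sits on the backslash,
 * passes a pointer to the 'n', and got_slash returns the single byte 0x0A. */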
static void parse_char_literal(lexer_t* lexer, tok_t* token) {
char val = 0;
char* peek = lexer->cur_ptr + 1;
if (*peek == '\\') {
peek++;
val = got_slash(peek);
peek++;
} else {
val = *peek++;
}
if (*peek++ != '\'') LEX_ERROR("Unclosed character literal");
lexer->loc.len = peek - lexer->cur_ptr; // record the source span for column tracking
lexer->cur_ptr = peek;
token->val.ch = val;
}
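/* Sketch of the pointer walk (cur_ptr sits on the opening quote when
 * get_token dispatches here):
 *
 *   'x'   -> val = 'x',  cur_ptr ends just past the closing quote
 *   '\n'  -> val = 0x0A  via got_slash
 *   'ab'  -> LEX_ERROR: the second byte is not a closing quote
 */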
static void parse_string_literal(lexer_t* lexer, tok_t* token) {
char* peek = lexer->cur_ptr + 1;
// NOTE: dest is a static scratch buffer, so this function is not reentrant
static char dest[LEXER_MAX_TOKEN_SIZE + 1];
int len = 0;
while (*peek != '"') {
if (peek >= lexer->end_ptr) flush_buffer(lexer);
if (*peek == '\\') { // 处理转义
peek++;
*peek = got_slash(peek);
}
if (len >= LEXER_MAX_TOKEN_SIZE) LEX_ERROR("String too long");
dest[len++] = *peek++;
}
dest[len] = '\0';
lexer->cur_ptr = peek + 1; // +1 steps past the closing '"'
lexer->loc.len = len + 2; // +2 counts both quotes (escapes make this undercount the source span)
token->val.str = strpool_intern(lexer->strpool, dest);
}
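/* Example: the source text "ab" interns the 2-byte string {'a','b'} and sets
 * loc.len = 4 (payload plus both quotes); "a\tb" decodes the escape to a
 * single tab byte before the copy into dest. */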
// FIXME: originally AI-generated; review for corner cases
static void parse_number(lexer_t* lexer, tok_t* token) {
char* peek = lexer->cur_ptr;
int base = 10;
int is_float = 0;
long long int_val = 0;
double float_val = 0.0;
double fraction = 1.0;
// determine the base from the prefix
if (*peek == '0') {
peek++;
if (*peek == 'x' || *peek == 'X') {
base = 16;
peek++; // consume the 'x'/'X'
} else {
base = 8;
}
}
// parse the integer part (is_float is still 0 here, so accumulate into int_val)
while (1) {
int digit = -1;
if (*peek >= '0' && *peek <= '9') {
digit = *peek - '0';
} else if (base == 16) {
if (*peek >= 'a' && *peek <= 'f') digit = *peek - 'a' + 10;
else if (*peek >= 'A' && *peek <= 'F') digit = *peek - 'A' + 10;
}
if (digit < 0 || digit >= base) break;
int_val = int_val * base + digit;
peek++;
}
// parse the fractional part (decimal only)
if (*peek == '.' && base == 10) {
is_float = 1;
float_val = int_val;
peek++;
while (*peek >= '0' && *peek <= '9') {
float_val = float_val * 10.0 + (*peek - '0');
fraction *= 10.0;
peek++;
}
float_val /= fraction;
}
// parse scientific notation (decimal only)
if ((*peek == 'e' || *peek == 'E') && base == 10) {
if (!is_float) float_val = int_val; // e.g. `1e3` never ran the '.' branch
is_float = 1;
peek++;
int exp_sign = 1;
int exponent = 0;
if (*peek == '+') peek++;
else if (*peek == '-') {
exp_sign = -1;
peek++;
}
while (*peek >= '0' && *peek <= '9') {
exponent = exponent * 10 + (*peek - '0');
peek++;
}
// apply the exponent without depending on libm's pow()
while (exponent-- > 0) {
if (exp_sign > 0) float_val *= 10.0;
else float_val /= 10.0;
}
}
// store the result
// TODO
lexer->loc.len = peek - lexer->cur_ptr;
lexer->cur_ptr = peek;
if (is_float) {
token->val.f32 = float_val;
token->sub_type = TOKEN_FLOAT_LITERAL;
} else {
token->val.i = int_val;
token->sub_type = TOKEN_INT_LITERAL;
}
}
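/* Accepted forms (sketch):
 *
 *   "42"    -> TOKEN_INT_LITERAL,   val.i = 42   (base 10)
 *   "052"   -> TOKEN_INT_LITERAL,   val.i = 42   (base 8)
 *   "0x2A"  -> TOKEN_INT_LITERAL,   val.i = 42   (base 16)
 *   "2.5"   -> TOKEN_FLOAT_LITERAL, val.f32 = 2.5
 *   "25e-1" -> TOKEN_FLOAT_LITERAL, val.f32 = 2.5
 */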
#define GOT_ONE_TOKEN_BUF_SIZE 64
// cf. cppreference: /zh/c/language/operator_arithmetic.html
void get_token(lexer_t* lexer, tok_t* token) {
// keep at least one token's worth of lookahead readable in the buffer
if (lexer->end_ptr - lexer->cur_ptr < GOT_ONE_TOKEN_BUF_SIZE) {
flush_buffer(lexer);
}
register char* peek = lexer->cur_ptr;
cc_tktype_t tk_type = TOKEN_INIT;
ctype_t literal = { 0 };
// dispatch on the first character
switch (*peek++) {
case '=':
switch (*peek++) {
case '=': tk_type = TOKEN_EQ; break;
default: peek--, tk_type = TOKEN_ASSIGN; break;
} break;
case '+':
switch (*peek++) {
case '+': tk_type = TOKEN_ADD_ADD; break;
case '=': tk_type = TOKEN_ASSIGN_ADD; break;
default: peek--, tk_type = TOKEN_ADD; break;
} break;
case '-':
switch (*peek++) {
case '-': tk_type = TOKEN_SUB_SUB; break;
case '=': tk_type = TOKEN_ASSIGN_SUB; break;
case '>': tk_type = TOKEN_DEREF; break;
default: peek--, tk_type = TOKEN_SUB; break;
} break;
case '*':
switch (*peek++) {
case '=': tk_type = TOKEN_ASSIGN_MUL; break;
default: peek--, tk_type = TOKEN_MUL; break;
} break;
case '/':
switch (*peek++) {
case '=': tk_type = TOKEN_ASSIGN_DIV; break;
case '/': {
goto_newline(lexer);
tk_type = TOKEN_LINE_COMMENT;
goto END;
}
case '*': {
lexer->cur_ptr = peek;
goto_block_comment(lexer);
tk_type = TOKEN_BLOCK_COMMENT;
goto END;
}
default: peek--, tk_type = TOKEN_DIV; break;
} break;
case '%':
switch (*peek++) {
case '=': tk_type = TOKEN_ASSIGN_MOD; break;
default: peek--, tk_type = TOKEN_MOD; break;
} break;
case '&':
switch (*peek++) {
case '&': tk_type = TOKEN_AND_AND; break;
case '=': tk_type = TOKEN_ASSIGN_AND; break;
default: peek--, tk_type = TOKEN_AND; break;
} break;
case '|':
switch (*peek++) {
case '|': tk_type = TOKEN_OR_OR; break;
case '=': tk_type = TOKEN_ASSIGN_OR; break;
default: peek--, tk_type = TOKEN_OR; break;
} break;
case '^':
switch (*peek++) {
case '=': tk_type = TOKEN_ASSIGN_XOR; break;
default: peek--, tk_type = TOKEN_XOR; break;
} break;
case '<':
switch (*peek++) {
case '=': tk_type = TOKEN_LE; break;
case '<': tk_type = (*peek == '=') ? (peek++, TOKEN_ASSIGN_L_SH) : TOKEN_L_SH; break;
default: peek--, tk_type = TOKEN_LT; break;
} break;
case '>':
switch (*peek++) {
case '=': tk_type = TOKEN_GE; break;
case '>': tk_type = (*peek == '=') ? (peek++, TOKEN_ASSIGN_R_SH) : TOKEN_R_SH; break;
default: peek--, tk_type = TOKEN_GT; break;
} break;
case '~':
tk_type = TOKEN_BIT_NOT; break;
case '!':
switch (*peek++) {
case '=': tk_type = TOKEN_NEQ; break;
default: peek--, tk_type = TOKEN_NOT; break;
} break;
case '[':
tk_type = TOKEN_L_BRACKET; break;
case ']':
tk_type = TOKEN_R_BRACKET; break;
case '(':
tk_type = TOKEN_L_PAREN; break;
case ')':
tk_type = TOKEN_R_PAREN; break;
case '{':
tk_type = TOKEN_L_BRACE; break;
case '}':
tk_type = TOKEN_R_BRACE; break;
case ';':
tk_type = TOKEN_SEMICOLON; break;
case ',':
tk_type = TOKEN_COMMA; break;
case ':':
tk_type = TOKEN_COLON; break;
case '.':
if (peek[0] == '.' && peek[1] == '.') {
peek += 2;
tk_type = TOKEN_ELLIPSIS;
} else {
tk_type = TOKEN_DOT;
}
break;
case '?':
tk_type = TOKEN_COND; break;
case '\v': case '\r': case '\f':
case ' ': case '\t':
tk_type = TOKEN_BLANK; break;
case '\n':
// newline: bump the line counter and reset the column
lexer->loc.line += 1;
lexer->loc.col = -1;
lexer->loc.len = 1;
tk_type = TOKEN_BLANK;
break;
case '#':
// TODO: handle preprocessor directives (#line etc.) once a preprocessor exists
LEX_WARN("Macros belong to the preprocessor, not the lexer; this line is ignored");
goto_newline(lexer);
tk_type = TOKEN_BLANK;
goto END;
case '\0':
// EOF
tk_type = TOKEN_EOF;
goto END;
case '\'':
parse_char_literal(lexer, token);
literal = token->val;
tk_type = TOKEN_CHAR_LITERAL;
goto END;
case '"':
parse_string_literal(lexer, token);
literal = token->val;
tk_type = TOKEN_STRING_LITERAL;
goto END;
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
parse_number(lexer, token);
// TODO: simplify; parse_number already fills token->val and sub_type
literal = token->val;
tk_type = token->sub_type;
goto END;
case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
case 's': case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z':
case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':case 'Y': case 'Z':
case '_':
// TOKEN_IDENT
// A leading 'L' followed by a quote introduces a wide literal; the switch
// already consumed the first char, so it lives at peek[-1].
if (peek[-1] == 'L' && (*peek == '\'' || *peek == '"')) {
LEX_ERROR("unsupported wide-character literal with `L` prefix");
}
while (1) {
if (peek == lexer->end_ptr) {
LEX_ERROR("unsupport outof 64 length identifier");
}
if ((*peek >= 'a' && *peek <= 'z') || (*peek >= 'A' && *peek <= 'Z') ||
(*peek == '_') || (*peek >= '0' && *peek <= '9')) {
peek++;
continue;
}
break;
}
int len = peek - lexer->cur_ptr;
int res = keyword_cmp((const char*)lexer->cur_ptr, len);
if (res == -1) {
// temporarily NUL-terminate in place so the buffer slice can be interned
char prev = lexer->cur_ptr[len];
lexer->cur_ptr[len] = '\0';
literal.str = strpool_intern(lexer->strpool, lexer->cur_ptr);
lexer->cur_ptr[len] = prev;
tk_type = TOKEN_IDENT; break;
} else {
tk_type = keywords[res].tok; break;
}
default:
LEX_ERROR("unsupport char in sourse code `%c`", *(lexer->cur_ptr));
break;
}
lexer->loc.len = peek - lexer->cur_ptr;
lexer->cur_ptr = peek;
END:
lexer->loc.col += lexer->loc.len;
lexer->loc.len = 0;
token->val = literal;
token->sub_type = tk_type;
token->loc = lexer->loc;
static const tok_basic_type_t tok_type_map[] = {
// ordinary tokens use #str
#define X(str, basic, tok) [tok] = basic,
TOKEN_TABLE
#undef X
// keywords use #name and all map to TK_BASIC_KEYWORD
#define X(name, std, tok) [tok] = TK_BASIC_KEYWORD,
KEYWORD_TABLE
#undef X
};
token->type = tok_type_map[tk_type];
LEX_DEBUG("get token `%s` in %s:%d:%d", get_tok_name(tk_type),
token->loc.fname, token->loc.line, token->loc.col);
}
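/* tok_type_map illustration (the X row shown is hypothetical): a
 * TOKEN_TABLE entry such as
 *     X(" ", TK_BASIC_WHITESPACE, TOKEN_BLANK)
 * becomes [TOKEN_BLANK] = TK_BASIC_WHITESPACE, while every KEYWORD_TABLE row
 * is forced to TK_BASIC_KEYWORD; token->type carries this coarse class and
 * token->sub_type keeps the precise cc_tktype_t. */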
// get_token also emits whitespace/comment tokens the parser never consumes
void get_valid_token(lexer_t* lexer, tok_t* token) {
tok_basic_type_t type;
do {
get_token(lexer, token);
type = token->type;
Assert(type != TK_BASIC_INVALID);
} while (type == TK_BASIC_WHITESPACE || type == TK_BASIC_COMMENT);
}
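/* Typical driver loop (sketch; relies on TOKEN_EOF to terminate):
 *
 *   tok_t tok;
 *   do {
 *       get_valid_token(&lx, &tok);
 *       // ... hand tok to the parser ...
 *   } while (tok.sub_type != TOKEN_EOF);
 */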