ZZY 05c637e594 refactor: 重构前端代码并添加日志功能
- 重命名和重构了多个文件,包括 lexer、parser 和 AST 相关代码
- 添加了日志功能,使用 LOG_* 宏替代原有的 error 和 warn 函数
- 优化了错误处理和内存分配方式
- 调整了代码结构,提高了模块化和可读性
2025-03-19 12:22:55 +08:00

514 lines
15 KiB
C

/**
* 仿照LCCompiler的词法分析部分
*
* 如下为LCC的README in 2025.2
This hierarchy is the distribution for lcc version 4.2.
lcc version 3.x is described in the book "A Retargetable C Compiler:
Design and Implementation" (Addison-Wesley, 1995, ISBN 0-8053-1670-1).
There are significant differences between 3.x and 4.x, most notably in
the intermediate code. For details, see
https://drh.github.io/lcc/documents/interface4.pdf.
VERSION 4.2 IS INCOMPATIBLE WITH EARLIER VERSIONS OF LCC. DO NOT
UNLOAD THIS DISTRIBUTION ON TOP OF A 3.X DISTRIBUTION.
LCC is a C89 ("ANSI C") compiler designed to be highly retargetable.
LOG describes the changes since the last release.
CPYRIGHT describes the conditions under which you can use, copy, modify, and
distribute lcc or works derived from lcc.
doc/install.html is an HTML file that gives a complete description of
the distribution and installation instructions.
Chris Fraser / cwf@aya.yale.edu
David Hanson / drh@drhanson.net
*/
#include <lib/core.h>
#include "lexer_log.h"
#include "token.h"
#include "lexer.h"
// Keyword lookup table, expanded from the project's KEYWORD_TABLE X-macro.
// Each entry maps the keyword's spelling to its C-standard classification
// and its token type.
// NOTE(review): keyword_cmp() binary-searches this array, so KEYWORD_TABLE
// must list keywords in ascending ASCII order — confirm in token.h.
static const struct {
const char* name;
enum CSTD_KEYWORD std_type;
tok_type_t tok;
} keywords[] = {
#define X(name, std_type, tok, ...) { #name, std_type, tok },
KEYWORD_TABLE
#undef X
};
// by using binary search to find the keyword
// Binary-search `keywords` for the identifier name[0..len).
// `name` need not be NUL-terminated; the table must be sorted by spelling.
// Returns the table index of the keyword, or -1 if it is not a keyword.
static inline int keyword_cmp(const char* name, int len) {
    int lo = 0;
    int hi = (int)(sizeof(keywords) / sizeof(keywords[0])) - 1;
    while (lo <= hi) {
        int mid = lo + (hi - lo) / 2;
        const char* entry = keywords[mid].name;
        int diff = 0;
        // Compare at most `len` characters of the (unterminated) input
        // against the NUL-terminated table entry.
        for (int i = 0; i < len; i++) {
            if (name[i] != entry[i]) {
                diff = (unsigned char)name[i] - (unsigned char)entry[i];
                break;
            }
            if (name[i] == '\0') break; // defensive: stop at a terminator
        }
        if (diff == 0) {
            if (entry[len] == '\0') return mid; // same length: exact match
            diff = -1; // table entry is longer, so the input sorts before it
        }
        if (diff < 0) {
            hi = mid - 1;
        } else {
            lo = mid + 1;
        }
    }
    return -1; // not a keyword
}
// Prepare a lexer over `stream`, reading through the `sread` callback.
// `file_name` is currently unused (kept for interface stability).
// Both buffer pointers start at the (zeroed) buffer, so the first
// flush_buffer() call performs the initial fill.
void init_lexer(lexer_t* lexer, const char* file_name, void* stream, lexer_sread_fn sread) {
    init_lib_core();
    lexer->cur_ptr = (unsigned char*)lexer->buffer;
    lexer->end_ptr = lexer->cur_ptr; // empty: nothing buffered yet
    lexer->index = 1;
    lexer->line = 1;
    lexer->stream = stream;
    lexer->sread = sread;
    // Zero the whole read buffer.
    int capacity = sizeof(lexer->buffer) / sizeof(lexer->buffer[0]);
    for (int i = 0; i < capacity; i++) {
        lexer->buffer[i] = 0;
    }
}
/**
 * Slide the unread bytes [cur_ptr, end_ptr) to the front of the buffer and
 * refill the remainder from the input stream.  On a short read (EOF) a
 * '\0' sentinel is appended so scanners can detect end-of-input by value.
 *
 * WARNING: any raw pointer into the old buffer contents (e.g. a saved
 * `peek`) is invalidated by this call; callers must rebase by offset.
 *
 * BUGFIX: the old code did `end_ptr += got_size` without first rebasing
 * end_ptr to the compacted data (buffer + num).  Whenever cur_ptr had
 * advanced past the buffer start, end_ptr drifted beyond the real data —
 * and could even point past the end of the buffer itself.
 */
static void flush_buffer(lexer_t* lexer) {
    int num = lexer->end_ptr - lexer->cur_ptr;
    // Move the not-yet-consumed tail to the start of the buffer.
    for (int i = 0; i < num; i++) {
        lexer->buffer[i] = lexer->cur_ptr[i];
    }
    lexer->cur_ptr = (unsigned char*)lexer->buffer;
    // Rebase end_ptr onto the compacted data before appending new bytes.
    lexer->end_ptr = (unsigned char*)lexer->buffer + num;
    int read_size = LEXER_BUFFER_SIZE - num;
    // TODO rt_size_t to int maybe lose precision
    int got_size = lexer->sread(lexer->buffer + num, read_size, 1, read_size, lexer->stream);
    if (got_size < 0) {
        LEX_ERROR("lexer read error");
    } else if (got_size < read_size) {
        // Short read means end of input: append the '\0' sentinel.
        lexer->end_ptr += got_size;
        lexer->end_ptr[0] = '\0'; // EOF
        lexer->end_ptr++;
    } else if (got_size == read_size) {
        lexer->end_ptr += got_size;
    } else {
        LEX_ERROR("lexer read error imposible got_size > read_size maybe overflow?");
    }
}
/**
 * Advance cur_ptr until it sits on the next '\n' or on the EOF sentinel
 * ('\0').  Used to skip line comments and preprocessor lines.
 *
 * BUGFIX: the old code did `cur_ptr--; cur_ptr++;` around flush_buffer(),
 * momentarily forming a pointer one before the start of the buffer, which
 * is undefined behavior.  After a flush, cur_ptr already points at the
 * next unread byte, so simply skip the increment for that iteration.
 */
static void goto_newline(lexer_t* lexer) {
    do {
        if (lexer->cur_ptr == lexer->end_ptr) {
            // Refill; cur_ptr now points at the first new byte.
            flush_buffer(lexer);
        } else {
            lexer->cur_ptr++;
        }
    } while (*lexer->cur_ptr != '\n' && *lexer->cur_ptr != '\0');
}
// Skip forward past the closing "*/" of a block comment.  cur_ptr points
// just after the opening "/*".  Stops silently on the EOF sentinel ('\0')
// if the comment is never closed.
static void goto_block_comment(lexer_t* lexer) {
    for (;;) {
        // Guarantee two readable bytes for the "*/" lookahead.
        if (lexer->end_ptr - lexer->cur_ptr < 2) {
            flush_buffer(lexer);
        }
        unsigned char head = lexer->cur_ptr[0];
        if (head == '\0') {
            break; // end of input inside the comment
        }
        if (head == '*' && lexer->cur_ptr[1] == '/') {
            lexer->cur_ptr += 2; // consume the terminator
            break;
        }
        lexer->cur_ptr++;
    }
}
// Decode the character that follows a backslash in a char/string literal.
// `peek` points at the escape character itself (the byte after '\\').
// Returns the decoded value, or -1 after reporting an unknown escape.
// TODO numeric escapes (\ooo octal, \xhh hex) beyond the lone '0' are
// still not handled.
static char got_slash(unsigned char* peek) {
    switch (*peek) {
    case '\\': return '\\';
    case '\'': return '\'';
    case '\"': return '\"';
    case '\?': return '\?';
    case '0': return '\0';
    case 'a': return '\a'; // BUGFIX: ISO C alert escape was missing
    case 'b': return '\b';
    case 'f': return '\f';
    case 'n': return '\n';
    case 'r': return '\r';
    case 't': return '\t';
    case 'v': return '\v';
    default: break;
    }
    LEX_ERROR("Unknown escape character");
    return -1;
}
// Scan a character literal ('x' or '\n'); cur_ptr sits on the opening
// quote.  Stores the decoded character in token->val.ch and leaves
// cur_ptr just past the closing quote.
static void parse_char_literal(lexer_t* lexer, tok_t* token) {
    unsigned char* p = lexer->cur_ptr + 1; // skip the opening quote
    char value;
    if (*p != '\\') {
        value = *p;
        p++;
    } else {
        p++;                        // skip the backslash
        value = got_slash(p);       // decode the escape character
        p++;
    }
    if (*p != '\'') LEX_ERROR("Unclosed character literal");
    p++; // consume the closing quote
    token->val.ch = value;
    lexer->cur_ptr = p;
    token->val.have = 1;
    token->type = TOKEN_CHAR_LITERAL;
}
/**
 * Scan a string literal; cur_ptr sits on the opening '"'.  Copies the
 * (escape-decoded) contents into a freshly allocated buffer stored in
 * token->val.str and leaves cur_ptr just past the closing quote.
 *
 * BUGFIXES vs the old version:
 *  - flush_buffer() compacts the buffer and invalidates raw pointers; the
 *    old code kept using a stale `peek`.  We now rebase it by offset from
 *    cur_ptr (which flush preserves relative to the unread data).
 *  - `*peek` was dereferenced before the bounds check.
 *  - an unterminated string used to spin copying '\0' bytes until the
 *    "String too long" limit; now it reports the real problem.
 */
static void parse_string_literal(lexer_t* lexer, tok_t* token) {
    unsigned char* peek = lexer->cur_ptr + 1;
    // TODO string literal size check
    char* dest = token->val.str = rt._malloc(LEXER_MAX_TOKEN_SIZE + 1);
    int len = 0;
    for (;;) {
        // Refill before dereferencing; rebase peek across the flush.
        if (peek >= lexer->end_ptr) {
            int off = peek - lexer->cur_ptr;
            flush_buffer(lexer);
            peek = lexer->cur_ptr + off;
        }
        unsigned char c = *peek;
        if (c == '"') break;
        if (c == '\0') LEX_ERROR("Unclosed string literal");
        if (c == '\\') { // decode escape sequence
            peek++;
            if (peek >= lexer->end_ptr) { // escape may straddle a refill
                int off = peek - lexer->cur_ptr;
                flush_buffer(lexer);
                peek = lexer->cur_ptr + off;
            }
            c = (unsigned char)got_slash(peek);
        }
        if (len >= LEXER_MAX_TOKEN_SIZE) LEX_ERROR("String too long");
        dest[len++] = (char)c;
        peek++;
    }
    dest[len] = '\0';
    lexer->cur_ptr = peek + 1; // step past the closing quote
    token->val.have = 1;
    token->type = TOKEN_STRING_LITERAL;
}
/**
 * Parse an integer or floating-point literal starting at lexer->cur_ptr.
 * Supports decimal, octal (leading 0) and hex (0x/0X) integers, plus
 * decimal floats with optional fraction and e/E exponent.  On return,
 * cur_ptr points just past the literal; token->val holds .ll for
 * TOKEN_INT_LITERAL or .d for TOKEN_FLOAT_LITERAL.
 *
 * BUGFIXES vs the old version:
 *  - the base-detection switch fell through 'x'/'X' into `default:
 *    base = 8`, so hex literals were never recognized (and the 'x' was
 *    never consumed);
 *  - "0.5" / "0e1" were forced to base 8, skipping the fraction/exponent
 *    branches (they require base == 10);
 *  - the exponent was parsed but silently discarded, and `float_val` was
 *    never seeded from `int_val` for forms like "1e3".
 */
static void parse_number(lexer_t* lexer, tok_t* token) {
    unsigned char* peek = lexer->cur_ptr;
    int base = 10;
    int is_float = 0;
    long long int_val = 0;
    double float_val = 0.0;
    // Detect the base from the prefix.  A lone "0", "0.5" or "0e1" stays
    // decimal so the fraction/exponent parsing below still applies.
    if (*peek == '0') {
        peek++;
        if (*peek == 'x' || *peek == 'X') {
            base = 16;
            peek++; // consume the 'x'/'X'
        } else if (*peek >= '0' && *peek <= '7') {
            base = 8;
        }
    }
    // Integer part.
    while (1) {
        int digit = -1;
        if (*peek >= '0' && *peek <= '9') {
            digit = *peek - '0';
        } else if (base == 16) {
            if (*peek >= 'a' && *peek <= 'f') digit = *peek - 'a' + 10;
            else if (*peek >= 'A' && *peek <= 'F') digit = *peek - 'A' + 10;
        }
        if (digit < 0 || digit >= base) break;
        int_val = int_val * base + digit;
        peek++;
    }
    // Fraction part (decimal literals only).
    if (*peek == '.' && base == 10) {
        is_float = 1;
        float_val = (double)int_val;
        double fraction = 1.0;
        peek++;
        while (*peek >= '0' && *peek <= '9') {
            float_val = float_val * 10.0 + (*peek - '0');
            fraction *= 10.0;
            peek++;
        }
        float_val /= fraction;
    }
    // Exponent part (decimal only).  "1e3" is a float even without '.'.
    if ((*peek == 'e' || *peek == 'E') && base == 10) {
        if (!is_float) {
            is_float = 1;
            float_val = (double)int_val;
        }
        peek++;
        int exp_sign = 1;
        int exponent = 0;
        if (*peek == '+') peek++;
        else if (*peek == '-') {
            exp_sign = -1;
            peek++;
        }
        while (*peek >= '0' && *peek <= '9') {
            exponent = exponent * 10 + (*peek - '0');
            peek++;
        }
        // Apply the exponent without pulling in libm's pow().
        double scale = 1.0;
        while (exponent-- > 0) scale *= 10.0;
        if (exp_sign < 0) float_val /= scale;
        else float_val *= scale;
    }
    // Store the result.
    lexer->cur_ptr = peek;
    token->val.have = 1;
    if (is_float) {
        token->val.d = float_val;
        token->type = TOKEN_FLOAT_LITERAL;
    } else {
        token->val.ll = int_val;
        token->type = TOKEN_INT_LITERAL;
    }
}
#define GOT_ONE_TOKEN_BUF_SIZE 64
// /zh/c/language/operator_arithmetic.html
/**
 * Scan one raw token starting at lexer->cur_ptr and fill in `token`.
 * May produce pseudo-tokens (TOKEN_FLUSH, TOKEN_LINE_COMMENT,
 * TOKEN_BLOCK_COMMENT) that callers skip via get_valid_token().
 *
 * BUGFIXES vs the old version:
 *  - the wide-char guard compared *peek to two different values at once
 *    ((*peek=='L' && *peek=='\'')) and was always false;
 *  - a dead `return;` followed `return parse_char_literal(...)`;
 *  - `constant.have`/`constant.str` were assigned twice in the
 *    identifier path;
 *  - a local named `strlen` shadowed the stdlib function name.
 */
void get_token(lexer_t* lexer, tok_t* token) {
    // Keep at least GOT_ONE_TOKEN_BUF_SIZE readable bytes so the scanner
    // below can look ahead without per-character bounds checks.
    if (lexer->end_ptr - lexer->cur_ptr < GOT_ONE_TOKEN_BUF_SIZE) {
        flush_buffer(lexer);
    }
    register unsigned char* peek = lexer->cur_ptr;
    // Fast-skip blanks (spaces and tabs).
    while (*peek == ' ' || *peek == '\t') {
        if (peek == lexer->end_ptr) {
            break;
        }
        peek++;
    }
    if (peek != lexer->cur_ptr) {
        // Report the skipped run as TOKEN_FLUSH.
        lexer->cur_ptr = peek;
        token->type = TOKEN_FLUSH;
    }
    tok_type_t tok = TOKEN_INIT;
    tok_val_t constant;
    constant.have = 0;
    // Dispatch on the first character (consumed here by *peek++).
    switch (*peek++) {
    case '=':
        switch (*peek++) {
        case '=': tok = TOKEN_EQ; break;
        default: peek--, tok = TOKEN_ASSIGN; break;
        } break;
    case '+':
        switch (*peek++) {
        case '+': tok = TOKEN_ADD_ADD; break;
        case '=': tok = TOKEN_ASSIGN_ADD; break;
        default: peek--, tok = TOKEN_ADD; break;
        } break;
    case '-':
        switch (*peek++) {
        case '-': tok = TOKEN_SUB_SUB; break;
        case '=': tok = TOKEN_ASSIGN_SUB; break;
        case '>': tok = TOKEN_DEREF; break;
        default: peek--, tok = TOKEN_SUB; break;
        } break;
    case '*':
        switch (*peek++) {
        case '=': tok = TOKEN_ASSIGN_MUL; break;
        default: peek--, tok = TOKEN_MUL; break;
        } break;
    case '/':
        switch (*peek++) {
        case '=': tok = TOKEN_ASSIGN_DIV; break;
        case '/': {
            // Line comment: consume through to the next newline.
            goto_newline(lexer);
            tok = TOKEN_LINE_COMMENT;
            goto END;
        }
        case '*': {
            // Block comment: consume through the closing "*/".
            lexer->cur_ptr = peek;
            goto_block_comment(lexer);
            tok = TOKEN_BLOCK_COMMENT;
            goto END;
        }
        default: peek--, tok = TOKEN_DIV; break;
        } break;
    case '%':
        switch (*peek++) {
        case '=': tok = TOKEN_ASSIGN_MOD; break;
        default: peek--, tok = TOKEN_MOD; break;
        } break;
    case '&':
        switch (*peek++) {
        case '&': tok = TOKEN_AND_AND; break;
        case '=': tok = TOKEN_ASSIGN_AND; break;
        default: peek--, tok = TOKEN_AND; break;
        } break;
    case '|':
        switch (*peek++) {
        case '|': tok = TOKEN_OR_OR; break;
        case '=': tok = TOKEN_ASSIGN_OR; break;
        default: peek--, tok = TOKEN_OR; break;
        } break;
    case '^':
        switch (*peek++) {
        case '=': tok = TOKEN_ASSIGN_XOR; break;
        default: peek--, tok = TOKEN_XOR; break;
        } break;
    case '<':
        switch (*peek++) {
        case '=': tok = TOKEN_LE; break;
        case '<': tok = (*peek == '=') ? (peek++, TOKEN_ASSIGN_L_SH) : TOKEN_L_SH; break;
        default: peek--, tok = TOKEN_LT; break;
        } break;
    case '>':
        switch (*peek++) {
        case '=': tok = TOKEN_GE; break;
        case '>': tok = (*peek == '=') ? (peek++, TOKEN_ASSIGN_R_SH) : TOKEN_R_SH; break;
        default: peek--, tok = TOKEN_GT; break;
        } break;
    case '~':
        tok = TOKEN_BIT_NOT; break;
    case '!':
        switch (*peek++) {
        case '=': tok = TOKEN_NEQ; break;
        default: peek--, tok = TOKEN_NOT; break;
        } break;
    case '[':
        tok = TOKEN_L_BRACKET; break;
    case ']':
        tok = TOKEN_R_BRACKET; break;
    case '(':
        tok = TOKEN_L_PAREN; break;
    case ')':
        tok = TOKEN_R_PAREN; break;
    case '{':
        tok = TOKEN_L_BRACE; break;
    case '}':
        tok = TOKEN_R_BRACE; break;
    case ';':
        tok = TOKEN_SEMICOLON; break;
    case ',':
        tok = TOKEN_COMMA; break;
    case ':':
        tok = TOKEN_COLON; break;
    case '.':
        if (peek[0] == '.' && peek[1] == '.') {
            peek += 2;
            tok = TOKEN_ELLIPSIS;
        } else {
            tok = TOKEN_DOT;
        }
        break;
    case '?':
        tok = TOKEN_COND; break;
    case '\v': case '\r': case '\f': // FIXME it parse as a blank character
        tok = TOKEN_FLUSH; break;
    case '\n':
        // Newlines are consumed as flush tokens so the line count stays right.
        lexer->line++;
        tok = TOKEN_FLUSH; break;
    case '#':
        LEX_WARN("Marroc does not support in lexer rather in preprocessor, it will be ignored");
        goto_newline(lexer);
        tok = TOKEN_FLUSH;
        goto END;
    case '\0':
        // EOF sentinel; cur_ptr is deliberately not advanced, so repeated
        // calls keep returning TOKEN_EOF.
        tok = TOKEN_EOF;
        goto END;
    case '\'':
        return parse_char_literal(lexer, token);
    case '"':
        return parse_string_literal(lexer, token);
    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
        return parse_number(lexer, token);
    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
    case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
    case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
    case 's': case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z':
    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
    case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
    case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
    case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':case 'Y': case 'Z':
    case '_':
        // TOKEN_IDENT.  Wide literals (L'x' / L"s") are unsupported: the
        // first identifier char is at cur_ptr, peek points one past it.
        if (lexer->cur_ptr[0] == 'L' && (*peek == '\'' || *peek == '"')) {
            LEX_ERROR("unsupport wide-character char literal by `L` format");
        }
        // Consume the remaining identifier characters.
        while (1) {
            if (peek == lexer->end_ptr) {
                LEX_ERROR("unsupport outof 64 length identifier");
            }
            if ((*peek >= 'a' && *peek <= 'z') || (*peek >= 'A' && *peek <= 'Z') ||
                (*peek == '_') || (*peek >= '0' && *peek <= '9')) {
                peek++;
                continue;
            }
            break;
        }
        int res = keyword_cmp((const char*)lexer->cur_ptr, peek - (lexer->cur_ptr));
        if (res == -1) {
            // Not a keyword: copy the identifier text out of the buffer.
            int id_len = peek - lexer->cur_ptr;
            unsigned char* str = rt._malloc(id_len + 1);
            for (int i = 0; i < id_len; i++) {
                str[i] = lexer->cur_ptr[i];
            }
            str[id_len] = '\0';
            constant.have = 1;
            constant.str = (char*)str;
            tok = TOKEN_IDENT; break;
        } else {
            tok = keywords[res].tok; break;
        }
    default:
        LEX_ERROR("unsupport char in sourse code `%c`", *(lexer->cur_ptr));
        break;
    }
    lexer->cur_ptr = peek;
END:
    token->val = constant;
    token->type = tok;
    LEX_DEBUG("get token `%s` (ch: %c, int: %d)", get_tok_name(token->type), token->val.ch, token->val.i);
}
// Return the next token the parser cares about, silently skipping the
// pseudo-tokens produced by get_token (whitespace flushes and comments).
void get_valid_token(lexer_t* lexer, tok_t* token) {
    for (;;) {
        get_token(lexer, token);
        tok_type_t t = token->type;
        if (t != TOKEN_FLUSH && t != TOKEN_LINE_COMMENT && t != TOKEN_BLOCK_COMMENT) {
            break;
        }
    }
}