diff --git a/libs/lex_parser/cbuild.toml b/libs/lex_parser/cbuild.toml new file mode 100644 index 0000000..1d2d79b --- /dev/null +++ b/libs/lex_parser/cbuild.toml @@ -0,0 +1,5 @@ +[package] +name = "smcc_lex_parser" +version = "0.1.0" + +dependencies = [{ name = "libcore", path = "../../runtime/libcore" }] diff --git a/libs/lex_parser/include/lex_parser.h b/libs/lex_parser/include/lex_parser.h new file mode 100644 index 0000000..1bb0557 --- /dev/null +++ b/libs/lex_parser/include/lex_parser.h @@ -0,0 +1,17 @@ +#ifndef __SMCC_LEX_PARSER_H__ +#define __SMCC_LEX_PARSER_H__ + +#include + +int lex_parse_char(core_stream_t *input, core_pos_t *pos); +cbool lex_parse_string(core_stream_t *input, core_pos_t *pos, + cstring_t *output); +cbool lex_parse_number(core_stream_t *input, core_pos_t *pos, usize *output); +cbool lex_parse_identifier(core_stream_t *input, core_pos_t *pos, + cstring_t *output); +void lex_parse_skip_endline(core_stream_t *input, core_pos_t *pos); +void lex_parse_skip_block_comment(core_stream_t *input, core_pos_t *pos); +void lex_parse_skip_line(core_stream_t *input, core_pos_t *pos); +void lex_parse_skip_whitespace(core_stream_t *input, core_pos_t *pos); + +#endif /* __SMCC_LEX_PARSER_H__ */ diff --git a/libs/lex_parser/src/lex_parser.c b/libs/lex_parser/src/lex_parser.c new file mode 100644 index 0000000..2641d0a --- /dev/null +++ b/libs/lex_parser/src/lex_parser.c @@ -0,0 +1,406 @@ +#include + +static inline cbool is_next_line(int ch) { return ch == '\n' || ch == '\r'; } + +void lex_parse_skip_endline(core_stream_t *input, core_pos_t *pos) { + core_stream_reset_char(input); + int ch = core_stream_peek_char(input); + if (ch == '\r') { + core_stream_next_char(input); + ch = core_stream_peek_char(input); + if (ch == '\n') { + core_stream_next_char(input); + } + core_pos_next_line(pos); + } else if (ch == '\n') { + core_stream_next_char(input); + core_pos_next_line(pos); + } else { + LOG_WARN("not a newline character"); + } +} + +/** + * @brief + * + * @param ch + * @return int + * https://cppreference.cn/w/c/language/escape + * `\'` 单引号 在 ASCII 编码中为字节 0x27 + * `\"` 双引号 在 ASCII 编码中为字节 0x22 + * `\?` 问号 在 ASCII 编码中为字节 0x3f + * `\\` 反斜杠 在 ASCII 编码中为字节 0x5c + * `\a` 响铃 在 ASCII 编码中为字节 0x07 + * `\b` 退格 在 ASCII 编码中为字节 0x08 + * `\f` 换页 - 新页 在 ASCII 编码中为字节 0x0c + * `\n` 换行 - 新行 在 ASCII 编码中为字节 0x0a + * `\r` 回车 在 ASCII 编码中为字节 0x0d + * `\t` 水平制表符 在 ASCII 编码中为字节 0x09 + * `\v` 垂直制表符 在 ASCII 编码中为字节 0x0b + */ +static inline int got_simple_escape(int ch) { + /* clang-format off */ + #define CASE(ch) case ch: return ch; + switch (ch) { + case '\'': return '\''; + case '\"': return '\"'; + case '\?': return '\?'; + case '\\': return '\\'; + case 'a': return '\a'; + case 'b': return '\b'; + case 'f': return '\f'; + case 'n': return '\n'; + case 'r': return '\r'; + case 't': return '\t'; + case 'v': return '\v'; + default: return -1; + } + /* clang-format on */ +} + +void lex_parse_skip_line(core_stream_t *input, core_pos_t *pos) { + core_stream_t *stream = input; + Assert(stream != null); + core_stream_reset_char(stream); + while (1) { + int ch = core_stream_peek_char(stream); + + if (ch == core_stream_eof) { + return; + } + + // TODO endline + if (is_next_line(ch)) { + lex_parse_skip_endline(stream, pos); + return; + } else { + core_stream_next_char(stream); + core_pos_next(pos); + } + } +} + +void lex_parse_skip_block_comment(core_stream_t *input, core_pos_t *pos) { + core_stream_t *stream = input; + int ch; + core_stream_reset_char(stream); + ch = core_stream_next_char(stream); + core_pos_next(pos); + // FIXME Assertion + Assert(ch == '/'); + ch = core_stream_next_char(stream); + core_pos_next(pos); + Assert(ch == '*'); + + // all ready match `/*` + while (1) { + core_stream_reset_char(stream); + ch = core_stream_peek_char(stream); + + if (ch == core_stream_eof) { + LOG_WARN("Unterminated block comment"); + return; + } + + if (is_next_line(ch)) { + lex_parse_skip_endline(stream, pos); + continue; + } + core_stream_next_char(stream); + core_pos_next(pos); + if (ch == '*') { + ch = core_stream_peek_char(stream); + if (ch == '/') { + core_stream_next_char(stream); + core_pos_next(pos); + return; + } + } + } +} + +void lex_parse_skip_whitespace(core_stream_t *input, core_pos_t *pos) { + core_stream_t *stream = input; + Assert(stream != null); + core_stream_reset_char(stream); + while (1) { + int ch = core_stream_next_char(stream); + + if (ch == core_stream_eof) { + return; + } + + core_pos_next(pos); + } +} + +static inline cbool _lex_parse_uint(core_stream_t *input, core_pos_t *pos, + int base, usize *output) { + Assert(input != null && pos != null); + if (input == null || pos == null) { + return false; + } + Assert(base == 2 || base == 8 || base == 10 || base == 16); + core_stream_reset_char(input); + int ch, tmp; + usize n = 0; + usize offset = pos->offset; + while (1) { + ch = core_stream_peek_char(input); + + if (ch == core_stream_eof) { + break; + } else if (ch >= 'a' && ch <= 'z') { + tmp = ch - 'a' + 10; + } else if (ch >= 'A' && ch <= 'Z') { + tmp = ch - 'A' + 10; + } else if (ch >= '0' && ch <= '9') { + tmp = ch - '0'; + } else { + break; + } + + if (tmp >= base) { + LOG_ERROR("Invalid digit"); + return false; + } + + core_stream_next_char(input); + core_pos_next(pos); + n = n * base + tmp; + // TODO number overflow + } + if (offset == pos->offset) { + // None match any number + return false; + } + *output = n; + return true; +} + +/** + * @brief + * + * @param input + * @param pos + * @return int + * https://cppreference.cn/w/c/language/character_constant + */ +int lex_parse_char(core_stream_t *input, core_pos_t *pos) { + core_stream_t *stream = input; + core_stream_reset_char(stream); + int ch = core_stream_peek_char(stream); + int ret = core_stream_eof; + + if (ch == core_stream_eof) { + LOG_WARN("Unexpected EOF at begin"); + goto ERR; + } else if (ch != '\'') { + LOG_WARN("Unexpected character '%c' at begin", ch); + goto ERR; + } + core_stream_next_char(stream); + core_pos_next(pos); + + ch = core_stream_next_char(stream); + core_pos_next(pos); + + if (ch == core_stream_eof) { + LOG_WARN("Unexpected EOF at middle"); + goto ERR; + } else if (ch == '\\') { + ch = core_stream_next_char(stream); + core_pos_next(pos); + if (ch == '0') { + // 数字转义序列 + // \nnn 任意八进制值 码元 nnn + // FIXME 这里如果返回 0 理论上为错误但是恰好与正确值相同 + ret = 0; + _lex_parse_uint(stream, pos, 8, (usize *)&ret); + } else if (ch == 'x') { + // TODO https://cppreference.cn/w/c/language/escape + // \xn... 任意十六进制值 码元 n... (任意数量的十六进制数字) + // 通用字符名 + TODO(); + } else if (ch == 'u' || ch == 'U') { + // \unnnn (C99 起) Unicode 值在允许范围内; + // 可能产生多个码元 码点 U+nnnn + // \Unnnnnnnn (C99 起) Unicode 值在允许范围内; + // 可能产生多个码元 码点 U+nnnnnnnn + TODO(); + } else if ((ret = got_simple_escape(ch)) == -1) { + LOG_ERROR("Invalid escape character"); + goto ERR; + } + } else { + ret = ch; + } + if ((ch = core_stream_next_char(stream)) != '\'') { + LOG_ERROR("Unclosed character literal '%c' at end, expect `'`", ch); + core_pos_next(pos); + goto ERR; + } + + return ret; +ERR: + return core_stream_eof; +} + +/** + * @brief + * + * @param input + * @param pos + * @param output + * @return cbool + * https://cppreference.cn/w/c/language/string_literal + */ +cbool lex_parse_string(core_stream_t *input, core_pos_t *pos, + cstring_t *output) { + core_stream_t *stream = input; + core_stream_reset_char(stream); + int ch = core_stream_peek_char(stream); + + Assert(cstring_is_empty(output)); + if (ch == core_stream_eof) { + LOG_WARN("Unexpected EOF at begin"); + goto ERR; + } else if (ch != '"') { + LOG_WARN("Unexpected character '%c' at begin", ch); + goto ERR; + } + core_stream_next_char(stream); + core_pos_next(pos); + + cstring_t str = cstring_from_cstr(""); + while (1) { + ch = core_stream_peek_char(stream); + + if (ch == core_stream_eof) { + LOG_ERROR("Unexpected EOF at string literal"); + goto ERR; + } else if (is_next_line(ch)) { + LOG_ERROR("Unexpected newline at string literal"); + goto ERR; + } else if (ch == '\\') { + // TODO bad practice and maybe bugs here + core_stream_next_char(stream); + ch = core_stream_next_char(stream); + int val = got_simple_escape(ch); + if (val == -1) { + LOG_ERROR("Invalid escape character it is \\%c [%d]", ch, ch); + } else { + cstring_push(&str, val); + continue; + } + } else if (ch == '"') { + core_stream_next_char(stream); + core_pos_next(pos); + break; + } + + core_stream_next_char(stream); + core_pos_next(pos); + cstring_push(&str, ch); + } + + *output = str; + return true; +ERR: + cstring_free(&str); + return false; +} + +/** + * @brief + * + * @param input + * @param pos + * @param output + * @return cbool + * https://cppreference.cn/w/c/language/integer_constant + */ +cbool lex_parse_number(core_stream_t *input, core_pos_t *pos, usize *output) { + core_stream_t *stream = input; + core_stream_reset_char(stream); + int ch = core_stream_peek_char(stream); + int base = 0; + if (ch == core_stream_eof) { + LOG_WARN("Unexpected EOF at begin"); + goto ERR; + } else if (ch == '0') { + ch = core_stream_peek_char(stream); + if (ch == 'x' || ch == 'X') { + base = 16; + core_stream_next_char(stream); + core_pos_next(pos); + core_stream_next_char(stream); + core_pos_next(pos); + } else if (ch == 'b' || ch == 'B') { + // FIXME C23 external integer base + base = 2; + core_stream_next_char(stream); + core_pos_next(pos); + core_stream_next_char(stream); + core_pos_next(pos); + } else if (ch >= '0' && ch <= '7') { + base = 8; + core_stream_next_char(stream); + core_pos_next(pos); + } else if (ch == '9' || ch == '8') { + LOG_ERROR("Invalid digit '%d' in octal literal", ch); + return false; + } else { + base = 10; + } + } else { + base = 10; + } + + // 解析整数部分 + core_stream_reset_char(stream); + usize n; + if (_lex_parse_uint(stream, pos, base, &n) == false) { + return false; + } + *output = n; + return true; +ERR: + return false; +} + +/** + * @brief + * + * @param input + * @param pos + * @param output + * @return cbool + * https://cppreference.cn/w/c/language/identifier + */ +cbool lex_parse_identifier(core_stream_t *input, core_pos_t *pos, + cstring_t *output) { + Assert(cstring_is_empty(output)); + core_stream_t *stream = input; + core_stream_reset_char(stream); + int ch = core_stream_peek_char(stream); + + if (ch == core_stream_eof) { + LOG_WARN("Unexpected EOF at begin"); + } else if (ch == '_' || (ch >= 'a' && ch <= 'z') || + (ch >= 'A' && ch <= 'Z')) { + while (1) { + cstring_push(output, ch); + core_stream_next_char(stream); + core_pos_next(pos); + ch = core_stream_peek_char(stream); + if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || + (ch == '_') || (ch >= '0' && ch <= '9')) { + continue; + } + break; + } + return true; + } + return false; +} diff --git a/libs/lex_parser/tests/test_char.c b/libs/lex_parser/tests/test_char.c new file mode 100644 index 0000000..56cd53c --- /dev/null +++ b/libs/lex_parser/tests/test_char.c @@ -0,0 +1,60 @@ +// test_char.c +#include +#include + +cbool check_char(const char *str, int expect, int *output) { + log_set_level(&__default_logger_root, 0); + core_pos_t pos = core_pos_init(); + core_mem_stream_t mem_stream; + core_stream_t *stream = + core_mem_stream_init(&mem_stream, str, smcc_strlen(str), false); + *output = lex_parse_char(stream, &pos); + return *output == expect; +} + +#define CHECK_CHAR_VALID(str, expect) \ + do { \ + int _output; \ + cbool ret = check_char(str, expect, &_output); \ + TEST_CHECK(ret == true); \ + } while (0) + +#define CHECK_CHAR_INVALID(str) \ + do { \ + int _output; \ + check_char(str, core_stream_eof, &_output); \ + TEST_CHECK(_output == core_stream_eof); \ + } while (0) + +void test_simple_char(void) { + TEST_CASE("simple chars"); + CHECK_CHAR_VALID("'a'", 'a'); + CHECK_CHAR_VALID("'Z'", 'Z'); + CHECK_CHAR_VALID("'0'", '0'); + CHECK_CHAR_VALID("' '", ' '); +} + +void test_escape_char(void) { + TEST_CASE("escape chars"); + CHECK_CHAR_VALID("'\\n'", '\n'); + CHECK_CHAR_VALID("'\\t'", '\t'); + CHECK_CHAR_VALID("'\\r'", '\r'); + CHECK_CHAR_VALID("'\\\\'", '\\'); + CHECK_CHAR_VALID("'\\''", '\''); + CHECK_CHAR_VALID("'\\\"'", '\"'); +} + +void test_invalid_char(void) { + TEST_CASE("invalid chars"); + CHECK_CHAR_INVALID("'"); + CHECK_CHAR_INVALID("''"); + CHECK_CHAR_INVALID("'ab'"); + CHECK_CHAR_INVALID("'\\'"); +} + +TEST_LIST = { + {"test_simple_char", test_simple_char}, + {"test_escape_char", test_escape_char}, + {"test_invalid_char", test_invalid_char}, + {NULL, NULL}, +}; \ No newline at end of file diff --git a/libs/lex_parser/tests/test_identifier.c b/libs/lex_parser/tests/test_identifier.c new file mode 100644 index 0000000..24c2be4 --- /dev/null +++ b/libs/lex_parser/tests/test_identifier.c @@ -0,0 +1,55 @@ +// test_identifier.c +#include +#include + +cbool check_identifier(const char *str, const char *expect, cstring_t *output) { + log_set_level(&__default_logger_root, 0); + core_pos_t pos = core_pos_init(); + core_mem_stream_t mem_stream; + core_stream_t *stream = + core_mem_stream_init(&mem_stream, str, smcc_strlen(str), false); + + cbool ret = lex_parse_identifier(stream, &pos, output); + if (ret && expect) { + return strcmp(output->data, expect) == 0; + } + return ret; +} + +#define CHECK_IDENTIFIER_VALID(str, expect) \ + do { \ + cstring_t _output = cstring_new(); \ + cbool ret = check_identifier(str, expect, &_output); \ + TEST_CHECK(ret == true); \ + TEST_CHECK(strcmp(_output.data, expect) == 0); \ + cstring_free(&_output); \ + } while (0) + +#define CHECK_IDENTIFIER_INVALID(str) \ + do { \ + cstring_t _output = cstring_new(); \ + cbool ret = check_identifier(str, NULL, &_output); \ + TEST_CHECK(ret == false); \ + cstring_free(&_output); \ + } while (0) + +void test_valid_identifier(void) { + TEST_CASE("valid identifiers"); + CHECK_IDENTIFIER_VALID("variable", "variable"); + CHECK_IDENTIFIER_VALID("my_var", "my_var"); + CHECK_IDENTIFIER_VALID("_private", "_private"); + CHECK_IDENTIFIER_VALID("Var123", "Var123"); + CHECK_IDENTIFIER_VALID("a", "a"); +} + +void test_invalid_identifier(void) { + TEST_CASE("invalid identifiers"); + CHECK_IDENTIFIER_INVALID(""); + CHECK_IDENTIFIER_INVALID("123var"); +} + +TEST_LIST = { + {"test_valid_identifier", test_valid_identifier}, + {"test_invalid_identifier", test_invalid_identifier}, + {NULL, NULL}, +}; \ No newline at end of file diff --git a/libs/lex_parser/tests/test_number.c b/libs/lex_parser/tests/test_number.c new file mode 100644 index 0000000..eda153b --- /dev/null +++ b/libs/lex_parser/tests/test_number.c @@ -0,0 +1,132 @@ +#include +#include +cbool check(const char *str, usize expect, usize *output) { + // TODO maybe have other logger + log_set_level(&__default_logger_root, 0); + core_pos_t pos = core_pos_init(); + core_mem_stream_t mem_stream; + core_stream_t *stream = + core_mem_stream_init(&mem_stream, str, smcc_strlen(str), false); + return lex_parse_number(stream, &pos, output); +} + +#define CHECK_VALID(str, expect) \ + do { \ + usize _output; \ + cbool ret = check(str, expect, &_output); \ + TEST_CHECK(ret == true); \ + TEST_CHECK(_output == expect); \ + } while (0) + +#define CHECK_INVALID(str) \ + do { \ + usize _output; \ + cbool ret = check(str, 0, &_output); \ + TEST_CHECK(ret == false); \ + } while (0) + +void test_simple_hex(void) { + TEST_CASE("lowercase hex"); + CHECK_VALID("0xff", 255); + CHECK_VALID("0x0", 0); + CHECK_VALID("0xa", 10); + CHECK_VALID("0xf", 15); + CHECK_VALID("0x1a", 26); + + TEST_CASE("uppercase hex"); + CHECK_VALID("0xFF", 255); + CHECK_VALID("0xA0", 160); + CHECK_VALID("0xCAFEBABE", 3405691582); + + TEST_CASE("mixed case hex"); + CHECK_VALID("0xFf", 255); + CHECK_VALID("0xCaFeBaBe", 3405691582); + + TEST_CASE("larger hex values"); + CHECK_VALID("0xff00", 65280); + CHECK_VALID("0xFFFF", 65535); + + TEST_CASE("invalid hex"); + CHECK_INVALID("0xG"); // Invalid hex digit + CHECK_INVALID("0xyz"); // Invalid prefix + CHECK_INVALID("0x"); // Incomplete hex +} + +void test_simple_oct(void) { + TEST_CASE("basic octal"); + CHECK_VALID("00", 0); + CHECK_VALID("01", 1); + CHECK_VALID("07", 7); + + TEST_CASE("multi-digit octal"); + CHECK_VALID("010", 8); + CHECK_VALID("017", 15); + CHECK_VALID("077", 63); + + TEST_CASE("larger octal values"); + CHECK_VALID("0177", 127); + CHECK_VALID("0377", 255); + CHECK_VALID("0777", 511); + + TEST_CASE("invalid octal"); + CHECK_INVALID("08"); // Invalid octal digit + CHECK_INVALID("09"); // Invalid octal digit +} + +void test_simple_dec(void) { + TEST_CASE("single digits"); + CHECK_VALID("0", 0); + CHECK_VALID("1", 1); + CHECK_VALID("9", 9); + + TEST_CASE("multi-digit decimal"); + CHECK_VALID("10", 10); + CHECK_VALID("42", 42); + CHECK_VALID("123", 123); + + TEST_CASE("larger decimal values"); + CHECK_VALID("999", 999); + CHECK_VALID("1234", 1234); + CHECK_VALID("65535", 65535); +} + +void test_simple_bin(void) { + TEST_CASE("basic binary"); + CHECK_VALID("0b0", 0); + CHECK_VALID("0b1", 1); + + TEST_CASE("multi-digit binary"); + CHECK_VALID("0b10", 2); + CHECK_VALID("0b11", 3); + CHECK_VALID("0b100", 4); + CHECK_VALID("0b1010", 10); + + TEST_CASE("larger binary values"); + CHECK_VALID("0b1111", 15); + CHECK_VALID("0b11111111", 255); + CHECK_VALID("0b10101010", 170); + + TEST_CASE("invalid binary"); + CHECK_INVALID("0b2"); // Invalid binary digit + CHECK_INVALID("0b3"); // Invalid binary digit + CHECK_INVALID("0b"); // Incomplete binary +} + +void test_edge_cases(void) { + TEST_CASE("empty string"); + CHECK_INVALID(""); // Empty string + + TEST_CASE("non-numeric strings"); + CHECK_INVALID("abc"); // Non-numeric + CHECK_INVALID("xyz"); // Non-numeric + + TEST_CASE("mixed invalid formats"); + CHECK_INVALID("0x1G"); // Mixed valid/invalid hex + CHECK_INVALID("0b12"); // Mixed valid/invalid binary +} + +TEST_LIST = { + {"test_simple_hex", test_simple_hex}, {"test_simple_oct", test_simple_oct}, + {"test_simple_dec", test_simple_dec}, {"test_simple_bin", test_simple_bin}, + {"test_edge_cases", test_edge_cases}, {NULL, NULL}, +}; \ No newline at end of file diff --git a/libs/lex_parser/tests/test_skip_block_comment.c b/libs/lex_parser/tests/test_skip_block_comment.c new file mode 100644 index 0000000..c04acbf --- /dev/null +++ b/libs/lex_parser/tests/test_skip_block_comment.c @@ -0,0 +1,50 @@ +// test_skip_block_comment.c +#include +#include + +void check_skip_block_comment(const char *str, const char *expect_remaining) { + log_set_level(&__default_logger_root, 0); + core_pos_t pos = core_pos_init(); + core_mem_stream_t mem_stream; + core_stream_t *stream = + core_mem_stream_init(&mem_stream, str, smcc_strlen(str), false); + + lex_parse_skip_block_comment(stream, &pos); + + // Check remaining content + char buffer[256] = {0}; + int i = 0; + int ch; + while ((ch = core_stream_next_char(stream)) != core_stream_eof && i < 255) { + buffer[i++] = (char)ch; + } + + if (expect_remaining) { + TEST_CHECK(strcmp(buffer, expect_remaining) == 0); + } +} + +void test_simple_block_comment(void) { + TEST_CASE("simple block comments"); + check_skip_block_comment("/* comment */", ""); + check_skip_block_comment("/* comment */ int x;", " int x;"); +} + +void test_multiline_block_comment(void) { + TEST_CASE("multiline block comments"); + check_skip_block_comment("/* line1\nline2 */", ""); + check_skip_block_comment("/* line1\nline2 */ int x;", " int x;"); +} + +void test_nested_asterisk_block_comment(void) { + TEST_CASE("nested asterisk block comments"); + check_skip_block_comment("/* *** */", ""); + check_skip_block_comment("/* *** */ int x;", " int x;"); +} + +TEST_LIST = { + {"test_simple_block_comment", test_simple_block_comment}, + {"test_multiline_block_comment", test_multiline_block_comment}, + {"test_nested_asterisk_block_comment", test_nested_asterisk_block_comment}, + {NULL, NULL}, +}; \ No newline at end of file diff --git a/libs/lex_parser/tests/test_skip_line.c b/libs/lex_parser/tests/test_skip_line.c new file mode 100644 index 0000000..fca4441 --- /dev/null +++ b/libs/lex_parser/tests/test_skip_line.c @@ -0,0 +1,49 @@ +// test_skip_line.c +#include +#include + +void check_skip_line(const char *str, const char *expect_remaining) { + log_set_level(&__default_logger_root, 0); + core_pos_t pos = core_pos_init(); + core_mem_stream_t mem_stream; + core_stream_t *stream = + core_mem_stream_init(&mem_stream, str, smcc_strlen(str), false); + + lex_parse_skip_line(stream, &pos); + + // Check remaining content + char buffer[256] = {0}; + int i = 0; + int ch; + while ((ch = core_stream_next_char(stream)) != core_stream_eof && i < 255) { + buffer[i++] = (char)ch; + } + + if (expect_remaining) { + TEST_CHECK(strcmp(buffer, expect_remaining) == 0); + } +} + +void test_simple_line_comment(void) { + TEST_CASE("simple line comments"); + check_skip_line("// comment\n", ""); + check_skip_line("// comment\nint x;", "int x;"); +} + +void test_crlf_line_comment(void) { + TEST_CASE("CRLF line comments"); + check_skip_line("// comment\r\n", ""); + check_skip_line("// comment\r\nint x;", "int x;"); +} + +void test_eof_line_comment(void) { + TEST_CASE("EOF line comments"); + check_skip_line("// comment", ""); +} + +TEST_LIST = { + {"test_simple_line_comment", test_simple_line_comment}, + {"test_crlf_line_comment", test_crlf_line_comment}, + {"test_eof_line_comment", test_eof_line_comment}, + {NULL, NULL}, +}; \ No newline at end of file diff --git a/libs/lex_parser/tests/test_string.c b/libs/lex_parser/tests/test_string.c new file mode 100644 index 0000000..690f6ea --- /dev/null +++ b/libs/lex_parser/tests/test_string.c @@ -0,0 +1,62 @@ +// test_string.c +#include +#include + +cbool check_string(const char *str, const char *expect, cstring_t *output) { + log_set_level(&__default_logger_root, 0); + core_pos_t pos = core_pos_init(); + core_mem_stream_t mem_stream; + core_stream_t *stream = + core_mem_stream_init(&mem_stream, str, smcc_strlen(str), false); + + cbool ret = lex_parse_string(stream, &pos, output); + if (ret && expect) { + return strcmp(output->data, expect) == 0; + } + return ret; +} + +#define CHECK_STRING_VALID(str, expect) \ + do { \ + cstring_t _output = cstring_new(); \ + cbool ret = check_string(str, expect, &_output); \ + TEST_CHECK(ret == true); \ + TEST_CHECK(strcmp(_output.data, expect) == 0); \ + cstring_free(&_output); \ + } while (0) + +#define CHECK_STRING_INVALID(str) \ + do { \ + cstring_t _output = cstring_new(); \ + cbool ret = check_string(str, NULL, &_output); \ + TEST_CHECK(ret == false); \ + cstring_free(&_output); \ + } while (0) + +void test_simple_string(void) { + TEST_CASE("simple strings"); + CHECK_STRING_VALID("\"\"", ""); + CHECK_STRING_VALID("\"hello\"", "hello"); + CHECK_STRING_VALID("\"hello world\"", "hello world"); +} + +void test_escape_string(void) { + TEST_CASE("escape strings"); + CHECK_STRING_VALID("\"\\n\"", "\n"); + CHECK_STRING_VALID("\"\\t\"", "\t"); + CHECK_STRING_VALID("\"\\\"\"", "\""); + CHECK_STRING_VALID("\"Hello\\nWorld\"", "Hello\nWorld"); +} + +void test_invalid_string(void) { + TEST_CASE("invalid strings"); + CHECK_STRING_INVALID("\"unterminated"); + CHECK_STRING_INVALID("\"newline\n\""); +} + +TEST_LIST = { + {"test_simple_string", test_simple_string}, + {"test_escape_string", test_escape_string}, + {"test_invalid_string", test_invalid_string}, + {NULL, NULL}, +}; \ No newline at end of file diff --git a/libs/lexer/cbuild.toml b/libs/lexer/cbuild.toml index 6bc10cc..c91fe12 100644 --- a/libs/lexer/cbuild.toml +++ b/libs/lexer/cbuild.toml @@ -2,4 +2,7 @@ name = "smcc_lex" version = "0.1.0" -dependencies = [{ name = "libcore", path = "../../runtime/libcore" }] +dependencies = [ + { name = "libcore", path = "../../runtime/libcore" }, + { name = "smcc_lex_parser", path = "../lex_parser" }, +] diff --git a/libs/lexer/include/lexer.h b/libs/lexer/include/lexer.h index ca71453..e36d43d 100644 --- a/libs/lexer/include/lexer.h +++ b/libs/lexer/include/lexer.h @@ -9,18 +9,10 @@ #include "lexer_token.h" #include -typedef struct lexer_loc { - const char *name; - usize name_len; - usize line; - usize column; - usize offset; -} lexer_loc_t; - typedef struct lexer_token { token_type_t type; core_cvalue_t value; - lexer_loc_t loc; + core_pos_t loc; } lexer_tok_t; /** @@ -30,7 +22,7 @@ typedef struct lexer_token { */ typedef struct cc_lexer { core_stream_t *stream; - lexer_loc_t pos; + core_pos_t pos; } smcc_lexer_t; /** diff --git a/libs/lexer/src/lexer.c b/libs/lexer/src/lexer.c index 7169e41..5748192 100644 --- a/libs/lexer/src/lexer.c +++ b/libs/lexer/src/lexer.c @@ -26,6 +26,7 @@ the distribution and installation instructions. Chris Fraser / cwf@aya.yale.edu David Hanson / drh@drhanson.net */ +#include #include #include @@ -76,303 +77,21 @@ static inline int keyword_cmp(const char *name, int len) { void lexer_init(smcc_lexer_t *lexer, core_stream_t *stream) { lexer->stream = stream; - lexer->pos = (lexer_loc_t){ - .name = cstring_as_cstr(&stream->name), - .name_len = cstring_len(&stream->name), - .line = 1, - .column = 1, - .offset = 0, - }; + lexer->pos = core_pos_init(); + // FIXME + lexer->pos.name = cstring_from_cstr(cstring_as_cstr(&stream->name)); } -#define stream_reset_char(stream) ((stream)->reset_char(stream)) -#define stream_next_char(stream) ((stream)->next_char(stream)) -#define stream_peek_char(stream) ((stream)->peek_char(stream)) -#define lexer_next_pos(lexer) ((lexer)->pos.column++, (lexer)->pos.offset++) -#define lexer_next_line(lexer) ((lexer)->pos.line++, (lexer)->pos.column = 1) #define set_err_token(token) ((token)->type = TOKEN_UNKNOWN) -static void skip_newline(smcc_lexer_t *lexer, lexer_tok_t *token) { - core_stream_t *stream = lexer->stream; - token->type = TOKEN_LINE_COMMENT; - - // 循环直到遇到换行符或文件结束 - while (1) { - int ch = stream_next_char(stream); - - if (ch == core_stream_eof) { - // 到达文件末尾,直接返回 - return; - } - - // 更新位置信息 - lexer_next_pos(lexer); - if (ch == '\n') { - // 遇到换行符,增加行号并重置列号 - lexer_next_line(lexer); - return; - } - } -} - -static void skip_block_comment(smcc_lexer_t *lexer, lexer_tok_t *token) { - core_stream_t *stream = lexer->stream; - token->type = TOKEN_BLOCK_COMMENT; - int ch; - - stream_reset_char(stream); - ch = stream_next_char(stream); - lexer_next_pos(lexer); - // FIXME Assertion - Assert(ch == '/'); - ch = stream_next_char(stream); - lexer_next_pos(lexer); - Assert(ch == '*'); - // 我们已经识别了 "/*",现在需要找到 "*/" - while (1) { - ch = stream_next_char(stream); - lexer_next_pos(lexer); - - if (ch == core_stream_eof) { - // 未闭合的块注释 - LEX_WARN("Unterminated block comment"); - return; - } - - // LEX_ERROR("%c", ch); - - // 更新位置信息 - if (ch == '\n') { - lexer_next_line(lexer); - } else if (ch == '*') { - // 查看下一个字符是否是 '/' - int next_ch = stream_peek_char(stream); - - if (next_ch == '/') { - // 消费 '/' 字符 - stream_next_char(stream); - - // 更新位置信息 - lexer_next_pos(lexer); - - // 成功找到注释结束标记 - return; - } - } - } -} - -// TODO escape character not enough -static inline int got_slash(int peek) { - switch (peek) { - case '\\': - return '\\'; - case '\'': - return '\''; - case '\"': - return '\"'; - case '\?': - return '\?'; - case '0': - return '\0'; - - case 'b': - return '\b'; - case 'f': - return '\f'; - case 'n': - return '\n'; - case 'r': - return '\r'; - case 't': - return '\t'; - case 'v': - return '\v'; - default: - break; - } - return -1; -} - -static void parse_char(smcc_lexer_t *lexer, lexer_tok_t *token) { - token->loc = lexer->pos; - token->type = TOKEN_CHAR_LITERAL; - core_stream_t *stream = lexer->stream; - stream_reset_char(stream); - int ch = stream_peek_char(stream); - - if (ch == core_stream_eof) { - LEX_WARN("Unexpected EOF at begin"); - goto ERR; - } else if (ch != '\'') { - LEX_WARN("Unexpected character '%c' at begin", ch); - goto ERR; - } - stream_next_char(stream); - lexer_next_pos(lexer); - - ch = stream_next_char(stream); - lexer_next_pos(lexer); - - if (ch == core_stream_eof) { - LEX_WARN("Unexpected EOF at middle"); - goto ERR; - } else if (ch == '\\') { - ch = stream_next_char(stream); - lexer_next_pos(lexer); - if ((ch = got_slash(ch)) == -1) { - LEX_ERROR("Invalid escape character"); - // TODO 特殊情况处理 - goto ERR; - } - token->value.ch = ch; - } else { - token->value.ch = ch; - } - if ((ch = stream_next_char(stream)) != '\'') { - LEX_ERROR("Unclosed character literal '%c' at end, expect `'`", ch); - lexer_next_pos(lexer); - goto ERR; - } - - return; -ERR: - set_err_token(token); -} - -static void parse_string(smcc_lexer_t *lexer, lexer_tok_t *token) { - token->loc = lexer->pos; - token->type = TOKEN_STRING_LITERAL; - core_stream_t *stream = lexer->stream; - stream_reset_char(stream); - int ch = stream_peek_char(stream); - - if (ch == core_stream_eof) { - LEX_WARN("Unexpected EOF at begin"); - goto ERR; - } else if (ch != '"') { - LEX_WARN("Unexpected character '%c' at begin", ch); - goto ERR; - } - stream_next_char(stream); - lexer_next_pos(lexer); - - int base = 0; - cstring_t str = cstring_new(); - while (1) { - ch = stream_peek_char(stream); - - if (ch == core_stream_eof) { - LEX_ERROR("Unexpected EOF at string literal"); - break; - } else if (ch == '\n') { - LEX_ERROR("Unexpected newline at string literal"); - break; - } else if (ch == '\\') { - // TODO bad practice and maybe bugs here - stream_next_char(stream); - ch = stream_next_char(stream); - int val = got_slash(ch); - if (val == -1) { - LEX_ERROR("Invalid escape character it is \\%c [%d]", ch, ch); - } else { - cstring_push(&str, val); - continue; - } - } else if (ch == '"') { - stream_next_char(stream); - lexer_next_pos(lexer); - break; - } - - stream_next_char(stream); - lexer_next_pos(lexer); - cstring_push(&str, ch); - } - - token->value.cstr.data = (char *)cstring_as_cstr(&str); - token->value.cstr.len = cstring_len(&str); - return; -ERR: - set_err_token(token); -} - -static void parse_number(smcc_lexer_t *lexer, lexer_tok_t *token) { - token->loc = lexer->pos; - core_stream_t *stream = lexer->stream; - stream_reset_char(stream); - int ch = stream_peek_char(stream); - int base = 0; - if (ch == core_stream_eof) { - LEX_WARN("Unexpected EOF at begin"); - goto ERR; - } else if (ch == '0') { - ch = stream_peek_char(stream); - if (ch == 'x' || ch == 'X') { - base = 16; - stream_next_char(stream); - lexer_next_pos(lexer); - stream_next_char(stream); - lexer_next_pos(lexer); - } else if (ch == 'b' || ch == 'B') { - // FIXME C23 external integer base - base = 2; - stream_next_char(stream); - lexer_next_pos(lexer); - stream_next_char(stream); - lexer_next_pos(lexer); - } else if (ch >= '0' && ch <= '7') { - base = 8; - stream_next_char(stream); - lexer_next_pos(lexer); - } else { - base = 10; - } - } else { - base = 10; - } - - // 解析整数部分 - stream_reset_char(stream); - int tmp = 0; - token->value.n = 0; - while (1) { - ch = stream_peek_char(stream); - - if (ch == core_stream_eof) { - break; - } else if (ch >= 'a' && ch <= 'z') { - tmp = ch - 'a' + 10; - } else if (ch >= 'A' && ch <= 'Z') { - tmp = ch - 'A' + 10; - } else if (ch >= '0' && ch <= '9') { - tmp = ch - '0'; - } else { - break; - } - - if (tmp >= base) { - LOG_ERROR("Invalid digit"); - break; - } - - stream_next_char(stream); - lexer_next_pos(lexer); - token->value.n = token->value.n * base + tmp; - // TODO number overflow - } - - token->type = TOKEN_INT_LITERAL; - return; -ERR: - set_err_token(token); -} - static void parse_line(smcc_lexer_t *lexer, lexer_tok_t *token) { token->loc = lexer->pos; core_stream_t *stream = lexer->stream; - stream_reset_char(stream); - int ch = stream_peek_char(stream); + core_stream_reset_char(stream); + int ch = core_stream_peek_char(stream); + + usize n; + cstring_t str = cstring_new(); if (ch == core_stream_eof) { LEX_WARN("Unexpected EOF at begin"); @@ -384,9 +103,9 @@ static void parse_line(smcc_lexer_t *lexer, lexer_tok_t *token) { const char line[] = "line"; - for (int i = 0; i < sizeof(line); i++) { - ch = stream_next_char(stream); - lexer_next_pos(lexer); + for (int i = 0; i < (int)sizeof(line); i++) { + ch = core_stream_next_char(stream); + core_pos_next(&lexer->pos); if (ch != line[i]) { LEX_WARN("Maroc does not support in lexer rather in preprocessor, " "it will be ignored"); @@ -394,38 +113,36 @@ static void parse_line(smcc_lexer_t *lexer, lexer_tok_t *token) { } } - parse_number(lexer, token); - if (token->type != TOKEN_INT_LITERAL) { + if (lex_parse_number(lexer->stream, &lexer->pos, &n) == false) { LEX_ERROR("Invalid line number"); goto SKIP_LINE; } - if (stream_next_char(stream) != ' ') { - skip_newline(lexer, token); + if (core_stream_next_char(stream) != ' ') { + lex_parse_skip_line(lexer->stream, &lexer->pos); token->loc.line = token->value.n; } - if (stream_peek_char(stream) != '"') { + if (core_stream_peek_char(stream) != '"') { LEX_ERROR("Invalid `#` line"); goto SKIP_LINE; } - parse_string(lexer, token); - if (token->type != TOKEN_STRING_LITERAL) { + if (lex_parse_string(lexer->stream, &lexer->pos, &str) == false) { LEX_ERROR("Invalid filename"); goto SKIP_LINE; } - skip_newline(lexer, token); - token->loc.line = token->value.n; + lex_parse_skip_line(lexer->stream, &lexer->pos); + token->loc.line = n; // FIXME memory leak - token->loc.name = cstring_as_cstr((const cstring_t *)&token->value.cstr); - token->loc.name_len = cstring_len((const cstring_t *)&token->value.cstr); - + token->loc.name = cstring_from_cstr(cstring_as_cstr(&str)); + cstring_free(&str); return; SKIP_LINE: - skip_newline(lexer, token); + lex_parse_skip_line(lexer->stream, &lexer->pos); ERR: set_err_token(token); + cstring_free(&str); } // /zh/c/language/operator_arithmetic.html @@ -434,24 +151,24 @@ void lexer_get_token(smcc_lexer_t *lexer, lexer_tok_t *token) { token->type = TOKEN_UNKNOWN; core_stream_t *stream = lexer->stream; - stream_reset_char(stream); + core_stream_reset_char(stream); token_type_t type = TOKEN_UNKNOWN; - int ch = stream_peek_char(stream); + int ch = core_stream_peek_char(stream); // once step switch (ch) { case '=': - switch (stream_peek_char(stream)) { + switch (core_stream_peek_char(stream)) { case '=': type = TOKEN_EQ; goto double_char; default: - stream_reset_char(stream), type = TOKEN_ASSIGN; + core_stream_reset_char(stream), type = TOKEN_ASSIGN; break; } break; case '+': - switch (stream_peek_char(stream)) { + switch (core_stream_peek_char(stream)) { case '+': type = TOKEN_ADD_ADD; goto double_char; @@ -459,12 +176,12 @@ void lexer_get_token(smcc_lexer_t *lexer, lexer_tok_t *token) { type = TOKEN_ASSIGN_ADD; goto double_char; default: - stream_reset_char(stream), type = TOKEN_ADD; + core_stream_reset_char(stream), type = TOKEN_ADD; break; } break; case '-': - switch (stream_peek_char(stream)) { + switch (core_stream_peek_char(stream)) { case '-': type = TOKEN_SUB_SUB; goto double_char; @@ -475,48 +192,50 @@ void lexer_get_token(smcc_lexer_t *lexer, lexer_tok_t *token) { type = TOKEN_DEREF; goto double_char; default: - stream_reset_char(stream), type = TOKEN_SUB; + core_stream_reset_char(stream), type = TOKEN_SUB; break; } break; case '*': - switch (stream_peek_char(stream)) { + switch (core_stream_peek_char(stream)) { case '=': type = TOKEN_ASSIGN_MUL; goto double_char; default: - stream_reset_char(stream), type = TOKEN_MUL; + core_stream_reset_char(stream), type = TOKEN_MUL; break; } break; case '/': - switch (stream_peek_char(stream)) { + switch (core_stream_peek_char(stream)) { case '=': type = TOKEN_ASSIGN_DIV; goto double_char; case '/': - skip_newline(lexer, token); + lex_parse_skip_line(lexer->stream, &lexer->pos); + token->type = TOKEN_LINE_COMMENT; goto END; case '*': - skip_block_comment(lexer, token); + lex_parse_skip_block_comment(lexer->stream, &lexer->pos); + token->type = TOKEN_BLOCK_COMMENT; goto END; default: - stream_reset_char(stream), type = TOKEN_DIV; + core_stream_reset_char(stream), type = TOKEN_DIV; break; } break; case '%': - switch (stream_peek_char(stream)) { + switch (core_stream_peek_char(stream)) { case '=': type = TOKEN_ASSIGN_MOD; goto double_char; default: - stream_reset_char(stream), type = TOKEN_MOD; + core_stream_reset_char(stream), type = TOKEN_MOD; break; } break; case '&': - switch (stream_peek_char(stream)) { + switch (core_stream_peek_char(stream)) { case '&': type = TOKEN_AND_AND; goto double_char; @@ -524,12 +243,12 @@ void lexer_get_token(smcc_lexer_t *lexer, lexer_tok_t *token) { type = TOKEN_ASSIGN_AND; goto double_char; default: - stream_reset_char(stream), type = TOKEN_AND; + core_stream_reset_char(stream), type = TOKEN_AND; break; } break; case '|': - switch (stream_peek_char(stream)) { + switch (core_stream_peek_char(stream)) { case '|': type = TOKEN_OR_OR; goto double_char; @@ -537,27 +256,27 @@ void lexer_get_token(smcc_lexer_t *lexer, lexer_tok_t *token) { type = TOKEN_ASSIGN_OR; goto double_char; default: - stream_reset_char(stream), type = TOKEN_OR; + core_stream_reset_char(stream), type = TOKEN_OR; break; } break; case '^': - switch (stream_peek_char(stream)) { + switch (core_stream_peek_char(stream)) { case '=': type = TOKEN_ASSIGN_XOR; goto double_char; default: - stream_reset_char(stream), type = TOKEN_XOR; + core_stream_reset_char(stream), type = TOKEN_XOR; break; } break; case '<': - switch (stream_peek_char(stream)) { + switch (core_stream_peek_char(stream)) { case '=': type = TOKEN_LE; goto double_char; case '<': { - if (stream_peek_char(stream) == '=') { + if (core_stream_peek_char(stream) == '=') { type = TOKEN_ASSIGN_L_SH; goto triple_char; } else { @@ -567,17 +286,17 @@ void lexer_get_token(smcc_lexer_t *lexer, lexer_tok_t *token) { break; } default: - stream_reset_char(stream), type = TOKEN_LT; + core_stream_reset_char(stream), type = TOKEN_LT; break; } break; case '>': - switch (stream_peek_char(stream)) { + switch (core_stream_peek_char(stream)) { case '=': type = TOKEN_GE; goto double_char; case '>': { - if (stream_peek_char(stream) == '=') { + if (core_stream_peek_char(stream) == '=') { type = TOKEN_ASSIGN_R_SH; goto triple_char; } else { @@ -587,7 +306,7 @@ void lexer_get_token(smcc_lexer_t *lexer, lexer_tok_t *token) { break; } default: - stream_reset_char(stream), type = TOKEN_GT; + core_stream_reset_char(stream), type = TOKEN_GT; break; } break; @@ -595,12 +314,12 @@ void lexer_get_token(smcc_lexer_t *lexer, lexer_tok_t *token) { type = TOKEN_BIT_NOT; break; case '!': - switch (stream_peek_char(stream)) { + switch (core_stream_peek_char(stream)) { case '=': type = TOKEN_NEQ; goto double_char; default: - stream_reset_char(stream), type = TOKEN_NOT; + core_stream_reset_char(stream), type = TOKEN_NOT; break; } break; @@ -632,8 +351,8 @@ void lexer_get_token(smcc_lexer_t *lexer, lexer_tok_t *token) { type = TOKEN_COLON; break; case '.': - if (stream_peek_char(stream) == '.' && - stream_peek_char(stream) == '.') { + if (core_stream_peek_char(stream) == '.' && + core_stream_peek_char(stream) == '.') { type = TOKEN_ELLIPSIS; goto triple_char; } @@ -643,17 +362,14 @@ void lexer_get_token(smcc_lexer_t *lexer, lexer_tok_t *token) { type = TOKEN_COND; break; case '\v': - case '\r': case '\f': case ' ': case '\t': type = TOKEN_BLANK; break; + case '\r': case '\n': - // you need to flush a newline or blank - stream_next_char(stream); - lexer_next_line(lexer); - // FIXME some error + lex_parse_skip_endline(lexer->stream, &lexer->pos); token->type = TOKEN_BLANK; goto END; case '#': @@ -665,17 +381,45 @@ void lexer_get_token(smcc_lexer_t *lexer, lexer_tok_t *token) { // EOF type = TOKEN_EOF; break; - case '\'': - parse_char(lexer, token); + case '\'': { + token->loc = lexer->pos; + token->type = TOKEN_CHAR_LITERAL; + int ch = lex_parse_char(lexer->stream, &lexer->pos); + if (ch == core_stream_eof) { + LEX_ERROR("Unexpected character literal"); + token->type = TOKEN_UNKNOWN; + } else { + token->value.ch = ch; + } goto END; - case '"': - parse_string(lexer, token); + } + case '"': { + token->loc = lexer->pos; + token->type = TOKEN_STRING_LITERAL; + cstring_t output = cstring_new(); + if (lex_parse_string(lexer->stream, &lexer->pos, &output) == true) { + token->value.cstr.data = cstring_as_cstr(&output); + token->value.cstr.len = cstring_len(&output); + } else { + LEX_ERROR("Unexpected string literal"); + token->type = TOKEN_UNKNOWN; + } + goto END; + } /* clang-format off */ case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': /* clang-format on */ - parse_number(lexer, token); + token->loc = lexer->pos; + token->type = TOKEN_INT_LITERAL; + usize output; + if (lex_parse_number(lexer->stream, &lexer->pos, &output) == true) { + token->value.n = output; + } else { + LEX_ERROR("Unexpected number literal"); + token->type = TOKEN_UNKNOWN; + } goto END; /* clang-format off */ case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': @@ -687,25 +431,9 @@ void lexer_get_token(smcc_lexer_t *lexer, lexer_tok_t *token) { case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z': case '_': /* clang-format on */ - // TOKEN_IDENT - // TODO - // if ((ch == 'L' && ch == '\'') || (ch == 'L' && ch == '"')) { - // LEX_ERROR("unsupport wide-character char literal by `L` format"); - // } cstring_t str = cstring_new(); - cstring_push(&str, stream_next_char(stream)); - lexer_next_pos(lexer); - while (1) { - ch = stream_peek_char(stream); - if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || - (ch == '_') || (ch >= '0' && ch <= '9')) { - stream_next_char(stream); - lexer_next_pos(lexer); - cstring_push(&str, ch); - continue; - } - break; - } + cbool ret = lex_parse_identifier(lexer->stream, &lexer->pos, &str); + Assert(ret == true); int res = keyword_cmp(cstring_as_cstr(&str), cstring_len(&str)); if (res == -1) { @@ -724,14 +452,14 @@ void lexer_get_token(smcc_lexer_t *lexer, lexer_tok_t *token) { } goto once_char; triple_char: - stream_next_char(stream); - lexer_next_pos(lexer); + core_stream_next_char(stream); + core_pos_next(&lexer->pos); double_char: - stream_next_char(stream); - lexer_next_pos(lexer); + core_stream_next_char(stream); + core_pos_next(&lexer->pos); once_char: - stream_next_char(stream); - lexer_next_pos(lexer); + core_stream_next_char(stream); + core_pos_next(&lexer->pos); token->type = type; END: LEX_DEBUG("get token `%s` in %s:%d:%d", get_tok_name(token->type), @@ -746,6 +474,7 @@ void lexer_get_valid_token(smcc_lexer_t *lexer, lexer_tok_t *token) { type = get_tok_subtype(token->type); AssertFmt(type != TK_BASIC_INVALID, "Invalid token: `%s` at %s:%d:%d", get_tok_name(token->type), token->loc.name, token->loc.line, - token->loc.column); + token->loc.col); + Assert(type != TK_BASIC_INVALID); } while (type == TK_BASIC_EMPTYSPACE || type == TK_BASIC_COMMENT); } diff --git a/libs/lexer/tests/test_run.c b/libs/lexer/tests/test_run.c index 3853c9f..631ddbd 100644 --- a/libs/lexer/tests/test_run.c +++ b/libs/lexer/tests/test_run.c @@ -40,7 +40,6 @@ int main(int argc, char *argv[]) { perror("open file failed"); return 1; } - printf("open file success\n"); if (fseek(fp, 0, SEEK_END) != 0) { perror("fseek failed"); @@ -79,7 +78,7 @@ int main(int argc, char *argv[]) { break; } LOG_DEBUG("token `%s` at %s:%u:%u", get_tok_name(tok.type), - tok.loc.name, tok.loc.line, tok.loc.column); + tok.loc.name, tok.loc.line, tok.loc.col); Assert(tok.loc.offset <= fsize); // LOG_DEBUG("%s", tok.val.str); // printf("line: %d, column: %d, type: %3d, typename: %s\n", diff --git a/runtime/libcore/include/core_pos.h b/runtime/libcore/include/core_pos.h new file mode 100644 index 0000000..2206da8 --- /dev/null +++ b/runtime/libcore/include/core_pos.h @@ -0,0 +1,28 @@ +#ifndef __SMCC_CORE_POS_H__ +#define __SMCC_CORE_POS_H__ + +#include "core_str.h" +#include "core_type.h" +typedef struct { + cstring_t name; + usize line; + usize col; + usize offset; +} core_pos_t; + +static inline core_pos_t core_pos_init() { + return (core_pos_t){cstring_new(), 1, 1, 0}; +} + +static inline void core_pos_next(core_pos_t *pos) { + pos->offset++; + pos->col++; +} + +static inline void core_pos_next_line(core_pos_t *pos) { + pos->offset++; + pos->line++; + pos->col = 1; +} + +#endif /* __SMCC_CORE_POS_H__ */ diff --git a/runtime/libcore/include/core_str.h b/runtime/libcore/include/core_str.h index d2eb498..86ec4c2 100644 --- a/runtime/libcore/include/core_str.h +++ b/runtime/libcore/include/core_str.h @@ -54,12 +54,15 @@ static inline cstring_t cstring_from_cstr(const char *s) { * @param str 要被释放的字符串指针 */ static inline void cstring_free(cstring_t *str) { - if (str && str->data && str->cap != 0) { + if (str == null) { + return; + } + if (str->data != null && str->cap != 0) { smcc_free(str->data); str->data = null; - str->size = 0; - str->cap = 0; } + str->size = 0; + str->cap = 0; } /** diff --git a/runtime/libcore/include/libcore.h b/runtime/libcore/include/libcore.h index a86a52b..53f32a5 100644 --- a/runtime/libcore/include/libcore.h +++ b/runtime/libcore/include/libcore.h @@ -21,6 +21,7 @@ #define SMCC_STR(str) _SMCC_STR(str) #define SMCC_ARRLEN(arr) (sizeof(arr) / sizeof(arr[0])) +#include #include #include #include diff --git a/runtime/libcore/src/stream.c b/runtime/libcore/src/stream.c index 2e391f8..0cdd2b2 100644 --- a/runtime/libcore/src/stream.c +++ b/runtime/libcore/src/stream.c @@ -69,11 +69,16 @@ static void free_stream(core_stream_t *_stream) { core_stream_t *core_mem_stream_init(core_mem_stream_t *stream, const char *data, usize length, cbool need_copy) { - if (stream == null || data == NULL || length == 0) { + if (stream == null || data == null) { LOG_ERROR("param error"); return null; } + if (length == 0) { + LOG_WARN("input memory is empty"); + need_copy = false; + } + stream->owned = need_copy; if (need_copy) { char *buf = (char *)smcc_malloc(length); diff --git a/runtime/log/include/log.c b/runtime/log/include/log.c index cf161ec..b023f10 100644 --- a/runtime/log/include/log.c +++ b/runtime/log/include/log.c @@ -79,7 +79,11 @@ void init_logger(logger_t *logger, const char *name) { log_set_level(logger, LOG_LEVEL_ALL); } -logger_t *log_get(const char *name) { return &__default_logger_root; } +logger_t *log_get(const char *name) { + // TODO for -Wunused-parameter + (void)name; + return &__default_logger_root; +} void log_set_level(logger_t *logger, int level) { if (logger) @@ -95,4 +99,8 @@ void log_set_handler(logger_t *logger, log_handler handler) { __default_logger_root.handler = handler; } -void logger_destroy(logger_t *logger) { return; } +void logger_destroy(logger_t *logger) { + // TODO for -Wunused-parameter + (void)logger; + return; +}