so... jai :D
This commit is contained in:
403
src/lexer.zig
Normal file
403
src/lexer.zig
Normal file
@@ -0,0 +1,403 @@
|
||||
const std = @import("std");
|
||||
const Token = @import("token.zig").Token;
|
||||
const Tag = @import("token.zig").Tag;
|
||||
const getKeyword = @import("token.zig").getKeyword;
|
||||
|
||||
/// Hand-written lexer that scans a NUL-terminated source buffer one token at a time.
///
/// Call `next` repeatedly: it yields `.eof` once the input is exhausted and keeps
/// yielding `.eof` on every later call. Input that starts no known token is
/// returned as an `.invalid` token instead of an error, so the caller decides how
/// to report it. Token locations are half-open byte ranges `[start, end)` into
/// `source`, and `end` never exceeds `source.len`.
pub const Lexer = struct {
    /// Source text being tokenized.
    source: [:0]const u8,
    /// Byte offset into `source` of the next unread byte.
    index: u32,

    /// `#` directives understood by `lexDirective`, each paired with the tag it produces.
    const directives = [_]struct { text: []const u8, tag: Tag }{
        .{ .text = "#import", .tag = .hash_import },
        .{ .text = "#insert", .tag = .hash_insert },
        .{ .text = "#run", .tag = .hash_run },
        .{ .text = "#builtin", .tag = .hash_builtin },
    };

    /// Creates a lexer positioned at the first byte of `source`.
    pub fn init(source: [:0]const u8) Lexer {
        return .{ .source = source, .index = 0 };
    }

    /// Skips whitespace and `//` line comments, then scans and returns the next token.
    ///
    /// Returns `.eof` (with an empty location at the end of the consumed input) when
    /// nothing is left, and `.invalid` for a byte that starts no known token.
    pub fn next(self: *Lexer) Token {
        self.skipTrivia();
        if (self.index >= self.source.len) {
            return self.makeToken(.eof, self.index, self.index);
        }

        const start = self.index;
        const c = self.source[start];

        if (isDigit(c)) return self.lexNumber(start);
        if (isIdentStart(c)) return self.lexIdentifier(start);
        if (c == '"') return self.lexString(start);
        if (c == '#') return self.lexDirective(start);

        // Punctuation and operators: consume the first byte, then let `accept`
        // greedily extend the token when a longer operator matches.
        self.index += 1;
        const tag: Tag = switch (c) {
            ';' => .semicolon,
            ',' => .comma,
            '(' => .l_paren,
            ')' => .r_paren,
            '{' => .l_brace,
            '}' => .r_brace,
            '[' => .l_bracket,
            ']' => .r_bracket,
            '$' => .dollar,
            '.' => if (self.accept('.')) .dot_dot else .dot,
            ':' => if (self.accept(':')) .colon_colon else if (self.accept('=')) .colon_equal else .colon,
            '=' => if (self.accept('=')) .equal_equal else if (self.accept('>')) .fat_arrow else .equal,
            '+' => if (self.accept('=')) .plus_equal else .plus,
            '-' => blk: {
                // `---` needs two bytes of lookahead; a lone `--` falls through and
                // is lexed as two separate `-` tokens.
                if (self.peek() == '-' and self.index + 1 < self.source.len and self.source[self.index + 1] == '-') {
                    self.index += 2;
                    break :blk .triple_minus;
                }
                if (self.accept('>')) break :blk .arrow;
                if (self.accept('=')) break :blk .minus_equal;
                break :blk .minus;
            },
            '*' => if (self.accept('=')) .star_equal else .star,
            '/' => if (self.accept('=')) .slash_equal else .slash,
            '%' => if (self.accept('=')) .percent_equal else .percent,
            '!' => if (self.accept('=')) .bang_equal else .bang,
            '<' => if (self.accept('=')) .less_equal else .less,
            '>' => if (self.accept('=')) .greater_equal else .greater,
            else => .invalid,
        };
        return self.makeToken(tag, start, self.index);
    }

    /// Advances past spaces, tabs, newlines, carriage returns and `//` line comments.
    /// A comment runs up to, but not including, the next `\n` (which is then skipped
    /// as ordinary whitespace) or to the end of the input.
    fn skipTrivia(self: *Lexer) void {
        while (self.index < self.source.len) {
            switch (self.source[self.index]) {
                ' ', '\t', '\n', '\r' => self.index += 1,
                '/' => {
                    // A single '/' is an operator, not trivia.
                    if (self.index + 1 >= self.source.len or self.source[self.index + 1] != '/') return;
                    while (self.index < self.source.len and self.source[self.index] != '\n') {
                        self.index += 1;
                    }
                },
                else => return,
            }
        }
    }

    /// Scans a `#` directive starting at `start` (which must point at the `#`).
    ///
    /// A directive only matches when it is not immediately followed by an identifier
    /// character, so `#running` does not lex as `#run`. When nothing matches, only
    /// the `#` itself is consumed and returned as `.invalid`.
    fn lexDirective(self: *Lexer, start: u32) Token {
        const rest = self.source[start..];
        for (directives) |directive| {
            if (!std.mem.startsWith(u8, rest, directive.text)) continue;
            const end = start + @as(u32, @intCast(directive.text.len));
            if (end < self.source.len and isIdentContinue(self.source[end])) continue;
            self.index = end;
            return self.makeToken(directive.tag, start, end);
        }
        self.index = start + 1;
        return self.makeToken(.invalid, start, self.index);
    }

    /// Scans a numeric literal whose first digit is at `start`.
    ///
    /// Recognizes hexadecimal (`0x`/`0X`) and binary (`0b`/`0B`) integers, decimal
    /// integers, and decimal floats of the form `digits.digits`. A `.` that is not
    /// followed by a digit is left for the next token, so `1.foo` lexes as an
    /// integer followed by `.`.
    fn lexNumber(self: *Lexer, start: u32) Token {
        // Advance past the initial digit that the caller already matched.
        self.index += 1;

        if (self.source[start] == '0') {
            switch (self.peek()) {
                'x', 'X' => return self.lexRadixInt(start, isHexDigit),
                'b', 'B' => return self.lexRadixInt(start, isBinaryDigit),
                else => {},
            }
        }

        self.skipWhile(isDigit);

        // Fractional part: only when the '.' is directly followed by a digit.
        if (self.peek() == '.' and self.index + 1 < self.source.len and isDigit(self.source[self.index + 1])) {
            self.index += 1; // skip '.'
            self.skipWhile(isDigit);
            return self.makeToken(.float_literal, start, self.index);
        }
        return self.makeToken(.int_literal, start, self.index);
    }

    /// Scans the digits of a prefixed integer; `self.index` must point at the radix
    /// letter (`x`/`b`). Yields `.invalid` when no digit follows the prefix, so a
    /// bare `0x` or `0b` is not accepted as an integer literal.
    fn lexRadixInt(self: *Lexer, start: u32, comptime isRadixDigit: fn (u8) bool) Token {
        self.index += 1; // skip the radix letter
        const digits_start = self.index;
        self.skipWhile(isRadixDigit);
        const tag: Tag = if (self.index == digits_start) .invalid else .int_literal;
        return self.makeToken(tag, start, self.index);
    }

    /// Scans an identifier starting at `start` and returns the keyword tag when
    /// `getKeyword` recognizes its text, `.identifier` otherwise.
    fn lexIdentifier(self: *Lexer, start: u32) Token {
        // The first byte is an identifier start, which also counts as a
        // continuation byte, so the loop consumes it along with the rest.
        self.skipWhile(isIdentContinue);
        const text = self.source[start..self.index];
        const tag: Tag = getKeyword(text) orelse .identifier;
        return self.makeToken(tag, start, self.index);
    }

    /// Scans a double-quoted string literal whose opening quote is at `start`.
    ///
    /// A backslash makes the following byte part of the string, so `\"` does not end
    /// it. Returns `.invalid` covering the rest of the input when the closing quote
    /// is missing.
    fn lexString(self: *Lexer, start: u32) Token {
        self.index += 1; // skip opening "
        while (self.index < self.source.len) {
            const ch = self.source[self.index];
            self.index += 1;
            if (ch == '"') {
                return self.makeToken(.string_literal, start, self.index);
            }
            // Skip the escaped byte, but never step past the end of the input when
            // the source ends on a dangling backslash.
            if (ch == '\\' and self.index < self.source.len) {
                self.index += 1;
            }
        }
        // Unterminated string
        return self.makeToken(.invalid, start, self.index);
    }

    /// Returns the next unread byte without consuming it, or 0 at end of input.
    fn peek(self: *const Lexer) u8 {
        if (self.index < self.source.len) {
            return self.source[self.index];
        }
        return 0;
    }

    /// Consumes the next byte and returns true when it equals `expected`;
    /// otherwise leaves the position untouched and returns false.
    fn accept(self: *Lexer, expected: u8) bool {
        if (self.peek() != expected) return false;
        self.index += 1;
        return true;
    }

    /// Advances while the next unread byte satisfies `matches`.
    fn skipWhile(self: *Lexer, comptime matches: fn (u8) bool) void {
        while (self.index < self.source.len and matches(self.source[self.index])) {
            self.index += 1;
        }
    }

    /// Builds a token with the given tag covering bytes `[start, end)`.
    fn makeToken(_: *const Lexer, tag: Tag, start: u32, end: u32) Token {
        return .{ .tag = tag, .loc = .{ .start = start, .end = end } };
    }

    /// True for ASCII decimal digits.
    fn isDigit(c: u8) bool {
        return c >= '0' and c <= '9';
    }

    /// True for the binary digits `0` and `1`.
    fn isBinaryDigit(c: u8) bool {
        return c == '0' or c == '1';
    }

    /// True for bytes that may begin an identifier: ASCII letters and `_`.
    fn isIdentStart(c: u8) bool {
        return (c >= 'a' and c <= 'z') or (c >= 'A' and c <= 'Z') or c == '_';
    }

    /// True for ASCII hexadecimal digits in either case.
    fn isHexDigit(c: u8) bool {
        return isDigit(c) or (c >= 'a' and c <= 'f') or (c >= 'A' and c <= 'F');
    }

    /// True for bytes that may continue an identifier: letters, `_` and digits.
    fn isIdentContinue(c: u8) bool {
        return isIdentStart(c) or isDigit(c);
    }
};
|
||||
|
||||
test "lex minimal main" {
    // Smallest complete program: a `main` declaration whose body is one
    // integer expression statement, followed by end of input.
    const expected_tags = [_]Tag{
        .identifier,
        .colon_colon,
        .l_paren,
        .r_paren,
        .l_brace,
        .int_literal,
        .semicolon,
        .r_brace,
        .eof,
    };

    var lexer = Lexer.init("main :: () { 42; }");
    for (expected_tags) |want| {
        try std.testing.expectEqual(want, lexer.next().tag);
    }
}
|
||||
|
||||
test "lex with comments" {
    // A leading line comment is skipped, so the first tokens come from the
    // line after it.
    const expectEqual = std.testing.expectEqual;
    const expected_tags = [_]Tag{ .identifier, .colon_colon };

    var lexer = Lexer.init("// comment\nmain :: () { 0; }");
    for (expected_tags) |want| {
        try expectEqual(want, lexer.next().tag);
    }
}
|
||||
|
||||
test "lex operators" {
    // Multi-character operators must win over their single-character prefixes.
    const expected_tags = [_]Tag{
        .colon_equal,
        .colon,
        .colon_colon,
        .plus_equal,
        .minus_equal,
        .star_equal,
        .slash_equal,
        .arrow,
        .fat_arrow,
        .equal_equal,
        .bang_equal,
        .less_equal,
        .greater_equal,
    };

    var lexer = Lexer.init(":= : :: += -= *= /= -> => == != <= >=");
    for (expected_tags) |want| {
        const got = lexer.next().tag;
        try std.testing.expectEqual(want, got);
    }
}
|
||||
|
||||
test "lex float" {
    // Floats and integers interleaved: `digits.digits` is a float, bare digits
    // are an integer.
    const expected_tags = [_]Tag{ .float_literal, .int_literal, .float_literal };

    var lexer = Lexer.init("0.3 42 0.9");
    for (expected_tags) |want| {
        try std.testing.expectEqual(want, lexer.next().tag);
    }
}
|
||||
|
||||
test "lex keywords" {
    // Every word in the input is a reserved keyword and must map to its own tag.
    const expected_tags = [_]Tag{
        .kw_if,
        .kw_else,
        .kw_then,
        .kw_true,
        .kw_false,
        .kw_enum,
        .kw_case,
        .kw_break,
        .kw_return,
        .kw_f32,
        .kw_f64,
        .kw_struct,
    };

    var lexer = Lexer.init("if else then true false enum case break return f32 f64 struct");
    for (expected_tags) |want| {
        const got = lexer.next().tag;
        try std.testing.expectEqual(want, got);
    }
}
|
||||
|
||||
test "lex type-like identifiers" {
    // Type names such as s32, u8, bool and string are plain identifiers;
    // the lexer does not reserve them as keywords.
    var lexer = Lexer.init("s32 u8 bool string");

    var remaining: usize = 4;
    while (remaining > 0) : (remaining -= 1) {
        const got = lexer.next().tag;
        try std.testing.expectEqual(Tag.identifier, got);
    }
}
|
||||
|
||||
test "lex hash_run" {
    const expectEqual = std.testing.expectEqual;

    // The bare directive, immediately followed by end of input.
    {
        var lexer = Lexer.init("#run");
        try expectEqual(Tag.hash_run, lexer.next().tag);
        try expectEqual(Tag.eof, lexer.next().tag);
    }

    // The directive followed by an identifier.
    {
        var lexer = Lexer.init("#run compute(5)");
        try expectEqual(Tag.hash_run, lexer.next().tag);
        try expectEqual(Tag.identifier, lexer.next().tag);
    }

    // "#running" must not match: an identifier character follows "run".
    {
        var lexer = Lexer.init("#running");
        try expectEqual(Tag.invalid, lexer.next().tag);
    }
}
|
||||
|
||||
test "lex hash_import" {
    const expectEqual = std.testing.expectEqual;

    // The directive followed by the imported path as a string literal.
    {
        const expected_tags = [_]Tag{ .hash_import, .string_literal, .eof };
        var lexer = Lexer.init("#import \"foo.sx\"");
        for (expected_tags) |want| {
            try expectEqual(want, lexer.next().tag);
        }
    }

    // "#importing" must not match the directive.
    {
        var lexer = Lexer.init("#importing");
        try expectEqual(Tag.invalid, lexer.next().tag);
    }
}
|
||||
|
||||
test "lex hash_insert" {
    const expectEqual = std.testing.expectEqual;

    // Two directives back to back, followed by an identifier.
    {
        const expected_tags = [_]Tag{ .hash_insert, .hash_run, .identifier };
        var lexer = Lexer.init("#insert #run generate()");
        for (expected_tags) |want| {
            try expectEqual(want, lexer.next().tag);
        }
    }

    // "#inserting" must not match the directive.
    {
        var lexer = Lexer.init("#inserting");
        try expectEqual(Tag.invalid, lexer.next().tag);
    }
}
|
||||
|
||||
test "lex string" {
    // The token must span the whole literal, both quotes included.
    const source = "\"Hello\"";

    var lexer = Lexer.init(source);
    const token = lexer.next();

    try std.testing.expectEqual(Tag.string_literal, token.tag);
    try std.testing.expectEqualStrings("\"Hello\"", token.slice(source));
}
|
||||
|
||||
test "lex hex literal" {
    // Both the lowercase and the uppercase prefix produce an integer literal
    // that covers the prefix and every hex digit.
    const source = "0xFF 0X1A";
    var lexer = Lexer.init(source);

    const first = lexer.next();
    try std.testing.expectEqual(Tag.int_literal, first.tag);
    try std.testing.expectEqualStrings("0xFF", first.slice(source));

    const second = lexer.next();
    try std.testing.expectEqual(Tag.int_literal, second.tag);
    try std.testing.expectEqualStrings("0X1A", second.slice(source));
}
|
||||
|
||||
test "lex binary literal" {
    // Both the lowercase and the uppercase prefix produce an integer literal
    // that covers the prefix and every binary digit.
    const source = "0b1010 0B110";
    var lexer = Lexer.init(source);

    const first = lexer.next();
    try std.testing.expectEqual(Tag.int_literal, first.tag);
    try std.testing.expectEqualStrings("0b1010", first.slice(source));

    const second = lexer.next();
    try std.testing.expectEqual(Tag.int_literal, second.tag);
    try std.testing.expectEqualStrings("0B110", second.slice(source));
}
|
||||
Reference in New Issue
Block a user