1.2 scanning: single-character tokens
last updated: Oct 20, 2023
const std = @import("std");
const TokenType = enum {
    // single-character tokens
    left_paren,
    right_paren,
    left_brace,
    right_brace,
    comma,
    dot,
    minus,
    plus,
    semicolon,
    slash,
    star,
    // one- or two-character tokens
    bang,
    bang_equal,
    equal,
    equal_equal,
    greater,
    greater_equal,
    less,
    less_equal,
    // literals
    identifier,
    string,
    number,
    // keywords
    keyword_and,
    keyword_class,
    keyword_else,
    keyword_false,
    keyword_fun,
    keyword_for,
    keyword_if,
    keyword_nil,
    keyword_or,
    keyword_print,
    keyword_return,
    keyword_super,
    keyword_this,
    keyword_true,
    keyword_var,
    keyword_while,
    eof,
};
const Lexemes = enum {
    bytes,
    int,
    float,
    empty,
};
const Token = struct {
    typ: TokenType,
    // I think the natural way to do a Zig token would be to hand out a slice
    // into the source buffer, but I'm going to follow the book and represent
    // the lexeme as a union instead (a sketch of the slice version follows
    // this struct).
    lexeme: union(Lexemes) {
        bytes: []const u8,
        int: i64,
        float: f64,
        empty: void,
    },
    line: u64,
};
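// For comparison, a sketch of the slice-based alternative mentioned above.
// Hypothetical and unused here: `lexeme` would just view the source buffer,
// so it stays valid only as long as the buffer does, and number literals
// would be parsed out of it later instead of being stored as i64/f64.
const SliceToken = struct {
    typ: TokenType,
    lexeme: []const u8, // points into the scanner's source buffer
    line: u64,
};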
const TokenList = std.ArrayList(*Token);
const Scanner = struct {
    allocator: std.mem.Allocator,
    buf: []const u8,
    start: u64,
    current: u64,
    line: u64,
    tokens: TokenList,
    pub fn init(allocator: std.mem.Allocator, buf: []const u8) Scanner {
        return Scanner{
            .allocator = allocator,
            .buf = buf,
            .start = 0,
            .current = 0,
            .line = 1,
            .tokens = TokenList.init(allocator),
        };
    }
    pub fn scan(self: *Scanner) !TokenList {
        while (self.current < self.buf.len) {
            self.start = self.current;
            try self.next();
        }
        // allocate the eof token like the others; appending the address of a
        // temporary here would leave a dangling pointer in the list
        try self.addTok(.eof);
        return self.tokens;
    }
    pub fn next(self: *Scanner) !void {
        switch (self.buf[self.current]) {
            '(' => {
                try self.addTok(.left_paren);
            },
            ')' => {
                try self.addTok(.right_paren);
            },
            '{' => {
                try self.addTok(.left_brace);
            },
            '}' => {
                try self.addTok(.right_brace);
            },
            ',' => {
                try self.addTok(.comma);
            },
            '.' => {
                try self.addTok(.dot);
            },
            '-' => {
                try self.addTok(.minus);
            },
            '+' => {
                try self.addTok(.plus);
            },
            ';' => {
                try self.addTok(.semicolon);
            },
            '*' => {
                try self.addTok(.star);
            },
            '\n' => {
                // count newlines so tokens carry the right line number
                self.line += 1;
            },
            // TODO: two-character operators, slash/comments, literals, keywords
            else => {},
        }
        self.current += 1;
    }
    fn addTok(self: *Scanner, typ: TokenType) !void {
        // heap-allocate so the pointer stored in the list stays valid
        const tok = try self.allocator.create(Token);
        tok.* = Token{ .typ = typ, .lexeme = .empty, .line = self.line };
        try self.tokens.append(tok);
    }
};
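// A smoke-test sketch (not part of the original listing) exercising the
// scanner on the four bracket characters. It assumes the heap-allocated
// tokens above, so each token is destroyed before the list itself is freed.
test "scan single-character tokens" {
    var scanner = Scanner.init(std.testing.allocator, "(){}");
    var tokens = try scanner.scan();
    defer {
        for (tokens.items) |tok| std.testing.allocator.destroy(tok);
        tokens.deinit();
    }
    try std.testing.expectEqual(@as(usize, 5), tokens.items.len); // 4 tokens + eof
    try std.testing.expectEqual(TokenType.left_paren, tokens.items[0].typ);
    try std.testing.expectEqual(TokenType.eof, tokens.items[4].typ);
}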
fn run(allocator: std.mem.Allocator, buf: []const u8) !void {
    std.debug.print("file contents: {s}", .{buf});
    var scanner = Scanner.init(allocator, buf);
    const tokens = try scanner.scan();
    // TODO: better token printer -- a sketch of one follows this function
    std.debug.print("scanned {d} tokens\n", .{tokens.items.len});
}
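// One sketch for the token-printer TODO above (a hypothetical helper, not in
// the original listing): switch on the union lexeme and print each payload.
fn printToken(tok: *const Token) void {
    switch (tok.lexeme) {
        .bytes => |b| std.debug.print("{s} \"{s}\" (line {d})\n", .{ @tagName(tok.typ), b, tok.line }),
        .int => |i| std.debug.print("{s} {d} (line {d})\n", .{ @tagName(tok.typ), i, tok.line }),
        .float => |f| std.debug.print("{s} {d} (line {d})\n", .{ @tagName(tok.typ), f, tok.line }),
        .empty => std.debug.print("{s} (line {d})\n", .{ @tagName(tok.typ), tok.line }),
    }
}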
fn runFile(allocator: std.mem.Allocator, path: []const u8) !void {
    std.debug.print("reading file {s}\n", .{path});
    const input = try std.fs.cwd().readFileAlloc(allocator, path, std.math.maxInt(usize));
    defer allocator.free(input);
    try run(allocator, input);
}
fn runPrompt(allocator: std.mem.Allocator) !void {
    std.debug.print("run the interpreter\n", .{});
    const stdin = std.io.getStdIn().reader();
    const stderr = std.io.getStdErr().writer();
    var repl_buf: [1024]u8 = undefined;
    // keep reading lines until stdin hits end-of-file
    while (true) {
        if (stdin.readUntilDelimiterOrEof(&repl_buf, '\n') catch |err| {
            try stderr.print("\nUnable to parse command: {s}\n", .{@errorName(err)});
            return;
        }) |line| {
            try run(allocator, line);
        } else {
            break; // readUntilDelimiterOrEof returns null at end of input
        }
    }
}
pub fn main() anyerror!void {
    var general_purpose_allocator = std.heap.GeneralPurposeAllocator(.{}){};
    const gpa = general_purpose_allocator.allocator();
    const args = try std.process.argsAlloc(gpa);
    defer std.process.argsFree(gpa, args);
    // args[0] is our executable
    if (args.len > 2) {
        std.debug.print("Usage: lexzical [script]\n", .{});
    } else if (args.len == 2) {
        try runFile(gpa, args[1]);
    } else {
        try runPrompt(gpa);
    }
}
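// example invocations (the file name is illustrative):
//   zig run lexzical.zig -- script.lox   scan a source file
//   zig run lexzical.zig                 start the line-at-a-time prompt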