Fixed wrong calculation of locals size when parsing a wasm binary

The fix involves moving the function leb128Decode over to parser. To me it makes more sense for the function to belong in that module so I think of it as a positive change. However I do not think that returning two values is really necessary. I think a proper solution would be either to parse the code or wrap the stream so we can count how many bytes are readed. Therefore we could use std.leb.readUleb128 which should be less error-prone.
2025-03-13 22:51:56 +00:00 · 2025-03-13 22:51:56 +00:00 · 2818fd14c5
commit 2818fd14c5
parent 1c3f9702af
2 changed files with 53 additions and 37 deletions
--- a/src/vm/parse.zig
+++ b/src/vm/parse.zig
@ -2,6 +2,46 @@ const std = @import("std");
 const wasm = @import("wasm.zig");
 const Allocator = std.mem.Allocator;
 pub fn leb128Result(T: type) type {
    return struct { len: usize, val: T };
 }
 pub fn leb128Decode(comptime T: type, stream: anytype) !leb128Result(T) {
    switch (@typeInfo(T)) {
        .int => {},
        else => @compileError("LEB128 integer decoding only support integers, but got " ++ @typeName(T)),
    }
    if (@typeInfo(T).int.bits != 32 and @typeInfo(T).int.bits != 64) {
        @compileError("LEB128 integer decoding only supports 32 or 64 bits integers but got " ++ std.fmt.comptimePrint("{d} bits", .{@typeInfo(T).int.bits}));
    }
    var result: T = 0;
    // TODO: is the type of shift important. Reading Wikipedia (not very much tho) it seems like we can use u32 and call it a day...
    var shift: if (@typeInfo(T).int.bits == 32) u5 else u6 = 0;
    var byte: u8 = undefined;
    var len: usize = 0;
    while (stream.readByte()) |b| {
        len += 1;
        result |= @as(T, @intCast((b & 0x7f))) << shift;
        if ((b & (0x1 << 7)) == 0) {
            byte = b;
            break;
        }
        shift += 7;
    } else |err| {
        return err;
    }
    if (@typeInfo(T).int.signedness == .signed) {
        const size = @sizeOf(T) * 8;
        if (shift < size and (byte & 0x40) != 0) {
            result |= (~@as(T, 0) << shift);
        }
    }
    return .{ .len = len, .val = result };
 }
 pub const Error = error{
    malformed_wasm,
    invalid_utf8,
@ -242,20 +282,23 @@ pub fn parseWasm(allocator: Allocator, stream: anytype) !Module {
                code = try allocator.alloc(FunctionBody, code_count);
                for (0..code_count) |i| {
                    const code_size = try std.leb.readULEB128(u32, stream);
-                    const local_count = try std.leb.readULEB128(u32, stream);
+                    var locals_size: usize = 0;
-                    const locals = try allocator.alloc(Local, local_count);
+                    const local_count = try leb128Decode(u32, stream);
                    locals_size += local_count.len;
                    const locals = try allocator.alloc(Local, local_count.val);
                    for (locals) |*l| {
-                        const n = try std.leb.readULEB128(u32, stream);
+                        const n = try leb128Decode(u32, stream);
-                        l.types = try allocator.alloc(u8, n);
+                        l.types = try allocator.alloc(u8, n.val);
                        @memset(l.types, try stream.readByte());
                        locals_size += n.len + 1;
                    }
                    code[i].locals = locals;
                    // TODO: maybe is better to parse code into ast here and not do it every frame?
                    // FIXME: This calculation is plain wrong. Resolving above TODO should help
-                    code[i].code = try allocator.alloc(u8, code_size - local_count - 1);
+                    code[i].code = try allocator.alloc(u8, code_size - locals_size);
                    // TODO: better error reporting
-                    if (try stream.read(code[i].code) != code_size - local_count - 1) return Error.malformed_wasm;
+                    if (try stream.read(code[i].code) != code_size - locals_size) return Error.malformed_wasm;
                    const f = Function{ .internal = @intCast(i) };
                    try funcs.append(f);
--- a/src/vm/vm.zig
+++ b/src/vm/vm.zig
@ -4,37 +4,10 @@ const Parser = @import("parse.zig");
 const Allocator = std.mem.Allocator;
 const AllocationError = error{OutOfMemory};
-pub fn leb128Decode(comptime T: type, bytes: []u8) struct { len: usize, val: T } {
+fn leb128Decode(comptime T: type, bytes: []u8) Parser.leb128Result(T) {
-    switch (@typeInfo(T)) {
+    var fbs = std.io.fixedBufferStream(bytes);
-        .int => {},
+    // TODO: this catch should be unrecheable
-        else => @compileError("LEB128 integer decoding only support integers, but got " ++ @typeName(T)),
+    return Parser.leb128Decode(T, fbs.reader()) catch .{ .len = 0, .val = 0 };
    }
    if (@typeInfo(T).int.bits != 32 and @typeInfo(T).int.bits != 64) {
        @compileError("LEB128 integer decoding only supports 32 or 64 bits integers but got " ++ std.fmt.comptimePrint("{d} bits", .{@typeInfo(T).int.bits}));
    }
    var result: T = 0;
    // TODO: is the type of shift important. Reading Wikipedia (not very much tho) it seems like we can use u32 and call it a day...
    var shift: if (@typeInfo(T).int.bits == 32) u5 else u6 = 0;
    var byte: u8 = undefined;
    var len: usize = 0;
    for (bytes) |b| {
        len += 1;
        result |= @as(T, @intCast((b & 0x7f))) << shift;
        if ((b & (0x1 << 7)) == 0) {
            byte = b;
            break;
        }
        shift += 7;
    }
    if (@typeInfo(T).int.signedness == .signed) {
        const size = @sizeOf(T) * 8;
        if (shift < size and (byte & 0x40) != 0) {
            result |= (~@as(T, 0) << shift);
        }
    }
    return .{ .len = len, .val = result };
 }
 pub fn decodeLittleEndian(comptime T: type, bytes: []u8) T {