sideros/src/vm/parse.zig
Ernesto Lanchares 2818fd14c5 Fixed wrong calculation of locals size when parsing a wasm binary
The fix involves moving the function leb128Decode over to the parser. To me
it makes more sense for the function to belong in that module, so I think
of it as a positive change. However, I do not think that returning two
values is really necessary. I think a proper solution would be either
to parse the code or to wrap the stream so we can count how many bytes are
read. Then we could use std.leb.readUleb128, which should be less
error-prone.
2025-03-14 16:50:59 +01:00

331 lines
12 KiB
Zig

const std = @import("std");
const wasm = @import("wasm.zig");
const Allocator = std.mem.Allocator;
/// Returns the result type of `leb128Decode` for the integer type `T`:
/// the decoded value together with the number of encoded bytes consumed.
pub fn leb128Result(T: type) type {
    return struct {
        /// Number of bytes read from the stream for this integer.
        len: usize,
        /// The decoded integer.
        val: T,
    };
}

/// Decodes one LEB128-encoded integer of type `T` from `stream`.
///
/// `T` must be a 32- or 64-bit integer; signed types are decoded as signed
/// LEB128 (sign-extended from bit 6 of the final byte), unsigned types as
/// unsigned LEB128. `stream` only needs a `readByte()` method returning `!u8`.
///
/// Returns the value and how many bytes were consumed. Any error from
/// `readByte` is propagated, and `error.Overflow` is returned when the
/// encoding continues past the number of groups that fit in `T`.
pub fn leb128Decode(comptime T: type, stream: anytype) !leb128Result(T) {
    switch (@typeInfo(T)) {
        .int => {},
        else => @compileError("LEB128 integer decoding only support integers, but got " ++ @typeName(T)),
    }
    const int_info = @typeInfo(T).int;
    if (int_info.bits != 32 and int_info.bits != 64) {
        @compileError("LEB128 integer decoding only supports 32 or 64 bits integers but got " ++ std.fmt.comptimePrint("{d} bits", .{int_info.bits}));
    }

    // Accumulate in the unsigned counterpart so shifts and sign extension are
    // plain bit operations; the bits are reinterpreted as `T` at the end.
    const Unsigned = std.meta.Int(.unsigned, int_info.bits);
    const Shift = std.math.Log2Int(Unsigned);

    var acc: Unsigned = 0;
    // Kept as `usize` so that advancing past the width of `T` cannot overflow.
    var shift: usize = 0;
    var len: usize = 0;
    while (true) {
        const b = try stream.readByte();
        len += 1;
        // A further group would land entirely outside of `T`.
        if (shift >= int_info.bits) return error.Overflow;
        acc |= @as(Unsigned, b & 0x7f) << @as(Shift, @intCast(shift));
        shift += 7;
        if ((b & 0x80) != 0) continue;

        // Last byte: `shift` now points just past the decoded payload, which is
        // where sign extension has to start for negative values.
        if (int_info.signedness == .signed and shift < int_info.bits and (b & 0x40) != 0) {
            acc |= ~@as(Unsigned, 0) << @as(Shift, @intCast(shift));
        }
        break;
    }

    const val: T = @bitCast(acc);
    return .{ .len = len, .val = val };
}
/// Parser-specific errors, returned alongside allocator and stream errors.
pub const Error = error{
/// The input does not have the expected structure: bad magic or version,
/// an unexpected marker byte, or fewer bytes than a length field announced.
malformed_wasm,
/// A name read by `parseName` is not valid UTF-8.
invalid_utf8,
};
/// In-memory result of `parseWasm`. The module owns every slice and
/// container it references; release them with `deinit`.
pub const Module = struct {
    /// Entries of the type section.
    types: []FunctionType,
    /// Entries of the import section, in file order.
    imports: std.ArrayList(Import),
    /// Exported functions: owned export name -> index read from the export entry.
    exports: std.StringHashMap(u32),
    /// One `u32` per entry of the function section.
    functions: []u32,
    /// Limits read from the memory section.
    memory: Memory,
    /// One body per entry of the code section.
    code: []FunctionBody,
    /// Imported functions followed by locally defined ones, as appended by `parseWasm`.
    funcs: std.ArrayList(Function),

    /// Frees everything owned by the module. `allocator` must be the one the
    /// slices were allocated with; `imports`, `exports` and `funcs` release
    /// their own storage through the allocator they were initialized with.
    pub fn deinit(self: *Module, allocator: Allocator) void {
        for (self.types) |t| {
            t.deinit(allocator);
        }
        allocator.free(self.types);

        for (self.imports.items) |i| {
            i.deinit(allocator);
        }
        self.imports.deinit();

        // The map does not own its keys, so the names are freed by hand first.
        var iter = self.exports.iterator();
        while (iter.next()) |entry| {
            allocator.free(entry.key_ptr.*);
        }
        self.exports.deinit();

        allocator.free(self.functions);

        for (self.code) |f| {
            for (f.locals) |l| {
                allocator.free(l.types);
            }
            // The slice of local groups is itself heap-allocated and must be
            // released as well, not only the per-group type buffers.
            allocator.free(f.locals);
            allocator.free(f.code);
        }
        allocator.free(self.code);

        self.funcs.deinit();
    }
};
/// Tag type of `Function`: where a function's implementation comes from.
pub const FunctionScope = enum {
/// Provided through the import section.
external,
/// Defined by a body in the code section.
internal,
};
/// Entry of `Module.funcs`, tagged by `FunctionScope`.
pub const Function = union(FunctionScope) {
/// Position of the import entry inside the import section, as recorded by `parseWasm`.
external: u8,
/// Position of the function body inside the code section, as recorded by `parseWasm`.
internal: u8,
};
// TODO: refactor locals
/// One group of local declarations of a function body.
pub const Local = struct {
/// One byte per local in the group; `parseWasm` fills every entry with the
/// group's single type byte.
types: []u8,
};
/// One entry of the code section as stored by `parseWasm`.
pub const FunctionBody = struct {
/// Local declaration groups, in file order.
locals: []Local,
/// The bytes of the entry that follow the local declarations.
code: []u8,
};
/// Memory limits read from the memory section.
pub const Memory = struct {
/// First limit value of the memory entry.
initial: u32,
/// Second limit value; `parseWasm` stores 0 when the entry's flag byte is 0x00
/// (no maximum present).
max: u32,
};
/// One entry of the type section: the raw type bytes of a function's
/// parameters and results. Both slices are owned by the value.
pub const FunctionType = struct {
    parameters: []u8,
    results: []u8,

    /// Releases `parameters` and then `results` through `allocator`.
    pub fn deinit(self: FunctionType, allocator: Allocator) void {
        const owned = [_][]u8{ self.parameters, self.results };
        for (owned) |slice| allocator.free(slice);
    }
};
/// One entry of the import section. `name` and `module` are owned by the value.
pub const Import = struct {
    name: []u8,
    module: []u8,
    /// Index read after the import's kind byte.
    signature: u32,

    /// Releases `name` and then `module` through `allocator`.
    pub fn deinit(self: Import, allocator: Allocator) void {
        const owned = [_][]u8{ self.name, self.module };
        for (owned) |slice| allocator.free(slice);
    }
};
/// Converts a raw type byte into the corresponding `wasm.Type` tag.
/// The conversion is a plain `@enumFromInt`, so `t` must be a value that
/// `wasm.Type` can represent.
pub fn parseType(t: u8) wasm.Type {
    const ty: wasm.Type = @enumFromInt(t);
    return ty;
}
/// Reads a length-prefixed name from `stream`: a ULEB128 `u32` byte count
/// followed by that many bytes.
///
/// The returned slice is allocated with `allocator` and owned by the caller.
/// Returns `error.malformed_wasm` when the stream ends before the announced
/// number of bytes, `error.invalid_utf8` when the bytes are not valid UTF-8,
/// and propagates allocator and stream errors. Nothing stays allocated when
/// an error is returned.
pub fn parseName(allocator: Allocator, stream: anytype) ![]u8 {
    const size = try std.leb.readULEB128(u32, stream);
    const str = try allocator.alloc(u8, size);
    errdefer allocator.free(str);

    // `readAll` keeps reading until the buffer is full or the stream ends, so
    // a reader that hands out data in small pieces is not mistaken for a
    // truncated file.
    // TODO: better error
    if (try stream.readAll(str) != size) return error.malformed_wasm;
    if (!std.unicode.utf8ValidateSlice(str)) return error.invalid_utf8;
    return str;
}
// TODO: parse Global Section
// TODO: Consider Arena allocator
/// Parses a WebAssembly binary read from `stream` into a `Module`.
///
/// `stream` must offer `readByte`, `readAll`, `isBytes` and `skipBytes`.
/// The returned module owns all of its allocations, which are made with
/// `allocator`; release them with `Module.deinit(allocator)`. Sections that do
/// not occur in the input leave the corresponding field empty (zero for
/// `memory`).
///
/// Table, global, start, element, data, data-count and custom sections are
/// skipped. Returns `Error.malformed_wasm` for a bad header, an unknown
/// section id, a repeated type/function/code section, or a length that does
/// not match the data; allocator and stream errors are propagated. On error
/// everything parsed so far is released through `Module.deinit`.
pub fn parseWasm(allocator: Allocator, stream: anytype) !Module {
    // Every field starts out empty so that the module is fully initialized
    // whichever sections are present and can always be handed to `deinit`.
    var module = Module{
        .types = &.{},
        .imports = std.ArrayList(Import).init(allocator),
        .exports = std.StringHashMap(u32).init(allocator),
        .functions = &.{},
        .memory = .{ .initial = 0, .max = 0 },
        .code = &.{},
        .funcs = std.ArrayList(Function).init(allocator),
    };
    errdefer module.deinit(allocator);

    // Parse magic
    if (!(try stream.isBytes(&[_]u8{ 0x00, 0x61, 0x73, 0x6d }))) return Error.malformed_wasm;
    // Parse version
    if (!(try stream.isBytes(&[_]u8{ 0x01, 0x00, 0x00, 0x00 }))) return Error.malformed_wasm;

    // NOTE: This ensures that (in this block) illegal behavior is safety-checked.
    // This slows down the code, but since this function is only called at the start
    // it is better to take the hit in performance (should only be @enumFromInt)
    // rather than having undefined behavior when the user provides an invalid wasm file.
    @setRuntimeSafety(true);

    while (stream.readByte()) |byte| {
        const section_size = try std.leb.readULEB128(u32, stream);
        switch (@as(std.wasm.Section, @enumFromInt(byte))) {
            std.wasm.Section.custom => {
                // TODO: unimplemented
                // Custom sections may appear between any two sections, so they
                // are skipped instead of ending the parse.
                try stream.skipBytes(section_size, .{});
            },
            std.wasm.Section.type => {
                // A second type section would overwrite (and leak) the first.
                if (module.types.len != 0) return Error.malformed_wasm;
                const type_count = try std.leb.readULEB128(u32, stream);
                const types = try allocator.alloc(FunctionType, type_count);
                // Pre-fill with empty slices so cleanup is valid at any point.
                for (types) |*t| t.* = .{ .parameters = &.{}, .results = &.{} };
                module.types = types;
                for (types) |*t| {
                    if (!(try stream.isBytes(&.{0x60}))) return Error.malformed_wasm;
                    const params_count = try std.leb.readULEB128(u32, stream);
                    t.parameters = try allocator.alloc(u8, params_count);
                    // TODO: better errors
                    if (try stream.readAll(t.parameters) != params_count) return Error.malformed_wasm;
                    const results_count = try std.leb.readULEB128(u32, stream);
                    t.results = try allocator.alloc(u8, results_count);
                    // TODO: better errors
                    if (try stream.readAll(t.results) != results_count) return Error.malformed_wasm;
                }
            },
            std.wasm.Section.import => {
                const import_count = try std.leb.readULEB128(u32, stream);
                for (0..import_count) |i| {
                    // Until the entry is appended the names belong to this
                    // iteration and are released if anything below fails.
                    const mod = try parseName(allocator, stream);
                    errdefer allocator.free(mod);
                    const nm = try parseName(allocator, stream);
                    errdefer allocator.free(nm);
                    const b = try stream.readByte();
                    switch (@as(std.wasm.ExternalKind, @enumFromInt(b))) {
                        std.wasm.ExternalKind.function => try module.funcs.append(.{ .external = @intCast(i) }),
                        // TODO: not implemented
                        std.wasm.ExternalKind.table => {},
                        std.wasm.ExternalKind.memory => {},
                        std.wasm.ExternalKind.global => {},
                    }
                    const idx = try std.leb.readULEB128(u32, stream);
                    try module.imports.append(.{
                        .module = mod,
                        .name = nm,
                        .signature = idx,
                    });
                }
            },
            std.wasm.Section.function => {
                if (module.functions.len != 0) return Error.malformed_wasm;
                const function_count = try std.leb.readULEB128(u32, stream);
                const functions = try allocator.alloc(u32, function_count);
                module.functions = functions;
                for (functions) |*f| {
                    f.* = try std.leb.readULEB128(u32, stream);
                }
            },
            std.wasm.Section.table => {
                // TODO: not implemented
                try stream.skipBytes(section_size, .{});
            },
            std.wasm.Section.memory => {
                const memory_count = try std.leb.readULEB128(u32, stream);
                for (0..memory_count) |_| {
                    const flag = try stream.readByte();
                    const initial = try std.leb.readULEB128(u32, stream);
                    const max: u32 = switch (flag) {
                        0x00 => 0,
                        0x01 => try std.leb.readULEB128(u32, stream),
                        else => return Error.malformed_wasm,
                    };
                    // TODO: support multiple memories
                    module.memory = .{
                        .initial = initial,
                        .max = max,
                    };
                }
            },
            std.wasm.Section.global => {
                // TODO: unimplemented
                try stream.skipBytes(section_size, .{});
            },
            std.wasm.Section.@"export" => {
                const export_count = try std.leb.readULEB128(u32, stream);
                for (0..export_count) |_| {
                    const nm = try parseName(allocator, stream);
                    errdefer allocator.free(nm);
                    const b = try stream.readByte();
                    const idx = try std.leb.readULEB128(u32, stream);
                    switch (@as(std.wasm.ExternalKind, @enumFromInt(b))) {
                        std.wasm.ExternalKind.function => {
                            const slot = try module.exports.getOrPut(nm);
                            // On a repeated name the map keeps the key it
                            // already owns, so the new copy is released; the
                            // latest index still wins.
                            if (slot.found_existing) allocator.free(nm);
                            slot.value_ptr.* = idx;
                        },
                        // TODO: unimplemented,
                        std.wasm.ExternalKind.table => allocator.free(nm),
                        std.wasm.ExternalKind.memory => allocator.free(nm),
                        std.wasm.ExternalKind.global => allocator.free(nm),
                    }
                }
            },
            std.wasm.Section.start => {
                // TODO: unimplemented
                try stream.skipBytes(section_size, .{});
            },
            std.wasm.Section.element => {
                // TODO: unimplemented
                try stream.skipBytes(section_size, .{});
            },
            std.wasm.Section.code => {
                if (module.code.len != 0) return Error.malformed_wasm;
                const code_count = try std.leb.readULEB128(u32, stream);
                const bodies = try allocator.alloc(FunctionBody, code_count);
                // Pre-fill with empty slices so cleanup is valid at any point.
                for (bodies) |*body| body.* = .{ .locals = &.{}, .code = &.{} };
                module.code = bodies;
                for (bodies, 0..) |*body, i| {
                    const code_size = try std.leb.readULEB128(u32, stream);
                    // `code_size` covers the local declarations as well as the
                    // instructions, so the bytes spent on the former are
                    // counted to find out how long the latter is.
                    const local_count = try leb128Decode(u32, stream);
                    var locals_size: usize = local_count.len;
                    const locals = try allocator.alloc(Local, local_count.val);
                    for (locals) |*l| l.* = .{ .types = &.{} };
                    body.locals = locals;
                    for (locals) |*l| {
                        const n = try leb128Decode(u32, stream);
                        l.types = try allocator.alloc(u8, n.val);
                        @memset(l.types, try stream.readByte());
                        // Group count plus the single type byte.
                        locals_size += n.len + 1;
                    }
                    // TODO: maybe is better to parse code into ast here and not do it every frame?
                    // A body that announces fewer bytes than its locals
                    // already consumed is corrupt; do not let the subtraction
                    // underflow.
                    if (locals_size > code_size) return Error.malformed_wasm;
                    const expr_size = code_size - locals_size;
                    body.code = try allocator.alloc(u8, expr_size);
                    // TODO: better error reporting
                    if (try stream.readAll(body.code) != expr_size) return Error.malformed_wasm;
                    try module.funcs.append(.{ .internal = @intCast(i) });
                }
            },
            std.wasm.Section.data => {
                // TODO: unimplemented
                try stream.skipBytes(section_size, .{});
            },
            std.wasm.Section.data_count => {
                // TODO: unimplemented
                try stream.skipBytes(section_size, .{});
            },
            else => return Error.malformed_wasm,
        }
    } else |err| {
        // Running out of input where a section id is expected is the normal
        // end of the module. Written as a comparison so this also compiles for
        // readers whose only possible error is `EndOfStream`.
        if (err != error.EndOfStream) return err;
    }

    return module;
}