WASM work

2025-08-02 11:46:07 -04:00 · 2022-03-15 09:33:34 -04:00 · 2022-03-15 09:33:34 -04:00 · ca85b555aa
commit ca85b555aa
parent 031c998cfc
25 changed files with 672 additions and 395 deletions
--- a/veilid-wasm/wasm-sourcemap.py
+++ b/veilid-wasm/wasm-sourcemap.py
@ -0,0 +1,377 @@
+#!/usr/bin/env python3
+# Copyright 2018 The Emscripten Authors.  All rights reserved.
+# Emscripten is available under two separate licenses, the MIT license and the
+# University of Illinois/NCSA Open Source License.  Both these licenses can be
+# found in the LICENSE file.
+
+"""Utility tools that extracts DWARF information encoded in a wasm output
+produced by the LLVM tools, and encodes it as a wasm source map. Additionally,
+it can collect original sources, change files prefixes, and strip debug
+sections from a wasm file.
+"""
+
+import argparse
+from collections import OrderedDict
+import json
+import logging
+from math import floor, log
+import os
+import re
+from subprocess import Popen, PIPE
+from pathlib import Path
+import sys
+
+__scriptdir__ = os.path.dirname(os.path.abspath(__file__))
+__rootdir__ = os.path.dirname(__scriptdir__)
+sys.path.append(__rootdir__)
+
+logger = logging.getLogger('wasm-sourcemap')
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        prog='wasm-sourcemap.py', description=__doc__)
+    parser.add_argument('wasm', help='wasm file')
+    parser.add_argument('-o', '--output', help='output source map')
+    parser.add_argument('-p', '--prefix', nargs='*',
+                        help='replace source debug filename prefix for source map', default=[])
+    parser.add_argument('-s', '--sources', action='store_true',
+                        help='read and embed source files from file system into source map')
+    parser.add_argument('-l', '--load-prefix', nargs='*',
+                        help='replace source debug filename prefix for reading sources from file system (see also --sources)', default=[])
+    parser.add_argument('-w', nargs='?', help='set output wasm file')
+    parser.add_argument('-x', '--strip', action='store_true',
+                        help='removes debug and linking sections')
+    parser.add_argument('-u', '--source-map-url', nargs='?',
+                        help='specifies sourceMappingURL section contest')
+    parser.add_argument(
+        '--dwarfdump', help="path to llvm-dwarfdump executable")
+    parser.add_argument('--dwarfdump-output', nargs='?',
+                        help=argparse.SUPPRESS)
+    parser.add_argument(
+        '--basepath', help='base path for source files, which will be relative to this')
+    return parser.parse_args()
+
+
+class Prefixes:
+    def __init__(self, args):
+        prefixes = []
+        for p in args:
+            if '=' in p:
+                prefix, replacement = p.split('=')
+                prefixes.append({'prefix': prefix, 'replacement': replacement})
+            else:
+                prefixes.append({'prefix': p, 'replacement': None})
+        self.prefixes = prefixes
+        self.cache = {}
+
+    def resolve(self, name):
+        if name in self.cache:
+            return self.cache[name]
+
+        for p in self.prefixes:
+            if name.startswith(p['prefix']):
+                if p['replacement'] is None:
+                    result = name[len(p['prefix'])::]
+                else:
+                    result = p['replacement'] + name[len(p['prefix'])::]
+                break
+        self.cache[name] = result
+        return result
+
+
+# SourceMapPrefixes contains resolver for file names that are:
+#  - "sources" is for names that output to source maps JSON
+#  - "load" is for paths that used to load source text
+class SourceMapPrefixes:
+    def __init__(self, sources, load):
+        self.sources = sources
+        self.load = load
+
+    def provided(self):
+        return bool(self.sources.prefixes or self.load.prefixes)
+
+
+def encode_vlq(n):
+    VLQ_CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
+    x = (n << 1) if n >= 0 else ((-n << 1) + 1)
+    result = ""
+    while x > 31:
+        result = result + VLQ_CHARS[32 + (x & 31)]
+        x = x >> 5
+    return result + VLQ_CHARS[x]
+
+
+def read_var_uint(wasm, pos):
+    n = 0
+    shift = 0
+    b = ord(wasm[pos:pos + 1])
+    pos = pos + 1
+    while b >= 128:
+        n = n | ((b - 128) << shift)
+        b = ord(wasm[pos:pos + 1])
+        pos = pos + 1
+        shift += 7
+    return n + (b << shift), pos
+
+
+def strip_debug_sections(wasm):
+    logger.debug('Strip debug sections')
+    pos = 8
+    stripped = wasm[:pos]
+
+    while pos < len(wasm):
+        section_start = pos
+        section_id, pos_ = read_var_uint(wasm, pos)
+        section_size, section_body = read_var_uint(wasm, pos_)
+        pos = section_body + section_size
+        if section_id == 0:
+            name_len, name_pos = read_var_uint(wasm, section_body)
+            name_end = name_pos + name_len
+            name = wasm[name_pos:name_end]
+            if name == "linking" or name == "sourceMappingURL" or name.startswith("reloc..debug_") or name.startswith(".debug_"):
+                continue  # skip debug related sections
+        stripped = stripped + wasm[section_start:pos]
+
+    return stripped
+
+
+def encode_uint_var(n):
+    result = bytearray()
+    while n > 127:
+        result.append(128 | (n & 127))
+        n = n >> 7
+    result.append(n)
+    return bytes(result)
+
+
+def append_source_mapping(wasm, url):
+    logger.debug('Append sourceMappingURL section')
+    section_name = "sourceMappingURL"
+    section_content = encode_uint_var(
+        len(section_name)) + section_name + encode_uint_var(len(url)) + url
+    return wasm + encode_uint_var(0) + encode_uint_var(len(section_content)) + section_content
+
+
+def get_code_section_offset(wasm):
+    logger.debug('Read sections index')
+    pos = 8
+
+    while pos < len(wasm):
+        section_id, pos_ = read_var_uint(wasm, pos)
+        section_size, pos = read_var_uint(wasm, pos_)
+        if section_id == 10:
+            return pos
+        pos = pos + section_size
+
+
+def remove_dead_entries(entries):
+    # Remove entries for dead functions. It is a heuristics to ignore data if the
+    # function starting address near to 0 (is equal to its size field length).
+    block_start = 0
+    cur_entry = 0
+    while cur_entry < len(entries):
+        if not entries[cur_entry]['eos']:
+            cur_entry += 1
+            continue
+        fn_start = entries[block_start]['address']
+        # Calculate the LEB encoded function size (including size field)
+        fn_size_length = floor(
+            log(entries[cur_entry]['address'] - fn_start + 1, 128)) + 1
+        min_live_offset = 1 + fn_size_length  # 1 byte is for code section entries
+        if fn_start < min_live_offset:
+            # Remove dead code debug info block.
+            del entries[block_start:cur_entry + 1]
+            cur_entry = block_start
+            continue
+        cur_entry += 1
+        block_start = cur_entry
+
+
+def read_dwarf_entries(wasm, options):
+    if options.dwarfdump_output:
+        output = Path(options.dwarfdump_output).read_bytes()
+    elif options.dwarfdump:
+        logger.debug('Reading DWARF information from %s' % wasm)
+        if not os.path.exists(options.dwarfdump):
+            logger.error('llvm-dwarfdump not found: ' + options.dwarfdump)
+            sys.exit(1)
+        process = Popen([options.dwarfdump, '-debug-info',
+                        '-debug-line', '--recurse-depth=0', wasm], stdout=PIPE)
+        output, err = process.communicate()
+        exit_code = process.wait()
+        if exit_code != 0:
+            logger.error(
+                'Error during llvm-dwarfdump execution (%s)' % exit_code)
+            sys.exit(1)
+    else:
+        logger.error('Please specify either --dwarfdump or --dwarfdump-output')
+        sys.exit(1)
+
+    entries = []
+    debug_line_chunks = re.split(
+        r"debug_line\[(0x[0-9a-f]*)\]", output.decode('utf-8'))
+    maybe_debug_info_content = debug_line_chunks[0]
+    for i in range(1, len(debug_line_chunks), 2):
+        stmt_list = debug_line_chunks[i]
+        comp_dir_match = re.search(r"DW_AT_stmt_list\s+\(" + stmt_list + r"\)\s+" +
+                                   r"DW_AT_comp_dir\s+\(\"([^\"]+)", maybe_debug_info_content)
+        comp_dir = comp_dir_match.group(
+            1) if comp_dir_match is not None else ""
+
+        line_chunk = debug_line_chunks[i + 1]
+
+        # include_directories[  1] = "/Users/yury/Work/junk/sqlite-playground/src"
+        # file_names[  1]:
+        #            name: "playground.c"
+        #       dir_index: 1
+        #        mod_time: 0x00000000
+        #          length: 0x00000000
+        #
+        # Address            Line   Column File   ISA Discriminator Flags
+        # ------------------ ------ ------ ------ --- ------------- -------------
+        # 0x0000000000000006     22      0      1   0             0  is_stmt
+        # 0x0000000000000007     23     10      1   0             0  is_stmt prologue_end
+        # 0x000000000000000f     23      3      1   0             0
+        # 0x0000000000000010     23      3      1   0             0  end_sequence
+        # 0x0000000000000011     28      0      1   0             0  is_stmt
+
+        include_directories = {'0': comp_dir}
+        for dir in re.finditer(r"include_directories\[\s*(\d+)\] = \"([^\"]*)", line_chunk):
+            include_directories[dir.group(1)] = dir.group(2)
+
+        files = {}
+        for file in re.finditer(r"file_names\[\s*(\d+)\]:\s+name: \"([^\"]*)\"\s+dir_index: (\d+)", line_chunk):
+            dir = include_directories[file.group(3)]
+            file_path = (dir + '/' if file.group(2)
+                         [0] != '/' else '') + file.group(2)
+            files[file.group(1)] = file_path
+
+        for line in re.finditer(r"\n0x([0-9a-f]+)\s+(\d+)\s+(\d+)\s+(\d+)(.*?end_sequence)?", line_chunk):
+            entry = {'address': int(line.group(1), 16), 'line': int(line.group(2)), 'column': int(
+                line.group(3)), 'file': files[line.group(4)], 'eos': line.group(5) is not None}
+            if not entry['eos']:
+                entries.append(entry)
+            else:
+                # move end of function to the last END operator
+                entry['address'] -= 1
+                if entries[-1]['address'] == entry['address']:
+                    # last entry has the same address, reusing
+                    entries[-1]['eos'] = True
+                else:
+                    entries.append(entry)
+
+    remove_dead_entries(entries)
+
+    # return entries sorted by the address field
+    return sorted(entries, key=lambda entry: entry['address'])
+
+
+def normalize_path(path):
+    return path.replace('\\', '/').replace('//', '/')
+
+
+def build_sourcemap(entries, code_section_offset, prefixes, collect_sources, base_path):
+    sources = []
+    sources_content = [] if collect_sources else None
+    mappings = []
+    sources_map = {}
+    last_address = 0
+    last_source_id = 0
+    last_line = 1
+    last_column = 1
+    for entry in entries:
+        line = entry['line']
+        column = entry['column']
+        # ignore entries with line 0
+        if line == 0:
+            continue
+        # start at least at column 1
+        if column == 0:
+            column = 1
+        address = entry['address'] + code_section_offset
+        file_name = entry['file']
+        file_name = normalize_path(file_name)
+        # if prefixes were provided, we use that; otherwise, we emit a relative
+        # path
+        if prefixes.provided():
+            source_name = prefixes.sources.resolve(file_name)
+        else:
+            try:
+                file_name = os.path.relpath(file_name, base_path)
+            except ValueError:
+                file_name = os.path.abspath(file_name)
+            file_name = normalize_path(file_name)
+            source_name = file_name
+        if source_name not in sources_map:
+            source_id = len(sources)
+            sources_map[source_name] = source_id
+            sources.append(source_name)
+            if collect_sources:
+                load_name = prefixes.load.resolve(file_name)
+                try:
+                    with open(load_name, 'r') as infile:
+                        source_content = infile.read()
+                    sources_content.append(source_content)
+                except IOError:
+                    print('Failed to read source: %s' % load_name)
+                    sources_content.append(None)
+        else:
+            source_id = sources_map[source_name]
+
+        address_delta = address - last_address
+        source_id_delta = source_id - last_source_id
+        line_delta = line - last_line
+        column_delta = column - last_column
+        mappings.append(encode_vlq(address_delta) + encode_vlq(source_id_delta) +
+                        encode_vlq(line_delta) + encode_vlq(column_delta))
+        last_address = address
+        last_source_id = source_id
+        last_line = line
+        last_column = column
+    return OrderedDict([('version', 3),
+                        ('names', []),
+                        ('sources', sources),
+                        ('sourcesContent', sources_content),
+                        ('mappings', ','.join(mappings))])
+
+
+def main():
+    options = parse_args()
+
+    wasm_input = options.wasm
+    with open(wasm_input, 'rb') as infile:
+        wasm = infile.read()
+
+    entries = read_dwarf_entries(wasm_input, options)
+
+    code_section_offset = get_code_section_offset(wasm)
+
+    prefixes = SourceMapPrefixes(sources=Prefixes(
+        options.prefix), load=Prefixes(options.load_prefix))
+
+    logger.debug('Saving to %s' % options.output)
+    map = build_sourcemap(entries, code_section_offset,
+                          prefixes, options.sources, options.basepath)
+    with open(options.output, 'w') as outfile:
+        json.dump(map, outfile, separators=(',', ':'))
+
+    if options.strip:
+        wasm = strip_debug_sections(wasm)
+
+    if options.source_map_url:
+        wasm = append_source_mapping(wasm, options.source_map_url)
+
+    if options.w:
+        logger.debug('Saving wasm to %s' % options.w)
+        with open(options.w, 'wb') as outfile:
+            outfile.write(wasm)
+
+    logger.debug('Done')
+    return 0
+
+
+if __name__ == '__main__':
+    logging.basicConfig(level=logging.DEBUG if os.environ.get(
+        'EMCC_DEBUG') else logging.INFO)
+    sys.exit(main())