#!/usr/bin/env python3
# Copyright 2018 The Emscripten Authors.  All rights reserved.
# Emscripten is available under two separate licenses, the MIT license and the
# University of Illinois/NCSA Open Source License.  Both these licenses can be
# found in the LICENSE file.

"""Utility tools that extracts DWARF information encoded in a wasm output
produced by the LLVM tools, and encodes it as a wasm source map. Additionally,
it can collect original sources, change files prefixes, and strip debug
sections from a wasm file.
"""

import argparse
from collections import OrderedDict
import json
import logging
from math import floor, log
import os
import re
from subprocess import Popen, PIPE
from pathlib import Path
import sys

__scriptdir__ = os.path.dirname(os.path.abspath(__file__))
__rootdir__ = os.path.dirname(__scriptdir__)
sys.path.append(__rootdir__)

logger = logging.getLogger('wasm-sourcemap')


def parse_args():
    """Parse the command line and return the resulting argparse namespace."""
    parser = argparse.ArgumentParser(
        prog='wasm-sourcemap.py', description=__doc__)
    parser.add_argument('wasm', help='wasm file')
    parser.add_argument('-o', '--output', help='output source map')
    parser.add_argument('-p', '--prefix', nargs='*',
                        help='replace source debug filename prefix for source map',
                        default=[])
    parser.add_argument('-s', '--sources', action='store_true',
                        help='read and embed source files from file system into source map')
    parser.add_argument('-l', '--load-prefix', nargs='*',
                        help='replace source debug filename prefix for reading sources from file system (see also --sources)',
                        default=[])
    parser.add_argument('-w', nargs='?', help='set output wasm file')
    parser.add_argument('-x', '--strip', action='store_true',
                        help='removes debug and linking sections')
    parser.add_argument('-u', '--source-map-url', nargs='?',
                        help='specifies sourceMappingURL section contest')
    parser.add_argument('--dwarfdump',
                        help="path to llvm-dwarfdump executable")
    parser.add_argument('--dwarfdump-output', nargs='?',
                        help=argparse.SUPPRESS)
    parser.add_argument('--basepath',
                        help='base path for source files, which will be relative to this')
    return parser.parse_args()


class Prefixes:
    """Rewrites file name prefixes according to ``PREFIX[=REPLACEMENT]`` rules.

    Each rule either strips ``PREFIX`` (no ``=`` given) or substitutes it with
    ``REPLACEMENT``.  Rules are tried in the order given and the first match
    wins.  Results are memoized per name.
    """

    def __init__(self, args):
        """Build the rule list from an iterable of ``PREFIX[=REPLACEMENT]``."""
        prefixes = []
        for rule in args:
            # Split on the first '=' only, so the replacement itself may
            # contain '=' (e.g. a URL with a query string).
            prefix, sep, replacement = rule.partition('=')
            prefixes.append({'prefix': prefix,
                             'replacement': replacement if sep else None})
        self.prefixes = prefixes
        self.cache = {}

    def resolve(self, name):
        """Return ``name`` with the first matching prefix rule applied.

        A name that matches no rule is returned unchanged.
        """
        if name in self.cache:
            return self.cache[name]

        # Default for the "no rule matched" case; without it the lookup below
        # would leave the result undefined.
        result = name
        for p in self.prefixes:
            if name.startswith(p['prefix']):
                tail = name[len(p['prefix']):]
                if p['replacement'] is None:
                    result = tail
                else:
                    result = p['replacement'] + tail
                break
        self.cache[name] = result
        return result


# SourceMapPrefixes contains resolver for file names that are:
#  - "sources" is for names that output to source maps JSON
#  - "load" is for paths that used to load source text
class SourceMapPrefixes:
    """Pair of Prefixes resolvers: one for emitted names, one for loading."""

    def __init__(self, sources, load):
        self.sources = sources
        self.load = load

    def provided(self):
        """Return True when at least one prefix rule exists in either resolver."""
        return bool(self.sources.prefixes or self.load.prefixes)


def encode_vlq(n):
    """Encode the signed integer ``n`` as a Base64 VLQ string.

    The sign is stored in the lowest bit, then the value is emitted five bits
    at a time (least significant group first); every character except the
    last has the continuation bit (32) set.
    """
    VLQ_CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
    x = (n << 1) if n >= 0 else ((-n << 1) + 1)
    digits = []
    while x > 31:
        digits.append(VLQ_CHARS[32 + (x & 31)])
        x = x >> 5
    digits.append(VLQ_CHARS[x])
    return "".join(digits)


def read_var_uint(wasm, pos):
    """Decode an unsigned LEB128 integer from ``wasm`` starting at ``pos``.

    Returns a ``(value, next_pos)`` tuple where ``next_pos`` is the offset of
    the first byte after the encoded integer.
    """
    n = 0
    shift = 0
    # One-element slices keep ord() working for both bytes and str inputs.
    b = ord(wasm[pos:pos + 1])
    pos = pos + 1
    while b >= 128:
        n = n | ((b - 128) << shift)
        b = ord(wasm[pos:pos + 1])
        pos = pos + 1
        shift += 7
    return n + (b << shift), pos


def strip_debug_sections(wasm):
    """Return ``wasm`` without its debug-related custom sections.

    Custom sections (id 0) named ``linking``, ``sourceMappingURL`` or starting
    with ``.debug_`` / ``reloc..debug_`` are dropped; the 8-byte header and
    all other sections are copied unchanged.
    """
    logger.debug('Strip debug sections')
    pos = 8
    chunks = [wasm[:pos]]
    while pos < len(wasm):
        section_start = pos
        section_id, pos_ = read_var_uint(wasm, pos)
        section_size, section_body = read_var_uint(wasm, pos_)
        pos = section_body + section_size
        if section_id == 0:
            name_len, name_pos = read_var_uint(wasm, section_body)
            name_end = name_pos + name_len
            name = wasm[name_pos:name_end]
            # The module is read in binary mode, so the name is bytes; decode
            # it before comparing with text (bytes never equal str and
            # bytes.startswith(str) raises TypeError).
            if isinstance(name, (bytes, bytearray)):
                name = name.decode('utf-8', 'replace')
            if (name in ('linking', 'sourceMappingURL') or
                    name.startswith(('reloc..debug_', '.debug_'))):
                continue  # skip debug related sections
        chunks.append(wasm[section_start:pos])
    # wasm[:0] is an empty value of the input's own type, so the join result
    # has the same type as the input.
    return wasm[:0].join(chunks)


def encode_uint_var(n):
    """Encode the non-negative integer ``n`` as unsigned LEB128 bytes."""
    result = bytearray()
    while n > 127:
        result.append(128 | (n & 127))
        n = n >> 7
    result.append(n)
    return bytes(result)


def append_source_mapping(wasm, url):
    """Return ``wasm`` with a ``sourceMappingURL`` custom section appended.

    ``url`` may be text (encoded as UTF-8) or bytes.  Both length prefixes
    count encoded bytes, not characters.
    """
    logger.debug('Append sourceMappingURL section')
    section_name = b"sourceMappingURL"
    url_bytes = url.encode('utf-8') if isinstance(url, str) else bytes(url)
    section_content = (encode_uint_var(len(section_name)) + section_name +
                       encode_uint_var(len(url_bytes)) + url_bytes)
    return (wasm + encode_uint_var(0) +
            encode_uint_var(len(section_content)) + section_content)


def get_code_section_offset(wasm):
    """Return the offset of the code section (id 10) body within ``wasm``.

    Returns None when the module contains no code section.
    """
    logger.debug('Read sections index')
    pos = 8
    while pos < len(wasm):
        section_id, pos_ = read_var_uint(wasm, pos)
        section_size, pos = read_var_uint(wasm, pos_)
        if section_id == 10:
            return pos
        pos = pos + section_size
    return None


def remove_dead_entries(entries):
    """Delete, in place, the line-table blocks that belong to dead functions.

    A block is a run of entries ending with one whose 'eos' flag is set.
    """
    # Remove entries for dead functions. It is a heuristics to ignore data if the
    # function starting address near to 0 (is equal to its size field length).
    block_start = 0
    cur_entry = 0
    while cur_entry < len(entries):
        if not entries[cur_entry]['eos']:
            cur_entry += 1
            continue
        fn_start = entries[block_start]['address']
        # Calculate the LEB encoded function size (including size field)
        fn_size_length = floor(
            log(entries[cur_entry]['address'] - fn_start + 1, 128)) + 1
        min_live_offset = 1 + fn_size_length  # 1 byte is for code section entries
        if fn_start < min_live_offset:
            # Remove dead code debug info block.
            del entries[block_start:cur_entry + 1]
            # The next block now starts at the index just vacated.
            cur_entry = block_start
            continue
        cur_entry += 1
        block_start = cur_entry


def read_dwarf_entries(wasm, options):
    """Return the line-table entries of ``wasm``, sorted by address.

    The llvm-dwarfdump text is read from ``options.dwarfdump_output`` when
    set, otherwise produced by running ``options.dwarfdump`` on ``wasm``.
    Each entry is a dict with the keys 'address', 'line', 'column', 'file'
    and 'eos' (True on the last entry of a sequence).  Exits the process with
    status 1 when no dump source is configured, the tool is missing, or the
    tool fails.
    """
    if options.dwarfdump_output:
        output = Path(options.dwarfdump_output).read_bytes()
    elif options.dwarfdump:
        logger.debug('Reading DWARF information from %s' % wasm)
        if not os.path.exists(options.dwarfdump):
            logger.error('llvm-dwarfdump not found: ' + options.dwarfdump)
            sys.exit(1)
        process = Popen([options.dwarfdump, '-debug-info', '-debug-line',
                         '--recurse-depth=0', wasm], stdout=PIPE)
        output, _ = process.communicate()
        exit_code = process.wait()
        if exit_code != 0:
            logger.error(
                'Error during llvm-dwarfdump execution (%s)' % exit_code)
            sys.exit(1)
    else:
        logger.error('Please specify either --dwarfdump or --dwarfdump-output')
        sys.exit(1)

    entries = []
    # The split yields [debug_info text, offset, chunk, offset, chunk, ...].
    debug_line_chunks = re.split(
        r"debug_line\[(0x[0-9a-f]*)\]", output.decode('utf-8'))
    maybe_debug_info_content = debug_line_chunks[0]
    for i in range(1, len(debug_line_chunks), 2):
        stmt_list = debug_line_chunks[i]
        comp_dir_match = re.search(r"DW_AT_stmt_list\s+\(" + stmt_list + r"\)\s+" +
                                   r"DW_AT_comp_dir\s+\(\"([^\"]+)",
                                   maybe_debug_info_content)
        comp_dir = comp_dir_match.group(1) if comp_dir_match is not None else ""

        line_chunk = debug_line_chunks[i + 1]

        # include_directories[  1] = "/Users/yury/Work/junk/sqlite-playground/src"
        # file_names[  1]:
        #            name: "playground.c"
        #       dir_index: 1
        #        mod_time: 0x00000000
        #          length: 0x00000000
        #
        # Address            Line   Column File   ISA Discriminator Flags
        # ------------------ ------ ------ ------ --- ------------- -------------
        # 0x0000000000000006     22      0      1   0             0  is_stmt
        # 0x0000000000000007     23     10      1   0             0  is_stmt prologue_end
        # 0x000000000000000f     23      3      1   0             0
        # 0x0000000000000010     23      3      1   0             0  end_sequence
        # 0x0000000000000011     28      0      1   0             0  is_stmt

        include_directories = {'0': comp_dir}
        for dir_match in re.finditer(r"include_directories\[\s*(\d+)\] = \"([^\"]*)",
                                     line_chunk):
            include_directories[dir_match.group(1)] = dir_match.group(2)

        files = {}
        for file_match in re.finditer(r"file_names\[\s*(\d+)\]:\s+name: \"([^\"]*)\"\s+dir_index: (\d+)",
                                      line_chunk):
            directory = include_directories[file_match.group(3)]
            file_name = file_match.group(2)
            # Only relative names are joined to their include directory.
            if file_name.startswith('/'):
                file_path = file_name
            else:
                file_path = directory + '/' + file_name
            files[file_match.group(1)] = file_path

        for line in re.finditer(r"\n0x([0-9a-f]+)\s+(\d+)\s+(\d+)\s+(\d+)(.*?end_sequence)?",
                                line_chunk):
            entry = {'address': int(line.group(1), 16),
                     'line': int(line.group(2)),
                     'column': int(line.group(3)),
                     'file': files[line.group(4)],
                     'eos': line.group(5) is not None}
            if not entry['eos']:
                entries.append(entry)
            else:
                # move end of function to the last END operator
                entry['address'] -= 1
                # Guard against an end_sequence row arriving before any other
                # entry has been collected.
                if entries and entries[-1]['address'] == entry['address']:
                    # last entry has the same address, reusing
                    entries[-1]['eos'] = True
                else:
                    entries.append(entry)

    remove_dead_entries(entries)

    # return entries sorted by the address field
    return sorted(entries, key=lambda entry: entry['address'])


def normalize_path(path):
    """Return ``path`` with backslashes turned into '/' and '//' collapsed."""
    return path.replace('\\', '/').replace('//', '/')


def build_sourcemap(entries, code_section_offset, prefixes, collect_sources,
                    base_path):
    """Build the source map (version 3) dictionary for ``entries``.

    Args:
      entries: line-table entries as produced by read_dwarf_entries().
      code_section_offset: value added to every entry address.
      prefixes: SourceMapPrefixes used to rewrite emitted and loaded names.
      collect_sources: when true, source text is read and embedded under
        'sourcesContent' (None for files that cannot be read).
      base_path: directory that emitted names are made relative to when no
        prefix rules were provided.

    Returns:
      An OrderedDict with the keys 'version', 'names', 'sources',
      'sourcesContent' and 'mappings'.
    """
    sources = []
    sources_content = [] if collect_sources else None
    mappings = []
    sources_map = {}
    last_address = 0
    last_source_id = 0
    last_line = 1
    last_column = 1
    for entry in entries:
        line = entry['line']
        column = entry['column']
        # ignore entries with line 0
        if line == 0:
            continue
        # start at least at column 1
        if column == 0:
            column = 1
        address = entry['address'] + code_section_offset
        file_name = normalize_path(entry['file'])
        # if prefixes were provided, we use that; otherwise, we emit a relative
        # path
        if prefixes.provided():
            source_name = prefixes.sources.resolve(file_name)
        else:
            try:
                file_name = os.path.relpath(file_name, base_path)
            except ValueError:
                file_name = os.path.abspath(file_name)
            file_name = normalize_path(file_name)
            source_name = file_name
        if source_name not in sources_map:
            source_id = len(sources)
            sources_map[source_name] = source_id
            sources.append(source_name)
            if collect_sources:
                load_name = prefixes.load.resolve(file_name)
                try:
                    with open(load_name, 'r') as infile:
                        source_content = infile.read()
                    sources_content.append(source_content)
                except (IOError, UnicodeDecodeError):
                    # An undecodable file is treated like a missing one so a
                    # single bad source does not abort the whole map.
                    print('Failed to read source: %s' % load_name)
                    sources_content.append(None)
        else:
            source_id = sources_map[source_name]

        # Every field is stored as a delta from the previous mapping.
        address_delta = address - last_address
        source_id_delta = source_id - last_source_id
        line_delta = line - last_line
        column_delta = column - last_column
        mappings.append(encode_vlq(address_delta) + encode_vlq(source_id_delta) +
                        encode_vlq(line_delta) + encode_vlq(column_delta))
        last_address = address
        last_source_id = source_id
        last_line = line
        last_column = column
    return OrderedDict([('version', 3),
                        ('names', []),
                        ('sources', sources),
                        ('sourcesContent', sources_content),
                        ('mappings', ','.join(mappings))])


def main():
    """Run the tool: write the source map and, if requested, a new wasm file."""
    options = parse_args()

    # Fail early with a clear message instead of a TypeError from open(None)
    # after all the work has been done.
    if not options.output:
        logger.error('Please specify the output source map with -o/--output')
        sys.exit(1)

    wasm_input = options.wasm
    with open(wasm_input, 'rb') as infile:
        wasm = infile.read()

    entries = read_dwarf_entries(wasm_input, options)

    code_section_offset = get_code_section_offset(wasm)

    prefixes = SourceMapPrefixes(sources=Prefixes(options.prefix),
                                 load=Prefixes(options.load_prefix))

    logger.debug('Saving to %s' % options.output)
    source_map = build_sourcemap(entries, code_section_offset, prefixes,
                                 options.sources, options.basepath)
    with open(options.output, 'w') as outfile:
        json.dump(source_map, outfile, separators=(',', ':'))

    if options.strip:
        wasm = strip_debug_sections(wasm)

    if options.source_map_url:
        wasm = append_source_mapping(wasm, options.source_map_url)

    if options.w:
        logger.debug('Saving wasm to %s' % options.w)
        with open(options.w, 'wb') as outfile:
            outfile.write(wasm)

    logger.debug('Done')
    return 0


if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG if os.environ.get(
        'EMCC_DEBUG') else logging.INFO)
    sys.exit(main())