4 files changed, 629 insertions, 0 deletions
diff --git a/third_party/bazel/rules_haskell/debug/linking_utils/BUILD.bazel b/third_party/bazel/rules_haskell/debug/linking_utils/BUILD.bazel
new file mode 100644
index 0000000000..a32be2cfb6
--- /dev/null
+++ b/third_party/bazel/rules_haskell/debug/linking_utils/BUILD.bazel
@@ -0,0 +1,50 @@
+load(
+    ":ldd_test.bzl",
+    "ldd_test",
+)
+
+py_library(
+    name = "linking_utils",
+    srcs = ["ldd.py"],
+    visibility = ["//visibility:public"],
+)
+
+# test the ldd debug library on the output of `//tests/binary-indirect-cbits`
+ldd_test(
+    name = "test-ldd",
+    current_workspace = None,
+    elf_binary = "//tests/binary-indirect-cbits",
+    script = r'''
+import sys
+
+def contains_error(error):
+    """check whether any of the dependencies contains `error`,
+    where error is something from `LDD_ERRORS`.
+    Returns {} if there's no error.
+    """
+    def f(d):
+        return { k: v for k, v in d['needed'].items()
+          if (v == error
+             or (v not in LDD_ERRORS
+                and dict_remove_empty(v['item']) != {})) }
+    return f
+
+# output should have some runpaths
+assert \
+    ldd(identity, sys.argv[1])['runpath_dirs']\
+    > 0
+
+# some of the dependencies are implicit and not in NEEDED flags
+assert ldd(contains_error(LDD_UNKNOWN), sys.argv[1])
+
+import pprint
+# none of the dependencies must be missing
+res = ldd(contains_error(LDD_MISSING), sys.argv[1])
+if res != {}:
+  print("These dependencies are missing:")
+  pprint.pprint(res)
+  exit(1)
+''',
+    # it only works on linux
+    tags = ["dont_test_on_darwin"],
+)
diff --git a/third_party/bazel/rules_haskell/debug/linking_utils/README.md b/third_party/bazel/rules_haskell/debug/linking_utils/README.md
new file mode 100644
index 0000000000..57384a27fe
--- /dev/null
+++ b/third_party/bazel/rules_haskell/debug/linking_utils/README.md
@@ -0,0 +1,265 @@
+# Debugging linking errors
+
+The usual utilities, like `nm`, `objdump`, and of course `ldd` (see
+[here](https://linux-audit.com/elf-binaries-on-linux-understanding-and-analysis/#tools-for-binary-analysis)
+for a good overview of existing tools) go a long way. Yet, when
+debugging non-trivial runtime linker failures one would oftentimes
+like to filter outputs programmatically, with more advanced query
+logic than just simple `grep` and `sed` expressions.
+
+This library provides a small set of utility subroutines. These can
+help debug complicated linker errors.
+
+The main function is `ldd(f, elf_path)`. It is in the same spirit
+as `ldd(1)`, but instead of a flat list of resolved libraries, it
+returns a tree of structured information.
+
+When we use the term `ldd` in the following document, it refers
+to the `ldd` function exported from [./ldd.py](./ldd.py).
+
+To query that tree, you pass it a function `f`, which is applied to
+each dependency recursively (transforming the tree from the bottom
+up).
+
+The following functions are exported alongside the `ldd` function.
+They can be passed to `ldd` and used as building blocks for insightful
+queries:
+
+- `identity`: don’t transform, output everything
+- `remove_matching_needed`: remove needed entries that match a regex
+- `remove_matching_runpaths`: remove runpaths that match a regex
+- `non_existing_runpaths`: return a list of runpaths that don’t exist
+  in the filesystem
+- `unused_runpaths`: return a list of runpaths that are listed in the
+  elf binary header, but no dependency was actually found in them
+- `collect_unused_runpaths`: give an overview of all unused runpaths
+
+Helpers:
+- `dict_remove_empty`: remove fields with empty lists/dicts from an output
+- `items`: `dict.iteritems()` for both python 2 and 3
+
+See the introductory tutorial below on how to use these functions.
+
+## Example usage
+
+### Setup
+
+If you have a bazel target which outputs a binary which you want to
+debug, the easiest way is to use `ldd_test`:
+
+```python
+load(
+    "//:debug/linking_utils/ldd_test.bzl",
+    "ldd_test",
+)
+
+ldd_test(
+    name = "test-ldd",
+    elf_binary = "//tests/binary-indirect-cbits",
+    current_workspace = None,
+    script = r'''
+YOUR SCRIPT HERE
+'''
+)
+```
+
+All exported functions from `ldd.py` are already in scope.
+See the [`BUILD`](./BUILD) file in this directory for an example.
+
+
+### Writing queries
+
+`ldd` takes a function that is applied to each layer of elf
+dependencies. This function is passed a set of structured data.
+This data is gathered by querying the elf binary with `objdump`
+and parsing the header fields of the dynamic section:
+
+```
+DependencyInfo :
+{ needed : dict(string, union(
+    LDD_MISSING, LDD_UNKNOWN,
+    {
+        # the needed dependency
+        item : a,
+        # where the dependency was found in
+        found_in : RunpathDir
+    }))
+# all runpath directories that were searched
+, runpath_dirs : [ RunpathDir ] }
+```
+
+The amount of data can get quite extensive for larger projects, so you
+need a way to filter it down to get to the bottom of our problem.
+
+If a transitive dependency cannot be found by the runtime linker, the
+binary cannot be started. `ldd` shows such a problem by setting
+the corresponding value in the `needed` dict to `LDD_MISSING`.
+To remove everything from the output but the missing dependency and
+the path to that dependency, you can write a filter like this:
+
+```python
+# `d` is the DependencyInfo dict from above
+def filter_down_to_missing(d):
+    res = {}
+
+    # items is a .iteritems() that works for py 2 and 3
+    for name, dep in items(d['needed']):
+        if dep == LDD_MISSING:
+            res[name] = LDD_MISSING
+        elif dep in LDD_ERRORS:
+            pass
+        else:
+            # dep['item'] contains the already converted info
+            # from the previous layer
+            res[name] = dep['item']
+
+    # dict_remove_empty removes all empty fields from the dict,
+    # otherwise your result contains a lot of {} in the values.
+    return dict_remove_empty(res)
+
+# To get human-readable output, we re-use python’s pretty printing
+# library. It’s only simple python values after all!
+import pprint
+pprint.pprint(
+  # actually parse the elf binary and apply only_missing on each layer
+  ldd(
+    filter_down_to_missing,
+    # the path to the elf binary you want to expect.
+    elf_binary_path
+  )
+)
+```
+
+Note that in the filter you only need to filter the data for the
+current executable, and add the info from previous layers (which are
+available in `d['item']`).
+
+The result might look something like:
+
+```python
+{'libfoo.so.5': {'libbar.so.1': {'libbaz.so.6': 'MISSING'}}}
+```
+
+or
+
+```python
+{}
+```
+
+if nothing is missing.
+
+Now, that is a similar output to what a tool like `lddtree(1)` could
+give you. But we don’t need to stop there because it’s trivial to
+augment your output with more information:
+
+
+```python
+def missing_with_runpath(d):
+  # our previous function can be re-used
+  missing = filter_down_to_missing(d)
+
+  # only display runpaths if there are missing deps
+  runpaths = [] if missing is {} else d['runpath_dirs']
+
+  # dict_remove_empty keeps the output clean
+  return dict_remove_empty({
+    'rpth': runpaths,
+    'miss': missing
+  })
+
+# same invocation, different function
+pprint.pprint(
+  ldd(
+    missing_with_runpath,
+    elf_binary_path
+  )
+)
+```
+
+which displays something like this for my example binary:
+
+```python
+{ 'miss': { 'libfoo.so.5': { 'miss': { 'libbar.so.1': { 'miss': { 'libbaz.so.6': 'MISSING'},
+                                                          'rpth': [ { 'absolute_path': '/home/philip/.cache/bazel/_bazel_philip/fd9fea5ad581ea59473dc1f9d6bce826/execroot/myproject/bazel-out/k8-fastbuild/bin/something/and/bazel-out/k8-fastbuild/bin/other/integrate',
+                                                                      'path': '$ORIGIN/../../../../../../bazel-out/k8-fastbuild/bin/other/integrate'}]}},
+                             'rpth': [ { 'absolute_path': '/nix/store/xdsjx0gba4id3yyqxv66bxnm2sqixkjj-glibc-2.27/lib',
+                                         'path': '/nix/store/xdsjx0gba4id3yyqxv66bxnm2sqixkjj-glibc-2.27/lib'},
+                                       { 'absolute_path': '/nix/store/x6inizi5ahlyhqxxwv1rvn05a25icarq-gcc-7.3.0-lib/lib',
+                                         'path': '/nix/store/x6inizi5ahlyhqxxwv1rvn05a25icarq-gcc-7.3.0-lib/lib'}]}},
+  'rpth': [ … lots more nix rpaths … ]}
+```
+
+That’s still a bit cluttered for my taste, so let’s filter out
+the `/nix/store` paths (which are mostly noise):
+
+```python
+import re
+nix_matcher = re.compile("/nix/store.*")
+
+def missing_with_runpath(d):
+  missing = filter_down_to_missing(d)
+
+  # this is one of the example functions provided by ldd.py
+  remove_matching_runpaths(d, nix_matcher)
+  # ^^^
+
+  runpaths = [] if missing is {} else d['runpath_dirs']
+
+  # dict_remove_empty keeps the output clean
+  return dict_remove_empty({
+    'rpth': runpaths,
+    'miss': missing
+  })
+```
+
+and we are down to:
+
+```python
+{ 'miss': { 'libfoo.so.5': { 'miss': { 'libbar.so.1': { 'miss': { 'libbaz.so.6': 'MISSING'},
+                                                          'rpth': [ { 'absolute_path': '/home/philip/.cache/bazel/_bazel_philip/fd9fea5ad581ea59473dc1f9d6bce826/execroot/myproject/bazel-out/k8-fastbuild/bin/something/and/bazel-out/k8-fastbuild/bin/other/integrate',
+                                                                      'path': '$ORIGIN/../../../../../../bazel-out/k8-fastbuild/bin/other/integrate'}]}}}
+```
+
+… which shows exactly the path that is missing the dependency we
+expect. But what has gone wrong? Does this path even exist? We can
+find out!
+
+```python
+import re
+nix_matcher = re.compile("/nix/store.*")
+
+def missing_with_runpath(d):
+  missing = filter_down_to_missing(d)
+  remove_matching_runpaths(d, nix_matcher)
+  runpaths = [] if missing is {} else d['runpath_dirs']
+
+  # returns a list of runpaths that don’t exist in the filesystem
+  doesnt_exist = non_existing_runpaths(d)
+  # ^^^
+
+  return dict_remove_empty({
+    'rpth': runpaths,
+    'miss': missing,
+    'doesnt_exist': doesnt_exist,
+  })
+```
+
+I amended the output by a list of runpaths which point to non-existing
+directories:
+
+```python
+{ 'miss': { 'libfoo.so.5': { 'miss': { 'libbar.so.1': { 'miss': { 'libbaz.so.6': 'MISSING'},
+                                                        'rpth': [ { 'absolute_path': '/home/philip/.cache/bazel/_bazel_philip/fd9fea5ad581ea59473dc1f9d6bce826/execroot/myproject/bazel-out/k8-fastbuild/bin/something/and/bazel-out/k8-fastbuild/bin/other/integrate',
+                                                                    'path': '$ORIGIN/../../../../../../bazel-out/k8-fastbuild/bin/other/integrate'}]
+                                                        'doesnt_exist': [ { 'absolute_path': '/home/philip/.cache/bazel/_bazel_philip/fd9fea5ad581ea59473dc1f9d6bce826/execroot/myproject/bazel-out/k8-fastbuild/bin/something/and/bazel-out/k8-fastbuild/bin/other/integrate',
+                                                                            'path': '$ORIGIN/../../../../../../bazel-out/k8-fastbuild/bin/other/integrate'}]}}}
+```
+
+Suddenly it’s perfectly clear where the problem lies,
+`$ORIGIN/../../../../../../bazel-out/k8-fastbuild/bin/other/integrate`
+points to a path that does not exist.
+
+Any data query you’d like to do is possible, as long as it uses
+the data provided by the `ldd` function. See the lower part of
+`ldd.py` for more examples.
+
diff --git a/third_party/bazel/rules_haskell/debug/linking_utils/ldd.py b/third_party/bazel/rules_haskell/debug/linking_utils/ldd.py
new file mode 100644
index 0000000000..897cfdc713
--- /dev/null
+++ b/third_party/bazel/rules_haskell/debug/linking_utils/ldd.py
@@ -0,0 +1,288 @@
+import subprocess
+import os
+import sys
+import re
+
+
+### helper functions
+
+def list_to_dict(f, l):
+    """dict with elements of list as keys & as values transformed by f"""
+    d = {}
+    for el in l:
+        d[el] = f(el)
+    return d
+
+def dict_remove_empty(d):
+    """remove keys that have [] or {} or as values"""
+    new = {}
+    for k, v in d.items():
+        if not (v == [] or v == {}):
+             new[k] = v
+    return new
+
+def identity(x):
+    """identity function"""
+    return x
+
+def const(x):
+    """(curried) constant function"""
+    def f(y):
+        return x
+    return f
+
+def memoized(cache, f, arg):
+    """Memoizes a call to `f` with `arg` in the dict `cache`.
+    Modifies the cache dict in place."""
+    res = cache.get(arg)
+    if arg in cache:
+        return cache[arg]
+    else:
+        res = f(arg)
+        cache[arg] = res
+        return res
+
+### IO functions that find elf dependencies
+
+_field_matcher = re.compile(b"  ([A-Z0-9_]+) +(.*)$")
+
+def read_dynamic_fields(elf_path):
+    """Read the dynamic header fields from an elf binary
+
+    Args:
+      elf_path: path to the elf binary (either absolute or relative to pwd)
+
+    Returns:
+      a list [(field_key, field_value)] where field_keys could appear multiple
+      times (for example there's usually more than one NEEDED field).
+    """
+    res = subprocess.check_output([
+        # force locale to C for stable output
+        "env", "LC_ALL=C",
+        "objdump",
+        # specifying the section brings execution time down from 150ms to 10ms
+        "--section=.dynamic",
+        "--all-headers",
+        elf_path
+    ])
+    to_end = res.split(b"Dynamic Section:\n")[1]
+    # to first empty line
+    dyn_section = to_end[: 1 + to_end.find(b"\n\n")]
+    def read_dynamic_field(s):
+        """return (field_key, field_value)"""
+        return _field_matcher.match(s).groups()
+    return list(map(read_dynamic_field, dyn_section.splitlines(True)))
+
+def __query_dynamic_fields(df, key):
+    """takes a list of dynamic field tuples (key and value),
+    where keys can appear multiple times, and returns a list of all
+    values with the given key (in stable order)."""
+    return [v for k, v in df if k == key]
+
+def parse_runpath_dirs(elf_path, elf_dynamic_fields):
+    """Parse a RUNPATH entry from an elf header bytestring.
+
+    Returns:
+      { path: unmodified string from DT_RUNPATH
+      , absolute_path: fully normalized, absolute path to dir }
+    """
+    fields = __query_dynamic_fields(elf_dynamic_fields, b"RUNPATH")
+    if fields == []:
+        return []
+    assert len(fields) == 1
+    val = fields[0]
+    origin = os.path.dirname(elf_path)
+    return [{ 'path': path,
+              'absolute_path': os.path.abspath(path.replace("$ORIGIN", origin)) }
+            for path in val.decode().strip(":").split(":")
+            if path != ""]
+
+def parse_needed(elf_dynamic_fields):
+    """Returns the list of DT_NEEDED entries for elf"""
+    return [n.decode() for n in __query_dynamic_fields(elf_dynamic_fields, b"NEEDED")]
+
+
+### Main utility
+
+# cannot find dependency
+LDD_MISSING = "MISSING"
+# don't know how to search for dependency
+LDD_UNKNOWN = "DUNNO"
+# list of all errors for easy branching
+LDD_ERRORS = [ LDD_MISSING, LDD_UNKNOWN ]
+
+def _ldd(elf_cache, f, elf_path):
+    """Same as `ldd` (below), except for an additional `elf_cache` argument,
+    which is a dict needed for memoizing elf files that were already read.
+    This is done because the elf reading operation is quite expensive
+    and many files are referenced multiple times (e.g. glib.so)."""
+
+    def search(rdirs, elf_libname):
+        """search for elf_libname in runfile dirs
+        and return either the name or missing"""
+        res = LDD_MISSING
+        for rdir in rdirs:
+            potential_path = os.path.join(rdir['absolute_path'], elf_libname)
+            if os.path.exists(potential_path):
+                res = {
+                    'item': potential_path,
+                    'found_in': rdir,
+                }
+                break
+        return res
+
+    def recurse(search_res):
+        """Unfold the subtree of ELF dependencies for a `search` result"""
+        if search_res == LDD_MISSING:
+            return LDD_MISSING
+        else:
+            # we keep all other fields in search_res the same,
+            # just item is the one that does the recursion.
+            # This is the part that would normally be done by fmap.
+            search_res['item'] = _ldd(elf_cache, f, search_res['item'])
+            return search_res
+
+    # (GNU) ld.so resolves any symlinks before searching for dependencies
+    elf_realpath = os.path.realpath(elf_path)
+
+    # memoized uses the cache to not repeat the I/O action
+    # for the same elf files (same path)
+    dyn_fields = memoized(
+        elf_cache, read_dynamic_fields, elf_realpath
+    )
+    rdirs = parse_runpath_dirs(elf_realpath, dyn_fields)
+    all_needed = parse_needed(dyn_fields)
+
+    # if there's no runpath dirs we don't know where to search
+    if rdirs == []:
+        needed = list_to_dict(const(LDD_UNKNOWN), all_needed)
+    else:
+        needed = list_to_dict(
+            lambda name: recurse(search(rdirs, name)),
+            all_needed
+        )
+
+    result = {
+        'runpath_dirs': rdirs,
+        'needed': needed
+    }
+    # Here, f is applied to the result of the previous level of recursion
+    return f(result)
+
+
+def ldd(f, elf_path):
+    """follows DT_NEEDED ELF headers for elf by searching the through DT_RUNPATH.
+
+    DependencyInfo :
+    { needed : dict(string, union(
+        LDD_MISSING, LDD_UNKNOWN,
+        {
+            # the needed dependency
+            item : a,
+            # where the dependency was found in
+            found_in : RunpathDir
+        }))
+    # all runpath directories that were searched
+    , runpath_dirs : [ RunpathDir ] }
+
+    Args:
+        f: DependencyInfo -> a
+        modifies the results of each level
+        elf_path: path to ELF file, either absolute or relative to current working dir
+
+    Returns: a
+    """
+    elf_cache = {}
+    return _ldd(elf_cache, f, elf_path)
+
+
+### Functions to pass to ldd
+
+# Only use the current layer
+
+def remove_matching_needed(d, re_matcher_absolute_path=None, re_matcher_path=None):
+    """Destructively removes needed values from d['needed']
+    if they match the given regex matcher.
+    Doesn't remove LDD_ERRORS."""
+    def pred(v):
+        """return true if match"""
+        if v in LDD_ERRORS:
+            return False
+        found_in = v['found_in']
+        abs_match = re_matcher_absolute_path.match(found_in['absolute_path']) \
+                    if re_matcher_absolute_path else False
+        match = re_matcher_path.match(found_in['path']) \
+                    if re_matcher_path else False
+        if abs_match or match:
+            return True
+    d['needed'] = {
+        k: v for k, v in d['needed'].items()
+        if not pred(v)
+    }
+
+def remove_matching_runpaths(d, re_matcher):
+    """Destructively removes runpaths from d['runpath_dirs']
+    if they match the given regex matcher."""
+    d['runpath_dirs'] = [
+        runp for runp in d['runpath_dirs']
+        if not re_matcher.match(runp['absolute_path'])
+    ]
+    return d
+
+def non_existing_runpaths(d):
+    """Return a list of runpaths_dirs that do not exist in the file system."""
+    return [
+        runp for runp in d['runpath_dirs']
+        if not os.path.exists(runp['absolute_path'])
+    ]
+
+def unused_runpaths(d):
+    """Return a list of runpath_dirs that were not used to find NEEDED dependencies."""
+    used = set()
+    for k, v in d['needed'].items():
+        if not v in LDD_ERRORS:
+            used.add(v['found_in']['absolute_path'])
+    return [
+        u for u in d['runpath_dirs']
+        if u['absolute_path'] not in used
+    ]
+
+# Also use the results of sub-layers
+
+def collect_unused_runpaths(d):
+    """This is like `unused_runpaths`, but it creates a deduplicated list of all unused runpaths
+    for its dependencies instead of just returning them for the current layer.
+
+    Returns:
+      a dict of two fields;
+      `mine` contains the unused dependencies of the current binary under scrutiny
+      `others` contains a flat dict of all .sos with unused runpath entries and a list of them for each .so
+    """
+    used = set()
+    given = set(r['absolute_path'] for r in d['runpath_dirs'])
+    prev = {}
+    # TODO: use `unused_runpaths` here
+    for k, v in d['needed'].items():
+        if not v in LDD_ERRORS:
+            used.add(v['found_in']['absolute_path'])
+            prev[k] = v['item']
+    unused = [
+        u for u in given.difference(used)
+        # leave out nix storepaths
+        if not u.startswith("/nix/store")
+    ]
+
+    # Each layer doesn't know about their own name
+    # So we return a list of unused for this layer ('mine')
+    # and a dict of all previeous layers combined (name to list)
+    def combine_unused(deps):
+        res = {}
+        for name, dep in deps.items():
+            res.update(dep['others'])
+            res[name] = dep['mine']
+        return res
+
+    return {
+        'mine': unused,
+        'others': combine_unused(prev),
+    }
diff --git a/third_party/bazel/rules_haskell/debug/linking_utils/ldd_test.bzl b/third_party/bazel/rules_haskell/debug/linking_utils/ldd_test.bzl
new file mode 100644
index 0000000000..5872828df2
--- /dev/null
+++ b/third_party/bazel/rules_haskell/debug/linking_utils/ldd_test.bzl
@@ -0,0 +1,26 @@
+load(
+    "//:tests/inline_tests.bzl",
+    "py_inline_test",
+)
+
+#
+def ldd_test(name, elf_binary, script, current_workspace = None, tags = []):
+    """Test with imported linking_utils.ldd library.
+    The path to the `elf_binary` is passed in sys.argv[1].
+    """
+    py_inline_test(
+        name,
+        deps = ["@io_tweag_rules_haskell//debug/linking_utils"],
+        data = [elf_binary],
+        args = ["{}/$(rootpath {})".format(current_workspace, elf_binary)] if current_workspace else ["$(rootpath {})".format(elf_binary)],
+        script = """
+from io_tweag_rules_haskell.debug.linking_utils.ldd import \\
+        dict_remove_empty, identity, const, \\
+        LDD_MISSING, LDD_UNKNOWN, LDD_ERRORS, \\
+        ldd, \\
+        remove_matching_needed, remove_matching_runpaths, \\
+        non_existing_runpaths, unused_runpaths, \\
+        collect_unused_runpaths
+""" + script,
+        tags = tags,
+    )