about summary refs log tree commit diff
path: root/users/tazjin/presentations/bootstrapping-2018
diff options
context:
space:
mode:
Diffstat (limited to 'users/tazjin/presentations/bootstrapping-2018')
-rw-r--r--users/tazjin/presentations/bootstrapping-2018/README.md5
-rw-r--r--users/tazjin/presentations/bootstrapping-2018/default.nix50
-rw-r--r--users/tazjin/presentations/bootstrapping-2018/drake-meme.pngbin0 -> 246872 bytes
-rw-r--r--users/tazjin/presentations/bootstrapping-2018/nixos-logo.pngbin0 -> 90542 bytes
-rw-r--r--users/tazjin/presentations/bootstrapping-2018/notes.org89
-rw-r--r--users/tazjin/presentations/bootstrapping-2018/presentation.pdfbin0 -> 527371 bytes
-rw-r--r--users/tazjin/presentations/bootstrapping-2018/presentation.tex251
-rw-r--r--users/tazjin/presentations/bootstrapping-2018/quine-relay.pngbin0 -> 52350 bytes
-rw-r--r--users/tazjin/presentations/bootstrapping-2018/result.pdfpc142
9 files changed, 537 insertions, 0 deletions
diff --git a/users/tazjin/presentations/bootstrapping-2018/README.md b/users/tazjin/presentations/bootstrapping-2018/README.md
new file mode 100644
index 000000000000..e9573ae3f2e1
--- /dev/null
+++ b/users/tazjin/presentations/bootstrapping-2018/README.md
@@ -0,0 +1,5 @@
+These are the slides for a talk I gave at the Norwegian Unix User Group on
+2018-03-13.
+
+There is more information and a recording on the [event
+page](https://www.nuug.no/aktiviteter/20180313-reproduible-compiler/).
diff --git a/users/tazjin/presentations/bootstrapping-2018/default.nix b/users/tazjin/presentations/bootstrapping-2018/default.nix
new file mode 100644
index 000000000000..0dff14b2a1a6
--- /dev/null
+++ b/users/tazjin/presentations/bootstrapping-2018/default.nix
@@ -0,0 +1,50 @@
+# This derivation builds the LaTeX presentation.
+
+{ pkgs, ... }:
+
+with pkgs;
+
+let tex = texlive.combine {
+  inherit (texlive)
+    beamer
+    beamertheme-metropolis
+    etoolbox
+    euenc
+    extsizes
+    fontspec
+    lualibs
+    luaotfload
+    luatex
+    minted
+    ms
+    pgfopts
+    scheme-basic
+    translator;
+};
+in stdenv.mkDerivation {
+  name = "nuug-bootstrapping-slides";
+  src = ./.;
+
+  FONTCONFIG_FILE = makeFontsConf {
+    fontDirectories = [ fira fira-code fira-mono ];
+  };
+
+  buildInputs = [ tex fira fira-code fira-mono ];
+  buildPhase = ''
+    # LaTeX needs a cache folder in /home/ ...
+    mkdir home
+    export HOME=$PWD/home
+    # ${tex}/bin/luaotfload-tool -ufv
+
+    # As usual, TeX needs to be run twice ...
+    function run() {
+      ${tex}/bin/lualatex presentation.tex
+    }
+    run && run
+  '';
+
+  installPhase = ''
+    mkdir -p $out
+    cp presentation.pdf $out/
+  '';
+}
diff --git a/users/tazjin/presentations/bootstrapping-2018/drake-meme.png b/users/tazjin/presentations/bootstrapping-2018/drake-meme.png
new file mode 100644
index 000000000000..4b036754384f
--- /dev/null
+++ b/users/tazjin/presentations/bootstrapping-2018/drake-meme.png
Binary files differdiff --git a/users/tazjin/presentations/bootstrapping-2018/nixos-logo.png b/users/tazjin/presentations/bootstrapping-2018/nixos-logo.png
new file mode 100644
index 000000000000..ce0c98c2cabb
--- /dev/null
+++ b/users/tazjin/presentations/bootstrapping-2018/nixos-logo.png
Binary files differdiff --git a/users/tazjin/presentations/bootstrapping-2018/notes.org b/users/tazjin/presentations/bootstrapping-2018/notes.org
new file mode 100644
index 000000000000..363d75352e62
--- /dev/null
+++ b/users/tazjin/presentations/bootstrapping-2018/notes.org
@@ -0,0 +1,89 @@
+#+TITLE: Bootstrapping, reproducibility, etc.
+#+AUTHOR: Vincent Ambo
+#+DATE: <2018-03-10 Sat>
+
+* Compiler bootstrapping
+  This section contains notes about compiler bootstrapping, the
+  history thereof, which compilers need it - and so on:
+
+** C
+
+** Haskell
+   - self-hosted compiler (GHC)
+
+** Common Lisp
+   CL is fairly interesting in this space because it is a language
+   that is defined via an ANSI standard that compiler implementations
+   normally actually follow!
+
+   CL has several ecosystem components that focus on making
+   abstracting away implementation-specific calls and if a self-hosted
+   compiler is written in CL using those components it can be
+   cross-bootstrapped.
+
+** Python
+
+* A note on runtimes
+  Sometimes the compiler just isn't enough ...
+
+** LLVM
+** JVM
+
+* References
+  https://github.com/mame/quine-relay
+  https://manishearth.github.io/blog/2016/12/02/reflections-on-rusting-trust/
+  https://tests.reproducible-builds.org/debian/reproducible.html
+
+* Slide thoughts:
+  1. Hardware trust has been discussed here a bunch, most recently
+     during the puri.sm talk. Hardware trust is important, as we see
+     with IME, but it's striking that people often take a leap to "I'm
+     now on my trusted Debian with free software".
+
+     Unless you built it yourself from scratch (Spoiler: you haven't)
+     you're placing trust in what is basically foreign binary blobs.
+
+     Agenda: Implications/attack vectors of this, state of the chicken
+     & egg, the topic of reproducibility, what can you do? (Nix!)
+
+  2. Chicken-and-egg issue
+
+     It's an important milestone for a language to become self-hosted:
+     You begin doing a kind of dogfeeding, you begin to enforce
+     reliability & consistency guarantees to avoid having to redo your
+     own codebase constantly and so on.
+
+     However, the implication is now that you need your own compiler
+     to compile itself.
+
+     Common examples:
+     - C/C++ compilers needed to build C/C++ compilers:
+
+       GCC 4.7 was the last version of GCC that could be built with a
+       standard C-compiler, nowadays it is mostly written in C++.
+
+       Certain versions of GCC can be built with LLVM/Clang.
+
+       Clang/LLVM can be compiled by itself and also GCC.
+
+     - Rust was originally written in OCAML but moved to being
+       self-hosted in 2011. Currently rustc-releases are always built
+       with a copy of the previous release.
+
+       It's relatively new so we can build the chain all the way.
+
+     Notable exceptions: Some popular languages are not self-hosted,
+     for example Clojure. Languages also have runtimes, which may be
+     written in something else (e.g. Haskell -> C runtime)
+* How to help:
+  Most of this advice is about reproducible builds, not bootstrapping,
+  as that is a much harder project.
+
+  - fix reproducibility issues listed in Debian's issue tracker (focus
+    on non-Debian specific ones though)
+  - experiment with NixOS / GuixSD to get a better grasp on the
+    problem space of reproducibility
+
+  If you want to contribute to bootstrapping, look at
+  bootstrappable.org and their wiki. Several initiatives such as MES
+  could need help!
diff --git a/users/tazjin/presentations/bootstrapping-2018/presentation.pdf b/users/tazjin/presentations/bootstrapping-2018/presentation.pdf
new file mode 100644
index 000000000000..7f435fe5b539
--- /dev/null
+++ b/users/tazjin/presentations/bootstrapping-2018/presentation.pdf
Binary files differdiff --git a/users/tazjin/presentations/bootstrapping-2018/presentation.tex b/users/tazjin/presentations/bootstrapping-2018/presentation.tex
new file mode 100644
index 000000000000..d3aa61337554
--- /dev/null
+++ b/users/tazjin/presentations/bootstrapping-2018/presentation.tex
@@ -0,0 +1,251 @@
+\documentclass[12pt]{beamer}
+\usetheme{metropolis}
+\newenvironment{code}{\ttfamily}{\par}
+\title{Where does \textit{your} compiler come from?}
+\date{2018-03-13}
+\author{Vincent Ambo}
+\institute{Norwegian Unix User Group}
+\begin{document}
+  \maketitle
+
+  %% Slide 1:
+  \section{Introduction}
+
+  %% Slide 2:
+  \begin{frame}{Chicken and egg}
+    Self-hosted compilers are often built using themselves, for example:
+
+    \begin{itemize}
+    \item C-family compilers bootstrap themselves \& each other
+    \item (Some!) Common Lisp compilers can bootstrap each other
+    \item \texttt{rustc} bootstraps itself with a previous version
+    \item ... same for many other languages!
+    \end{itemize}
+  \end{frame}
+
+  \begin{frame}{Chicken, egg and ... lizard?}
+    It's not just compilers: Languages have runtimes, too.
+
+    \begin{itemize}
+    \item JVM is implemented in C++
+    \item Erlang-VM is C
+    \item Haskell runtime is C
+    \end{itemize}
+
+    ... we can't ever get away from C, can we?
+  \end{frame}
+
+  %% Slide 3:
+  \begin{frame}{Trusting Trust}
+    \begin{center}
+      \huge{Could this be exploited?}
+    \end{center}
+  \end{frame}
+
+  %% Slide 4:
+  \begin{frame}{Short interlude: A quine}
+    \begin{center}
+      \begin{code}
+        ((lambda (x) (list x (list 'quote x)))
+        \newline\vspace*{6mm} '(lambda (x) (list x (list 'quote x))))
+      \end{code}
+    \end{center}
+  \end{frame}
+
+  %% Slide 5:
+  \begin{frame}{Short interlude: Quine Relay}
+    \begin{center}
+      \includegraphics[
+        keepaspectratio=true,
+        height=\textheight
+      ]{quine-relay.png}
+    \end{center}
+  \end{frame}
+
+  %% Slide 6:
+  \begin{frame}{Trusting Trust}
+    An attack described by Ken Thompson in 1983:
+
+    \begin{enumerate}
+    \item Modify a compiler to detect when it's compiling itself.
+    \item Let the modification insert \textit{itself} into the new compiler.
+    \item Add arbitrary attack code to the modification.
+    \item \textit{Optional!} Remove the attack from the source after compilation.
+    \end{enumerate}
+  \end{frame}
+
+  %% Slide 7:
+  \begin{frame}{Damage potential?}
+    \begin{center}
+      \large{Let your imagination run wild!}
+    \end{center}
+  \end{frame}
+
+  %% Slide 8:
+  \section{Countermeasures}
+
+  %% Slide 9:
+  \begin{frame}{Diverse Double-Compiling}
+    Assume we have:
+
+    \begin{itemize}
+    \item Target language compilers $A$ and $T$
+    \item The source code of $A$: $ S_{A} $
+    \end{itemize}
+  \end{frame}
+
+  %% Slide 10:
+  \begin{frame}{Diverse Double-Compiling}
+    Apply the first stage (functional equivalence):
+
+    \begin{itemize}
+    \item $ X = A(S_{A})$
+    \item $ Y = T(S_{A})$
+    \end{itemize}
+
+    Apply the second stage (bit-for-bit equivalence):
+
+    \begin{itemize}
+    \item $ V = X(S_{A})$
+    \item $ W = Y(S_{A})$
+    \end{itemize}
+
+    Now we have a new problem: Reproducibility!
+  \end{frame}
+
+  %% Slide 11:
+  \begin{frame}{Reproducibility}
+    Bit-for-bit equivalent output is hard, for example:
+
+    \begin{itemize}
+    \item Timestamps in output artifacts
+    \item Non-deterministic linking order in concurrent builds
+    \item Non-deterministic VM \& memory states in outputs
+    \item Randomness in builds (sic!)
+    \end{itemize}
+  \end{frame}
+
+  \begin{frame}{Reproducibility}
+    \begin{center}
+      Without reproducibility, we can never trust that any shipped
+      binary matches the source code!
+    \end{center}
+  \end{frame}
+
+  %% Slide 12:
+  \section{(Partial) State of the Union}
+
+  \begin{frame}{The Desired State}
+    \begin{center}
+      \begin{enumerate}
+      \item Full-source bootstrap!
+      \item All packages reproducible!
+      \end{enumerate}
+    \end{center}
+  \end{frame}
+
+  %% Slide 13:
+  \begin{frame}{Bootstrapping Debian}
+    \begin{itemize}
+    \item Sparse information on the Debian-wiki
+    \item Bootstrapping discussions mostly resolve around new architectures
+    \item GCC is compiled by depending on previous versions of GCC
+    \end{itemize}
+  \end{frame}
+
+  \begin{frame}{Reproducing Debian}
+    Debian has a very active effort for reproducible builds:
+
+    \begin{itemize}
+    \item Organised information about reproducibility status
+    \item Over 90\% reproducibility in Debian package base!
+    \end{itemize}
+  \end{frame}
+
+  \begin{frame}{Short interlude: Nix}
+    \begin{center}
+      \includegraphics[
+        keepaspectratio=true,
+        height=0.7\textheight
+      ]{nixos-logo.png}
+    \end{center}
+  \end{frame}
+
+  \begin{frame}{Short interlude: Nix}
+    \begin{center}
+      \includegraphics[
+        keepaspectratio=true,
+        height=0.90\textheight
+      ]{drake-meme.png}
+    \end{center}
+  \end{frame}
+
+  \begin{frame}{Short interlude: Nix}
+    \begin{center}
+      \includegraphics[
+        keepaspectratio=true,
+        height=0.7\textheight
+      ]{nixos-logo.png}
+    \end{center}
+  \end{frame}
+
+  \begin{frame}{Bootstrapping NixOS}
+    Nix evaluation can not recurse forever: The bootstrap can not
+    simply depend on a previous GCC.
+
+    Workaround: \texttt{bootstrap-tools} tarball from a previous
+    binary cache is fetched and used.
+
+    An unfortunate magic binary blob ...
+  \end{frame}
+
+  \begin{frame}{Reproducing NixOS}
+    Not all reproducibility patches have been ported from Debian.
+
+    However: Builds are fully repeatable via the Nix fundamentals!
+  \end{frame}
+
+  \section{Future Developments}
+
+  \begin{frame}{Bootstrappable: stage0}
+    Hand-rolled ``Cthulhu's Path to Madness'' hex-programs:
+
+    \begin{itemize}
+    \item No non-auditable binary blobs
+    \item Aims for understandability by 70\% of programmers
+    \item End goal is a full-source bootstrap of GCC
+    \end{itemize}
+  \end{frame}
+
+
+  \begin{frame}{Bootstrappable: MES}
+    Bootstrapping the ``Maxwell Equations of Software'':
+
+    \begin{itemize}
+    \item Minimal C-compiler written in Scheme
+    \item Minimal Scheme-interpreter (currently in C, but intended to
+      be rewritten in stage0 macros)
+    \item End goal is full-source bootstrap of the entire GuixSD
+    \end{itemize}
+  \end{frame}
+
+  \begin{frame}{Other platforms}
+    \begin{itemize}
+    \item Nix for Darwin is actively maintained
+    \item F-Droid Android repository works towards fully reproducible
+      builds of (open) Android software
+    \item Mobile devices (phones, tablets, etc.) are a lost cause at
+      the moment
+    \end{itemize}
+  \end{frame}
+
+  \begin{frame}{Thanks!}
+    Resources:
+    \begin{itemize}
+    \item bootstrappable.org
+    \item reproducible-builds.org
+    \end{itemize}
+
+    @tazjin | mail@tazj.in
+  \end{frame}
+\end{document}
diff --git a/users/tazjin/presentations/bootstrapping-2018/quine-relay.png b/users/tazjin/presentations/bootstrapping-2018/quine-relay.png
new file mode 100644
index 000000000000..5644dc3900e3
--- /dev/null
+++ b/users/tazjin/presentations/bootstrapping-2018/quine-relay.png
Binary files differdiff --git a/users/tazjin/presentations/bootstrapping-2018/result.pdfpc b/users/tazjin/presentations/bootstrapping-2018/result.pdfpc
new file mode 100644
index 000000000000..b0fa6c9a0ef8
--- /dev/null
+++ b/users/tazjin/presentations/bootstrapping-2018/result.pdfpc
@@ -0,0 +1,142 @@
+[file]
+result
+[last_saved_slide]
+10
+[font_size]
+20000
+[notes]
+### 1
+- previous discussions of hardware trust (e.g. purism presentation)
+- people leap to "now I'm on my trusted Debian!"
+- unless you built it from scratch (spoiler: you haven't) you're *trusting* someone
+
+Agenda: Implications of trust with focus on bootstrap paths and reproducibility, plus how you can help.### 2
+self-hosting:
+- C-family: GCC pre/post 4.7, Clang
+- Common Lisp: Sunshine land! (with SBCL)
+- rustc: Bootstrap based on previous versions (C++ transpiler underway!)
+- many other languages also work this way!
+
+(Noteable counterexample: Clojure is written in Java!)### 3
+
+- compilers are just one bit, the various runtimes exist, too!### 4
+
+Could this be exploited?
+
+People don't think about where their compiler comes from.
+
+Even if they do, they may only go so far as to say "I'll just recompile it using <other compiler>".
+
+Unfortunately, spoiler alert, life isn't that easy in the computer world and yes, exploitation is possible.### 5
+
+- describe what a quine is
+- classic Lisp quine
+- explain demo quine
+- demo demo quine
+
+- this is interesting, but not useful - can quines do more than that?### 6
+
+- quine-relay: "art project" with 128-language circular quine
+
+- show source of quine-relay
+
+- (demo quine relay?)
+
+- side-note: this program is very, very trustworthy!### 7
+
+Ken Thompson (designer of UNIX and a couple other things!) received Turing award in 1983, and described attack in speech.
+
+- figure out how to detect self-compilation
+- make that modification a quine
+- insert modification into new compiler
+- add attack code to modification
+- remove attack from source, distributed binary will still be compromised! it's like evolution :)### 8
+
+damage potential is basically infinite:
+
+- classic "login" attack
+=> also applicable to other credentials
+
+- attack (weaken) crypto algorithms
+
+- you can probably think of more!### 10
+
+idea being: potential vulnerability would have to work across compilers:
+
+the more compilers we can introduce (e.g. more architectures, different versions, different compilers), the harder it gets for a vulnerability to survive all of those
+
+The more compilers, the merrier! Lisps are pretty good at this.### 11
+
+if we get a bit-mismatch after DDC, not all hope is lost: Maybe the thing just isn't reproducible!
+
+- many reasons for failures
+- timestamps are a classic! artifacts can be build logs, metadata in ZIP-files or whatever
+- non-determinism is the devil
+- sometimes people actively introduce build-randomness (NaCl)### 12
+
+- Does that binary download on the project's website really match the source?
+
+- Your Linux packages are signed by someone - cool - but what does that mean?### 13
+
+Two things should be achieved - gross oversimplification - to get to the ideal "desired state of the union":
+
+1. full-source bootstrap: without ever introducing any binaries, go from nothing to a full Linux distribution
+
+2. when packages are distributed, we should be able to know the expected output of a source package beforehand
+
+=> suddenly binary distributions become a cache! But more on Nix later.### 14
+
+- Debian project does not seem as concerned with bootstrapping as with reproducibility
+- Debian mostly bootstraps on new architectures (using cross-compilation and similar techniques, from an existing binary base)
+- core bootstrap (GCC & friends) is performed with previous Debian version and depending on GCC### 15
+
+... however! Debian cares about reproducibility.
+
+- automated testing of reproducibility
+- information about the status of all packages is made available in repos
+- Over 90% packages of packages are reproducible!
+
+< show reproducible builds website >
+
+Debian is still fundamentally a binary distribution though, but it doesn't have to be that way.### 16
+
+Nix - a purely functional package manager
+
+It's not a new project (10+ years), been discussed here before, has multiple components: package manager, language, NixOS.
+
+Instead of describing *how* to build a thing, Nix describes *what* to build:### 17
+### 19
+
+In Nix, it's impossible to say "GCC is the result of applying GCC to the GCC source", because that happens to be infinite recursion.
+
+Bootstrapping in Nix works by introducing a binary pinned by its full-hash, which was built on some previous Nix version.
+
+Unfortunately also just a magic binary blob ... ### 20
+
+NixOS is not actively porting all of Debian's reproducibility patches, but builds are fully repeatable:
+
+- introducing a malicious compiler would produce a different input hash -> different package
+
+Future slide: hope is not lost! Things are underway.### 21
+
+- bootstrappable.org (demo?) is an umbrella page for several projects working on bootstrappability
+
+- stage0 is an important piece: manually, small, auditable Hex programs to get to a Hex macro expander
+
+- end goal is a full-source bootrap, but pieces are missing### 22
+
+MES is out of the GuixSD circles (explain Guix, GNU Hurd joke)
+
+- idea being that once you have a Lisp, you have all of computing (as Alan Key said)
+
+- includes MesCC in Scheme -> can *almost* make a working tinyCC -> can *almost* make a working gcc 4.7
+
+- minimal Scheme interpreter, currently built in C to get the higher-level stuff to work, goal is rewrite in hex
+- bootstrapping Guix is the end goal### 23
+
+- userspace in Darwin has a Nix project
+- unsure about other BSDs, but if anyone knows - input welcome!
+- F-Droid has reproducible Android packages, but that's also userspace only
+- All other mobile platforms are a lost cause
+
+Generally, all closed-source software is impossible to trust.