From 6edfdd0773264b058736a8fb3887050d48a06c67 Mon Sep 17 00:00:00 2001
From: Vincent Ambo <mail@tazj.in>
Date: Thu, 25 Nov 2021 12:58:22 +0300
Subject: refactor(ops/pipelines): Query build status from Buildkite API

Instead of manually tracking the build status through Buildkite
metadata, the `:duck:` build step (i.e. the one that determines the
status of the entire pipeline to be reported back to Gerrit) now
queries the Buildkite GraphQL API for the number of failed jobs.

This way we have less manual state accounting in the pipeline.

The downside is that the GraphQL query embedded here is a little hard
to read.

Notes:

  * This needs an access token for Buildkite. We already have one for
    besadii, which is also run by the agents, so I've given it GraphQL
    permissions and reused it.

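  * I almost introduced a very rare bug here: My initial intuition was
    to simply `exit $FAILED_JOBS`. Shell exit statuses are truncated
    to 8 bits, so in the extremely rare case where `$FAILED_JOBS` is a
    non-zero multiple of 256 (`$FAILED_JOBS % 256 = 0`) the step would
    exit with status 0 and we would ... fail to fail the build :)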

Change-Id: I61976b11b591d722494d3010a362b544efe2cb25
---
 ops/pipelines/depot.nix            | 28 +++++++++++++++++++++-------
 ops/pipelines/static-pipeline.yaml |  1 -
 2 files changed, 21 insertions(+), 8 deletions(-)

(limited to 'ops/pipelines')

diff --git a/ops/pipelines/depot.nix b/ops/pipelines/depot.nix
index 6d4b69d5b086..f2db69a78ff3 100644
--- a/ops/pipelines/depot.nix
+++ b/ops/pipelines/depot.nix
@@ -27,10 +27,6 @@ let
       else label;
 
   # Create a pipeline step from a single target.
-  #
-  # If the build fails, Buildkite metadata is updated to mark the
-  # pipeline as failed. Buildkite has a concept of a failed pipeline
-  # regardless, but this data is not accessible.
   mkStep = target: {
     command = let
       drvPath = builtins.unsafeDiscardStringContext target.drvPath;
@@ -45,7 +41,6 @@ let
       # To handle this case we fall back to an ordinary build if the derivation
       # file is missing.
       "|| (test ! -f '${drvPath}' && nix-build -E '${mkBuildExpr target}' --show-trace)"
-      "|| (buildkite-agent meta-data set 'failure' '1'; exit 1)"
     ];
     label = ":nix: ${mkLabel target}";
 
@@ -89,10 +84,29 @@ let
       })
 
       # Wait for all steps to complete, then exit with success or
-      # failure depending on whether any failure status was written.
+      # failure depending on whether any other steps failed.
+      #
+      # This information is checked by querying the Buildkite GraphQL
+      # API and fetching the count of failed steps.
+      #
       # This step must be :duck:! (yes, really!)
       ({
-        command = "exit $(buildkite-agent meta-data get 'failure')";
+        command = let duck = pkgs.writeShellScript "duck" ''
+          set -ueo pipefail
+
+          readonly FAILED_JOBS=$(${pkgs.curl}/bin/curl 'https://graphql.buildkite.com/v1' \
+            --silent \
+            -H "Authorization: Bearer $(cat /etc/secrets/buildkite-besadii)" \
+            -d "{\"query\": \"query BuildStatusQuery { build(uuid: \\\"$BUILDKITE_BUILD_ID\\\") { jobs(passed: false) { count } } }\"}" | \
+            ${pkgs.jq}/bin/jq -r '.data.build.jobs.count')
+
+          echo "$FAILED_JOBS build jobs failed."
+
+          if (( $FAILED_JOBS > 0 )); then
+            exit 1
+          fi
+        ''; in "${duck}";
+
         label = ":duck:";
         key = ":duck:";
       })
diff --git a/ops/pipelines/static-pipeline.yaml b/ops/pipelines/static-pipeline.yaml
index 2261b11b80d0..c864aea65714 100644
--- a/ops/pipelines/static-pipeline.yaml
+++ b/ops/pipelines/static-pipeline.yaml
@@ -14,7 +14,6 @@ steps:
       }
 
       nix-build -A ops.pipelines.depot -o depot.yaml --show-trace || fallback
-      buildkite-agent meta-data set 'failure' '0'
       buildkite-agent pipeline upload depot.yaml || fallback
 
   # Create a revision number for the current commit for builds on
-- 
cgit 1.4.1