about summary refs log tree commit diff
path: root/tools/nixery/popcount
diff options
context:
space:
mode:
authorVincent Ambo <tazjin@google.com>2019-10-31T17·29+0000
committerVincent Ambo <github@tazj.in>2019-11-03T01·33+0000
commitb03f7a1b4dff4780a8eaeb5c261598d422551220 (patch)
tree4030b6d0c4e1bb053d053d19d958a91f9c19711c /tools/nixery/popcount
parent2d4a3ea307350e1f7495a4fb6c6f5a37d10d3912 (diff)
feat(popcount): Add new narinfo-based popcount implementation
Adds an implementation of popcount that, instead of realising
derivations locally, just queries the cache's narinfo files.

The downside of this is that calculating popularity for arbitrary Nix
package sets is not possible with this implementation. The upside is
that calculating the popularity for an entire Nix channel can now be
done in ~10 seconds[0].

This fixes #65.

[0]: Assuming a /fast/ internet connection.
Diffstat (limited to 'tools/nixery/popcount')
-rw-r--r--tools/nixery/popcount/empty.json1
-rwxr-xr-xtools/nixery/popcount/popcount13
-rw-r--r--tools/nixery/popcount/popcount.go256
-rw-r--r--tools/nixery/popcount/popcount.nix53
4 files changed, 256 insertions, 67 deletions
diff --git a/tools/nixery/popcount/empty.json b/tools/nixery/popcount/empty.json
deleted file mode 100644
index fe51488c70..0000000000
--- a/tools/nixery/popcount/empty.json
+++ /dev/null
@@ -1 +0,0 @@
-[]
diff --git a/tools/nixery/popcount/popcount b/tools/nixery/popcount/popcount
deleted file mode 100755
index 83baf3045d..0000000000
--- a/tools/nixery/popcount/popcount
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-set -ueo pipefail
-
-function graphsFor() {
-  local pkg="${1}"
-  local graphs=$(nix-build --timeout 2 --argstr target "${pkg}" popcount.nix || echo -n 'empty.json')
-  cat $graphs | jq -r -cM '.[] | .references[]'
-}
-
-for pkg in $(cat all-top-level.json | jq -r '.[]'); do
-  graphsFor "${pkg}" 2>/dev/null
-  echo "Printed refs for ${pkg}" >&2
-done
diff --git a/tools/nixery/popcount/popcount.go b/tools/nixery/popcount/popcount.go
new file mode 100644
index 0000000000..a37408e37f
--- /dev/null
+++ b/tools/nixery/popcount/popcount.go
@@ -0,0 +1,256 @@
+// Popcount fetches popularity information for each store path in a
+// given Nix channel from the upstream binary cache.
+//
+// It does this simply by inspecting the narinfo files, rather than
+// attempting to deal with instantiation of the binary cache.
+//
+// This is *significantly* faster than attempting to realise the whole
+// channel and then calling `nix path-info` on it.
+//
+// TODO(tazjin): Persist intermediate results (references for each
+// store path) to speed up subsequent runs.
+package main
+
+import (
+	"encoding/json"
+	"fmt"
+	"io"
+	"io/ioutil"
+	"log"
+	"net/http"
+	"os"
+	"os/exec"
+	"regexp"
+	"strings"
+)
+
+var client http.Client
+var pathexp = regexp.MustCompile("/nix/store/([a-z0-9]{32})-(.*)$")
+var refsexp = regexp.MustCompile("(?m:^References: (.*)$)")
+var refexp = regexp.MustCompile("^([a-z0-9]{32})-(.*)$")
+
+type meta struct {
+	name   string
+	url    string
+	commit string
+}
+
+type item struct {
+	name string
+	hash string
+}
+
+func failOn(err error, msg string) {
+	if err != nil {
+		log.Fatalf("%s: %s", msg, err)
+	}
+}
+
+func channelMetadata(channel string) meta {
+	// This needs an HTTP client that does not follow redirects
+	// because the channel URL is used explicitly for other
+	// downloads.
+	c := http.Client{
+		CheckRedirect: func(req *http.Request, via []*http.Request) error {
+			return http.ErrUseLastResponse
+		},
+	}
+
+	resp, err := c.Get(fmt.Sprintf("https://nixos.org/channels/%s", channel))
+	failOn(err, "failed to retrieve channel metadata")
+
+	loc, err := resp.Location()
+	failOn(err, "no redirect location given for channel")
+	if resp.StatusCode != 302 {
+		log.Fatalf("Expected redirect for channel, but received '%s'\n", resp.Status)
+	}
+
+	commitResp, err := c.Get(fmt.Sprintf("%s/git-revision", loc.String()))
+	failOn(err, "failed to retrieve commit for channel")
+
+	defer commitResp.Body.Close()
+	commit, err := ioutil.ReadAll(commitResp.Body)
+	failOn(err, "failed to read commit from response")
+
+	return meta{
+		name:   channel,
+		url:    loc.String(),
+		commit: string(commit),
+	}
+}
+
+func downloadStorePaths(c *meta) []string {
+	resp, err := client.Get(fmt.Sprintf("%s/store-paths.xz", c.url))
+	failOn(err, "failed to download store-paths.xz")
+	defer resp.Body.Close()
+
+	cmd := exec.Command("xzcat")
+	stdin, err := cmd.StdinPipe()
+	failOn(err, "failed to open xzcat stdin")
+	stdout, err := cmd.StdoutPipe()
+	failOn(err, "failed to open xzcat stdout")
+	defer stdout.Close()
+
+	go func() {
+		defer stdin.Close()
+		io.Copy(stdin, resp.Body)
+	}()
+
+	err = cmd.Start()
+	failOn(err, "failed to start xzcat")
+
+	paths, err := ioutil.ReadAll(stdout)
+	failOn(err, "failed to read uncompressed store paths")
+
+	err = cmd.Wait()
+	failOn(err, "xzcat failed to decompress")
+
+	return strings.Split(string(paths), "\n")
+}
+
+func storePathToItem(path string) *item {
+	res := pathexp.FindStringSubmatch(path)
+	if len(res) != 3 {
+		return nil
+	}
+
+	return &item{
+		hash: res[1],
+		name: res[2],
+	}
+}
+
+func narInfoToRefs(narinfo string) []string {
+	all := refsexp.FindAllStringSubmatch(narinfo, 1)
+
+	if len(all) != 1 {
+		log.Fatalf("failed to parse narinfo:\n%s\nfound: %v\n", narinfo, all[0])
+	}
+
+	if len(all[0]) != 2 {
+		// no references found
+		return []string{}
+	}
+
+	refs := strings.Split(all[0][1], " ")
+	for i, s := range refs {
+		if s == "" {
+			continue
+		}
+
+		res := refexp.FindStringSubmatch(s)
+		refs[i] = res[2]
+	}
+
+	return refs
+}
+
+func fetchNarInfo(i *item) (string, error) {
+	resp, err := client.Get(fmt.Sprintf("https://cache.nixos.org/%s.narinfo", i.hash))
+	if err != nil {
+		return "", err
+	}
+
+	defer resp.Body.Close()
+
+	narinfo, err := ioutil.ReadAll(resp.Body)
+	return string(narinfo), err
+}
+
+// downloader starts a worker that takes care of downloading narinfos
+// for all paths received from the queue.
+//
+// If there is no data remaining in the queue, the downloader exits
+// and informs the finaliser queue about having exited.
+func downloader(queue chan *item, narinfos chan string, downloaders chan struct{}) {
+	for i := range queue {
+		ni, err := fetchNarInfo(i)
+		if err != nil {
+			log.Printf("couldn't fetch narinfo for %s: %s\n", i.name, err)
+			continue
+
+		}
+		narinfos <- ni
+	}
+	downloaders <- struct{}{}
+}
+
+// finaliser counts the number of downloaders that have exited and
+// closes the narinfos queue to signal to the counters that no more
+// elements will arrive.
+func finaliser(count int, downloaders chan struct{}, narinfos chan string) {
+	for range downloaders {
+		count--
+		if count == 0 {
+			close(downloaders)
+			close(narinfos)
+			break
+		}
+	}
+}
+
+func main() {
+	if len(os.Args) == 1 {
+		log.Fatalf("Nix channel must be specified as first argument")
+	}
+
+	count := 42 // concurrent downloader count
+	channel := os.Args[1]
+	log.Printf("Fetching metadata for channel '%s'\n", channel)
+
+	meta := channelMetadata(channel)
+	log.Printf("Pinned channel '%s' to commit '%s'\n", meta.name, meta.commit)
+
+	paths := downloadStorePaths(&meta)
+	log.Printf("Fetching references for %d store paths\n", len(paths))
+
+	// Download paths concurrently and receive their narinfos into
+	// a channel. Data is collated centrally into a map and
+	// serialised at the /very/ end.
+	downloadQueue := make(chan *item, len(paths))
+	for _, p := range paths {
+		if i := storePathToItem(p); i != nil {
+			downloadQueue <- i
+		}
+	}
+	close(downloadQueue)
+
+	// Set up a task tracking channel for parsing & counting
+	// narinfos, as well as a coordination channel for signaling
+	// that all downloads have finished
+	narinfos := make(chan string, 50)
+	downloaders := make(chan struct{}, count)
+	for i := 0; i < count; i++ {
+		go downloader(downloadQueue, narinfos, downloaders)
+	}
+
+	go finaliser(count, downloaders, narinfos)
+
+	counts := make(map[string]int)
+	for ni := range narinfos {
+		refs := narInfoToRefs(ni)
+		for _, ref := range refs {
+			if ref == "" {
+				continue
+			}
+
+			counts[ref] += 1
+		}
+	}
+
+	// Remove all self-references (i.e. packages not referenced by anyone else)
+	for k, v := range counts {
+		if v == 1 {
+			delete(counts, k)
+		}
+	}
+
+	bytes, _ := json.Marshal(counts)
+	outfile := fmt.Sprintf("popularity-%s-%s.json", meta.name, meta.commit)
+	err = ioutil.WriteFile(outfile, bytes, 0644)
+	if err != nil {
+		log.Fatalf("Failed to write output to '%s': %s\n", outfile, err)
+	}
+
+	log.Printf("Wrote output to '%s'\n", outfile)
+}
diff --git a/tools/nixery/popcount/popcount.nix b/tools/nixery/popcount/popcount.nix
deleted file mode 100644
index 54fd2ad589..0000000000
--- a/tools/nixery/popcount/popcount.nix
+++ /dev/null
@@ -1,53 +0,0 @@
-# Copyright 2019 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# This script, given a target attribute in `nixpkgs`, builds the
-# target derivations' runtime closure and returns its reference graph.
-#
-# This is invoked by popcount.sh for each package in nixpkgs to
-# collect all package references, so that package popularity can be
-# tracked.
-#
-# Check out build-image/group-layers.go for an in-depth explanation of
-# what the popularity counts are used for.
-
-{ pkgs ? import <nixpkgs> { config.allowUnfree = false; }, target }:
-
-let
-  inherit (pkgs) coreutils runCommand writeText;
-  inherit (builtins) readFile toFile fromJSON toJSON listToAttrs;
-
-  # graphJSON abuses feature in Nix that makes structured runtime
-  # closure information available to builders. This data is imported
-  # back via IFD to process it for layering data.
-  graphJSON = path:
-    runCommand "build-graph" {
-      __structuredAttrs = true;
-      exportReferencesGraph.graph = path;
-      PATH = "${coreutils}/bin";
-      builder = toFile "builder" ''
-        . .attrs.sh
-        cat .attrs.json > ''${outputs[out]}
-      '';
-    } "";
-
-  buildClosures = paths: (fromJSON (readFile (graphJSON paths)));
-
-  buildGraph = paths:
-    listToAttrs (map (c: {
-      name = c.path;
-      value = { inherit (c) closureSize references; };
-    }) (buildClosures paths));
-in writeText "${target}-graph"
-(toJSON (buildClosures [ pkgs."${target}" ]).graph)