From b03f7a1b4dff4780a8eaeb5c261598d422551220 Mon Sep 17 00:00:00 2001 From: Vincent Ambo Date: Thu, 31 Oct 2019 17:29:59 +0000 Subject: feat(popcount): Add new narinfo-based popcount implementation Adds an implementation of popcount that, instead of realising derivations locally, just queries the cache's narinfo files. The downside of this is that calculating popularity for arbitrary Nix package sets is not possible with this implementation. The upside is that calculating the popularity for an entire Nix channel can now be done in ~10 seconds[0]. This fixes #65. [0]: Assuming a /fast/ internet connection. --- tools/nixery/popcount/empty.json | 1 - tools/nixery/popcount/popcount | 13 -- tools/nixery/popcount/popcount.go | 256 +++++++++++++++++++++++++++++++++++++ tools/nixery/popcount/popcount.nix | 53 -------- 4 files changed, 256 insertions(+), 67 deletions(-) delete mode 100644 tools/nixery/popcount/empty.json delete mode 100755 tools/nixery/popcount/popcount create mode 100644 tools/nixery/popcount/popcount.go delete mode 100644 tools/nixery/popcount/popcount.nix (limited to 'tools/nixery') diff --git a/tools/nixery/popcount/empty.json b/tools/nixery/popcount/empty.json deleted file mode 100644 index fe51488c7066..000000000000 --- a/tools/nixery/popcount/empty.json +++ /dev/null @@ -1 +0,0 @@ -[] diff --git a/tools/nixery/popcount/popcount b/tools/nixery/popcount/popcount deleted file mode 100755 index 83baf3045da7..000000000000 --- a/tools/nixery/popcount/popcount +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -set -ueo pipefail - -function graphsFor() { - local pkg="${1}" - local graphs=$(nix-build --timeout 2 --argstr target "${pkg}" popcount.nix || echo -n 'empty.json') - cat $graphs | jq -r -cM '.[] | .references[]' -} - -for pkg in $(cat all-top-level.json | jq -r '.[]'); do - graphsFor "${pkg}" 2>/dev/null - echo "Printed refs for ${pkg}" >&2 -done diff --git a/tools/nixery/popcount/popcount.go b/tools/nixery/popcount/popcount.go new file mode 100644 index 
// NOTE: this program was recovered from a whitespace-collapsed patch. The
// remainder of the diff header that preceded it on the same line read:
//
//	000000000000..a37408e37f2c
//	--- /dev/null
//	+++ b/tools/nixery/popcount/popcount.go
//	@@ -0,0 +1,256 @@

// Popcount fetches popularity information for each store path in a
// given Nix channel from the upstream binary cache.
//
// It does this simply by inspecting the narinfo files, rather than
// attempting to deal with instantiation of the binary cache.
//
// This is *significantly* faster than attempting to realise the whole
// channel and then calling `nix path-info` on it.
//
// TODO(tazjin): Persist intermediate results (references for each
// store path) to speed up subsequent runs.
package main

import (
	"encoding/json"
	"fmt"
	"io"
	"io/ioutil"
	"log"
	"net/http"
	"os"
	"os/exec"
	"regexp"
	"strings"
)

// client is the shared HTTP client used for downloading the store path
// list and the narinfo files. Unlike the client in channelMetadata it
// uses the default redirect policy.
var client http.Client

var (
	// pathexp splits a full store path into its hash and name.
	pathexp = regexp.MustCompile("/nix/store/([a-z0-9]{32})-(.*)$")
	// refsexp extracts the value of the "References:" line of a narinfo.
	refsexp = regexp.MustCompile("(?m:^References: (.*)$)")
	// refexp splits a single reference ("<hash>-<name>") into hash and name.
	refexp = regexp.MustCompile("^([a-z0-9]{32})-(.*)$")
)

// meta describes a Nix channel pinned to the commit it currently
// points at.
type meta struct {
	name   string
	url    string
	commit string
}

// item is a single store path, split into its hash and name parts.
type item struct {
	name string
	hash string
}

// failOn terminates the program with msg and the error if err is set.
func failOn(err error, msg string) {
	if err != nil {
		log.Fatalf("%s: %s", msg, err)
	}
}

// channelMetadata resolves the given channel name to the URL it
// currently redirects to and the git commit published at that URL.
//
// Any failure terminates the program.
func channelMetadata(channel string) meta {
	// This needs an HTTP client that does not follow redirects
	// because the channel URL is used explicitly for other
	// downloads.
	c := http.Client{
		CheckRedirect: func(req *http.Request, via []*http.Request) error {
			return http.ErrUseLastResponse
		},
	}

	resp, err := c.Get(fmt.Sprintf("https://nixos.org/channels/%s", channel))
	failOn(err, "failed to retrieve channel metadata")
	// Only the headers of the redirect are of interest.
	resp.Body.Close()

	// Check the status before asking for the location, otherwise a
	// non-redirect response is reported as a missing location.
	if resp.StatusCode != http.StatusFound {
		log.Fatalf("Expected redirect for channel, but received '%s'\n", resp.Status)
	}

	loc, err := resp.Location()
	failOn(err, "no redirect location given for channel")

	commitResp, err := c.Get(fmt.Sprintf("%s/git-revision", loc.String()))
	failOn(err, "failed to retrieve commit for channel")
	defer commitResp.Body.Close()

	if commitResp.StatusCode != http.StatusOK {
		log.Fatalf("failed to retrieve commit for channel: unexpected status '%s'\n", commitResp.Status)
	}

	commit, err := ioutil.ReadAll(commitResp.Body)
	failOn(err, "failed to read commit from response")

	return meta{
		name: channel,
		url:  loc.String(),
		// The commit ends up in the output file name, so make sure
		// no stray whitespace/newline is carried along.
		commit: strings.TrimSpace(string(commit)),
	}
}

// downloadStorePaths downloads the channel's store-paths.xz,
// decompresses it by piping it through `xzcat` and returns the
// non-empty lines of the result.
//
// Any failure terminates the program.
func downloadStorePaths(c *meta) []string {
	resp, err := client.Get(fmt.Sprintf("%s/store-paths.xz", c.url))
	failOn(err, "failed to download store-paths.xz")
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		log.Fatalf("failed to download store-paths.xz: unexpected status '%s'\n", resp.Status)
	}

	// os/exec takes care of copying the response body into the
	// process and of collecting its output.
	cmd := exec.Command("xzcat")
	cmd.Stdin = resp.Body
	cmd.Stderr = os.Stderr
	out, err := cmd.Output()
	failOn(err, "xzcat failed to decompress")

	lines := strings.Split(string(out), "\n")
	paths := lines[:0]
	for _, l := range lines {
		// Skip blank lines (such as the one after the final newline).
		if l != "" {
			paths = append(paths, l)
		}
	}

	return paths
}

// storePathToItem parses a full store path into an item. It returns
// nil if the path does not look like a store path.
func storePathToItem(path string) *item {
	res := pathexp.FindStringSubmatch(path)
	if len(res) != 3 {
		return nil
	}

	return &item{
		hash: res[1],
		name: res[2],
	}
}

// narInfoToRefs returns the names (without hashes) of all references
// listed in the given narinfo.
//
// A narinfo without a "References:" line terminates the program.
// Individual references that can not be parsed are logged and skipped.
func narInfoToRefs(narinfo string) []string {
	match := refsexp.FindStringSubmatch(narinfo)
	if match == nil {
		log.Fatalf("failed to parse narinfo:\n%s\n", narinfo)
	}

	fields := strings.Fields(match[1])
	refs := make([]string, 0, len(fields))
	for _, f := range fields {
		res := refexp.FindStringSubmatch(f)
		if res == nil {
			log.Printf("skipping malformed reference %q\n", f)
			continue
		}

		refs = append(refs, res[2])
	}

	return refs
}

// fetchNarInfo downloads the narinfo file for the given item from
// cache.nixos.org and returns its content.
func fetchNarInfo(i *item) (string, error) {
	resp, err := client.Get(fmt.Sprintf("https://cache.nixos.org/%s.narinfo", i.hash))
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// Best effort: drain the body so that the connection can
		// be reused. The body of an error response is not a narinfo
		// and must not be handed to the parser.
		io.Copy(ioutil.Discard, resp.Body)
		return "", fmt.Errorf("unexpected status '%s'", resp.Status)
	}

	narinfo, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return "", err
	}

	return string(narinfo), nil
}

// downloader starts a worker that takes care of downloading narinfos
// for all paths received from the queue.
//
// If there is no data remaining in the queue, the downloader exits
// and informs the finaliser queue about having exited.
func downloader(queue chan *item, narinfos chan string, downloaders chan struct{}) {
	for i := range queue {
		ni, err := fetchNarInfo(i)
		if err != nil {
			log.Printf("couldn't fetch narinfo for %s: %s\n", i.name, err)
			continue
		}

		narinfos <- ni
	}

	downloaders <- struct{}{}
}

// finaliser waits until count downloaders have reported their exit and
// then closes the narinfos queue to signal to the consumer that no
// more elements will arrive.
func finaliser(count int, downloaders chan struct{}, narinfos chan string) {
	for ; count > 0; count-- {
		<-downloaders
	}

	close(narinfos)
}

func main() {
	if len(os.Args) < 2 {
		log.Fatalf("Nix channel must be specified as first argument")
	}

	count := 42 // concurrent downloader count
	channel := os.Args[1]
	log.Printf("Fetching metadata for channel '%s'\n", channel)

	m := channelMetadata(channel)
	log.Printf("Pinned channel '%s' to commit '%s'\n", m.name, m.commit)

	paths := downloadStorePaths(&m)
	log.Printf("Fetching references for %d store paths\n", len(paths))

	// Download paths concurrently and receive their narinfos into
	// a channel. Data is collated centrally into a map and
	// serialised at the /very/ end.
	downloadQueue := make(chan *item, len(paths))
	for _, p := range paths {
		if i := storePathToItem(p); i != nil {
			downloadQueue <- i
		}
	}
	close(downloadQueue)

	// Set up a task tracking channel for parsing & counting
	// narinfos, as well as a coordination channel for signaling
	// that all downloads have finished
	narinfos := make(chan string, 50)
	downloaders := make(chan struct{}, count)
	for i := 0; i < count; i++ {
		go downloader(downloadQueue, narinfos, downloaders)
	}

	go finaliser(count, downloaders, narinfos)

	counts := make(map[string]int)
	for ni := range narinfos {
		for _, ref := range narInfoToRefs(ni) {
			if ref == "" {
				continue
			}

			counts[ref]++
		}
	}

	// Drop every entry that was only counted once. The original
	// comment describes these as self-references, i.e. packages not
	// referenced by anyone else.
	for k, v := range counts {
		if v == 1 {
			delete(counts, k)
		}
	}

	out, err := json.Marshal(counts)
	failOn(err, "failed to serialise popularity counts")

	outfile := fmt.Sprintf("popularity-%s-%s.json", m.name, m.commit)
	if err := ioutil.WriteFile(outfile, out, 0644); err != nil {
		log.Fatalf("Failed to write output to '%s': %s\n", outfile, err)
	}

	log.Printf("Wrote output to '%s'\n", outfile)
}

// The collapsed patch continued on the same line with the start of the
// deletion of popcount.nix. That text is preserved here (re-broken into
// lines); the rest of the deleted file follows after this block.
//
// diff --git a/tools/nixery/popcount/popcount.nix b/tools/nixery/popcount/popcount.nix
// deleted file mode 100644
// index 54fd2ad589ee..000000000000
// --- a/tools/nixery/popcount/popcount.nix
// +++ /dev/null
// @@ -1,53 +0,0 @@
// -# Copyright 2019 Google LLC
// -#
// -# Licensed under the Apache License, Version 2.0 (the "License");
// -# you may not use this file except in compliance with the License.
// -# You may obtain a copy of the License at
// -#
// -# https://www.apache.org/licenses/LICENSE-2.0
// -#
// -# Unless required by applicable law or agreed to in writing, software
// -# distributed under the License is distributed on an "AS IS" BASIS,
// -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and -# limitations under the License. - -# This script, given a target attribute in `nixpkgs`, builds the -# target derivations' runtime closure and returns its reference graph. -# -# This is invoked by popcount.sh for each package in nixpkgs to -# collect all package references, so that package popularity can be -# tracked. -# -# Check out build-image/group-layers.go for an in-depth explanation of -# what the popularity counts are used for. - -{ pkgs ? import { config.allowUnfree = false; }, target }: - -let - inherit (pkgs) coreutils runCommand writeText; - inherit (builtins) readFile toFile fromJSON toJSON listToAttrs; - - # graphJSON abuses feature in Nix that makes structured runtime - # closure information available to builders. This data is imported - # back via IFD to process it for layering data. - graphJSON = path: - runCommand "build-graph" { - __structuredAttrs = true; - exportReferencesGraph.graph = path; - PATH = "${coreutils}/bin"; - builder = toFile "builder" '' - . .attrs.sh - cat .attrs.json > ''${outputs[out]} - ''; - } ""; - - buildClosures = paths: (fromJSON (readFile (graphJSON paths))); - - buildGraph = paths: - listToAttrs (map (c: { - name = c.path; - value = { inherit (c) closureSize references; }; - }) (buildClosures paths)); -in writeText "${target}-graph" -(toJSON (buildClosures [ pkgs."${target}" ]).graph) -- cgit 1.4.1