about summary refs log tree commit diff
path: root/tools/nixery/popcount/popcount.go
blob: b83ac3ed1ad89d893f64ae6fb3f3423c75db8b3e (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
// Copyright 2022 The TVL Contributors
// SPDX-License-Identifier: Apache-2.0

// Popcount fetches popularity information for each store path in a
// given Nix channel from the upstream binary cache.
//
// It does this simply by inspecting the narinfo files, rather than
// attempting to deal with instantiation of the binary cache.
//
// This is *significantly* faster than attempting to realise the whole
// channel and then calling `nix path-info` on it.
//
// TODO(tazjin): Persist intermediate results (references for each
// store path) to speed up subsequent runs.
package main

import (
	"encoding/json"
	"fmt"
	"io"
	"io/ioutil"
	"log"
	"net/http"
	"os"
	"os/exec"
	"regexp"
	"strings"
)

var (
	// client is the shared HTTP client used for all downloads that
	// are allowed to follow redirects.
	client http.Client

	// pathexp captures the hash and the name of a full store path.
	pathexp = regexp.MustCompile("/nix/store/([a-z0-9]{32})-(.*)$")

	// refsexp captures the value of the `References:` line of a
	// narinfo file.
	refsexp = regexp.MustCompile("(?m:^References: (.*)$)")

	// refexp captures the hash and the name of a single entry in a
	// narinfo references list.
	refexp = regexp.MustCompile("^([a-z0-9]{32})-(.*)$")
)

// meta describes a Nix channel after it has been resolved to a
// concrete release.
type meta struct {
	name   string // channel name as given by the user
	url    string // location the channel URL redirects to
	commit string // contents of the release's git-revision file
}

// item identifies a single store path by its two components, as
// extracted by pathexp.
type item struct {
	name string // part of the store path following the hash and dash
	hash string // 32-character hash part of the store path
}

// failOn terminates the program with the given message (followed by
// the error itself) if err is set. It does nothing for a nil error.
func failOn(err error, msg string) {
	if err == nil {
		return
	}

	log.Fatalf("%s: %s", msg, err)
}

// channelMetadata resolves the given Nix channel to the release it
// currently points at.
//
// It requests the channel URL without following redirects, takes the
// redirect target as the release URL and then downloads the
// `git-revision` file of that release to determine the commit.
//
// Any failure (request error, unexpected status code, missing
// redirect location) terminates the program.
func channelMetadata(channel string) meta {
	// This needs an HTTP client that does not follow redirects
	// because the channel URL is used explicitly for other
	// downloads.
	c := http.Client{
		CheckRedirect: func(req *http.Request, via []*http.Request) error {
			return http.ErrUseLastResponse
		},
	}

	resp, err := c.Get(fmt.Sprintf("https://channels.nixos.org/%s", channel))
	failOn(err, "failed to retrieve channel metadata")
	// Only the headers are of interest, but the body still has to be
	// closed to release the underlying connection.
	defer resp.Body.Close()

	// The status is checked before the location is read, so that a
	// non-redirect response is reported with its actual status rather
	// than as a missing Location header.
	//
	// TODO(tazjin): These redirects are currently served as 301s, but
	// should (and used to) be 302s. Check if/when this is fixed and
	// update accordingly.
	if !(resp.StatusCode == http.StatusMovedPermanently || resp.StatusCode == http.StatusFound) {
		log.Fatalf("Expected redirect for channel, but received '%s'\n", resp.Status)
	}

	loc, err := resp.Location()
	failOn(err, "no redirect location given for channel")

	commitResp, err := c.Get(fmt.Sprintf("%s/git-revision", loc.String()))
	failOn(err, "failed to retrieve commit for channel")

	defer commitResp.Body.Close()
	commit, err := ioutil.ReadAll(commitResp.Body)
	failOn(err, "failed to read commit from response")
	if commitResp.StatusCode != http.StatusOK {
		log.Fatalf("non-success status code when fetching commit: %s (%v)", string(commit), commitResp.StatusCode)
	}

	return meta{
		name: channel,
		url:  loc.String(),
		// The commit ends up in the output file name, so surrounding
		// whitespace (such as a trailing newline) is removed.
		commit: strings.TrimSpace(string(commit)),
	}
}

// downloadStorePaths downloads the `store-paths.xz` file of the given
// channel release and returns the store paths listed in it, one
// element per line.
//
// Decompression is delegated to the external `xzcat` program, which
// must be available in $PATH. Any failure terminates the program.
func downloadStorePaths(c *meta) []string {
	resp, err := client.Get(fmt.Sprintf("%s/store-paths.xz", c.url))
	failOn(err, "failed to download store-paths.xz")
	defer resp.Body.Close()

	// Do not feed an error page into xzcat.
	if resp.StatusCode != http.StatusOK {
		log.Fatalf("non-success status code when fetching store-paths.xz: %s\n", resp.Status)
	}

	cmd := exec.Command("xzcat")
	stdin, err := cmd.StdinPipe()
	failOn(err, "failed to open xzcat stdin")
	stdout, err := cmd.StdoutPipe()
	failOn(err, "failed to open xzcat stdout")

	err = cmd.Start()
	failOn(err, "failed to start xzcat")

	// Stream the compressed download into xzcat while its output is
	// being read below. The result of the copy is handed back so that
	// a broken download does not go unnoticed.
	copyResult := make(chan error, 1)
	go func() {
		defer stdin.Close()
		_, copyErr := io.Copy(stdin, resp.Body)
		copyResult <- copyErr
	}()

	paths, err := ioutil.ReadAll(stdout)
	failOn(err, "failed to read uncompressed store paths")

	// Wait for the copying goroutine before calling Wait, which
	// closes the pipes. The exit status of xzcat is reported first as
	// it is usually the more informative error.
	copyErr := <-copyResult
	err = cmd.Wait()
	failOn(err, "xzcat failed to decompress")
	failOn(copyErr, "failed to stream store-paths.xz to xzcat")

	// Trailing newlines are dropped so that they do not show up as
	// empty elements in the result.
	return strings.Split(strings.TrimRight(string(paths), "\n"), "\n")
}

// storePathToItem splits a store path into its hash and name using
// pathexp. It returns nil for anything that does not look like a
// store path.
func storePathToItem(path string) *item {
	if m := pathexp.FindStringSubmatch(path); len(m) == 3 {
		return &item{name: m[2], hash: m[1]}
	}

	return nil
}

// narInfoToRefs extracts the names (without hashes) of all store
// paths listed in the `References:` line of the given narinfo.
//
// A narinfo without a `References:` line is considered unparseable
// and terminates the program. Entries of the list that do not have
// the shape `<hash>-<name>` are logged and skipped. The result never
// contains empty strings.
func narInfoToRefs(narinfo string) []string {
	match := refsexp.FindStringSubmatch(narinfo)
	if match == nil {
		// There is no match to print here; indexing into the
		// (empty) result would panic instead of reporting the
		// offending narinfo.
		log.Fatalf("failed to parse narinfo:\n%s\n", narinfo)
	}

	// match[1] is the captured list, which is empty for store paths
	// without references.
	entries := strings.Fields(match[1])
	refs := make([]string, 0, len(entries))
	for _, entry := range entries {
		res := refexp.FindStringSubmatch(entry)
		if res == nil {
			log.Printf("skipping malformed reference %q in narinfo\n", entry)
			continue
		}

		refs = append(refs, res[2])
	}

	return refs
}

// fetchNarInfo returns the narinfo of the given store path.
//
// The narinfo is read from the local `popcache` directory if a file
// for the hash exists there, otherwise it is downloaded from
// cache.nixos.org and stored in that directory for subsequent runs.
//
// An error is returned if the request fails, if the cache answers
// with anything other than 200, or if the response can not be read
// completely. Nothing is written to the local cache in those cases.
func fetchNarInfo(i *item) (string, error) {
	cachePath := "popcache/" + i.hash

	file, err := ioutil.ReadFile(cachePath)
	if err == nil {
		return string(file), nil
	}

	resp, err := client.Get(fmt.Sprintf("https://cache.nixos.org/%s.narinfo", i.hash))
	if err != nil {
		return "", err
	}

	defer resp.Body.Close()

	// Error pages (e.g. for paths missing from the cache) are not
	// narinfos and must neither be cached nor handed to the parser.
	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("unexpected status %q for narinfo %s", resp.Status, i.hash)
	}

	narinfo, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		// A partially read narinfo must not poison the cache.
		return "", err
	}

	// best-effort write the file to the cache; a failure only costs
	// a repeated download on the next run.
	_ = ioutil.WriteFile(cachePath, narinfo, 0644)

	return string(narinfo), nil
}

// downloader is a worker that fetches the narinfo of every item it
// receives from the queue and forwards it to the narinfos channel.
// Items whose narinfo can not be fetched are logged and dropped.
//
// Once the queue is closed and drained, the worker announces its
// exit on the downloaders channel and returns.
func downloader(queue chan *item, narinfos chan string, downloaders chan struct{}) {
	for {
		next, more := <-queue
		if !more {
			break
		}

		info, err := fetchNarInfo(next)
		if err == nil {
			narinfos <- info
			continue
		}

		log.Printf("couldn't fetch narinfo for %s: %s\n", next.name, err)
	}

	downloaders <- struct{}{}
}

// finaliser waits for exit notifications on the downloaders channel.
// Once the count-th notification has arrived it closes both the
// downloaders and the narinfos channel, which tells the consumer of
// narinfos that no more elements will arrive.
func finaliser(count int, downloaders chan struct{}, narinfos chan string) {
	remaining := count

	for {
		if _, open := <-downloaders; !open {
			return
		}

		remaining--
		if remaining != 0 {
			continue
		}

		close(downloaders)
		close(narinfos)
		return
	}
}

// main computes the popularity of all store paths of the Nix channel
// given as the first command-line argument.
//
// It resolves the channel, downloads the list of its store paths,
// fetches the narinfo of each path using a fixed number of concurrent
// downloaders and counts how often each name is referenced. The
// result is written as JSON to `popularity-<channel>-<commit>.json`
// in the current directory; narinfos are cached in `./popcache`.
func main() {
	if len(os.Args) < 2 {
		log.Fatalf("Nix channel must be specified as first argument")
	}

	err := os.MkdirAll("popcache", 0755)
	if err != nil {
		log.Fatalf("Failed to create 'popcache' directory in current folder: %s\n", err)
	}

	count := 42 // concurrent downloader count
	channel := os.Args[1]
	log.Printf("Fetching metadata for channel '%s'\n", channel)

	chanMeta := channelMetadata(channel)
	log.Printf("Pinned channel '%s' to commit '%s'\n", chanMeta.name, chanMeta.commit)

	paths := downloadStorePaths(&chanMeta)
	log.Printf("Fetching references for %d store paths\n", len(paths))

	// Download paths concurrently and receive their narinfos into
	// a channel. Data is collated centrally into a map and
	// serialised at the /very/ end.
	//
	// The queue is large enough to hold every path, so it can be
	// filled and closed before any downloader is started.
	downloadQueue := make(chan *item, len(paths))
	for _, p := range paths {
		if i := storePathToItem(p); i != nil {
			downloadQueue <- i
		}
	}
	close(downloadQueue)

	// Set up a task tracking channel for parsing & counting
	// narinfos, as well as a coordination channel for signaling
	// that all downloads have finished
	narinfos := make(chan string, 50)
	downloaders := make(chan struct{}, count)
	for i := 0; i < count; i++ {
		go downloader(downloadQueue, narinfos, downloaders)
	}

	go finaliser(count, downloaders, narinfos)

	// This loop ends once the finaliser has closed the narinfos
	// channel.
	counts := make(map[string]int)
	for ni := range narinfos {
		for _, ref := range narInfoToRefs(ni) {
			if ref == "" {
				continue
			}

			counts[ref]++
		}
	}

	// Remove all entries that were counted exactly once. These are
	// treated as self-references (i.e. packages not referenced by
	// anyone else).
	for k, v := range counts {
		if v == 1 {
			delete(counts, k)
		}
	}

	encoded, err := json.Marshal(counts)
	if err != nil {
		log.Fatalf("Failed to serialise popularity counts: %s\n", err)
	}

	outfile := fmt.Sprintf("popularity-%s-%s.json", chanMeta.name, chanMeta.commit)
	err = ioutil.WriteFile(outfile, encoded, 0644)
	if err != nil {
		log.Fatalf("Failed to write output to '%s': %s\n", outfile, err)
	}

	log.Printf("Wrote output to '%s'\n", outfile)
}