From 6e946c8e72e0fd2baa58cab484a34debd2ae906d Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Sat, 28 Mar 2009 20:51:33 +0000 Subject: * Scan for references and compute the SHA-256 hash of the output in one pass. This halves the amount of I/O. --- src/libstore/build.cc | 20 ++++---- src/libstore/references.cc | 115 +++++++++++++++------------------------------ src/libstore/references.hh | 4 +- 3 files changed, 50 insertions(+), 89 deletions(-) diff --git a/src/libstore/build.cc b/src/libstore/build.cc index 12061bee7813..6c24a2746dd4 100644 --- a/src/libstore/build.cc +++ b/src/libstore/build.cc @@ -1864,15 +1864,17 @@ void DerivationGoal::computeClosure() /* Get rid of all weird permissions. */ canonicalisePathMetaData(path); - /* For this output path, find the references to other paths contained - in it. */ - PathSet references = scanForReferences(path, allPaths); + /* For this output path, find the references to other paths + contained in it. Compute the SHA-256 NAR hash at the same + time. The hash is stored in the database so that we can + verify later on whether nobody has messed with the store. */ + Hash hash; + PathSet references = scanForReferences(path, allPaths, hash); + contentHashes[path] = hash; /* For debugging, print out the referenced and unreferenced paths. */ - for (PathSet::iterator i = inputPaths.begin(); - i != inputPaths.end(); ++i) - { + foreach (PathSet::iterator, i, inputPaths) { PathSet::iterator j = references.find(*i); if (j == references.end()) debug(format("unreferenced input: `%1%'") % *i); @@ -1892,12 +1894,6 @@ void DerivationGoal::computeClosure() if (allowed.find(*i) == allowed.end()) throw BuildError(format("output is not allowed to refer to path `%1%'") % *i); } - - /* Hash the contents of the path. The hash is stored in the - database so that we can verify later on whether nobody has - messed with the store. !!! inefficient: it would be nice - if we could combine this with filterReferences(). 
*/ - contentHashes[path] = hashPath(htSHA256, path); } /* Register each output path as valid, and register the sets of diff --git a/src/libstore/references.cc b/src/libstore/references.cc index bfb4f8d0a3af..68d1cbfbfbc2 100644 --- a/src/libstore/references.cc +++ b/src/libstore/references.cc @@ -1,18 +1,10 @@ #include "references.hh" #include "hash.hh" #include "util.hh" +#include "archive.hh" -#include -#include -#include #include -#include -#include -#include -#include -#include - namespace nix { @@ -20,8 +12,8 @@ namespace nix { static unsigned int refLength = 32; /* characters */ -static void search(size_t len, const unsigned char * s, - StringSet & ids, StringSet & seen) +static void search(const unsigned char * s, unsigned int len, + StringSet & hashes, StringSet & seen) { static bool initialised = false; static bool isBase32[256]; @@ -43,93 +35,60 @@ static void search(size_t len, const unsigned char * s, } if (!match) continue; string ref((const char *) s + i, refLength); - if (ids.find(ref) != ids.end()) { + if (hashes.find(ref) != hashes.end()) { debug(format("found reference to `%1%' at offset `%2%'") % ref % i); seen.insert(ref); - ids.erase(ref); + hashes.erase(ref); } ++i; } } -void checkPath(const string & path, - StringSet & ids, StringSet & seen) +struct RefScanSink : Sink { - checkInterrupt(); - - debug(format("checking `%1%'") % path); + HashSink hashSink; + StringSet hashes; + StringSet seen; - struct stat st; - if (lstat(path.c_str(), &st)) - throw SysError(format("getting attributes of path `%1%'") % path); + string tail; - if (S_ISDIR(st.st_mode)) { - Strings names = readDirectory(path); - for (Strings::iterator i = names.begin(); i != names.end(); i++) { - search(i->size(), (const unsigned char *) i->c_str(), ids, seen); - checkPath(path + "/" + *i, ids, seen); - } - } + RefScanSink() : hashSink(htSHA256) { } + + void operator () (const unsigned char * data, unsigned int len); +}; - else if (S_ISREG(st.st_mode)) { - - AutoCloseFD fd = 
open(path.c_str(), O_RDONLY); - if (fd == -1) throw SysError(format("opening file `%1%'") % path); - size_t bufSize = 1024 * 1024; - assert(refLength <= bufSize); - unsigned char * buf = new unsigned char[bufSize]; +void RefScanSink::operator () (const unsigned char * data, unsigned int len) +{ + hashSink(data, len); - size_t left = st.st_size; - bool firstBlock = true; - - while (left > 0) { - checkInterrupt(); - - size_t read = left > bufSize ? bufSize : left; - size_t copiedBytes = 0; - - if (!firstBlock) { - /* Move the last (refLength - 1) bytes from the last - block to the start of the buffer to deal with - references that cross block boundaries. */ - copiedBytes = refLength - 1; - if (read + copiedBytes > bufSize) - read -= copiedBytes; - memcpy(buf, buf + (bufSize - copiedBytes), copiedBytes); - } - firstBlock = false; + /* It's possible that a reference spans the previous and current + fragment, so search in the concatenation of the tail of the + previous fragment and the start of the current fragment. */ + string s = tail + string((const char *) data, len > refLength ? refLength : len); + search((const unsigned char *) s.c_str(), s.size(), hashes, seen); - readFull(fd, buf + copiedBytes, read); - left -= read; + search(data, len, hashes, seen); - search(copiedBytes + read, buf, ids, seen); - } - - delete[] buf; /* !!! autodelete */ - } - - else if (S_ISLNK(st.st_mode)) { - string target = readLink(path); - search(target.size(), (const unsigned char *) target.c_str(), ids, seen); - } - - else throw Error(format("unknown file type: %1%") % path); + unsigned int tailLen = len <= refLength ? len : refLength; + tail = + string(tail, tail.size() < refLength - tailLen ? 
0 : tail.size() - (refLength - tailLen)) + string((const char *) data + len - tailLen, tailLen); } -PathSet scanForReferences(const string & path, const PathSet & paths) +PathSet scanForReferences(const string & path, + const PathSet & refs, Hash & hash) { + RefScanSink sink; std::map<string, Path> backMap; - StringSet ids; - StringSet seen; /* For efficiency (and a higher hit rate), just search for the hash part of the file name. (This assumes that all references have the form `HASH-bla'). */ - for (PathSet::const_iterator i = paths.begin(); i != paths.end(); i++) { + foreach (PathSet::const_iterator, i, refs) { string baseName = baseNameOf(*i); string::size_type pos = baseName.find('-'); if (pos == string::npos) @@ -138,21 +97,25 @@ PathSet scanForReferences(const string & path, const PathSet & paths) assert(s.size() == refLength); assert(backMap.find(s) == backMap.end()); // parseHash(htSHA256, s); - ids.insert(s); + sink.hashes.insert(s); backMap[s] = *i; } - checkPath(path, ids, seen); + /* Look for the hashes in the NAR dump of the path. */ + dumpPath(path, sink); + /* Map the hashes found back to their store paths. */ PathSet found; - for (StringSet::iterator i = seen.begin(); i != seen.end(); i++) { + foreach (StringSet::iterator, i, sink.seen) { std::map<string, Path>::iterator j; if ((j = backMap.find(*i)) == backMap.end()) abort(); found.insert(j->second); } + hash = sink.hashSink.finish(); + return found; } - + } diff --git a/src/libstore/references.hh b/src/libstore/references.hh index 76a7ee166dbe..7d068eb51700 100644 --- a/src/libstore/references.hh +++ b/src/libstore/references.hh @@ -2,10 +2,12 @@ #define __REFERENCES_H #include "types.hh" +#include "hash.hh" namespace nix { -PathSet scanForReferences(const Path & path, const PathSet & refs); +PathSet scanForReferences(const Path & path, const PathSet & refs, + Hash & hash); } -- cgit 1.4.1