From 6e946c8e72e0fd2baa58cab484a34debd2ae906d Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Sat, 28 Mar 2009 20:51:33 +0000 Subject: * Scan for references and compute the SHA-256 hash of the output in one pass. This halves the amount of I/O. --- src/libstore/references.cc | 115 +++++++++++++++------------------------------ 1 file changed, 39 insertions(+), 76 deletions(-) (limited to 'src/libstore/references.cc') diff --git a/src/libstore/references.cc b/src/libstore/references.cc index bfb4f8d0a3af..68d1cbfbfbc2 100644 --- a/src/libstore/references.cc +++ b/src/libstore/references.cc @@ -1,18 +1,10 @@ #include "references.hh" #include "hash.hh" #include "util.hh" +#include "archive.hh" -#include -#include -#include #include -#include -#include -#include -#include -#include - namespace nix { @@ -20,8 +12,8 @@ namespace nix { static unsigned int refLength = 32; /* characters */ -static void search(size_t len, const unsigned char * s, - StringSet & ids, StringSet & seen) +static void search(const unsigned char * s, unsigned int len, + StringSet & hashes, StringSet & seen) { static bool initialised = false; static bool isBase32[256]; @@ -43,93 +35,60 @@ static void search(size_t len, const unsigned char * s, } if (!match) continue; string ref((const char *) s + i, refLength); - if (ids.find(ref) != ids.end()) { + if (hashes.find(ref) != hashes.end()) { debug(format("found reference to `%1%' at offset `%2%'") % ref % i); seen.insert(ref); - ids.erase(ref); + hashes.erase(ref); } ++i; } } -void checkPath(const string & path, - StringSet & ids, StringSet & seen) +struct RefScanSink : Sink { - checkInterrupt(); - - debug(format("checking `%1%'") % path); + HashSink hashSink; + StringSet hashes; + StringSet seen; - struct stat st; - if (lstat(path.c_str(), &st)) - throw SysError(format("getting attributes of path `%1%'") % path); + string tail; - if (S_ISDIR(st.st_mode)) { - Strings names = readDirectory(path); - for (Strings::iterator i = names.begin(); i != 
names.end(); i++) { - search(i->size(), (const unsigned char *) i->c_str(), ids, seen); - checkPath(path + "/" + *i, ids, seen); - } - } + RefScanSink() : hashSink(htSHA256) { } + + void operator () (const unsigned char * data, unsigned int len); +}; - else if (S_ISREG(st.st_mode)) { - - AutoCloseFD fd = open(path.c_str(), O_RDONLY); - if (fd == -1) throw SysError(format("opening file `%1%'") % path); - size_t bufSize = 1024 * 1024; - assert(refLength <= bufSize); - unsigned char * buf = new unsigned char[bufSize]; +void RefScanSink::operator () (const unsigned char * data, unsigned int len) +{ + hashSink(data, len); - size_t left = st.st_size; - bool firstBlock = true; - - while (left > 0) { - checkInterrupt(); - - size_t read = left > bufSize ? bufSize : left; - size_t copiedBytes = 0; - - if (!firstBlock) { - /* Move the last (refLength - 1) bytes from the last - block to the start of the buffer to deal with - references that cross block boundaries. */ - copiedBytes = refLength - 1; - if (read + copiedBytes > bufSize) - read -= copiedBytes; - memcpy(buf, buf + (bufSize - copiedBytes), copiedBytes); - } - firstBlock = false; + /* It's possible that a reference spans the previous and current + fragment, so search in the concatenation of the tail of the + previous fragment and the start of the current fragment. */ + string s = tail + string((const char *) data, len > refLength ? refLength : len); + search((const unsigned char *) s.c_str(), s.size(), hashes, seen); - readFull(fd, buf + copiedBytes, read); - left -= read; + search(data, len, hashes, seen); - search(copiedBytes + read, buf, ids, seen); - } - - delete[] buf; /* !!! autodelete */ - } - - else if (S_ISLNK(st.st_mode)) { - string target = readLink(path); - search(target.size(), (const unsigned char *) target.c_str(), ids, seen); - } - - else throw Error(format("unknown file type: %1%") % path); + unsigned int tailLen = len <= refLength ? 
len : refLength; + tail = + string(tail, tail.size() < refLength - tailLen ? 0 : tail.size() - (refLength - tailLen)) + + string((const char *) data + len - tailLen, tailLen); } -PathSet scanForReferences(const string & path, const PathSet & paths) +PathSet scanForReferences(const string & path, + const PathSet & refs, Hash & hash) { + RefScanSink sink; std::map<string, Path> backMap; - StringSet ids; - StringSet seen; /* For efficiency (and a higher hit rate), just search for the hash part of the file name. (This assumes that all references have the form `HASH-bla'). */ - for (PathSet::const_iterator i = paths.begin(); i != paths.end(); i++) { + foreach (PathSet::const_iterator, i, refs) { string baseName = baseNameOf(*i); string::size_type pos = baseName.find('-'); if (pos == string::npos) @@ -138,21 +97,25 @@ PathSet scanForReferences(const string & path, const PathSet & paths) assert(s.size() == refLength); assert(backMap.find(s) == backMap.end()); // parseHash(htSHA256, s); - ids.insert(s); + sink.hashes.insert(s); backMap[s] = *i; } - checkPath(path, ids, seen); + /* Look for the hashes in the NAR dump of the path. */ + dumpPath(path, sink); + /* Map the hashes found back to their store paths. */ PathSet found; - for (StringSet::iterator i = seen.begin(); i != seen.end(); i++) { + foreach (StringSet::iterator, i, sink.seen) { std::map<string, Path>::iterator j; if ((j = backMap.find(*i)) == backMap.end()) abort(); found.insert(j->second); } + hash = sink.hashSink.finish(); + return found; } - + } -- cgit 1.4.1