author     edef <edef@edef.eu>    2023-11-04T10·09+0000
committer  edef <edef@edef.eu>    2023-11-10T19·05+0000
commit     bdda10a2f54974053b1d78b801469f076e90a6da (patch)
tree       da10ab5e9f64b624fec66a9a0f1aaa94947afd59 /tvix/tools/turbofetch/src
parent     9cd2e920653914d4ef7bec525614d101f3fdd207 (diff)
feat(tvix/tools/turbofetch): init r/6979
Change-Id: I2efa6f94f57e812c52371256a4e62d1d54ff5057
Reviewed-on: https://cl.tvl.fyi/c/depot/+/9925
Reviewed-by: flokli <flokli@flokli.de>
Tested-by: BuildkiteCI
Diffstat (limited to 'tvix/tools/turbofetch/src')
-rw-r--r--  tvix/tools/turbofetch/src/buffer.rs   87
-rw-r--r--  tvix/tools/turbofetch/src/lib.rs     103
-rw-r--r--  tvix/tools/turbofetch/src/main.rs    220
3 files changed, 410 insertions, 0 deletions
diff --git a/tvix/tools/turbofetch/src/buffer.rs b/tvix/tools/turbofetch/src/buffer.rs
new file mode 100644
index 000000000000..13207f562184
--- /dev/null
+++ b/tvix/tools/turbofetch/src/buffer.rs
@@ -0,0 +1,87 @@
+use magic_buffer::MagicBuffer;
+use std::cell::Cell;
+
+/// Buffer is a FIFO queue for bytes, built on a ring buffer.
+/// It always provides contiguous slices for both the readable and writable parts,
+/// using an underlying buffer that is "mirrored" in virtual memory.
+pub struct Buffer {
+    buffer: MagicBuffer,
+    /// first readable byte
+    head: Cell<usize>,
+    /// first writable byte
+    tail: usize,
+}
+
+// SAFETY: MagicBuffer isn't bound to a thread, and neither are any of the other fields.
+// MagicBuffer ought to be Send+Sync itself, upstream PR at https://github.com/sklose/magic-buffer/pull/4
+unsafe impl Send for Buffer {}
+
+impl Buffer {
+    /// Allocate a fresh buffer, with the specified capacity.
+    /// The buffer can contain at most `capacity - 1` bytes.
+    /// The capacity must be a power of two, and at least [Buffer::min_len].
+    pub fn new(capacity: usize) -> Buffer {
+        Buffer {
+            // MagicBuffer::new verifies that `capacity` is a power of two,
+            // and at least MagicBuffer::min_len().
+            buffer: MagicBuffer::new(capacity).unwrap(),
+            // `head == tail` means the buffer is empty.
+            // In order to ensure that this remains unambiguous,
+            // the buffer can only be filled with capacity-1 bytes.
+            head: Cell::new(0),
+            tail: 0,
+        }
+    }
+
+    /// Returns the minimum buffer capacity.
+    /// This depends on the operating system and architecture.
+    pub fn min_capacity() -> usize {
+        MagicBuffer::min_len()
+    }
+
+    /// Return the capacity of the buffer.
+    /// This is equal to `self.data().len() + self.space().len() + 1`.
+    pub fn capacity(&self) -> usize {
+        self.buffer.len()
+    }
+
+    /// Return the valid, readable data in the buffer.
+    pub fn data(&self) -> &[u8] {
+        let len = self.buffer.len();
+        let head = self.head.get();
+
+        if head <= self.tail {
+            &self.buffer[head..self.tail]
+        } else {
+            &self.buffer[head..self.tail + len]
+        }
+    }
+
+    /// Mark `read_len` bytes of the readable data as consumed, freeing the space.
+    pub fn consume(&self, read_len: usize) {
+        debug_assert!(read_len <= self.data().len());
+        let mut head = self.head.get();
+        head += read_len;
+        head &= self.buffer.len() - 1;
+        self.head.set(head);
+    }
+
+    /// Return the empty, writable space in the buffer.
+    pub fn space(&mut self) -> &mut [u8] {
+        let len = self.buffer.len();
+        let head = self.head.get();
+
+        if head <= self.tail {
+            &mut self.buffer[self.tail..head + len - 1]
+        } else {
+            &mut self.buffer[self.tail..head - 1]
+        }
+    }
+
+    /// Mark `written_len` bytes of the writable space as valid, readable data.
+    pub fn commit(&mut self, written_len: usize) {
+        debug_assert!(written_len <= self.space().len());
+        self.tail += written_len;
+        self.tail &= self.buffer.len() - 1;
+    }
+}
diff --git a/tvix/tools/turbofetch/src/lib.rs b/tvix/tools/turbofetch/src/lib.rs
new file mode 100644
index 000000000000..4b62fa4d75e7
--- /dev/null
+++ b/tvix/tools/turbofetch/src/lib.rs
@@ -0,0 +1,103 @@
+use std::{mem::MaybeUninit, str};
+use tokio::io::{self, AsyncRead, AsyncReadExt};
+
+pub use buffer::Buffer;
+mod buffer;
+
+/// Read as much data into `buffer` as possible.
+/// Returns [io::ErrorKind::OutOfMemory] if the buffer is already full.
+async fn slurp(buffer: &mut Buffer, sock: &mut (impl AsyncRead + Unpin)) -> io::Result<()> {
+    match buffer.space() {
+        [] => Err(io::Error::new(io::ErrorKind::OutOfMemory, "buffer filled")),
+        buf => {
+            let n = sock.read(buf).await?;
+            if n == 0 {
+                return Err(io::ErrorKind::UnexpectedEof.into());
+            }
+            buffer.commit(n);
+
+            Ok(())
+        }
+    }
+}
+
+fn get_content_length(headers: &[httparse::Header]) -> io::Result<u64> {
+    for header in headers {
+        if header.name == "Transfer-Encoding" {
+            return Err(io::Error::new(
+                io::ErrorKind::InvalidData,
+                "Transfer-Encoding is unsupported",
+            ));
+        }
+
+        if header.name == "Content-Length" {
+            return str::from_utf8(header.value)
+                .ok()
+                .and_then(|v| v.parse().ok())
+                .ok_or_else(|| {
+                    io::Error::new(io::ErrorKind::InvalidData, "invalid Content-Length")
+                });
+        }
+    }
+
+    Err(io::Error::new(
+        io::ErrorKind::InvalidData,
+        "Content-Length missing",
+    ))
+}
+
+/// Read an HTTP response from `sock` using `buffer`, returning the response body.
+/// Returns an error if anything but 200 OK is received.
+///
+/// The buffer must have enough space to contain the entire response body.
+/// If there is not enough space, [io::ErrorKind::OutOfMemory] is returned.
+///
+/// The HTTP response must use `Content-Length`, without `Transfer-Encoding`.
+pub async fn parse_response<'a>(
+    sock: &mut (impl AsyncRead + Unpin),
+    buffer: &'a mut Buffer,
+) -> io::Result<&'a [u8]> {
+    let body_len = loop {
+        let mut headers = [MaybeUninit::uninit(); 16];
+        let mut response = httparse::Response::new(&mut []);
+        let status = httparse::ParserConfig::default()
+            .parse_response_with_uninit_headers(&mut response, buffer.data(), &mut headers)
+            .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
+
+        if let httparse::Status::Complete(n) = status {
+            buffer.consume(n);
+
+            let code = response.code.unwrap();
+            if code != 200 {
+                return Err(io::Error::new(
+                    io::ErrorKind::Other,
+                    format!("HTTP response {code}"),
+                ));
+            }
+
+            break get_content_length(response.headers)?;
+        }
+
+        slurp(buffer, sock).await?;
+    };
+
+    let buf_len = buffer.space().len() + buffer.data().len();
+
+    if body_len > buf_len as u64 {
+        return Err(io::Error::new(
+            io::ErrorKind::OutOfMemory,
+            "HTTP response body does not fit in buffer",
+        ));
+    }
+
+    let body_len = body_len as usize;
+
+    while buffer.data().len() < body_len {
+        slurp(buffer, sock).await?;
+    }
+
+    let data = buffer.data();
+    buffer.consume(body_len);
+
+    Ok(&data[..body_len])
+}
diff --git a/tvix/tools/turbofetch/src/main.rs b/tvix/tools/turbofetch/src/main.rs
new file mode 100644
index 000000000000..4b3a50eb3941
--- /dev/null
+++ b/tvix/tools/turbofetch/src/main.rs
@@ -0,0 +1,220 @@
+//! turbofetch is a high-performance bulk S3 object aggregator.
+//!
+//! It operates on two S3 buckets: a source bucket (nix-cache), and a
+//! work bucket defined at runtime. The work bucket contains a job file
+//! consisting of concatenated 32-character keys, representing narinfo
+//! files in the source bucket, without the `.narinfo` suffix or any
+//! other separators.
+//!
+//! Each run of turbofetch processes a half-open range of indices from the
+//! job file, and outputs a zstd stream of concatenated objects, without
+//! additional separators and in no particular order. These segment files
+//! are written into the work bucket, named for the range of indices they
+//! cover. `/narinfo.zst/000000000c380d40-000000000c385b60` covers the 20k
+//! objects `[0xc380d40, 0xc385b60) = [205000000, 205020000)`. Empirically,
+//! segment files of 20k objects achieve a compression ratio of 4.7x.
+//!
+//! Reassembly is left to narinfo2parquet, which interprets StorePath lines.
+//!
+//! TODO(edef): any retries/error handling whatsoever
+//! Currently, it fails an entire range if anything goes wrong, and doesn't
+//! write any output.
+
+use bytes::Bytes;
+use futures::{stream::FuturesUnordered, Stream, TryStreamExt};
+use rusoto_core::ByteStream;
+use rusoto_s3::{GetObjectRequest, PutObjectRequest, S3Client, S3};
+use serde::Deserialize;
+use std::{io::Write, mem, ops::Range, ptr};
+use tokio::{
+    io::{self, AsyncReadExt, AsyncWriteExt},
+    net::TcpStream,
+};
+
+/// Fetch a group of keys, streaming concatenated chunks as they arrive from S3.
+/// `keys` must be a slice from the job file. Any network error at all fails the
+/// entire batch, and there is no rate limiting.
+fn fetch(keys: &[[u8; 32]]) -> impl Stream<Item = io::Result<Bytes>> {
+    // S3 supports only HTTP/1.1, but we can ease the pain somewhat by using
+    // HTTP pipelining. It terminates the TCP connection after receiving 100
+    // requests, so we chunk the keys up accordingly, and make one connection
+    // for each chunk.
+    keys.chunks(100)
+        .map(|chunk| {
+            const PREFIX: &[u8] = b"GET /nix-cache/";
+            const SUFFIX: &[u8] = b".narinfo HTTP/1.1\nHost: s3.amazonaws.com\n\n";
+            const LENGTH: usize = PREFIX.len() + 32 + SUFFIX.len();
+
+            let mut request = Vec::with_capacity(LENGTH * 100);
+            for key in chunk {
+                request.extend_from_slice(PREFIX);
+                request.extend_from_slice(key);
+                request.extend_from_slice(SUFFIX);
+            }
+
+            (request, chunk.len())
+        })
+        .map(|(request, n)| async move {
+            let (mut read, mut write) = TcpStream::connect("s3.amazonaws.com:80")
+                .await?
+                .into_split();
+
+            let _handle = tokio::spawn(async move {
+                let request = request;
+                write.write_all(&request).await
+            });
+
+            let mut buffer = turbofetch::Buffer::new(512 * 1024);
+            let mut bodies = vec![];
+
+            for _ in 0..n {
+                let body = turbofetch::parse_response(&mut read, &mut buffer).await?;
+                bodies.extend_from_slice(body);
+            }
+
+            Ok::<_, io::Error>(Bytes::from(bodies))
+        })
+        .collect::<FuturesUnordered<_>>()
+}
+
+/// Retrieve a range of keys from the job file.
+async fn get_range(
+    s3: &'static S3Client,
+    bucket: String,
+    key: String,
+    range: Range<u64>,
+) -> io::Result<Box<[[u8; 32]]>> {
+    let resp = s3
+        .get_object(GetObjectRequest {
+            bucket,
+            key,
+            range: Some(format!("bytes={}-{}", range.start * 32, range.end * 32 - 1)),
+            ..GetObjectRequest::default()
+        })
+        .await
+        .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
+
+    let mut body = vec![];
+    resp.body
+        .ok_or(io::ErrorKind::InvalidData)?
+        .into_async_read()
+        .read_to_end(&mut body)
+        .await?;
+
+    let body = exact_chunks(body.into_boxed_slice()).ok_or(io::ErrorKind::InvalidData)?;
+
+    Ok(body)
+}
+
+fn exact_chunks(mut buf: Box<[u8]>) -> Option<Box<[[u8; 32]]>> {
+    // SAFETY: We ensure that `buf.len()` is a multiple of 32, and there are no alignment requirements.
+    unsafe {
+        let ptr = buf.as_mut_ptr();
+        let len = buf.len();
+
+        if len % 32 != 0 {
+            return None;
+        }
+
+        let ptr = ptr as *mut [u8; 32];
+        let len = len / 32;
+        mem::forget(buf);
+
+        Some(Box::from_raw(ptr::slice_from_raw_parts_mut(ptr, len)))
+    }
+}
+
+// TODO(edef): factor this out into a separate entry point
+#[tokio::main(flavor = "current_thread")]
+async fn main() -> Result<(), lambda_runtime::Error> {
+    let s3 = S3Client::new(rusoto_core::Region::UsEast1);
+    let s3 = &*Box::leak(Box::new(s3));
+
+    tracing_subscriber::fmt()
+        .json()
+        .with_max_level(tracing::Level::INFO)
+        // this needs to be set to remove duplicated information in the log.
+        .with_current_span(false)
+        // this needs to be set to false, otherwise ANSI color codes will
+        // show up in a confusing manner in CloudWatch logs.
+        .with_ansi(false)
+        // disabling time is handy because CloudWatch will add the ingestion time.
+        .without_time()
+        // remove the name of the function from every log entry
+        .with_target(false)
+        .init();
+
+    lambda_runtime::run(lambda_runtime::service_fn(|event| func(s3, event))).await
+}
+
+/// Lambda request body
+#[derive(Debug, Deserialize)]
+struct Params {
+    work_bucket: String,
+    job_file: String,
+    start: u64,
+    end: u64,
+}
+
+#[tracing::instrument(skip(s3, event), fields(req_id = %event.context.request_id))]
+async fn func(
+    s3: &'static S3Client,
+    event: lambda_runtime::LambdaEvent<
+        aws_lambda_events::lambda_function_urls::LambdaFunctionUrlRequest,
+    >,
+) -> Result<&'static str, lambda_runtime::Error> {
+    let mut params = event.payload.body.ok_or("no body")?;
+
+    if event.payload.is_base64_encoded {
+        params = String::from_utf8(data_encoding::BASE64.decode(params.as_bytes())?)?;
+    }
+
+    let params: Params = serde_json::from_str(&params)?;
+
+    if params.start >= params.end {
+        return Err("nope".into());
+    }
+
+    let keys = get_range(
+        s3,
+        params.work_bucket.clone(),
+        params.job_file.to_owned(),
+        params.start..params.end,
+    )
+    .await?;
+
+    let zchunks = fetch(&keys)
+        .try_fold(
+            Box::new(zstd::Encoder::new(vec![], zstd::DEFAULT_COMPRESSION_LEVEL).unwrap()),
+            |mut w, buf| {
+                w.write_all(&buf).unwrap();
+                async { Ok(w) }
+            },
+        )
+        .await?;
+
+    let zchunks = to_byte_stream(zchunks.finish().unwrap());
+
+    tracing::info!("we got to put_object");
+
+    s3.put_object(PutObjectRequest {
+        bucket: params.work_bucket,
+        key: format!("narinfo.zst/{:016x}-{:016x}", params.start, params.end),
+        body: Some(zchunks),
+        ..Default::default()
+    })
+    .await
+    .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
+
+    tracing::info!("… and it worked!");
+
+    Ok("OK")
+}
+
+fn to_byte_stream(buffer: Vec<u8>) -> ByteStream {
+    let size_hint = buffer.len();
+    ByteStream::new_with_size(
+        futures::stream::once(async { Ok(buffer.into()) }),
+        size_hint,
+    )
+}
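
The ring-buffer semantics documented in buffer.rs can be exercised on their own. The following is a minimal sketch, not part of this commit, assuming the crate builds as `turbofetch` and that 64 KiB satisfies `Buffer::min_capacity()` on the target platform: `space()`/`commit()` append bytes, `data()`/`consume()` drain them, and at most `capacity - 1` bytes are ever held so that `head == tail` unambiguously means "empty".

fn main() {
    // 64 KiB is a power of two; assumed here to be >= Buffer::min_capacity().
    let mut buf = turbofetch::Buffer::new(64 * 1024);

    // Write into the contiguous free space, then mark those bytes readable.
    buf.space()[..5].copy_from_slice(b"hello");
    buf.commit(5);
    assert_eq!(buf.data(), b"hello");

    // Consuming frees the space again; the buffer is empty once more.
    buf.consume(5);
    assert!(buf.data().is_empty());
}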
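
Similarly, a hedged sketch of driving `parse_response` over a plain HTTP/1.1 connection, mirroring what `fetch` in main.rs does for each pipelined request. The host and path are placeholders, and the server is assumed to answer with a Content-Length-framed 200 response, since `parse_response` rejects anything else:

use tokio::{io::AsyncWriteExt, net::TcpStream};

#[tokio::main(flavor = "current_thread")]
async fn main() -> std::io::Result<()> {
    // Placeholder endpoint; any HTTP/1.1 server that frames its response
    // with Content-Length (no Transfer-Encoding) will do.
    let mut sock = TcpStream::connect("example.com:80").await?;
    sock.write_all(b"GET / HTTP/1.1\r\nHost: example.com\r\n\r\n")
        .await?;

    // Capacity must be a power of two, at least Buffer::min_capacity(),
    // and large enough for the whole response body, or parse_response
    // fails with io::ErrorKind::OutOfMemory.
    let mut buffer = turbofetch::Buffer::new(512 * 1024);
    let body = turbofetch::parse_response(&mut sock, &mut buffer).await?;
    println!("read {} body bytes", body.len());

    Ok(())
}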