From bdda10a2f54974053b1d78b801469f076e90a6da Mon Sep 17 00:00:00 2001 From: edef Date: Sat, 4 Nov 2023 10:09:20 +0000 Subject: feat(tvix/tools/turbofetch): init Change-Id: I2efa6f94f57e812c52371256a4e62d1d54ff5057 Reviewed-on: https://cl.tvl.fyi/c/depot/+/9925 Reviewed-by: flokli Tested-by: BuildkiteCI --- tvix/tools/turbofetch/src/main.rs | 220 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 220 insertions(+) create mode 100644 tvix/tools/turbofetch/src/main.rs (limited to 'tvix/tools/turbofetch/src/main.rs') diff --git a/tvix/tools/turbofetch/src/main.rs b/tvix/tools/turbofetch/src/main.rs new file mode 100644 index 000000000000..4b3a50eb3941 --- /dev/null +++ b/tvix/tools/turbofetch/src/main.rs @@ -0,0 +1,220 @@ +//! turbofetch is a high-performance bulk S3 object aggregator. +//! +//! It operates on two S3 buckets: a source bucket (nix-cache), and a +//! work bucket defined at runtime. The work bucket contains a job file +//! consisting of concatenated 32-character keys, representing narinfo +//! files in the source bucket, without the `.narinfo` suffix or any +//! other separators. +//! +//! Each run of turbofetch processes a half-open range of indices from the +//! job file, and outputs a zstd stream of concatenated objects, without +//! additional separators and in no particular order. These segment files +//! are written into the work bucket, named for the range of indices they +//! cover. `/narinfo.zst/000000000c380d40-000000000c385b60` covers the 20k +//! objects `[0xc380d40, 0xc385b60) = [205000000, 205020000)`. Empirically, +//! segment files of 20k objects achieve a compression ratio of 4.7x. +//! +//! Reassembly is left to narinfo2parquet, which interprets StorePath lines. +//! +//! TODO(edef): any retries/error handling whatsoever +//! Currently, it fails an entire range if anything goes wrong, and doesn't +//! write any output. + +use bytes::Bytes; +use futures::{stream::FuturesUnordered, Stream, TryStreamExt}; +use rusoto_core::ByteStream; +use rusoto_s3::{GetObjectRequest, PutObjectRequest, S3Client, S3}; +use serde::Deserialize; +use std::{io::Write, mem, ops::Range, ptr}; +use tokio::{ + io::{self, AsyncReadExt, AsyncWriteExt}, + net::TcpStream, +}; + +/// Fetch a group of keys, streaming concatenated chunks as they arrive from S3. +/// `keys` must be a slice from the job file. Any network error at all fails the +/// entire batch, and there is no rate limiting. +fn fetch(keys: &[[u8; 32]]) -> impl Stream> { + // S3 supports only HTTP/1.1, but we can ease the pain somewhat by using + // HTTP pipelining. It terminates the TCP connection after receiving 100 + // requests, so we chunk the keys up accordingly, and make one connection + // for each chunk. + keys.chunks(100) + .map(|chunk| { + const PREFIX: &[u8] = b"GET /nix-cache/"; + const SUFFIX: &[u8] = b".narinfo HTTP/1.1\nHost: s3.amazonaws.com\n\n"; + const LENGTH: usize = PREFIX.len() + 32 + SUFFIX.len(); + + let mut request = Vec::with_capacity(LENGTH * 100); + for key in chunk { + request.extend_from_slice(PREFIX); + request.extend_from_slice(key); + request.extend_from_slice(SUFFIX); + } + + (request, chunk.len()) + }) + .map(|(request, n)| async move { + let (mut read, mut write) = TcpStream::connect("s3.amazonaws.com:80") + .await? + .into_split(); + + let _handle = tokio::spawn(async move { + let request = request; + write.write_all(&request).await + }); + + let mut buffer = turbofetch::Buffer::new(512 * 1024); + let mut bodies = vec![]; + + for _ in 0..n { + let body = turbofetch::parse_response(&mut read, &mut buffer).await?; + bodies.extend_from_slice(body); + } + + Ok::<_, io::Error>(Bytes::from(bodies)) + }) + .collect::>() +} + +/// Retrieve a range of keys from the job file. +async fn get_range( + s3: &'static S3Client, + bucket: String, + key: String, + range: Range, +) -> io::Result> { + let resp = s3 + .get_object(GetObjectRequest { + bucket, + key, + range: Some(format!("bytes={}-{}", range.start * 32, range.end * 32 - 1)), + ..GetObjectRequest::default() + }) + .await + .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?; + + let mut body = vec![]; + resp.body + .ok_or(io::ErrorKind::InvalidData)? + .into_async_read() + .read_to_end(&mut body) + .await?; + + let body = exact_chunks(body.into_boxed_slice()).ok_or(io::ErrorKind::InvalidData)?; + + Ok(body) +} + +fn exact_chunks(mut buf: Box<[u8]>) -> Option> { + // SAFETY: We ensure that `buf.len()` is a multiple of 32, and there are no alignment requirements. + unsafe { + let ptr = buf.as_mut_ptr(); + let len = buf.len(); + + if len % 32 != 0 { + return None; + } + + let ptr = ptr as *mut [u8; 32]; + let len = len / 32; + mem::forget(buf); + + Some(Box::from_raw(ptr::slice_from_raw_parts_mut(ptr, len))) + } +} + +// TODO(edef): factor this out into a separate entry point +#[tokio::main(flavor = "current_thread")] +async fn main() -> Result<(), lambda_runtime::Error> { + let s3 = S3Client::new(rusoto_core::Region::UsEast1); + let s3 = &*Box::leak(Box::new(s3)); + + tracing_subscriber::fmt() + .json() + .with_max_level(tracing::Level::INFO) + // this needs to be set to remove duplicated information in the log. + .with_current_span(false) + // this needs to be set to false, otherwise ANSI color codes will + // show up in a confusing manner in CloudWatch logs. + .with_ansi(false) + // disabling time is handy because CloudWatch will add the ingestion time. + .without_time() + // remove the name of the function from every log entry + .with_target(false) + .init(); + + lambda_runtime::run(lambda_runtime::service_fn(|event| func(s3, event))).await +} + +/// Lambda request body +#[derive(Debug, Deserialize)] +struct Params { + work_bucket: String, + job_file: String, + start: u64, + end: u64, +} + +#[tracing::instrument(skip(s3, event), fields(req_id = %event.context.request_id))] +async fn func( + s3: &'static S3Client, + event: lambda_runtime::LambdaEvent< + aws_lambda_events::lambda_function_urls::LambdaFunctionUrlRequest, + >, +) -> Result<&'static str, lambda_runtime::Error> { + let mut params = event.payload.body.ok_or("no body")?; + + if event.payload.is_base64_encoded { + params = String::from_utf8(data_encoding::BASE64.decode(params.as_bytes())?)?; + } + + let params: Params = serde_json::from_str(¶ms)?; + + if params.start >= params.end { + return Err("nope".into()); + } + + let keys = get_range( + s3, + params.work_bucket.clone(), + params.job_file.to_owned(), + params.start..params.end, + ) + .await?; + + let zchunks = fetch(&keys) + .try_fold( + Box::new(zstd::Encoder::new(vec![], zstd::DEFAULT_COMPRESSION_LEVEL).unwrap()), + |mut w, buf| { + w.write_all(&buf).unwrap(); + async { Ok(w) } + }, + ) + .await?; + + let zchunks = to_byte_stream(zchunks.finish().unwrap()); + + tracing::info!("we got to put_object"); + + s3.put_object(PutObjectRequest { + bucket: params.work_bucket, + key: format!("narinfo.zst/{:016x}-{:016x}", params.start, params.end), + body: Some(zchunks), + ..Default::default() + }) + .await + .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?; + + tracing::info!("… and it worked!"); + + Ok("OK") +} + +fn to_byte_stream(buffer: Vec) -> ByteStream { + let size_hint = buffer.len(); + ByteStream::new_with_size( + futures::stream::once(async { Ok(buffer.into()) }), + size_hint, + ) +} -- cgit 1.4.1