From 5ae49c5ccf9e98e75243b5156a2648e1cd7a285d Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Sat, 11 Nov 2023 12:08:02 +0200 Subject: feat(users/flokli/archeology): init parse-bucket-logs Change-Id: I096b6fed8c73ddd5a417f5183cc113356ffd98c9 Reviewed-on: https://cl.tvl.fyi/c/depot/+/9983 Tested-by: BuildkiteCI Reviewed-by: raitobezarius --- users/flokli/archeology/OWNERS | 1 + users/flokli/archeology/README.md | 5 ++++ users/flokli/archeology/default.nix | 12 +++++++++ users/flokli/archeology/parse_bucket_logs.rs | 37 ++++++++++++++++++++++++++++ 4 files changed, 55 insertions(+) create mode 100644 users/flokli/archeology/OWNERS create mode 100644 users/flokli/archeology/README.md create mode 100644 users/flokli/archeology/default.nix create mode 100644 users/flokli/archeology/parse_bucket_logs.rs diff --git a/users/flokli/archeology/OWNERS b/users/flokli/archeology/OWNERS new file mode 100644 index 000000000000..b9bc074a8020 --- /dev/null +++ b/users/flokli/archeology/OWNERS @@ -0,0 +1 @@ +edef diff --git a/users/flokli/archeology/README.md b/users/flokli/archeology/README.md new file mode 100644 index 000000000000..e4cd9b84b0d8 --- /dev/null +++ b/users/flokli/archeology/README.md @@ -0,0 +1,5 @@ +# archeology + +This directory contains various scripts and helpers used for nix-archeology tasks. + +It's used from some of the archeology instances, as well as standalone. diff --git a/users/flokli/archeology/default.nix b/users/flokli/archeology/default.nix new file mode 100644 index 000000000000..f5d7ae3fc339 --- /dev/null +++ b/users/flokli/archeology/default.nix @@ -0,0 +1,12 @@ +{ depot, pkgs, ... 
use std::env;
use std::process::Command;
use std::process::ExitCode;

/// Escapes `raw` so it can be placed between the single quotes of a
/// ClickHouse string literal without terminating or altering it.
///
/// Every backslash and single quote is prefixed with a backslash; all other
/// characters are copied through unchanged.
fn escape_sql_literal(raw: &str) -> String {
    let mut escaped = String::with_capacity(raw.len());
    for ch in raw.chars() {
        if matches!(ch, '\\' | '\'') {
            escaped.push('\\');
        }
        escaped.push(ch);
    }
    escaped
}

/// Builds the query passed to `clickhouse-local -q`.
///
/// The query reads `input_files` through the `s3()` table function using the
/// `Regexp` input format, converts the `-` placeholders of several columns to
/// NULL, parses the timestamp, and writes the result to `output_file` as
/// zstd-compressed Parquet.
///
/// Both arguments are escaped before being embedded, so quotes or
/// backslashes in them cannot break out of their string literal.
fn build_query(input_files: &str, output_file: &str) -> String {
    // The doubled backslashes in `format_regexp` are intentional: this is a
    // raw Rust string, so they reach ClickHouse verbatim, and ClickHouse's
    // own string-literal parsing reduces each `\\` to a single `\`.
    format!(
        r#"SELECT
    key,
    toInt64(nullif(http_status, '-')) AS http_status,
    toInt64(nullif(object_size_str, '-')) AS object_size,
    toInt64(nullif(bytes_sent_str, '-')) AS bytes_sent,
    nullif(user_agent, '-') AS user_agent,
    operation,
    nullif(requester, '-') AS requester,
    parseDateTime(timestamp_str, '%d/%b/%Y:%k:%i:%s %z') AS timestamp
FROM s3(
    '{input}',
    'Regexp',
    'owner String , bucket String, timestamp_str String, remote_ip String, requester LowCardinality(String), request_id String, operation LowCardinality(String), key String, request_uri String, http_status String, error_code String, bytes_sent_str String, object_size_str String, total_time String, turn_around_time String, referer String, user_agent String, version_id String, host_id String, signature_version String, cipher_suite String, authentication_type String, host_header String, tls_version String, access_point_arn String, acl_required String'
)
SETTINGS format_regexp = '(\\S+) (\\S+) \\[(.*)\\] (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) ((?:\\S+ \\S+ \\S+)|\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+).*'
INTO OUTFILE '{output}' COMPRESSION 'zstd' FORMAT Parquet"#,
        input = escape_sql_literal(input_files),
        output = escape_sql_literal(output_file),
    )
}

/// Entry point: `<program> <input s3 url (glob)> <output pq file>`.
///
/// Runs `clickhouse-local` (looked up via `PATH`) with the query from
/// [`build_query`]. Returns `ExitCode::SUCCESS` only if the child process
/// could be started and exited successfully; wrong usage, a spawn failure,
/// or a non-zero child exit all yield `ExitCode::FAILURE` with a message on
/// stderr.
fn main() -> ExitCode {
    let args: Vec<String> = env::args().collect();
    let (input_files, output_file) = match args.as_slice() {
        [_, input, output] => (input, output),
        _ => {
            eprintln!("needs two args, input s3 url (glob) and output pq file");
            return ExitCode::FAILURE;
        }
    };

    let outcome = Command::new("clickhouse-local")
        .arg("-q")
        .arg(build_query(input_files, output_file))
        .status();

    match outcome {
        Ok(status) if status.success() => ExitCode::SUCCESS,
        // The child ran but reported failure: propagate that instead of
        // claiming success.
        Ok(status) => {
            eprintln!("clickhouse-local failed: {status}");
            ExitCode::FAILURE
        }
        // The child could not be started at all (e.g. binary not on PATH).
        Err(err) => {
            eprintln!("failed to run clickhouse-local: {err}");
            ExitCode::FAILURE
        }
    }
}