about summary refs log tree commit diff
path: root/users/flokli
diff options
context:
space:
mode:
authorFlorian Klink <flokli@flokli.de>2023-11-11T10·08+0200
committerflokli <flokli@flokli.de>2023-11-11T12·24+0000
commit5ae49c5ccf9e98e75243b5156a2648e1cd7a285d (patch)
treef221f9f0d9245188ae29f683157492dbd5308026 /users/flokli
parent8d02928b1493087b7f9e458411416ed268ce2112 (diff)
feat(users/flokli/archeology): init parse-bucket-logs r/6989
Change-Id: I096b6fed8c73ddd5a417f5183cc113356ffd98c9
Reviewed-on: https://cl.tvl.fyi/c/depot/+/9983
Tested-by: BuildkiteCI
Reviewed-by: raitobezarius <tvl@lahfa.xyz>
Diffstat (limited to 'users/flokli')
-rw-r--r--users/flokli/archeology/OWNERS1
-rw-r--r--users/flokli/archeology/README.md5
-rw-r--r--users/flokli/archeology/default.nix12
-rw-r--r--users/flokli/archeology/parse_bucket_logs.rs37
4 files changed, 55 insertions, 0 deletions
diff --git a/users/flokli/archeology/OWNERS b/users/flokli/archeology/OWNERS
new file mode 100644
index 0000000000..b9bc074a80
--- /dev/null
+++ b/users/flokli/archeology/OWNERS
@@ -0,0 +1 @@
+edef
diff --git a/users/flokli/archeology/README.md b/users/flokli/archeology/README.md
new file mode 100644
index 0000000000..e4cd9b84b0
--- /dev/null
+++ b/users/flokli/archeology/README.md
@@ -0,0 +1,5 @@
+# archeology
+
+This directory contains various scripts and helpers used for nix-archeology tasks.
+
+It's used from some of the archeology instances, as well as standalone.
diff --git a/users/flokli/archeology/default.nix b/users/flokli/archeology/default.nix
new file mode 100644
index 0000000000..f5d7ae3fc3
--- /dev/null
+++ b/users/flokli/archeology/default.nix
@@ -0,0 +1,12 @@
+{ depot, pkgs, ... }:
+
+depot.nix.readTree.drvTargets {
+  parse-bucket-logs = pkgs.runCommand "archeology-parse-bucket-logs"
+    {
+      nativeBuildInputs = [ pkgs.makeWrapper ];
+    } ''
+    mkdir -p $out/bin
+    makeWrapper ${(pkgs.writers.writeRust "parse-bucket-logs-unwrapped" {} ./parse_bucket_logs.rs)} $out/bin/archeology-parse-bucket-logs \
+      --prefix PATH : ${pkgs.lib.makeBinPath [ pkgs.clickhouse ]}
+  '';
+}
diff --git a/users/flokli/archeology/parse_bucket_logs.rs b/users/flokli/archeology/parse_bucket_logs.rs
new file mode 100644
index 0000000000..32108f93db
--- /dev/null
+++ b/users/flokli/archeology/parse_bucket_logs.rs
@@ -0,0 +1,37 @@
+use std::env;
+use std::process::Command;
+use std::process::ExitCode;
+
+fn main() -> ExitCode {
+    let args: Vec<String> = env::args().collect();
+    if args.len() != 3 {
+        eprintln!("needs two args, input s3 url (glob) and output pq file");
+        return ExitCode::FAILURE;
+    }
+
+    let input_files = &args[1];
+    let output_file = &args[2];
+
+    let mut cmd = Command::new("clickhouse-local");
+    cmd.arg("-q")
+        .arg(format!(r#"SELECT
+        key,
+        toInt64(nullif(http_status, '-')) AS http_status,
+        toInt64(nullif(object_size_str, '-')) AS object_size,
+        toInt64(nullif(bytes_sent_str, '-')) AS bytes_sent,
+        nullif(user_agent, '-') AS user_agent,
+        operation,
+        nullif(requester, '-') AS requester,
+        parseDateTime(timestamp_str, '%d/%b/%Y:%k:%i:%s %z') AS timestamp
+    FROM s3(
+        '{}',
+        'Regexp',
+        'owner String , bucket String, timestamp_str String, remote_ip String, requester LowCardinality(String), request_id String, operation LowCardinality(String), key String, request_uri String, http_status String, error_code String, bytes_sent_str String, object_size_str String, total_time String, turn_around_time String, referer String, user_agent String, version_id String, host_id String, signature_version String, cipher_suite String, authentication_type String, host_header String, tls_version String, access_point_arn String, acl_required String'
+    )
+    SETTINGS format_regexp = '(\\S+) (\\S+) \\[(.*)\\] (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) ((?:\\S+ \\S+ \\S+)|\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+).*'
+    INTO OUTFILE '{}' COMPRESSION 'zstd' FORMAT Parquet"#, input_files, output_file));
+
+    cmd.status().expect("clickhouse-local failed");
+
+    ExitCode::SUCCESS
+}