From 281cb93ba808b73d4ea4ce86f762bbcb504a09da Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Sat, 11 Nov 2023 12:59:33 +0200 Subject: feat(users/flokli/nixos/archeology-ec2): add parse-bucket-logs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds a `archeology-parse-bucket-logs` CLI tool to `$PATH`. It can be invoked like this: ``` archeology-parse-bucket-logs http://nix-cache-log.s3.amazonaws.com/log/2023-11-10-00-* bucket_logs_2023-11-10-00.pq.zstd ```` … and will produce a zstd-compressed Parquet file for (roughly) that time range. As the EC2 instance credentials don't give access to the logs bucket (yet), other AWS credentials need to be provided. This can be accomplished by using "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "AWS_SESSION_TOKEN" from "Option 2: Manually add a profile to your AWS credentials file (Short- term credentials)" in AWS IAM Identity Center. Processing logs for a one-hour range takes a minute or two, the resulting zstd-compressed Parquet file is around 40-80M in size. Processing logs for a whole day takes some 25mins, due to the sheer amount of data (12 GB of raw log data, distributed among 450k individual files, 20Mio log lines), but at least clickhouse isn't able to parse the resulting parquet file back in: > Code: 36. DB::Exception: IOError: Couldn't deserialize thrift: MaxMessageSize reached For future automation tasks, it's probably better to run this once an hour, and further join the data later on. Change-Id: I6c8108c0ec17dc8d4e2dbe923175553325210a5c Reviewed-on: https://cl.tvl.fyi/c/depot/+/10007 Tested-by: BuildkiteCI Reviewed-by: raitobezarius --- users/flokli/nixos/archeology-ec2/configuration.nix | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'users/flokli') diff --git a/users/flokli/nixos/archeology-ec2/configuration.nix b/users/flokli/nixos/archeology-ec2/configuration.nix index 8e8b2884fb..af10771adc 100644 --- a/users/flokli/nixos/archeology-ec2/configuration.nix +++ b/users/flokli/nixos/archeology-ec2/configuration.nix @@ -6,6 +6,10 @@ ../profiles/archeology.nix ]; + environment.systemPackages = [ + depot.users.flokli.archeology.parse-bucket-logs + ]; + networking.hostName = "archeology-ec2"; system.stateVersion = "23.05"; # Did you read the comment? -- cgit 1.4.1