about summary refs log tree commit diff
path: root/users/flokli/archeology/parse_bucket_logs.rs
diff options
context:
space:
mode:
author Florian Klink <flokli@flokli.de> 2023-11-11T19·14+0200
committer flokli <flokli@flokli.de> 2023-11-11T19·49+0000
commit 46964f6d8f95590748855976fc77ce1faa75d708 (patch)
tree 11da67f7d54062f87cb6dc10f2169d2ffb7347ec /users/flokli/archeology/parse_bucket_logs.rs
parent 281cb93ba808b73d4ea4ce86f762bbcb504a09da (diff)
fix(users/flokli/archaeology): don't use file but column compression r/6994
Clickhouse also has column compression, configurable with the
output_format_parquet_compression_method setting.

It defaults to lz4, and the previous setting produced a zstd-compressed
parquet file with lz4-compressed column data.

Set output_format_parquet_compression_method to zstd instead, and sort
by timestamp before assembling the parquet file.

The existing files were updated to the same format with the following query:

```
SELECT * FROM file('bucket_logs_2023-11-11*.pq', 'Parquet', 'auto') ORDER BY timestamp ASC INTO OUTFILE 'bucket_logs_2023-11-11.parquet' SETTINGS output_format_parquet_compression_method = 'zstd'
```

Change-Id: Id63b14c82e7bf4b9907a500528b569a51e277751
Reviewed-on: https://cl.tvl.fyi/c/depot/+/10008
Reviewed-by: raitobezarius <tvl@lahfa.xyz>
Tested-by: BuildkiteCI
Diffstat (limited to 'users/flokli/archeology/parse_bucket_logs.rs')
-rw-r--r-- users/flokli/archeology/parse_bucket_logs.rs 7
1 files changed, 5 insertions, 2 deletions
diff --git a/users/flokli/archeology/parse_bucket_logs.rs b/users/flokli/archeology/parse_bucket_logs.rs
index 1fbba0506821..c794222f5b7d 100644
--- a/users/flokli/archeology/parse_bucket_logs.rs
+++ b/users/flokli/archeology/parse_bucket_logs.rs
@@ -29,8 +29,11 @@ fn main() -> ExitCode {
         'Regexp',
         'owner String , bucket String, timestamp_str String, remote_ip String, requester LowCardinality(String), request_id String, operation LowCardinality(String), key String, request_uri String, http_status String, error_code String, bytes_sent_str String, object_size_str String, total_time String, turn_around_time String, referer String, user_agent String, version_id String, host_id String, signature_version String, cipher_suite String, authentication_type String, host_header String, tls_version String, access_point_arn String, acl_required String'
     )
-    SETTINGS format_regexp = '(\\S+) (\\S+) \\[(.*)\\] (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) ((?:\\S+ \\S+ \\S+)|\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+).*'
-    INTO OUTFILE '{}' COMPRESSION 'zstd' FORMAT Parquet"#, input_files, output_file));
+    ORDER BY timestamp ASC
+    SETTINGS
+        format_regexp = '(\\S+) (\\S+) \\[(.*)\\] (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) ((?:\\S+ \\S+ \\S+)|\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+).*',
+        output_format_parquet_compression_method = 'zstd'
+    INTO OUTFILE '{}' FORMAT Parquet"#, input_files, output_file));
 
     cmd.status().expect("clickhouse-local failed");