From 4ba624efae2d63057c2bd5be23841be5017bd457 Mon Sep 17 00:00:00 2001
From: sterni <sternenseemann@systemli.org>
Date: Tue, 11 Jul 2023 17:54:20 +0200
Subject: fix(tvix/eval): use byte, not codepoint index for slicing in escape

This fixes a subtle issue which would occasionally lead to a crash (e.g.
when evaluating (pkgs.systemd.outPath with --trace-runtime): With each
character in the string that has a multi byte representation in UTF-8,
the actual byte position and what tvix thought it was would get out of
sync. This could either lead to

* Tvix swallowing characters or jumbling characters if multi byte
  characters would cause the tracked index to become out of sync with
  the byte position before the first character to be escaped, or

* Tvix crashing if (in the same situation) the out of sync index would
  be within a UTF-8 byte sequence.

Luckily, std's `char_indices()` iterator implements exactly what
`nix_escape_char()`'s original author had in mind with
`.chars().enumerate()`. Using `i + 1` for continuing is safe, since all
characters that need (in fact, can) to be escaped in Nix are represented
as a single byte in UTF-8.

Change-Id: I1c836f70cde3d72db1c644e9112852f0d824715e
Reviewed-on: https://cl.tvl.fyi/c/depot/+/8952
Reviewed-by: tazjin <tazjin@tvl.su>
Tested-by: BuildkiteCI
Autosubmit: sterni <sternenseemann@systemli.org>
---
 .../tvix_tests/eval-okay-escape-string-correct-char-boundaries.exp  | 1 +
 .../tvix_tests/eval-okay-escape-string-correct-char-boundaries.nix  | 6 ++++++
 tvix/eval/src/value/string.rs                                       | 6 +++++-
 3 files changed, 12 insertions(+), 1 deletion(-)
 create mode 100644 tvix/eval/src/tests/tvix_tests/eval-okay-escape-string-correct-char-boundaries.exp
 create mode 100644 tvix/eval/src/tests/tvix_tests/eval-okay-escape-string-correct-char-boundaries.nix

(limited to 'tvix/eval')
diff --git a/tvix/eval/src/tests/tvix_tests/eval-okay-escape-string-correct-char-boundaries.exp b/tvix/eval/src/tests/tvix_tests/eval-okay-escape-string-correct-char-boundaries.exp
new file mode 100644
index 000000000000..d889063f9ab1
--- /dev/null
+++ b/tvix/eval/src/tests/tvix_tests/eval-okay-escape-string-correct-char-boundaries.exp
@@ -0,0 +1 @@
+"💭(\":thonking:\")"
diff --git a/tvix/eval/src/tests/tvix_tests/eval-okay-escape-string-correct-char-boundaries.nix b/tvix/eval/src/tests/tvix_tests/eval-okay-escape-string-correct-char-boundaries.nix
new file mode 100644
index 000000000000..49f4b6273106
--- /dev/null
+++ b/tvix/eval/src/tests/tvix_tests/eval-okay-escape-string-correct-char-boundaries.nix
@@ -0,0 +1,6 @@
+# Regression test for a bug where tvix would crash in nix_escape_string
+# because it counted the string position by unicode code point count,
+# but then used it as a byte index for slicing. Consequently, it would
+# try slicing 💭 in half, thinking the first element to be escaped was
+# at byte index 2 (i.e. the quote).
+"💭(\":thonking:\")"
diff --git a/tvix/eval/src/value/string.rs b/tvix/eval/src/value/string.rs
index 7144ca360d15..2649e00f0830 100644
--- a/tvix/eval/src/value/string.rs
+++ b/tvix/eval/src/value/string.rs
@@ -193,7 +193,7 @@ fn is_valid_nix_identifier(s: &str) -> bool {
 ///
 /// Note that this does not add the outer pair of surrounding quotes.
 fn nix_escape_string(input: &str) -> Cow<str> {
-    let mut iter = input.chars().enumerate().peekable();
+    let mut iter = input.char_indices().peekable();
 
     while let Some((i, c)) = iter.next() {
         if let Some(esc) = nix_escape_char(c, iter.peek().map(|(_, c)| c)) {
@@ -201,6 +201,10 @@ fn nix_escape_string(input: &str) -> Cow<str> {
             escaped.push_str(&input[..i]);
             escaped.push_str(esc);
 
+            // In theory we calculate how many bytes it takes to represent `esc`
+            // in UTF-8 and use that for the offset. It is, however, safe to
+            // assume that to be 1, as all characters that can be escaped in a
+            // Nix string are ASCII.
             let mut inner_iter = input[i + 1..].chars().peekable();
             while let Some(c) = inner_iter.next() {
                 match nix_escape_char(c, inner_iter.peek()) {
-- 
cgit 1.4.1