about summary refs log tree commit diff
diff options
context:
space:
mode:
authorProfpatsch <mail@profpatsch.de>2024-05-20T13·50+0200
committerclbot <clbot@tvl.fyi>2024-05-22T10·32+0000
commit5b2ba0efa1b87aa514d665a7f64ada36617c720e (patch)
tree27a4bb08f20e997377cc1bc884ee16c38e203f86
parente7be3422566b36e5bd3aeaaf7d47537dfd050a5c (diff)
refactor(tvix/eval): rewrite xml emitter to be simple-stupid r/8160
In order to be compatible with the nix XML generator, it’s easier to
generate the XML directly, instead of going through a library which we
have to bend to do what we need.

Removes dependency on `xml-rs`, which came with a full XML parser that
we didn’t use. Only takes a tiny bit of code for the XML escaping,
somewhat simplified.

I add a little escaping value, to make sure we have the same behaviour
as nix proper.

Interestingly enough, we never need to escape XML attribute names,
because the `builtins.toXML` format encodes user-defined values as
attribute keys only. So we only escape attribute values.

Fixes: https://b.tvl.fyi/issues/399
Change-Id: If4d407d324864b3bb9aa3160e2ec6889f7727127
Reviewed-on: https://cl.tvl.fyi/c/depot/+/11697
Tested-by: BuildkiteCI
Reviewed-by: flokli <flokli@flokli.de>
Autosubmit: Profpatsch <mail@profpatsch.de>
-rw-r--r--tvix/Cargo.lock7
-rw-r--r--tvix/Cargo.nix16
-rw-r--r--tvix/eval/Cargo.toml1
-rw-r--r--tvix/eval/src/builtins/to_xml.rs246
-rw-r--r--tvix/eval/src/errors.rs15
-rw-r--r--tvix/eval/src/tests/tvix_tests/eval-okay-toxml-empty.exp.xml5
-rw-r--r--tvix/eval/src/tests/tvix_tests/eval-okay-toxml-empty.nix1
-rw-r--r--tvix/eval/src/tests/tvix_tests/eval-okay-toxml.exp1
-rw-r--r--tvix/eval/src/tests/tvix_tests/eval-okay-toxml.nix2
9 files changed, 205 insertions, 89 deletions
diff --git a/tvix/Cargo.lock b/tvix/Cargo.lock
index dc5298c45b5b..9ad121371264 100644
--- a/tvix/Cargo.lock
+++ b/tvix/Cargo.lock
@@ -4167,7 +4167,6 @@ dependencies = [
  "test-strategy",
  "toml",
  "tvix-eval-builtin-macros",
- "xml-rs",
 ]
 
 [[package]]
@@ -4845,12 +4844,6 @@ dependencies = [
 ]
 
 [[package]]
-name = "xml-rs"
-version = "0.8.19"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0fcb9cbac069e033553e8bb871be2fbdffcab578eb25bd0f7c508cedc6dcd75a"
-
-[[package]]
 name = "xz2"
 version = "0.1.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
diff --git a/tvix/Cargo.nix b/tvix/Cargo.nix
index f6c3108fa422..0bae0a6f20fa 100644
--- a/tvix/Cargo.nix
+++ b/tvix/Cargo.nix
@@ -13222,10 +13222,6 @@ rec {
             packageId = "tvix-eval-builtin-macros";
             rename = "builtin-macros";
           }
-          {
-            name = "xml-rs";
-            packageId = "xml-rs";
-          }
         ];
         devDependencies = [
           {
@@ -16041,18 +16037,6 @@ rec {
         };
         resolvedDefaultFeatures = [ "default" "unsupported" ];
       };
-      "xml-rs" = rec {
-        crateName = "xml-rs";
-        version = "0.8.19";
-        edition = "2021";
-        crateBin = [ ];
-        sha256 = "0nnpvk3fv32hgh7vs9gbg2swmzxx5yz73f4b7rak7q39q2x9rjqg";
-        libName = "xml";
-        authors = [
-          "Vladimir Matveev <vmatveev@citrine.cc>"
-        ];
-
-      };
       "xz2" = rec {
         crateName = "xz2";
         version = "0.1.7";
diff --git a/tvix/eval/Cargo.toml b/tvix/eval/Cargo.toml
index 677ce6ab85be..cc590791cc92 100644
--- a/tvix/eval/Cargo.toml
+++ b/tvix/eval/Cargo.toml
@@ -30,7 +30,6 @@ smol_str = "0.2.0"
 tabwriter = "1.2"
 test-strategy = { version = "0.2.1", optional = true }
 toml = "0.6.0"
-xml-rs = "0.8.4"
 sha2 = "0.10.8"
 sha1 = "0.10.6"
 md-5 = "0.10.6"
diff --git a/tvix/eval/src/builtins/to_xml.rs b/tvix/eval/src/builtins/to_xml.rs
index bb12cebfc9d0..10a6d8cdb358 100644
--- a/tvix/eval/src/builtins/to_xml.rs
+++ b/tvix/eval/src/builtins/to_xml.rs
@@ -3,59 +3,43 @@
 //! things in nixpkgs rely on.
 
 use bstr::ByteSlice;
+use std::borrow::Cow;
 use std::{io::Write, rc::Rc};
-use xml::writer::events::XmlEvent;
-use xml::writer::EmitterConfig;
-use xml::writer::EventWriter;
 
 use crate::{ErrorKind, Value};
 
 /// Recursively serialise a value to XML. The value *must* have been
 /// deep-forced before being passed to this function.
 pub fn value_to_xml<W: Write>(mut writer: W, value: &Value) -> Result<(), ErrorKind> {
-    let config = EmitterConfig {
-        perform_indent: true,
-        pad_self_closing: true,
-
-        // Nix uses single-quotes *only* in the document declaration,
-        // so we need to write it manually.
-        write_document_declaration: false,
-        ..Default::default()
-    };
-
     // Write a literal document declaration, using C++-Nix-style
     // single quotes.
     writeln!(writer, "<?xml version='1.0' encoding='utf-8'?>")?;
 
-    let mut writer = EventWriter::new_with_config(writer, config);
-
-    writer.write(XmlEvent::start_element("expr"))?;
-    value_variant_to_xml(&mut writer, value)?;
-    writer.write(XmlEvent::end_element())?;
+    let mut emitter = XmlEmitter::new(writer);
 
-    // Unwrap the writer to add the final newline that C++ Nix adds.
-    writeln!(writer.into_inner())?;
+    emitter.write_open_tag("expr", &[])?;
+    value_variant_to_xml(&mut emitter, value)?;
+    emitter.write_closing_tag("expr")?;
 
     Ok(())
 }
 
 fn write_typed_value<W: Write, V: ToString>(
-    w: &mut EventWriter<W>,
-    name: &str,
+    w: &mut XmlEmitter<W>,
+    name_unescaped: &str,
     value: V,
 ) -> Result<(), ErrorKind> {
-    w.write(XmlEvent::start_element(name).attr("value", &value.to_string()))?;
-    w.write(XmlEvent::end_element())?;
+    w.write_self_closing_tag(name_unescaped, &[("value", &value.to_string())])?;
     Ok(())
 }
 
-fn value_variant_to_xml<W: Write>(w: &mut EventWriter<W>, value: &Value) -> Result<(), ErrorKind> {
+fn value_variant_to_xml<W: Write>(w: &mut XmlEmitter<W>, value: &Value) -> Result<(), ErrorKind> {
     match value {
         Value::Thunk(t) => return value_variant_to_xml(w, &t.value()),
 
         Value::Null => {
-            w.write(XmlEvent::start_element("null"))?;
-            w.write(XmlEvent::end_element())
+            w.write_open_tag("null", &[])?;
+            w.write_closing_tag("null")?;
         }
 
         Value::Bool(b) => return write_typed_value(w, "bool", b),
@@ -65,50 +49,46 @@ fn value_variant_to_xml<W: Write>(w: &mut EventWriter<W>, value: &Value) -> Resu
         Value::Path(p) => return write_typed_value(w, "path", p.to_string_lossy()),
 
         Value::List(list) => {
-            w.write(XmlEvent::start_element("list"))?;
+            w.write_open_tag("list", &[])?;
 
             for elem in list.into_iter() {
                 value_variant_to_xml(w, elem)?;
             }
 
-            w.write(XmlEvent::end_element())
+            w.write_closing_tag("list")?;
         }
 
         Value::Attrs(attrs) => {
-            w.write(XmlEvent::start_element("attrs"))?;
+            w.write_open_tag("attrs", &[])?;
 
             for elem in attrs.iter() {
-                w.write(XmlEvent::start_element("attr").attr("name", &elem.0.to_str_lossy()))?;
+                w.write_open_tag("attr", &[("name", &elem.0.to_str_lossy())])?;
                 value_variant_to_xml(w, elem.1)?;
-                w.write(XmlEvent::end_element())?;
+                w.write_closing_tag("attr")?;
             }
 
-            w.write(XmlEvent::end_element())
+            w.write_closing_tag("attrs")?;
         }
 
         Value::Closure(c) => {
-            w.write(XmlEvent::start_element("function"))?;
+            w.write_open_tag("function", &[])?;
 
             match &c.lambda.formals {
                 Some(formals) => {
-                    let mut attrspat = XmlEvent::start_element("attrspat");
+                    let mut attrs: Vec<(&str, &str)> = Vec::with_capacity(2);
                     if formals.ellipsis {
-                        attrspat = attrspat.attr("ellipsis", "1");
+                        attrs.push(("ellipsis", "1"));
                     }
                     if let Some(ref name) = &formals.name {
-                        attrspat = attrspat.attr("name", name.as_str());
+                        attrs.push(("name", name.as_str()));
                     }
 
-                    w.write(attrspat)?;
-
+                    w.write_open_tag("attrspat", &attrs)?;
                     for arg in formals.arguments.iter() {
-                        w.write(
-                            XmlEvent::start_element("attr").attr("name", &arg.0.to_str_lossy()),
-                        )?;
-                        w.write(XmlEvent::end_element())?;
+                        w.write_self_closing_tag("attr", &[("name", &arg.0.to_str_lossy())])?;
                     }
 
-                    w.write(XmlEvent::end_element())?;
+                    w.write_closing_tag("attrspat")?;
                 }
                 None => {
                     // TODO(tazjin): tvix does not currently persist function
@@ -120,17 +100,16 @@ fn value_variant_to_xml<W: Write>(w: &mut EventWriter<W>, value: &Value) -> Resu
                     // If we don't want to persist the data, we can re-parse the
                     // AST from the spans of the lambda's bytecode and figure it
                     // out that way, but it needs some investigating.
-                    w.write(XmlEvent::start_element("varpat").attr("name", /* fake: */ "x"))?;
-                    w.write(XmlEvent::end_element())?;
+                    w.write_self_closing_tag("varpat", &[("name", /* fake: */ "x")])?;
                 }
             }
 
-            w.write(XmlEvent::end_element())
+            w.write_closing_tag("function")?;
         }
 
         Value::Builtin(_) => {
-            w.write(XmlEvent::start_element("unevaluated"))?;
-            w.write(XmlEvent::end_element())
+            w.write_open_tag("unevaluated", &[])?;
+            w.write_closing_tag("unevaluated")?;
         }
 
         Value::AttrNotFound
@@ -148,7 +127,174 @@ fn value_variant_to_xml<W: Write>(w: &mut EventWriter<W>, value: &Value) -> Resu
         Value::Catchable(_) => {
             panic!("tvix bug: value_to_xml() called on a value which had not been deep-forced")
         }
-    }?;
+    };
 
     Ok(())
 }
+
+/// A simple-stupid XML emitter, which implements only the subset needed for byte-by-byte compat with C++ nix’ `builtins.toXML`.
+struct XmlEmitter<W> {
+    /// The current indentation
+    cur_indent: usize,
+    writer: W,
+}
+
+impl<W: Write> XmlEmitter<W> {
+    pub fn new(writer: W) -> Self {
+        XmlEmitter {
+            cur_indent: 0,
+            writer,
+        }
+    }
+
+    /// Write an open tag with the given name (which is not escaped!)
+    /// and attributes (Keys are not escaped! Only attribute values are.)
+    pub fn write_open_tag(
+        &mut self,
+        name_unescaped: &str,
+        attrs: &[(&str, &str)],
+    ) -> std::io::Result<()> {
+        self.add_indent()?;
+        self.writer.write_all(b"<")?;
+        self.writer.write_all(name_unescaped.as_bytes())?;
+        self.write_attrs_escape_vals(attrs)?;
+        self.writer.write_all(b">\n")?;
+        self.cur_indent += 2;
+        Ok(())
+    }
+
+    /// Write a self-closing open tag with the given name (which is not escaped!)
+    /// and attributes (Keys are not escaped! Only attribute values are.)
+    pub fn write_self_closing_tag(
+        &mut self,
+        name_unescaped: &str,
+        attrs: &[(&str, &str)],
+    ) -> std::io::Result<()> {
+        self.add_indent()?;
+        self.writer.write_all(b"<")?;
+        self.writer.write_all(name_unescaped.as_bytes())?;
+        self.write_attrs_escape_vals(attrs)?;
+        self.writer.write_all(b" />\n")?;
+        Ok(())
+    }
+
+    /// Write a closing tag with the given name (which is not escaped!)
+    pub fn write_closing_tag(&mut self, name_unescaped: &str) -> std::io::Result<()> {
+        self.cur_indent -= 2;
+        self.add_indent()?;
+        self.writer.write_all(b"</")?;
+        self.writer.write_all(name_unescaped.as_bytes())?;
+        self.writer.write_all(b">\n")?;
+        Ok(())
+    }
+
+    #[inline]
+    fn add_indent(&mut self) -> std::io::Result<()> {
+        self.writer.write_all(&b" ".repeat(self.cur_indent))
+    }
+
+    /// Write an attribute list
+    fn write_attrs_escape_vals(&mut self, attrs: &[(&str, &str)]) -> std::io::Result<()> {
+        for (name, val) in attrs {
+            self.writer.write_all(b" ")?;
+            self.writer.write_all(name.as_bytes())?;
+            self.writer.write_all(br#"=""#)?;
+            self.writer
+                .write_all(Self::escape_attr_value(val).as_bytes())?;
+            self.writer.write_all(b"\"")?;
+        }
+        Ok(())
+    }
+
+    /// Escape the given attribute value, making sure we only actually clone the string if we needed to replace something.
+    fn escape_attr_value(s: &str) -> Cow<str> {
+        let mut last_escape: usize = 0;
+        let mut res: Cow<str> = Cow::Borrowed("");
+        // iterating via char_indices gives us the ability to index the original string slice at character boundaries
+        for (idx, c) in s.char_indices() {
+            match Self::should_escape_char(c) {
+                None => {}
+                Some(new) => {
+                    // add characters since the last escape we did
+                    res += &s[last_escape..idx];
+                    // add the escaped value
+                    res += new;
+                    last_escape = idx + 1;
+                }
+            }
+        }
+        // we did not need to escape anything, so borrow original string
+        if last_escape == 0 {
+            Cow::Borrowed(s)
+        } else {
+            // add the remaining characters
+            res += &s[last_escape..];
+            res
+        }
+    }
+
+    fn should_escape_char(c: char) -> Option<&'static str> {
+        match c {
+            '<' => Some("&lt;"),
+            '>' => Some("&gt;"),
+            '"' => Some("&quot;"),
+            '\'' => Some("&apos;"),
+            '&' => Some("&amp;"),
+            '\n' => Some("&#xA;"),
+            '\r' => Some("&#xD;"),
+            _ => None,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use bytes::buf::Writer;
+    use pretty_assertions::assert_eq;
+
+    use crate::builtins::to_xml::XmlEmitter;
+    use std::borrow::Cow;
+
+    #[test]
+    fn xml_gen() {
+        let mut buf = Vec::new();
+        let mut x = XmlEmitter::new(&mut buf);
+        x.write_open_tag("hello", &[("hi", "it’s me"), ("no", "<escape>")])
+            .unwrap();
+        x.write_self_closing_tag("self-closing", &[("tag", "yay")])
+            .unwrap();
+        x.write_closing_tag("hello").unwrap();
+
+        assert_eq!(
+            std::str::from_utf8(&buf).unwrap(),
+            r##"<hello hi="it’s me" no="&lt;escape&gt;">
+  <self-closing tag="yay" />
+</hello>
+"##
+        );
+    }
+
+    #[test]
+    fn xml_escape() {
+        match XmlEmitter::<Writer<Vec<u8>>>::escape_attr_value("ab<>c&de") {
+            Cow::Owned(s) => assert_eq!(s, "ab&lt;&gt;c&amp;de".to_string(), "escape stuff"),
+            Cow::Borrowed(s) => panic!("s should be owned {}", s),
+        }
+        match XmlEmitter::<Writer<Vec<u8>>>::escape_attr_value("") {
+            Cow::Borrowed(s) => assert_eq!(s, "", "empty escape is borrowed"),
+            Cow::Owned(s) => panic!("s should be borrowed {}", s),
+        }
+        match XmlEmitter::<Writer<Vec<u8>>>::escape_attr_value("hi!ŷbla") {
+            Cow::Borrowed(s) => assert_eq!(s, "hi!ŷbla", "no escape is borrowed"),
+            Cow::Owned(s) => panic!("s should be borrowed {}", s),
+        }
+        match XmlEmitter::<Writer<Vec<u8>>>::escape_attr_value("hi!<ŷ>bla") {
+            Cow::Owned(s) => assert_eq!(
+                s,
+                "hi!&lt;ŷ&gt;bla".to_string(),
+                "multi-byte chars are correctly used"
+            ),
+            Cow::Borrowed(s) => panic!("s should be owned {}", s),
+        }
+    }
+}
diff --git a/tvix/eval/src/errors.rs b/tvix/eval/src/errors.rs
index 652252dadfa0..a5f79c235f09 100644
--- a/tvix/eval/src/errors.rs
+++ b/tvix/eval/src/errors.rs
@@ -10,7 +10,6 @@ use std::{fmt::Debug, fmt::Display, num::ParseIntError};
 use codemap::{File, Span};
 use codemap_diagnostic::{ColorConfig, Diagnostic, Emitter, Level, SpanLabel, SpanStyle};
 use smol_str::SmolStr;
-use xml::writer::Error as XmlError;
 
 use crate::spans::ToSpan;
 use crate::value::{CoercionKind, NixString};
@@ -194,9 +193,6 @@ pub enum ErrorKind {
     /// Invalid UTF-8 was encoutered somewhere
     Utf8,
 
-    /// Errors while serialising to XML.
-    Xml(Rc<XmlError>),
-
     /// Variant for errors that bubble up to eval from other Tvix
     /// components.
     TvixError(Rc<dyn error::Error>),
@@ -248,7 +244,6 @@ impl error::Error for Error {
                 errors.first().map(|e| e as &dyn error::Error)
             }
             ErrorKind::IO { error, .. } => Some(error.as_ref()),
-            ErrorKind::Xml(error) => Some(error.as_ref()),
             ErrorKind::TvixError(error) => Some(error.as_ref()),
             _ => None,
         }
@@ -285,12 +280,6 @@ impl From<bstr::FromUtf8Error> for ErrorKind {
     }
 }
 
-impl From<XmlError> for ErrorKind {
-    fn from(err: XmlError) -> Self {
-        Self::Xml(Rc::new(err))
-    }
-}
-
 impl From<io::Error> for ErrorKind {
     fn from(e: io::Error) -> Self {
         ErrorKind::IO {
@@ -506,8 +495,6 @@ to a missing value in the attribute set(s) included via `with`."#,
                 write!(f, "Invalid UTF-8 in string")
             }
 
-            ErrorKind::Xml(error) => write!(f, "failed to serialise to XML: {error}"),
-
             ErrorKind::TvixError(inner_error) => {
                 write!(f, "{inner_error}")
             }
@@ -823,7 +810,6 @@ impl Error {
             | ErrorKind::JsonError(_)
             | ErrorKind::NotSerialisableToJson(_)
             | ErrorKind::FromTomlError(_)
-            | ErrorKind::Xml(_)
             | ErrorKind::Utf8
             | ErrorKind::TvixError(_)
             | ErrorKind::TvixBug { .. }
@@ -870,7 +856,6 @@ impl Error {
             ErrorKind::UnexpectedArgument { .. } => "E031",
             ErrorKind::RelativePathResolution(_) => "E032",
             ErrorKind::DivisionByZero => "E033",
-            ErrorKind::Xml(_) => "E034",
             ErrorKind::FromTomlError(_) => "E035",
             ErrorKind::NotSerialisableToJson(_) => "E036",
             ErrorKind::UnexpectedContext => "E037",
diff --git a/tvix/eval/src/tests/tvix_tests/eval-okay-toxml-empty.exp.xml b/tvix/eval/src/tests/tvix_tests/eval-okay-toxml-empty.exp.xml
new file mode 100644
index 000000000000..468972b2f819
--- /dev/null
+++ b/tvix/eval/src/tests/tvix_tests/eval-okay-toxml-empty.exp.xml
@@ -0,0 +1,5 @@
+<?xml version='1.0' encoding='utf-8'?>
+<expr>
+  <attrs>
+  </attrs>
+</expr>
diff --git a/tvix/eval/src/tests/tvix_tests/eval-okay-toxml-empty.nix b/tvix/eval/src/tests/tvix_tests/eval-okay-toxml-empty.nix
new file mode 100644
index 000000000000..ffcd4415b08f
--- /dev/null
+++ b/tvix/eval/src/tests/tvix_tests/eval-okay-toxml-empty.nix
@@ -0,0 +1 @@
+{ }
diff --git a/tvix/eval/src/tests/tvix_tests/eval-okay-toxml.exp b/tvix/eval/src/tests/tvix_tests/eval-okay-toxml.exp
new file mode 100644
index 000000000000..9ae16de526f1
--- /dev/null
+++ b/tvix/eval/src/tests/tvix_tests/eval-okay-toxml.exp
@@ -0,0 +1 @@
+"<?xml version='1.0' encoding='utf-8'?>\n<expr>\n  <attrs>\n    <attr name=\"&amp;-{\">\n      <string value=\";&amp;&quot;\" />\n    </attr>\n    <attr name=\"a\">\n      <string value=\"s\" />\n    </attr>\n  </attrs>\n</expr>\n"
diff --git a/tvix/eval/src/tests/tvix_tests/eval-okay-toxml.nix b/tvix/eval/src/tests/tvix_tests/eval-okay-toxml.nix
new file mode 100644
index 000000000000..7d074048ddec
--- /dev/null
+++ b/tvix/eval/src/tests/tvix_tests/eval-okay-toxml.nix
@@ -0,0 +1,2 @@
+# Check some corner cases regarding escaping.
+builtins.toXML { a = "s"; "&-{" = ";&\""; }