diff options
author | Profpatsch <mail@profpatsch.de> | 2024-05-20T13·50+0200 |
---|---|---|
committer | clbot <clbot@tvl.fyi> | 2024-05-22T10·32+0000 |
commit | 5b2ba0efa1b87aa514d665a7f64ada36617c720e (patch) | |
tree | 27a4bb08f20e997377cc1bc884ee16c38e203f86 | |
parent | e7be3422566b36e5bd3aeaaf7d47537dfd050a5c (diff) |
refactor(tvix/eval): rewrite xml emitter to be simple-stupid r/8160
In order to be compatible with the nix XML generator, it’s easier to generate the XML directly, instead of going through a library which we have to bend to do what we need. Removes dependency on `xml-rs`, which came with a full XML parser that we didn’t use. Only takes a tiny bit of code for the XML escaping, somewhat simplified. I add a little escaping value, to make sure we have the same behaviour as nix proper. Interestingly enough, we never need to escape XML attribute names, because the `builtins.toXML` format encodes user-defined values as attribute keys only. So we only escape attribute values. Fixes: https://b.tvl.fyi/issues/399 Change-Id: If4d407d324864b3bb9aa3160e2ec6889f7727127 Reviewed-on: https://cl.tvl.fyi/c/depot/+/11697 Tested-by: BuildkiteCI Reviewed-by: flokli <flokli@flokli.de> Autosubmit: Profpatsch <mail@profpatsch.de>
-rw-r--r-- | tvix/Cargo.lock | 7 | ||||
-rw-r--r-- | tvix/Cargo.nix | 16 | ||||
-rw-r--r-- | tvix/eval/Cargo.toml | 1 | ||||
-rw-r--r-- | tvix/eval/src/builtins/to_xml.rs | 246 | ||||
-rw-r--r-- | tvix/eval/src/errors.rs | 15 | ||||
-rw-r--r-- | tvix/eval/src/tests/tvix_tests/eval-okay-toxml-empty.exp.xml | 5 | ||||
-rw-r--r-- | tvix/eval/src/tests/tvix_tests/eval-okay-toxml-empty.nix | 1 | ||||
-rw-r--r-- | tvix/eval/src/tests/tvix_tests/eval-okay-toxml.exp | 1 | ||||
-rw-r--r-- | tvix/eval/src/tests/tvix_tests/eval-okay-toxml.nix | 2 |
9 files changed, 205 insertions, 89 deletions
diff --git a/tvix/Cargo.lock b/tvix/Cargo.lock index dc5298c45b5b..9ad121371264 100644 --- a/tvix/Cargo.lock +++ b/tvix/Cargo.lock @@ -4167,7 +4167,6 @@ dependencies = [ "test-strategy", "toml", "tvix-eval-builtin-macros", - "xml-rs", ] [[package]] @@ -4845,12 +4844,6 @@ dependencies = [ ] [[package]] -name = "xml-rs" -version = "0.8.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fcb9cbac069e033553e8bb871be2fbdffcab578eb25bd0f7c508cedc6dcd75a" - -[[package]] name = "xz2" version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" diff --git a/tvix/Cargo.nix b/tvix/Cargo.nix index f6c3108fa422..0bae0a6f20fa 100644 --- a/tvix/Cargo.nix +++ b/tvix/Cargo.nix @@ -13222,10 +13222,6 @@ rec { packageId = "tvix-eval-builtin-macros"; rename = "builtin-macros"; } - { - name = "xml-rs"; - packageId = "xml-rs"; - } ]; devDependencies = [ { @@ -16041,18 +16037,6 @@ rec { }; resolvedDefaultFeatures = [ "default" "unsupported" ]; }; - "xml-rs" = rec { - crateName = "xml-rs"; - version = "0.8.19"; - edition = "2021"; - crateBin = [ ]; - sha256 = "0nnpvk3fv32hgh7vs9gbg2swmzxx5yz73f4b7rak7q39q2x9rjqg"; - libName = "xml"; - authors = [ - "Vladimir Matveev <vmatveev@citrine.cc>" - ]; - - }; "xz2" = rec { crateName = "xz2"; version = "0.1.7"; diff --git a/tvix/eval/Cargo.toml b/tvix/eval/Cargo.toml index 677ce6ab85be..cc590791cc92 100644 --- a/tvix/eval/Cargo.toml +++ b/tvix/eval/Cargo.toml @@ -30,7 +30,6 @@ smol_str = "0.2.0" tabwriter = "1.2" test-strategy = { version = "0.2.1", optional = true } toml = "0.6.0" -xml-rs = "0.8.4" sha2 = "0.10.8" sha1 = "0.10.6" md-5 = "0.10.6" diff --git a/tvix/eval/src/builtins/to_xml.rs b/tvix/eval/src/builtins/to_xml.rs index bb12cebfc9d0..10a6d8cdb358 100644 --- a/tvix/eval/src/builtins/to_xml.rs +++ b/tvix/eval/src/builtins/to_xml.rs @@ -3,59 +3,43 @@ //! things in nixpkgs rely on. use bstr::ByteSlice; +use std::borrow::Cow; use std::{io::Write, rc::Rc}; -use xml::writer::events::XmlEvent; -use xml::writer::EmitterConfig; -use xml::writer::EventWriter; use crate::{ErrorKind, Value}; /// Recursively serialise a value to XML. The value *must* have been /// deep-forced before being passed to this function. pub fn value_to_xml<W: Write>(mut writer: W, value: &Value) -> Result<(), ErrorKind> { - let config = EmitterConfig { - perform_indent: true, - pad_self_closing: true, - - // Nix uses single-quotes *only* in the document declaration, - // so we need to write it manually. - write_document_declaration: false, - ..Default::default() - }; - // Write a literal document declaration, using C++-Nix-style // single quotes. writeln!(writer, "<?xml version='1.0' encoding='utf-8'?>")?; - let mut writer = EventWriter::new_with_config(writer, config); - - writer.write(XmlEvent::start_element("expr"))?; - value_variant_to_xml(&mut writer, value)?; - writer.write(XmlEvent::end_element())?; + let mut emitter = XmlEmitter::new(writer); - // Unwrap the writer to add the final newline that C++ Nix adds. - writeln!(writer.into_inner())?; + emitter.write_open_tag("expr", &[])?; + value_variant_to_xml(&mut emitter, value)?; + emitter.write_closing_tag("expr")?; Ok(()) } fn write_typed_value<W: Write, V: ToString>( - w: &mut EventWriter<W>, - name: &str, + w: &mut XmlEmitter<W>, + name_unescaped: &str, value: V, ) -> Result<(), ErrorKind> { - w.write(XmlEvent::start_element(name).attr("value", &value.to_string()))?; - w.write(XmlEvent::end_element())?; + w.write_self_closing_tag(name_unescaped, &[("value", &value.to_string())])?; Ok(()) } -fn value_variant_to_xml<W: Write>(w: &mut EventWriter<W>, value: &Value) -> Result<(), ErrorKind> { +fn value_variant_to_xml<W: Write>(w: &mut XmlEmitter<W>, value: &Value) -> Result<(), ErrorKind> { match value { Value::Thunk(t) => return value_variant_to_xml(w, &t.value()), Value::Null => { - w.write(XmlEvent::start_element("null"))?; - w.write(XmlEvent::end_element()) + w.write_open_tag("null", &[])?; + w.write_closing_tag("null")?; } Value::Bool(b) => return write_typed_value(w, "bool", b), @@ -65,50 +49,46 @@ fn value_variant_to_xml<W: Write>(w: &mut EventWriter<W>, value: &Value) -> Resu Value::Path(p) => return write_typed_value(w, "path", p.to_string_lossy()), Value::List(list) => { - w.write(XmlEvent::start_element("list"))?; + w.write_open_tag("list", &[])?; for elem in list.into_iter() { value_variant_to_xml(w, elem)?; } - w.write(XmlEvent::end_element()) + w.write_closing_tag("list")?; } Value::Attrs(attrs) => { - w.write(XmlEvent::start_element("attrs"))?; + w.write_open_tag("attrs", &[])?; for elem in attrs.iter() { - w.write(XmlEvent::start_element("attr").attr("name", &elem.0.to_str_lossy()))?; + w.write_open_tag("attr", &[("name", &elem.0.to_str_lossy())])?; value_variant_to_xml(w, elem.1)?; - w.write(XmlEvent::end_element())?; + w.write_closing_tag("attr")?; } - w.write(XmlEvent::end_element()) + w.write_closing_tag("attrs")?; } Value::Closure(c) => { - w.write(XmlEvent::start_element("function"))?; + w.write_open_tag("function", &[])?; match &c.lambda.formals { Some(formals) => { - let mut attrspat = XmlEvent::start_element("attrspat"); + let mut attrs: Vec<(&str, &str)> = Vec::with_capacity(2); if formals.ellipsis { - attrspat = attrspat.attr("ellipsis", "1"); + attrs.push(("ellipsis", "1")); } if let Some(ref name) = &formals.name { - attrspat = attrspat.attr("name", name.as_str()); + attrs.push(("name", name.as_str())); } - w.write(attrspat)?; - + w.write_open_tag("attrspat", &attrs)?; for arg in formals.arguments.iter() { - w.write( - XmlEvent::start_element("attr").attr("name", &arg.0.to_str_lossy()), - )?; - w.write(XmlEvent::end_element())?; + w.write_self_closing_tag("attr", &[("name", &arg.0.to_str_lossy())])?; } - w.write(XmlEvent::end_element())?; + w.write_closing_tag("attrspat")?; } None => { // TODO(tazjin): tvix does not currently persist function @@ -120,17 +100,16 @@ fn value_variant_to_xml<W: Write>(w: &mut EventWriter<W>, value: &Value) -> Resu // If we don't want to persist the data, we can re-parse the // AST from the spans of the lambda's bytecode and figure it // out that way, but it needs some investigating. - w.write(XmlEvent::start_element("varpat").attr("name", /* fake: */ "x"))?; - w.write(XmlEvent::end_element())?; + w.write_self_closing_tag("varpat", &[("name", /* fake: */ "x")])?; } } - w.write(XmlEvent::end_element()) + w.write_closing_tag("function")?; } Value::Builtin(_) => { - w.write(XmlEvent::start_element("unevaluated"))?; - w.write(XmlEvent::end_element()) + w.write_open_tag("unevaluated", &[])?; + w.write_closing_tag("unevaluated")?; } Value::AttrNotFound @@ -148,7 +127,174 @@ fn value_variant_to_xml<W: Write>(w: &mut EventWriter<W>, value: &Value) -> Resu Value::Catchable(_) => { panic!("tvix bug: value_to_xml() called on a value which had not been deep-forced") } - }?; + }; Ok(()) } + +/// A simple-stupid XML emitter, which implements only the subset needed for byte-by-byte compat with C++ nix’ `builtins.toXML`. +struct XmlEmitter<W> { + /// The current indentation + cur_indent: usize, + writer: W, +} + +impl<W: Write> XmlEmitter<W> { + pub fn new(writer: W) -> Self { + XmlEmitter { + cur_indent: 0, + writer, + } + } + + /// Write an open tag with the given name (which is not escaped!) + /// and attributes (Keys are not escaped! Only attribute values are.) + pub fn write_open_tag( + &mut self, + name_unescaped: &str, + attrs: &[(&str, &str)], + ) -> std::io::Result<()> { + self.add_indent()?; + self.writer.write_all(b"<")?; + self.writer.write_all(name_unescaped.as_bytes())?; + self.write_attrs_escape_vals(attrs)?; + self.writer.write_all(b">\n")?; + self.cur_indent += 2; + Ok(()) + } + + /// Write a self-closing open tag with the given name (which is not escaped!) + /// and attributes (Keys are not escaped! Only attribute values are.) + pub fn write_self_closing_tag( + &mut self, + name_unescaped: &str, + attrs: &[(&str, &str)], + ) -> std::io::Result<()> { + self.add_indent()?; + self.writer.write_all(b"<")?; + self.writer.write_all(name_unescaped.as_bytes())?; + self.write_attrs_escape_vals(attrs)?; + self.writer.write_all(b" />\n")?; + Ok(()) + } + + /// Write a closing tag with the given name (which is not escaped!) + pub fn write_closing_tag(&mut self, name_unescaped: &str) -> std::io::Result<()> { + self.cur_indent -= 2; + self.add_indent()?; + self.writer.write_all(b"</")?; + self.writer.write_all(name_unescaped.as_bytes())?; + self.writer.write_all(b">\n")?; + Ok(()) + } + + #[inline] + fn add_indent(&mut self) -> std::io::Result<()> { + self.writer.write_all(&b" ".repeat(self.cur_indent)) + } + + /// Write an attribute list + fn write_attrs_escape_vals(&mut self, attrs: &[(&str, &str)]) -> std::io::Result<()> { + for (name, val) in attrs { + self.writer.write_all(b" ")?; + self.writer.write_all(name.as_bytes())?; + self.writer.write_all(br#"=""#)?; + self.writer + .write_all(Self::escape_attr_value(val).as_bytes())?; + self.writer.write_all(b"\"")?; + } + Ok(()) + } + + /// Escape the given attribute value, making sure we only actually clone the string if we needed to replace something. + fn escape_attr_value(s: &str) -> Cow<str> { + let mut last_escape: usize = 0; + let mut res: Cow<str> = Cow::Borrowed(""); + // iterating via char_indices gives us the ability to index the original string slice at character boundaries + for (idx, c) in s.char_indices() { + match Self::should_escape_char(c) { + None => {} + Some(new) => { + // add characters since the last escape we did + res += &s[last_escape..idx]; + // add the escaped value + res += new; + last_escape = idx + 1; + } + } + } + // we did not need to escape anything, so borrow original string + if last_escape == 0 { + Cow::Borrowed(s) + } else { + // add the remaining characters + res += &s[last_escape..]; + res + } + } + + fn should_escape_char(c: char) -> Option<&'static str> { + match c { + '<' => Some("<"), + '>' => Some(">"), + '"' => Some("""), + '\'' => Some("'"), + '&' => Some("&"), + '\n' => Some("
"), + '\r' => Some("
"), + _ => None, + } + } +} + +#[cfg(test)] +mod tests { + use bytes::buf::Writer; + use pretty_assertions::assert_eq; + + use crate::builtins::to_xml::XmlEmitter; + use std::borrow::Cow; + + #[test] + fn xml_gen() { + let mut buf = Vec::new(); + let mut x = XmlEmitter::new(&mut buf); + x.write_open_tag("hello", &[("hi", "it’s me"), ("no", "<escape>")]) + .unwrap(); + x.write_self_closing_tag("self-closing", &[("tag", "yay")]) + .unwrap(); + x.write_closing_tag("hello").unwrap(); + + assert_eq!( + std::str::from_utf8(&buf).unwrap(), + r##"<hello hi="it’s me" no="<escape>"> + <self-closing tag="yay" /> +</hello> +"## + ); + } + + #[test] + fn xml_escape() { + match XmlEmitter::<Writer<Vec<u8>>>::escape_attr_value("ab<>c&de") { + Cow::Owned(s) => assert_eq!(s, "ab<>c&de".to_string(), "escape stuff"), + Cow::Borrowed(s) => panic!("s should be owned {}", s), + } + match XmlEmitter::<Writer<Vec<u8>>>::escape_attr_value("") { + Cow::Borrowed(s) => assert_eq!(s, "", "empty escape is borrowed"), + Cow::Owned(s) => panic!("s should be borrowed {}", s), + } + match XmlEmitter::<Writer<Vec<u8>>>::escape_attr_value("hi!ŷbla") { + Cow::Borrowed(s) => assert_eq!(s, "hi!ŷbla", "no escape is borrowed"), + Cow::Owned(s) => panic!("s should be borrowed {}", s), + } + match XmlEmitter::<Writer<Vec<u8>>>::escape_attr_value("hi!<ŷ>bla") { + Cow::Owned(s) => assert_eq!( + s, + "hi!<ŷ>bla".to_string(), + "multi-byte chars are correctly used" + ), + Cow::Borrowed(s) => panic!("s should be owned {}", s), + } + } +} diff --git a/tvix/eval/src/errors.rs b/tvix/eval/src/errors.rs index 652252dadfa0..a5f79c235f09 100644 --- a/tvix/eval/src/errors.rs +++ b/tvix/eval/src/errors.rs @@ -10,7 +10,6 @@ use std::{fmt::Debug, fmt::Display, num::ParseIntError}; use codemap::{File, Span}; use codemap_diagnostic::{ColorConfig, Diagnostic, Emitter, Level, SpanLabel, SpanStyle}; use smol_str::SmolStr; -use xml::writer::Error as XmlError; use crate::spans::ToSpan; use crate::value::{CoercionKind, NixString}; @@ -194,9 +193,6 @@ pub enum ErrorKind { /// Invalid UTF-8 was encoutered somewhere Utf8, - /// Errors while serialising to XML. - Xml(Rc<XmlError>), - /// Variant for errors that bubble up to eval from other Tvix /// components. TvixError(Rc<dyn error::Error>), @@ -248,7 +244,6 @@ impl error::Error for Error { errors.first().map(|e| e as &dyn error::Error) } ErrorKind::IO { error, .. } => Some(error.as_ref()), - ErrorKind::Xml(error) => Some(error.as_ref()), ErrorKind::TvixError(error) => Some(error.as_ref()), _ => None, } @@ -285,12 +280,6 @@ impl From<bstr::FromUtf8Error> for ErrorKind { } } -impl From<XmlError> for ErrorKind { - fn from(err: XmlError) -> Self { - Self::Xml(Rc::new(err)) - } -} - impl From<io::Error> for ErrorKind { fn from(e: io::Error) -> Self { ErrorKind::IO { @@ -506,8 +495,6 @@ to a missing value in the attribute set(s) included via `with`."#, write!(f, "Invalid UTF-8 in string") } - ErrorKind::Xml(error) => write!(f, "failed to serialise to XML: {error}"), - ErrorKind::TvixError(inner_error) => { write!(f, "{inner_error}") } @@ -823,7 +810,6 @@ impl Error { | ErrorKind::JsonError(_) | ErrorKind::NotSerialisableToJson(_) | ErrorKind::FromTomlError(_) - | ErrorKind::Xml(_) | ErrorKind::Utf8 | ErrorKind::TvixError(_) | ErrorKind::TvixBug { .. } @@ -870,7 +856,6 @@ impl Error { ErrorKind::UnexpectedArgument { .. } => "E031", ErrorKind::RelativePathResolution(_) => "E032", ErrorKind::DivisionByZero => "E033", - ErrorKind::Xml(_) => "E034", ErrorKind::FromTomlError(_) => "E035", ErrorKind::NotSerialisableToJson(_) => "E036", ErrorKind::UnexpectedContext => "E037", diff --git a/tvix/eval/src/tests/tvix_tests/eval-okay-toxml-empty.exp.xml b/tvix/eval/src/tests/tvix_tests/eval-okay-toxml-empty.exp.xml new file mode 100644 index 000000000000..468972b2f819 --- /dev/null +++ b/tvix/eval/src/tests/tvix_tests/eval-okay-toxml-empty.exp.xml @@ -0,0 +1,5 @@ +<?xml version='1.0' encoding='utf-8'?> +<expr> + <attrs> + </attrs> +</expr> diff --git a/tvix/eval/src/tests/tvix_tests/eval-okay-toxml-empty.nix b/tvix/eval/src/tests/tvix_tests/eval-okay-toxml-empty.nix new file mode 100644 index 000000000000..ffcd4415b08f --- /dev/null +++ b/tvix/eval/src/tests/tvix_tests/eval-okay-toxml-empty.nix @@ -0,0 +1 @@ +{ } diff --git a/tvix/eval/src/tests/tvix_tests/eval-okay-toxml.exp b/tvix/eval/src/tests/tvix_tests/eval-okay-toxml.exp new file mode 100644 index 000000000000..9ae16de526f1 --- /dev/null +++ b/tvix/eval/src/tests/tvix_tests/eval-okay-toxml.exp @@ -0,0 +1 @@ +"<?xml version='1.0' encoding='utf-8'?>\n<expr>\n <attrs>\n <attr name=\"&-{\">\n <string value=\";&"\" />\n </attr>\n <attr name=\"a\">\n <string value=\"s\" />\n </attr>\n </attrs>\n</expr>\n" diff --git a/tvix/eval/src/tests/tvix_tests/eval-okay-toxml.nix b/tvix/eval/src/tests/tvix_tests/eval-okay-toxml.nix new file mode 100644 index 000000000000..7d074048ddec --- /dev/null +++ b/tvix/eval/src/tests/tvix_tests/eval-okay-toxml.nix @@ -0,0 +1,2 @@ +# Check some corner cases regarding escaping. +builtins.toXML { a = "s"; "&-{" = ";&\""; } |