diff options
author | Profpatsch <mail@profpatsch.de> | 2024-05-20T13·50+0200 |
---|---|---|
committer | clbot <clbot@tvl.fyi> | 2024-05-22T10·32+0000 |
commit | 5b2ba0efa1b87aa514d665a7f64ada36617c720e (patch) | |
tree | 27a4bb08f20e997377cc1bc884ee16c38e203f86 /tvix/eval/src/builtins/to_xml.rs | |
parent | e7be3422566b36e5bd3aeaaf7d47537dfd050a5c (diff) |
refactor(tvix/eval): rewrite xml emitter to be simple-stupid r/8160
In order to be compatible with the nix XML generator, it’s easier to generate the XML directly, instead of going through a library which we have to bend to do what we need. Removes dependency on `xml-rs`, which came with a full XML parser that we didn’t use. Only takes a tiny bit of code for the XML escaping, somewhat simplified. I add a little escaping value, to make sure we have the same behaviour as nix proper. Interestingly enough, we never need to escape XML attribute names, because the `builtins.toXML` format encodes user-defined values as attribute keys only. So we only escape attribute values. Fixes: https://b.tvl.fyi/issues/399 Change-Id: If4d407d324864b3bb9aa3160e2ec6889f7727127 Reviewed-on: https://cl.tvl.fyi/c/depot/+/11697 Tested-by: BuildkiteCI Reviewed-by: flokli <flokli@flokli.de> Autosubmit: Profpatsch <mail@profpatsch.de>
Diffstat (limited to 'tvix/eval/src/builtins/to_xml.rs')
-rw-r--r-- | tvix/eval/src/builtins/to_xml.rs | 246 |
1 files changed, 196 insertions, 50 deletions
diff --git a/tvix/eval/src/builtins/to_xml.rs b/tvix/eval/src/builtins/to_xml.rs index bb12cebfc9d0..10a6d8cdb358 100644 --- a/tvix/eval/src/builtins/to_xml.rs +++ b/tvix/eval/src/builtins/to_xml.rs @@ -3,59 +3,43 @@ //! things in nixpkgs rely on. use bstr::ByteSlice; +use std::borrow::Cow; use std::{io::Write, rc::Rc}; -use xml::writer::events::XmlEvent; -use xml::writer::EmitterConfig; -use xml::writer::EventWriter; use crate::{ErrorKind, Value}; /// Recursively serialise a value to XML. The value *must* have been /// deep-forced before being passed to this function. pub fn value_to_xml<W: Write>(mut writer: W, value: &Value) -> Result<(), ErrorKind> { - let config = EmitterConfig { - perform_indent: true, - pad_self_closing: true, - - // Nix uses single-quotes *only* in the document declaration, - // so we need to write it manually. - write_document_declaration: false, - ..Default::default() - }; - // Write a literal document declaration, using C++-Nix-style // single quotes. writeln!(writer, "<?xml version='1.0' encoding='utf-8'?>")?; - let mut writer = EventWriter::new_with_config(writer, config); - - writer.write(XmlEvent::start_element("expr"))?; - value_variant_to_xml(&mut writer, value)?; - writer.write(XmlEvent::end_element())?; + let mut emitter = XmlEmitter::new(writer); - // Unwrap the writer to add the final newline that C++ Nix adds. - writeln!(writer.into_inner())?; + emitter.write_open_tag("expr", &[])?; + value_variant_to_xml(&mut emitter, value)?; + emitter.write_closing_tag("expr")?; Ok(()) } fn write_typed_value<W: Write, V: ToString>( - w: &mut EventWriter<W>, - name: &str, + w: &mut XmlEmitter<W>, + name_unescaped: &str, value: V, ) -> Result<(), ErrorKind> { - w.write(XmlEvent::start_element(name).attr("value", &value.to_string()))?; - w.write(XmlEvent::end_element())?; + w.write_self_closing_tag(name_unescaped, &[("value", &value.to_string())])?; Ok(()) } -fn value_variant_to_xml<W: Write>(w: &mut EventWriter<W>, value: &Value) -> Result<(), ErrorKind> { +fn value_variant_to_xml<W: Write>(w: &mut XmlEmitter<W>, value: &Value) -> Result<(), ErrorKind> { match value { Value::Thunk(t) => return value_variant_to_xml(w, &t.value()), Value::Null => { - w.write(XmlEvent::start_element("null"))?; - w.write(XmlEvent::end_element()) + w.write_open_tag("null", &[])?; + w.write_closing_tag("null")?; } Value::Bool(b) => return write_typed_value(w, "bool", b), @@ -65,50 +49,46 @@ fn value_variant_to_xml<W: Write>(w: &mut EventWriter<W>, value: &Value) -> Resu Value::Path(p) => return write_typed_value(w, "path", p.to_string_lossy()), Value::List(list) => { - w.write(XmlEvent::start_element("list"))?; + w.write_open_tag("list", &[])?; for elem in list.into_iter() { value_variant_to_xml(w, elem)?; } - w.write(XmlEvent::end_element()) + w.write_closing_tag("list")?; } Value::Attrs(attrs) => { - w.write(XmlEvent::start_element("attrs"))?; + w.write_open_tag("attrs", &[])?; for elem in attrs.iter() { - w.write(XmlEvent::start_element("attr").attr("name", &elem.0.to_str_lossy()))?; + w.write_open_tag("attr", &[("name", &elem.0.to_str_lossy())])?; value_variant_to_xml(w, elem.1)?; - w.write(XmlEvent::end_element())?; + w.write_closing_tag("attr")?; } - w.write(XmlEvent::end_element()) + w.write_closing_tag("attrs")?; } Value::Closure(c) => { - w.write(XmlEvent::start_element("function"))?; + w.write_open_tag("function", &[])?; match &c.lambda.formals { Some(formals) => { - let mut attrspat = XmlEvent::start_element("attrspat"); + let mut attrs: Vec<(&str, &str)> = Vec::with_capacity(2); if formals.ellipsis { - attrspat = attrspat.attr("ellipsis", "1"); + attrs.push(("ellipsis", "1")); } if let Some(ref name) = &formals.name { - attrspat = attrspat.attr("name", name.as_str()); + attrs.push(("name", name.as_str())); } - w.write(attrspat)?; - + w.write_open_tag("attrspat", &attrs)?; for arg in formals.arguments.iter() { - w.write( - XmlEvent::start_element("attr").attr("name", &arg.0.to_str_lossy()), - )?; - w.write(XmlEvent::end_element())?; + w.write_self_closing_tag("attr", &[("name", &arg.0.to_str_lossy())])?; } - w.write(XmlEvent::end_element())?; + w.write_closing_tag("attrspat")?; } None => { // TODO(tazjin): tvix does not currently persist function @@ -120,17 +100,16 @@ fn value_variant_to_xml<W: Write>(w: &mut EventWriter<W>, value: &Value) -> Resu // If we don't want to persist the data, we can re-parse the // AST from the spans of the lambda's bytecode and figure it // out that way, but it needs some investigating. - w.write(XmlEvent::start_element("varpat").attr("name", /* fake: */ "x"))?; - w.write(XmlEvent::end_element())?; + w.write_self_closing_tag("varpat", &[("name", /* fake: */ "x")])?; } } - w.write(XmlEvent::end_element()) + w.write_closing_tag("function")?; } Value::Builtin(_) => { - w.write(XmlEvent::start_element("unevaluated"))?; - w.write(XmlEvent::end_element()) + w.write_open_tag("unevaluated", &[])?; + w.write_closing_tag("unevaluated")?; } Value::AttrNotFound @@ -148,7 +127,174 @@ fn value_variant_to_xml<W: Write>(w: &mut EventWriter<W>, value: &Value) -> Resu Value::Catchable(_) => { panic!("tvix bug: value_to_xml() called on a value which had not been deep-forced") } - }?; + }; Ok(()) } + +/// A simple-stupid XML emitter, which implements only the subset needed for byte-by-byte compat with C++ nix’ `builtins.toXML`. +struct XmlEmitter<W> { + /// The current indentation + cur_indent: usize, + writer: W, +} + +impl<W: Write> XmlEmitter<W> { + pub fn new(writer: W) -> Self { + XmlEmitter { + cur_indent: 0, + writer, + } + } + + /// Write an open tag with the given name (which is not escaped!) + /// and attributes (Keys are not escaped! Only attribute values are.) + pub fn write_open_tag( + &mut self, + name_unescaped: &str, + attrs: &[(&str, &str)], + ) -> std::io::Result<()> { + self.add_indent()?; + self.writer.write_all(b"<")?; + self.writer.write_all(name_unescaped.as_bytes())?; + self.write_attrs_escape_vals(attrs)?; + self.writer.write_all(b">\n")?; + self.cur_indent += 2; + Ok(()) + } + + /// Write a self-closing open tag with the given name (which is not escaped!) + /// and attributes (Keys are not escaped! Only attribute values are.) + pub fn write_self_closing_tag( + &mut self, + name_unescaped: &str, + attrs: &[(&str, &str)], + ) -> std::io::Result<()> { + self.add_indent()?; + self.writer.write_all(b"<")?; + self.writer.write_all(name_unescaped.as_bytes())?; + self.write_attrs_escape_vals(attrs)?; + self.writer.write_all(b" />\n")?; + Ok(()) + } + + /// Write a closing tag with the given name (which is not escaped!) + pub fn write_closing_tag(&mut self, name_unescaped: &str) -> std::io::Result<()> { + self.cur_indent -= 2; + self.add_indent()?; + self.writer.write_all(b"</")?; + self.writer.write_all(name_unescaped.as_bytes())?; + self.writer.write_all(b">\n")?; + Ok(()) + } + + #[inline] + fn add_indent(&mut self) -> std::io::Result<()> { + self.writer.write_all(&b" ".repeat(self.cur_indent)) + } + + /// Write an attribute list + fn write_attrs_escape_vals(&mut self, attrs: &[(&str, &str)]) -> std::io::Result<()> { + for (name, val) in attrs { + self.writer.write_all(b" ")?; + self.writer.write_all(name.as_bytes())?; + self.writer.write_all(br#"=""#)?; + self.writer + .write_all(Self::escape_attr_value(val).as_bytes())?; + self.writer.write_all(b"\"")?; + } + Ok(()) + } + + /// Escape the given attribute value, making sure we only actually clone the string if we needed to replace something. + fn escape_attr_value(s: &str) -> Cow<str> { + let mut last_escape: usize = 0; + let mut res: Cow<str> = Cow::Borrowed(""); + // iterating via char_indices gives us the ability to index the original string slice at character boundaries + for (idx, c) in s.char_indices() { + match Self::should_escape_char(c) { + None => {} + Some(new) => { + // add characters since the last escape we did + res += &s[last_escape..idx]; + // add the escaped value + res += new; + last_escape = idx + 1; + } + } + } + // we did not need to escape anything, so borrow original string + if last_escape == 0 { + Cow::Borrowed(s) + } else { + // add the remaining characters + res += &s[last_escape..]; + res + } + } + + fn should_escape_char(c: char) -> Option<&'static str> { + match c { + '<' => Some("<"), + '>' => Some(">"), + '"' => Some("""), + '\'' => Some("'"), + '&' => Some("&"), + '\n' => Some("
"), + '\r' => Some("
"), + _ => None, + } + } +} + +#[cfg(test)] +mod tests { + use bytes::buf::Writer; + use pretty_assertions::assert_eq; + + use crate::builtins::to_xml::XmlEmitter; + use std::borrow::Cow; + + #[test] + fn xml_gen() { + let mut buf = Vec::new(); + let mut x = XmlEmitter::new(&mut buf); + x.write_open_tag("hello", &[("hi", "it’s me"), ("no", "<escape>")]) + .unwrap(); + x.write_self_closing_tag("self-closing", &[("tag", "yay")]) + .unwrap(); + x.write_closing_tag("hello").unwrap(); + + assert_eq!( + std::str::from_utf8(&buf).unwrap(), + r##"<hello hi="it’s me" no="<escape>"> + <self-closing tag="yay" /> +</hello> +"## + ); + } + + #[test] + fn xml_escape() { + match XmlEmitter::<Writer<Vec<u8>>>::escape_attr_value("ab<>c&de") { + Cow::Owned(s) => assert_eq!(s, "ab<>c&de".to_string(), "escape stuff"), + Cow::Borrowed(s) => panic!("s should be owned {}", s), + } + match XmlEmitter::<Writer<Vec<u8>>>::escape_attr_value("") { + Cow::Borrowed(s) => assert_eq!(s, "", "empty escape is borrowed"), + Cow::Owned(s) => panic!("s should be borrowed {}", s), + } + match XmlEmitter::<Writer<Vec<u8>>>::escape_attr_value("hi!ŷbla") { + Cow::Borrowed(s) => assert_eq!(s, "hi!ŷbla", "no escape is borrowed"), + Cow::Owned(s) => panic!("s should be borrowed {}", s), + } + match XmlEmitter::<Writer<Vec<u8>>>::escape_attr_value("hi!<ŷ>bla") { + Cow::Owned(s) => assert_eq!( + s, + "hi!<ŷ>bla".to_string(), + "multi-byte chars are correctly used" + ), + Cow::Borrowed(s) => panic!("s should be owned {}", s), + } + } +} |