about summary refs log tree commit diff
path: root/tools/cheddar/src/lib.rs
//! This file implements the rendering logic of cheddar with public
//! functions for syntax-highlighting code and for turning Markdown
//! into HTML with TVL extensions.
use comrak::arena_tree::Node;
use comrak::nodes::{Ast, AstNode, NodeCodeBlock, NodeHtmlBlock, NodeValue};
use comrak::{format_html, parse_document, Arena, ComrakOptions};
use lazy_static::lazy_static;
use regex::Regex;
use std::cell::RefCell;
use std::collections::HashMap;
use std::env;
use std::ffi::OsStr;
use std::io;
use std::io::BufRead;
use std::io::Write;
use std::path::Path;
use syntect::dumps::from_binary;
use syntect::easy::HighlightLines;
use syntect::highlighting::{Theme, ThemeSet};
use syntect::parsing::{SyntaxReference, SyntaxSet};
use syntect::util::LinesWithEndings;

use syntect::html::{
    append_highlighted_html_for_styled_line, start_highlighted_html_snippet, IncludeBackground,
};

#[cfg(test)]
mod tests;

lazy_static! {
    // Load syntaxes lazily. Initialisation might not be required in
    // the case of Markdown rendering (if there's no code blocks
    // within the document).
    //
    // Note that the syntax set is included from the path pointed to
    // by the BAT_SYNTAXES environment variable at compile time. This
    // variable is populated by Nix and points to TVL's syntax set.
    static ref SYNTAXES: SyntaxSet = from_binary(include_bytes!(env!("BAT_SYNTAXES")));
    pub static ref THEMES: ThemeSet = ThemeSet::load_defaults();

    // Configure Comrak's Markdown rendering with all the bells &
    // whistles!
    static ref MD_OPTS: ComrakOptions = {
        let mut options = ComrakOptions::default();

        // Enable non-standard Markdown features:
        options.extension.strikethrough = true;
        options.extension.tagfilter = true;
        options.extension.table = true;
        options.extension.autolink = true;
        options.extension.tasklist = true;
        options.extension.header_ids = Some(String::new()); // yyeeesss!
        options.extension.footnotes = true;
        options.extension.description_lists = true;
        options.extension.front_matter_delimiter = Some("---".to_owned());

        // Required for tagfilter
        options.render.unsafe_ = true;

        options
    };

    // Configures a map of specific filenames to languages, for cases
    // where the detection by extension or other heuristics fails.
    static ref FILENAME_OVERRIDES: HashMap<&'static str, &'static str> = {
        let mut map = HashMap::new();
        // rules.pl is the canonical name of the submit rule file in
        // Gerrit, which is written in Prolog.
        map.insert("rules.pl", "Prolog");
        map
    };

    // Supported shortlinks in Markdown files.
    //
    // Currently only bug links (e.g. b/123) and CL links (e.g.
    // cl/345) are supported. Coincidentally these have the same
    // format, which makes the initial implementation easy.
    static ref SHORTLINK: Regex = Regex::new("\\b(?P<type>b|cl)/(?P<dest>\\d+)\\b").unwrap();
}

// HTML fragment used when rendering inline blocks in Markdown documents.
// Emulates the GitHub style (subtle background hue and padding).
const BLOCK_PRE: &str = "<pre style=\"background-color:#f6f8fa;padding:16px;\">\n";

fn should_continue(res: &io::Result<usize>) -> bool {
    match *res {
        Ok(n) => n > 0,
        Err(_) => false,
    }
}

// This function is taken from the Comrak documentation.
fn iter_nodes<'a, F>(node: &'a AstNode<'a>, f: &F)
where
    F: Fn(&'a AstNode<'a>),
{
    f(node);
    for c in node.children() {
        iter_nodes(c, f);
    }
}

// Many of the syntaxes in the syntax list have random capitalisations, which
// means that name matching for the block info of a code block in HTML fails.
//
// Instead, try finding a syntax match by comparing case insensitively (for
// ASCII characters, anyways).
fn find_syntax_case_insensitive(info: &str) -> Option<&'static SyntaxReference> {
    // TODO(tazjin): memoize this lookup
    SYNTAXES
        .syntaxes()
        .iter()
        .rev()
        .find(|&s| info.eq_ignore_ascii_case(&s.name))
}

// Replaces code-block inside of a Markdown AST with HTML blocks rendered by
// syntect. This enables static (i.e. no JavaScript) syntax highlighting, even
// of complex languages.
fn highlight_code_block(code_block: &NodeCodeBlock) -> NodeValue {
    let theme = &THEMES.themes["InspiredGitHub"];
    let info = String::from_utf8_lossy(&code_block.info);

    let syntax = find_syntax_case_insensitive(&info)
        .or_else(|| SYNTAXES.find_syntax_by_extension(&info))
        .unwrap_or_else(|| SYNTAXES.find_syntax_plain_text());

    let code = String::from_utf8_lossy(&code_block.literal);

    let rendered = {
        // Write the block preamble manually to get exactly the
        // desired layout:
        let mut hl = HighlightLines::new(syntax, theme);
        let mut buf = BLOCK_PRE.to_string();

        for line in LinesWithEndings::from(&code) {
            let regions = hl.highlight(line, &SYNTAXES);
            append_highlighted_html_for_styled_line(&regions[..], IncludeBackground::No, &mut buf);
        }

        buf.push_str("</pre>");
        buf
    };

    let mut block = NodeHtmlBlock::default();
    block.literal = rendered.into_bytes();

    NodeValue::HtmlBlock(block)
}

// Supported callout elements (which each have their own distinct rendering):
enum Callout {
    Todo,
    Warning,
    Question,
    Tip,
}

// Determine whether the first child of the supplied node contains a text that
// should cause a callout section to be rendered.
fn has_callout<'a>(node: &Node<'a, RefCell<Ast>>) -> Option<Callout> {
    match node.first_child().map(|c| c.data.borrow()) {
        Some(child) => match &child.value {
            NodeValue::Text(text) => {
                if text.starts_with(b"TODO") {
                    return Some(Callout::Todo);
                } else if text.starts_with(b"WARNING") {
                    return Some(Callout::Warning);
                } else if text.starts_with(b"QUESTION") {
                    return Some(Callout::Question);
                } else if text.starts_with(b"TIP") {
                    return Some(Callout::Tip);
                }

                None
            }
            _ => None,
        },
        _ => None,
    }
}

// Replace instances of known shortlinks in the input document with
// Markdown syntax for a highlighted link.
fn linkify_shortlinks<'a>(input: &'a str) -> std::borrow::Cow<'a, str> {
    SHORTLINK.replace_all(input, "[$type/$dest](https://$type.tvl.fyi/$dest)")
}

fn format_callout_paragraph(callout: Callout) -> NodeValue {
    let class = match callout {
        Callout::Todo => "cheddar-todo",
        Callout::Warning => "cheddar-warning",
        Callout::Question => "cheddar-question",
        Callout::Tip => "cheddar-tip",
    };

    let mut block = NodeHtmlBlock::default();
    block.literal = format!("<p class=\"cheddar-callout {}\">", class).into_bytes();
    NodeValue::HtmlBlock(block)
}

pub fn format_markdown<R: BufRead, W: Write>(reader: &mut R, writer: &mut W) {
    let document = {
        let mut buffer = String::new();
        reader
            .read_to_string(&mut buffer)
            .expect("reading should work");
        buffer
    };

    let arena = Arena::new();
    let root = parse_document(&arena, &linkify_shortlinks(&document), &MD_OPTS);

    // This node must exist with a lifetime greater than that of the parsed AST
    // in case that callouts are encountered (otherwise insertion into the tree
    // is not possible).
    let mut p_close_value = NodeHtmlBlock::default();
    p_close_value.literal = b"</p>".to_vec();

    let p_close_node = Ast::new(NodeValue::HtmlBlock(p_close_value));
    let p_close = Node::new(RefCell::new(p_close_node));

    // Special features of Cheddar are implemented by traversing the
    // arena and reacting on nodes that we might want to modify.
    iter_nodes(root, &|node| {
        let mut ast = node.data.borrow_mut();
        let new = match &ast.value {
            // Syntax highlighting is implemented by replacing the
            // code block node with literal HTML.
            NodeValue::CodeBlock(code) => Some(highlight_code_block(code)),

            NodeValue::Paragraph => {
                if let Some(callout) = has_callout(node) {
                    node.insert_after(&p_close);
                    Some(format_callout_paragraph(callout))
                } else {
                    None
                }
            }
            _ => None,
        };

        if let Some(new_value) = new {
            ast.value = new_value
        }
    });

    format_html(root, &MD_OPTS, writer).expect("Markdown rendering failed");
}

fn find_syntax_for_file(filename: &str) -> &'static SyntaxReference {
    (*FILENAME_OVERRIDES)
        .get(filename)
        .and_then(|name| SYNTAXES.find_syntax_by_name(name))
        .or_else(|| {
            Path::new(filename)
                .extension()
                .and_then(OsStr::to_str)
                .and_then(|s| SYNTAXES.find_syntax_by_extension(s))
        })
        .unwrap_or_else(|| SYNTAXES.find_syntax_plain_text())
}

pub fn format_code<R: BufRead, W: Write>(
    theme: &Theme,
    reader: &mut R,
    writer: &mut W,
    filename: &str,
) {
    let mut linebuf = String::new();

    // Get the first line, we might need it for syntax identification.
    let mut read_result = reader.read_line(&mut linebuf);
    let syntax = find_syntax_for_file(filename);

    let mut hl = HighlightLines::new(syntax, theme);
    let (mut outbuf, bg) = start_highlighted_html_snippet(theme);

    // Rather than using the `lines` iterator, read each line manually
    // and maintain buffer state.
    //
    // This is done because the syntax highlighter requires trailing
    // newlines to be efficient, and those are stripped in the lines
    // iterator.
    while should_continue(&read_result) {
        let regions = hl.highlight(&linebuf, &SYNTAXES);

        append_highlighted_html_for_styled_line(
            &regions[..],
            IncludeBackground::IfDifferent(bg),
            &mut outbuf,
        );

        // immediately output the current state to avoid keeping
        // things in memory
        write!(writer, "{}", outbuf).expect("write should not fail");

        // merry go round again
        linebuf.clear();
        outbuf.clear();
        read_result = reader.read_line(&mut linebuf);
    }

    writeln!(writer, "</pre>").expect("write should not fail");
}