about summary refs log tree commit diff
path: root/t/helper/test-xml-encode.c
diff options
context:
space:
mode:
authorVincent Ambo <Vincent Ambo>2020-01-11T23·36+0000
committerVincent Ambo <Vincent Ambo>2020-01-11T23·36+0000
commit1b593e1ea4d2af0f6444d9a7788d5d99abd6fde5 (patch)
treee3accb9beed5c4c1b5a05c99db71ab2841f0ed04 /t/helper/test-xml-encode.c
Squashed 'third_party/git/' content from commit cb71568594
git-subtree-dir: third_party/git
git-subtree-split: cb715685942260375e1eb8153b0768a376e4ece7
Diffstat (limited to 't/helper/test-xml-encode.c')
-rw-r--r--t/helper/test-xml-encode.c80
1 files changed, 80 insertions, 0 deletions
diff --git a/t/helper/test-xml-encode.c b/t/helper/test-xml-encode.c
new file mode 100644
index 000000000000..a648bbd961c2
--- /dev/null
+++ b/t/helper/test-xml-encode.c
@@ -0,0 +1,80 @@
+#include "test-tool.h"
+
+/*
+ * XML numeric character reference for U+FFFD (REPLACEMENT CHARACTER),
+ * written to <stdout> in place of input bytes that are not passed through.
+ */
+static const char *utf8_replace_character = "&#xfffd;";
+
+/*
+ * Encodes (possibly incorrect) UTF-8 on <stdin> to <stdout>, to be embedded
+ * in an XML file.
+ *
+ * Single-byte (ASCII) input: the characters & ' " < > are written as named
+ * entities, tab/LF/CR as numeric character references, any other byte below
+ * 0x20 as the replacement character, and everything else verbatim.
+ *
+ * Multi-byte sequences (2 to 4 bytes) are collected in a small buffer and
+ * written out unchanged once all continuation bytes have arrived. A lead
+ * byte that is not followed by enough continuation bytes -- because a
+ * different kind of byte shows up, or because the input ends -- is replaced
+ * by the replacement character, as is any byte that cannot start a sequence.
+ *
+ * Only the lead/continuation bit patterns are checked; overlong forms and
+ * out-of-range code points are not detected.
+ *
+ * argc and argv are unused. Returns 0 at end of input; calls die_errno() if
+ * reading <stdin> fails.
+ */
+int cmd__xml_encode(int argc, const char **argv)
+{
+	unsigned char buf[1024], tmp[4], *tmp2 = NULL;
+	/* cur/len start out such that the first iteration triggers a read. */
+	ssize_t cur = 0, len = 1, remaining = 0;
+	unsigned char ch;
+
+	for (;;) {
+		/* Refill the buffer once every byte in it has been consumed. */
+		if (++cur == len) {
+			len = xread(0, buf, sizeof(buf));
+			if (!len) {
+				/*
+				 * End of input in the middle of a multi-byte
+				 * sequence: the buffered bytes can never be
+				 * completed, so replace them instead of
+				 * dropping them silently.
+				 */
+				if (tmp2)
+					fputs(utf8_replace_character, stdout);
+				return 0;
+			}
+			if (len < 0)
+				die_errno("Could not read <stdin>");
+			cur = 0;
+		}
+		ch = buf[cur];
+
+		/* tmp2 is non-NULL while a multi-byte sequence is pending. */
+		if (tmp2) {
+			if ((ch & 0xc0) != 0x80) {
+				/*
+				 * Not a continuation byte: give up on the
+				 * pending sequence and step back so that this
+				 * byte is examined again as the start of a
+				 * new character.
+				 */
+				fputs(utf8_replace_character, stdout);
+				tmp2 = NULL;
+				cur--;
+				continue;
+			}
+			*tmp2 = ch;
+			tmp2++;
+			if (--remaining == 0) {
+				/* Sequence complete: pass it through as-is. */
+				fwrite(tmp, tmp2 - tmp, 1, stdout);
+				tmp2 = NULL;
+			}
+			continue;
+		}
+
+		if (!(ch & 0x80)) {
+			/* 0xxxxxxx */
+			if (ch == '&')
+				fputs("&amp;", stdout);
+			else if (ch == '\'')
+				fputs("&apos;", stdout);
+			else if (ch == '"')
+				fputs("&quot;", stdout);
+			else if (ch == '<')
+				fputs("&lt;", stdout);
+			else if (ch == '>')
+				fputs("&gt;", stdout);
+			else if (ch >= 0x20)
+				fputc(ch, stdout);
+			else if (ch == 0x09 || ch == 0x0a || ch == 0x0d)
+				fprintf(stdout, "&#x%02x;", ch);
+			else
+				fputs(utf8_replace_character, stdout);
+		} else if ((ch & 0xe0) == 0xc0) {
+			/* 110XXXXx 10xxxxxx */
+			tmp[0] = ch;
+			remaining = 1;
+			tmp2 = tmp + 1;
+		} else if ((ch & 0xf0) == 0xe0) {
+			/* 1110XXXX 10Xxxxxx 10xxxxxx */
+			tmp[0] = ch;
+			remaining = 2;
+			tmp2 = tmp + 1;
+		} else if ((ch & 0xf8) == 0xf0) {
+			/* 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx */
+			tmp[0] = ch;
+			remaining = 3;
+			tmp2 = tmp + 1;
+		} else
+			/* Stray continuation byte or invalid lead byte. */
+			fputs(utf8_replace_character, stdout);
+	}
+
+	return 0;
+}