diff options
Diffstat (limited to 'configs/shared/emacs/.emacs.d/elpa/pcre2el-20161120.2103/pcre2el.el')
-rw-r--r-- | configs/shared/emacs/.emacs.d/elpa/pcre2el-20161120.2103/pcre2el.el | 3157 |
1 files changed, 3157 insertions, 0 deletions
diff --git a/configs/shared/emacs/.emacs.d/elpa/pcre2el-20161120.2103/pcre2el.el b/configs/shared/emacs/.emacs.d/elpa/pcre2el-20161120.2103/pcre2el.el new file mode 100644 index 000000000000..bea8effe825d --- /dev/null +++ b/configs/shared/emacs/.emacs.d/elpa/pcre2el-20161120.2103/pcre2el.el @@ -0,0 +1,3157 @@ +;;; pcre2el.el --- regexp syntax converter -*- lexical-binding: t -*- + +;; Copyright (C) 2012-2015 Jon Oddie <jonxfield@gmail.com> + +;; Author: joddie <jonxfield at gmail.com> +;; Hacked additionally by: opensource at hardakers dot net +;; Created: 14 Feb 2012 +;; Updated: 13 December 2015 +;; Version: 1.8 +;; Package-Version: 20161120.2103 +;; Url: https://github.com/joddie/pcre2el +;; Package-Requires: ((emacs "24") (cl-lib "0.3")) + +;; This file is NOT part of GNU Emacs. + +;; This program is free software: you can redistribute it and/or +;; modify it under the terms of the GNU General Public License as +;; published by the Free Software Foundation, either version 3 of the +;; License, or (at your option) any later version. +;; +;; This program is distributed in the hope that it will be useful, but +;; WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;; General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with this program. If not, see `http://www.gnu.org/licenses/'. + +;; This file incorporates work covered by the following copyright and +;; permission notice: +;; +;; Copyright (c) 1993-2002 Richard Kelsey and Jonathan Rees +;; Copyright (c) 1994-2002 by Olin Shivers and Brian D. Carlstrom. +;; Copyright (c) 1999-2002 by Martin Gasbichler. +;; Copyright (c) 2001-2002 by Michael Sperber. +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions +;; are met: 1. Redistributions of source code must retain the above +;; copyright notice, this list of conditions and the following +;; disclaimer. 2. Redistributions in binary form must reproduce the +;; above copyright notice, this list of conditions and the following +;; disclaimer in the documentation and/or other materials provided +;; with the distribution. 3. The name of the authors may not be used +;; to endorse or promote products derived from this software without +;; specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE AUTHORS "AS IS" AND ANY EXPRESS OR +;; IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY +;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +;;; Commentary: + +;; 1 Overview +;; ========== + +;; `pcre2el' or `rxt' (RegeXp Translator or RegeXp Tools) is a utility +;; for working with regular expressions in Emacs, based on a +;; recursive-descent parser for regexp syntax. In addition to converting +;; (a subset of) PCRE syntax into its Emacs equivalent, it can do the +;; following: + +;; - convert Emacs syntax to PCRE +;; - convert either syntax to `rx', an S-expression based regexp syntax +;; - untangle complex regexps by showing the parse tree in `rx' form and +;; highlighting the corresponding chunks of code +;; - show the complete list of strings (productions) matching a regexp, +;; provided the list is finite +;; - provide live font-locking of regexp syntax (so far only for Elisp +;; buffers -- other modes on the TODO list) + + +;; 2 Usage +;; ======= + +;; Enable `rxt-mode' or its global equivalent `rxt-global-mode' to get +;; the default key-bindings. There are three sets of commands: commands +;; that take a PCRE regexp, commands which take an Emacs regexp, and +;; commands that try to do the right thing based on the current +;; mode. Currently, this means Emacs syntax in `emacs-lisp-mode' and +;; `lisp-interaction-mode', and PCRE syntax everywhere else. + +;; The default key bindings all begin with `C-c /' and have a mnemonic +;; structure: `C-c / <source> <target>', or just `C-c / <target>' for the +;; "do what I mean" commands. The complete list of key bindings is given +;; here and explained in more detail below: + +;; - "Do-what-I-mean" commands: +;; `C-c / /': `rxt-explain' +;; `C-c / c': `rxt-convert-syntax' +;; `C-c / x': `rxt-convert-to-rx' +;; `C-c / '': `rxt-convert-to-strings' + +;; - Commands that work on a PCRE regexp: +;; `C-c / p e': `rxt-pcre-to-elisp' +;; `C-c / %': `pcre-query-replace-regexp' +;; `C-c / p x': `rxt-pcre-to-rx' +;; `C-c / p '': `rxt-pcre-to-strings' +;; `C-c / p /': `rxt-explain-pcre' + +;; - Commands that work on an Emacs regexp: +;; `C-c / e /': `rxt-explain-elisp' +;; `C-c / e p': `rxt-elisp-to-pcre' +;; `C-c / e x': `rxt-elisp-to-rx' +;; `C-c / e '': `rxt-elisp-to-strings' +;; `C-c / e t': `rxt-toggle-elisp-rx' +;; `C-c / t': `rxt-toggle-elisp-rx' + + +;; 2.1 Interactive input and output +;; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +;; When used interactively, the conversion commands can read a regexp +;; either from the current buffer or from the minibuffer. The output is +;; displayed in the minibuffer and copied to the kill-ring. + +;; - When called with a prefix argument (`C-u'), they read a regular +;; expression from the minibuffer literally, without further processing +;; -- meaning there's no need to double the backslashes if it's an +;; Emacs regexp. This is the same way commands like +;; `query-replace-regexp' read input. + +;; - When the region is active, they use they the region contents, again +;; literally (without any translation of string syntax). + +;; - With neither a prefix arg nor an active region, the behavior depends +;; on whether the command expects an Emacs regexp or a PCRE one. + +;; Commands that take an Emacs regexp behave like `C-x C-e': they +;; evaluate the sexp before point (which could be simply a string +;; literal) and use its value. This is designed for use in Elisp +;; buffers. As a special case, if point is *inside* a string, it's +;; first moved to the string end, so in practice they should work as +;; long as point is somewhere within the regexp literal. + +;; Commands that take a PCRE regexp try to read a Perl-style delimited +;; regex literal *after* point in the current buffer, including its +;; flags. For example, putting point before the `m' in the following +;; example and doing `C-c / p e' (`rxt-pcre-to-elisp') displays +;; `\(?:bar\|foo\)', correctly stripping out the whitespace and +;; comment: + +;; ,---- +;; | $x =~ m/ foo | (?# comment) bar /x +;; `---- + +;; The PCRE reader currently only works with `/ ... /' delimiters. It +;; will ignore any preceding `m', `s', or `qr' operator, as well as the +;; replacement part of an `s' construction. + +;; Readers for other PCRE-using languages are on the TODO list. + +;; The translation functions display their result in the minibuffer and +;; copy it to the kill ring. When translating something into Elisp +;; syntax, you might need to use the result either literally (e.g. for +;; interactive input to a command like `query-replace-regexp'), or as a +;; string to paste into Lisp code. To allow both uses, +;; `rxt-pcre-to-elisp' copies both versions successively to the +;; kill-ring. The literal regexp without string quoting is the top +;; element of the kill-ring, while the Lisp string is the +;; second-from-top. You can paste the literal regexp somewhere by doing +;; `C-y', or the Lisp string by `C-y M-y'. + + +;; 2.2 Syntax conversion commands +;; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +;; `rxt-convert-syntax' (`C-c / c') converts between Emacs and PCRE +;; syntax, depending on the major mode in effect when called. +;; Alternatively, you can specify the conversion direction explicitly by +;; using either `rxt-pcre-to-elisp' (`C-c / p e') or `rxt-elisp-to-pcre' +;; (`C-c / e p'). + +;; Similarly, `rxt-convert-to-rx' (`C-c / x') converts either kind of +;; syntax to `rx' form, while `rxt-convert-pcre-to-rx' (`C-c / p x') and +;; `rxt-convert-elisp-to-rx' (`C-c / e x') convert to `rx' from a +;; specified source type. + +;; In Elisp buffers, you can use `rxt-toggle-elisp-rx' (`C-c / t' or `C-c +;; / e t') to switch the regexp at point back and forth between string +;; and `rx' syntax. Point should either be within an `rx' or +;; `rx-to-string' form or a string literal for this to work. + + +;; 2.3 PCRE mode (experimental) +;; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +;; If you want to use emulated PCRE regexp syntax in all Emacs commands, +;; try `pcre-mode', which uses Emacs's advice system to make all commands +;; that read regexps using the minibuffer use emulated PCRE syntax. It +;; should also work with Isearch. + +;; This feature is still fairly experimental. It may fail to work or do +;; the wrong thing with certain commands. Please report bugs. + +;; `pcre-query-replace-regexp' was originally defined to do query-replace +;; using emulated PCRE regexps, and is now made somewhat obsolete by +;; `pcre-mode'. It is bound to `C-c / %' by default, by analogy with +;; `M-%'. Put the following in your `.emacs' if you want to use +;; PCRE-style query replacement everywhere: + +;; ,---- +;; | (global-set-key [(meta %)] 'pcre-query-replace-regexp) +;; `---- + +;; 2.5 Explain regexps +;; ~~~~~~~~~~~~~~~~~~~ + +;; When syntax-highlighting isn't enough to untangle some gnarly regexp +;; you find in the wild, try the 'explain' commands: `rxt-explain' (`C-c +;; / /'), `rxt-explain-pcre' (`C-c / p') and `rxt-explain-elisp' (`C-c / +;; e'). These display the original regexp along with its pretty-printed +;; `rx' equivalent in a new buffer. Moving point around either in the +;; original regexp or the `rx' translation highlights corresponding +;; pieces of syntax, which can aid in seeing things like the scope of +;; quantifiers. + +;; I call them "explain" commands because the `rx' form is close to a +;; plain syntax tree, and this plus the wordiness of the operators +;; usually helps to clarify what is going on. People who dislike Lisp +;; syntax might disagree with this assessment. + + +;; 2.6 Generate all matching strings (productions) +;; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +;; Occasionally you come across a regexp which is designed to match a +;; finite set of strings, e.g. a set of keywords, and it would be useful +;; to recover the original set. (In Emacs you can generate such regexps +;; using `regexp-opt'). The commands `rxt-convert-to-strings' (`C-c / +;; ′'), `rxt-pcre-to-strings' (`C-c / p ′') or `rxt-elisp-to-strings' +;; (`C-c / e ′') accomplish this by generating all the matching strings +;; ("productions") of a regexp. (The productions are copied to the kill +;; ring as a Lisp list). + +;; An example in Lisp code: + +;; ,---- +;; | (regexp-opt '("cat" "caterpillar" "catatonic")) +;; | ;; => "\\(?:cat\\(?:atonic\\|erpillar\\)?\\)" +;; | (rxt-elisp-to-strings "\\(?:cat\\(?:atonic\\|erpillar\\)?\\)") +;; | ;; => '("cat" "caterpillar" "catatonic") +;; `---- + +;; For obvious reasons, these commands only work with regexps that don't +;; include any unbounded quantifiers like `+' or `*'. They also can't +;; enumerate all the characters that match a named character class like +;; `[[:alnum:]]'. In either case they will give a (hopefully meaningful) +;; error message. Due to the nature of permutations, it's still possible +;; for a finite regexp to generate a huge number of productions, which +;; will eat memory and slow down your Emacs. Be ready with `C-g' if +;; necessary. + + +;; 2.7 RE-Builder support +;; ~~~~~~~~~~~~~~~~~~~~~~ + +;; The Emacs RE-Builder is a useful visual tool which allows using +;; several different built-in syntaxes via `reb-change-syntax' (`C-c +;; TAB'). It supports Elisp read and literal syntax and `rx', but it can +;; only convert from the symbolic forms to Elisp, not the other way. This +;; package hacks the RE-Builder to also work with emulated PCRE syntax, +;; and to convert transparently between Elisp, PCRE and rx syntaxes. PCRE +;; mode reads a delimited Perl-like literal of the form `/ ... /', and it +;; should correctly support using the `x' and `s' flags. + + +;; 2.8 Use from Lisp +;; ~~~~~~~~~~~~~~~~~ + +;; Example of using the conversion functions: +;; ,---- +;; | (rxt-pcre-to-elisp "(abc|def)\\w+\\d+") +;; | ;; => "\\(\\(?:abc\\|def\\)\\)[_[:alnum:]]+[[:digit:]]+" +;; `---- + +;; All the conversion functions take a single string argument, the regexp +;; to translate: + +;; - `rxt-pcre-to-elisp' +;; - `rxt-pcre-to-rx' +;; - `rxt-pcre-to-strings' +;; - `rxt-elisp-to-pcre' +;; - `rxt-elisp-to-rx' +;; - `rxt-elisp-to-strings' + + +;; 3 Bugs and Limitations +;; ====================== + +;; 3.1 Limitations on PCRE syntax +;; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +;; PCRE has a complicated syntax and semantics, only some of which can be +;; translated into Elisp. The following subset of PCRE should be +;; correctly parsed and converted: + +;; - parenthesis grouping `( .. )', including shy matches `(?: ... )' +;; - backreferences (various syntaxes), but only up to 9 per expression +;; - alternation `|' +;; - greedy and non-greedy quantifiers `*', `*?', `+', `+?', `?' and `??' +;; (all of which are the same in Elisp as in PCRE) +;; - numerical quantifiers `{M,N}' +;; - beginning/end of string `\A', `\Z' +;; - string quoting `\Q .. \E' +;; - word boundaries `\b', `\B' (these are the same in Elisp) +;; - single character escapes `\a', `\c', `\e', `\f', `\n', `\r', `\t', +;; `\x', and `\octal digits' (but see below about non-ASCII characters) +;; - character classes `[...]' including Posix escapes +;; - character classes `\d', `\D', `\h', `\H', `\s', `\S', `\v', `\V' +;; both within character class brackets and outside +;; - word and non-word characters `\w' and `\W' (Emacs has the same +;; syntax, but its meaning is different) +;; - `s' (single line) and `x' (extended syntax) flags, in regexp +;; literals, or set within the expression via `(?xs-xs)' or `(?xs-xs: +;; .... )' syntax +;; - comments `(?# ... )' + +;; Most of the more esoteric PCRE features can't really be supported by +;; simple translation to Elisp regexps. These include the different +;; lookaround assertions, conditionals, and the "backtracking control +;; verbs" `(* ...)' . OTOH, there are a few other syntaxes which are +;; currently unsupported and possibly could be: + +;; - `\L', `\U', `\l', `\u' case modifiers +;; - `\g{...}' backreferences + + +;; 3.2 Other limitations +;; ~~~~~~~~~~~~~~~~~~~~~ + +;; - The order of alternatives and characters in char classes sometimes +;; gets shifted around, which is annoying. +;; - Although the string parser tries to interpret PCRE's octal and +;; hexadecimal escapes correctly, there are problems with matching +;; 8-bit characters that I don't use enough to properly understand, +;; e.g.: +;; ,---- +;; | (string-match-p (rxt-pcre-to-elisp "\\377") "\377") => nil +;; `---- +;; A fix for this would be welcome. + +;; - Most of PCRE's rules for how `^', `\A', `$' and `\Z' interact with +;; newlines are not implemented, since they seem less relevant to +;; Emacs's buffer-oriented rather than line-oriented model. However, +;; the different meanings of the `.' metacharacter *are* implemented +;; (it matches newlines with the `/s' flag, but not otherwise). + +;; - Not currently namespace clean (both `rxt-' and a couple of `pcre-' +;; functions). + + +;; 3.3 TODO: +;; ~~~~~~~~~ + +;; - Python-specific extensions to PCRE? +;; - Language-specific stuff to enable regexp font-locking and explaining +;; in different modes. Each language would need two functions, which +;; could be kept in an alist: + +;; 1. A function to read PCRE regexps, taking the string syntax into +;; account. E.g., Python has single-quoted, double-quoted and raw +;; strings, each with different quoting rules. PHP has the kind of +;; belt-and-suspenders solution you would expect: regexps are in +;; strings, /and/ you have to include the `/ ... /' delimiters! +;; Duh. + +;; 2. A function to copy faces back from the parsed string to the +;; original buffer text. This has to recognize any escape sequences +;; so they can be treated as a single character. + + +;; 4 Internal details +;; ================== + +;; `rxt' defines an internal syntax tree representation of regular +;; expressions, parsers for Elisp and PCRE syntax, and 'unparsers' +;; to convert the internal representation to PCRE or `rx' syntax. +;; Converting from the internal representation to Emacs syntax is +;; done by converting to `rx' form and passing it to `rx-to-string'. +;; See `rxt-parse-re', `rxt-adt->pcre', and `rxt-adt->rx' for +;; details. + +;; This code is partially based on Olin Shivers' reference SRE +;; implementation in scsh, although it is simplified in some respects and +;; extended in others. See `scsh/re.scm', `scsh/spencer.scm' and +;; `scsh/posixstr.scm' in the `scsh' source tree for details. In +;; particular, `pcre2el' steals the idea of an abstract data type for +;; regular expressions and the general structure of the string regexp +;; parser and unparser. The data types for character sets are extended in +;; order to support symbolic translation between character set +;; expressions without assuming a small (Latin1) character set. The +;; string parser is also extended to parse a bigger variety of +;; constructions, including POSIX character classes and various Emacs and +;; Perl regexp assertions. Otherwise, only the bare minimum of scsh's +;; abstract data type is implemented. + + +;; 5 Soapbox +;; ========= + +;; Emacs regexps have their annoyances, but it is worth getting used to +;; them. The Emacs assertions for word boundaries, symbol boundaries, and +;; syntax classes depending on the syntax of the mode in effect are +;; especially useful. (PCRE has `\b' for word-boundary, but AFAIK it +;; doesn't have separate assertions for beginning-of-word and +;; end-of-word). Other things that might be done with huge regexps in +;; other languages can be expressed more understandably in Elisp using +;; combinations of `save-excursion' with the various searches (regexp, +;; literal, skip-syntax-forward, sexp-movement functions, etc.). + +;; There's not much point in using `rxt-pcre-to-elisp' to use PCRE +;; notation in a Lisp program you're going to maintain, since you still +;; have to double all the backslashes. Better to just use the converted +;; result (or better yet, the `rx' form). + + +;; 6 History and acknowledgments +;; ============================= + +;; This was originally created out of an answer to a stackoverflow +;; question: +;; [http://stackoverflow.com/questions/9118183/elisp-mechanism-for-converting-pcre-regexps-to-emacs-regexps] + +;; Thanks to: + +;; - Wes Hardaker (hardaker) for the initial inspiration and subsequent +;; hacking +;; - priyadarshan for requesting RX support +;; - Daniel Colascione (dcolascione) for a patch to support Emacs's +;; explicitly-numbered match groups +;; - Aaron Meurer (asmeurer) for requesting Isearch support +;; - Philippe Vaucher (silex) for a patch to support `ibuffer-do-replace-regexp' +;; in PCRE mode + +;;; Code: + +(require 'cl-lib) +(require 'rx) +(require 're-builder) +(require 'advice) +(require 'ring) +(require 'pcase) + +;;; Customization group +(defgroup rxt nil + "Regex syntax converter and utilities." + :version 1.2 + :group 'tools + :group 'lisp + :link '(emacs-commentary-link :tag "commentary" "pcre2el.el") + :link '(emacs-library-link :tag "lisp file" "pcre2el.el") + :link '(url-link :tag "web page" "https://github.com/joddie/pcre2el")) + +(defface rxt-highlight-face + '((((min-colors 16581375) (background light)) :background "#eee8d5") + (((min-colors 16581375) (background dark)) :background "#222222")) + "Face for highlighting corresponding regex syntax in `rxt-explain' buffers." + :group 'rxt) + +(defcustom rxt-verbose-rx-translation nil + "Non-nil if `rxt-pcre-to-rx' and `rxt-elisp-to-rx' should use verbose `rx' primitives. + +Verbose primitives are things like `line-start' instead of `bol', +etc." + :group 'rxt + :type 'boolean) + +(defcustom rxt-explain-verbosely t + "Non-nil if `rxt-explain-elisp' and `rxt-explain-pcre' should use verbose `rx' primitives. + +This overrides the value of `rxt-verbose-rx-translation' for +these commands only." + :group 'rxt + :type 'boolean) + + +;;;; Macros and functions for writing interactive input and output + +;; Macros for handling return values. If called interactively, +;; display the value in the echo area and copy it to the kill ring, +;; otherwise just return the value. PCREs are copied as unquoted +;; strings for yanking into Perl, JS, etc. `rx' forms and other sexps +;; are copied as `read'-able literals for yanking into Elisp buffers. +;; Emacs regexps are copied twice: once as an unquoted value for +;; interactive use, and once as a readable string literal for yanking +;; into Elisp buffers. +(defmacro rxt-return-pcre (expr) + (let ((value (make-symbol "value"))) + `(let ((,value ,expr)) + (when (called-interactively-p 'any) + (rxt--kill-pcre ,value)) + ,value))) + +(defmacro rxt-return-sexp (expr) + (let ((value (make-symbol "value"))) + `(let ((,value ,expr)) + (when (called-interactively-p 'any) + (rxt--kill-sexp ,value)) + ,value))) + +(defmacro rxt-return-emacs-regexp (expr) + (let ((value (make-symbol "value"))) + `(let ((,value ,expr)) + (when (called-interactively-p 'any) + (rxt--kill-emacs-regexp ,value)) + ,value))) + +(defun rxt--kill-sexp (value) + (let ((lisp-literal (prin1-to-string value))) + (message "%s" lisp-literal) + (kill-new lisp-literal))) + +(defun rxt--kill-pcre (value) + (message "%s" value) + (kill-new value)) + +(defun rxt--kill-emacs-regexp (value) + (let ((lisp-literal (prin1-to-string value))) + (message "%s" value) + (kill-new lisp-literal) + (kill-new value))) + +;; Read an Elisp regexp interactively. +;; +;; Three possibilities: +;; +;; 1) With a prefix arg, reads literally from the minibuffer, w/o +;; using string syntax -- just like query-replace-regexp, etc. +;; +;; 2) If the region is active, use the text of the region literally +;; (again w/o string syntax) +;; +;; 3) Otherwise, eval the sexp before point (which might be a string +;; literal or an expression) and use its value. Falls back to method +;; (1) if this fails to produce a string value. +;; +(cl-defun rxt-interactive/elisp (&optional (prompt "Emacs regexp: ")) + (list + (cond (current-prefix-arg + (read-string prompt)) + + ((use-region-p) + (buffer-substring-no-properties (region-beginning) (region-end))) + + (t + (condition-case nil + (save-excursion + (while (nth 3 (syntax-ppss)) (forward-char)) + (let ((re (eval (preceding-sexp)))) + (if (stringp re) re + (read-string prompt)))) + (error + (read-string prompt))))))) + +;; Read a PCRE regexp interactively. +;; +;; Three possibilities: As above, except that without prefix arg or +;; active region, tries to read a delimited regexp literal like /.../, +;; m/.../, or qr/.../ following point in the current buffer. Falls +;; back to reading from minibuffer if that fails. +;; +;; Returns the regexp, with flags as text properties. +;; +;; TODO: Different delimiters +(cl-defun rxt-interactive/pcre (&optional (prompt "PCRE regexp: ")) + (list + (cond (current-prefix-arg + (rxt--read-pcre prompt)) + + ((use-region-p) + (buffer-substring-no-properties (region-beginning) (region-end))) + + (t + (condition-case nil + (rxt-read-delimited-pcre) + (error ; Fall back to reading from minibuffer + (rxt--read-pcre prompt))))) + nil)) + +(define-minor-mode rxt--read-pcre-mode + "Minor-mode with key-bindings for toggling PCRE flags. + +You should not normally call this directly. It will be enabled +in minibuffers for `read-regexp' and in the `re-builder' buffer +when `pcre-mode' is active. These bindings will also be added to +`isearch-mode-map' in `pcre-mode'." + :initial nil + :lighter nil + :keymap + `((,(kbd "C-c s") . ,#'rxt--toggle-s-mode) + (,(kbd "C-c x") . ,#'rxt--toggle-x-mode) + (,(kbd "C-c i") . ,#'rxt--toggle-i-mode))) + +(defun rxt--read-pcre (prompt) + "Read a PCRE regexp for translation, together with option flags. + +The `s', `x', and `i' flags can be toggled using the following +commands: \\<rxt--read-pcre-mode-map> + +\\[rxt--toggle-s-mode] : toggle `s' (single-line) mode +\\[rxt--toggle-x-mode] : toggle `x' (extended) mode +\\[rxt--toggle-i-mode] : toggle `i' (case-insensitive) mode + +In single-line mode, `.' will also match newlines. +In extended mode, whitespace is ignored. + +Case-insensitive mode emulates matching without case, +independently of Emacs's builtin `case-fold-search' setting. +Note that this does not apply to backreferences." + (minibuffer-with-setup-hook #'rxt--read-pcre-mode + (read-from-minibuffer prompt))) + +(defun rxt--toggle-s-mode () + "Toggle emulated PCRE single-line (s) flag." + (interactive) + (rxt--toggle-flag ?s)) + +(defun rxt--toggle-x-mode () + "Toggle emulated PCRE extended (x) flag." + (interactive) + (rxt--toggle-flag ?x)) + +(defun rxt--toggle-i-mode () + "Toggle emulated PCRE case-insensitive (i) flag." + (interactive) + (rxt--toggle-flag ?i)) + +(defun rxt--toggle-flag (char) + "Toggle CHAR, a PCRE flag." + (cond + ((derived-mode-p 'reb-mode) ; RE-Builder + (rxt--toggle-flag-re-builder char)) + ((minibufferp) + (rxt--toggle-flag-minibuffer char)) + (isearch-mode + (rxt--toggle-flag-isearch char)) + (t + (error "Not in minibuffer, RE-Builder or isearch mode.")))) + +(defun rxt--toggle-flag-re-builder (char) + (save-excursion + (goto-char (point-max)) + (search-backward "/") + (forward-char) + (when (looking-at (rx (* (any ?i ?s ?x)))) + (let ((inhibit-modification-hooks t)) + (replace-match (rxt--xor-flags (match-string 0) char) t t)))) + (reb-do-update)) + +(defun rxt--toggle-flag-minibuffer (char) + (setf (buffer-substring (minibuffer-prompt-end) (point-max)) + (rxt--toggle-flag-string (minibuffer-contents) char)) + (when + (and (= (point) (minibuffer-prompt-end)) + (looking-at (rx "(?" (group (+ (any ?i ?s ?x))) ")"))) + (forward-sexp))) + +(defun rxt--toggle-flag-isearch (char) + (when isearch-regexp + (setq isearch-string + (rxt--toggle-flag-string isearch-string char)) + (setq isearch-message + (mapconcat #'isearch-text-char-description isearch-string "")) + (isearch-search-and-update))) + +(defun rxt--toggle-flag-string (string char) + (if (string-match (rx string-start "(?" (group (+ (any ?i ?s ?x))) ")") + string) + (let ((flags (rxt--xor-flags (match-string 1 string) char))) + (if (string= flags "") + (replace-match "" t t string) + (replace-match flags t t string 1))) + (format "(?%c)%s" char string))) + +(defun rxt--xor-flags (flags char) + (concat + (sort + (cl-set-exclusive-or (string-to-list flags) (list char)) + #'<))) + + +;;;; Minor mode for using emulated PCRE syntax + +(defvar pcre-old-isearch-search-fun-function nil + "Original value of `isearch-search-fun-function' before entering `pcre-mode.' + +This function is wrapped by `pcre-isearch-search-fun-function' +and restored on exit from `pcre-mode'.") +(make-variable-buffer-local 'pcre-old-isearch-search-fun-function) + +(defvar pcre-old-isearch-key-bindings nil + "Alist of key-bindings to restore in `isearch-mode-map' on exiting `pcre-mode'.") + +;;;###autoload +(define-minor-mode pcre-mode + "Use emulated PCRE syntax for regexps wherever possible. + +Advises the `interactive' specs of `read-regexp' and the +following other functions so that they read PCRE syntax and +translate to its Emacs equivalent: + +- `align-regexp' +- `find-tag-regexp' +- `sort-regexp-fields' +- `isearch-message-prefix' +- `ibuffer-do-replace-regexp' + +Also alters the behavior of `isearch-mode' when searching by regexp." + nil " PCRE" + nil + :global t + + (if pcre-mode + ;; Enabling + (progn + ;; Enable advice + (ad-enable-regexp "pcre-mode") + ;; Set up isearch hooks + (add-hook 'isearch-mode-hook #'pcre-isearch-mode-hook) + (add-hook 'isearch-mode-end-hook #'pcre-isearch-mode-end-hook) + ;; Add the keybindings of `rxt--read-pcre-mode-map' to + ;; `isearch-mode-map' (so that they do not cause an exit from + ;; `isearch-mode'), and save any existing bindings for those + ;; keys to restore on exit from `pcre-mode'. + (setq pcre-old-isearch-key-bindings + (cl-loop for key being the key-seqs of rxt--read-pcre-mode-map + for def = (lookup-key isearch-mode-map key) + collect (cons (copy-sequence key) + (if (numberp def) nil def)))) + (cl-loop for key being the key-seqs of rxt--read-pcre-mode-map + using (key-bindings def) + do (define-key isearch-mode-map key def))) + + ;; Disable advice + (ad-disable-regexp "pcre-mode") + ;; Remove from isearch hooks + (remove-hook 'isearch-mode-hook #'pcre-isearch-mode-hook) + (remove-hook 'isearch-mode-end-hook #'pcre-isearch-mode-end-hook) + ;; Restore key-bindings + (cl-loop for (key . def) in pcre-old-isearch-key-bindings + do (define-key isearch-mode-map key def))) + + ;; "Activating" advice re-computes the function definitions, which + ;; is necessary whether enabling or disabling + (ad-activate-regexp "pcre-mode")) + +;;; Cache of PCRE -> Elisp translations +(defvar pcre-mode-cache-size 100 + "Number of PCRE-to-Emacs translations to keep in the `pcre-mode' cache.") + +(defvar pcre-mode-cache (make-hash-table :test 'equal) + "Cache of PCRE-to-Emacs translations used in `pcre-mode'. + +Keys are PCRE regexps, values are their Emacs equivalents.") + +(defvar pcre-mode-reverse-cache (make-hash-table :test 'equal) + "Cache of original PCREs translated to Emacs syntax in `pcre-mode'. + +Keys are translated Emacs regexps, values are their original PCRE +form. This is used to display the original PCRE regexp in place +of its translated form.") + +(defvar pcre-cache-ring (make-ring pcre-mode-cache-size) + "Ring of PCRE-to-Emacs translations used in `pcre-mode'. + +When the ring fills up, the oldest element is removed and the +corresponding entries are deleted from the hash tables +`pcre-mode-cache' and `pcre-mode-reverse-cache'.") + +(defun pcre-to-elisp/cached (pcre) + "Translate PCRE to Emacs syntax, caching both forms." + (or (gethash pcre pcre-mode-cache) + (let ((elisp (rxt-pcre-to-elisp pcre))) + (pcre-set-cache pcre elisp) + elisp))) + +(defun pcre-set-cache (pcre-regexp emacs-regexp) + "Add a PCRE-to-Emacs translation to the `pcre-mode' cache." + (when (and (not (zerop (length pcre-regexp))) + (not (zerop (length emacs-regexp))) + (not (gethash pcre-regexp pcre-mode-cache))) + (if (= (ring-length pcre-cache-ring) (ring-size pcre-cache-ring)) + (let* ((old-item (ring-remove pcre-cache-ring)) + (old-pcre (car old-item)) + (old-emacs (cdr old-item))) + (remhash old-pcre pcre-mode-cache) + (remhash old-emacs pcre-mode-reverse-cache)) + (puthash pcre-regexp emacs-regexp pcre-mode-cache) + (puthash emacs-regexp pcre-regexp pcre-mode-reverse-cache) + (ring-insert pcre-cache-ring (cons pcre-regexp emacs-regexp))))) + +;;; Isearch advice +(defun pcre-isearch-mode-hook () + (when (not (eq isearch-search-fun-function #'isearch-search-fun-default)) + (message "Warning: pcre-mode overriding existing isearch function `%s'" + isearch-search-fun-function)) + ;; Prevent an infinite loop, if a previous isearch in pcre-mode + ;; exited without restoring the original search function for some + ;; reason + (unless (eq isearch-search-fun-function #'pcre-isearch-search-fun-function) + (setq pcre-old-isearch-search-fun-function isearch-search-fun-function)) + (set (make-local-variable 'isearch-search-fun-function) + #'pcre-isearch-search-fun-function)) + +(defun pcre-isearch-mode-end-hook () + (setq isearch-search-fun-function pcre-old-isearch-search-fun-function)) + +(defun pcre-isearch-search-fun-function () + "Enable isearching using emulated PCRE syntax. + +This is set as the value of `isearch-search-fun-function' when +`pcre-mode' is enabled. Returns a function which searches using +emulated PCRE regexps when `isearch-regexp' is true." + (lambda (string bound noerror) + (let ((real-search-function + (funcall (or pcre-old-isearch-search-fun-function 'isearch-search-fun-default)))) + (if (not isearch-regexp) + (funcall real-search-function string bound noerror) + ;; Raise an error if the regexp ends in an incomplete escape + ;; sequence (= odd number of backslashes). + ;; TODO: Perhaps this should really be handled in rxt-pcre-to-elisp? + (if (isearch-backslash string) (rxt-error "Trailing backslash")) + (funcall real-search-function + (pcre-to-elisp/cached string) bound noerror))))) + +(defadvice isearch-message-prefix (after pcre-mode disable) + "Add \"PCRE\" to the Isearch message when searching by regexp in `pcre-mode'." + (when (and isearch-regexp + ;; Prevent an inaccurate message if our callback was + ;; removed somehow + (eq isearch-search-fun-function #'pcre-isearch-search-fun-function)) + (let ((message ad-return-value)) + ;; Some hackery to give replacement the same fontification as + ;; the original + (when + (let ((case-fold-search t)) (string-match "regexp" message)) + (let* ((match (match-string 0 message)) + (properties (text-properties-at 0 match)) + (replacement (apply #'propertize "PCRE regexp" properties)) + (new-message (replace-match replacement t t message))) + (setq ad-return-value new-message)))))) + +(defadvice isearch-fallback + (before pcre-mode (want-backslash &optional allow-invalid to-barrier) disable) + "Hack to fall back correctly in `pcre-mode'. " + ;; A dirty hack to the internals of isearch. Falling back to a + ;; previous match position is necessary when the (Emacs) regexp ends + ;; in "*", "?", "\{" or "\|": this is handled in + ;; `isearch-process-search-char' by calling `isearch-fallback' with + ;; `t' for the value of the first parameter, `want-backslash', in + ;; the last two cases. With PCRE regexps, falling back should take + ;; place on "*", "?", "{" or "|", with no backslashes required. + ;; This advice handles the last two cases by unconditionally setting + ;; `want-backslash' to nil. + (ad-set-arg 0 nil)) + +(defadvice isearch-edit-string + (around pcre-mode disable) + "Add PCRE mode-toggling keys to Isearch minibuffer in regexp mode." + (if isearch-regexp + (minibuffer-with-setup-hook + #'rxt--read-pcre-mode + ad-do-it) + ad-do-it)) + +;;; evil-mode advice +(defadvice evil-search-function + (around pcre-mode (forward regexp-p wrap) disable) + (if (and regexp-p (not isearch-mode)) + (let ((real-search-function ad-do-it)) + (setq ad-return-value + (pcre-decorate-search-function real-search-function))) + ad-do-it)) + +(eval-after-load 'evil + '(when pcre-mode + (ad-enable-advice 'evil-search-function 'around 'pcre-mode) + (ad-activate 'evil-search-function))) + +(defun pcre-decorate-search-function (real-search-function) + (lambda (string &optional bound noerror count) + (funcall real-search-function + (pcre-to-elisp/cached string) + bound noerror count))) + +;;; Other hooks and defadvices + +;;;###autoload +(defun pcre-query-replace-regexp () + "Perform `query-replace-regexp' using PCRE syntax. + +Consider using `pcre-mode' instead of this function." + (interactive) + (let ((old-pcre-mode pcre-mode)) + (unwind-protect + (progn + (pcre-mode +1) + (call-interactively #'query-replace-regexp)) + (pcre-mode (if old-pcre-mode 1 0))))) + + +(defadvice add-to-history + (before pcre-mode (history-var newelt &optional maxelt keep-all) disable) + "Add the original PCRE to query-replace history in `pcre-mode'." + (when (eq history-var query-replace-from-history-variable) + (let ((original (gethash newelt pcre-mode-reverse-cache))) + (when original + (ad-set-arg 1 original))))) + +(defadvice query-replace-descr + (before pcre-mode (from) disable) + "Use the original PCRE in Isearch prompts in `pcre-mode'." + (let ((original (gethash from pcre-mode-reverse-cache))) + (when original + (ad-set-arg 0 original)))) + +;;; The `interactive' specs of the following functions are lifted +;;; wholesale from the original built-ins, which see. +(defadvice read-regexp + (around pcre-mode first (prompt &optional defaults history) disable) + "Read regexp using PCRE syntax and convert to Elisp equivalent." + (ad-set-arg 0 (concat "[PCRE] " prompt)) + (minibuffer-with-setup-hook + #'rxt--read-pcre-mode + ad-do-it) + (setq ad-return-value + (pcre-to-elisp/cached ad-return-value))) + +(defadvice align-regexp + (before pcre-mode first (beg end regexp &optional group spacing repeat) disable) + "Read regexp using PCRE syntax and convert to Elisp equivalent." + (interactive + (append + (list (region-beginning) (region-end)) + (if current-prefix-arg + (list (rxt-pcre-to-elisp + (read-string "Complex align using PCRE regexp: " + "(\\s*)")) + (string-to-number + (read-string + "Parenthesis group to modify (justify if negative): " "1")) + (string-to-number + (read-string "Amount of spacing (or column if negative): " + (number-to-string align-default-spacing))) + (y-or-n-p "Repeat throughout line? ")) + (list (concat "\\(\\s-*\\)" + (rxt-pcre-to-elisp + (read-string "Align PCRE regexp: "))) + 1 align-default-spacing nil))))) + +(defadvice ibuffer-do-replace-regexp + (before pcre-mode first (from-str to-str) disable) + "Read regexp using PCRE syntax and convert to Elisp equivalent." + (interactive + (let* ((from-str (read-from-minibuffer "[PCRE] Replace regexp: ")) + (to-str (read-from-minibuffer (concat "[PCRE] Replace " from-str " with: ")))) + (list (rxt-pcre-to-elisp from-str) to-str)))) + +(defadvice find-tag-regexp + (before pcre-mode first (regexp &optional next-p other-window) disable) + "Read regexp using PCRE syntax and convert to Elisp equivalent. +Perform `find-tag-regexp' using emulated PCRE regexp syntax." + (interactive + (let ((args (find-tag-interactive "[PCRE] Find tag regexp: " t))) + (list (rxt-pcre-to-elisp (nth 0 args)) + (nth 1 args) (nth 2 args))))) + +(defadvice sort-regexp-fields + (before pcre-mode first (reverse record-regexp key-regexp beg end) disable) + "Read regexp using PCRE syntax and convert to Elisp equivalent." + (interactive "P\nsPCRE regexp specifying records to sort: \n\ +sPCRE regexp specifying key within record: \nr") + (ad-set-arg 1 (rxt-pcre-to-elisp (ad-get-arg 1))) + (ad-set-arg 2 (rxt-pcre-to-elisp (ad-get-arg 2)))) + + + +;;; Commands that take Emacs-style regexps as input + +;;;###autoload +(defun rxt-elisp-to-pcre (regexp) + "Translate REGEXP, a regexp in Emacs Lisp syntax, to Perl-compatible syntax. + +Interactively, reads the regexp in one of three ways. With a +prefix arg, reads from minibuffer without string escaping, like +`query-replace-regexp'. Without a prefix arg, uses the text of +the region if it is active. Otherwise, uses the result of +evaluating the sexp before point (which might be a string regexp +literal or an expression that produces a string). + +Displays the translated PCRE regexp in the echo area and copies +it to the kill ring. + +Emacs regexp features such as syntax classes which cannot be +translated to PCRE will cause an error." + (interactive (rxt-interactive/elisp)) + (rxt-return-pcre (rxt-adt->pcre (rxt-parse-elisp regexp)))) + +;;;###autoload +(defun rxt-elisp-to-rx (regexp) + "Translate REGEXP, a regexp in Emacs Lisp syntax, to `rx' syntax. + +See `rxt-elisp-to-pcre' for a description of the interactive +behavior and `rx' for documentation of the S-expression based +regexp syntax." + (interactive (rxt-interactive/elisp)) + (rxt-return-sexp (rxt-adt->rx (rxt-parse-elisp regexp)))) + +;;;###autoload +(defun rxt-elisp-to-strings (regexp) + "Return a list of all strings matched by REGEXP, an Emacs Lisp regexp. + +See `rxt-elisp-to-pcre' for a description of the interactive behavior. + +This is useful primarily for getting back the original list of +strings from a regexp generated by `regexp-opt', but it will work +with any regexp without unbounded quantifiers (*, +, {2, } and so +on). + +Throws an error if REGEXP contains any infinite quantifiers." + (interactive (rxt-interactive/elisp)) + (rxt-return-sexp (rxt-adt->strings (rxt-parse-elisp regexp)))) + +;;;###autoload +(defun rxt-toggle-elisp-rx () + "Toggle the regexp near point between Elisp string and rx syntax." + (interactive) + ;; First, position point before the regex form near point (either + ;; a string literal or a list beginning `rx' or `rx-to-string'). + (let* ((context (syntax-ppss)) + (string-start (nth 8 context))) + (cond (string-start (goto-char string-start)) + ((looking-back "\"") (backward-sexp)) + ((looking-at "\"") nil) + (t + ;; Search backwards, leaving point in place on error + (goto-char + (save-excursion + (skip-syntax-forward "-") + (while (not (looking-at + (rx "(" (or "rx" "rx-to-string") symbol-end))) + (backward-up-list)) + (point)))))) + + ;; Read and replace the regex following point + (let* ((regex (read (current-buffer))) + (print-escape-newlines t)) + (save-excursion + (if (listp regex) + ;; Replace rx form with string value + (prin1 (eval regex) (current-buffer)) + ;; Pretty-print rx form + (save-restriction + (let* ((start (point)) + (rx-syntax (rxt-elisp-to-rx regex)) + (rx-form + (pcase rx-syntax + (`(seq . ,rest) `(rx . ,rest)) + (form `(rx ,form))))) + (rxt-print rx-form) + (narrow-to-region start (point))) + (pp-buffer) + ;; remove the extra newline that pp-buffer inserts + (goto-char (point-max)) + (delete-region + (point) + (save-excursion (skip-chars-backward " \t\n") (point)))))) + (kill-sexp -1) + (indent-pp-sexp))) + + + +;;; Commands that translate PCRE to other formats + +;;;###autoload +(defun rxt-pcre-to-elisp (pcre &optional flags) + "Translate PCRE, a regexp in Perl-compatible syntax, to Emacs Lisp. + +Interactively, uses the contents of the region if it is active, +otherwise reads from the minibuffer. Prints the Emacs translation +in the echo area and copies it to the kill ring. + +PCRE regexp features that cannot be translated into Emacs syntax +will cause an error. See the commentary section of pcre2el.el for +more details." + (interactive (rxt-interactive/pcre)) + (rxt-return-emacs-regexp + (rx-to-string + (rxt-pcre-to-rx (rxt--add-flags pcre flags)) + t))) + +;;;###autoload +(defalias 'pcre-to-elisp 'rxt-pcre-to-elisp) + +;;;###autoload +(defun rxt-pcre-to-rx (pcre &optional flags) + "Translate PCRE, a regexp in Perl-compatible syntax, to `rx' syntax. + +See `rxt-pcre-to-elisp' for a description of the interactive behavior." + (interactive (rxt-interactive/pcre)) + (rxt-return-sexp (rxt-adt->rx (rxt-parse-pcre (rxt--add-flags pcre flags))))) + +;;;###autoload +(defun rxt-pcre-to-strings (pcre &optional flags) + "Return a list of all strings matched by PCRE, a Perl-compatible regexp. + +See `rxt-elisp-to-pcre' for a description of the interactive +behavior and `rxt-elisp-to-strings' for why this might be useful. + +Throws an error if PCRE contains any infinite quantifiers." + (interactive (rxt-interactive/pcre)) + (rxt-return-sexp (rxt-adt->strings (rxt-parse-pcre (rxt--add-flags pcre flags))))) + +(defun rxt--add-flags (pcre flags) + "Prepend FLAGS to PCRE." + (if (not (zerop (length flags))) + (concat "(?" flags ")" pcre) + pcre)) + + +;;; Regexp explaining functions to display pretty-printed rx syntax + +;; When the `rxt-explain' flag is non-nil, `rxt-adt->rx' records +;; location information for each element of the generated `rx' form, +;; allowing highlighting corresponding pieces of syntax at point. +(defvar rxt-explain nil) + +(defvar rxt-highlight-overlays nil + "List of active location-highlighting overlays in rxt-help-mode buffer.") + +;;;###autoload +(defun rxt-explain-elisp (regexp) + "Insert the pretty-printed `rx' syntax for REGEXP in a new buffer. + +REGEXP is a regular expression in Emacs Lisp syntax. See +`rxt-elisp-to-pcre' for a description of how REGEXP is read +interactively." + (interactive (rxt-interactive/elisp)) + (let ((rxt-explain t) + (rxt-verbose-rx-translation rxt-explain-verbosely)) + (rxt-pp-rx regexp (rxt-elisp-to-rx regexp)))) + +;;;###autoload +(defun rxt-explain-pcre (regexp &optional flags) + "Insert the pretty-printed `rx' syntax for REGEXP in a new buffer. + +REGEXP is a regular expression in PCRE syntax. See +`rxt-pcre-to-elisp' for a description of how REGEXP is read +interactively." + (interactive (rxt-interactive/pcre)) + (let ((rxt-explain t) + (rxt-verbose-rx-translation rxt-explain-verbosely)) + (rxt-pp-rx regexp (rxt-pcre-to-rx regexp flags)))) + +;;;###autoload +(defun rxt-quote-pcre (text) + "Return a PCRE regexp which matches TEXT literally. + +Any PCRE metacharacters in TEXT will be quoted with a backslash." + (rxt-adt->pcre (rxt-string text))) + + +;;;; Commands that depend on the major mode in effect + +;; Macro: interactively call one of two functions depending on the +;; major mode +(defmacro rxt-mode-dispatch (elisp-function pcre-function) + `(if (memq major-mode '(emacs-lisp-mode lisp-interaction-mode)) + (call-interactively #',elisp-function) + (call-interactively #',pcre-function))) + +;;;###autoload +(defun rxt-explain () + "Pop up a buffer with pretty-printed `rx' syntax for the regex at point. + +Chooses regex syntax to read based on current major mode, calling +`rxt-explain-elisp' if buffer is in `emacs-lisp-mode' or +`lisp-interaction-mode', or `rxt-explain-pcre' otherwise." + (interactive) + (rxt-mode-dispatch rxt-explain-elisp rxt-explain-pcre)) + +;;;###autoload +(defun rxt-convert-syntax () + "Convert regex at point to other kind of syntax, depending on major mode. + +For buffers in `emacs-lisp-mode' or `lisp-interaction-mode', +calls `rxt-elisp-to-pcre' to convert to PCRE syntax. Otherwise, +calls `rxt-pcre-to-elisp' to convert to Emacs syntax. + +The converted syntax is displayed in the echo area and copied to +the kill ring; see the two functions named above for details." + (interactive) + (rxt-mode-dispatch rxt-elisp-to-pcre rxt-pcre-to-elisp)) + +;;;###autoload +(defun rxt-convert-to-rx () + "Convert regex at point to RX syntax. Chooses Emacs or PCRE syntax by major mode." + (interactive) + (rxt-mode-dispatch rxt-elisp-to-rx rxt-pcre-to-rx)) + +;;;###autoload +(defun rxt-convert-to-strings () + "Convert regex at point to RX syntax. Chooses Emacs or PCRE syntax by major mode." + (interactive) + (rxt-mode-dispatch rxt-elisp-to-strings rxt-pcre-to-strings)) + + + +;;; Minor mode and keybindings + +(defvar rxt-mode-map + (let ((map (make-sparse-keymap))) + ;; Generic + (define-key map (kbd "C-c / /") 'rxt-explain) + (define-key map (kbd "C-c / c") 'rxt-convert-syntax) + (define-key map (kbd "C-c / x") 'rxt-convert-to-rx) + (define-key map (kbd "C-c / '") 'rxt-convert-to-strings) + + ;; From PCRE + (define-key map (kbd "C-c / p /") 'rxt-explain-pcre) + (define-key map (kbd "C-c / p e") 'rxt-pcre-to-elisp) + (define-key map (kbd "C-c / p x") 'rxt-pcre-to-rx) + (define-key map (kbd "C-c / p '") 'rxt-pcre-to-strings) + + ;; From Elisp + (define-key map (kbd "C-c / e /") 'rxt-explain-elisp) + (define-key map (kbd "C-c / e p") 'rxt-elisp-to-pcre) + (define-key map (kbd "C-c / e x") 'rxt-elisp-to-rx) + (define-key map (kbd "C-c / e '") 'rxt-elisp-to-strings) + (define-key map (kbd "C-c / e t") 'rxt-toggle-elisp-rx) + (define-key map (kbd "C-c / t") 'rxt-toggle-elisp-rx) + + ;; Search + (define-key map (kbd "C-c / %") 'pcre-query-replace-regexp) + + map) + "Keymap for `rxt-mode'.") + +;;;###autoload +(define-minor-mode rxt-mode + "Regex translation utilities." nil nil) + +;;;###autoload +(defun turn-on-rxt-mode () + "Turn on `rxt-mode' in the current buffer." + (interactive) + (rxt-mode 1)) + +;;;###autoload +(define-globalized-minor-mode rxt-global-mode rxt-mode + turn-on-rxt-mode) + + +;;;; Syntax explanations + +;; Major mode for displaying pretty-printed S-exp syntax +(define-derived-mode rxt-help-mode emacs-lisp-mode "Regexp Explain" + (setq buffer-read-only t) + (add-hook 'post-command-hook 'rxt-highlight-text nil t) + (rxt-highlight-text)) + +;; Hack: stop paredit-mode interfering with `rxt-print' +(eval-when-compile (declare-function paredit-mode "paredit.el")) +(add-hook 'rxt-help-mode-hook + (lambda () + (if (and (boundp 'paredit-mode) + paredit-mode) + (paredit-mode 0)))) + +(define-key rxt-help-mode-map "q" 'quit-window) +(define-key rxt-help-mode-map "z" 'kill-this-buffer) +(define-key rxt-help-mode-map "n" 'next-line) +(define-key rxt-help-mode-map "p" 'previous-line) +(define-key rxt-help-mode-map "f" 'forward-list) +(define-key rxt-help-mode-map "b" 'backward-list) +(define-key rxt-help-mode-map "u" 'backward-up-list) +(define-key rxt-help-mode-map "d" 'down-list) + +(defvar rxt--print-with-overlays nil) +(defvar rxt--print-depth 0) + +(defconst rxt--print-char-alist + '((?\a . "\\a") + (?\b . "\\b") + (?\t . "\\t") + (?\n . "\\n") + (?\v . "\\v") + (?\f . "\\f") + (?\r . "\\r") + (?\e . "\\e") + (?\s . "\\s") + (?\\ . "\\\\") + (?\d . "\\d")) + "Alist of characters to print using an escape sequence in Elisp source. +See (info \"(elisp) Basic Char Syntax\").") + +(defconst rxt--whitespace-display-regexp + (rx-to-string `(any ,@(mapcar #'car rxt--print-char-alist)))) + +(defconst rxt--print-special-chars + '(?\( ?\) ?\\ ?\| ?\; ?\' ?\` ?\" ?\# ?\. ?\,) + "Characters which require a preceding backslash in Elisp source. +See (info \"(elisp) Basic Char Syntax\").") + +(defun rxt-pp-rx (regexp rx) + "Display string regexp REGEXP with its `rx' form RX in an `rxt-help-mode' buffer." + (with-current-buffer (get-buffer-create "* Regexp Explain *") + (let ((print-escape-newlines t) + (inhibit-read-only t)) + (erase-buffer) + (rxt-help-mode) + (insert (rxt--propertize-whitespace regexp)) + (newline 2) + (save-excursion + (let ((sexp-begin (point)) + (rxt--print-with-overlays t)) + (rxt-print rx) + (narrow-to-region sexp-begin (point)) + (pp-buffer) + (widen))) + (rxt-highlight-text)) + (pop-to-buffer (current-buffer)))) + +(cl-defun rxt-print (rx) + "Insert RX, an `rx' form, into the current buffer, optionally adding overlays. + +Similar to `print' or `prin1', but ensures that `rx' forms are +printed readably, using character or integer syntax depending on +context. + +If `rxt--print-with-overlays' is non-nil, also creates overlays linking +elements of RX to their corresponding locations in the source +string (see `rxt-explain-elisp', `rxt-explain-pcre' and +`rxt--make-help-overlays')." + (let ((start (point))) + (cl-typecase rx + (cons + (pcase rx + (`(,(and (or `repeat `**) head) + ,(and (pred integerp) from) + ,(and (pred integerp) to) + . ,rest) + (insert (format "(%s %d %d" head from to)) + (rxt--print-list-tail rest)) + (`(,(and (or `repeat `= `>=) head) + ,(and (pred integerp) n) + . ,rest) + (insert (format "(%s %d" head n)) + (rxt--print-list-tail rest)) + (_ + (rxt--print-list-tail rx t)))) + (symbol + (cl-case rx + ;; `print' escapes the ? characters in the rx operators *? + ;; and +?, but this looks bad and is not strictly necessary: + ;; (eq (read "*?") (read "*\\?")) => t + ;; (eq (read "+?") (read "+\\?")) => t + ((*? +?) (insert (symbol-name rx))) + (t (prin1 rx (current-buffer))))) + (string + (insert (rxt--propertize-whitespace (prin1-to-string rx)))) + (character + (cond + ((eq ? rx) + (insert "?")) + ((memq rx rxt--print-special-chars) + (insert "?\\" rx)) + ((assq rx rxt--print-char-alist) + (insert "?" (assoc-default rx rxt--print-char-alist))) + (t + (insert "?" (char-to-string rx))))) + (t + (prin1 rx (current-buffer)))) + (when rxt--print-with-overlays + (rxt--make-help-overlays rx start (point))))) + +(defun rxt--print-list-tail (tail &optional open-paren) + (let ((rxt--print-depth (1+ rxt--print-depth))) + (let ((done nil)) + (while (not done) + (cl-typecase tail + (null + (insert ")") + (setq done t)) + (cons + (if open-paren + (progn + (insert "(") + (setq open-paren nil)) + (insert " ")) + (rxt-print (car tail)) + (setq tail (cdr tail))) + (t + (insert " . ") + (rxt-print tail) + (insert ")") + (setq done t))))))) + +(defun rxt--make-help-overlays (rx start end) + (let ((location (rxt-location rx))) + (when (and location + (rxt-location-start location) + (rxt-location-end location)) + (let* ((sexp-begin (copy-marker start t)) + (sexp-end (copy-marker end)) + (sexp-bounds (list sexp-begin sexp-end)) + + (source-begin (1+ (rxt-location-start location))) + (source-end (1+ (rxt-location-end location))) + (source-bounds (list source-begin source-end)) + + (bounds (list source-bounds sexp-bounds)) + + (sexp-ol (make-overlay sexp-begin sexp-end (current-buffer) t nil)) + (source-ol (make-overlay source-begin source-end (current-buffer) t nil))) + (dolist (ol (list sexp-ol source-ol)) + (overlay-put ol 'priority rxt--print-depth) + (overlay-put ol 'rxt-bounds bounds)))))) + +(defun rxt--propertize-whitespace (string) + (let ((string (copy-sequence string)) + (start 0)) + (while (string-match rxt--whitespace-display-regexp string start) + (put-text-property (match-beginning 0) (match-end 0) + 'display + (assoc-default (string-to-char (match-string 0 string)) + rxt--print-char-alist) + string) + (setq start (match-end 0))) + string)) + +(defun rxt-highlight-text () + "Highlight the regex syntax at point and its corresponding RX/string form." + (let ((all-bounds (get-char-property (point) 'rxt-bounds))) + (mapc #'delete-overlay rxt-highlight-overlays) + (setq rxt-highlight-overlays nil) + (dolist (bounds all-bounds) + (cl-destructuring-bind (begin end) bounds + (let ((overlay (make-overlay begin end))) + (push overlay rxt-highlight-overlays) + (overlay-put overlay 'face 'rxt-highlight-face)))))) + + +;;;; Error handling + +(if (fboundp 'define-error) + (define-error 'rxt-invalid-regexp "Invalid regexp" 'invalid-regexp) + (put 'rxt-invalid-regexp + 'error-conditions + '(rxt-invalid-regexp invalid-regexp error)) + (put 'rxt-invalid-regexp 'error-message "Invalid regexp")) + +(defun rxt-error (&rest format-args) + (signal 'rxt-invalid-regexp (list (apply #'format format-args)))) + + +;;;; Regexp syntax tree data type + +;; Base class from which other elements of the syntax-tree inherit +(cl-defstruct rxt-syntax-tree) + +;; Struct representing the original source location +(cl-defstruct rxt-location + source ; Either a string or a buffer + start end ; Offsets, 0- or 1-indexed as appropriate + ) + +(defun rxt-location-text (location) + (if (not (rxt-location-p location)) + nil + (let ((start (rxt-location-start location)) + (end (rxt-location-end location)) + (source (rxt-location-source location))) + (cond + ((buffer-live-p source) + (with-current-buffer source + (buffer-substring-no-properties start end))) + ((stringp source) + (substring source start end)) + (t nil))))) + +;; Hash table mapping from syntax-tree elements to source locations. +(defvar rxt-location-map (make-hash-table :weakness 'key)) + +(defun rxt-location (object) + (gethash object rxt-location-map)) + +(gv-define-setter rxt-location (value object) + `(puthash ,object ,value rxt-location-map)) + +(defun rxt-source-text (object) + (rxt-location-text (rxt-location object))) + +(defun rxt-to-string (tree) + "Return a readable representation of TREE, a regex syntax-tree object." + (or (rxt-source-text tree) + (let ((print-level 1)) + (prin1-to-string tree)))) +(defalias 'rxt-syntax-tree-readable 'rxt-to-string) + +;; FIXME +(defvar rxt-pcre-case-fold nil) + +;; Literal string +(cl-defstruct + (rxt-string + (:constructor rxt-string (chars &optional case-fold)) + (:include rxt-syntax-tree)) + chars + (case-fold rxt-pcre-case-fold)) + +(defun rxt-empty-string () + (rxt-string "")) + +(defun rxt-trivial-p (re) + (and (rxt-string-p re) + (equal (rxt-string-chars re) ""))) + +;;; Other primitives +(cl-defstruct (rxt-primitive + (:constructor rxt-primitive (pcre rx)) + (:include rxt-syntax-tree)) + pcre rx) + +(defun rxt-bos () (rxt-primitive "\\A" 'bos)) +(defun rxt-eos () (rxt-primitive "\\Z" 'eos)) + +(defun rxt-bol () (rxt-primitive "^" 'bol)) +(defun rxt-eol () (rxt-primitive "$" 'eol)) + +;; FIXME +(defun rxt-anything () (rxt-primitive "." 'anything)) +(defun rxt-nonl () (rxt-primitive "." 'nonl)) + +(defun rxt-word-boundary () (rxt-primitive "\\b" 'word-boundary)) +(defun rxt-not-word-boundary () (rxt-primitive "\\B" 'not-word-boundary)) + +(defun rxt-wordchar () (rxt-primitive "\\w" 'wordchar)) +(defun rxt-not-wordchar () (rxt-primitive "\\W" 'not-wordchar)) + +(defun rxt-symbol-start () (rxt-primitive nil 'symbol-start)) +(defun rxt-symbol-end () (rxt-primitive nil 'symbol-end)) + +(defun rxt-bow () (rxt-primitive nil 'bow)) +(defun rxt-eow () (rxt-primitive nil 'eow)) + +;;; Sequence +(cl-defstruct + (rxt-seq + (:constructor make-rxt-seq (elts)) + (:include rxt-syntax-tree)) + elts) + +;; Slightly smart sequence constructor: +;; - Flattens nested sequences +;; - Drops trivial "" elements +;; - Empty sequence => "" +;; - Singleton sequence is reduced to its one element. +(defun rxt-seq (&rest res) ; Flatten nested seqs & drop ""'s. + (let ((res (rxt-seq-flatten res))) + (if (consp res) + (if (consp (cdr res)) + (make-rxt-seq res) ; General case + (car res)) ; Singleton sequence + (rxt-empty-string)))) ; Empty seq -- "" + +(defun rxt-seq-flatten (res) + (if (consp res) + (let ((re (car res)) + (tail (rxt-seq-flatten (cdr res)))) + (cond ((rxt-seq-p re) ; Flatten nested seqs + (append (rxt-seq-flatten (rxt-seq-elts re)) tail)) + ((rxt-trivial-p re) tail) ; Drop trivial elts + ((and (rxt-string-p re) ; Flatten strings + (consp tail) + (rxt-string-p (car tail))) + (cons + (rxt-string-concat re (car tail)) + (cdr tail))) + (t (cons re tail)))) + '())) + +(defun rxt-string-concat (str1 str2) + (if (not (eq (rxt-string-case-fold str1) + (rxt-string-case-fold str2))) + (make-rxt-seq (list str1 str2)) + (let ((result + (rxt-string (concat (rxt-string-chars str1) + (rxt-string-chars str2)) + (rxt-string-case-fold str1))) + (first (rxt-location str1)) + (last (rxt-location str2))) + (when (and first last) + (setf (rxt-location result) + (make-rxt-location :source (rxt-location-source first) + :start (rxt-location-start first) + :end (rxt-location-end last)))) + result))) + +;;; Choice (alternation/union) +(cl-defstruct + (rxt-choice + (:constructor make-rxt-choice (elts)) + (:include rxt-syntax-tree)) + elts) + +;;; The empty choice represents a regexp that never matches in any context +(defvar rxt-empty (make-rxt-choice nil)) +(defun rxt-empty-p (re) + (or + (and (rxt-choice-p re) + (null (rxt-choice-elts re))) + (rxt-empty-char-set-p re))) + +(defun rxt-choice (&rest alternatives) + "Construct the alternation (union) of several regexps. + +ALTERNATIVES should be a list of `rxt-syntax-tree' objects. +The return value is an `rxt-choice' object representing a regexp +which matches any one of ALTERNATIVES, but simplified in the +following ways: + +- If ALTERNATIVES contains only one element, it is returned unchanged. + +- All existing `rxt-choice' elements in ALTERNATIVES are replaced + by a flat list of their subexpressions: symbolically, + a|(b|(c|d)) is replaced by a|b|c|d + +- All character sets and single-character strings in ALTERNATIVES + are combined together into one or two character sets, + respecting case-folding behaviour." + (cl-destructuring-bind (other-elements char-set case-fold-char-set) + (rxt--simplify-alternatives alternatives) + (let ((simplified-alternatives + (append (if (not (rxt-empty-p char-set)) + (list char-set) + '()) + (if (not (rxt-empty-p case-fold-char-set)) + (list case-fold-char-set) + '()) + other-elements))) + (pcase simplified-alternatives + (`() + rxt-empty) + (`(,element) + element) + (_ + (make-rxt-choice simplified-alternatives)))))) + +(defun rxt--simplify-alternatives (alternatives) + "Simplify a set of regexp alternatives. + +ALTERNATIVES should be a list of `rxt-syntax-tree' objects to be combined +into an `rxt-choice' structure. The result is a three-element +list (OTHER-ELEMENTS CHAR-SET CASE-FOLDED-CHAR-SET): + +- CHAR-SET is an `rxt-char-set-union' containing the union of all + case-sensitive character sets and single-character strings in + RES. + +- CASE-FOLDED-CHAR-SET is similar but combines all the + case-insensitive character sets and single-character strings. + +- OTHER-ELEMENTS is a list of all other elements, with all + `rxt-choice' structures replaced by a flat list of their + component subexpressions." + (if (null alternatives) + (list '() + (make-rxt-char-set-union :case-fold nil) + (make-rxt-char-set-union :case-fold t)) + (let* ((re (car alternatives))) + (cl-destructuring-bind (tail char-set case-fold-char-set) + (rxt--simplify-alternatives (cdr alternatives)) + (cond ((rxt-choice-p re) ; Flatten nested choices + (list + (append (rxt-choice-elts re) tail) + char-set + case-fold-char-set)) + + ((rxt-empty-p re) ; Drop empty re's. + (list tail char-set case-fold-char-set)) + + ((rxt-char-set-union-p re) ; Fold char sets together + (if (rxt-char-set-union-case-fold re) + (list tail + char-set + (rxt-char-set-union case-fold-char-set re)) + (list tail + (rxt-char-set-union char-set re) + case-fold-char-set))) + + ((and (rxt-string-p re) ; Same for 1-char strings + (= 1 (length (rxt-string-chars re)))) + (if (rxt-string-case-fold re) + (list tail + char-set + (rxt-char-set-union case-fold-char-set re)) + (list tail + (rxt-char-set-union char-set re) + case-fold-char-set))) + + (t ; Otherwise. + (list (cons re tail) char-set case-fold-char-set))))))) + +;;; Repetition +(cl-defstruct (rxt-repeat + (:include rxt-syntax-tree)) + from to body greedy) + +(cl-defun rxt-repeat (from to body &optional (greedy t)) + (if (equal to 0) + (rxt-empty-string) + (make-rxt-repeat :from from :to to + :body body :greedy greedy))) + +;;; Submatch +(cl-defstruct + (rxt-submatch + (:constructor rxt-submatch (body)) + (:include rxt-syntax-tree)) + body) + +;;; Numbered submatch (Emacs only) +(cl-defstruct + (rxt-submatch-numbered + (:constructor rxt-submatch-numbered (n body)) + (:include rxt-syntax-tree)) + n + body) + +;;; Backreference +(cl-defstruct + (rxt-backref + (:constructor rxt-backref (n)) + (:include rxt-syntax-tree)) + n) + +;;; Syntax classes (Emacs only) +(cl-defstruct (rxt-syntax-class + (:include rxt-syntax-tree)) + symbol) + +(defun rxt-syntax-class (symbol) + (if (assoc symbol rx-syntax) + (make-rxt-syntax-class :symbol symbol) + (rxt-error "Invalid syntax class symbol `%s'" symbol))) + +;;; Character categories (Emacs only) +(cl-defstruct (rxt-char-category + (:include rxt-syntax-tree)) + symbol) + +(defun rxt-char-category (symbol) + (if (assoc symbol rx-categories) + (make-rxt-char-category :symbol symbol) + (rxt-error "Invalid character category symbol `%s'" symbol))) + + +;;; Char sets +;; <rxt-char-set> ::= <rxt-char-set-union> +;; | <rxt-char-set-negation> +;; | <rxt-char-set-intersection> + +(cl-defstruct (rxt-char-set (:include rxt-syntax-tree))) + +;; An rxt-char-set-union represents the union of any number of +;; characters, character ranges, and POSIX character classes: anything +;; that can be represented in string notation as a class [ ... ] +;; without the negation operator. +(cl-defstruct (rxt-char-set-union + (:include rxt-char-set)) + chars ; list of single characters + ranges ; list of ranges (from . to) + classes ; list of character classes + (case-fold rxt-pcre-case-fold)) + +;; Test for empty character set +(defun rxt-empty-char-set-p (cset) + (and (rxt-char-set-union-p cset) + (null (rxt-char-set-union-chars cset)) + (null (rxt-char-set-union-ranges cset)) + (null (rxt-char-set-union-classes cset)))) + +;; Simple union constructor +(defun rxt-char-set-union (&rest items) + "Construct an regexp character set representing the union of ITEMS. + +Each element of ITEMS may be either: a character; a +single-character string; a single-character `rxt-string' object; +a cons, (FROM . TO) representing a range of characters; a symbol, +representing a named character class; or an `rxt-char-set-union' +object. All `rxt-char-set-union' objects in ITEMS must have the +same `case-fold' property." + (let ((chars '()) + (ranges '()) + (classes '()) + (case-fold 'undetermined)) + (dolist (item items) + (cl-etypecase item + (character + (push item chars)) + + (string + (cl-assert (= 1 (length item))) + (push (string-to-char item) chars)) + + (rxt-string + (cl-assert (= 1 (length (rxt-string-chars item)))) + (push (string-to-char (rxt-string-chars item)) chars)) + + (cons ; range (from . to) + (cl-check-type (car item) character) + (cl-check-type (cdr item) character) + (push item ranges)) + + (symbol ; named character class + (push item classes)) + + (rxt-char-set-union + (if (eq case-fold 'undetermined) + (setq case-fold (rxt-char-set-union-case-fold item)) + (unless (eq case-fold (rxt-char-set-union-case-fold item)) + (error "Cannot construct union of char-sets with unlike case-fold setting: %S" item))) + (setq chars (nconc chars (rxt-char-set-union-chars item))) + (setq ranges (nconc ranges (rxt-char-set-union-ranges item))) + (setq classes (nconc classes (rxt-char-set-union-classes item)))))) + + (make-rxt-char-set-union :chars chars :ranges ranges :classes classes + :case-fold (if (eq case-fold 'undetermined) + rxt-pcre-case-fold + case-fold)))) + +(defun rxt--all-char-set-union-chars (char-set) + "Return a list of all characters in CHAR-SET." + (cl-assert (rxt-char-set-union-p char-set)) + (append + (rxt-char-set-union-chars char-set) + (cl-loop for (start . end) in (rxt-char-set-union-ranges char-set) + nconc (cl-loop for char from start to end collect char)))) + +(defun rxt--simplify-char-set (char-set &optional case-fold-p) + "Return a minimal char-set to match the same characters as CHAR-SET. + +With optional argument CASE-FOLD-P, return a char-set which +emulates case-folding behaviour by including both uppercase and +lowercase versions of all characters in CHAR-SET." + (cl-assert (rxt-char-set-union-p char-set)) + (let* ((classes (rxt-char-set-union-classes char-set)) + (all-chars + (if case-fold-p + (cl-loop for char in (rxt--all-char-set-union-chars char-set) + nconc (list (upcase char) (downcase char))) + (rxt--all-char-set-union-chars char-set))) + (all-ranges + (rxt--extract-ranges (rxt--remove-redundant-chars all-chars classes)))) + (let ((singletons nil) + (ranges nil)) + (cl-loop for (start . end) in all-ranges + do + (cond ((= start end) (push start singletons)) + ((= (1+ start) end) + (push start singletons) + (push end singletons)) + (t (push (cons start end) ranges)))) + (make-rxt-char-set-union :chars (nreverse singletons) + :ranges (nreverse ranges) + :classes classes + :case-fold (if case-fold-p + nil + (rxt-char-set-union-case-fold char-set)))))) + +(defun rxt--remove-redundant-chars (chars classes) + "Remove all characters which match a character class in CLASSES from CHARS." + (if (null classes) + chars + (string-to-list + (replace-regexp-in-string + (rx-to-string `(any ,@classes)) + "" + (apply #'string chars))))) + +(defun rxt--extract-ranges (chars) + "Return a list of all contiguous ranges in CHARS. + +CHARS should be a list of characters (integers). The return +value is a list of conses (START . END) representing ranges, such +that the union of all the ranges represents the same of +characters as CHARS. + +Example: + (rxt--extract-ranges (list ?a ?b ?c ?q ?x ?y ?z)) + => ((?a . ?c) (?q . ?q) (?x . ?z))" + (let ((array + (apply #'vector + (cl-remove-duplicates + (sort (copy-sequence chars) #'<))))) + (cl-labels + ((recur (start end) + (if (< end start) + nil + (let ((min (aref array start)) + (max (aref array end))) + (if (= (- max min) (- end start)) + (list (cons min max)) + (let* ((split-point (/ (+ start end) 2)) + (left (recur start split-point)) + (right (recur (1+ split-point) end))) + (merge left right)))))) + (merge (left right) + (cond ((null left) right) + ((null right) left) + (t + (let ((last-left (car (last left))) + (first-right (car right))) + (if (= (1+ (cdr last-left)) + (car first-right)) + (append (cl-subseq left 0 -1) + (list + (cons (car last-left) + (cdr first-right))) + (cl-subseq right 1)) + (append left right))))))) + (recur 0 (1- (length array)))))) + +;;; Set complement of character set, syntax class, or character +;;; category + +;; In general, all character sets that can be represented in string +;; notation as [^ ... ] (but see `rxt-char-set-intersection', below), plus +;; Emacs' \Sx and \Cx constructions. +(cl-defstruct (rxt-char-set-negation + (:include rxt-char-set)) + elt) + +(defun rxt-negate (char-set) + "Construct the logical complement (negation) of CHAR-SET. + +CHAR-SET may be any of the following types: `rxt-char-set-union', +`rxt-syntax-class', `rxt-char-category', or `rxt-char-set-negation'." + (cl-etypecase char-set + ((or rxt-char-set-union rxt-syntax-class rxt-char-category) + (make-rxt-char-set-negation :elt char-set)) + + (rxt-char-set-negation + (rxt-char-set-negation-elt char-set)))) + +;;; Intersections of char sets + +;; These are difficult to represent in general, but can be constructed +;; in Perl using double negation; for example: [^\Wabc] means the set +;; complement of [abc] with respect to the universe of "word +;; characters": (& (~ (~ word)) (~ ("abc"))) == (& word (~ ("abc"))) +;; == (- word ("abc")) + +(cl-defstruct (rxt-char-set-intersection + (:include rxt-char-set)) + elts) + +;; Intersection constructor +(defun rxt-char-set-intersection (&rest charsets) + (let ((elts '()) + (cmpl (make-rxt-char-set-union))) + (dolist (cset (rxt-int-flatten charsets)) + (cond + ((rxt-char-set-negation-p cset) + ;; Fold negated charsets together: ~A & ~B = ~(A|B) + (setq cmpl (rxt-char-set-union cmpl (rxt-char-set-negation-elt cset)))) + + ((rxt-char-set-union-p cset) + (push cset elts)) + + (t + (rxt-error "Can't take intersection of non-character-set %S" cset)))) + + (if (null elts) + (rxt-negate cmpl) + (unless (rxt-empty-char-set-p cmpl) + (push (rxt-negate cmpl) elts)) + (if (null (cdr elts)) + (car elts) ; singleton case + (make-rxt-char-set-intersection :elts elts))))) + +;; Constructor helper: flatten nested intersections +(defun rxt-int-flatten (csets) + (if (consp csets) + (let ((cset (car csets)) + (tail (rxt-int-flatten (cdr csets)))) + (if (rxt-char-set-intersection-p cset) + (append (rxt-int-flatten (rxt-char-set-intersection-elts cset)) tail) + (cons cset tail))) + '())) + + + +;;;; Macros for building the parser + +(defmacro rxt-token-case (&rest cases) + "Consume a token at point and evaluate corresponding forms. + +CASES is a list of `cond'-like clauses, (REGEXP BODY ...) where +the REGEXPs define possible tokens which may appear at point. The +CASES are considered in order. For each case, if the text at +point matches REGEXP, then point is moved to the end of the +matched token, the corresponding BODY is evaluated and their +value returned. The matched token is available within the BODY +forms as (match-string 0). + +There can be a default case where REGEXP is `t', which evaluates +the corresponding FORMS but does not move point. + +Returns `nil' if none of the CASES matches." + (declare (debug (&rest (sexp &rest form)))) + `(cond + ,@(cl-loop for (token . action) in cases + collect + (if (eq token t) + `(t ,@action) + `((looking-at ,token) + (goto-char (match-end 0)) + ,@action))))) + +(defmacro rxt-with-source-location (&rest body) + "Evaluate BODY and record source location information on its value. + +BODY may evaluate to any kind of object, but its value should +generally not be `eq' to any other object." + (declare (debug (&rest form))) + (let ((begin (make-symbol "begin")) + (value (make-symbol "value"))) + `(let ((,begin (point)) + (,value ,(macroexp-progn body))) + (setf (rxt-location ,value) + (make-rxt-location :source rxt-source-text-string + :start (1- ,begin) + :end (1- (point)))) + ,value))) + +;; Read PCRE + flags +(defun rxt-read-delimited-pcre () + "Read a Perl-style delimited regexp and flags from the current buffer. + +Point should be before the regexp literal before calling +this. Currently only regexps delimited by / ... / are supported. +A preceding \"m\", \"qr\" or \"s\" will be ignored, as will the +replacement string in an s/.../.../ construction. + +Returns two strings: the regexp and the flags." + (save-excursion + (skip-syntax-forward "-") + + ;; Skip m, qr, s + (let ((is-subst (rxt-token-case + ("s" t) + ((rx (or "m" "qr")) nil)))) + + (when (not (looking-at "/")) + (error "Only Perl regexps delimited by slashes are supported")) + (let ((beg (match-end 0)) + (delim (rx (not (any "\\")) + (group "/")))) + (search-forward-regexp delim) + (let ((end (match-beginning 1))) + (when is-subst (search-forward-regexp delim)) + (let ((pcre (buffer-substring-no-properties beg end))) + (rxt-token-case + ("[gimosx]*" + (rxt--add-flags pcre (match-string-no-properties 0)))))))))) + + +;;;; Elisp and PCRE string notation parser + +;;; Parser constants +(defconst rxt-pcre-char-set-alist + `((?w . ; "word" characters + (?_ alnum)) + (?d . ; digits + (digit)) + (?h . ; horizontal whitespace + (#x0009 #x0020 #x00A0 #x1680 #x180E #x2000 #x2001 #x2002 #x2003 + #x2004 #x2005 #x2006 #x2007 #x2008 #x2009 #x200A #x202F + #x205F #x3000)) + (?s . ; whitespace + (9 10 12 13 32)) + (?v . ; vertical whitespace + (#x000A #x000B #x000C #x000D #x0085 #x2028 #x2029)))) + +(defconst rxt-pcre-named-classes-regexp + (rx "[:" + (submatch + (or "alnum" "alpha" "ascii" "blank" "cntrl" "digit" "graph" "lower" + "print" "punct" "space" "upper" "word" "xdigit")) + ":]")) + +(defconst rxt-elisp-named-classes-regexp + (rx "[:" + (submatch + (or "alnum" "alpha" "ascii" "blank" "cntrl" "digit" "graph" "lower" + "print" "punct" "space" "upper" "word" "xdigit" + "unibyte" "nonascii" "multibyte")) + ":]")) + +;;; The following dynamically bound variables control the operation of +;;; the parser (see `rxt-parse-re'.) +(defvar rxt-parse-pcre nil + "t if the rxt string parser is parsing PCRE syntax, nil for Elisp syntax. + +This should only be let-bound internally, never set otherwise.") + +(defvar rxt-pcre-extended-mode nil + "t if the rxt string parser is emulating PCRE's \"extended\" mode. + +In extended mode (indicated by /x in Perl/PCRE), whitespace +outside of character classes and \\Q...\\E quoting is ignored, +and a `#' character introduces a comment that extends to the end +of line.") + +(defvar rxt-pcre-s-mode nil + "t if the rxt string parser is emulating PCRE's single-line \"/s\" mode. + +When /s is used, PCRE's \".\" matches newline characters, which +otherwise it would not match.") + +(defvar rxt-pcre-case-fold nil + "non-nil to emulate PCRE's case-insensitive \"/i\" mode in translated regexps.") + +(defvar rxt-branch-end-regexp nil) +(defvar rxt-choice-regexp nil) +(defvar rxt-brace-begin-regexp nil) +(defvar rxt-m-to-n-brace-regexp nil) +(defvar rxt-m-to-?-brace-regexp nil) +(defvar rxt-m-brace-regexp nil) +(defvar rxt-named-classes-regexp nil) + +(defvar rxt-subgroup-count nil) +(defvar rxt-source-text-string nil) + +(defun rxt-parse-pcre (re) + (rxt-parse-re re t)) + +(defun rxt-parse-elisp (re) + (rxt-parse-re re nil)) + +(defun rxt-parse-re (re pcre-p) + (let* ((rxt-parse-pcre pcre-p) + (rxt-pcre-extended-mode nil) + (rxt-pcre-s-mode nil) + (rxt-pcre-case-fold nil) + + ;; Bind regexps to match syntax that differs between PCRE and + ;; Elisp only in the addition of a backslash "\" + (escape (if pcre-p "" "\\")) + (rxt-choice-regexp + (rx-to-string `(seq ,escape "|"))) + (rxt-branch-end-regexp + (rx-to-string `(or buffer-end + (seq ,escape (or "|" ")"))))) + (rxt-brace-begin-regexp + (rx-to-string `(seq ,escape "{"))) + (rxt-m-to-n-brace-regexp + (rx-to-string + `(seq + (submatch (* (any "0-9"))) "," (submatch (+ (any "0-9"))) + ,escape "}"))) + (rxt-m-to-?-brace-regexp + (rx-to-string + `(seq (submatch (+ (any "0-9"))) "," ,escape "}"))) + (rxt-m-brace-regexp + (rx-to-string + `(seq (submatch (+ (any "0-9"))) ,escape "}"))) + + ;; Named character classes [: ... :] differ slightly + (rxt-named-classes-regexp + (if pcre-p + rxt-pcre-named-classes-regexp + rxt-elisp-named-classes-regexp)) + + (rxt-subgroup-count 0) + (case-fold-search nil)) + (with-temp-buffer + (insert re) + (goto-char (point-min)) + (let ((rxt-source-text-string re)) + (rxt-parse-exp))))) + +;; Parse a complete regex: a number of branches separated by | or +;; \|, as determined by `rxt-branch-end-regexp'. +(defun rxt-parse-exp () + ;; These variables are let-bound here because in PCRE mode they may + ;; be set internally by (?x) or (?s) constructions, whose scope + ;; lasts until the end of a sub-expression + (rxt-with-source-location + (let ((rxt-pcre-extended-mode rxt-pcre-extended-mode) + (rxt-pcre-s-mode rxt-pcre-s-mode) + (rxt-pcre-case-fold rxt-pcre-case-fold)) + (if (eobp) + (rxt-seq) + (let ((branches '())) + (cl-block nil + (while t + (let ((branch (rxt-parse-branch))) + (push branch branches) + (rxt-token-case + (rxt-choice-regexp nil) + (t (cl-return (apply #'rxt-choice (reverse branches))))))))))))) + +;; Skip over whitespace and comments in PCRE extended mode +(defun rxt-extended-skip () + (when rxt-pcre-extended-mode + (skip-syntax-forward "-") + (while (looking-at "#") + (beginning-of-line 2) + (skip-syntax-forward "-")))) + +;; Parse a regexp "branch": a sequence of pieces +(defun rxt-parse-branch () + (rxt-extended-skip) + (rxt-with-source-location + (let ((pieces '()) + (branch-start-p t)) + (while (not (looking-at rxt-branch-end-regexp)) + (push (rxt-parse-piece branch-start-p) pieces) + (setq branch-start-p nil)) + (apply #'rxt-seq (reverse pieces))))) + +;; Parse a regexp "piece": an atom (`rxt-parse-atom') plus any +;; following quantifiers +(defun rxt-parse-piece (&optional branch-begin) + (rxt-extended-skip) + (rxt-with-source-location + (let ((atom (rxt-parse-atom branch-begin))) + (rxt-parse-quantifiers atom)))) + +;; Parse any and all quantifiers after ATOM and return the quantified +;; regexp, or ATOM unchanged if no quantifiers +(defun rxt-parse-quantifiers (atom) + (catch 'done + (while (not (eobp)) + (let ((atom1 (rxt-parse-quantifier atom))) + (if (eq atom1 atom) + (throw 'done t) + (setq atom atom1))))) + atom) + +;; Possibly parse a single quantifier after ATOM and return the +;; quantified atom, or ATOM if no quantifier +(defun rxt-parse-quantifier (atom) + (rxt-extended-skip) + (rxt-token-case + ((rx "*?") (rxt-repeat 0 nil atom nil)) + ((rx "*") (rxt-repeat 0 nil atom t)) + ((rx "+?") (rxt-repeat 1 nil atom nil)) + ((rx "+") (rxt-repeat 1 nil atom t)) + ((rx "??") (rxt-repeat 0 1 atom nil)) + ((rx "?") (rxt-repeat 0 1 atom t)) + ;; Brace expression "{M,N}", "{M,}", "{M}" + (rxt-brace-begin-regexp + (cl-destructuring-bind (from to) + (rxt-parse-braces) + (rxt-repeat from to atom))) + ;; No quantifiers found + (t atom))) + +;; Parse a regexp atom, i.e. an element that binds to any following +;; quantifiers. This includes characters, character classes, +;; parenthesized groups, assertions, etc. +(defun rxt-parse-atom (&optional branch-begin) + (if (eobp) + (rxt-error "Unexpected end of regular expression") + (if rxt-parse-pcre + (rxt-parse-atom/pcre) + (rxt-parse-atom/el branch-begin)))) + +(defun rxt-parse-atom/common () + (rxt-token-case + ((rx "[") (rxt-parse-char-class)) + ((rx "\\b") (rxt-word-boundary)) + ((rx "\\B") (rxt-not-word-boundary)))) + +(defun rxt-parse-atom/el (branch-begin) + (rxt-with-source-location + (or (rxt-parse-atom/common) + (rxt-token-case + ;; "." wildcard + ((rx ".") (rxt-nonl)) + ;; "^" and "$" are metacharacters only at beginning or end of a + ;; branch in Elisp; elsewhere they are literals + ((rx "^") + (if branch-begin + (rxt-bol) + (rxt-string "^"))) + ((rx "$") + (if (looking-at rxt-branch-end-regexp) + (rxt-eol) + (rxt-string "$"))) + ;; Beginning & end of string, word, symbol + ((rx "\\`") (rxt-bos)) + ((rx "\\'") (rxt-eos)) + ((rx "\\<") (rxt-bow)) + ((rx "\\>") (rxt-eow)) + ((rx "\\_<") (rxt-symbol-start)) + ((rx "\\_>") (rxt-symbol-end)) + ;; Subgroup + ((rx "\\(") (rxt-parse-subgroup/el)) + ;; Word/non-word characters (meaning depending on syntax table) + ((rx "\\w") (rxt-wordchar)) + ((rx "\\W") (rxt-not-wordchar)) + ;; Other syntax categories + ((rx "\\" (submatch (any ?S ?s)) (submatch nonl)) + (let ((negated (string= (match-string 1) "S")) + (syntax + (car (rassoc (string-to-char (match-string 2)) + rx-syntax)))) + (if syntax + (let ((re (rxt-syntax-class syntax))) + (if negated (rxt-negate re) re)) + (rxt-error "Invalid syntax class `\\%s'" (match-string 0))))) + ;; Character categories + ((rx "\\" (submatch (any ?C ?c)) (submatch nonl)) + (let ((negated (string= (match-string 1) "C")) + (category + (car (rassoc (string-to-char (match-string 2)) + rx-categories)))) + (if category + (let ((re (rxt-char-category category))) + (if negated (rxt-negate re) re)) + (rxt-error "Invalid character category `%s'" (match-string 0))))) + ;; Backreference + ((rx (seq "\\" (submatch (any "1-9")))) + (rxt-backref (string-to-number (match-string 1)))) + ;; Other escaped characters + ((rx (seq "\\" (submatch nonl))) + (rxt-string (match-string 1))) + ;; Normal characters + ((rx (or "\n" nonl)) + (rxt-string (match-string 0))))))) + +(defun rxt-parse-atom/pcre () + (rxt-extended-skip) + (rxt-with-source-location + (or + ;; Is it an atom that's the same in Elisp? + (rxt-parse-atom/common) + ;; Is it common to PCRE regex and character class syntax? + (let ((char (rxt-parse-escapes/pcre))) + (and char + (rxt-string (char-to-string char)))) + ;; Otherwise: + (rxt-token-case + ;; "." wildcard + ((rx ".") + (if rxt-pcre-s-mode + (rxt-anything) + (rxt-nonl))) + ;; Beginning & end of string/line + ((rx "^") (rxt-bol)) + ((rx "$") (rxt-eol)) + ((rx "\\A") (rxt-bos)) + ((rx "\\Z") (rxt-eos)) + ;; Subgroup + ((rx "(") (rxt-parse-subgroup/pcre)) + ;; Metacharacter quoting + ((rx "\\Q") + ;; It would seem simple to take all the characters between \Q + ;; and \E and make an rxt-string, but \Q...\E isn't an atom: + ;; any quantifiers afterward should bind only to the last + ;; character, not the whole string. + (let ((begin (point))) + (search-forward "\\E" nil t) + (let* ((end (match-beginning 0)) + (str (buffer-substring-no-properties begin (1- end))) + (char (char-to-string (char-before end)))) + (rxt-seq (rxt-string str) + (rxt-parse-quantifiers (rxt-string char)))))) + ;; Pre-defined character sets + ((rx "\\" (submatch (any "d" "D" "h" "H" "s" "S" "v" "V" "w" "W"))) + (rxt--pcre-char-set (string-to-char (match-string 1)))) + ;; \ + digits: backreference or octal char? + ((rx "\\" (submatch (+ (any "0-9")))) + (let* ((digits (match-string 1)) + (dec (string-to-number digits))) + ;; from "man pcrepattern": If the number is less than 10, or if + ;; there have been at least that many previous capturing left + ;; parentheses in the expression, the entire sequence is taken + ;; as a back reference. + (if (and (> dec 0) + (or (< dec 10) + (>= rxt-subgroup-count dec))) + (progn + (when rxt-pcre-case-fold + (display-warning + 'rxt "Backreferences with case-folding are handled poorly")) + (rxt-backref dec)) + ;; from "man pcrepattern": if the decimal number is greater + ;; than 9 and there have not been that many capturing + ;; subpatterns, PCRE re-reads up to three octal digits + ;; following the backslash, and uses them to generate a data + ;; character. Any subsequent digits stand for themselves. + (goto-char (match-beginning 1)) + (re-search-forward (rx (** 0 3 (any "0-7")))) + (rxt-string (char-to-string (string-to-number (match-string 0) 8)))))) + ;; Other escaped characters + ((rx "\\" (submatch nonl)) (rxt-string (match-string 1))) + ;; Everything else + ((rx (or (any "\n") nonl)) (rxt-string (match-string 0))))))) + +(defun rxt-parse-escapes/pcre () + "Consume a one-char PCRE escape at point and return its codepoint equivalent. + +Handles only those character escapes which have the same meaning +in character classes as outside them." + (rxt-token-case + ((rx "\\a") #x07) ; bell + ((rx "\\e") #x1b) ; escape + ((rx "\\f") #x0c) ; formfeed + ((rx "\\n") #x0a) ; linefeed + ((rx "\\r") #x0d) ; carriage return + ((rx "\\t") #x09) ; tab + ;; Control character + ((rx "\\c" (submatch nonl)) + ;; from `man pcrepattern': + ;; The precise effect of \cx is as follows: if x is a lower case + ;; letter, it is converted to upper case. Then bit 6 of the + ;; character (hex 40) is inverted. + (logxor (string-to-char (upcase (match-string 1))) #x40)) + ;; Hex escapes + ((rx "\\x" (submatch (** 1 2 (any "0-9" "A-Z" "a-z")))) + (string-to-number (match-string 1) 16)) + ((rx "\\x{" (submatch (* (any "0-9" "A-Z" "a-z"))) "}") + (string-to-number (match-string 1) 16)))) + +(defun rxt-parse-subgroup/pcre () + (catch 'return + (let ((shy nil) + (extended-mode rxt-pcre-extended-mode) + (single-line-mode rxt-pcre-s-mode) + (case-fold rxt-pcre-case-fold)) + (rxt-extended-skip) + ;; Check for special (? ..) and (* ...) syntax + (rxt-token-case + ((rx "?") ; (? + (rxt-token-case + ((rx ")") ; Empty group (?) + (throw 'return (rxt-empty-string))) + (":" (setq shy t)) ; Shy group (?: + ("#" ; Comment (?# + (search-forward ")") + (throw 'return (rxt-empty-string))) + ((rx (or ; Flags (?isx-isx + (seq (group (* (any "gimosx"))) "-" (group (+ (any "gimosx")))) + (seq (group (+ (any "gimosx")))))) + (let ((token (match-string 0)) + (on (or (match-string 1) (match-string 3))) + (off (or (match-string 2) ""))) + (if (cl-find ?x on) (setq extended-mode t)) + (if (cl-find ?s on) (setq single-line-mode t)) + (if (cl-find ?i on) (setq case-fold t)) + (if (cl-find ?x off) (setq extended-mode nil)) + (if (cl-find ?s off) (setq single-line-mode nil)) + (if (cl-find ?i off) (setq case-fold nil)) + (when (string-match-p "[gmo]" token) + (display-warning + 'rxt (format "Unhandled PCRE flags in (?%s" token)))) + (rxt-token-case + (":" (setq shy t)) ; Shy group with flags (?isx-isx: ... + (")" ; Set flags (?isx-isx) + ;; Set flags for the remainder of the current subexpression + (setq rxt-pcre-extended-mode extended-mode + rxt-pcre-s-mode single-line-mode + rxt-pcre-case-fold case-fold) + (throw 'return (rxt-empty-string))))) + ;; Other constructions like (?=, (?!, etc. are not recognised + (t (rxt-error "Unrecognized PCRE extended construction `(?%c'" + (char-after))))) + + ;; No special (* ...) verbs are recognised + ((rx "*") + (let ((begin (point))) + (search-forward ")" nil 'go-to-end) + (rxt-error "Unrecognized PCRE extended construction `(*%s'" + (buffer-substring begin (point)))))) + + ;; Parse the remainder of the subgroup + (unless shy (cl-incf rxt-subgroup-count)) + (let* ((rxt-pcre-extended-mode extended-mode) + (rxt-pcre-s-mode single-line-mode) + (rxt-pcre-case-fold case-fold) + (rx (rxt-parse-exp))) + (rxt-extended-skip) + (rxt-token-case + (")" (if shy rx (rxt-submatch rx))) + (t (rxt-error "Subexpression missing close paren"))))))) + +(defun rxt-parse-subgroup/el () + (let ((kind + (rxt-token-case + ((rx "?:") + (cl-incf rxt-subgroup-count) + 'shy) + ((rx "?" (group (+ (in "0-9"))) ":") + (let ((n (string-to-number (match-string 1)))) + (when (< rxt-subgroup-count n) + (setf rxt-subgroup-count n)) + n)) + ((rx "?") ; Reserved + (rxt-error "Unknown match group sequence"))))) + (let ((rx (rxt-parse-exp))) + (rxt-token-case + ((rx "\\)") + (cond ((eq kind 'shy) rx) + ((numberp kind) + (rxt-submatch-numbered kind rx)) + (t (rxt-submatch rx)))) + (t (rxt-error "Subexpression missing close paren")))))) + +(defun rxt-parse-braces () + (rxt-token-case + (rxt-m-to-n-brace-regexp + (list (string-to-number (match-string 1)) + (string-to-number (match-string 2)))) + (rxt-m-to-?-brace-regexp + (list (string-to-number (match-string 1)) nil)) + (rxt-m-brace-regexp + (let ((a (string-to-number (match-string 1)))) + (list a a))) + (t + (let ((begin (point))) + (search-forward "}" nil 'go-to-end) + (rxt-error "Bad brace expression {%s" + (buffer-substring-no-properties begin (point))))))) + +;; Parse a character set range [...] +(defun rxt-parse-char-class () + (when (eobp) + (rxt-error "Missing close right bracket in regexp")) + (rxt-with-source-location + (let* ((negated (rxt-token-case + ((rx "^") t) + (t nil))) + (begin (point)) + (result + (if negated + (rxt-negate (rxt-char-set-union)) + (rxt-char-set-union))) + (transformer + (if negated #'rxt-negate #'identity)) + (builder + (if negated #'rxt-char-set-intersection #'rxt-choice))) + (catch 'done + (while t + (when (eobp) + (rxt-error "Missing close right bracket in regexp")) + (if (and (looking-at (rx "]")) + (not (= (point) begin))) + (throw 'done result) + (let ((piece (funcall transformer (rxt-parse-char-class-piece)))) + (setq result (funcall builder result piece)))))) + (forward-char) ; Skip over closing "]" + result))) + +;; Parse a single character, a character range, or a posix class +;; within a character set context. Returns an `rxt-char-set'. +(defun rxt-parse-char-class-piece () + (let ((atom (rxt-parse-char-class-atom))) + (cl-typecase atom + (rxt-char-set ; return unchanged + atom) + (integer ; character: check for range + (let ((range-end (rxt-maybe-parse-range-end))) + (if range-end + (rxt-char-set-union (cons atom range-end)) + (rxt-char-set-union atom)))) + (t ; transform into character set + (rxt-char-set-union atom))))) + +;; Parse a single character or named class within a charset. +;; +;; Returns an integer (a character), a symbol (representing a named +;; character class) or an `rxt-char-set' (for pre-defined character +;; classes like \d, \W, etc.) +(defun rxt-parse-char-class-atom () + (or + ;; First, check for PCRE-specific backslash sequences + (and rxt-parse-pcre + (rxt-parse-char-class-atom/pcre)) + ;; Char-class syntax + (rxt-token-case + ;; Named classes [:alnum:], ... + (rxt-named-classes-regexp + (intern (match-string 1))) + ;; Error on unknown posix-class-like syntax + ((rx "[:" (* (any "a-z")) ":]") + (rxt-error "Unknown posix character class `%s'" (match-string 0))) + ;; Error on [= ... ]= collation syntax + ((rx "[" (submatch (any "." "=")) + (* (any "a-z")) (backref 1) "]") + (rxt-error "Unsupported collation syntax `%s'" (match-string 0))) + ;; Other characters stand for themselves + ((rx (or "\n" nonl)) + (string-to-char (match-string 0)))))) + +;; Parse backslash escapes inside PCRE character classes +(defun rxt-parse-char-class-atom/pcre () + (or (rxt-parse-escapes/pcre) + (rxt-token-case + ;; Backslash + digits => octal char + ((rx "\\" (submatch (** 1 3 (any "0-7")))) + (string-to-number (match-string 1) 8)) + ;; Pre-defined character sets + ((rx "\\" (submatch (any "d" "D" "h" "H" "s" "S" "v" "V" "w" "W"))) + (rxt--pcre-char-set (string-to-char (match-string 1)))) + ;; "\b" inside character classes is a backspace + ((rx "\\b") ?\C-h) + ;; Ignore other escapes + ((rx "\\" (submatch nonl)) + (string-to-char (match-string 1)))))) + +;; Look for a range tail (the "-z" in "a-z") after parsing a single +;; character within in a character set. Returns either a character +;; representing the range end, or nil. +(defun rxt-maybe-parse-range-end () + (let ((range-end nil) (end-position nil)) + (when (looking-at (rx "-" (not (any "]")))) + (save-excursion + (forward-char) + (setq range-end (rxt-parse-char-class-atom) + end-position (point)))) + + (if (characterp range-end) + ;; This is a range: move point after it and return the ending character + (progn + (goto-char end-position) + range-end) + ;; Not a range. + nil))) + +;; Return the pre-defined PCRE char-set associated with CHAR: i.e. \d +;; is digits, \D non-digits, \s space characters, etc. +(defun rxt--pcre-char-set (char) + (let* ((base-char (downcase char)) + (negated (/= char base-char)) + (elements (assoc-default base-char rxt-pcre-char-set-alist)) + (base-char-set (apply #'rxt-char-set-union elements))) + (if negated + (rxt-negate base-char-set) + base-char-set))) + + +;;;; Unparser to `rx' syntax + +(defconst rxt-rx-verbose-equivalents + '((bol . line-start) + (eol . line-end) + (nonl . not-newline) + (bos . string-start) + (eos . string-end) + (bow . word-start) + (eow . word-end) + (seq . sequence)) + "Alist of verbose equivalents for short `rx' primitives.") + +(defun rxt-rx-symbol (sym) + (if rxt-verbose-rx-translation + (or (assoc-default sym rxt-rx-verbose-equivalents) + sym) + sym)) + +(defun rxt-adt->rx (re) + (let ((rx + (cl-typecase re + (rxt-primitive + (rxt-rx-symbol (rxt-primitive-rx re))) + + (rxt-string + (if (or (not (rxt-string-case-fold re)) + (string= "" (rxt-string-chars re))) + (rxt-string-chars re) + `(seq + ,@(cl-loop for char across (rxt-string-chars re) + collect `(any ,(upcase char) ,(downcase char)))))) + + (rxt-seq + `(seq ,@(mapcar #'rxt-adt->rx (rxt-seq-elts re)))) + + (rxt-choice + `(or ,@(mapcar #'rxt-adt->rx (rxt-choice-elts re)))) + + (rxt-submatch + (if (rxt-seq-p (rxt-submatch-body re)) + `(submatch + ,@(mapcar #'rxt-adt->rx (rxt-seq-elts (rxt-submatch-body re)))) + `(submatch ,(rxt-adt->rx (rxt-submatch-body re))))) + + (rxt-submatch-numbered + (if (rxt-seq-p (rxt-submatch-numbered-p re)) + `(,(rxt-rx-symbol 'submatch-n) + ,(rxt-submatch-numbered-n re) + ,@(mapcar #'rxt-adt->rx + (rxt-seq-elts + (rxt-submatch-numbered-body re)))) + `(,(rxt-rx-symbol 'submatch-n) + ,(rxt-submatch-numbered-n re) + ,(rxt-adt->rx (rxt-submatch-numbered-body re))))) + + (rxt-backref + (let ((n (rxt-backref-n re))) + (if (<= n 9) + `(backref ,(rxt-backref-n re)) + (rxt-error "Too many backreferences (%s)" n)))) + + (rxt-syntax-class + `(syntax ,(rxt-syntax-class-symbol re))) + + (rxt-char-category + `(category ,(rxt-char-category-symbol re))) + + (rxt-repeat + (let ((from (rxt-repeat-from re)) + (to (rxt-repeat-to re)) + (greedy (rxt-repeat-greedy re)) + (body (rxt-adt->rx (rxt-repeat-body re)))) + (if rxt-verbose-rx-translation + (let ((rx + (cond + ((and (zerop from) (null to)) + `(zero-or-more ,body)) + ((and (equal from 1) (null to)) + `(one-or-more ,body)) + ((and (zerop from) (equal to 1)) + `(zero-or-one ,body)) + ((null to) + `(>= ,from ,body)) + ((equal from to) + `(repeat ,from ,body)) + (t + `(** ,from ,to ,body))))) + (if greedy + (if rxt-explain + rx ; Readable but not strictly accurate. Fixme? + `(maximal-match ,rx)) + `(minimal-match ,rx))) + (cond + ((and (zerop from) (null to)) + `(,(if greedy '* '*?) ,body)) + ((and (equal from 1) (null to)) + `(,(if greedy '+ '+?) ,body)) + ((and (zerop from) (equal to 1)) + `(,(if greedy ? ??) ,body)) + ((null to) + `(>= ,from ,body)) + ((equal from to) + `(= ,from ,body)) + (t + `(** ,from ,to ,body)))))) + + (rxt-char-set-union + (let* ((case-fold (rxt-char-set-union-case-fold re)) + (re (rxt--simplify-char-set re case-fold)) + (chars (rxt-char-set-union-chars re)) + (ranges (rxt-char-set-union-ranges re)) + (classes (rxt-char-set-union-classes re)) + (case-fold (rxt-char-set-union-case-fold re))) + (if (and (null chars) (null ranges) (= 1 (length classes))) + (car classes) + `(any ,@chars ,@ranges ,@classes)))) + + (rxt-char-set-negation + `(not ,(rxt-adt->rx (rxt-char-set-negation-elt re)))) + + (t + (rxt-error "No RX translation of `%s'" (rxt-to-string re)))))) + + ;; Store source information on each fragment of the generated RX + ;; sexp for rxt-explain mode + (when rxt-explain + ;; Use gensyms to store unique source information for multiple + ;; occurrences of primitives like `bol' + (when (symbolp rx) + (setq rx (make-symbol (symbol-name rx)))) + (setf (rxt-location rx) (rxt-location re))) + rx)) + + +;;;; 'Unparser' to PCRE notation + +;;; Based on scsh/posixstr.scm in scsh + +;; To ensure that the operator precedence in the generated regexp does +;; what we want, we need to keep track of what kind of production is +;; returned from each step. Therefore these functions return a string +;; and a numeric "level" which lets the function using the generated +;; regexp know whether it has to be parenthesized: +;; +;; 0: an already parenthesized expression +;; +;; 1: a "piece" binds to any succeeding quantifiers +;; +;; 2: a "branch", or concatenation of pieces, needs parenthesizing to +;; bind to quantifiers +;; +;; 3: a "top", or alternation of branches, needs parenthesizing to +;; bind to quantifiers or to concatenation +;; +;; This idea is stolen straight out of the scsh implementation. + +(defun rxt-adt->pcre (re) + (cl-destructuring-bind (s _) (rxt-adt->pcre/lev re) s)) + +(defun rxt-adt->pcre/lev (re) + (cl-typecase re + (rxt-primitive + (let ((s (rxt-primitive-pcre re))) + (if s + (list s 1) + (rxt-error "No PCRE translation of `%s'" (rxt-to-string re))))) + + (rxt-string (rxt-string->pcre re)) + (rxt-seq (rxt-seq->pcre re)) + (rxt-choice (rxt-choice->pcre re)) + (rxt-submatch (rxt-submatch->pcre re)) + (rxt-backref + (list (format "\\%d" (rxt-backref-n re)) 1)) + (rxt-repeat (rxt-repeat->pcre re)) + + ((or rxt-char-set-union rxt-char-set-negation) + (rxt-char-set->pcre re)) + + ;; FIXME + ;; ((rxt-char-set-intersection re) (rxt-char-set-intersection->pcre re)) + + (t + (rxt-error "No PCRE translation of `%s'" (rxt-to-string re))))) + +(defconst rxt-pcre-metachars (rx (any "\\^.$|()[]*+?{}"))) +(defconst rxt-pcre-charset-metachars (rx (any "]" "[" "\\" "^" "-"))) + +(defun rxt-string->pcre (re) + (let ((chars (rxt-string-chars re))) + (list + (replace-regexp-in-string + rxt-pcre-metachars + "\\\\\\&" chars) + ;; A one-character string is a 'piece' (it binds to a following + ;; quantifier). A longer string is a 'branch' (it has to be + ;; enclosed in parentheses to bind to a following quantifier). + (if (> (length chars) 1) 2 1)))) + +(defun rxt-seq->pcre (re) + (let ((elts (rxt-seq-elts re))) + (if (null elts) + "" + (rxt-seq-elts->pcre elts)))) + +(defun rxt-seq-elts->pcre (elts) + (cl-destructuring-bind + (s lev) (rxt-adt->pcre/lev (car elts)) + (if (null (cdr elts)) + (list s lev) + (cl-destructuring-bind + (s1 lev1) (rxt-seq-elts->pcre (cdr elts)) + (list (concat (rxt-paren-if-necessary s lev) + (rxt-paren-if-necessary s1 lev1)) + 2))))) + +(defun rxt-paren-if-necessary (s lev) + (if (< lev 3) + s + (concat "(?:" s ")"))) + +(defun rxt-choice->pcre (re) + (let ((elts (rxt-choice-elts re))) + (if (null elts) + nil + (rxt-choice-elts->pcre elts)))) + +(defun rxt-choice-elts->pcre (elts) + (cl-destructuring-bind + (s lev) (rxt-adt->pcre/lev (car elts)) + (if (null (cdr elts)) + (list s lev) + (cl-destructuring-bind + (s1 lev1) (rxt-choice-elts->pcre (cdr elts)) + (list (concat s "|" s1) 3))))) + +(defun rxt-submatch->pcre (re) + (cl-destructuring-bind + (s lev) (rxt-adt->pcre/lev (rxt-submatch-body re)) + (list (concat "(" s ")") 0))) + +(defun rxt-repeat->pcre (re) + (let ((from (rxt-repeat-from re)) + (to (rxt-repeat-to re)) + (body (rxt-repeat-body re)) + (greedy (rxt-repeat-greedy re))) + (cl-destructuring-bind + (s lev) (rxt-adt->pcre/lev body) + (cond + ((and to (= from 1) (= to 1)) (list s lev)) + ((and to (= from 0) (= to 0)) (list "" 2)) + (t + (when (> lev 1) ; parenthesize non-atoms + (setq s (concat "(?:" s ")") + lev 0)) + (list (if to + (cond ((and (= from 0) (= to 1)) + (concat s (if greedy "?" "??"))) + ((= from to) + (concat s "{" (number-to-string to) "}")) + (t + (concat s "{" (number-to-string from) + "," (number-to-string to) "}"))) + (cond ((= from 0) + (concat s (if greedy "*" "*?"))) + ((= from 1) + (concat s (if greedy "+" "+?"))) + (t (concat s "{" (number-to-string from) ",}")))) + 1)))))) + +(defun rxt-char-set->pcre (re) + (cond ((rxt-char-set-union-p re) + (list + (concat "[" (rxt-char-set->pcre/chars re) "]") 1)) + + ((rxt-char-set-negation-p re) + (let ((elt (rxt-char-set-negation-elt re))) + (if (rxt-char-set-union-p elt) + (list + (concat "[^" (rxt-char-set->pcre/chars elt) "]") 1) + (rxt-error "No PCRE translation of `%s'" (rxt-to-string elt))))) + + (t + (rxt-error "Non-char-set in rxt-char-set->pcre: %s" re)))) + +;; Fortunately, easier in PCRE than in POSIX! +(defun rxt-char-set->pcre/chars (re) + (cl-flet + ((escape + (char) + (let ((s (char-to-string char))) + (cond ((string-match rxt-pcre-charset-metachars s) + (concat "\\" s)) + + ((and (not (string= s " ")) + (string-match "[^[:graph:]]" s)) + (format "\\x{%x}" char)) + + (t s))))) + + (let ((chars (rxt-char-set-union-chars re)) + (ranges (rxt-char-set-union-ranges re)) + (classes (rxt-char-set-union-classes re))) + + (concat + (mapconcat #'escape chars "") + (mapconcat #'(lambda (rg) + (format "%s-%s" + (escape (car rg)) + (escape (cdr rg)))) + ranges "") + (mapconcat #'(lambda (class) + (format "[:%s:]" class)) + classes ""))))) + + +;;;; Generate all productions of a finite regexp + +(defun rxt-adt->strings (re) + (cl-typecase re + (rxt-primitive + (list "")) + (rxt-string + (list (rxt-string-chars re))) + (rxt-seq + (rxt-seq-elts->strings (rxt-seq-elts re))) + (rxt-choice + (rxt-choice-elts->strings (rxt-choice-elts re))) + (rxt-submatch + (rxt-adt->strings (rxt-submatch-body re))) + (rxt-submatch-numbered + (rxt-adt->strings (rxt-submatch-numbered-body re))) + (rxt-repeat + (rxt-repeat->strings re)) + (rxt-char-set-union + (rxt-char-set->strings re)) + (t + (error "Can't generate productions of %s" + (rxt-syntax-tree-readable re))))) + +(defun rxt-concat-product (heads tails) + (cl-mapcan + (lambda (hs) + (mapcar + (lambda (ts) (concat hs ts)) + tails)) + heads)) + +(defun rxt-seq-elts->strings (elts) + (if (null elts) + '("") + (let ((heads (rxt-adt->strings (car elts))) + (tails (rxt-seq-elts->strings (cdr elts)))) + (rxt-concat-product heads tails)))) + +(defun rxt-choice-elts->strings (elts) + (if (null elts) + '() + (append (rxt-adt->strings (car elts)) + (rxt-choice-elts->strings (cdr elts))))) + +(defun rxt-repeat->strings (re) + (let ((from (rxt-repeat-from re)) + (to (rxt-repeat-to re))) + (if (not to) + (error "Can't generate all productions of unbounded repeat \"%s\"" + (rxt-syntax-tree-readable re)) + (let ((strings (rxt-adt->strings (rxt-repeat-body re)))) + (rxt-repeat-n-m->strings from to strings))))) + +(defun rxt-repeat-n-m->strings (from to strings) + (cond + ((zerop to) '("")) + ((= to from) (rxt-repeat-n->strings from strings)) + (t ; to > from + (let* ((strs-n (rxt-repeat-n->strings from strings)) + (accum (cl-copy-list strs-n))) + (dotimes (i (- to from)) + (setq strs-n (rxt-concat-product strs-n strings)) + (setq accum (nconc accum strs-n))) + accum)))) + +(defun rxt-repeat-n->strings (n strings) + ;; n > 1 + (cond ((zerop n) '("")) + ((= n 1) strings) + (t + (rxt-concat-product + (rxt-repeat-n->strings (- n 1) strings) + strings)))) + +(defun rxt-char-set->strings (re) + (if (rxt-char-set-union-classes re) + (error "Can't generate all productions of named character classes in \"%s\"" + (rxt-syntax-tree-readable re)) + (let ((chars (mapcar #'char-to-string (rxt-char-set-union-chars re)))) + (dolist (range (rxt-char-set-union-ranges re)) + (let ((end (cdr range))) + (cl-do ((i (car range) (+ i 1))) + ((> i end)) + (push (char-to-string i) chars)))) + chars))) + + +;;;; RE-Builder hacks + +(defadvice reb-update-modestring + (after rxt () activate compile) + "This function is hacked for emulated PCRE syntax and regexp conversion." + (setq reb-mode-string + (concat + (format " (%s)" reb-re-syntax) + reb-mode-string)) + (force-mode-line-update)) + +(defadvice reb-change-syntax + (around rxt (&optional syntax) activate compile) + "This function is hacked for emulated PCRE syntax and regexp conversion." + (interactive + (list (intern + (completing-read (format "Select syntax (%s): " reb-re-syntax) + '(read string pcre sregex rx) + nil t "" nil (symbol-name reb-re-syntax))))) + (unless (memq syntax '(read string pcre lisp-re sregex rx)) + (error "Invalid syntax: %s" syntax)) + (let ((re-builder-buffer (get-buffer reb-buffer))) + (setq reb-re-syntax syntax) + (when re-builder-buffer + (with-current-buffer reb-target-buffer + (cl-case syntax + (rx + (let ((rx (rxt-elisp-to-rx reb-regexp))) + (setq reb-regexp-src + (with-temp-buffer + (insert "\n" "'") + (rxt-print rx) + (buffer-string))))) + (pcre + (setq reb-regexp-src (rxt-elisp-to-pcre reb-regexp))))) + (with-current-buffer re-builder-buffer + ;; Hack: prevent reb-auto-update from clobbering the + ;; reb-regexp-src we just set + (let ((inhibit-modification-hooks t)) + (reb-initialize-buffer)) + ;; Enable flag-toggling bindings for PCRE syntax + (rxt--re-builder-switch-pcre-mode))))) + +(defadvice reb-read-regexp + (around rxt () activate compile) + "This function is hacked for emulated PCRE syntax and regexp conversion." + (if (eq reb-re-syntax 'pcre) + (setq ad-return-value + (save-excursion + (goto-char (point-min)) + (rxt-read-delimited-pcre))) + ad-do-it)) + +(defadvice reb-insert-regexp + (around rxt () activate compile) + "This function is hacked for emulated PCRE syntax and regexp conversion." + (if (eq reb-re-syntax 'pcre) + (let ((src (reb-target-binding reb-regexp-src))) + (if src + (insert "\n/" (replace-regexp-in-string "/" "\\/" src t t) "/") + (insert "\n//"))) + ad-do-it)) + +(defadvice reb-cook-regexp + (around rxt (re) activate compile) + "This function is hacked for emulated PCRE syntax and regexp conversion." + (if (eq reb-re-syntax 'pcre) + (setq ad-return-value (rxt-pcre-to-elisp re)) + ad-do-it)) + +(defadvice reb-update-regexp + (around rxt () activate compile) + "This function is hacked for emulated PCRE syntax and regexp conversion." + (setq ad-return-value + (let* ((re-src (reb-read-regexp)) + (re (reb-cook-regexp re-src))) + (with-current-buffer reb-target-buffer + (let ((oldre reb-regexp)) + (prog1 + (not (string= oldre re)) + (setq reb-regexp re) + ;; Update the source re if format requires translation + (when (or (reb-lisp-syntax-p) (eq reb-re-syntax 'pcre)) + (setq reb-regexp-src re-src)))))))) + +(defun rxt--re-builder-switch-pcre-mode () + (rxt--read-pcre-mode + (if (eq reb-re-syntax 'pcre) 1 0))) + +(add-hook 'reb-mode-hook #'rxt--re-builder-switch-pcre-mode) + +(provide 'rxt) +(provide 'pcre2el) + + +;;; pcre2el.el ends here |