diff options
Diffstat (limited to 'configs/shared/emacs/.emacs.d/elpa/pcre2el-20161120.2103/pcre2el.el')
-rw-r--r-- | configs/shared/emacs/.emacs.d/elpa/pcre2el-20161120.2103/pcre2el.el | 3157 |
1 files changed, 0 insertions, 3157 deletions
diff --git a/configs/shared/emacs/.emacs.d/elpa/pcre2el-20161120.2103/pcre2el.el b/configs/shared/emacs/.emacs.d/elpa/pcre2el-20161120.2103/pcre2el.el deleted file mode 100644 index bea8effe825d..000000000000 --- a/configs/shared/emacs/.emacs.d/elpa/pcre2el-20161120.2103/pcre2el.el +++ /dev/null @@ -1,3157 +0,0 @@ -;;; pcre2el.el --- regexp syntax converter -*- lexical-binding: t -*- - -;; Copyright (C) 2012-2015 Jon Oddie <jonxfield@gmail.com> - -;; Author: joddie <jonxfield at gmail.com> -;; Hacked additionally by: opensource at hardakers dot net -;; Created: 14 Feb 2012 -;; Updated: 13 December 2015 -;; Version: 1.8 -;; Package-Version: 20161120.2103 -;; Url: https://github.com/joddie/pcre2el -;; Package-Requires: ((emacs "24") (cl-lib "0.3")) - -;; This file is NOT part of GNU Emacs. - -;; This program is free software: you can redistribute it and/or -;; modify it under the terms of the GNU General Public License as -;; published by the Free Software Foundation, either version 3 of the -;; License, or (at your option) any later version. -;; -;; This program is distributed in the hope that it will be useful, but -;; WITHOUT ANY WARRANTY; without even the implied warranty of -;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;; General Public License for more details. -;; -;; You should have received a copy of the GNU General Public License -;; along with this program. If not, see `http://www.gnu.org/licenses/'. - -;; This file incorporates work covered by the following copyright and -;; permission notice: -;; -;; Copyright (c) 1993-2002 Richard Kelsey and Jonathan Rees -;; Copyright (c) 1994-2002 by Olin Shivers and Brian D. Carlstrom. -;; Copyright (c) 1999-2002 by Martin Gasbichler. -;; Copyright (c) 2001-2002 by Michael Sperber. -;; All rights reserved. -;; -;; Redistribution and use in source and binary forms, with or without -;; modification, are permitted provided that the following conditions -;; are met: 1. Redistributions of source code must retain the above -;; copyright notice, this list of conditions and the following -;; disclaimer. 2. Redistributions in binary form must reproduce the -;; above copyright notice, this list of conditions and the following -;; disclaimer in the documentation and/or other materials provided -;; with the distribution. 3. The name of the authors may not be used -;; to endorse or promote products derived from this software without -;; specific prior written permission. -;; -;; THIS SOFTWARE IS PROVIDED BY THE AUTHORS "AS IS" AND ANY EXPRESS OR -;; IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY -;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE -;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, -;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -;;; Commentary: - -;; 1 Overview -;; ========== - -;; `pcre2el' or `rxt' (RegeXp Translator or RegeXp Tools) is a utility -;; for working with regular expressions in Emacs, based on a -;; recursive-descent parser for regexp syntax. In addition to converting -;; (a subset of) PCRE syntax into its Emacs equivalent, it can do the -;; following: - -;; - convert Emacs syntax to PCRE -;; - convert either syntax to `rx', an S-expression based regexp syntax -;; - untangle complex regexps by showing the parse tree in `rx' form and -;; highlighting the corresponding chunks of code -;; - show the complete list of strings (productions) matching a regexp, -;; provided the list is finite -;; - provide live font-locking of regexp syntax (so far only for Elisp -;; buffers -- other modes on the TODO list) - - -;; 2 Usage -;; ======= - -;; Enable `rxt-mode' or its global equivalent `rxt-global-mode' to get -;; the default key-bindings. There are three sets of commands: commands -;; that take a PCRE regexp, commands which take an Emacs regexp, and -;; commands that try to do the right thing based on the current -;; mode. Currently, this means Emacs syntax in `emacs-lisp-mode' and -;; `lisp-interaction-mode', and PCRE syntax everywhere else. - -;; The default key bindings all begin with `C-c /' and have a mnemonic -;; structure: `C-c / <source> <target>', or just `C-c / <target>' for the -;; "do what I mean" commands. The complete list of key bindings is given -;; here and explained in more detail below: - -;; - "Do-what-I-mean" commands: -;; `C-c / /': `rxt-explain' -;; `C-c / c': `rxt-convert-syntax' -;; `C-c / x': `rxt-convert-to-rx' -;; `C-c / '': `rxt-convert-to-strings' - -;; - Commands that work on a PCRE regexp: -;; `C-c / p e': `rxt-pcre-to-elisp' -;; `C-c / %': `pcre-query-replace-regexp' -;; `C-c / p x': `rxt-pcre-to-rx' -;; `C-c / p '': `rxt-pcre-to-strings' -;; `C-c / p /': `rxt-explain-pcre' - -;; - Commands that work on an Emacs regexp: -;; `C-c / e /': `rxt-explain-elisp' -;; `C-c / e p': `rxt-elisp-to-pcre' -;; `C-c / e x': `rxt-elisp-to-rx' -;; `C-c / e '': `rxt-elisp-to-strings' -;; `C-c / e t': `rxt-toggle-elisp-rx' -;; `C-c / t': `rxt-toggle-elisp-rx' - - -;; 2.1 Interactive input and output -;; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -;; When used interactively, the conversion commands can read a regexp -;; either from the current buffer or from the minibuffer. The output is -;; displayed in the minibuffer and copied to the kill-ring. - -;; - When called with a prefix argument (`C-u'), they read a regular -;; expression from the minibuffer literally, without further processing -;; -- meaning there's no need to double the backslashes if it's an -;; Emacs regexp. This is the same way commands like -;; `query-replace-regexp' read input. - -;; - When the region is active, they use they the region contents, again -;; literally (without any translation of string syntax). - -;; - With neither a prefix arg nor an active region, the behavior depends -;; on whether the command expects an Emacs regexp or a PCRE one. - -;; Commands that take an Emacs regexp behave like `C-x C-e': they -;; evaluate the sexp before point (which could be simply a string -;; literal) and use its value. This is designed for use in Elisp -;; buffers. As a special case, if point is *inside* a string, it's -;; first moved to the string end, so in practice they should work as -;; long as point is somewhere within the regexp literal. - -;; Commands that take a PCRE regexp try to read a Perl-style delimited -;; regex literal *after* point in the current buffer, including its -;; flags. For example, putting point before the `m' in the following -;; example and doing `C-c / p e' (`rxt-pcre-to-elisp') displays -;; `\(?:bar\|foo\)', correctly stripping out the whitespace and -;; comment: - -;; ,---- -;; | $x =~ m/ foo | (?# comment) bar /x -;; `---- - -;; The PCRE reader currently only works with `/ ... /' delimiters. It -;; will ignore any preceding `m', `s', or `qr' operator, as well as the -;; replacement part of an `s' construction. - -;; Readers for other PCRE-using languages are on the TODO list. - -;; The translation functions display their result in the minibuffer and -;; copy it to the kill ring. When translating something into Elisp -;; syntax, you might need to use the result either literally (e.g. for -;; interactive input to a command like `query-replace-regexp'), or as a -;; string to paste into Lisp code. To allow both uses, -;; `rxt-pcre-to-elisp' copies both versions successively to the -;; kill-ring. The literal regexp without string quoting is the top -;; element of the kill-ring, while the Lisp string is the -;; second-from-top. You can paste the literal regexp somewhere by doing -;; `C-y', or the Lisp string by `C-y M-y'. - - -;; 2.2 Syntax conversion commands -;; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -;; `rxt-convert-syntax' (`C-c / c') converts between Emacs and PCRE -;; syntax, depending on the major mode in effect when called. -;; Alternatively, you can specify the conversion direction explicitly by -;; using either `rxt-pcre-to-elisp' (`C-c / p e') or `rxt-elisp-to-pcre' -;; (`C-c / e p'). - -;; Similarly, `rxt-convert-to-rx' (`C-c / x') converts either kind of -;; syntax to `rx' form, while `rxt-convert-pcre-to-rx' (`C-c / p x') and -;; `rxt-convert-elisp-to-rx' (`C-c / e x') convert to `rx' from a -;; specified source type. - -;; In Elisp buffers, you can use `rxt-toggle-elisp-rx' (`C-c / t' or `C-c -;; / e t') to switch the regexp at point back and forth between string -;; and `rx' syntax. Point should either be within an `rx' or -;; `rx-to-string' form or a string literal for this to work. - - -;; 2.3 PCRE mode (experimental) -;; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -;; If you want to use emulated PCRE regexp syntax in all Emacs commands, -;; try `pcre-mode', which uses Emacs's advice system to make all commands -;; that read regexps using the minibuffer use emulated PCRE syntax. It -;; should also work with Isearch. - -;; This feature is still fairly experimental. It may fail to work or do -;; the wrong thing with certain commands. Please report bugs. - -;; `pcre-query-replace-regexp' was originally defined to do query-replace -;; using emulated PCRE regexps, and is now made somewhat obsolete by -;; `pcre-mode'. It is bound to `C-c / %' by default, by analogy with -;; `M-%'. Put the following in your `.emacs' if you want to use -;; PCRE-style query replacement everywhere: - -;; ,---- -;; | (global-set-key [(meta %)] 'pcre-query-replace-regexp) -;; `---- - -;; 2.5 Explain regexps -;; ~~~~~~~~~~~~~~~~~~~ - -;; When syntax-highlighting isn't enough to untangle some gnarly regexp -;; you find in the wild, try the 'explain' commands: `rxt-explain' (`C-c -;; / /'), `rxt-explain-pcre' (`C-c / p') and `rxt-explain-elisp' (`C-c / -;; e'). These display the original regexp along with its pretty-printed -;; `rx' equivalent in a new buffer. Moving point around either in the -;; original regexp or the `rx' translation highlights corresponding -;; pieces of syntax, which can aid in seeing things like the scope of -;; quantifiers. - -;; I call them "explain" commands because the `rx' form is close to a -;; plain syntax tree, and this plus the wordiness of the operators -;; usually helps to clarify what is going on. People who dislike Lisp -;; syntax might disagree with this assessment. - - -;; 2.6 Generate all matching strings (productions) -;; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -;; Occasionally you come across a regexp which is designed to match a -;; finite set of strings, e.g. a set of keywords, and it would be useful -;; to recover the original set. (In Emacs you can generate such regexps -;; using `regexp-opt'). The commands `rxt-convert-to-strings' (`C-c / -;; ′'), `rxt-pcre-to-strings' (`C-c / p ′') or `rxt-elisp-to-strings' -;; (`C-c / e ′') accomplish this by generating all the matching strings -;; ("productions") of a regexp. (The productions are copied to the kill -;; ring as a Lisp list). - -;; An example in Lisp code: - -;; ,---- -;; | (regexp-opt '("cat" "caterpillar" "catatonic")) -;; | ;; => "\\(?:cat\\(?:atonic\\|erpillar\\)?\\)" -;; | (rxt-elisp-to-strings "\\(?:cat\\(?:atonic\\|erpillar\\)?\\)") -;; | ;; => '("cat" "caterpillar" "catatonic") -;; `---- - -;; For obvious reasons, these commands only work with regexps that don't -;; include any unbounded quantifiers like `+' or `*'. They also can't -;; enumerate all the characters that match a named character class like -;; `[[:alnum:]]'. In either case they will give a (hopefully meaningful) -;; error message. Due to the nature of permutations, it's still possible -;; for a finite regexp to generate a huge number of productions, which -;; will eat memory and slow down your Emacs. Be ready with `C-g' if -;; necessary. - - -;; 2.7 RE-Builder support -;; ~~~~~~~~~~~~~~~~~~~~~~ - -;; The Emacs RE-Builder is a useful visual tool which allows using -;; several different built-in syntaxes via `reb-change-syntax' (`C-c -;; TAB'). It supports Elisp read and literal syntax and `rx', but it can -;; only convert from the symbolic forms to Elisp, not the other way. This -;; package hacks the RE-Builder to also work with emulated PCRE syntax, -;; and to convert transparently between Elisp, PCRE and rx syntaxes. PCRE -;; mode reads a delimited Perl-like literal of the form `/ ... /', and it -;; should correctly support using the `x' and `s' flags. - - -;; 2.8 Use from Lisp -;; ~~~~~~~~~~~~~~~~~ - -;; Example of using the conversion functions: -;; ,---- -;; | (rxt-pcre-to-elisp "(abc|def)\\w+\\d+") -;; | ;; => "\\(\\(?:abc\\|def\\)\\)[_[:alnum:]]+[[:digit:]]+" -;; `---- - -;; All the conversion functions take a single string argument, the regexp -;; to translate: - -;; - `rxt-pcre-to-elisp' -;; - `rxt-pcre-to-rx' -;; - `rxt-pcre-to-strings' -;; - `rxt-elisp-to-pcre' -;; - `rxt-elisp-to-rx' -;; - `rxt-elisp-to-strings' - - -;; 3 Bugs and Limitations -;; ====================== - -;; 3.1 Limitations on PCRE syntax -;; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -;; PCRE has a complicated syntax and semantics, only some of which can be -;; translated into Elisp. The following subset of PCRE should be -;; correctly parsed and converted: - -;; - parenthesis grouping `( .. )', including shy matches `(?: ... )' -;; - backreferences (various syntaxes), but only up to 9 per expression -;; - alternation `|' -;; - greedy and non-greedy quantifiers `*', `*?', `+', `+?', `?' and `??' -;; (all of which are the same in Elisp as in PCRE) -;; - numerical quantifiers `{M,N}' -;; - beginning/end of string `\A', `\Z' -;; - string quoting `\Q .. \E' -;; - word boundaries `\b', `\B' (these are the same in Elisp) -;; - single character escapes `\a', `\c', `\e', `\f', `\n', `\r', `\t', -;; `\x', and `\octal digits' (but see below about non-ASCII characters) -;; - character classes `[...]' including Posix escapes -;; - character classes `\d', `\D', `\h', `\H', `\s', `\S', `\v', `\V' -;; both within character class brackets and outside -;; - word and non-word characters `\w' and `\W' (Emacs has the same -;; syntax, but its meaning is different) -;; - `s' (single line) and `x' (extended syntax) flags, in regexp -;; literals, or set within the expression via `(?xs-xs)' or `(?xs-xs: -;; .... )' syntax -;; - comments `(?# ... )' - -;; Most of the more esoteric PCRE features can't really be supported by -;; simple translation to Elisp regexps. These include the different -;; lookaround assertions, conditionals, and the "backtracking control -;; verbs" `(* ...)' . OTOH, there are a few other syntaxes which are -;; currently unsupported and possibly could be: - -;; - `\L', `\U', `\l', `\u' case modifiers -;; - `\g{...}' backreferences - - -;; 3.2 Other limitations -;; ~~~~~~~~~~~~~~~~~~~~~ - -;; - The order of alternatives and characters in char classes sometimes -;; gets shifted around, which is annoying. -;; - Although the string parser tries to interpret PCRE's octal and -;; hexadecimal escapes correctly, there are problems with matching -;; 8-bit characters that I don't use enough to properly understand, -;; e.g.: -;; ,---- -;; | (string-match-p (rxt-pcre-to-elisp "\\377") "\377") => nil -;; `---- -;; A fix for this would be welcome. - -;; - Most of PCRE's rules for how `^', `\A', `$' and `\Z' interact with -;; newlines are not implemented, since they seem less relevant to -;; Emacs's buffer-oriented rather than line-oriented model. However, -;; the different meanings of the `.' metacharacter *are* implemented -;; (it matches newlines with the `/s' flag, but not otherwise). - -;; - Not currently namespace clean (both `rxt-' and a couple of `pcre-' -;; functions). - - -;; 3.3 TODO: -;; ~~~~~~~~~ - -;; - Python-specific extensions to PCRE? -;; - Language-specific stuff to enable regexp font-locking and explaining -;; in different modes. Each language would need two functions, which -;; could be kept in an alist: - -;; 1. A function to read PCRE regexps, taking the string syntax into -;; account. E.g., Python has single-quoted, double-quoted and raw -;; strings, each with different quoting rules. PHP has the kind of -;; belt-and-suspenders solution you would expect: regexps are in -;; strings, /and/ you have to include the `/ ... /' delimiters! -;; Duh. - -;; 2. A function to copy faces back from the parsed string to the -;; original buffer text. This has to recognize any escape sequences -;; so they can be treated as a single character. - - -;; 4 Internal details -;; ================== - -;; `rxt' defines an internal syntax tree representation of regular -;; expressions, parsers for Elisp and PCRE syntax, and 'unparsers' -;; to convert the internal representation to PCRE or `rx' syntax. -;; Converting from the internal representation to Emacs syntax is -;; done by converting to `rx' form and passing it to `rx-to-string'. -;; See `rxt-parse-re', `rxt-adt->pcre', and `rxt-adt->rx' for -;; details. - -;; This code is partially based on Olin Shivers' reference SRE -;; implementation in scsh, although it is simplified in some respects and -;; extended in others. See `scsh/re.scm', `scsh/spencer.scm' and -;; `scsh/posixstr.scm' in the `scsh' source tree for details. In -;; particular, `pcre2el' steals the idea of an abstract data type for -;; regular expressions and the general structure of the string regexp -;; parser and unparser. The data types for character sets are extended in -;; order to support symbolic translation between character set -;; expressions without assuming a small (Latin1) character set. The -;; string parser is also extended to parse a bigger variety of -;; constructions, including POSIX character classes and various Emacs and -;; Perl regexp assertions. Otherwise, only the bare minimum of scsh's -;; abstract data type is implemented. - - -;; 5 Soapbox -;; ========= - -;; Emacs regexps have their annoyances, but it is worth getting used to -;; them. The Emacs assertions for word boundaries, symbol boundaries, and -;; syntax classes depending on the syntax of the mode in effect are -;; especially useful. (PCRE has `\b' for word-boundary, but AFAIK it -;; doesn't have separate assertions for beginning-of-word and -;; end-of-word). Other things that might be done with huge regexps in -;; other languages can be expressed more understandably in Elisp using -;; combinations of `save-excursion' with the various searches (regexp, -;; literal, skip-syntax-forward, sexp-movement functions, etc.). - -;; There's not much point in using `rxt-pcre-to-elisp' to use PCRE -;; notation in a Lisp program you're going to maintain, since you still -;; have to double all the backslashes. Better to just use the converted -;; result (or better yet, the `rx' form). - - -;; 6 History and acknowledgments -;; ============================= - -;; This was originally created out of an answer to a stackoverflow -;; question: -;; [http://stackoverflow.com/questions/9118183/elisp-mechanism-for-converting-pcre-regexps-to-emacs-regexps] - -;; Thanks to: - -;; - Wes Hardaker (hardaker) for the initial inspiration and subsequent -;; hacking -;; - priyadarshan for requesting RX support -;; - Daniel Colascione (dcolascione) for a patch to support Emacs's -;; explicitly-numbered match groups -;; - Aaron Meurer (asmeurer) for requesting Isearch support -;; - Philippe Vaucher (silex) for a patch to support `ibuffer-do-replace-regexp' -;; in PCRE mode - -;;; Code: - -(require 'cl-lib) -(require 'rx) -(require 're-builder) -(require 'advice) -(require 'ring) -(require 'pcase) - -;;; Customization group -(defgroup rxt nil - "Regex syntax converter and utilities." - :version 1.2 - :group 'tools - :group 'lisp - :link '(emacs-commentary-link :tag "commentary" "pcre2el.el") - :link '(emacs-library-link :tag "lisp file" "pcre2el.el") - :link '(url-link :tag "web page" "https://github.com/joddie/pcre2el")) - -(defface rxt-highlight-face - '((((min-colors 16581375) (background light)) :background "#eee8d5") - (((min-colors 16581375) (background dark)) :background "#222222")) - "Face for highlighting corresponding regex syntax in `rxt-explain' buffers." - :group 'rxt) - -(defcustom rxt-verbose-rx-translation nil - "Non-nil if `rxt-pcre-to-rx' and `rxt-elisp-to-rx' should use verbose `rx' primitives. - -Verbose primitives are things like `line-start' instead of `bol', -etc." - :group 'rxt - :type 'boolean) - -(defcustom rxt-explain-verbosely t - "Non-nil if `rxt-explain-elisp' and `rxt-explain-pcre' should use verbose `rx' primitives. - -This overrides the value of `rxt-verbose-rx-translation' for -these commands only." - :group 'rxt - :type 'boolean) - - -;;;; Macros and functions for writing interactive input and output - -;; Macros for handling return values. If called interactively, -;; display the value in the echo area and copy it to the kill ring, -;; otherwise just return the value. PCREs are copied as unquoted -;; strings for yanking into Perl, JS, etc. `rx' forms and other sexps -;; are copied as `read'-able literals for yanking into Elisp buffers. -;; Emacs regexps are copied twice: once as an unquoted value for -;; interactive use, and once as a readable string literal for yanking -;; into Elisp buffers. -(defmacro rxt-return-pcre (expr) - (let ((value (make-symbol "value"))) - `(let ((,value ,expr)) - (when (called-interactively-p 'any) - (rxt--kill-pcre ,value)) - ,value))) - -(defmacro rxt-return-sexp (expr) - (let ((value (make-symbol "value"))) - `(let ((,value ,expr)) - (when (called-interactively-p 'any) - (rxt--kill-sexp ,value)) - ,value))) - -(defmacro rxt-return-emacs-regexp (expr) - (let ((value (make-symbol "value"))) - `(let ((,value ,expr)) - (when (called-interactively-p 'any) - (rxt--kill-emacs-regexp ,value)) - ,value))) - -(defun rxt--kill-sexp (value) - (let ((lisp-literal (prin1-to-string value))) - (message "%s" lisp-literal) - (kill-new lisp-literal))) - -(defun rxt--kill-pcre (value) - (message "%s" value) - (kill-new value)) - -(defun rxt--kill-emacs-regexp (value) - (let ((lisp-literal (prin1-to-string value))) - (message "%s" value) - (kill-new lisp-literal) - (kill-new value))) - -;; Read an Elisp regexp interactively. -;; -;; Three possibilities: -;; -;; 1) With a prefix arg, reads literally from the minibuffer, w/o -;; using string syntax -- just like query-replace-regexp, etc. -;; -;; 2) If the region is active, use the text of the region literally -;; (again w/o string syntax) -;; -;; 3) Otherwise, eval the sexp before point (which might be a string -;; literal or an expression) and use its value. Falls back to method -;; (1) if this fails to produce a string value. -;; -(cl-defun rxt-interactive/elisp (&optional (prompt "Emacs regexp: ")) - (list - (cond (current-prefix-arg - (read-string prompt)) - - ((use-region-p) - (buffer-substring-no-properties (region-beginning) (region-end))) - - (t - (condition-case nil - (save-excursion - (while (nth 3 (syntax-ppss)) (forward-char)) - (let ((re (eval (preceding-sexp)))) - (if (stringp re) re - (read-string prompt)))) - (error - (read-string prompt))))))) - -;; Read a PCRE regexp interactively. -;; -;; Three possibilities: As above, except that without prefix arg or -;; active region, tries to read a delimited regexp literal like /.../, -;; m/.../, or qr/.../ following point in the current buffer. Falls -;; back to reading from minibuffer if that fails. -;; -;; Returns the regexp, with flags as text properties. -;; -;; TODO: Different delimiters -(cl-defun rxt-interactive/pcre (&optional (prompt "PCRE regexp: ")) - (list - (cond (current-prefix-arg - (rxt--read-pcre prompt)) - - ((use-region-p) - (buffer-substring-no-properties (region-beginning) (region-end))) - - (t - (condition-case nil - (rxt-read-delimited-pcre) - (error ; Fall back to reading from minibuffer - (rxt--read-pcre prompt))))) - nil)) - -(define-minor-mode rxt--read-pcre-mode - "Minor-mode with key-bindings for toggling PCRE flags. - -You should not normally call this directly. It will be enabled -in minibuffers for `read-regexp' and in the `re-builder' buffer -when `pcre-mode' is active. These bindings will also be added to -`isearch-mode-map' in `pcre-mode'." - :initial nil - :lighter nil - :keymap - `((,(kbd "C-c s") . ,#'rxt--toggle-s-mode) - (,(kbd "C-c x") . ,#'rxt--toggle-x-mode) - (,(kbd "C-c i") . ,#'rxt--toggle-i-mode))) - -(defun rxt--read-pcre (prompt) - "Read a PCRE regexp for translation, together with option flags. - -The `s', `x', and `i' flags can be toggled using the following -commands: \\<rxt--read-pcre-mode-map> - -\\[rxt--toggle-s-mode] : toggle `s' (single-line) mode -\\[rxt--toggle-x-mode] : toggle `x' (extended) mode -\\[rxt--toggle-i-mode] : toggle `i' (case-insensitive) mode - -In single-line mode, `.' will also match newlines. -In extended mode, whitespace is ignored. - -Case-insensitive mode emulates matching without case, -independently of Emacs's builtin `case-fold-search' setting. -Note that this does not apply to backreferences." - (minibuffer-with-setup-hook #'rxt--read-pcre-mode - (read-from-minibuffer prompt))) - -(defun rxt--toggle-s-mode () - "Toggle emulated PCRE single-line (s) flag." - (interactive) - (rxt--toggle-flag ?s)) - -(defun rxt--toggle-x-mode () - "Toggle emulated PCRE extended (x) flag." - (interactive) - (rxt--toggle-flag ?x)) - -(defun rxt--toggle-i-mode () - "Toggle emulated PCRE case-insensitive (i) flag." - (interactive) - (rxt--toggle-flag ?i)) - -(defun rxt--toggle-flag (char) - "Toggle CHAR, a PCRE flag." - (cond - ((derived-mode-p 'reb-mode) ; RE-Builder - (rxt--toggle-flag-re-builder char)) - ((minibufferp) - (rxt--toggle-flag-minibuffer char)) - (isearch-mode - (rxt--toggle-flag-isearch char)) - (t - (error "Not in minibuffer, RE-Builder or isearch mode.")))) - -(defun rxt--toggle-flag-re-builder (char) - (save-excursion - (goto-char (point-max)) - (search-backward "/") - (forward-char) - (when (looking-at (rx (* (any ?i ?s ?x)))) - (let ((inhibit-modification-hooks t)) - (replace-match (rxt--xor-flags (match-string 0) char) t t)))) - (reb-do-update)) - -(defun rxt--toggle-flag-minibuffer (char) - (setf (buffer-substring (minibuffer-prompt-end) (point-max)) - (rxt--toggle-flag-string (minibuffer-contents) char)) - (when - (and (= (point) (minibuffer-prompt-end)) - (looking-at (rx "(?" (group (+ (any ?i ?s ?x))) ")"))) - (forward-sexp))) - -(defun rxt--toggle-flag-isearch (char) - (when isearch-regexp - (setq isearch-string - (rxt--toggle-flag-string isearch-string char)) - (setq isearch-message - (mapconcat #'isearch-text-char-description isearch-string "")) - (isearch-search-and-update))) - -(defun rxt--toggle-flag-string (string char) - (if (string-match (rx string-start "(?" (group (+ (any ?i ?s ?x))) ")") - string) - (let ((flags (rxt--xor-flags (match-string 1 string) char))) - (if (string= flags "") - (replace-match "" t t string) - (replace-match flags t t string 1))) - (format "(?%c)%s" char string))) - -(defun rxt--xor-flags (flags char) - (concat - (sort - (cl-set-exclusive-or (string-to-list flags) (list char)) - #'<))) - - -;;;; Minor mode for using emulated PCRE syntax - -(defvar pcre-old-isearch-search-fun-function nil - "Original value of `isearch-search-fun-function' before entering `pcre-mode.' - -This function is wrapped by `pcre-isearch-search-fun-function' -and restored on exit from `pcre-mode'.") -(make-variable-buffer-local 'pcre-old-isearch-search-fun-function) - -(defvar pcre-old-isearch-key-bindings nil - "Alist of key-bindings to restore in `isearch-mode-map' on exiting `pcre-mode'.") - -;;;###autoload -(define-minor-mode pcre-mode - "Use emulated PCRE syntax for regexps wherever possible. - -Advises the `interactive' specs of `read-regexp' and the -following other functions so that they read PCRE syntax and -translate to its Emacs equivalent: - -- `align-regexp' -- `find-tag-regexp' -- `sort-regexp-fields' -- `isearch-message-prefix' -- `ibuffer-do-replace-regexp' - -Also alters the behavior of `isearch-mode' when searching by regexp." - nil " PCRE" - nil - :global t - - (if pcre-mode - ;; Enabling - (progn - ;; Enable advice - (ad-enable-regexp "pcre-mode") - ;; Set up isearch hooks - (add-hook 'isearch-mode-hook #'pcre-isearch-mode-hook) - (add-hook 'isearch-mode-end-hook #'pcre-isearch-mode-end-hook) - ;; Add the keybindings of `rxt--read-pcre-mode-map' to - ;; `isearch-mode-map' (so that they do not cause an exit from - ;; `isearch-mode'), and save any existing bindings for those - ;; keys to restore on exit from `pcre-mode'. - (setq pcre-old-isearch-key-bindings - (cl-loop for key being the key-seqs of rxt--read-pcre-mode-map - for def = (lookup-key isearch-mode-map key) - collect (cons (copy-sequence key) - (if (numberp def) nil def)))) - (cl-loop for key being the key-seqs of rxt--read-pcre-mode-map - using (key-bindings def) - do (define-key isearch-mode-map key def))) - - ;; Disable advice - (ad-disable-regexp "pcre-mode") - ;; Remove from isearch hooks - (remove-hook 'isearch-mode-hook #'pcre-isearch-mode-hook) - (remove-hook 'isearch-mode-end-hook #'pcre-isearch-mode-end-hook) - ;; Restore key-bindings - (cl-loop for (key . def) in pcre-old-isearch-key-bindings - do (define-key isearch-mode-map key def))) - - ;; "Activating" advice re-computes the function definitions, which - ;; is necessary whether enabling or disabling - (ad-activate-regexp "pcre-mode")) - -;;; Cache of PCRE -> Elisp translations -(defvar pcre-mode-cache-size 100 - "Number of PCRE-to-Emacs translations to keep in the `pcre-mode' cache.") - -(defvar pcre-mode-cache (make-hash-table :test 'equal) - "Cache of PCRE-to-Emacs translations used in `pcre-mode'. - -Keys are PCRE regexps, values are their Emacs equivalents.") - -(defvar pcre-mode-reverse-cache (make-hash-table :test 'equal) - "Cache of original PCREs translated to Emacs syntax in `pcre-mode'. - -Keys are translated Emacs regexps, values are their original PCRE -form. This is used to display the original PCRE regexp in place -of its translated form.") - -(defvar pcre-cache-ring (make-ring pcre-mode-cache-size) - "Ring of PCRE-to-Emacs translations used in `pcre-mode'. - -When the ring fills up, the oldest element is removed and the -corresponding entries are deleted from the hash tables -`pcre-mode-cache' and `pcre-mode-reverse-cache'.") - -(defun pcre-to-elisp/cached (pcre) - "Translate PCRE to Emacs syntax, caching both forms." - (or (gethash pcre pcre-mode-cache) - (let ((elisp (rxt-pcre-to-elisp pcre))) - (pcre-set-cache pcre elisp) - elisp))) - -(defun pcre-set-cache (pcre-regexp emacs-regexp) - "Add a PCRE-to-Emacs translation to the `pcre-mode' cache." - (when (and (not (zerop (length pcre-regexp))) - (not (zerop (length emacs-regexp))) - (not (gethash pcre-regexp pcre-mode-cache))) - (if (= (ring-length pcre-cache-ring) (ring-size pcre-cache-ring)) - (let* ((old-item (ring-remove pcre-cache-ring)) - (old-pcre (car old-item)) - (old-emacs (cdr old-item))) - (remhash old-pcre pcre-mode-cache) - (remhash old-emacs pcre-mode-reverse-cache)) - (puthash pcre-regexp emacs-regexp pcre-mode-cache) - (puthash emacs-regexp pcre-regexp pcre-mode-reverse-cache) - (ring-insert pcre-cache-ring (cons pcre-regexp emacs-regexp))))) - -;;; Isearch advice -(defun pcre-isearch-mode-hook () - (when (not (eq isearch-search-fun-function #'isearch-search-fun-default)) - (message "Warning: pcre-mode overriding existing isearch function `%s'" - isearch-search-fun-function)) - ;; Prevent an infinite loop, if a previous isearch in pcre-mode - ;; exited without restoring the original search function for some - ;; reason - (unless (eq isearch-search-fun-function #'pcre-isearch-search-fun-function) - (setq pcre-old-isearch-search-fun-function isearch-search-fun-function)) - (set (make-local-variable 'isearch-search-fun-function) - #'pcre-isearch-search-fun-function)) - -(defun pcre-isearch-mode-end-hook () - (setq isearch-search-fun-function pcre-old-isearch-search-fun-function)) - -(defun pcre-isearch-search-fun-function () - "Enable isearching using emulated PCRE syntax. - -This is set as the value of `isearch-search-fun-function' when -`pcre-mode' is enabled. Returns a function which searches using -emulated PCRE regexps when `isearch-regexp' is true." - (lambda (string bound noerror) - (let ((real-search-function - (funcall (or pcre-old-isearch-search-fun-function 'isearch-search-fun-default)))) - (if (not isearch-regexp) - (funcall real-search-function string bound noerror) - ;; Raise an error if the regexp ends in an incomplete escape - ;; sequence (= odd number of backslashes). - ;; TODO: Perhaps this should really be handled in rxt-pcre-to-elisp? - (if (isearch-backslash string) (rxt-error "Trailing backslash")) - (funcall real-search-function - (pcre-to-elisp/cached string) bound noerror))))) - -(defadvice isearch-message-prefix (after pcre-mode disable) - "Add \"PCRE\" to the Isearch message when searching by regexp in `pcre-mode'." - (when (and isearch-regexp - ;; Prevent an inaccurate message if our callback was - ;; removed somehow - (eq isearch-search-fun-function #'pcre-isearch-search-fun-function)) - (let ((message ad-return-value)) - ;; Some hackery to give replacement the same fontification as - ;; the original - (when - (let ((case-fold-search t)) (string-match "regexp" message)) - (let* ((match (match-string 0 message)) - (properties (text-properties-at 0 match)) - (replacement (apply #'propertize "PCRE regexp" properties)) - (new-message (replace-match replacement t t message))) - (setq ad-return-value new-message)))))) - -(defadvice isearch-fallback - (before pcre-mode (want-backslash &optional allow-invalid to-barrier) disable) - "Hack to fall back correctly in `pcre-mode'. " - ;; A dirty hack to the internals of isearch. Falling back to a - ;; previous match position is necessary when the (Emacs) regexp ends - ;; in "*", "?", "\{" or "\|": this is handled in - ;; `isearch-process-search-char' by calling `isearch-fallback' with - ;; `t' for the value of the first parameter, `want-backslash', in - ;; the last two cases. With PCRE regexps, falling back should take - ;; place on "*", "?", "{" or "|", with no backslashes required. - ;; This advice handles the last two cases by unconditionally setting - ;; `want-backslash' to nil. - (ad-set-arg 0 nil)) - -(defadvice isearch-edit-string - (around pcre-mode disable) - "Add PCRE mode-toggling keys to Isearch minibuffer in regexp mode." - (if isearch-regexp - (minibuffer-with-setup-hook - #'rxt--read-pcre-mode - ad-do-it) - ad-do-it)) - -;;; evil-mode advice -(defadvice evil-search-function - (around pcre-mode (forward regexp-p wrap) disable) - (if (and regexp-p (not isearch-mode)) - (let ((real-search-function ad-do-it)) - (setq ad-return-value - (pcre-decorate-search-function real-search-function))) - ad-do-it)) - -(eval-after-load 'evil - '(when pcre-mode - (ad-enable-advice 'evil-search-function 'around 'pcre-mode) - (ad-activate 'evil-search-function))) - -(defun pcre-decorate-search-function (real-search-function) - (lambda (string &optional bound noerror count) - (funcall real-search-function - (pcre-to-elisp/cached string) - bound noerror count))) - -;;; Other hooks and defadvices - -;;;###autoload -(defun pcre-query-replace-regexp () - "Perform `query-replace-regexp' using PCRE syntax. - -Consider using `pcre-mode' instead of this function." - (interactive) - (let ((old-pcre-mode pcre-mode)) - (unwind-protect - (progn - (pcre-mode +1) - (call-interactively #'query-replace-regexp)) - (pcre-mode (if old-pcre-mode 1 0))))) - - -(defadvice add-to-history - (before pcre-mode (history-var newelt &optional maxelt keep-all) disable) - "Add the original PCRE to query-replace history in `pcre-mode'." - (when (eq history-var query-replace-from-history-variable) - (let ((original (gethash newelt pcre-mode-reverse-cache))) - (when original - (ad-set-arg 1 original))))) - -(defadvice query-replace-descr - (before pcre-mode (from) disable) - "Use the original PCRE in Isearch prompts in `pcre-mode'." - (let ((original (gethash from pcre-mode-reverse-cache))) - (when original - (ad-set-arg 0 original)))) - -;;; The `interactive' specs of the following functions are lifted -;;; wholesale from the original built-ins, which see. -(defadvice read-regexp - (around pcre-mode first (prompt &optional defaults history) disable) - "Read regexp using PCRE syntax and convert to Elisp equivalent." - (ad-set-arg 0 (concat "[PCRE] " prompt)) - (minibuffer-with-setup-hook - #'rxt--read-pcre-mode - ad-do-it) - (setq ad-return-value - (pcre-to-elisp/cached ad-return-value))) - -(defadvice align-regexp - (before pcre-mode first (beg end regexp &optional group spacing repeat) disable) - "Read regexp using PCRE syntax and convert to Elisp equivalent." - (interactive - (append - (list (region-beginning) (region-end)) - (if current-prefix-arg - (list (rxt-pcre-to-elisp - (read-string "Complex align using PCRE regexp: " - "(\\s*)")) - (string-to-number - (read-string - "Parenthesis group to modify (justify if negative): " "1")) - (string-to-number - (read-string "Amount of spacing (or column if negative): " - (number-to-string align-default-spacing))) - (y-or-n-p "Repeat throughout line? ")) - (list (concat "\\(\\s-*\\)" - (rxt-pcre-to-elisp - (read-string "Align PCRE regexp: "))) - 1 align-default-spacing nil))))) - -(defadvice ibuffer-do-replace-regexp - (before pcre-mode first (from-str to-str) disable) - "Read regexp using PCRE syntax and convert to Elisp equivalent." - (interactive - (let* ((from-str (read-from-minibuffer "[PCRE] Replace regexp: ")) - (to-str (read-from-minibuffer (concat "[PCRE] Replace " from-str " with: ")))) - (list (rxt-pcre-to-elisp from-str) to-str)))) - -(defadvice find-tag-regexp - (before pcre-mode first (regexp &optional next-p other-window) disable) - "Read regexp using PCRE syntax and convert to Elisp equivalent. -Perform `find-tag-regexp' using emulated PCRE regexp syntax." - (interactive - (let ((args (find-tag-interactive "[PCRE] Find tag regexp: " t))) - (list (rxt-pcre-to-elisp (nth 0 args)) - (nth 1 args) (nth 2 args))))) - -(defadvice sort-regexp-fields - (before pcre-mode first (reverse record-regexp key-regexp beg end) disable) - "Read regexp using PCRE syntax and convert to Elisp equivalent." - (interactive "P\nsPCRE regexp specifying records to sort: \n\ -sPCRE regexp specifying key within record: \nr") - (ad-set-arg 1 (rxt-pcre-to-elisp (ad-get-arg 1))) - (ad-set-arg 2 (rxt-pcre-to-elisp (ad-get-arg 2)))) - - - -;;; Commands that take Emacs-style regexps as input - -;;;###autoload -(defun rxt-elisp-to-pcre (regexp) - "Translate REGEXP, a regexp in Emacs Lisp syntax, to Perl-compatible syntax. - -Interactively, reads the regexp in one of three ways. With a -prefix arg, reads from minibuffer without string escaping, like -`query-replace-regexp'. Without a prefix arg, uses the text of -the region if it is active. Otherwise, uses the result of -evaluating the sexp before point (which might be a string regexp -literal or an expression that produces a string). - -Displays the translated PCRE regexp in the echo area and copies -it to the kill ring. - -Emacs regexp features such as syntax classes which cannot be -translated to PCRE will cause an error." - (interactive (rxt-interactive/elisp)) - (rxt-return-pcre (rxt-adt->pcre (rxt-parse-elisp regexp)))) - -;;;###autoload -(defun rxt-elisp-to-rx (regexp) - "Translate REGEXP, a regexp in Emacs Lisp syntax, to `rx' syntax. - -See `rxt-elisp-to-pcre' for a description of the interactive -behavior and `rx' for documentation of the S-expression based -regexp syntax." - (interactive (rxt-interactive/elisp)) - (rxt-return-sexp (rxt-adt->rx (rxt-parse-elisp regexp)))) - -;;;###autoload -(defun rxt-elisp-to-strings (regexp) - "Return a list of all strings matched by REGEXP, an Emacs Lisp regexp. - -See `rxt-elisp-to-pcre' for a description of the interactive behavior. - -This is useful primarily for getting back the original list of -strings from a regexp generated by `regexp-opt', but it will work -with any regexp without unbounded quantifiers (*, +, {2, } and so -on). - -Throws an error if REGEXP contains any infinite quantifiers." - (interactive (rxt-interactive/elisp)) - (rxt-return-sexp (rxt-adt->strings (rxt-parse-elisp regexp)))) - -;;;###autoload -(defun rxt-toggle-elisp-rx () - "Toggle the regexp near point between Elisp string and rx syntax." - (interactive) - ;; First, position point before the regex form near point (either - ;; a string literal or a list beginning `rx' or `rx-to-string'). - (let* ((context (syntax-ppss)) - (string-start (nth 8 context))) - (cond (string-start (goto-char string-start)) - ((looking-back "\"") (backward-sexp)) - ((looking-at "\"") nil) - (t - ;; Search backwards, leaving point in place on error - (goto-char - (save-excursion - (skip-syntax-forward "-") - (while (not (looking-at - (rx "(" (or "rx" "rx-to-string") symbol-end))) - (backward-up-list)) - (point)))))) - - ;; Read and replace the regex following point - (let* ((regex (read (current-buffer))) - (print-escape-newlines t)) - (save-excursion - (if (listp regex) - ;; Replace rx form with string value - (prin1 (eval regex) (current-buffer)) - ;; Pretty-print rx form - (save-restriction - (let* ((start (point)) - (rx-syntax (rxt-elisp-to-rx regex)) - (rx-form - (pcase rx-syntax - (`(seq . ,rest) `(rx . ,rest)) - (form `(rx ,form))))) - (rxt-print rx-form) - (narrow-to-region start (point))) - (pp-buffer) - ;; remove the extra newline that pp-buffer inserts - (goto-char (point-max)) - (delete-region - (point) - (save-excursion (skip-chars-backward " \t\n") (point)))))) - (kill-sexp -1) - (indent-pp-sexp))) - - - -;;; Commands that translate PCRE to other formats - -;;;###autoload -(defun rxt-pcre-to-elisp (pcre &optional flags) - "Translate PCRE, a regexp in Perl-compatible syntax, to Emacs Lisp. - -Interactively, uses the contents of the region if it is active, -otherwise reads from the minibuffer. Prints the Emacs translation -in the echo area and copies it to the kill ring. - -PCRE regexp features that cannot be translated into Emacs syntax -will cause an error. See the commentary section of pcre2el.el for -more details." - (interactive (rxt-interactive/pcre)) - (rxt-return-emacs-regexp - (rx-to-string - (rxt-pcre-to-rx (rxt--add-flags pcre flags)) - t))) - -;;;###autoload -(defalias 'pcre-to-elisp 'rxt-pcre-to-elisp) - -;;;###autoload -(defun rxt-pcre-to-rx (pcre &optional flags) - "Translate PCRE, a regexp in Perl-compatible syntax, to `rx' syntax. - -See `rxt-pcre-to-elisp' for a description of the interactive behavior." - (interactive (rxt-interactive/pcre)) - (rxt-return-sexp (rxt-adt->rx (rxt-parse-pcre (rxt--add-flags pcre flags))))) - -;;;###autoload -(defun rxt-pcre-to-strings (pcre &optional flags) - "Return a list of all strings matched by PCRE, a Perl-compatible regexp. - -See `rxt-elisp-to-pcre' for a description of the interactive -behavior and `rxt-elisp-to-strings' for why this might be useful. - -Throws an error if PCRE contains any infinite quantifiers." - (interactive (rxt-interactive/pcre)) - (rxt-return-sexp (rxt-adt->strings (rxt-parse-pcre (rxt--add-flags pcre flags))))) - -(defun rxt--add-flags (pcre flags) - "Prepend FLAGS to PCRE." - (if (not (zerop (length flags))) - (concat "(?" flags ")" pcre) - pcre)) - - -;;; Regexp explaining functions to display pretty-printed rx syntax - -;; When the `rxt-explain' flag is non-nil, `rxt-adt->rx' records -;; location information for each element of the generated `rx' form, -;; allowing highlighting corresponding pieces of syntax at point. -(defvar rxt-explain nil) - -(defvar rxt-highlight-overlays nil - "List of active location-highlighting overlays in rxt-help-mode buffer.") - -;;;###autoload -(defun rxt-explain-elisp (regexp) - "Insert the pretty-printed `rx' syntax for REGEXP in a new buffer. - -REGEXP is a regular expression in Emacs Lisp syntax. See -`rxt-elisp-to-pcre' for a description of how REGEXP is read -interactively." - (interactive (rxt-interactive/elisp)) - (let ((rxt-explain t) - (rxt-verbose-rx-translation rxt-explain-verbosely)) - (rxt-pp-rx regexp (rxt-elisp-to-rx regexp)))) - -;;;###autoload -(defun rxt-explain-pcre (regexp &optional flags) - "Insert the pretty-printed `rx' syntax for REGEXP in a new buffer. - -REGEXP is a regular expression in PCRE syntax. See -`rxt-pcre-to-elisp' for a description of how REGEXP is read -interactively." - (interactive (rxt-interactive/pcre)) - (let ((rxt-explain t) - (rxt-verbose-rx-translation rxt-explain-verbosely)) - (rxt-pp-rx regexp (rxt-pcre-to-rx regexp flags)))) - -;;;###autoload -(defun rxt-quote-pcre (text) - "Return a PCRE regexp which matches TEXT literally. - -Any PCRE metacharacters in TEXT will be quoted with a backslash." - (rxt-adt->pcre (rxt-string text))) - - -;;;; Commands that depend on the major mode in effect - -;; Macro: interactively call one of two functions depending on the -;; major mode -(defmacro rxt-mode-dispatch (elisp-function pcre-function) - `(if (memq major-mode '(emacs-lisp-mode lisp-interaction-mode)) - (call-interactively #',elisp-function) - (call-interactively #',pcre-function))) - -;;;###autoload -(defun rxt-explain () - "Pop up a buffer with pretty-printed `rx' syntax for the regex at point. - -Chooses regex syntax to read based on current major mode, calling -`rxt-explain-elisp' if buffer is in `emacs-lisp-mode' or -`lisp-interaction-mode', or `rxt-explain-pcre' otherwise." - (interactive) - (rxt-mode-dispatch rxt-explain-elisp rxt-explain-pcre)) - -;;;###autoload -(defun rxt-convert-syntax () - "Convert regex at point to other kind of syntax, depending on major mode. - -For buffers in `emacs-lisp-mode' or `lisp-interaction-mode', -calls `rxt-elisp-to-pcre' to convert to PCRE syntax. Otherwise, -calls `rxt-pcre-to-elisp' to convert to Emacs syntax. - -The converted syntax is displayed in the echo area and copied to -the kill ring; see the two functions named above for details." - (interactive) - (rxt-mode-dispatch rxt-elisp-to-pcre rxt-pcre-to-elisp)) - -;;;###autoload -(defun rxt-convert-to-rx () - "Convert regex at point to RX syntax. Chooses Emacs or PCRE syntax by major mode." - (interactive) - (rxt-mode-dispatch rxt-elisp-to-rx rxt-pcre-to-rx)) - -;;;###autoload -(defun rxt-convert-to-strings () - "Convert regex at point to RX syntax. Chooses Emacs or PCRE syntax by major mode." - (interactive) - (rxt-mode-dispatch rxt-elisp-to-strings rxt-pcre-to-strings)) - - - -;;; Minor mode and keybindings - -(defvar rxt-mode-map - (let ((map (make-sparse-keymap))) - ;; Generic - (define-key map (kbd "C-c / /") 'rxt-explain) - (define-key map (kbd "C-c / c") 'rxt-convert-syntax) - (define-key map (kbd "C-c / x") 'rxt-convert-to-rx) - (define-key map (kbd "C-c / '") 'rxt-convert-to-strings) - - ;; From PCRE - (define-key map (kbd "C-c / p /") 'rxt-explain-pcre) - (define-key map (kbd "C-c / p e") 'rxt-pcre-to-elisp) - (define-key map (kbd "C-c / p x") 'rxt-pcre-to-rx) - (define-key map (kbd "C-c / p '") 'rxt-pcre-to-strings) - - ;; From Elisp - (define-key map (kbd "C-c / e /") 'rxt-explain-elisp) - (define-key map (kbd "C-c / e p") 'rxt-elisp-to-pcre) - (define-key map (kbd "C-c / e x") 'rxt-elisp-to-rx) - (define-key map (kbd "C-c / e '") 'rxt-elisp-to-strings) - (define-key map (kbd "C-c / e t") 'rxt-toggle-elisp-rx) - (define-key map (kbd "C-c / t") 'rxt-toggle-elisp-rx) - - ;; Search - (define-key map (kbd "C-c / %") 'pcre-query-replace-regexp) - - map) - "Keymap for `rxt-mode'.") - -;;;###autoload -(define-minor-mode rxt-mode - "Regex translation utilities." nil nil) - -;;;###autoload -(defun turn-on-rxt-mode () - "Turn on `rxt-mode' in the current buffer." - (interactive) - (rxt-mode 1)) - -;;;###autoload -(define-globalized-minor-mode rxt-global-mode rxt-mode - turn-on-rxt-mode) - - -;;;; Syntax explanations - -;; Major mode for displaying pretty-printed S-exp syntax -(define-derived-mode rxt-help-mode emacs-lisp-mode "Regexp Explain" - (setq buffer-read-only t) - (add-hook 'post-command-hook 'rxt-highlight-text nil t) - (rxt-highlight-text)) - -;; Hack: stop paredit-mode interfering with `rxt-print' -(eval-when-compile (declare-function paredit-mode "paredit.el")) -(add-hook 'rxt-help-mode-hook - (lambda () - (if (and (boundp 'paredit-mode) - paredit-mode) - (paredit-mode 0)))) - -(define-key rxt-help-mode-map "q" 'quit-window) -(define-key rxt-help-mode-map "z" 'kill-this-buffer) -(define-key rxt-help-mode-map "n" 'next-line) -(define-key rxt-help-mode-map "p" 'previous-line) -(define-key rxt-help-mode-map "f" 'forward-list) -(define-key rxt-help-mode-map "b" 'backward-list) -(define-key rxt-help-mode-map "u" 'backward-up-list) -(define-key rxt-help-mode-map "d" 'down-list) - -(defvar rxt--print-with-overlays nil) -(defvar rxt--print-depth 0) - -(defconst rxt--print-char-alist - '((?\a . "\\a") - (?\b . "\\b") - (?\t . "\\t") - (?\n . "\\n") - (?\v . "\\v") - (?\f . "\\f") - (?\r . "\\r") - (?\e . "\\e") - (?\s . "\\s") - (?\\ . "\\\\") - (?\d . "\\d")) - "Alist of characters to print using an escape sequence in Elisp source. -See (info \"(elisp) Basic Char Syntax\").") - -(defconst rxt--whitespace-display-regexp - (rx-to-string `(any ,@(mapcar #'car rxt--print-char-alist)))) - -(defconst rxt--print-special-chars - '(?\( ?\) ?\\ ?\| ?\; ?\' ?\` ?\" ?\# ?\. ?\,) - "Characters which require a preceding backslash in Elisp source. -See (info \"(elisp) Basic Char Syntax\").") - -(defun rxt-pp-rx (regexp rx) - "Display string regexp REGEXP with its `rx' form RX in an `rxt-help-mode' buffer." - (with-current-buffer (get-buffer-create "* Regexp Explain *") - (let ((print-escape-newlines t) - (inhibit-read-only t)) - (erase-buffer) - (rxt-help-mode) - (insert (rxt--propertize-whitespace regexp)) - (newline 2) - (save-excursion - (let ((sexp-begin (point)) - (rxt--print-with-overlays t)) - (rxt-print rx) - (narrow-to-region sexp-begin (point)) - (pp-buffer) - (widen))) - (rxt-highlight-text)) - (pop-to-buffer (current-buffer)))) - -(cl-defun rxt-print (rx) - "Insert RX, an `rx' form, into the current buffer, optionally adding overlays. - -Similar to `print' or `prin1', but ensures that `rx' forms are -printed readably, using character or integer syntax depending on -context. - -If `rxt--print-with-overlays' is non-nil, also creates overlays linking -elements of RX to their corresponding locations in the source -string (see `rxt-explain-elisp', `rxt-explain-pcre' and -`rxt--make-help-overlays')." - (let ((start (point))) - (cl-typecase rx - (cons - (pcase rx - (`(,(and (or `repeat `**) head) - ,(and (pred integerp) from) - ,(and (pred integerp) to) - . ,rest) - (insert (format "(%s %d %d" head from to)) - (rxt--print-list-tail rest)) - (`(,(and (or `repeat `= `>=) head) - ,(and (pred integerp) n) - . ,rest) - (insert (format "(%s %d" head n)) - (rxt--print-list-tail rest)) - (_ - (rxt--print-list-tail rx t)))) - (symbol - (cl-case rx - ;; `print' escapes the ? characters in the rx operators *? - ;; and +?, but this looks bad and is not strictly necessary: - ;; (eq (read "*?") (read "*\\?")) => t - ;; (eq (read "+?") (read "+\\?")) => t - ((*? +?) (insert (symbol-name rx))) - (t (prin1 rx (current-buffer))))) - (string - (insert (rxt--propertize-whitespace (prin1-to-string rx)))) - (character - (cond - ((eq ? rx) - (insert "?")) - ((memq rx rxt--print-special-chars) - (insert "?\\" rx)) - ((assq rx rxt--print-char-alist) - (insert "?" (assoc-default rx rxt--print-char-alist))) - (t - (insert "?" (char-to-string rx))))) - (t - (prin1 rx (current-buffer)))) - (when rxt--print-with-overlays - (rxt--make-help-overlays rx start (point))))) - -(defun rxt--print-list-tail (tail &optional open-paren) - (let ((rxt--print-depth (1+ rxt--print-depth))) - (let ((done nil)) - (while (not done) - (cl-typecase tail - (null - (insert ")") - (setq done t)) - (cons - (if open-paren - (progn - (insert "(") - (setq open-paren nil)) - (insert " ")) - (rxt-print (car tail)) - (setq tail (cdr tail))) - (t - (insert " . ") - (rxt-print tail) - (insert ")") - (setq done t))))))) - -(defun rxt--make-help-overlays (rx start end) - (let ((location (rxt-location rx))) - (when (and location - (rxt-location-start location) - (rxt-location-end location)) - (let* ((sexp-begin (copy-marker start t)) - (sexp-end (copy-marker end)) - (sexp-bounds (list sexp-begin sexp-end)) - - (source-begin (1+ (rxt-location-start location))) - (source-end (1+ (rxt-location-end location))) - (source-bounds (list source-begin source-end)) - - (bounds (list source-bounds sexp-bounds)) - - (sexp-ol (make-overlay sexp-begin sexp-end (current-buffer) t nil)) - (source-ol (make-overlay source-begin source-end (current-buffer) t nil))) - (dolist (ol (list sexp-ol source-ol)) - (overlay-put ol 'priority rxt--print-depth) - (overlay-put ol 'rxt-bounds bounds)))))) - -(defun rxt--propertize-whitespace (string) - (let ((string (copy-sequence string)) - (start 0)) - (while (string-match rxt--whitespace-display-regexp string start) - (put-text-property (match-beginning 0) (match-end 0) - 'display - (assoc-default (string-to-char (match-string 0 string)) - rxt--print-char-alist) - string) - (setq start (match-end 0))) - string)) - -(defun rxt-highlight-text () - "Highlight the regex syntax at point and its corresponding RX/string form." - (let ((all-bounds (get-char-property (point) 'rxt-bounds))) - (mapc #'delete-overlay rxt-highlight-overlays) - (setq rxt-highlight-overlays nil) - (dolist (bounds all-bounds) - (cl-destructuring-bind (begin end) bounds - (let ((overlay (make-overlay begin end))) - (push overlay rxt-highlight-overlays) - (overlay-put overlay 'face 'rxt-highlight-face)))))) - - -;;;; Error handling - -(if (fboundp 'define-error) - (define-error 'rxt-invalid-regexp "Invalid regexp" 'invalid-regexp) - (put 'rxt-invalid-regexp - 'error-conditions - '(rxt-invalid-regexp invalid-regexp error)) - (put 'rxt-invalid-regexp 'error-message "Invalid regexp")) - -(defun rxt-error (&rest format-args) - (signal 'rxt-invalid-regexp (list (apply #'format format-args)))) - - -;;;; Regexp syntax tree data type - -;; Base class from which other elements of the syntax-tree inherit -(cl-defstruct rxt-syntax-tree) - -;; Struct representing the original source location -(cl-defstruct rxt-location - source ; Either a string or a buffer - start end ; Offsets, 0- or 1-indexed as appropriate - ) - -(defun rxt-location-text (location) - (if (not (rxt-location-p location)) - nil - (let ((start (rxt-location-start location)) - (end (rxt-location-end location)) - (source (rxt-location-source location))) - (cond - ((buffer-live-p source) - (with-current-buffer source - (buffer-substring-no-properties start end))) - ((stringp source) - (substring source start end)) - (t nil))))) - -;; Hash table mapping from syntax-tree elements to source locations. -(defvar rxt-location-map (make-hash-table :weakness 'key)) - -(defun rxt-location (object) - (gethash object rxt-location-map)) - -(gv-define-setter rxt-location (value object) - `(puthash ,object ,value rxt-location-map)) - -(defun rxt-source-text (object) - (rxt-location-text (rxt-location object))) - -(defun rxt-to-string (tree) - "Return a readable representation of TREE, a regex syntax-tree object." - (or (rxt-source-text tree) - (let ((print-level 1)) - (prin1-to-string tree)))) -(defalias 'rxt-syntax-tree-readable 'rxt-to-string) - -;; FIXME -(defvar rxt-pcre-case-fold nil) - -;; Literal string -(cl-defstruct - (rxt-string - (:constructor rxt-string (chars &optional case-fold)) - (:include rxt-syntax-tree)) - chars - (case-fold rxt-pcre-case-fold)) - -(defun rxt-empty-string () - (rxt-string "")) - -(defun rxt-trivial-p (re) - (and (rxt-string-p re) - (equal (rxt-string-chars re) ""))) - -;;; Other primitives -(cl-defstruct (rxt-primitive - (:constructor rxt-primitive (pcre rx)) - (:include rxt-syntax-tree)) - pcre rx) - -(defun rxt-bos () (rxt-primitive "\\A" 'bos)) -(defun rxt-eos () (rxt-primitive "\\Z" 'eos)) - -(defun rxt-bol () (rxt-primitive "^" 'bol)) -(defun rxt-eol () (rxt-primitive "$" 'eol)) - -;; FIXME -(defun rxt-anything () (rxt-primitive "." 'anything)) -(defun rxt-nonl () (rxt-primitive "." 'nonl)) - -(defun rxt-word-boundary () (rxt-primitive "\\b" 'word-boundary)) -(defun rxt-not-word-boundary () (rxt-primitive "\\B" 'not-word-boundary)) - -(defun rxt-wordchar () (rxt-primitive "\\w" 'wordchar)) -(defun rxt-not-wordchar () (rxt-primitive "\\W" 'not-wordchar)) - -(defun rxt-symbol-start () (rxt-primitive nil 'symbol-start)) -(defun rxt-symbol-end () (rxt-primitive nil 'symbol-end)) - -(defun rxt-bow () (rxt-primitive nil 'bow)) -(defun rxt-eow () (rxt-primitive nil 'eow)) - -;;; Sequence -(cl-defstruct - (rxt-seq - (:constructor make-rxt-seq (elts)) - (:include rxt-syntax-tree)) - elts) - -;; Slightly smart sequence constructor: -;; - Flattens nested sequences -;; - Drops trivial "" elements -;; - Empty sequence => "" -;; - Singleton sequence is reduced to its one element. -(defun rxt-seq (&rest res) ; Flatten nested seqs & drop ""'s. - (let ((res (rxt-seq-flatten res))) - (if (consp res) - (if (consp (cdr res)) - (make-rxt-seq res) ; General case - (car res)) ; Singleton sequence - (rxt-empty-string)))) ; Empty seq -- "" - -(defun rxt-seq-flatten (res) - (if (consp res) - (let ((re (car res)) - (tail (rxt-seq-flatten (cdr res)))) - (cond ((rxt-seq-p re) ; Flatten nested seqs - (append (rxt-seq-flatten (rxt-seq-elts re)) tail)) - ((rxt-trivial-p re) tail) ; Drop trivial elts - ((and (rxt-string-p re) ; Flatten strings - (consp tail) - (rxt-string-p (car tail))) - (cons - (rxt-string-concat re (car tail)) - (cdr tail))) - (t (cons re tail)))) - '())) - -(defun rxt-string-concat (str1 str2) - (if (not (eq (rxt-string-case-fold str1) - (rxt-string-case-fold str2))) - (make-rxt-seq (list str1 str2)) - (let ((result - (rxt-string (concat (rxt-string-chars str1) - (rxt-string-chars str2)) - (rxt-string-case-fold str1))) - (first (rxt-location str1)) - (last (rxt-location str2))) - (when (and first last) - (setf (rxt-location result) - (make-rxt-location :source (rxt-location-source first) - :start (rxt-location-start first) - :end (rxt-location-end last)))) - result))) - -;;; Choice (alternation/union) -(cl-defstruct - (rxt-choice - (:constructor make-rxt-choice (elts)) - (:include rxt-syntax-tree)) - elts) - -;;; The empty choice represents a regexp that never matches in any context -(defvar rxt-empty (make-rxt-choice nil)) -(defun rxt-empty-p (re) - (or - (and (rxt-choice-p re) - (null (rxt-choice-elts re))) - (rxt-empty-char-set-p re))) - -(defun rxt-choice (&rest alternatives) - "Construct the alternation (union) of several regexps. - -ALTERNATIVES should be a list of `rxt-syntax-tree' objects. -The return value is an `rxt-choice' object representing a regexp -which matches any one of ALTERNATIVES, but simplified in the -following ways: - -- If ALTERNATIVES contains only one element, it is returned unchanged. - -- All existing `rxt-choice' elements in ALTERNATIVES are replaced - by a flat list of their subexpressions: symbolically, - a|(b|(c|d)) is replaced by a|b|c|d - -- All character sets and single-character strings in ALTERNATIVES - are combined together into one or two character sets, - respecting case-folding behaviour." - (cl-destructuring-bind (other-elements char-set case-fold-char-set) - (rxt--simplify-alternatives alternatives) - (let ((simplified-alternatives - (append (if (not (rxt-empty-p char-set)) - (list char-set) - '()) - (if (not (rxt-empty-p case-fold-char-set)) - (list case-fold-char-set) - '()) - other-elements))) - (pcase simplified-alternatives - (`() - rxt-empty) - (`(,element) - element) - (_ - (make-rxt-choice simplified-alternatives)))))) - -(defun rxt--simplify-alternatives (alternatives) - "Simplify a set of regexp alternatives. - -ALTERNATIVES should be a list of `rxt-syntax-tree' objects to be combined -into an `rxt-choice' structure. The result is a three-element -list (OTHER-ELEMENTS CHAR-SET CASE-FOLDED-CHAR-SET): - -- CHAR-SET is an `rxt-char-set-union' containing the union of all - case-sensitive character sets and single-character strings in - RES. - -- CASE-FOLDED-CHAR-SET is similar but combines all the - case-insensitive character sets and single-character strings. - -- OTHER-ELEMENTS is a list of all other elements, with all - `rxt-choice' structures replaced by a flat list of their - component subexpressions." - (if (null alternatives) - (list '() - (make-rxt-char-set-union :case-fold nil) - (make-rxt-char-set-union :case-fold t)) - (let* ((re (car alternatives))) - (cl-destructuring-bind (tail char-set case-fold-char-set) - (rxt--simplify-alternatives (cdr alternatives)) - (cond ((rxt-choice-p re) ; Flatten nested choices - (list - (append (rxt-choice-elts re) tail) - char-set - case-fold-char-set)) - - ((rxt-empty-p re) ; Drop empty re's. - (list tail char-set case-fold-char-set)) - - ((rxt-char-set-union-p re) ; Fold char sets together - (if (rxt-char-set-union-case-fold re) - (list tail - char-set - (rxt-char-set-union case-fold-char-set re)) - (list tail - (rxt-char-set-union char-set re) - case-fold-char-set))) - - ((and (rxt-string-p re) ; Same for 1-char strings - (= 1 (length (rxt-string-chars re)))) - (if (rxt-string-case-fold re) - (list tail - char-set - (rxt-char-set-union case-fold-char-set re)) - (list tail - (rxt-char-set-union char-set re) - case-fold-char-set))) - - (t ; Otherwise. - (list (cons re tail) char-set case-fold-char-set))))))) - -;;; Repetition -(cl-defstruct (rxt-repeat - (:include rxt-syntax-tree)) - from to body greedy) - -(cl-defun rxt-repeat (from to body &optional (greedy t)) - (if (equal to 0) - (rxt-empty-string) - (make-rxt-repeat :from from :to to - :body body :greedy greedy))) - -;;; Submatch -(cl-defstruct - (rxt-submatch - (:constructor rxt-submatch (body)) - (:include rxt-syntax-tree)) - body) - -;;; Numbered submatch (Emacs only) -(cl-defstruct - (rxt-submatch-numbered - (:constructor rxt-submatch-numbered (n body)) - (:include rxt-syntax-tree)) - n - body) - -;;; Backreference -(cl-defstruct - (rxt-backref - (:constructor rxt-backref (n)) - (:include rxt-syntax-tree)) - n) - -;;; Syntax classes (Emacs only) -(cl-defstruct (rxt-syntax-class - (:include rxt-syntax-tree)) - symbol) - -(defun rxt-syntax-class (symbol) - (if (assoc symbol rx-syntax) - (make-rxt-syntax-class :symbol symbol) - (rxt-error "Invalid syntax class symbol `%s'" symbol))) - -;;; Character categories (Emacs only) -(cl-defstruct (rxt-char-category - (:include rxt-syntax-tree)) - symbol) - -(defun rxt-char-category (symbol) - (if (assoc symbol rx-categories) - (make-rxt-char-category :symbol symbol) - (rxt-error "Invalid character category symbol `%s'" symbol))) - - -;;; Char sets -;; <rxt-char-set> ::= <rxt-char-set-union> -;; | <rxt-char-set-negation> -;; | <rxt-char-set-intersection> - -(cl-defstruct (rxt-char-set (:include rxt-syntax-tree))) - -;; An rxt-char-set-union represents the union of any number of -;; characters, character ranges, and POSIX character classes: anything -;; that can be represented in string notation as a class [ ... ] -;; without the negation operator. -(cl-defstruct (rxt-char-set-union - (:include rxt-char-set)) - chars ; list of single characters - ranges ; list of ranges (from . to) - classes ; list of character classes - (case-fold rxt-pcre-case-fold)) - -;; Test for empty character set -(defun rxt-empty-char-set-p (cset) - (and (rxt-char-set-union-p cset) - (null (rxt-char-set-union-chars cset)) - (null (rxt-char-set-union-ranges cset)) - (null (rxt-char-set-union-classes cset)))) - -;; Simple union constructor -(defun rxt-char-set-union (&rest items) - "Construct an regexp character set representing the union of ITEMS. - -Each element of ITEMS may be either: a character; a -single-character string; a single-character `rxt-string' object; -a cons, (FROM . TO) representing a range of characters; a symbol, -representing a named character class; or an `rxt-char-set-union' -object. All `rxt-char-set-union' objects in ITEMS must have the -same `case-fold' property." - (let ((chars '()) - (ranges '()) - (classes '()) - (case-fold 'undetermined)) - (dolist (item items) - (cl-etypecase item - (character - (push item chars)) - - (string - (cl-assert (= 1 (length item))) - (push (string-to-char item) chars)) - - (rxt-string - (cl-assert (= 1 (length (rxt-string-chars item)))) - (push (string-to-char (rxt-string-chars item)) chars)) - - (cons ; range (from . to) - (cl-check-type (car item) character) - (cl-check-type (cdr item) character) - (push item ranges)) - - (symbol ; named character class - (push item classes)) - - (rxt-char-set-union - (if (eq case-fold 'undetermined) - (setq case-fold (rxt-char-set-union-case-fold item)) - (unless (eq case-fold (rxt-char-set-union-case-fold item)) - (error "Cannot construct union of char-sets with unlike case-fold setting: %S" item))) - (setq chars (nconc chars (rxt-char-set-union-chars item))) - (setq ranges (nconc ranges (rxt-char-set-union-ranges item))) - (setq classes (nconc classes (rxt-char-set-union-classes item)))))) - - (make-rxt-char-set-union :chars chars :ranges ranges :classes classes - :case-fold (if (eq case-fold 'undetermined) - rxt-pcre-case-fold - case-fold)))) - -(defun rxt--all-char-set-union-chars (char-set) - "Return a list of all characters in CHAR-SET." - (cl-assert (rxt-char-set-union-p char-set)) - (append - (rxt-char-set-union-chars char-set) - (cl-loop for (start . end) in (rxt-char-set-union-ranges char-set) - nconc (cl-loop for char from start to end collect char)))) - -(defun rxt--simplify-char-set (char-set &optional case-fold-p) - "Return a minimal char-set to match the same characters as CHAR-SET. - -With optional argument CASE-FOLD-P, return a char-set which -emulates case-folding behaviour by including both uppercase and -lowercase versions of all characters in CHAR-SET." - (cl-assert (rxt-char-set-union-p char-set)) - (let* ((classes (rxt-char-set-union-classes char-set)) - (all-chars - (if case-fold-p - (cl-loop for char in (rxt--all-char-set-union-chars char-set) - nconc (list (upcase char) (downcase char))) - (rxt--all-char-set-union-chars char-set))) - (all-ranges - (rxt--extract-ranges (rxt--remove-redundant-chars all-chars classes)))) - (let ((singletons nil) - (ranges nil)) - (cl-loop for (start . end) in all-ranges - do - (cond ((= start end) (push start singletons)) - ((= (1+ start) end) - (push start singletons) - (push end singletons)) - (t (push (cons start end) ranges)))) - (make-rxt-char-set-union :chars (nreverse singletons) - :ranges (nreverse ranges) - :classes classes - :case-fold (if case-fold-p - nil - (rxt-char-set-union-case-fold char-set)))))) - -(defun rxt--remove-redundant-chars (chars classes) - "Remove all characters which match a character class in CLASSES from CHARS." - (if (null classes) - chars - (string-to-list - (replace-regexp-in-string - (rx-to-string `(any ,@classes)) - "" - (apply #'string chars))))) - -(defun rxt--extract-ranges (chars) - "Return a list of all contiguous ranges in CHARS. - -CHARS should be a list of characters (integers). The return -value is a list of conses (START . END) representing ranges, such -that the union of all the ranges represents the same of -characters as CHARS. - -Example: - (rxt--extract-ranges (list ?a ?b ?c ?q ?x ?y ?z)) - => ((?a . ?c) (?q . ?q) (?x . ?z))" - (let ((array - (apply #'vector - (cl-remove-duplicates - (sort (copy-sequence chars) #'<))))) - (cl-labels - ((recur (start end) - (if (< end start) - nil - (let ((min (aref array start)) - (max (aref array end))) - (if (= (- max min) (- end start)) - (list (cons min max)) - (let* ((split-point (/ (+ start end) 2)) - (left (recur start split-point)) - (right (recur (1+ split-point) end))) - (merge left right)))))) - (merge (left right) - (cond ((null left) right) - ((null right) left) - (t - (let ((last-left (car (last left))) - (first-right (car right))) - (if (= (1+ (cdr last-left)) - (car first-right)) - (append (cl-subseq left 0 -1) - (list - (cons (car last-left) - (cdr first-right))) - (cl-subseq right 1)) - (append left right))))))) - (recur 0 (1- (length array)))))) - -;;; Set complement of character set, syntax class, or character -;;; category - -;; In general, all character sets that can be represented in string -;; notation as [^ ... ] (but see `rxt-char-set-intersection', below), plus -;; Emacs' \Sx and \Cx constructions. -(cl-defstruct (rxt-char-set-negation - (:include rxt-char-set)) - elt) - -(defun rxt-negate (char-set) - "Construct the logical complement (negation) of CHAR-SET. - -CHAR-SET may be any of the following types: `rxt-char-set-union', -`rxt-syntax-class', `rxt-char-category', or `rxt-char-set-negation'." - (cl-etypecase char-set - ((or rxt-char-set-union rxt-syntax-class rxt-char-category) - (make-rxt-char-set-negation :elt char-set)) - - (rxt-char-set-negation - (rxt-char-set-negation-elt char-set)))) - -;;; Intersections of char sets - -;; These are difficult to represent in general, but can be constructed -;; in Perl using double negation; for example: [^\Wabc] means the set -;; complement of [abc] with respect to the universe of "word -;; characters": (& (~ (~ word)) (~ ("abc"))) == (& word (~ ("abc"))) -;; == (- word ("abc")) - -(cl-defstruct (rxt-char-set-intersection - (:include rxt-char-set)) - elts) - -;; Intersection constructor -(defun rxt-char-set-intersection (&rest charsets) - (let ((elts '()) - (cmpl (make-rxt-char-set-union))) - (dolist (cset (rxt-int-flatten charsets)) - (cond - ((rxt-char-set-negation-p cset) - ;; Fold negated charsets together: ~A & ~B = ~(A|B) - (setq cmpl (rxt-char-set-union cmpl (rxt-char-set-negation-elt cset)))) - - ((rxt-char-set-union-p cset) - (push cset elts)) - - (t - (rxt-error "Can't take intersection of non-character-set %S" cset)))) - - (if (null elts) - (rxt-negate cmpl) - (unless (rxt-empty-char-set-p cmpl) - (push (rxt-negate cmpl) elts)) - (if (null (cdr elts)) - (car elts) ; singleton case - (make-rxt-char-set-intersection :elts elts))))) - -;; Constructor helper: flatten nested intersections -(defun rxt-int-flatten (csets) - (if (consp csets) - (let ((cset (car csets)) - (tail (rxt-int-flatten (cdr csets)))) - (if (rxt-char-set-intersection-p cset) - (append (rxt-int-flatten (rxt-char-set-intersection-elts cset)) tail) - (cons cset tail))) - '())) - - - -;;;; Macros for building the parser - -(defmacro rxt-token-case (&rest cases) - "Consume a token at point and evaluate corresponding forms. - -CASES is a list of `cond'-like clauses, (REGEXP BODY ...) where -the REGEXPs define possible tokens which may appear at point. The -CASES are considered in order. For each case, if the text at -point matches REGEXP, then point is moved to the end of the -matched token, the corresponding BODY is evaluated and their -value returned. The matched token is available within the BODY -forms as (match-string 0). - -There can be a default case where REGEXP is `t', which evaluates -the corresponding FORMS but does not move point. - -Returns `nil' if none of the CASES matches." - (declare (debug (&rest (sexp &rest form)))) - `(cond - ,@(cl-loop for (token . action) in cases - collect - (if (eq token t) - `(t ,@action) - `((looking-at ,token) - (goto-char (match-end 0)) - ,@action))))) - -(defmacro rxt-with-source-location (&rest body) - "Evaluate BODY and record source location information on its value. - -BODY may evaluate to any kind of object, but its value should -generally not be `eq' to any other object." - (declare (debug (&rest form))) - (let ((begin (make-symbol "begin")) - (value (make-symbol "value"))) - `(let ((,begin (point)) - (,value ,(macroexp-progn body))) - (setf (rxt-location ,value) - (make-rxt-location :source rxt-source-text-string - :start (1- ,begin) - :end (1- (point)))) - ,value))) - -;; Read PCRE + flags -(defun rxt-read-delimited-pcre () - "Read a Perl-style delimited regexp and flags from the current buffer. - -Point should be before the regexp literal before calling -this. Currently only regexps delimited by / ... / are supported. -A preceding \"m\", \"qr\" or \"s\" will be ignored, as will the -replacement string in an s/.../.../ construction. - -Returns two strings: the regexp and the flags." - (save-excursion - (skip-syntax-forward "-") - - ;; Skip m, qr, s - (let ((is-subst (rxt-token-case - ("s" t) - ((rx (or "m" "qr")) nil)))) - - (when (not (looking-at "/")) - (error "Only Perl regexps delimited by slashes are supported")) - (let ((beg (match-end 0)) - (delim (rx (not (any "\\")) - (group "/")))) - (search-forward-regexp delim) - (let ((end (match-beginning 1))) - (when is-subst (search-forward-regexp delim)) - (let ((pcre (buffer-substring-no-properties beg end))) - (rxt-token-case - ("[gimosx]*" - (rxt--add-flags pcre (match-string-no-properties 0)))))))))) - - -;;;; Elisp and PCRE string notation parser - -;;; Parser constants -(defconst rxt-pcre-char-set-alist - `((?w . ; "word" characters - (?_ alnum)) - (?d . ; digits - (digit)) - (?h . ; horizontal whitespace - (#x0009 #x0020 #x00A0 #x1680 #x180E #x2000 #x2001 #x2002 #x2003 - #x2004 #x2005 #x2006 #x2007 #x2008 #x2009 #x200A #x202F - #x205F #x3000)) - (?s . ; whitespace - (9 10 12 13 32)) - (?v . ; vertical whitespace - (#x000A #x000B #x000C #x000D #x0085 #x2028 #x2029)))) - -(defconst rxt-pcre-named-classes-regexp - (rx "[:" - (submatch - (or "alnum" "alpha" "ascii" "blank" "cntrl" "digit" "graph" "lower" - "print" "punct" "space" "upper" "word" "xdigit")) - ":]")) - -(defconst rxt-elisp-named-classes-regexp - (rx "[:" - (submatch - (or "alnum" "alpha" "ascii" "blank" "cntrl" "digit" "graph" "lower" - "print" "punct" "space" "upper" "word" "xdigit" - "unibyte" "nonascii" "multibyte")) - ":]")) - -;;; The following dynamically bound variables control the operation of -;;; the parser (see `rxt-parse-re'.) -(defvar rxt-parse-pcre nil - "t if the rxt string parser is parsing PCRE syntax, nil for Elisp syntax. - -This should only be let-bound internally, never set otherwise.") - -(defvar rxt-pcre-extended-mode nil - "t if the rxt string parser is emulating PCRE's \"extended\" mode. - -In extended mode (indicated by /x in Perl/PCRE), whitespace -outside of character classes and \\Q...\\E quoting is ignored, -and a `#' character introduces a comment that extends to the end -of line.") - -(defvar rxt-pcre-s-mode nil - "t if the rxt string parser is emulating PCRE's single-line \"/s\" mode. - -When /s is used, PCRE's \".\" matches newline characters, which -otherwise it would not match.") - -(defvar rxt-pcre-case-fold nil - "non-nil to emulate PCRE's case-insensitive \"/i\" mode in translated regexps.") - -(defvar rxt-branch-end-regexp nil) -(defvar rxt-choice-regexp nil) -(defvar rxt-brace-begin-regexp nil) -(defvar rxt-m-to-n-brace-regexp nil) -(defvar rxt-m-to-?-brace-regexp nil) -(defvar rxt-m-brace-regexp nil) -(defvar rxt-named-classes-regexp nil) - -(defvar rxt-subgroup-count nil) -(defvar rxt-source-text-string nil) - -(defun rxt-parse-pcre (re) - (rxt-parse-re re t)) - -(defun rxt-parse-elisp (re) - (rxt-parse-re re nil)) - -(defun rxt-parse-re (re pcre-p) - (let* ((rxt-parse-pcre pcre-p) - (rxt-pcre-extended-mode nil) - (rxt-pcre-s-mode nil) - (rxt-pcre-case-fold nil) - - ;; Bind regexps to match syntax that differs between PCRE and - ;; Elisp only in the addition of a backslash "\" - (escape (if pcre-p "" "\\")) - (rxt-choice-regexp - (rx-to-string `(seq ,escape "|"))) - (rxt-branch-end-regexp - (rx-to-string `(or buffer-end - (seq ,escape (or "|" ")"))))) - (rxt-brace-begin-regexp - (rx-to-string `(seq ,escape "{"))) - (rxt-m-to-n-brace-regexp - (rx-to-string - `(seq - (submatch (* (any "0-9"))) "," (submatch (+ (any "0-9"))) - ,escape "}"))) - (rxt-m-to-?-brace-regexp - (rx-to-string - `(seq (submatch (+ (any "0-9"))) "," ,escape "}"))) - (rxt-m-brace-regexp - (rx-to-string - `(seq (submatch (+ (any "0-9"))) ,escape "}"))) - - ;; Named character classes [: ... :] differ slightly - (rxt-named-classes-regexp - (if pcre-p - rxt-pcre-named-classes-regexp - rxt-elisp-named-classes-regexp)) - - (rxt-subgroup-count 0) - (case-fold-search nil)) - (with-temp-buffer - (insert re) - (goto-char (point-min)) - (let ((rxt-source-text-string re)) - (rxt-parse-exp))))) - -;; Parse a complete regex: a number of branches separated by | or -;; \|, as determined by `rxt-branch-end-regexp'. -(defun rxt-parse-exp () - ;; These variables are let-bound here because in PCRE mode they may - ;; be set internally by (?x) or (?s) constructions, whose scope - ;; lasts until the end of a sub-expression - (rxt-with-source-location - (let ((rxt-pcre-extended-mode rxt-pcre-extended-mode) - (rxt-pcre-s-mode rxt-pcre-s-mode) - (rxt-pcre-case-fold rxt-pcre-case-fold)) - (if (eobp) - (rxt-seq) - (let ((branches '())) - (cl-block nil - (while t - (let ((branch (rxt-parse-branch))) - (push branch branches) - (rxt-token-case - (rxt-choice-regexp nil) - (t (cl-return (apply #'rxt-choice (reverse branches))))))))))))) - -;; Skip over whitespace and comments in PCRE extended mode -(defun rxt-extended-skip () - (when rxt-pcre-extended-mode - (skip-syntax-forward "-") - (while (looking-at "#") - (beginning-of-line 2) - (skip-syntax-forward "-")))) - -;; Parse a regexp "branch": a sequence of pieces -(defun rxt-parse-branch () - (rxt-extended-skip) - (rxt-with-source-location - (let ((pieces '()) - (branch-start-p t)) - (while (not (looking-at rxt-branch-end-regexp)) - (push (rxt-parse-piece branch-start-p) pieces) - (setq branch-start-p nil)) - (apply #'rxt-seq (reverse pieces))))) - -;; Parse a regexp "piece": an atom (`rxt-parse-atom') plus any -;; following quantifiers -(defun rxt-parse-piece (&optional branch-begin) - (rxt-extended-skip) - (rxt-with-source-location - (let ((atom (rxt-parse-atom branch-begin))) - (rxt-parse-quantifiers atom)))) - -;; Parse any and all quantifiers after ATOM and return the quantified -;; regexp, or ATOM unchanged if no quantifiers -(defun rxt-parse-quantifiers (atom) - (catch 'done - (while (not (eobp)) - (let ((atom1 (rxt-parse-quantifier atom))) - (if (eq atom1 atom) - (throw 'done t) - (setq atom atom1))))) - atom) - -;; Possibly parse a single quantifier after ATOM and return the -;; quantified atom, or ATOM if no quantifier -(defun rxt-parse-quantifier (atom) - (rxt-extended-skip) - (rxt-token-case - ((rx "*?") (rxt-repeat 0 nil atom nil)) - ((rx "*") (rxt-repeat 0 nil atom t)) - ((rx "+?") (rxt-repeat 1 nil atom nil)) - ((rx "+") (rxt-repeat 1 nil atom t)) - ((rx "??") (rxt-repeat 0 1 atom nil)) - ((rx "?") (rxt-repeat 0 1 atom t)) - ;; Brace expression "{M,N}", "{M,}", "{M}" - (rxt-brace-begin-regexp - (cl-destructuring-bind (from to) - (rxt-parse-braces) - (rxt-repeat from to atom))) - ;; No quantifiers found - (t atom))) - -;; Parse a regexp atom, i.e. an element that binds to any following -;; quantifiers. This includes characters, character classes, -;; parenthesized groups, assertions, etc. -(defun rxt-parse-atom (&optional branch-begin) - (if (eobp) - (rxt-error "Unexpected end of regular expression") - (if rxt-parse-pcre - (rxt-parse-atom/pcre) - (rxt-parse-atom/el branch-begin)))) - -(defun rxt-parse-atom/common () - (rxt-token-case - ((rx "[") (rxt-parse-char-class)) - ((rx "\\b") (rxt-word-boundary)) - ((rx "\\B") (rxt-not-word-boundary)))) - -(defun rxt-parse-atom/el (branch-begin) - (rxt-with-source-location - (or (rxt-parse-atom/common) - (rxt-token-case - ;; "." wildcard - ((rx ".") (rxt-nonl)) - ;; "^" and "$" are metacharacters only at beginning or end of a - ;; branch in Elisp; elsewhere they are literals - ((rx "^") - (if branch-begin - (rxt-bol) - (rxt-string "^"))) - ((rx "$") - (if (looking-at rxt-branch-end-regexp) - (rxt-eol) - (rxt-string "$"))) - ;; Beginning & end of string, word, symbol - ((rx "\\`") (rxt-bos)) - ((rx "\\'") (rxt-eos)) - ((rx "\\<") (rxt-bow)) - ((rx "\\>") (rxt-eow)) - ((rx "\\_<") (rxt-symbol-start)) - ((rx "\\_>") (rxt-symbol-end)) - ;; Subgroup - ((rx "\\(") (rxt-parse-subgroup/el)) - ;; Word/non-word characters (meaning depending on syntax table) - ((rx "\\w") (rxt-wordchar)) - ((rx "\\W") (rxt-not-wordchar)) - ;; Other syntax categories - ((rx "\\" (submatch (any ?S ?s)) (submatch nonl)) - (let ((negated (string= (match-string 1) "S")) - (syntax - (car (rassoc (string-to-char (match-string 2)) - rx-syntax)))) - (if syntax - (let ((re (rxt-syntax-class syntax))) - (if negated (rxt-negate re) re)) - (rxt-error "Invalid syntax class `\\%s'" (match-string 0))))) - ;; Character categories - ((rx "\\" (submatch (any ?C ?c)) (submatch nonl)) - (let ((negated (string= (match-string 1) "C")) - (category - (car (rassoc (string-to-char (match-string 2)) - rx-categories)))) - (if category - (let ((re (rxt-char-category category))) - (if negated (rxt-negate re) re)) - (rxt-error "Invalid character category `%s'" (match-string 0))))) - ;; Backreference - ((rx (seq "\\" (submatch (any "1-9")))) - (rxt-backref (string-to-number (match-string 1)))) - ;; Other escaped characters - ((rx (seq "\\" (submatch nonl))) - (rxt-string (match-string 1))) - ;; Normal characters - ((rx (or "\n" nonl)) - (rxt-string (match-string 0))))))) - -(defun rxt-parse-atom/pcre () - (rxt-extended-skip) - (rxt-with-source-location - (or - ;; Is it an atom that's the same in Elisp? - (rxt-parse-atom/common) - ;; Is it common to PCRE regex and character class syntax? - (let ((char (rxt-parse-escapes/pcre))) - (and char - (rxt-string (char-to-string char)))) - ;; Otherwise: - (rxt-token-case - ;; "." wildcard - ((rx ".") - (if rxt-pcre-s-mode - (rxt-anything) - (rxt-nonl))) - ;; Beginning & end of string/line - ((rx "^") (rxt-bol)) - ((rx "$") (rxt-eol)) - ((rx "\\A") (rxt-bos)) - ((rx "\\Z") (rxt-eos)) - ;; Subgroup - ((rx "(") (rxt-parse-subgroup/pcre)) - ;; Metacharacter quoting - ((rx "\\Q") - ;; It would seem simple to take all the characters between \Q - ;; and \E and make an rxt-string, but \Q...\E isn't an atom: - ;; any quantifiers afterward should bind only to the last - ;; character, not the whole string. - (let ((begin (point))) - (search-forward "\\E" nil t) - (let* ((end (match-beginning 0)) - (str (buffer-substring-no-properties begin (1- end))) - (char (char-to-string (char-before end)))) - (rxt-seq (rxt-string str) - (rxt-parse-quantifiers (rxt-string char)))))) - ;; Pre-defined character sets - ((rx "\\" (submatch (any "d" "D" "h" "H" "s" "S" "v" "V" "w" "W"))) - (rxt--pcre-char-set (string-to-char (match-string 1)))) - ;; \ + digits: backreference or octal char? - ((rx "\\" (submatch (+ (any "0-9")))) - (let* ((digits (match-string 1)) - (dec (string-to-number digits))) - ;; from "man pcrepattern": If the number is less than 10, or if - ;; there have been at least that many previous capturing left - ;; parentheses in the expression, the entire sequence is taken - ;; as a back reference. - (if (and (> dec 0) - (or (< dec 10) - (>= rxt-subgroup-count dec))) - (progn - (when rxt-pcre-case-fold - (display-warning - 'rxt "Backreferences with case-folding are handled poorly")) - (rxt-backref dec)) - ;; from "man pcrepattern": if the decimal number is greater - ;; than 9 and there have not been that many capturing - ;; subpatterns, PCRE re-reads up to three octal digits - ;; following the backslash, and uses them to generate a data - ;; character. Any subsequent digits stand for themselves. - (goto-char (match-beginning 1)) - (re-search-forward (rx (** 0 3 (any "0-7")))) - (rxt-string (char-to-string (string-to-number (match-string 0) 8)))))) - ;; Other escaped characters - ((rx "\\" (submatch nonl)) (rxt-string (match-string 1))) - ;; Everything else - ((rx (or (any "\n") nonl)) (rxt-string (match-string 0))))))) - -(defun rxt-parse-escapes/pcre () - "Consume a one-char PCRE escape at point and return its codepoint equivalent. - -Handles only those character escapes which have the same meaning -in character classes as outside them." - (rxt-token-case - ((rx "\\a") #x07) ; bell - ((rx "\\e") #x1b) ; escape - ((rx "\\f") #x0c) ; formfeed - ((rx "\\n") #x0a) ; linefeed - ((rx "\\r") #x0d) ; carriage return - ((rx "\\t") #x09) ; tab - ;; Control character - ((rx "\\c" (submatch nonl)) - ;; from `man pcrepattern': - ;; The precise effect of \cx is as follows: if x is a lower case - ;; letter, it is converted to upper case. Then bit 6 of the - ;; character (hex 40) is inverted. - (logxor (string-to-char (upcase (match-string 1))) #x40)) - ;; Hex escapes - ((rx "\\x" (submatch (** 1 2 (any "0-9" "A-Z" "a-z")))) - (string-to-number (match-string 1) 16)) - ((rx "\\x{" (submatch (* (any "0-9" "A-Z" "a-z"))) "}") - (string-to-number (match-string 1) 16)))) - -(defun rxt-parse-subgroup/pcre () - (catch 'return - (let ((shy nil) - (extended-mode rxt-pcre-extended-mode) - (single-line-mode rxt-pcre-s-mode) - (case-fold rxt-pcre-case-fold)) - (rxt-extended-skip) - ;; Check for special (? ..) and (* ...) syntax - (rxt-token-case - ((rx "?") ; (? - (rxt-token-case - ((rx ")") ; Empty group (?) - (throw 'return (rxt-empty-string))) - (":" (setq shy t)) ; Shy group (?: - ("#" ; Comment (?# - (search-forward ")") - (throw 'return (rxt-empty-string))) - ((rx (or ; Flags (?isx-isx - (seq (group (* (any "gimosx"))) "-" (group (+ (any "gimosx")))) - (seq (group (+ (any "gimosx")))))) - (let ((token (match-string 0)) - (on (or (match-string 1) (match-string 3))) - (off (or (match-string 2) ""))) - (if (cl-find ?x on) (setq extended-mode t)) - (if (cl-find ?s on) (setq single-line-mode t)) - (if (cl-find ?i on) (setq case-fold t)) - (if (cl-find ?x off) (setq extended-mode nil)) - (if (cl-find ?s off) (setq single-line-mode nil)) - (if (cl-find ?i off) (setq case-fold nil)) - (when (string-match-p "[gmo]" token) - (display-warning - 'rxt (format "Unhandled PCRE flags in (?%s" token)))) - (rxt-token-case - (":" (setq shy t)) ; Shy group with flags (?isx-isx: ... - (")" ; Set flags (?isx-isx) - ;; Set flags for the remainder of the current subexpression - (setq rxt-pcre-extended-mode extended-mode - rxt-pcre-s-mode single-line-mode - rxt-pcre-case-fold case-fold) - (throw 'return (rxt-empty-string))))) - ;; Other constructions like (?=, (?!, etc. are not recognised - (t (rxt-error "Unrecognized PCRE extended construction `(?%c'" - (char-after))))) - - ;; No special (* ...) verbs are recognised - ((rx "*") - (let ((begin (point))) - (search-forward ")" nil 'go-to-end) - (rxt-error "Unrecognized PCRE extended construction `(*%s'" - (buffer-substring begin (point)))))) - - ;; Parse the remainder of the subgroup - (unless shy (cl-incf rxt-subgroup-count)) - (let* ((rxt-pcre-extended-mode extended-mode) - (rxt-pcre-s-mode single-line-mode) - (rxt-pcre-case-fold case-fold) - (rx (rxt-parse-exp))) - (rxt-extended-skip) - (rxt-token-case - (")" (if shy rx (rxt-submatch rx))) - (t (rxt-error "Subexpression missing close paren"))))))) - -(defun rxt-parse-subgroup/el () - (let ((kind - (rxt-token-case - ((rx "?:") - (cl-incf rxt-subgroup-count) - 'shy) - ((rx "?" (group (+ (in "0-9"))) ":") - (let ((n (string-to-number (match-string 1)))) - (when (< rxt-subgroup-count n) - (setf rxt-subgroup-count n)) - n)) - ((rx "?") ; Reserved - (rxt-error "Unknown match group sequence"))))) - (let ((rx (rxt-parse-exp))) - (rxt-token-case - ((rx "\\)") - (cond ((eq kind 'shy) rx) - ((numberp kind) - (rxt-submatch-numbered kind rx)) - (t (rxt-submatch rx)))) - (t (rxt-error "Subexpression missing close paren")))))) - -(defun rxt-parse-braces () - (rxt-token-case - (rxt-m-to-n-brace-regexp - (list (string-to-number (match-string 1)) - (string-to-number (match-string 2)))) - (rxt-m-to-?-brace-regexp - (list (string-to-number (match-string 1)) nil)) - (rxt-m-brace-regexp - (let ((a (string-to-number (match-string 1)))) - (list a a))) - (t - (let ((begin (point))) - (search-forward "}" nil 'go-to-end) - (rxt-error "Bad brace expression {%s" - (buffer-substring-no-properties begin (point))))))) - -;; Parse a character set range [...] -(defun rxt-parse-char-class () - (when (eobp) - (rxt-error "Missing close right bracket in regexp")) - (rxt-with-source-location - (let* ((negated (rxt-token-case - ((rx "^") t) - (t nil))) - (begin (point)) - (result - (if negated - (rxt-negate (rxt-char-set-union)) - (rxt-char-set-union))) - (transformer - (if negated #'rxt-negate #'identity)) - (builder - (if negated #'rxt-char-set-intersection #'rxt-choice))) - (catch 'done - (while t - (when (eobp) - (rxt-error "Missing close right bracket in regexp")) - (if (and (looking-at (rx "]")) - (not (= (point) begin))) - (throw 'done result) - (let ((piece (funcall transformer (rxt-parse-char-class-piece)))) - (setq result (funcall builder result piece)))))) - (forward-char) ; Skip over closing "]" - result))) - -;; Parse a single character, a character range, or a posix class -;; within a character set context. Returns an `rxt-char-set'. -(defun rxt-parse-char-class-piece () - (let ((atom (rxt-parse-char-class-atom))) - (cl-typecase atom - (rxt-char-set ; return unchanged - atom) - (integer ; character: check for range - (let ((range-end (rxt-maybe-parse-range-end))) - (if range-end - (rxt-char-set-union (cons atom range-end)) - (rxt-char-set-union atom)))) - (t ; transform into character set - (rxt-char-set-union atom))))) - -;; Parse a single character or named class within a charset. -;; -;; Returns an integer (a character), a symbol (representing a named -;; character class) or an `rxt-char-set' (for pre-defined character -;; classes like \d, \W, etc.) -(defun rxt-parse-char-class-atom () - (or - ;; First, check for PCRE-specific backslash sequences - (and rxt-parse-pcre - (rxt-parse-char-class-atom/pcre)) - ;; Char-class syntax - (rxt-token-case - ;; Named classes [:alnum:], ... - (rxt-named-classes-regexp - (intern (match-string 1))) - ;; Error on unknown posix-class-like syntax - ((rx "[:" (* (any "a-z")) ":]") - (rxt-error "Unknown posix character class `%s'" (match-string 0))) - ;; Error on [= ... ]= collation syntax - ((rx "[" (submatch (any "." "=")) - (* (any "a-z")) (backref 1) "]") - (rxt-error "Unsupported collation syntax `%s'" (match-string 0))) - ;; Other characters stand for themselves - ((rx (or "\n" nonl)) - (string-to-char (match-string 0)))))) - -;; Parse backslash escapes inside PCRE character classes -(defun rxt-parse-char-class-atom/pcre () - (or (rxt-parse-escapes/pcre) - (rxt-token-case - ;; Backslash + digits => octal char - ((rx "\\" (submatch (** 1 3 (any "0-7")))) - (string-to-number (match-string 1) 8)) - ;; Pre-defined character sets - ((rx "\\" (submatch (any "d" "D" "h" "H" "s" "S" "v" "V" "w" "W"))) - (rxt--pcre-char-set (string-to-char (match-string 1)))) - ;; "\b" inside character classes is a backspace - ((rx "\\b") ?\C-h) - ;; Ignore other escapes - ((rx "\\" (submatch nonl)) - (string-to-char (match-string 1)))))) - -;; Look for a range tail (the "-z" in "a-z") after parsing a single -;; character within in a character set. Returns either a character -;; representing the range end, or nil. -(defun rxt-maybe-parse-range-end () - (let ((range-end nil) (end-position nil)) - (when (looking-at (rx "-" (not (any "]")))) - (save-excursion - (forward-char) - (setq range-end (rxt-parse-char-class-atom) - end-position (point)))) - - (if (characterp range-end) - ;; This is a range: move point after it and return the ending character - (progn - (goto-char end-position) - range-end) - ;; Not a range. - nil))) - -;; Return the pre-defined PCRE char-set associated with CHAR: i.e. \d -;; is digits, \D non-digits, \s space characters, etc. -(defun rxt--pcre-char-set (char) - (let* ((base-char (downcase char)) - (negated (/= char base-char)) - (elements (assoc-default base-char rxt-pcre-char-set-alist)) - (base-char-set (apply #'rxt-char-set-union elements))) - (if negated - (rxt-negate base-char-set) - base-char-set))) - - -;;;; Unparser to `rx' syntax - -(defconst rxt-rx-verbose-equivalents - '((bol . line-start) - (eol . line-end) - (nonl . not-newline) - (bos . string-start) - (eos . string-end) - (bow . word-start) - (eow . word-end) - (seq . sequence)) - "Alist of verbose equivalents for short `rx' primitives.") - -(defun rxt-rx-symbol (sym) - (if rxt-verbose-rx-translation - (or (assoc-default sym rxt-rx-verbose-equivalents) - sym) - sym)) - -(defun rxt-adt->rx (re) - (let ((rx - (cl-typecase re - (rxt-primitive - (rxt-rx-symbol (rxt-primitive-rx re))) - - (rxt-string - (if (or (not (rxt-string-case-fold re)) - (string= "" (rxt-string-chars re))) - (rxt-string-chars re) - `(seq - ,@(cl-loop for char across (rxt-string-chars re) - collect `(any ,(upcase char) ,(downcase char)))))) - - (rxt-seq - `(seq ,@(mapcar #'rxt-adt->rx (rxt-seq-elts re)))) - - (rxt-choice - `(or ,@(mapcar #'rxt-adt->rx (rxt-choice-elts re)))) - - (rxt-submatch - (if (rxt-seq-p (rxt-submatch-body re)) - `(submatch - ,@(mapcar #'rxt-adt->rx (rxt-seq-elts (rxt-submatch-body re)))) - `(submatch ,(rxt-adt->rx (rxt-submatch-body re))))) - - (rxt-submatch-numbered - (if (rxt-seq-p (rxt-submatch-numbered-p re)) - `(,(rxt-rx-symbol 'submatch-n) - ,(rxt-submatch-numbered-n re) - ,@(mapcar #'rxt-adt->rx - (rxt-seq-elts - (rxt-submatch-numbered-body re)))) - `(,(rxt-rx-symbol 'submatch-n) - ,(rxt-submatch-numbered-n re) - ,(rxt-adt->rx (rxt-submatch-numbered-body re))))) - - (rxt-backref - (let ((n (rxt-backref-n re))) - (if (<= n 9) - `(backref ,(rxt-backref-n re)) - (rxt-error "Too many backreferences (%s)" n)))) - - (rxt-syntax-class - `(syntax ,(rxt-syntax-class-symbol re))) - - (rxt-char-category - `(category ,(rxt-char-category-symbol re))) - - (rxt-repeat - (let ((from (rxt-repeat-from re)) - (to (rxt-repeat-to re)) - (greedy (rxt-repeat-greedy re)) - (body (rxt-adt->rx (rxt-repeat-body re)))) - (if rxt-verbose-rx-translation - (let ((rx - (cond - ((and (zerop from) (null to)) - `(zero-or-more ,body)) - ((and (equal from 1) (null to)) - `(one-or-more ,body)) - ((and (zerop from) (equal to 1)) - `(zero-or-one ,body)) - ((null to) - `(>= ,from ,body)) - ((equal from to) - `(repeat ,from ,body)) - (t - `(** ,from ,to ,body))))) - (if greedy - (if rxt-explain - rx ; Readable but not strictly accurate. Fixme? - `(maximal-match ,rx)) - `(minimal-match ,rx))) - (cond - ((and (zerop from) (null to)) - `(,(if greedy '* '*?) ,body)) - ((and (equal from 1) (null to)) - `(,(if greedy '+ '+?) ,body)) - ((and (zerop from) (equal to 1)) - `(,(if greedy ? ??) ,body)) - ((null to) - `(>= ,from ,body)) - ((equal from to) - `(= ,from ,body)) - (t - `(** ,from ,to ,body)))))) - - (rxt-char-set-union - (let* ((case-fold (rxt-char-set-union-case-fold re)) - (re (rxt--simplify-char-set re case-fold)) - (chars (rxt-char-set-union-chars re)) - (ranges (rxt-char-set-union-ranges re)) - (classes (rxt-char-set-union-classes re)) - (case-fold (rxt-char-set-union-case-fold re))) - (if (and (null chars) (null ranges) (= 1 (length classes))) - (car classes) - `(any ,@chars ,@ranges ,@classes)))) - - (rxt-char-set-negation - `(not ,(rxt-adt->rx (rxt-char-set-negation-elt re)))) - - (t - (rxt-error "No RX translation of `%s'" (rxt-to-string re)))))) - - ;; Store source information on each fragment of the generated RX - ;; sexp for rxt-explain mode - (when rxt-explain - ;; Use gensyms to store unique source information for multiple - ;; occurrences of primitives like `bol' - (when (symbolp rx) - (setq rx (make-symbol (symbol-name rx)))) - (setf (rxt-location rx) (rxt-location re))) - rx)) - - -;;;; 'Unparser' to PCRE notation - -;;; Based on scsh/posixstr.scm in scsh - -;; To ensure that the operator precedence in the generated regexp does -;; what we want, we need to keep track of what kind of production is -;; returned from each step. Therefore these functions return a string -;; and a numeric "level" which lets the function using the generated -;; regexp know whether it has to be parenthesized: -;; -;; 0: an already parenthesized expression -;; -;; 1: a "piece" binds to any succeeding quantifiers -;; -;; 2: a "branch", or concatenation of pieces, needs parenthesizing to -;; bind to quantifiers -;; -;; 3: a "top", or alternation of branches, needs parenthesizing to -;; bind to quantifiers or to concatenation -;; -;; This idea is stolen straight out of the scsh implementation. - -(defun rxt-adt->pcre (re) - (cl-destructuring-bind (s _) (rxt-adt->pcre/lev re) s)) - -(defun rxt-adt->pcre/lev (re) - (cl-typecase re - (rxt-primitive - (let ((s (rxt-primitive-pcre re))) - (if s - (list s 1) - (rxt-error "No PCRE translation of `%s'" (rxt-to-string re))))) - - (rxt-string (rxt-string->pcre re)) - (rxt-seq (rxt-seq->pcre re)) - (rxt-choice (rxt-choice->pcre re)) - (rxt-submatch (rxt-submatch->pcre re)) - (rxt-backref - (list (format "\\%d" (rxt-backref-n re)) 1)) - (rxt-repeat (rxt-repeat->pcre re)) - - ((or rxt-char-set-union rxt-char-set-negation) - (rxt-char-set->pcre re)) - - ;; FIXME - ;; ((rxt-char-set-intersection re) (rxt-char-set-intersection->pcre re)) - - (t - (rxt-error "No PCRE translation of `%s'" (rxt-to-string re))))) - -(defconst rxt-pcre-metachars (rx (any "\\^.$|()[]*+?{}"))) -(defconst rxt-pcre-charset-metachars (rx (any "]" "[" "\\" "^" "-"))) - -(defun rxt-string->pcre (re) - (let ((chars (rxt-string-chars re))) - (list - (replace-regexp-in-string - rxt-pcre-metachars - "\\\\\\&" chars) - ;; A one-character string is a 'piece' (it binds to a following - ;; quantifier). A longer string is a 'branch' (it has to be - ;; enclosed in parentheses to bind to a following quantifier). - (if (> (length chars) 1) 2 1)))) - -(defun rxt-seq->pcre (re) - (let ((elts (rxt-seq-elts re))) - (if (null elts) - "" - (rxt-seq-elts->pcre elts)))) - -(defun rxt-seq-elts->pcre (elts) - (cl-destructuring-bind - (s lev) (rxt-adt->pcre/lev (car elts)) - (if (null (cdr elts)) - (list s lev) - (cl-destructuring-bind - (s1 lev1) (rxt-seq-elts->pcre (cdr elts)) - (list (concat (rxt-paren-if-necessary s lev) - (rxt-paren-if-necessary s1 lev1)) - 2))))) - -(defun rxt-paren-if-necessary (s lev) - (if (< lev 3) - s - (concat "(?:" s ")"))) - -(defun rxt-choice->pcre (re) - (let ((elts (rxt-choice-elts re))) - (if (null elts) - nil - (rxt-choice-elts->pcre elts)))) - -(defun rxt-choice-elts->pcre (elts) - (cl-destructuring-bind - (s lev) (rxt-adt->pcre/lev (car elts)) - (if (null (cdr elts)) - (list s lev) - (cl-destructuring-bind - (s1 lev1) (rxt-choice-elts->pcre (cdr elts)) - (list (concat s "|" s1) 3))))) - -(defun rxt-submatch->pcre (re) - (cl-destructuring-bind - (s lev) (rxt-adt->pcre/lev (rxt-submatch-body re)) - (list (concat "(" s ")") 0))) - -(defun rxt-repeat->pcre (re) - (let ((from (rxt-repeat-from re)) - (to (rxt-repeat-to re)) - (body (rxt-repeat-body re)) - (greedy (rxt-repeat-greedy re))) - (cl-destructuring-bind - (s lev) (rxt-adt->pcre/lev body) - (cond - ((and to (= from 1) (= to 1)) (list s lev)) - ((and to (= from 0) (= to 0)) (list "" 2)) - (t - (when (> lev 1) ; parenthesize non-atoms - (setq s (concat "(?:" s ")") - lev 0)) - (list (if to - (cond ((and (= from 0) (= to 1)) - (concat s (if greedy "?" "??"))) - ((= from to) - (concat s "{" (number-to-string to) "}")) - (t - (concat s "{" (number-to-string from) - "," (number-to-string to) "}"))) - (cond ((= from 0) - (concat s (if greedy "*" "*?"))) - ((= from 1) - (concat s (if greedy "+" "+?"))) - (t (concat s "{" (number-to-string from) ",}")))) - 1)))))) - -(defun rxt-char-set->pcre (re) - (cond ((rxt-char-set-union-p re) - (list - (concat "[" (rxt-char-set->pcre/chars re) "]") 1)) - - ((rxt-char-set-negation-p re) - (let ((elt (rxt-char-set-negation-elt re))) - (if (rxt-char-set-union-p elt) - (list - (concat "[^" (rxt-char-set->pcre/chars elt) "]") 1) - (rxt-error "No PCRE translation of `%s'" (rxt-to-string elt))))) - - (t - (rxt-error "Non-char-set in rxt-char-set->pcre: %s" re)))) - -;; Fortunately, easier in PCRE than in POSIX! -(defun rxt-char-set->pcre/chars (re) - (cl-flet - ((escape - (char) - (let ((s (char-to-string char))) - (cond ((string-match rxt-pcre-charset-metachars s) - (concat "\\" s)) - - ((and (not (string= s " ")) - (string-match "[^[:graph:]]" s)) - (format "\\x{%x}" char)) - - (t s))))) - - (let ((chars (rxt-char-set-union-chars re)) - (ranges (rxt-char-set-union-ranges re)) - (classes (rxt-char-set-union-classes re))) - - (concat - (mapconcat #'escape chars "") - (mapconcat #'(lambda (rg) - (format "%s-%s" - (escape (car rg)) - (escape (cdr rg)))) - ranges "") - (mapconcat #'(lambda (class) - (format "[:%s:]" class)) - classes ""))))) - - -;;;; Generate all productions of a finite regexp - -(defun rxt-adt->strings (re) - (cl-typecase re - (rxt-primitive - (list "")) - (rxt-string - (list (rxt-string-chars re))) - (rxt-seq - (rxt-seq-elts->strings (rxt-seq-elts re))) - (rxt-choice - (rxt-choice-elts->strings (rxt-choice-elts re))) - (rxt-submatch - (rxt-adt->strings (rxt-submatch-body re))) - (rxt-submatch-numbered - (rxt-adt->strings (rxt-submatch-numbered-body re))) - (rxt-repeat - (rxt-repeat->strings re)) - (rxt-char-set-union - (rxt-char-set->strings re)) - (t - (error "Can't generate productions of %s" - (rxt-syntax-tree-readable re))))) - -(defun rxt-concat-product (heads tails) - (cl-mapcan - (lambda (hs) - (mapcar - (lambda (ts) (concat hs ts)) - tails)) - heads)) - -(defun rxt-seq-elts->strings (elts) - (if (null elts) - '("") - (let ((heads (rxt-adt->strings (car elts))) - (tails (rxt-seq-elts->strings (cdr elts)))) - (rxt-concat-product heads tails)))) - -(defun rxt-choice-elts->strings (elts) - (if (null elts) - '() - (append (rxt-adt->strings (car elts)) - (rxt-choice-elts->strings (cdr elts))))) - -(defun rxt-repeat->strings (re) - (let ((from (rxt-repeat-from re)) - (to (rxt-repeat-to re))) - (if (not to) - (error "Can't generate all productions of unbounded repeat \"%s\"" - (rxt-syntax-tree-readable re)) - (let ((strings (rxt-adt->strings (rxt-repeat-body re)))) - (rxt-repeat-n-m->strings from to strings))))) - -(defun rxt-repeat-n-m->strings (from to strings) - (cond - ((zerop to) '("")) - ((= to from) (rxt-repeat-n->strings from strings)) - (t ; to > from - (let* ((strs-n (rxt-repeat-n->strings from strings)) - (accum (cl-copy-list strs-n))) - (dotimes (i (- to from)) - (setq strs-n (rxt-concat-product strs-n strings)) - (setq accum (nconc accum strs-n))) - accum)))) - -(defun rxt-repeat-n->strings (n strings) - ;; n > 1 - (cond ((zerop n) '("")) - ((= n 1) strings) - (t - (rxt-concat-product - (rxt-repeat-n->strings (- n 1) strings) - strings)))) - -(defun rxt-char-set->strings (re) - (if (rxt-char-set-union-classes re) - (error "Can't generate all productions of named character classes in \"%s\"" - (rxt-syntax-tree-readable re)) - (let ((chars (mapcar #'char-to-string (rxt-char-set-union-chars re)))) - (dolist (range (rxt-char-set-union-ranges re)) - (let ((end (cdr range))) - (cl-do ((i (car range) (+ i 1))) - ((> i end)) - (push (char-to-string i) chars)))) - chars))) - - -;;;; RE-Builder hacks - -(defadvice reb-update-modestring - (after rxt () activate compile) - "This function is hacked for emulated PCRE syntax and regexp conversion." - (setq reb-mode-string - (concat - (format " (%s)" reb-re-syntax) - reb-mode-string)) - (force-mode-line-update)) - -(defadvice reb-change-syntax - (around rxt (&optional syntax) activate compile) - "This function is hacked for emulated PCRE syntax and regexp conversion." - (interactive - (list (intern - (completing-read (format "Select syntax (%s): " reb-re-syntax) - '(read string pcre sregex rx) - nil t "" nil (symbol-name reb-re-syntax))))) - (unless (memq syntax '(read string pcre lisp-re sregex rx)) - (error "Invalid syntax: %s" syntax)) - (let ((re-builder-buffer (get-buffer reb-buffer))) - (setq reb-re-syntax syntax) - (when re-builder-buffer - (with-current-buffer reb-target-buffer - (cl-case syntax - (rx - (let ((rx (rxt-elisp-to-rx reb-regexp))) - (setq reb-regexp-src - (with-temp-buffer - (insert "\n" "'") - (rxt-print rx) - (buffer-string))))) - (pcre - (setq reb-regexp-src (rxt-elisp-to-pcre reb-regexp))))) - (with-current-buffer re-builder-buffer - ;; Hack: prevent reb-auto-update from clobbering the - ;; reb-regexp-src we just set - (let ((inhibit-modification-hooks t)) - (reb-initialize-buffer)) - ;; Enable flag-toggling bindings for PCRE syntax - (rxt--re-builder-switch-pcre-mode))))) - -(defadvice reb-read-regexp - (around rxt () activate compile) - "This function is hacked for emulated PCRE syntax and regexp conversion." - (if (eq reb-re-syntax 'pcre) - (setq ad-return-value - (save-excursion - (goto-char (point-min)) - (rxt-read-delimited-pcre))) - ad-do-it)) - -(defadvice reb-insert-regexp - (around rxt () activate compile) - "This function is hacked for emulated PCRE syntax and regexp conversion." - (if (eq reb-re-syntax 'pcre) - (let ((src (reb-target-binding reb-regexp-src))) - (if src - (insert "\n/" (replace-regexp-in-string "/" "\\/" src t t) "/") - (insert "\n//"))) - ad-do-it)) - -(defadvice reb-cook-regexp - (around rxt (re) activate compile) - "This function is hacked for emulated PCRE syntax and regexp conversion." - (if (eq reb-re-syntax 'pcre) - (setq ad-return-value (rxt-pcre-to-elisp re)) - ad-do-it)) - -(defadvice reb-update-regexp - (around rxt () activate compile) - "This function is hacked for emulated PCRE syntax and regexp conversion." - (setq ad-return-value - (let* ((re-src (reb-read-regexp)) - (re (reb-cook-regexp re-src))) - (with-current-buffer reb-target-buffer - (let ((oldre reb-regexp)) - (prog1 - (not (string= oldre re)) - (setq reb-regexp re) - ;; Update the source re if format requires translation - (when (or (reb-lisp-syntax-p) (eq reb-re-syntax 'pcre)) - (setq reb-regexp-src re-src)))))))) - -(defun rxt--re-builder-switch-pcre-mode () - (rxt--read-pcre-mode - (if (eq reb-re-syntax 'pcre) 1 0))) - -(add-hook 'reb-mode-hook #'rxt--re-builder-switch-pcre-mode) - -(provide 'rxt) -(provide 'pcre2el) - - -;;; pcre2el.el ends here |