diff options
author | Eelco Dolstra <edolstra@gmail.com> | 2017-08-16T19·21+0200 |
---|---|---|
committer | Eelco Dolstra <edolstra@gmail.com> | 2017-08-16T19·21+0200 |
commit | 2ee1b9359b26e0d75c0ee885549fee4ad87131d1 (patch) | |
tree | a7b100e984fafb09238dcb5fe8277011cbc1ca56 | |
parent | c2cab207320672fb6ed4af40a99fc9082ff55234 (diff) | |
parent | b8867a0239b1930a16f9ef3f7f3e864b01416dff (diff) |
Merge branch 'tokenize' of https://github.com/nbp/nix
-rw-r--r-- | doc/manual/expressions/builtins.xml | 37 | ||||
-rw-r--r-- | src/libexpr/primops.cc | 68 | ||||
-rw-r--r-- | tests/lang/eval-okay-regex-split.nix | 48 |
3 files changed, 153 insertions, 0 deletions
diff --git a/doc/manual/expressions/builtins.xml b/doc/manual/expressions/builtins.xml index 86c36da1b328..615314880aba 100644 --- a/doc/manual/expressions/builtins.xml +++ b/doc/manual/expressions/builtins.xml @@ -873,6 +873,43 @@ builtins.sort builtins.lessThan [ 483 249 526 147 42 77 ] </varlistentry> + <varlistentry><term><function>builtins.split</function> + <replaceable>regex</replaceable> <replaceable>str</replaceable></term> + + <listitem><para>Returns a list composed of non matched strings interleaved + with the lists of the <link + xlink:href="http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html#tag_09_04">extended + POSIX regular expression</link> <replaceable>regex</replaceable> matches + of <replaceable>str</replaceable>. Each item in the lists of matched + sequences is a regex group. + +<programlisting> +builtins.split "(a)b" "abc" +</programlisting> + +Evaluates to <literal>[ "" [ "a" ] "c" ]</literal>. + +<programlisting> +builtins.split "([ac])" "abc" +</programlisting> + +Evaluates to <literal>[ "" [ "a" ] "b" [ "c" ] "" ]</literal>. + +<programlisting> +builtins.split "(a)|(c)" "abc" +</programlisting> + +Evaluates to <literal>[ "" [ "a" null ] "b" [ null "c" ] "" ]</literal>. + +<programlisting> +builtins.split "([[:upper:]]+)" " FOO " +</programlisting> + +Evaluates to <literal>[ " " [ "FOO" ] " " ]</literal>. + + </para></listitem> + </varlistentry> + <varlistentry><term><function>builtins.stringLength</function> <replaceable>e</replaceable></term> diff --git a/src/libexpr/primops.cc b/src/libexpr/primops.cc index 4e51e8ff2562..fcd3f8efee3f 100644 --- a/src/libexpr/primops.cc +++ b/src/libexpr/primops.cc @@ -1745,6 +1745,73 @@ static void prim_match(EvalState & state, const Pos & pos, Value * * args, Value } +/* Split a string with a regular expression, and return a list of the + non-matching parts interleaved by the lists of the matching groups. 
*/ +static void prim_split(EvalState & state, const Pos & pos, Value * * args, Value & v) +{ + auto re = state.forceStringNoCtx(*args[0], pos); + + try { + + std::regex regex(re, std::regex::extended); + + PathSet context; + const std::string str = state.forceString(*args[1], context, pos); + + auto begin = std::sregex_iterator(str.begin(), str.end(), regex); + auto end = std::sregex_iterator(); + + // Any match results are surrounded by non-matching results. + const size_t len = std::distance(begin, end); + state.mkList(v, 2 * len + 1); + size_t idx = 0; + Value * elem; + + if (len == 0) { + v.listElems()[idx++] = args[1]; + return; + } + + for (std::sregex_iterator i = begin; i != end; ++i) { + assert(idx <= 2 * len + 1 - 3); + std::smatch match = *i; + + // Add a string for non-matched characters. + elem = v.listElems()[idx++] = state.allocValue(); + mkString(*elem, match.prefix().str().c_str()); + + // Add a list for matched substrings. + const size_t slen = match.size() - 1; + elem = v.listElems()[idx++] = state.allocValue(); + + // Start at 1, because the first match is the whole string. + state.mkList(*elem, slen); + for (size_t si = 0; si < slen; ++si) { + if (!match[si + 1].matched) + mkNull(*(elem->listElems()[si] = state.allocValue())); + else + mkString(*(elem->listElems()[si] = state.allocValue()), match[si + 1].str().c_str()); + } + + // Add a string for non-matched suffix characters. 
+ if (idx == 2 * len) { + elem = v.listElems()[idx++] = state.allocValue(); + mkString(*elem, match.suffix().str().c_str()); + } + } + assert(idx == 2 * len + 1); + + } catch (std::regex_error &e) { + if (e.code() == std::regex_constants::error_space) { + // limit is _GLIBCXX_REGEX_STATE_LIMIT for libstdc++ + throw EvalError("memory limit exceeded by regular expression '%s', at %s", re, pos); + } else { + throw EvalError("invalid regular expression '%s', at %s", re, pos); + } + } +} + + static void prim_concatStringSep(EvalState & state, const Pos & pos, Value * * args, Value & v) { PathSet context; @@ -2039,6 +2106,7 @@ void EvalState::createBaseEnv() addPrimOp("__unsafeDiscardOutputDependency", 1, prim_unsafeDiscardOutputDependency); addPrimOp("__hashString", 2, prim_hashString); addPrimOp("__match", 2, prim_match); + addPrimOp("__split", 2, prim_split); addPrimOp("__concatStringsSep", 2, prim_concatStringSep); addPrimOp("__replaceStrings", 3, prim_replaceStrings); diff --git a/tests/lang/eval-okay-regex-split.nix b/tests/lang/eval-okay-regex-split.nix new file mode 100644 index 000000000000..0073e057787d --- /dev/null +++ b/tests/lang/eval-okay-regex-split.nix @@ -0,0 +1,48 @@ +with builtins; + +# Non capturing regex returns empty lists +assert split "foobar" "foobar" == ["" [] ""]; +assert split "fo*" "f" == ["" [] ""]; +assert split "fo+" "f" == ["f"]; +assert split "fo*" "fo" == ["" [] ""]; +assert split "fo*" "foo" == ["" [] ""]; +assert split "fo+" "foo" == ["" [] ""]; +assert split "fo{1,2}" "foo" == ["" [] ""]; +assert split "fo{1,2}" "fooo" == ["" [] "o"]; +assert split "fo*" "foobar" == ["" [] "bar"]; + +# Capturing regex returns a list of sub-matches +assert split "(fo*)" "f" == ["" ["f"] ""]; +assert split "(fo+)" "f" == ["f"]; +assert split "(fo*)" "fo" == ["" ["fo"] ""]; +assert split "(f)(o*)" "f" == ["" ["f" ""] ""]; +assert split "(f)(o*)" "foo" == ["" ["f" "oo"] ""]; +assert split "(fo+)" "foo" == ["" ["foo"] ""]; +assert split "(fo{1,2})" "foo" 
== ["" ["foo"] ""]; +assert split "(fo{1,2})" "fooo" == ["" ["foo"] "o"]; +assert split "(fo*)" "foobar" == ["" ["foo"] "bar"]; + +# Matches are greedy. +assert split "(o+)" "oooofoooo" == ["" ["oooo"] "f" ["oooo"] ""]; + +# Matches multiple times. +assert split "(b)" "foobarbaz" == ["foo" ["b"] "ar" ["b"] "az"]; + +# Split large strings containing newlines. null are inserted when a +# pattern within the current did not match anything. +assert split "[[:space:]]+|([',.!?])" '' + Nix Rocks! + That's why I use it. +'' == [ + "Nix" [ null ] "Rocks" ["!"] "" [ null ] + "That" ["'"] "s" [ null ] "why" [ null ] "I" [ null ] "use" [ null ] "it" ["."] "" [ null ] + "" +]; + +# Documentation examples +assert split "(a)b" "abc" == [ "" [ "a" ] "c" ]; +assert split "([ac])" "abc" == [ "" [ "a" ] "b" [ "c" ] "" ]; +assert split "(a)|(c)" "abc" == [ "" [ "a" null ] "b" [ null "c" ] "" ]; +assert split "([[:upper:]]+)" " FOO " == [ " " [ "FOO" ] " " ]; + +true |