Fix casefold (#150)

nitely · Dec 30, 2024 · 9ccc25f · 9ccc25f
1 parent d4e6b73
commit 9ccc25f
Show file tree

Hide file tree

Showing 6 changed files with 37 additions and 17 deletions.
diff --git a/.gitignore b/.gitignore
@@ -14,3 +14,4 @@ tests/test_bug
 docs/ugh
 bin/*
 bench/bench
 config.nims
diff --git a/regex.nimble b/regex.nimble
@@ -8,7 +8,7 @@ srcDir = "src"
 skipDirs = @["tests", "bench", "docs"]
 
 requires "nim >= 1.6.0"
-requires "unicodedb >= 0.7.2"
 requires "unicodedb >= 0.13.1"
 
 template execTest(lang, target: static string) =
   doAssert lang in ["c", "js"]

diff --git a/src/regex/exptransformation.nim b/src/regex/exptransformation.nim
@@ -3,13 +3,13 @@ import std/sets
 import std/tables
 import std/algorithm
 
 import pkg/unicodedb/casing
 
 import ./exptype
 import ./types
 import ./common
 import ./scanner
 
-# todo: can not use unicodeplus due to
-# https://github.com/nim-lang/Nim/issues/7059
 func swapCase(r: Rune): Rune =
   # Note a character can be
   # non-lower and non-upper
@@ -178,10 +178,12 @@ func applyFlag(n: var Node, f: Flag) =
     else:
       discard
   of flagCaseInsensitive:
-    if n.kind == reChar and n.cp != n.cp.swapCase():
     if n.kind == reChar and n.cp.hasCaseFolds:
       n.kind = reCharCI
       n.cp = n.cp.simpleCaseFold
     # todo: apply recursively to
     #       shorthands of reInSet/reNotSet (i.e: [:ascii:])
     # XXX add all casefolds that map to the cp instead of swapCase
     if n.kind in {reInSet, reNotSet}:
       var cps = newSeq[Rune]()
       for cp in items n.cps:
@@ -190,9 +192,8 @@ func applyFlag(n: var Node, f: Flag) =
           cps.add cp2
       n.cps.add cps
       for sl in n.ranges[0 .. ^1]:
-        let
-          cpa = sl.a.swapCase()
-          cpb = sl.b.swapCase()
         let cpa = sl.a.swapCase()
         let cpb = sl.b.swapCase()
         if sl.a != cpa and sl.b != cpb:
           n.ranges.add(cpa .. cpb)
   of flagUnGreedy:

diff --git a/src/regex/nfamacro.nim b/src/regex/nfamacro.nim
@@ -6,6 +6,7 @@ import std/tables
 import std/sets
 import std/algorithm
 
 import pkg/unicodedb/casing
 import pkg/unicodedb/properties
 import pkg/unicodedb/types as utypes
 
@@ -124,7 +125,8 @@ func genMatch(c: NimNode, n: Node): NimNode =
       quote do: true
     of reCharCI:
       let cp2Lit = newLit n.cp.swapCase().int32
-      quote do: `c` == `cpLit` or `c` == `cp2Lit`
       let cp3Lit = newLit n.cp.simpleCaseFold().int32
       quote do: `c` == `cpLit` or `c` == `cp2Lit` or simpleCaseFold(`c`) == Rune(`cp3Lit`)
     of reWordAscii:
       genWordAsciiMatch(c)
     of reNotAlphaNumAscii:

diff --git a/src/regex/nodematch.nim b/src/regex/nodematch.nim
@@ -1,5 +1,6 @@
 import std/unicode except `==`
 
 import pkg/unicodedb/casing
 import pkg/unicodedb/properties
 import pkg/unicodedb/types as utypes
 
@@ -97,14 +98,6 @@ func isDigitAscii(r: Rune): bool {.inline.} =
   else:
     false
 
-# todo: can not use unicodeplus due to
-# https://github.com/nim-lang/Nim/issues/7059
-func swapCase*(r: Rune): Rune =
-  result = r.toLower()
-  if result != r:
-    return
-  result = r.toUpper()
-
 func matchAsciiSet(n: Node, r: Rune): bool =
   assert n.shorthands.len == 0
   result = r in n.cps or
@@ -162,7 +155,7 @@ func match*(n: Node, r: Rune): bool {.inline.} =
   of reNotWhiteSpace: not r.isWhiteSpace()
   of reAny: r != lineBreakRune
   of reAnyNL: true
-  of reCharCI: r == n.cp or r == n.cp.swapCase()
   of reCharCI: r == n.cp or n.cp == r.simpleCaseFold
   of reUCC: r.unicodeCategory() in n.cc
   of reNotUCC: r.unicodeCategory() notin n.cc
   of reWordAscii: r.isWordAscii()

diff --git a/tests/tests_misc.nim b/tests/tests_misc.nim
@@ -70,6 +70,7 @@ func findAllCapt(s: string, reg: Regex2): seq[seq[Slice[int]]] =
   result = map(
     findAll(s, reg),
     func (m: RegexMatch2): seq[Slice[int]] =
       result = newSeq[Slice[int]]()
       for i in 0 .. m.groupsCount-1:
         result.add m.group(i))
 
@@ -696,3 +697,25 @@ test "rust_regression":
   check findAllBounds(r"hiya \N{snowman} bye", re2"(\\N\{[^}] })|([{}])") == @[5 .. 15]
   check findAllCapt(r"hiya \N{snowman} bye", re2"(\\N\{[^}] })|([{}])") ==
     @[@[5 .. 15, nonCapture]]
 
 # https://github.com/BurntSushi/rebar/pull/20
 test "rebar":
   # Case-insensitive (regexCaseless) regression checks. "ſ" (long s)
   # must match "s" and "S" whichever side is the pattern and
   # whichever is the text.
   block:
     # Whole-string matches, both directions and both letter cases.
     check match("ſ", re2(r"s", {regexCaseless}))
     check match("s", re2(r"ſ", {regexCaseless}))
     check match("ſ", re2(r"S", {regexCaseless}))
     check match("S", re2(r"ſ", {regexCaseless}))
     # "ſ" is two bytes long, so a match on it is expected to span
     # 0 .. 1, while the one-byte "s"/"S" span 0 .. 0.
     check "ſ".len == 2
     check findAllBounds("ſ", re2(r"s", {regexCaseless})) == @[0 .. 1]
     check findAllBounds("s", re2(r"ſ", {regexCaseless})) == @[0 .. 0]
     check findAllBounds("ſ", re2(r"S", {regexCaseless})) == @[0 .. 1]
     check findAllBounds("S", re2(r"ſ", {regexCaseless})) == @[0 .. 0]
     # XXX fix: the same pairs inside a character set are disabled;
     # presumably sets do not apply full case folding yet -- confirm
     # before re-enabling.
     #check match("s", re2(r"[ſ]", {regexCaseless}))
     #check match("ſ", re2(r"[s]", {regexCaseless}))
     # Plain ASCII letters match across cases, and "@" under the same
     # flag still matches itself.
     check match("a", re2(r"A", {regexCaseless}))
     check match("A", re2(r"a", {regexCaseless}))
     check match("@", re2(r"@", {regexCaseless}))
     check findAllBounds("a", re2(r"A", {regexCaseless})) == @[0 .. 0]
     check findAllBounds("A", re2(r"a", {regexCaseless})) == @[0 .. 0]
     check findAllBounds("@", re2(r"@", {regexCaseless})) == @[0 .. 0]