Verify input is utf-8 (#131)

nitely · Oct 22, 2023 · 801ab3a · 801ab3a
1 parent 0fd81c5
commit 801ab3a
Show file tree

Hide file tree

Showing 5 changed files with 115 additions and 35 deletions.
diff --git a/src/regex.nim b/src/regex.nim
@@ -352,6  352,13 @@ export
 
 const reNonCapture* = nonCapture
 
 template debugCheckUtf8(s: untyped): untyped =
   ## Debug-only utf-8 validation for input strings.
   ## Regex patterns do not need it; they are already checked.
   ## On release/danger the behaviour on invalid utf-8 input
   ## is undefined
   ##
   ## Expands to an assertion that `verifyUtf8(s)` returns `-1`.
   ## The whole check is compiled out when `release` is defined.
   when not defined(release):
     # Callers can recognise this failure by the
     # "Invalid utf-8 input" message text.
     assert(verifyUtf8(s) == -1, "Invalid utf-8 input")
 
 when canUseMacro:
   func rex*(s: string): RegexLit =
     ## Raw regex literal string
@@ -462,9  469,11 @@ func match*(
     doAssert "abcd".match(re2"abcd", m)
     doAssert not "abcd".match(re2"abc", m)
 
   debugCheckUtf8 s
   result = matchImpl(s, toRegex(pattern), m, start)
 
 func match*(s: string, pattern: Regex2): bool {.inline, raises: [].} =
   ## Overload for callers that do not need match data: forwards to
   ## `matchImpl` with a throwaway `RegexMatch2` and returns its result.
   ## Unless `release` is defined, `s` is first asserted to be
   ## valid utf-8.
   debugCheckUtf8(s)
   var scratch: RegexMatch2
   matchImpl(s, toRegex(pattern), scratch)
 
@@ -496,6  505,7 @@ iterator findAll*(
     doAssert bounds == @[1 .. 2, 4 .. 5]
     doAssert found == @["bc", "bc"]
 
   debugCheckUtf8 s
   var i = start
   var i2 = start-1
   var m: RegexMatch2
@@ -534,6  544,7 @@ iterator findAllBounds*(
       bounds.add bd
     doAssert bounds == @[1 .. 2, 4 .. 5]
 
   debugCheckUtf8 s
   var i = start
   var i2 = start-1
   var ms: RegexMatches2
@@ -598,6  609,7 @@ iterator split*(s: string, sep: Regex2): string {.inline, raises: [].} =
       found.add s
     doAssert found == @["", "a", "Ϊ", "Ⓐ", "弢", ""]
 
   debugCheckUtf8 s
   var
     first, last, i = 0
     i2 = -1
@@ -632,6  644,7 @@ func splitIncl*(s: string, sep: Regex2): seq[string] {.inline, raises: [].} =
     doAssert parts == expected
 
   template ab: untyped = m.boundaries
   debugCheckUtf8 s
   var
     first, last, i = 0
     i2 = -1
@@ -662,6  675,7 @@ func startsWith*(
     doAssert "abc".startsWith(re2"\w")
     doAssert not "abc".startsWith(re2"\d")
 
   debugCheckUtf8 s
   startsWithImpl2(s, toRegex(pattern), start)
 
 template runeIncAt(s: string, n: var int) =
@@ -680,6  694,7 @@ func endsWith*(s: string, pattern: Regex2): bool {.inline, raises: [].} =
     doAssert "abc".endsWith(re2"\w")
     doAssert not "abc".endsWith(re2"\d")
 
   debugCheckUtf8 s
   result = false
   var
     m: RegexMatch2
@@ -732,7  747,8 @@ func replace*(
     doAssert "Nim is awesome!".replace(re2"(\w\B)", "$1_") ==
       "N_i_m i_s a_w_e_s_o_m_e!"
 
-  result = ""
   debugCheckUtf8 s
   result = newStringOfCap(s.len)
   var
     i, j = 0
     capts = newSeqOfCap[string](toRegex(pattern).groupsCount)
@@ -772,7  788,8 @@ func replace*(
     let text = "**this is a test**"
     doAssert text.replace(re2"(\*)", removeStars) == "this is a test"
 
-  result = ""
   debugCheckUtf8 s
   result = newStringOfCap(s.len)
   var i, j = 0
   for m in findAll(s, pattern):
     result.addsubstr(s, i, m.boundaries.a-1)
@@ -800,7  817,8 @@ func escapeRe*(s: string): string {.raises: [].} =
   #
   # utf-8 ascii code-points cannot be part of multi-byte
   # code-points, so we can read/match byte by byte
-  result = ""
   debugCheckUtf8 s
   result = newStringOfCap(s.len)
   for c in s:
     case c
     of ' ', '#', '$', '&', '(',
@@ -950,9  968,11 @@ func match*(
   m: var RegexMatch,
   start = 0
 ): bool {.inline, raises: [], deprecated: "use match(string, Regex2, var RegexMatch2) instead".} =
   debugCheckUtf8 s
   result = matchImpl(s, pattern, m, start)
 
 func match*(s: string, pattern: Regex): bool {.inline, raises: [], deprecated: "use match(string, Regex2) instead".} =
   ## Deprecated `Regex` overload: forwards to `matchImpl` with a
   ## throwaway `RegexMatch` and returns its result.
   ## Unless `release` is defined, `s` is first asserted to be
   ## valid utf-8.
   debugCheckUtf8(s)
   var scratch: RegexMatch
   matchImpl(s, pattern, scratch)
 
@@ -961,6  981,7 @@ iterator findAll*(
   pattern: Regex,
   start = 0
 ): RegexMatch {.inline, raises: [], deprecated: "use findAll(string, Regex2) instead".} =
   debugCheckUtf8 s
   var i = start
   var i2 = start-1
   var m: RegexMatch
@@ -989,6  1010,7 @@ iterator findAllBounds*(
   pattern: Regex,
   start = 0
 ): Slice[int] {.inline, raises: [], deprecated: "use findAllBounds(string, Regex2) instead".} =
   debugCheckUtf8 s
   var i = start
   var i2 = start-1
   var ms: RegexMatches
@@ -1036,6  1058,7 @@ func find*(
   return false
 
 iterator split*(s: string, sep: Regex): string {.inline, raises: [], deprecated: "use split(string, Regex2) instead".} =
   debugCheckUtf8 s
   var
     first, last, i = 0
     i2 = -1
@@ -1058,6  1081,7 @@ func split*(s: string, sep: Regex): seq[string] {.inline, raises: [], deprecated
 
 func splitIncl*(s: string, sep: Regex): seq[string] {.inline, raises: [], deprecated: "use splitIncl(string, Regex2) instead".} =
   template ab: untyped = m.boundaries
   debugCheckUtf8 s
   var
     first, last, i = 0
     i2 = -1
@@ -1082,10  1106,11 @@ func splitIncl*(s: string, sep: Regex): seq[string] {.inline, raises: [], deprec
 func startsWith*(
   s: string, pattern: Regex, start = 0
 ): bool {.inline, raises: [], deprecated: "use startsWith(string, Regex2) instead".} =
   ## Deprecated `Regex` overload: returns whatever `startsWithImpl`
   ## reports for `s`, `pattern` and `start`.
   ## Unless `release` is defined, `s` is first asserted to be
   ## valid utf-8.
   debugCheckUtf8(s)
   result = startsWithImpl(s, pattern, start)
 
-# XXX use findAll and check last match bounds
 func endsWith*(s: string, pattern: Regex): bool {.inline, raises: [], deprecated: "use endsWith(string, Regex2) instead".} =
   debugCheckUtf8 s
   result = false
   var
     m: RegexMatch
@@ -1121,6  1146,7 @@ func replace*(
   by: string,
   limit = 0
 ): string {.inline, raises: [ValueError], deprecated: "use replace(string, Regex2, string) instead".} =
   debugCheckUtf8 s
   result = ""
   var
     i, j = 0
@@ -1145,7  1171,8 @@ func replace*(
   pattern: Regex,
   by: proc (m: RegexMatch, s: string): string,
   limit = 0
-): string {.inline, raises: [], effectsOf: by, deprecated: "use replace(string, Regex2, proc(RegexMatch2, string) :string) instead".} =
 ): string {.inline, raises: [], effectsOf: by, deprecated: "use replace(string, Regex2, proc(RegexMatch2, string): string) instead".} =
   debugCheckUtf8 s
   result = ""
   var i, j = 0
   for m in findAll(s, pattern):
@@ -1439,6  1466,7 @@ when isMainModule:
     doAssert re2"\w" in "弢"
     doAssert "2222".find(re2"(22)*", m) and
       m.group(0) == 2 .. 3
     doAssert raisesMsg("\xff") == "Invalid utf-8 regex"
     doAssert raisesMsg(r"[a-\w]") ==
       "Invalid set range. Range can't contain " &
       "a character-class or assertion\n" &

diff --git a/src/regex/common.nim b/src/regex/common.nim
@@ -69,27  69,46 @@ proc `%%`*(
 proc `%%`*(formatstr: string, a: string): string =
   formatstr %% [a]
 
-# XXX this is to support literal optimization
-#     for unicode. It needs testing
-when false:
-  # XXX impl simpler find when memchr is not available?
-  func find*(s: string, r: Rune, start: Natural = 0): int =
-    ## Find unicode rune in a string.
-    if r.ord < 0xff:
-      return find(s, r.char, start)
-    let c = (r.ord and 0xff).char
-    let rsize = r.size()
-    var i = start rsize-1
-    var r2 = 0'u32
-    doAssert rsize >= 1 and rsize <= 4
-    while i < len(s):
-      i = find(s, c, i)
-      if i == -1:
-        return -1
-      for j in i-rsize-1 .. i:
-        r2 = (r2 shl 8) or s[j].uint32
-      if r.uint32 == r2:
-        return i-rsize-1
-      r2 = 0
-      inc i
-    return -1
 type
   verifyUtf8State = enum
     vusError, vusStart, vusA, vusB, vusC, vusD, vusE, vusF, vusG

 # Taken from nim-unicodeplus
 func verifyUtf8*(s: string): int =
   ## Check that `s` is well-formed utf-8.
   ## Yields `-1` for a valid string; otherwise the index at which
   ## the first malformed or truncated sequence begins.
   var state = vusStart
   for pos in 0 ..< s.len:
     let b = uint8(s[pos])
     case state
     of vusStart:
       # Remember where this sequence begins; that index is what
       # gets reported if the sequence turns out to be invalid.
       result = pos
       state =
         case b
         of 0x00'u8 .. 0x7F'u8: vusStart
         of 0xC2'u8 .. 0xDF'u8: vusA
         of 0xE1'u8 .. 0xEC'u8, 0xEE'u8 .. 0xEF'u8: vusB
         of 0xE0'u8: vusC
         of 0xED'u8: vusD
         of 0xF1'u8 .. 0xF3'u8: vusE
         of 0xF0'u8: vusF
         of 0xF4'u8: vusG
         else: vusError
     of vusA:
       # Last continuation byte of the sequence.
       state = if b in 0x80'u8 .. 0xBF'u8: vusStart else: vusError
     of vusB:
       state = if b in 0x80'u8 .. 0xBF'u8: vusA else: vusError
     of vusC:
       # Second byte after a 0xE0 lead is restricted to A0..BF.
       state = if b in 0xA0'u8 .. 0xBF'u8: vusA else: vusError
     of vusD:
       # Second byte after a 0xED lead is restricted to 80..9F.
       state = if b in 0x80'u8 .. 0x9F'u8: vusA else: vusError
     of vusE:
       state = if b in 0x80'u8 .. 0xBF'u8: vusB else: vusError
     of vusF:
       # Second byte after a 0xF0 lead is restricted to 90..BF.
       state = if b in 0x90'u8 .. 0xBF'u8: vusB else: vusError
     of vusG:
       # Second byte after a 0xF4 lead is restricted to 80..8F.
       state = if b in 0x80'u8 .. 0x8F'u8: vusB else: vusError
     of vusError:
       break
   # Only a scan that finishes between sequences is valid; ending in
   # any other state keeps the recorded start index as the result.
   if state == vusStart:
     result = -1
diff --git a/src/regex/compiler.nim b/src/regex/compiler.nim
@@ -1,3  1,4 @@
 import ./common
 import ./parser
 import ./exptransformation
 import ./types
@@ -8,6  9,8 @@ when defined(regexDotDir):
   import ./dotgraph
 
 func reImpl*(s: string): Regex {.inline.} =
   if verifyUtf8(s) != -1:
     raise newException(RegexError, "Invalid utf-8 regex")
   var groups: GroupsCapture
   let rpn = s
     .parse

diff --git a/tests/tests.nim b/tests/tests.nim
@@ -1126,7  1126,7 @@ test "tstarts_with":
   check(not "abc".startsWith(re"bc"))
   check startsWith("弢ⒶΪ", re"弢Ⓐ")
   check startsWith("弢", re("\xF0\xAF\xA2\x94"))
-  check(not startsWith("弢", re("\xF0\xAF\xA2")))
   #check(not startsWith("弢", re("\xF0\xAF\xA2")))
   check "abc".startsWith(re"\w")
   check(not "abc".startsWith(re"\d"))
   check "abc".startsWith(re"(a|b)")
@@ -1142,7  1142,7 @@ test "tends_with":
   check(not "abc".endsWith(re"ab"))
   check endsWith("弢ⒶΪ", re"ⒶΪ")
   check endsWith("弢", re("\xF0\xAF\xA2\x94"))
-  check(not endsWith("弢", re("\xAF\xA2\x94")))
   #check(not endsWith("弢", re("\xAF\xA2\x94")))
   check "abc".endsWith(re"(b|c)")
   check "ab".endsWith(re"(b|c)")
   check(not "a".endsWith(re"(b|c)"))
@@ -2475,7  2475,7 @@ test "escapeRe":
   check match("$", re(escapeRe"$"))
   block:
     var s = ""
-    for c in 0 .. 255:
     for c in 0 .. 127:
       s.add c.char
     discard re(escapeRe(s))
 

diff --git a/tests/tests2.nim b/tests/tests2.nim
@@ -1,5  1,6 @@
 from std/unicode import runeLen
 from std/sequtils import map
 from std/strutils import contains
 
 import ../src/regex
 
@@ -1499,7  1500,7 @@ test "tstarts_with":
   check(not "abc".startsWith(re2"bc"))
   check startsWith("弢ⒶΪ", re2"弢Ⓐ")
   check startsWith("弢", re2("\xF0\xAF\xA2\x94"))
-  check(not startsWith("弢", re2("\xF0\xAF\xA2")))
   #check(not startsWith("弢", re2("\xF0\xAF\xA2")))
   check "abc".startsWith(re2"\w")
   check(not "abc".startsWith(re2"\d"))
   check "abc".startsWith(re2"(a|b)")
@@ -1515,7  1516,7 @@ test "tends_with":
   check(not "abc".endsWith(re2"ab"))
   check endsWith("弢ⒶΪ", re2"ⒶΪ")
   check endsWith("弢", re2("\xF0\xAF\xA2\x94"))
-  check(not endsWith("弢", re2("\xAF\xA2\x94")))
   #check(not endsWith("弢", re2("\xAF\xA2\x94")))
   check "abc".endsWith(re2"(b|c)")
   check "ab".endsWith(re2"(b|c)")
   check(not "a".endsWith(re2"(b|c)"))
@@ -2914,7  2915,7 @@ test "escapere2":
   check match("$", re2(escapeRe"$"))
   block:
     var s = ""
-    for c in 0 .. 255:
     for c in 0 .. 127:
       s.add c.char
     discard re2(escapeRe(s))
 
@@ -3025,3  3026,32 @@ test "tlookaround_captures":
     m.captures == @[0 .. 0, 1 .. 3, nonCapture, nonCapture]
   check match("aaab", re2"(\w)(\w )|\w (?<=^(\w)(\w)(\w ))b", m) and
     m.captures == @[0 .. 0, 1 .. 3, nonCapture, nonCapture, nonCapture]
 
 # Alias for the exception type caught by `raisesInvalidUtf8`:
 # `AssertionDefect` on Nim >= 2.0, `AssertionError` on older compilers.
 when (NimMajor, NimMinor) >= (2, 0):
   type MyAssertionDefect = ref AssertionDefect
 else:
   type MyAssertionDefect = ref AssertionError
 
 template raisesInvalidUtf8(exp: untyped): untyped =
   ## Evaluate `exp` and expect it to fail with an assertion whose
   ## message contains "Invalid utf-8 input".
   try:
     discard exp
     # Reaching this line means `exp` did not raise: record a failure.
     check false
   except MyAssertionDefect:
     check "Invalid utf-8 input" in getCurrentExceptionMsg()
 
 test "tverifyutf8":
   # Pattern side: the invalid byte 0xFF is expected to produce the
   # regex-specific message (presumably via compiling the pattern
   # inside `raisesMsg` -- confirm against its definition).
   check raisesMsg("\xff") == "Invalid utf-8 regex"
   # Input side: each call below is handed the invalid input "\xff"
   # and must fail with the "Invalid utf-8 input" assertion.
   raisesInvalidUtf8 match("\xff", re2"abc")
   block:
     var m: RegexMatch2
     raisesInvalidUtf8 match("\xff", re2"abc", m)
   raisesInvalidUtf8 findAll("\xff", re2"abc")
   raisesInvalidUtf8 findAllBounds("\xff", re2"abc")
   raisesInvalidUtf8 split("\xff", re2"abc")
   raisesInvalidUtf8 splitIncl("\xff", re2"abc")
   raisesInvalidUtf8 startsWith("\xff", re2"abc")
   raisesInvalidUtf8 endsWith("\xff", re2"abc")
   raisesInvalidUtf8 replace("\xff", re2"abc", "abc")
   # Callback overload of `replace`; the callback body is irrelevant.
   raisesInvalidUtf8 replace("\xff", re2"abc",
     (proc (m: RegexMatch2, s: string): string = discard))
   raisesInvalidUtf8 escapeRe("\xff")