Skip to content

Commit

Permalink
Verify input is utf-8 (#131)
Browse files Browse the repository at this point in the history
  • Loading branch information
nitely authored Oct 22, 2023
1 parent 0fd81c5 commit 801ab3a
Show file tree
Hide file tree
Showing 5 changed files with 115 additions and 35 deletions.
38 changes: 33 additions & 5 deletions src/regex.nim
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 352,13 @@ export

const reNonCapture* = nonCapture

template debugCheckUtf8(s: untyped): untyped =
  ## Debug-time guard for input strings (regex patterns are
  ## verified on their own when compiled).
  ## Builds that define `release` expand this to nothing, so on
  ## release/danger the behaviour on invalid utf-8 input
  ## is undefined
  when defined(release):
    discard
  else:
    assert(verifyUtf8(s) == -1, "Invalid utf-8 input")

when canUseMacro:
func rex*(s: string): RegexLit =
## Raw regex literal string
Expand Down Expand Up @@ -462,9 469,11 @@ func match*(
doAssert "abcd".match(re2"abcd", m)
doAssert not "abcd".match(re2"abc", m)

debugCheckUtf8 s
result = matchImpl(s, toRegex(pattern), m, start)

func match*(s: string, pattern: Regex2): bool {.inline, raises: [].} =
  ## Boolean-only overload: hands `s` and `pattern` to `matchImpl`
  ## and drops the match data, returning just its verdict.
  debugCheckUtf8 s
  var discarded: RegexMatch2
  matchImpl(s, toRegex(pattern), discarded)

Expand Down Expand Up @@ -496,6 505,7 @@ iterator findAll*(
doAssert bounds == @[1 .. 2, 4 .. 5]
doAssert found == @["bc", "bc"]

debugCheckUtf8 s
var i = start
var i2 = start-1
var m: RegexMatch2
Expand Down Expand Up @@ -534,6 544,7 @@ iterator findAllBounds*(
bounds.add bd
doAssert bounds == @[1 .. 2, 4 .. 5]

debugCheckUtf8 s
var i = start
var i2 = start-1
var ms: RegexMatches2
Expand Down Expand Up @@ -598,6 609,7 @@ iterator split*(s: string, sep: Regex2): string {.inline, raises: [].} =
found.add s
doAssert found == @["", "a", "Ϊ", "", "弢", ""]

debugCheckUtf8 s
var
first, last, i = 0
i2 = -1
Expand Down Expand Up @@ -632,6 644,7 @@ func splitIncl*(s: string, sep: Regex2): seq[string] {.inline, raises: [].} =
doAssert parts == expected

template ab: untyped = m.boundaries
debugCheckUtf8 s
var
first, last, i = 0
i2 = -1
Expand Down Expand Up @@ -662,6 675,7 @@ func startsWith*(
doAssert "abc".startsWith(re2"\w")
doAssert not "abc".startsWith(re2"\d")

debugCheckUtf8 s
startsWithImpl2(s, toRegex(pattern), start)

template runeIncAt(s: string, n: var int) =
Expand All @@ -680,6 694,7 @@ func endsWith*(s: string, pattern: Regex2): bool {.inline, raises: [].} =
doAssert "abc".endsWith(re2"\w")
doAssert not "abc".endsWith(re2"\d")

debugCheckUtf8 s
result = false
var
m: RegexMatch2
Expand Down Expand Up @@ -732,7 747,8 @@ func replace*(
doAssert "Nim is awesome!".replace(re2"(\w\B)", "$1_") ==
"N_i_m i_s a_w_e_s_o_m_e!"

result = ""
debugCheckUtf8 s
result = newStringOfCap(s.len)
var
i, j = 0
capts = newSeqOfCap[string](toRegex(pattern).groupsCount)
Expand Down Expand Up @@ -772,7 788,8 @@ func replace*(
let text = "**this is a test**"
doAssert text.replace(re2"(\*)", removeStars) == "this is a test"

result = ""
debugCheckUtf8 s
result = newStringOfCap(s.len)
var i, j = 0
for m in findAll(s, pattern):
result.addsubstr(s, i, m.boundaries.a-1)
Expand Down Expand Up @@ -800,7 817,8 @@ func escapeRe*(s: string): string {.raises: [].} =
#
# utf-8 ascii code-points cannot be part of multi-byte
# code-points, so we can read/match byte by byte
result = ""
debugCheckUtf8 s
result = newStringOfCap(s.len)
for c in s:
case c
of ' ', '#', '$', '&', '(',
Expand Down Expand Up @@ -950,9 968,11 @@ func match*(
m: var RegexMatch,
start = 0
): bool {.inline, raises: [], deprecated: "use match(string, Regex2, var RegexMatch2) instead".} =
debugCheckUtf8 s
result = matchImpl(s, pattern, m, start)

func match*(s: string, pattern: Regex): bool {.inline, raises: [], deprecated: "use match(string, Regex2) instead".} =
  ## Deprecated boolean-only overload: hands `s` and `pattern` to
  ## `matchImpl` and drops the match data, returning just its verdict.
  debugCheckUtf8 s
  var discarded: RegexMatch
  matchImpl(s, pattern, discarded)

Expand All @@ -961,6 981,7 @@ iterator findAll*(
pattern: Regex,
start = 0
): RegexMatch {.inline, raises: [], deprecated: "use findAll(string, Regex2) instead".} =
debugCheckUtf8 s
var i = start
var i2 = start-1
var m: RegexMatch
Expand Down Expand Up @@ -989,6 1010,7 @@ iterator findAllBounds*(
pattern: Regex,
start = 0
): Slice[int] {.inline, raises: [], deprecated: "use findAllBounds(string, Regex2) instead".} =
debugCheckUtf8 s
var i = start
var i2 = start-1
var ms: RegexMatches
Expand Down Expand Up @@ -1036,6 1058,7 @@ func find*(
return false

iterator split*(s: string, sep: Regex): string {.inline, raises: [], deprecated: "use split(string, Regex2) instead".} =
debugCheckUtf8 s
var
first, last, i = 0
i2 = -1
Expand All @@ -1058,6 1081,7 @@ func split*(s: string, sep: Regex): seq[string] {.inline, raises: [], deprecated

func splitIncl*(s: string, sep: Regex): seq[string] {.inline, raises: [], deprecated: "use splitIncl(string, Regex2) instead".} =
template ab: untyped = m.boundaries
debugCheckUtf8 s
var
first, last, i = 0
i2 = -1
Expand All @@ -1082,10 1106,11 @@ func splitIncl*(s: string, sep: Regex): seq[string] {.inline, raises: [], deprec
func startsWith*(
  s: string, pattern: Regex, start = 0
): bool {.inline, raises: [], deprecated: "use startsWith(string, Regex2) instead".} =
  ## Deprecated overload: checks the input in debug builds, then
  ## forwards `s`, `pattern` and `start` to `startsWithImpl`.
  debugCheckUtf8 s
  result = startsWithImpl(s, pattern, start)

# XXX use findAll and check last match bounds
func endsWith*(s: string, pattern: Regex): bool {.inline, raises: [], deprecated: "use endsWith(string, Regex2) instead".} =
debugCheckUtf8 s
result = false
var
m: RegexMatch
Expand Down Expand Up @@ -1121,6 1146,7 @@ func replace*(
by: string,
limit = 0
): string {.inline, raises: [ValueError], deprecated: "use replace(string, Regex2, string) instead".} =
debugCheckUtf8 s
result = ""
var
i, j = 0
Expand All @@ -1145,7 1171,8 @@ func replace*(
pattern: Regex,
by: proc (m: RegexMatch, s: string): string,
limit = 0
): string {.inline, raises: [], effectsOf: by, deprecated: "use replace(string, Regex2, proc(RegexMatch2, string) :string) instead".} =
): string {.inline, raises: [], effectsOf: by, deprecated: "use replace(string, Regex2, proc(RegexMatch2, string): string) instead".} =
debugCheckUtf8 s
result = ""
var i, j = 0
for m in findAll(s, pattern):
Expand Down Expand Up @@ -1439,6 1466,7 @@ when isMainModule:
doAssert re2"\w" in "弢"
doAssert "2222".find(re2"(22)*", m) and
m.group(0) == 2 .. 3
doAssert raisesMsg("\xff") == "Invalid utf-8 regex"
doAssert raisesMsg(r"[a-\w]") ==
"Invalid set range. Range can't contain " &
"a character-class or assertion\n" &
Expand Down
67 changes: 43 additions & 24 deletions src/regex/common.nim
Original file line number Diff line number Diff line change
Expand Up @@ -69,27 69,46 @@ proc `%%`*(
proc `%%`*(formatstr: string, a: string): string =
formatstr %% [a]

# XXX this is to support literal optimization
# for unicode. It needs testing
# The whole block sits under `when false:`, so it is never compiled.
when false:
# XXX impl simpler find when memchr is not available?
func find*(s: string, r: Rune, start: Natural = 0): int =
## Find unicode rune in a string.
## Returns `-1` when the underlying char search fails or the
## scan runs off the end of `s`.
# Runes below 0xff are delegated to the char overload of `find`.
if r.ord < 0xff:
return find(s, r.char, start)
# `c` is the low byte of the rune's ordinal; the loop searches for it.
let c = (r.ord and 0xff).char
let rsize = r.size()
# NOTE(review): an operator appears to be missing between `start` and
# `rsize-1` (possibly a `+` lost in extraction) -- confirm against the
# original file.
var i = start rsize-1
var r2 = 0'u32
doAssert rsize >= 1 and rsize <= 4
while i < len(s):
i = find(s, c, i)
if i == -1:
return -1
# Fold the bytes in the window ending at `i` into `r2`, big-endian.
# NOTE(review): `i-rsize-1 .. i` spans rsize+2 bytes and can start below
# zero; presumably `i-(rsize-1) .. i` was intended -- confirm.
for j in i-rsize-1 .. i:
r2 = (r2 shl 8) or s[j].uint32
# NOTE(review): this compares the rune's ordinal with raw string bytes;
# verify that is the intended comparison before enabling this code.
if r.uint32 == r2:
return i-rsize-1
r2 = 0
inc i
return -1
type
  verifyUtf8State = enum
    vusError, vusStart, vusA, vusB, vusC, vusD, vusE, vusF, vusG

# Taken from nim-unicodeplus
func verifyUtf8*(s: string): int =
  ## Walk `s` byte by byte through a small state machine.
  ##
  ## Yields `-1` when `s` is a valid utf-8 string. Otherwise yields
  ## the index of the first bad char, i.e. the position where the
  ## offending (malformed or truncated) sequence begins.
  var state = vusStart
  for pos in 0 ..< s.len:
    let b = uint8(s[pos])
    case state
    of vusStart:
      # Remember where this sequence begins; it is the value
      # reported if the sequence turns out to be invalid.
      result = pos
      state =
        case b
        of 0x00'u8 .. 0x7F'u8: vusStart
        of 0xC2'u8 .. 0xDF'u8: vusA
        of 0xE1'u8 .. 0xEC'u8, 0xEE'u8 .. 0xEF'u8: vusB
        of 0xE0'u8: vusC
        of 0xED'u8: vusD
        of 0xF1'u8 .. 0xF3'u8: vusE
        of 0xF0'u8: vusF
        of 0xF4'u8: vusG
        else: vusError
    of vusA:
      # Last continuation byte of a sequence.
      state =
        case b
        of 0x80'u8 .. 0xBF'u8: vusStart
        else: vusError
    of vusB:
      # Two continuation bytes left.
      state =
        case b
        of 0x80'u8 .. 0xBF'u8: vusA
        else: vusError
    of vusC:
      # After 0xE0 the second byte is restricted to 0xA0..0xBF.
      state =
        case b
        of 0xA0'u8 .. 0xBF'u8: vusA
        else: vusError
    of vusD:
      # After 0xED the second byte is restricted to 0x80..0x9F.
      state =
        case b
        of 0x80'u8 .. 0x9F'u8: vusA
        else: vusError
    of vusE:
      # Three continuation bytes left.
      state =
        case b
        of 0x80'u8 .. 0xBF'u8: vusB
        else: vusError
    of vusF:
      # After 0xF0 the second byte is restricted to 0x90..0xBF.
      state =
        case b
        of 0x90'u8 .. 0xBF'u8: vusB
        else: vusError
    of vusG:
      # After 0xF4 the second byte is restricted to 0x80..0x8F.
      state =
        case b
        of 0x80'u8 .. 0x8F'u8: vusB
        else: vusError
    of vusError:
      break
  # Ending anywhere but the start state means a bad or truncated
  # sequence; `result` already holds where it began.
  if state == vusStart:
    result = -1
3 changes: 3 additions & 0 deletions src/regex/compiler.nim
Original file line number Diff line number Diff line change
@@ -1,3 1,4 @@
import ./common
import ./parser
import ./exptransformation
import ./types
Expand All @@ -8,6 9,8 @@ when defined(regexDotDir):
import ./dotgraph

func reImpl*(s: string): Regex {.inline.} =
if verifyUtf8(s) != -1:
raise newException(RegexError, "Invalid utf-8 regex")
var groups: GroupsCapture
let rpn = s
.parse
Expand Down
6 changes: 3 additions & 3 deletions tests/tests.nim
Original file line number Diff line number Diff line change
Expand Up @@ -1126,7 1126,7 @@ test "tstarts_with":
check(not "abc".startsWith(re"bc"))
check startsWith("弢ⒶΪ", re"弢Ⓐ")
check startsWith("弢", re("\xF0\xAF\xA2\x94"))
check(not startsWith("弢", re("\xF0\xAF\xA2")))
#check(not startsWith("弢", re("\xF0\xAF\xA2")))
check "abc".startsWith(re"\w")
check(not "abc".startsWith(re"\d"))
check "abc".startsWith(re"(a|b)")
Expand All @@ -1142,7 1142,7 @@ test "tends_with":
check(not "abc".endsWith(re"ab"))
check endsWith("弢ⒶΪ", re"ⒶΪ")
check endsWith("弢", re("\xF0\xAF\xA2\x94"))
check(not endsWith("弢", re("\xAF\xA2\x94")))
#check(not endsWith("弢", re("\xAF\xA2\x94")))
check "abc".endsWith(re"(b|c)")
check "ab".endsWith(re"(b|c)")
check(not "a".endsWith(re"(b|c)"))
Expand Down Expand Up @@ -2475,7 2475,7 @@ test "escapeRe":
check match("$", re(escapeRe"$"))
block:
var s = ""
for c in 0 .. 255:
for c in 0 .. 127:
s.add c.char
discard re(escapeRe(s))

Expand Down
36 changes: 33 additions & 3 deletions tests/tests2.nim
Original file line number Diff line number Diff line change
@@ -1,5 1,6 @@
from std/unicode import runeLen
from std/sequtils import map
from std/strutils import contains

import ../src/regex

Expand Down Expand Up @@ -1499,7 1500,7 @@ test "tstarts_with":
check(not "abc".startsWith(re2"bc"))
check startsWith("弢ⒶΪ", re2"弢Ⓐ")
check startsWith("弢", re2("\xF0\xAF\xA2\x94"))
check(not startsWith("弢", re2("\xF0\xAF\xA2")))
#check(not startsWith("弢", re2("\xF0\xAF\xA2")))
check "abc".startsWith(re2"\w")
check(not "abc".startsWith(re2"\d"))
check "abc".startsWith(re2"(a|b)")
Expand All @@ -1515,7 1516,7 @@ test "tends_with":
check(not "abc".endsWith(re2"ab"))
check endsWith("弢ⒶΪ", re2"ⒶΪ")
check endsWith("弢", re2("\xF0\xAF\xA2\x94"))
check(not endsWith("弢", re2("\xAF\xA2\x94")))
#check(not endsWith("弢", re2("\xAF\xA2\x94")))
check "abc".endsWith(re2"(b|c)")
check "ab".endsWith(re2"(b|c)")
check(not "a".endsWith(re2"(b|c)"))
Expand Down Expand Up @@ -2914,7 2915,7 @@ test "escapere2":
check match("$", re2(escapeRe"$"))
block:
var s = ""
for c in 0 .. 255:
for c in 0 .. 127:
s.add c.char
discard re2(escapeRe(s))

Expand Down Expand Up @@ -3025,3 3026,32 @@ test "tlookaround_captures":
m.captures == @[0 .. 0, 1 .. 3, nonCapture, nonCapture]
check match("aaab", re2"(\w)(\w )|\w (?<=^(\w)(\w)(\w ))b", m) and
m.captures == @[0 .. 0, 1 .. 3, nonCapture, nonCapture, nonCapture]

# Alias for the exception type the tests below catch when an assertion
# fails: `AssertionDefect` on Nim >= 2.0, `AssertionError` otherwise.
when (NimMajor, NimMinor) >= (2, 0):
type MyAssertionDefect = ref AssertionDefect
else:
type MyAssertionDefect = ref AssertionError

template raisesInvalidUtf8(exp: untyped): untyped =
  ## Evaluate `exp` and record a failed check unless it raises an
  ## assertion failure whose message mentions "Invalid utf-8 input".
  try:
    discard exp
    # Reaching this line means nothing was raised.
    check false
  except MyAssertionDefect:
    let failure = getCurrentExceptionMsg()
    check failure.contains("Invalid utf-8 input")

test "tverifyutf8":
  # An invalid pattern is rejected with a RegexError-style message.
  check raisesMsg("\xff") == "Invalid utf-8 regex"
  # Input strings are validated through an `assert` that is compiled
  # only when `release` is not defined (and asserts are stripped by
  # -d:danger), so these expectations hold only in debug builds.
  # Without the guard every line below would hit `check false` in an
  # optimized build.
  when not defined(release) and not defined(danger):
    raisesInvalidUtf8 match("\xff", re2"abc")
    block:
      var m: RegexMatch2
      raisesInvalidUtf8 match("\xff", re2"abc", m)
    raisesInvalidUtf8 findAll("\xff", re2"abc")
    raisesInvalidUtf8 findAllBounds("\xff", re2"abc")
    raisesInvalidUtf8 split("\xff", re2"abc")
    raisesInvalidUtf8 splitIncl("\xff", re2"abc")
    raisesInvalidUtf8 startsWith("\xff", re2"abc")
    raisesInvalidUtf8 endsWith("\xff", re2"abc")
    raisesInvalidUtf8 replace("\xff", re2"abc", "abc")
    raisesInvalidUtf8 replace("\xff", re2"abc",
      (proc (m: RegexMatch2, s: string): string = discard))
    raisesInvalidUtf8 escapeRe("\xff")

0 comments on commit 801ab3a

Please sign in to comment.