fix(util): Extract HTML data attributes with newline characters

While HTML attributes cannot contain unescaped CR and LF characters, they may contain Unicode characters which are considered newline characters by RegExp, such as LS (U+2028). Example that contains a random LS character which breaks data extraction: https://dinrecords.bandcamp.com/album/tone-science-module-no-9-theories-and-conjectures-din-ts09
kellnerd · Jul 23, 2024 · 1744a1a · 1744a1a
1 parent 2db91de
commit 1744a1a
Showing 1 changed file with 1 addition and 1 deletion.
diff --git a/utils/html.ts b/utils/html.ts
@@ -10,7 +10,7 @@ export function extractTextFromHtml(html: string, expression: RegExp) {
  * Extracts the value of the data attribute with the given key from HTML.
  */
 export function extractDataAttribute(html: string, key: string): string | undefined {
- return extractTextFromHtml(html, new RegExp(`data-${key}=["'](.+?)["']`, 'i'));
+ return extractTextFromHtml(html, new RegExp(`data-${key}=["'](.+?)["']`, 'is'));
 }
 
 /** Extracts the `content` value of the meta tag with the given name from HTML. */