DEV Community

Bruce Axtens
Bruce Axtens

Posted on

Extracting example SQL statements from a log

I recently posted a link to a StackOverflow question about building JavaScript RegExp from T-SQL statements.

So I got some good help but went off on a tangent with a different approach, eschewing RegExp in favour of a simpler pattern matching approach.

Below is the Deno code that I cooked up to demonstrate both approaches. The simpler approach actually gets closer to what I want than the RegExp, suggesting that the RegExps need more finessing.

In the process of writing (took most of a week, off and on) I found some really helpful things:

  • Forcing an array to only unique items: list = [...new Set(list)] as Array<string>
  • Custom sorting (see the anonymous function in the .sort() statement. Okay, I've known about it for years but this was a helpful refresher.)
  • console.time() and console.timeEnd() which I had pulled in from a Google Apps Script project.
  • Deno.args, so now I can supply a local file. This was particularly helpful because github's gist resisted storing a 225MB file of T-SQL statements.

On of the interesting outcomes of this, besides giving me some statements to feed into the find missing indexes process is seeing how often incorrectly specified TSQL statements are being generated. The users know how to get in touch with me when something goes wrong, but no one does. What they don't (or won't) tell me I'm now beginning to find out.

// usage.ts
//  pull sqlStatement list from a public gist
//    otherwise from a local file.

let list:string[];

if (Deno.args.length > 0) {
  const text = await Deno.readTextFile(Deno.args[0]);
  list = text.split(/\r\n|\r|\n/g);
} else {
  const sqlStatmentsOnGist = await fetch(
    "https://gist.githubusercontent.com/axtens/fe99f49a7b9d12c4467cb6b3ea509532/raw/7a688b70e5b2502d6116e66efcc28e6fc5bb97ad/Sample%20SQL%20selects",
  );
  const body = new Uint8Array(await sqlStatmentsOnGist.arrayBuffer());
  list = new TextDecoder("utf-8").decode(body).split(/\r\n|\r|\n/g);  
}

list = list.map((elem) => {
  return elem
    .toUpperCase()
    .replace(/\s{2,}/g, " ") /* compress runs of spaces */
    .replace(/;$/g, "") /* remove occasional trailing semicolon */
    .trim()   ";"; /* and then put one back on every line */
});

list = [...new Set(list)] as Array<string>; // keep unique lines

list.sort((a: string, b: string): number => {
  const aa = a.replace(/\s /g, "");
  const bb = b.replace(/\s /g, "");
  if (aa.length < bb.length) {
    return 1;
  }
  if (aa.length > bb.length) {
    return -1;
  }
  if (aa < bb) {
    return 1;
  }
  if (aa > bb) {
    return -1;
  }
  return 0;
}); /* sort longest lines first, then alphabetically */

await Deno.writeTextFile("./sql-unique.txt", list.join("\r\n"));
console.log("wrote sql-unique.txt");

console.time("filterUsingRegularExpression");
let result = filterUsingRegularExpression(list);
console.timeEnd("filterUsingRegularExpression");
await Deno.writeTextFile("./regex_filtered.txt", result.filtered.join("\r\n"));
await Deno.writeTextFile("./regex_patterns.txt", result.patterns.join("\r\n"));

console.time("filterUsingTokens");
result = filterUsingTokens(list);
console.timeEnd("filterUsingTokens");
await Deno.writeTextFile("./token_filtered.txt", result.filtered.join("\r\n"));
await Deno.writeTextFile("./token_patterns.txt", result.patterns.join("\r\n"));

function filterUsingRegularExpression(
  list: string[],
): { filtered: string[]; patterns: string[] } {
  const safe = [];
  const check = [];
  while (true) {
    if (void 0 === list[0]) break;
    safe.push(list[0]);
    const expr2 = list[0]
      .replace(/[.* ?^${}()|[\]\\]/g, "\\$&")
      .replace(/('[^'] ')/g, "'.*'")
      .replace(/\d /g, "\\d ")
      .replace(/\s /g, "\\s ");
    check.push([expr2, list[0]].join("\t"));
    const newList = stripAccordingToPattern(list, new RegExp(expr2, "g"));
    //console.log("found %s not matching", newList.length);
    if (newList.length > 0) {
      if (list.length === newList.length) {
        list = newList.slice(1);
      } else {
        list = newList.slice(0);
      }
    } else {
      break;
    }
  }
  return { filtered: safe, patterns: check };
}

function filterUsingTokens(
  list: string[],
): { filtered: string[]; patterns: string[] } {
  const safe = [];
  const check = [];
  while (true) {
    if (void 0 === list[0]) break;
    safe.push(list[0]);
    const expr = tokenize(list[0]); //getPattern(list[0]);
    check.push([expr, list[0]].join("\t"));
    const newList = stripAccordingToToken(list, expr); //e.search(patt) === -1);
    //console.log("found %s not matching", newList.length);
    if (newList.length > 0) {
      if (list.length === newList.length) {
        list = newList.slice(1);
      } else {
        list = newList.slice(0);
      }
    } else {
      break;
    }
  }
  return { filtered: safe, patterns: check };
}

function tokenize(arg0: string): string {
  return arg0.replace(/\d /g, "|NUMBER|").replace(/'[^']*'/g, "|STRING|");
}

function stripAccordingToToken(sqlList: string[], patt: string): string[] {
  const output = [];
  for (let i = 0; i < sqlList.length; i  ) {
    const item = sqlList[i];
    if (tokenize(item) !== patt) output.push(sqlList[i]);
  }
  return output;
}

function stripAccordingToPattern(list: string[], patt: RegExp): string[] {
  const output = [];
  for (let i = 0; i < list.length; i  ) {
    if (!patt.test(list[i])) output.push(list[i]);
  }
  return output;
}
Enter fullscreen mode Exit fullscreen mode

Top comments (0)