// philomena/assets/js/query/lex.ts
import { assertNotNull, assertNotUndefined } from '../utils/assert';
import { AstMatcher, ParseError, TokenList } from './types';
type TokenName = string;
type Token = [TokenName, RegExp];
const tokenList: Token[] = [
['fuzz', /^~(?:\d+(\.\d+)?|\.\d+)/],
['boost', /^\^[-+]?\d+(\.\d+)?/],
['quoted_lit', /^\s*"(?:[^"]|\\")+"/],
['lparen', /^\s*\(\s*/],
['rparen', /^\s*\)\s*/],
['and_op', /^\s*(?:&&|AND)\s+/],
['and_op', /^\s*,\s*/],
['or_op', /^\s*(?:\|\||OR)\s+/],
['not_op', /^\s*NOT(?:\s+|(?=\())/],
['not_op', /^\s*[!-]\s*/],
['space', /^\s+/],
['word', /^(?:\\[\s,()^~]|[^\s,()^~])+/],
2024-07-04 02:27:59 +02:00
['word', /^(?:\\[\s,()]|[^\s,()])+/],
];
export type ParseTerm = (term: string, fuzz: number, boost: number) => AstMatcher;
export type Range = [number, number];
export type TermContext = [Range, string];
export interface LexResult {
2024-07-04 02:27:59 +02:00
tokenList: TokenList;
termContexts: TermContext[];
error: ParseError | null;
}
export function generateLexResult(searchStr: string, parseTerm: ParseTerm): LexResult {
2024-07-04 02:27:59 +02:00
const opQueue: string[] = [];
const groupNegate: boolean[] = [];
let searchTerm: string | null = null;
let boostFuzzStr = '';
let localSearchStr: string = searchStr;
let negate = false;
let boost = 1;
let fuzz = 0;
let lparenCtr = 0;
let termIndex = 0;
let index = 0;
const ret: LexResult = {
tokenList: [],
termContexts: [],
2024-07-04 02:27:59 +02:00
error: null,
};
const beginTerm = (token: string) => {
searchTerm = token;
termIndex = index;
};
const endTerm = () => {
if (searchTerm !== null) {
// Push to stack.
ret.tokenList.push(parseTerm(searchTerm, fuzz, boost));
ret.termContexts.push([[termIndex, termIndex + searchTerm.length], searchTerm]);
// Reset term and options data.
boost = 1;
fuzz = 0;
searchTerm = null;
boostFuzzStr = '';
lparenCtr = 0;
}
if (negate) {
ret.tokenList.push('not_op');
negate = false;
}
};
while (localSearchStr.length > 0) {
for (const [tokenName, tokenRe] of tokenList) {
const match = tokenRe.exec(localSearchStr);
if (!match) {
continue;
}
const token = match[0];
2024-07-04 02:27:59 +02:00
const tokenIsBinaryOp = ['and_op', 'or_op'].indexOf(tokenName) !== -1;
const tokenIsGroupStart = tokenName === 'rparen' && lparenCtr === 0;
2024-07-04 02:27:59 +02:00
if (searchTerm !== null && (tokenIsBinaryOp || tokenIsGroupStart)) {
endTerm();
}
switch (tokenName) {
case 'and_op':
while (opQueue[0] === 'and_op') {
ret.tokenList.push(assertNotUndefined(opQueue.shift()));
}
opQueue.unshift('and_op');
break;
case 'or_op':
while (opQueue[0] === 'and_op' || opQueue[0] === 'or_op') {
ret.tokenList.push(assertNotUndefined(opQueue.shift()));
}
opQueue.unshift('or_op');
break;
case 'not_op':
if (searchTerm) {
// We're already inside a search term, so it does not apply, obv.
searchTerm += token;
2024-07-04 02:27:59 +02:00
} else {
negate = !negate;
}
break;
case 'lparen':
if (searchTerm) {
// If we are inside the search term, do not error out just yet;
// instead, consider it as part of the search term, as a user convenience.
searchTerm += token;
lparenCtr += 1;
2024-07-04 02:27:59 +02:00
} else {
opQueue.unshift('lparen');
groupNegate.push(negate);
negate = false;
}
break;
case 'rparen':
if (lparenCtr > 0) {
searchTerm = assertNotNull(searchTerm) + token;
lparenCtr -= 1;
2024-07-04 02:27:59 +02:00
} else {
while (opQueue.length > 0) {
const op = assertNotUndefined(opQueue.shift());
if (op === 'lparen') {
break;
}
ret.tokenList.push(op);
}
if (groupNegate.length > 0 && groupNegate.pop()) {
ret.tokenList.push('not_op');
}
}
break;
case 'fuzz':
if (searchTerm) {
// For this and boost operations, we store the current match so far
// to a temporary string in case this is actually inside the term.
fuzz = parseFloat(token.substring(1));
boostFuzzStr += token;
2024-07-04 02:27:59 +02:00
} else {
beginTerm(token);
}
break;
case 'boost':
if (searchTerm) {
boost = parseFloat(token.substring(1));
boostFuzzStr += token;
2024-07-04 02:27:59 +02:00
} else {
beginTerm(token);
}
break;
case 'quoted_lit':
if (searchTerm) {
searchTerm += token;
2024-07-04 02:27:59 +02:00
} else {
beginTerm(token);
}
break;
case 'word':
if (searchTerm) {
if (fuzz !== 0 || boost !== 1) {
boost = 1;
fuzz = 0;
searchTerm += boostFuzzStr;
boostFuzzStr = '';
}
searchTerm += token;
2024-07-04 02:27:59 +02:00
} else {
beginTerm(token);
}
break;
default:
// Append extra spaces within search terms.
if (searchTerm) {
searchTerm += token;
}
}
// Truncate string and restart the token tests.
localSearchStr = localSearchStr.substring(token.length);
index += token.length;
// Break since we have found a match.
break;
}
}
// Append final tokens to the stack.
endTerm();
if (opQueue.indexOf('rparen') !== -1 || opQueue.indexOf('lparen') !== -1) {
ret.error = new ParseError('Mismatched parentheses.');
}
// Concatenate remaining operators to the token stack.
ret.tokenList.push(...opQueue);
return ret;
}
/**
 * Convenience wrapper around {@link generateLexResult} that throws the
 * stored ParseError instead of returning it.
 *
 * @throws ParseError when the query has mismatched parentheses
 */
export function generateLexArray(searchStr: string, parseTerm: ParseTerm): TokenList {
  const { error, tokenList: tokens } = generateLexResult(searchStr, parseTerm);

  if (error) {
    throw error;
  }

  return tokens;
}