mirror of
https://github.com/philomena-dev/philomena.git
synced 2024-11-23 20:18:00 +01:00
225 lines
5.9 KiB
TypeScript
225 lines
5.9 KiB
TypeScript
import { assertNotNull, assertNotUndefined } from '../utils/assert';
|
|
import { AstMatcher, ParseError, TokenList } from './types';
|
|
|
|
/** Name of a lexical token class, e.g. `'and_op'` or `'word'`. */
type TokenName = string;

/** A token class paired with the anchored pattern that recognizes it. */
type Token = [TokenName, RegExp];

/**
 * Lexer rules. The lexer tries these in order against the start of the
 * remaining input and uses the first one that matches, so ordering matters:
 * more specific rules must come before more general ones.
 */
const tokenList: Token[] = [
  // `~` followed by a decimal number: `~2`, `~0.5`, `~.5`.
  ['fuzz', /^~(?:\d+(\.\d+)?|\.\d+)/],
  // `^` followed by an optionally signed decimal number: `^2`, `^-1.5`.
  ['boost', /^\^[-+]?\d+(\.\d+)?/],
  // Non-empty double-quoted literal with optional leading whitespace.
  // The escaped-quote alternative must be tried first; otherwise `[^"]`
  // swallows the backslash and the literal is cut off at the escaped quote.
  ['quoted_lit', /^\s*"(?:\\"|[^"])+"/],
  ['lparen', /^\s*\(\s*/],
  ['rparen', /^\s*\)\s*/],
  // `&&` / `AND` must be followed by whitespace to count as an operator.
  ['and_op', /^\s*(?:&&|AND)\s+/],
  // A comma is an alternative spelling of AND.
  ['and_op', /^\s*,\s*/],
  // `||` / `OR` must be followed by whitespace to count as an operator.
  ['or_op', /^\s*(?:\|\||OR)\s+/],
  // `NOT` must be followed by whitespace or sit directly before `(`.
  ['not_op', /^\s*NOT(?:\s+|(?=\())/],
  // `!` and `-` are alternative spellings of NOT.
  ['not_op', /^\s*[!-]\s*/],
  ['space', /^\s+/],
  // Run of characters excluding whitespace, `,`, parentheses, `^` and `~`,
  // any of which may still be included when escaped with a backslash.
  ['word', /^(?:\\[\s,()^~]|[^\s,()^~])+/],
  // Fallback that also accepts bare `^` and `~`, so that one of those
  // characters which did not form a valid boost/fuzz token is still consumed.
  ['word', /^(?:\\[\s,()]|[^\s,()])+/]
];
|
|
|
|
/**
 * Callback that converts the raw text of one search term into a matcher.
 *
 * The lexer passes the term text together with the fuzz and boost values
 * collected for it (`0` and `1` respectively when none were given).
 */
export type ParseTerm = (term: string, fuzz: number, boost: number) => AstMatcher;

/**
 * Pair of numeric offsets. The lexer fills it with a term's start offset in
 * the search string and that offset plus the term's length.
 */
export type Range = [number, number];

/** Location of a term in the search string together with the term's text. */
export type TermContext = [Range, string];
|
|
|
|
/** Everything produced by one lexing pass over a search string. */
export interface LexResult {
  /** Lexed output: matchers returned by the term parser mixed with operator tokens. */
  tokenList: TokenList;
  /** One entry per lexed term, recording where it was found and its text. */
  termContexts: TermContext[];
  /** Problem detected while lexing, or `null` when none was found. */
  error: ParseError | null;
}
|
|
|
|
/**
 * Splits a search string into terms and operators.
 *
 * Terms are converted with `parseTerm` and appended to the output as soon as
 * they are complete; operators are held on a stack (`opQueue`) and emitted
 * after the terms they apply to, with parentheses delimiting groups.
 *
 * @param searchStr - The raw search query.
 * @param parseTerm - Converts a term's text, fuzz and boost into a matcher.
 * @returns The token list, the context (range and text) of every term, and
 *   an `error` that is set to a `ParseError` when an opening parenthesis was
 *   never closed. The token list is still populated in that case and then
 *   contains the leftover `'lparen'` entries.
 */
export function generateLexResult(searchStr: string, parseTerm: ParseTerm): LexResult {
  const opQueue: string[] = [],
    groupNegate: boolean[] = [];

  // Text of the term currently being assembled, or null between terms.
  let searchTerm: string | null = null;
  // Raw text of fuzz/boost tokens seen after the current term; kept so it can
  // be merged back into the term if it turns out to sit in the middle of it.
  let boostFuzzStr = '';
  // Remaining, not yet consumed part of the input.
  let localSearchStr: string = searchStr;
  let negate = false;
  let boost = 1;
  let fuzz = 0;
  // Number of currently open parentheses that belong to the term text itself.
  let lparenCtr = 0;

  // Offset in `searchStr` where the current term started.
  let termIndex = 0;
  // Offset in `searchStr` of the start of `localSearchStr`.
  let index = 0;

  const ret: LexResult = {
    tokenList: [],
    termContexts: [],
    error: null
  };

  const beginTerm = (token: string) => {
    searchTerm = token;
    termIndex = index;
  };

  const endTerm = () => {
    if (searchTerm !== null) {
      // Push to stack.
      ret.tokenList.push(parseTerm(searchTerm, fuzz, boost));
      ret.termContexts.push([[termIndex, termIndex + searchTerm.length], searchTerm]);
      // Reset term and options data.
      boost = 1;
      fuzz = 0;
      searchTerm = null;
      boostFuzzStr = '';
      lparenCtr = 0;
    }

    if (negate) {
      ret.tokenList.push('not_op');
      negate = false;
    }
  };

  while (localSearchStr.length > 0) {
    for (const [tokenName, tokenRe] of tokenList) {
      const match = tokenRe.exec(localSearchStr);

      if (!match) {
        continue;
      }

      const token = match[0];

      // A binary operator, or a closing parenthesis that does not belong to
      // the term text, terminates the term in progress.
      const terminatesTerm =
        tokenName === 'and_op' ||
        tokenName === 'or_op' ||
        (tokenName === 'rparen' && lparenCtr === 0);

      if (searchTerm !== null && terminatesTerm) {
        endTerm();
      }

      switch (tokenName) {
        case 'and_op':
          while (opQueue[0] === 'and_op') {
            ret.tokenList.push(assertNotUndefined(opQueue.shift()));
          }
          opQueue.unshift('and_op');
          break;
        case 'or_op':
          while (opQueue[0] === 'and_op' || opQueue[0] === 'or_op') {
            ret.tokenList.push(assertNotUndefined(opQueue.shift()));
          }
          opQueue.unshift('or_op');
          break;
        case 'not_op':
          if (searchTerm) {
            // We're already inside a search term, so it does not apply, obv.
            searchTerm += token;
          }
          else {
            negate = !negate;
          }
          break;
        case 'lparen':
          if (searchTerm) {
            // If we are inside the search term, do not error out just yet;
            // instead, consider it as part of the search term, as a user convenience.
            searchTerm += token;
            lparenCtr += 1;
          }
          else {
            opQueue.unshift('lparen');
            // Remember whether the whole group is negated; the flag is
            // consumed again when the matching rparen is processed.
            groupNegate.push(negate);
            negate = false;
          }
          break;
        case 'rparen':
          if (lparenCtr > 0) {
            // Closes a parenthesis that was absorbed into the term text.
            searchTerm = assertNotNull(searchTerm) + token;
            lparenCtr -= 1;
          }
          else {
            // Flush operators of the group up to (and discarding) its lparen.
            while (opQueue.length > 0) {
              const op = assertNotUndefined(opQueue.shift());
              if (op === 'lparen') {
                break;
              }
              ret.tokenList.push(op);
            }
            if (groupNegate.length > 0 && groupNegate.pop()) {
              ret.tokenList.push('not_op');
            }
          }
          break;
        case 'fuzz':
          if (searchTerm) {
            // For this and boost operations, we store the current match so far
            // to a temporary string in case this is actually inside the term.
            fuzz = parseFloat(token.substring(1));
            boostFuzzStr += token;
          }
          else {
            beginTerm(token);
          }
          break;
        case 'boost':
          if (searchTerm) {
            boost = parseFloat(token.substring(1));
            boostFuzzStr += token;
          }
          else {
            beginTerm(token);
          }
          break;
        case 'quoted_lit':
          if (searchTerm) {
            searchTerm += token;
          }
          else {
            beginTerm(token);
          }
          break;
        case 'word':
          if (searchTerm) {
            // More word characters follow a fuzz/boost token, so that token
            // was really part of the term: put its text back and drop the
            // parsed values. Test the stored text rather than the parsed
            // numbers, since `~0` and `^1` parse to the default values and
            // would otherwise be silently lost from the term.
            if (boostFuzzStr !== '') {
              boost = 1;
              fuzz = 0;
              searchTerm += boostFuzzStr;
              boostFuzzStr = '';
            }
            searchTerm += token;
          }
          else {
            beginTerm(token);
          }
          break;
        default:
          // Append extra spaces within search terms.
          if (searchTerm) {
            searchTerm += token;
          }
      }

      // Truncate string and restart the token tests.
      localSearchStr = localSearchStr.substring(token.length);
      index += token.length;

      // Break since we have found a match.
      break;
    }
  }

  // Append final tokens to the stack.
  endTerm();

  // Only 'and_op', 'or_op' and 'lparen' are ever placed on opQueue, so a
  // leftover 'lparen' is the only sign of unbalanced parentheses here.
  if (opQueue.indexOf('lparen') !== -1) {
    ret.error = new ParseError('Mismatched parentheses.');
  }

  // Concatenate remaining operators to the token stack.
  ret.tokenList.push(...opQueue);

  return ret;
}
|
|
|
|
/**
 * Lexes a search string and returns only the resulting token list.
 *
 * @param searchStr - The raw search query.
 * @param parseTerm - Converts a term's text, fuzz and boost into a matcher.
 * @returns The token list produced by `generateLexResult`.
 * @throws The error recorded in the lex result, if there is one.
 */
export function generateLexArray(searchStr: string, parseTerm: ParseTerm): TokenList {
  const { error, tokenList: tokens } = generateLexResult(searchStr, parseTerm);

  if (error) {
    throw error;
  }

  return tokens;
}
|