pdf.js/src/core/xfa/formcalc_lexer.js
calixteman b5be515375
XFA - Add a lexer/parser for FormCalc language (#12936)
- the language specifications are: http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.364.2157&rep=rep1&type=pdf#page=1049
 - it can be used to:
   * as a scripting language for calculation, validations, ...
   * in SOM expressions to select nodes: http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.364.2157&rep=rep1&type=pdf#page=101
2021-02-17 20:28:06 +01:00

386 lines
8.5 KiB
JavaScript

/* Copyright 2021 Mozilla Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
const KEYWORDS = new Set([
"and",
"break",
"continue",
"do",
"downto",
"else",
"elseif",
"end",
"endfor",
"endfunc",
"endif",
"endwhile",
"eq",
"exit",
"for",
"foreach",
"func",
"ge",
"gt",
"if",
"in",
"infinity",
"le",
"lt",
"nan",
"ne",
"not",
"null",
"or",
"return",
"step",
"then",
"this",
"throw",
"upto",
"var",
"while",
]);
const TOKEN = {
/* Appears in expression */
and: 0,
divide: 1,
dot: 2,
dotDot: 3,
dotHash: 4,
dotStar: 5,
eq: 6,
ge: 7,
gt: 8,
le: 9,
leftBracket: 10,
leftParen: 11,
lt: 12,
minus: 13,
ne: 14,
not: 15,
null: 16,
number: 17,
or: 18,
plus: 19,
rightBracket: 20,
rightParen: 21,
string: 22,
this: 23,
times: 24,
identifier: 25, // in main statments too
/* Main statements */
break: 26,
continue: 27,
do: 28,
for: 29,
foreach: 30,
func: 31,
if: 32,
var: 33,
while: 34,
/* Others */
assign: 35,
comma: 36,
downto: 37,
else: 38,
elseif: 39,
end: 40,
endif: 41,
endfor: 42,
endfunc: 43,
endwhile: 44,
eof: 45,
exit: 46,
in: 47,
infinity: 48,
nan: 49,
return: 50,
step: 51,
then: 52,
throw: 53,
upto: 54,
};
const hexPattern = /^[uU]([0-9a-fA-F]{4,8})/;
const numberPattern = /^[0-9]*(?:\.[0-9]*)?(?:[Ee][+-]?[0-9]+)?/;
const dotNumberPattern = /^[0-9]*(?:[Ee][+-]?[0-9]+)?/;
const eolPattern = /[\r\n]+/;
const identifierPattern = new RegExp("^[\\p{L}_$!][\\p{L}\\p{N}_$]*", "u");
class Token {
constructor(id, value = null) {
this.id = id;
this.value = value;
}
}
const Singletons = (function () {
const obj = Object.create(null);
const nonSingleton = new Set([
"identifier",
"string",
"number",
"nan",
"infinity",
]);
for (const [name, id] of Object.entries(TOKEN)) {
if (!nonSingleton.has(name)) {
obj[name] = new Token(id);
}
}
obj.nan = new Token(TOKEN.number, NaN);
obj.infinity = new Token(TOKEN.number, Infinity);
return obj;
})();
class Lexer {
constructor(data) {
this.data = data;
this.pos = 0;
this.len = data.length;
this.strBuf = [];
}
skipUntilEOL() {
const match = this.data.slice(this.pos).match(eolPattern);
if (match) {
this.pos += match.index + match[0].length;
} else {
// No eol so consume all the chars.
this.pos = this.len;
}
}
getIdentifier() {
this.pos--;
const match = this.data.slice(this.pos).match(identifierPattern);
if (!match) {
throw new Error(
`Invalid token in FormCalc expression at position ${this.pos}.`
);
}
const identifier = this.data.slice(this.pos, this.pos + match[0].length);
this.pos += match[0].length;
const lower = identifier.toLowerCase();
if (!KEYWORDS.has(lower)) {
return new Token(TOKEN.identifier, identifier);
}
return Singletons[lower];
}
getString() {
const str = this.strBuf;
const data = this.data;
let start = this.pos;
while (this.pos < this.len) {
const char = data.charCodeAt(this.pos++);
if (char === 0x22 /* = " */) {
if (data.charCodeAt(this.pos) === 0x22 /* = " */) {
// Escaped quote.
str.push(data.slice(start, this.pos++));
start = this.pos;
continue;
}
// End of string
break;
}
if (char === 0x5c /* = \ */) {
const match = data.substring(this.pos, this.pos + 10).match(hexPattern);
if (!match) {
continue;
}
str.push(data.slice(start, this.pos - 1));
const code = match[1];
if (code.length === 4) {
str.push(String.fromCharCode(parseInt(code, 16)));
start = this.pos += 5;
} else if (code.length !== 8) {
str.push(String.fromCharCode(parseInt(code.slice(0, 4), 16)));
start = this.pos += 5;
} else {
str.push(String.fromCharCode(parseInt(code, 16)));
start = this.pos += 9;
}
}
}
const lastChunk = data.slice(start, this.pos - 1);
if (str.length === 0) {
return new Token(TOKEN.string, lastChunk);
}
str.push(lastChunk);
const string = str.join("");
str.length = 0;
return new Token(TOKEN.string, string);
}
getNumber(first) {
const match = this.data.substring(this.pos).match(numberPattern);
if (!match) {
return first - 0x30 /* = 0 */;
}
const number = parseFloat(
this.data.substring(this.pos - 1, this.pos + match[0].length)
);
this.pos += match[0].length;
return new Token(TOKEN.number, number);
}
getCompOperator(alt1, alt2) {
if (this.data.charCodeAt(this.pos) === 0x3d /* = = */) {
this.pos++;
return alt1;
}
return alt2;
}
getLower() {
const char = this.data.charCodeAt(this.pos);
if (char === 0x3d /* = = */) {
this.pos++;
return Singletons.le;
}
if (char === 0x3e /* = > */) {
this.pos++;
return Singletons.ne;
}
return Singletons.lt;
}
getSlash() {
if (this.data.charCodeAt(this.pos) === 0x2f /* = / */) {
this.skipUntilEOL();
return false;
}
return true;
}
getDot() {
const char = this.data.charCodeAt(this.pos);
if (char === 0x2e /* = . */) {
this.pos++;
return Singletons.dotDot;
}
if (char === 0x2a /* = * */) {
this.pos++;
return Singletons.dotStar;
}
if (char === 0x23 /* = # */) {
this.pos++;
return Singletons.dotHash;
}
if (0x30 /* = 0 */ <= char && char <= 0x39 /* = 9 */) {
this.pos++;
const match = this.data.substring(this.pos).match(dotNumberPattern);
if (!match) {
return new Token(TOKEN.number, (char - 0x30) /* = 0 */ / 10);
}
const end = this.pos + match[0].length;
const number = parseFloat(this.data.substring(this.pos - 2, end));
this.pos = end;
return new Token(TOKEN.number, number);
}
return Singletons.dot;
}
next() {
while (this.pos < this.len) {
const char = this.data.charCodeAt(this.pos++);
switch (char) {
case 0x09 /* = \t */:
case 0x0a /* = \n */:
case 0x0b /* = \v */:
case 0x0c /* = \f */:
case 0x0d /* = \r */:
case 0x20 /* = */:
break;
case 0x22 /* = " */:
return this.getString();
case 0x26 /* = & */:
return Singletons.and;
case 0x28 /* = ( */:
return Singletons.leftParen;
case 0x29 /* = ) */:
return Singletons.rightParen;
case 0x2a /* = * */:
return Singletons.times;
case 0x2b /* = + */:
return Singletons.plus;
case 0x2c /* = , */:
return Singletons.comma;
case 0x2d /* = - */:
return Singletons.minus;
case 0x2e /* = . */:
return this.getDot();
case 0x2f /* = / */:
if (this.getSlash()) {
return Singletons.divide;
}
// It was a comment.
break;
case 0x30 /* = 0 */:
case 0x31 /* = 1 */:
case 0x32 /* = 2 */:
case 0x33 /* = 3 */:
case 0x34 /* = 4 */:
case 0x35 /* = 5 */:
case 0x36 /* = 6 */:
case 0x37 /* = 7 */:
case 0x38 /* = 8 */:
case 0x39 /* = 9 */:
return this.getNumber(char);
case 0x3b /* = ; */:
this.skipUntilEOL();
break;
case 0x3c /* = < */:
return this.getLower();
case 0x3d /* = = */:
return this.getCompOperator(Singletons.eq, Singletons.assign);
case 0x3e /* = > */:
return this.getCompOperator(Singletons.ge, Singletons.gt);
case 0x5b /* = [ */:
return Singletons.leftBracket;
case 0x5d /* = ] */:
return Singletons.rightBracket;
case 0x7c /* = | */:
return Singletons.or;
default:
return this.getIdentifier();
}
}
return Singletons.eof;
}
}
export { Lexer, Token, TOKEN };