Save form data in XFA datasets when pdf is a mix of acroforms and xfa (#12344)

* Move display/xml_parser.js into shared/ so that it can be used in the worker

* Save form data in XFA datasets when pdf is a mix of acroforms and xfa

Co-authored-by: Brendan Dahl <brendan.dahl@gmail.com>
This commit is contained in:
calixteman 2020-09-09 00:13:52 +02:00 committed by GitHub
parent 622e2fbd3a
commit 68b99c59ee
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 416 additions and 19 deletions

View File

@ -1073,6 +1073,7 @@ class WidgetAnnotation extends Annotation {
return null;
}
const value = annotationStorage[this.data.id];
const bbox = [
0,
0,
@ -1080,11 +1081,15 @@ class WidgetAnnotation extends Annotation {
this.data.rect[3] - this.data.rect[1],
];
const xfa = {
path: stringToPDFString(dict.get("T") || ""),
value,
};
const newRef = evaluator.xref.getNewRef();
const AP = new Dict(evaluator.xref);
AP.set("N", newRef);
const value = annotationStorage[this.data.id];
const encrypt = evaluator.xref.encrypt;
let originalTransform = null;
let newTransform = null;
@ -1120,9 +1125,9 @@ class WidgetAnnotation extends Annotation {
return [
// data for the original object
// V field changed + reference for new AP
{ ref: this.ref, data: bufferOriginal.join("") },
{ ref: this.ref, data: bufferOriginal.join(""), xfa },
// data for the new AP
{ ref: newRef, data: bufferNew.join("") },
{ ref: newRef, data: bufferNew.join(""), xfa: null },
];
}
@ -1521,6 +1526,11 @@ class ButtonWidgetAnnotation extends WidgetAnnotation {
return null;
}
const xfa = {
path: stringToPDFString(dict.get("T") || ""),
value: value ? this.data.exportValue : "",
};
const name = Name.get(value ? this.data.exportValue : "Off");
dict.set("V", name);
dict.set("AS", name);
@ -1539,7 +1549,7 @@ class ButtonWidgetAnnotation extends WidgetAnnotation {
writeDict(dict, buffer, originalTransform);
buffer.push("\nendobj\n");
return [{ ref: this.ref, data: buffer.join("") }];
return [{ ref: this.ref, data: buffer.join(""), xfa }];
}
async _saveRadioButton(evaluator, task, annotationStorage) {
@ -1555,6 +1565,11 @@ class ButtonWidgetAnnotation extends WidgetAnnotation {
return null;
}
const xfa = {
path: stringToPDFString(dict.get("T") || ""),
value: value ? this.data.buttonValue : "",
};
const name = Name.get(value ? this.data.buttonValue : "Off");
let parentBuffer = null;
const encrypt = evaluator.xref.encrypt;
@ -1593,9 +1608,13 @@ class ButtonWidgetAnnotation extends WidgetAnnotation {
writeDict(dict, buffer, originalTransform);
buffer.push("\nendobj\n");
const newRefs = [{ ref: this.ref, data: buffer.join("") }];
const newRefs = [{ ref: this.ref, data: buffer.join(""), xfa }];
if (parentBuffer !== null) {
newRefs.push({ ref: this.parent, data: parentBuffer.join("") });
newRefs.push({
ref: this.parent,
data: parentBuffer.join(""),
xfa: null,
});
}
return newRefs;

View File

@ -32,7 +32,7 @@ import {
VerbosityLevel,
warn,
} from "../shared/util.js";
import { clearPrimitiveCaches, Ref } from "./primitives.js";
import { clearPrimitiveCaches, Dict, isDict, Ref } from "./primitives.js";
import { LocalPdfManager, NetworkPdfManager } from "./pdf_manager.js";
import { incrementalUpdate } from "./writer.js";
import { isNodeJS } from "../shared/is_node.js";
@ -521,7 +521,10 @@ class WorkerMessageHandler {
filename,
}) {
pdfManager.requestLoadedStream();
const promises = [pdfManager.onLoadedStream()];
const promises = [
pdfManager.onLoadedStream(),
pdfManager.ensureCatalog("acroForm"),
];
const document = pdfManager.pdfDocument;
for (let pageIndex = 0; pageIndex < numPages; pageIndex++) {
promises.push(
@ -532,7 +535,7 @@ class WorkerMessageHandler {
);
}
return Promise.all(promises).then(([stream, ...refs]) => {
return Promise.all(promises).then(([stream, acroForm, ...refs]) => {
let newRefs = [];
for (const ref of refs) {
newRefs = ref
@ -545,6 +548,20 @@ class WorkerMessageHandler {
return stream.bytes;
}
acroForm = isDict(acroForm) ? acroForm : Dict.empty;
const xfa = acroForm.get("XFA") || [];
let xfaDatasets = null;
if (Array.isArray(xfa)) {
for (let i = 0, ii = xfa.length; i < ii; i += 2) {
if (xfa[i] === "datasets") {
xfaDatasets = xfa[i + 1];
}
}
} else {
// TODO: Support XFA streams.
warn("Unsupported XFA type.");
}
const xref = document.xref;
let newXrefInfo = Object.create(null);
if (xref.trailer) {
@ -572,7 +589,13 @@ class WorkerMessageHandler {
}
xref.resetNewRef();
return incrementalUpdate(stream.bytes, newXrefInfo, newRefs);
return incrementalUpdate(
stream.bytes,
newXrefInfo,
newRefs,
xref,
xfaDatasets
);
});
});

View File

@ -14,8 +14,14 @@
*/
/* eslint no-var: error */
import { bytesToString, escapeString } from "../shared/util.js";
import {
bytesToString,
escapeString,
parseXFAPath,
warn,
} from "../shared/util.js";
import { Dict, isDict, isName, isRef, isStream, Name } from "./primitives.js";
import { SimpleDOMNode, SimpleXMLParser } from "../shared/xml_parser.js";
import { calculateMD5 } from "./crypto.js";
function writeDict(dict, buffer, transform) {
@ -123,7 +129,55 @@ function computeMD5(filesize, xrefInfo) {
return bytesToString(calculateMD5(array));
}
function incrementalUpdate(originalData, xrefInfo, newRefs) {
/**
 * Write the saved field values into the XFA `datasets` XML and queue the
 * rewritten `datasets` object for the incremental update.
 *
 * Every entry of `newRefs` that carries an `xfa` object (`{ path, value }`)
 * is looked up in the parsed datasets XML; when a node is found, its children
 * are replaced by a single text node holding the new value. The re-serialized
 * XML (encrypted when `xref.encrypt` is set) is appended to `newRefs` as a new
 * `{ ref, data }` entry, i.e. `newRefs` is modified in place.
 *
 * @param {Object} datasetsRef - Reference (with `num` and `gen`) to the XFA
 *   datasets stream, or a nullish value when there is nothing to update.
 * @param {Array<Object>} newRefs - Objects written by the incremental update.
 * @param {Object} xref - Cross-reference table used to fetch the stream, or a
 *   nullish value when there is nothing to update.
 */
function updateXFA(datasetsRef, newRefs, xref) {
  // Treat `undefined` like `null`, so that callers which still use the
  // three-argument form of `incrementalUpdate` don't crash here.
  if (!datasetsRef || !xref) {
    return;
  }
  const datasets = xref.fetchIfRef(datasetsRef);
  if (!datasets || typeof datasets.getBytes !== "function") {
    // A malformed XFA entry must not prevent the AcroForm data from being
    // saved; skip the XFA part only.
    warn("XFA datasets entry is not a stream; form data was not written to XFA.");
    return;
  }
  const str = bytesToString(datasets.getBytes());
  const xml = new SimpleXMLParser(/* hasAttributes */ true).parseFromString(
    str
  );

  for (const { xfa } of newRefs) {
    if (!xfa) {
      continue;
    }
    const { path, value } = xfa;
    if (!path) {
      continue;
    }
    const node = xml.documentElement.searchNode(parseXFAPath(path), 0);
    if (node) {
      node.childNodes = [new SimpleDOMNode("#text", value)];
    } else {
      warn(`Node not found for path: ${path}`);
    }
  }

  const buffer = [];
  xml.documentElement.dump(buffer);
  let updatedXml = buffer.join("");

  const encrypt = xref.encrypt;
  if (encrypt) {
    const transform = encrypt.createCipherTransform(
      datasetsRef.num,
      datasetsRef.gen
    );
    updatedXml = transform.encryptString(updatedXml);
  }
  // The /Length entry is computed after the optional encryption, so that it
  // describes the data actually placed between `stream` and `endstream`.
  const data =
    `${datasetsRef.num} ${datasetsRef.gen} obj\n` +
    `<< /Type /EmbeddedFile /Length ${updatedXml.length}>>\nstream\n` +
    updatedXml +
    "\nendstream\nendobj\n";

  newRefs.push({ ref: datasetsRef, data });
}
function incrementalUpdate(originalData, xrefInfo, newRefs, xref, datasetsRef) {
updateXFA(datasetsRef, newRefs, xref);
const newXref = new Dict(null);
const refForXrefTable = xrefInfo.newRef;

View File

@ -14,7 +14,7 @@
*/
import { assert } from "../shared/util.js";
import { SimpleXMLParser } from "./xml_parser.js";
import { SimpleXMLParser } from "../shared/xml_parser.js";
class Metadata {
constructor(data) {

View File

@ -910,6 +910,73 @@ const createObjectURL = (function createObjectURLClosure() {
};
})();
/**
 * Split an AcroForm field name into its XFA path components.
 *
 * AcroForm field names refer to repeated XFA elements with an array-like
 * suffix, e.g. `foo.bar[nnn]`.
 * see: XFA Spec Chapter 3 - Repeated Elements
 *
 * @param {string} path - XFA path name, with components separated by dots.
 * @returns {Array} - One `{ name, pos }` object per component; `pos` is the
 *   number found in a trailing `[n]` suffix, or 0 when there is none.
 */
function parseXFAPath(path) {
  const indexedComponent = /(.+)\[([0-9]+)\]$/;
  const parts = [];
  for (const segment of path.split(".")) {
    const match = indexedComponent.exec(segment);
    if (match === null) {
      // No `[n]` suffix: the component designates the first occurrence.
      parts.push({ name: segment, pos: 0 });
    } else {
      parts.push({ name: match[1], pos: parseInt(match[2], 10) });
    }
  }
  return parts;
}
// Named entities for the printable ASCII characters which have a special
// meaning in XML, indexed by code point.
const XMLEntities = {
  /* < */ 0x3c: "&lt;",
  /* > */ 0x3e: "&gt;",
  /* & */ 0x26: "&amp;",
  /* " */ 0x22: "&quot;",
  /* ' */ 0x27: "&apos;",
};

/**
 * Escape a string so that it only contains printable ASCII characters that
 * are safe in XML text and attribute values.
 *
 * The five XML special characters are replaced by their named entities, and
 * every character outside of the range 0x20-0x7E is replaced by an uppercase
 * hexadecimal character reference (`&#xNNNN;`).
 *
 * @param {string} str - The string to escape.
 * @returns {string} The escaped string; `str` itself when nothing had to be
 *   escaped.
 */
function encodeToXmlString(str) {
  const buffer = [];
  // Start of the pending run of characters that can be copied verbatim.
  let start = 0;
  for (let i = 0, ii = str.length; i < ii; i++) {
    const char = str.codePointAt(i);
    if (0x20 <= char && char <= 0x7e) {
      // ascii
      const entity = XMLEntities[char];
      if (entity) {
        if (start < i) {
          buffer.push(str.substring(start, i));
        }
        buffer.push(entity);
        start = i + 1;
      }
    } else {
      if (start < i) {
        buffer.push(str.substring(start, i));
      }
      buffer.push(`&#x${char.toString(16).toUpperCase()};`);
      if (char > 0xffff) {
        // Only code points above the BMP are stored as two UTF-16 units.
        // A lone surrogate, U+FFFE or U+FFFF occupies a single unit, hence
        // skipping one more unit for them would drop the next character.
        i++;
      }
      start = i + 1;
    }
  }

  if (buffer.length === 0) {
    return str;
  }
  if (start < str.length) {
    buffer.push(str.substring(start, str.length));
  }
  return buffer.join("");
}
export {
BaseException,
FONT_IDENTITY_MATRIX,
@ -947,6 +1014,7 @@ export {
createPromiseCapability,
createObjectURL,
escapeString,
encodeToXmlString,
getModificationDate,
getVerbosityLevel,
info,
@ -959,6 +1027,7 @@ export {
createValidAbsoluteUrl,
IsLittleEndianCached,
IsEvalSupportedCached,
parseXFAPath,
removeNullCharacters,
setVerbosityLevel,
shadow,

View File

@ -16,6 +16,8 @@
// The code for XMLParserBase copied from
// https://github.com/mozilla/shumway/blob/16451d8836fa85f4b16eeda8b4bda2fa9e2b22b0/src/avm2/natives/xml.ts
import { encodeToXmlString } from "./util.js";
const XMLParserErrorCode = {
NoError: 0,
EndOfDocument: -1,
@ -48,9 +50,9 @@ class XMLParserBase {
_resolveEntities(s) {
return s.replace(/&([^;]+);/g, (all, entity) => {
if (entity.substring(0, 2) === "#x") {
return String.fromCharCode(parseInt(entity.substring(2), 16));
return String.fromCodePoint(parseInt(entity.substring(2), 16));
} else if (entity.substring(0, 1) === "#") {
return String.fromCharCode(parseInt(entity.substring(1), 10));
return String.fromCodePoint(parseInt(entity.substring(1), 10));
}
switch (entity) {
case "lt":
@ -326,14 +328,99 @@ class SimpleDOMNode {
hasChildNodes() {
return this.childNodes && this.childNodes.length > 0;
}
/**
 * Search, in the subtree rooted at this node, the node designated by the
 * path components `paths[pos]`, `paths[pos + 1]`, ...
 *
 * The subtree is walked depth-first (a node is tested before its children)
 * with an explicit stack, so a component can match a descendant at any
 * depth, not only a direct child.
 *
 * @param {Array} paths - Array of `{ name, pos }` components.
 * @param {number} pos - Index in `paths` of the component to match first.
 * @returns {SimpleDOMNode|null} The matching node, `this` when every
 *   component has already been consumed, or null when nothing matches.
 */
searchNode(paths, pos) {
if (pos >= paths.length) {
return this;
}
const component = paths[pos];
// Stack of [parent, index of the child currently being visited].
const stack = [];
let node = this;
while (true) {
if (component.name === node.nodeName) {
if (component.pos === 0) {
// First occurrence requested: resolve the remaining components from the
// matched node, and keep walking the tree when that fails.
const res = node.searchNode(paths, pos + 1);
if (res !== null) {
return res;
}
} else if (stack.length === 0) {
// The match is the node the search started from, so no parent is
// available to look for same-named siblings.
return null;
} else {
// An indexed occurrence is requested: count the children of the parent
// having the same name until the requested position is reached.
const [parent] = stack.pop();
let siblingPos = 0;
for (const child of parent.childNodes) {
if (component.name === child.nodeName) {
if (siblingPos === component.pos) {
return child.searchNode(paths, pos + 1);
}
siblingPos++;
}
}
// We didn't find the correct sibling
// so just return the first found node
return node.searchNode(paths, pos + 1);
}
}
// Move to the next node of the depth-first walk.
if (node.childNodes && node.childNodes.length !== 0) {
// Go down to the first child.
stack.push([node, 0]);
node = node.childNodes[0];
} else if (stack.length === 0) {
// The start node has no children: nothing more to visit.
return null;
} else {
// Go up until an ancestor with a not yet visited child is found, then
// continue with that child.
while (stack.length !== 0) {
const [parent, currentPos] = stack.pop();
const newPos = currentPos + 1;
if (newPos < parent.childNodes.length) {
stack.push([parent, newPos]);
node = parent.childNodes[newPos];
break;
}
}
// Every ancestor has been exhausted: the whole subtree has been visited.
if (stack.length === 0) {
return null;
}
}
}
}
dump(buffer) {
if (this.nodeName === "#text") {
buffer.push(encodeToXmlString(this.nodeValue));
return;
}
buffer.push(`<${this.nodeName}`);
if (this.attributes) {
for (const attribute of this.attributes) {
buffer.push(
` ${attribute.name}=\"${encodeToXmlString(attribute.value)}\"`
);
}
}
if (this.hasChildNodes()) {
buffer.push(">");
for (const child of this.childNodes) {
child.dump(buffer);
}
buffer.push(`</${this.nodeName}>`);
} else if (this.nodeValue) {
buffer.push(`>${encodeToXmlString(this.nodeValue)}</${this.nodeName}>`);
} else {
buffer.push("/>");
}
}
}
class SimpleXMLParser extends XMLParserBase {
constructor() {
constructor(hasAttributes = false) {
super();
this._currentFragment = null;
this._stack = null;
this._errorCode = XMLParserErrorCode.NoError;
this._hasAttributes = hasAttributes;
}
parseFromString(data) {
@ -379,6 +466,9 @@ class SimpleXMLParser extends XMLParserBase {
onBeginElement(name, attributes, isEmpty) {
const node = new SimpleDOMNode(name);
node.childNodes = [];
if (this._hasAttributes) {
node.attributes = attributes;
}
this._currentFragment.push(node);
if (isEmpty) {
return;
@ -403,4 +493,4 @@ class SimpleXMLParser extends XMLParserBase {
}
}
export { SimpleXMLParser };
export { SimpleDOMNode, SimpleXMLParser };

View File

@ -37,6 +37,7 @@
"ui_utils_spec.js",
"unicode_spec.js",
"util_spec.js",
"writer_spec.js"
"writer_spec.js",
"xml_spec.js"
]
}

View File

@ -81,6 +81,7 @@ function initializePDFJS(callback) {
"pdfjs-test/unit/unicode_spec.js",
"pdfjs-test/unit/util_spec.js",
"pdfjs-test/unit/writer_spec.js",
"pdfjs-test/unit/xml_spec.js",
].map(function (moduleName) {
// eslint-disable-next-line no-unsanitized/method
return SystemJS.import(moduleName);

View File

@ -17,6 +17,7 @@ import {
bytesToString,
createPromiseCapability,
createValidAbsoluteUrl,
encodeToXmlString,
escapeString,
getModificationDate,
isArrayBuffer,
@ -24,6 +25,7 @@ import {
isNum,
isSameOrigin,
isString,
parseXFAPath,
removeNullCharacters,
string32,
stringToBytes,
@ -331,4 +333,32 @@ describe("util", function () {
expect(getModificationDate(date)).toEqual("31410610020653");
});
});
// Checks that a dotted path is split into one { name, pos } entry per
// component: `pos` is the number given in the [n] suffix, and 0 for the
// components without a suffix ("foo" and "rab" below).
describe("parseXFAPath", function () {
it("should get a correctly parsed path", function () {
const path = "foo.bar[12].oof[3].rab.FOO[123].BAR[456]";
expect(parseXFAPath(path)).toEqual([
{ name: "foo", pos: 0 },
{ name: "bar", pos: 12 },
{ name: "oof", pos: 3 },
{ name: "rab", pos: 0 },
{ name: "FOO", pos: 123 },
{ name: "BAR", pos: 456 },
]);
});
});
describe("encodeToXmlString", function () {
// The input mixes the five XML special characters, a non-ASCII character
// (\u0397) and an emoji; the expected output uses named entities for the
// former and hexadecimal character references for the latter two.
it("should get a correctly encoded string with some entities", function () {
const str = "\"\u0397ell😂' & <W😂rld>";
expect(encodeToXmlString(str)).toEqual(
"&quot;&#x397;ell&#x1F602;&apos; &amp; &lt;W&#x1F602;rld&gt;"
);
});
// A string without anything to escape must come back unchanged.
it("should get a correctly encoded basic ascii string", function () {
const str = "hello world";
expect(encodeToXmlString(str)).toEqual(str);
});
});
});

View File

@ -37,7 +37,7 @@ describe("Writer", function () {
info: {},
};
let data = incrementalUpdate(originalData, xrefInfo, newRefs);
let data = incrementalUpdate(originalData, xrefInfo, newRefs, null, null);
data = bytesToString(data);
const expected =

110
test/unit/xml_spec.js Normal file
View File

@ -0,0 +1,110 @@
/* Copyright 2020 Mozilla Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import { parseXFAPath } from "../../src/shared/util.js";
import { SimpleXMLParser } from "../../src/shared/xml_parser.js";
describe("XML", function () {
describe("searchNode", function () {
// Each element looked up below carries a distinct `a` attribute value, which
// identifies the node returned by searchNode.
it("should search a node with a given path in xml tree", function () {
const xml = `
<a>
<b>
<c a="123"/>
<d/>
<e>
<f>
<g a="321"/>
</f>
</e>
<c a="456"/>
<c a="789"/>
<h/>
<c a="101112"/>
</b>
<h>
<i/>
<j/>
<k>
<g a="654"/>
</k>
</h>
<b>
<g a="987"/>
<h/>
<g a="121110"/>
</b>
</a>`;
const root = new SimpleXMLParser(true).parseFromString(xml)
.documentElement;
// Returns the value of the first attribute of the node found for `path`.
function getAttr(path) {
return root.searchNode(parseXFAPath(path), 0).attributes[0].value;
}
// Path components match descendants at any depth: the first <g> is nested
// in <b>/<e>/<f>, yet it is found by each of the following paths.
expect(getAttr("b.g")).toEqual("321");
expect(getAttr("e.f.g")).toEqual("321");
expect(getAttr("e.g")).toEqual("321");
expect(getAttr("g")).toEqual("321");
expect(getAttr("h.g")).toEqual("654");
// An index selects among the same-named children of one parent.
expect(getAttr("b[0].g")).toEqual("321");
expect(getAttr("b[1].g")).toEqual("987");
expect(getAttr("b[1].g[0]")).toEqual("987");
expect(getAttr("b[1].g[1]")).toEqual("121110");
// Siblings with another name (<d>, <e>, <h>) are not counted in the index.
expect(getAttr("c")).toEqual("123");
expect(getAttr("c[1]")).toEqual("456");
expect(getAttr("c[2]")).toEqual("789");
expect(getAttr("c[3]")).toEqual("101112");
});
// Round trip: parse a document, dump it, and compare the result with the
// source once the whitespace has been removed on both sides.
it("should dump a xml tree", function () {
let xml = `
<a>
<b>
<c a="123"/>
<d>hello</d>
<e>
<f>
<g a="321"/>
</f>
</e>
<c a="456"/>
<c a="789"/>
<h/>
<c a="101112"/>
</b>
<h>
<i/>
<j/>
<k>
W&#x1F602;rld
<g a="654"/>
</k>
</h>
<b>
<g a="987"/>
<h/>
<g a="121110"/>
</b>
</a>`;
// NOTE(review): this removes every whitespace character, including the
// space between a tag name and its attribute (`<c a=...` becomes `<ca=...`),
// so attribute serialization may not really be exercised here - confirm.
xml = xml.replace(/\s+/g, "");
const root = new SimpleXMLParser(true).parseFromString(xml)
.documentElement;
const buffer = [];
root.dump(buffer);
expect(buffer.join("").replace(/\s+/g, "")).toEqual(xml);
});
});
});