[Editor] Add the ability to create/update the structure tree when saving a pdf containing newly added annotations (bug 1845087)

When there is no tree, the tags for the new annotations are just put under the root element.
When there is a tree, we insert the new tags at the right place by using the value
of structTreeParentId (added in PR #16916).
This commit is contained in:
Calixte Denizet 2023-09-11 17:51:22 +02:00
parent 7f8de83e96
commit a8573d4e1b
8 changed files with 613 additions and 14 deletions

View File

@ -1639,7 +1639,7 @@ class MarkupAnnotation extends Annotation {
}
static async createNewAnnotation(xref, annotation, dependencies, params) {
const annotationRef = annotation.ref || xref.getNewTemporaryRef();
const annotationRef = (annotation.ref ||= xref.getNewTemporaryRef());
const ap = await this.createNewAppearanceStream(annotation, xref, params);
const buffer = [];
let annotationDict;
@ -1652,6 +1652,9 @@ class MarkupAnnotation extends Annotation {
} else {
annotationDict = this.createNewDict(annotation, xref, {});
}
if (Number.isInteger(annotation.parentTreeId)) {
annotationDict.set("StructParent", annotation.parentTreeId);
}
buffer.length = 0;
await writeObject(annotationRef, annotationDict, buffer, xref);

View File

@ -84,6 +84,10 @@ class Catalog {
this.systemFontCache = new Map();
}
cloneDict() {
return this._catDict.clone();
}
get version() {
const version = this._catDict.get("Version");
if (version instanceof Name) {
@ -245,11 +249,13 @@ class Catalog {
* @private
*/
_readStructTreeRoot() {
const obj = this._catDict.get("StructTreeRoot");
const rawObj = this._catDict.getRaw("StructTreeRoot");
const obj = this.xref.fetchIfRef(rawObj);
if (!(obj instanceof Dict)) {
return null;
}
const root = new StructTreeRoot(obj);
const root = new StructTreeRoot(obj, rawObj);
root.init();
return root;
}

View File

@ -64,6 +64,10 @@ class BasePdfManager {
return this._docBaseUrl;
}
get catalog() {
return this.pdfDocument.catalog;
}
ensureDoc(prop, args) {
return this.ensure(this.pdfDocument, prop, args);
}

View File

@ -262,6 +262,14 @@ class Dict {
return mergedDict.size > 0 ? mergedDict : Dict.empty;
}
clone() {
const dict = new Dict(this.xref);
for (const key of this.getKeys()) {
dict.set(key, this.getRaw(key));
}
return dict;
}
}
class Ref {

View File

@ -16,6 +16,7 @@
import { AnnotationPrefix, stringToPDFString, warn } from "../shared/util.js";
import { Dict, isName, Name, Ref, RefSetCache } from "./primitives.js";
import { NumberTree } from "./name_number_tree.js";
import { writeObject } from "./writer.js";
const MAX_DEPTH = 40;
@ -28,8 +29,9 @@ const StructElementType = {
};
class StructTreeRoot {
constructor(rootDict) {
constructor(rootDict, rootRef) {
this.dict = rootDict;
this.ref = rootRef instanceof Ref ? rootRef : null;
this.roleMap = new Map();
this.structParentIds = null;
}
@ -67,6 +69,419 @@ class StructTreeRoot {
this.roleMap.set(key, value.name);
});
}
static async canCreateStructureTree({
catalogRef,
pdfManager,
newAnnotationsByPage,
}) {
if (!(catalogRef instanceof Ref)) {
warn("Cannot save the struct tree: no catalog reference.");
return false;
}
let nextKey = 0;
let hasNothingToUpdate = true;
for (const [pageIndex, elements] of newAnnotationsByPage) {
const { ref: pageRef } = await pdfManager.getPage(pageIndex);
if (!(pageRef instanceof Ref)) {
warn(`Cannot save the struct tree: page ${pageIndex} has no ref.`);
hasNothingToUpdate = true;
break;
}
for (const element of elements) {
if (element.accessibilityData?.type) {
// Each tag must have a structure type.
element.parentTreeId = nextKey++;
hasNothingToUpdate = false;
}
}
}
if (hasNothingToUpdate) {
for (const elements of newAnnotationsByPage.values()) {
for (const element of elements) {
delete element.parentTreeId;
}
}
return false;
}
return true;
}
static async createStructureTree({
newAnnotationsByPage,
xref,
catalogRef,
pdfManager,
newRefs,
}) {
const root = pdfManager.catalog.cloneDict();
const structTreeRootRef = xref.getNewTemporaryRef();
root.set("StructTreeRoot", structTreeRootRef);
const buffer = [];
await writeObject(catalogRef, root, buffer, xref);
newRefs.push({ ref: catalogRef, data: buffer.join("") });
const structTreeRoot = new Dict(xref);
structTreeRoot.set("Type", Name.get("StructTreeRoot"));
const parentTreeRef = xref.getNewTemporaryRef();
structTreeRoot.set("ParentTree", parentTreeRef);
const kids = [];
structTreeRoot.set("K", kids);
const parentTree = new Dict(xref);
const nums = [];
parentTree.set("Nums", nums);
const nextKey = await this.#writeKids({
newAnnotationsByPage,
structTreeRootRef,
kids,
nums,
xref,
pdfManager,
newRefs,
buffer,
});
structTreeRoot.set("ParentTreeNextKey", nextKey);
buffer.length = 0;
await writeObject(parentTreeRef, parentTree, buffer, xref);
newRefs.push({ ref: parentTreeRef, data: buffer.join("") });
buffer.length = 0;
await writeObject(structTreeRootRef, structTreeRoot, buffer, xref);
newRefs.push({ ref: structTreeRootRef, data: buffer.join("") });
}
async canUpdateStructTree({ pdfManager, newAnnotationsByPage }) {
if (!this.ref) {
warn("Cannot update the struct tree: no root reference.");
return false;
}
let nextKey = this.dict.get("ParentTreeNextKey");
if (!Number.isInteger(nextKey) || nextKey < 0) {
warn("Cannot update the struct tree: invalid next key.");
return false;
}
const parentTree = this.dict.get("ParentTree");
if (!(parentTree instanceof Dict)) {
warn("Cannot update the struct tree: ParentTree isn't a dict.");
return false;
}
const nums = parentTree.get("Nums");
if (!Array.isArray(nums)) {
warn("Cannot update the struct tree: nums isn't an array.");
return false;
}
const { numPages } = pdfManager.catalog;
for (const pageIndex of newAnnotationsByPage.keys()) {
const { pageDict, ref: pageRef } = await pdfManager.getPage(pageIndex);
if (!(pageRef instanceof Ref)) {
warn(`Cannot save the struct tree: page ${pageIndex} has no ref.`);
return false;
}
const id = pageDict.get("StructParents");
if (!Number.isInteger(id) || id < 0 || id >= numPages) {
warn(`Cannot save the struct tree: page ${pageIndex} has no id.`);
return false;
}
}
let hasNothingToUpdate = true;
for (const [pageIndex, elements] of newAnnotationsByPage) {
const { pageDict } = await pdfManager.getPage(pageIndex);
StructTreeRoot.#collectParents({
elements,
xref: this.dict.xref,
pageDict,
parentTree,
});
for (const element of elements) {
if (element.accessibilityData?.type) {
// Each tag must have a structure type.
element.parentTreeId = nextKey++;
hasNothingToUpdate = false;
}
}
}
if (hasNothingToUpdate) {
for (const elements of newAnnotationsByPage.values()) {
for (const element of elements) {
delete element.parentTreeId;
delete element.structTreeParent;
}
}
return false;
}
return true;
}
async updateStructureTree({ newAnnotationsByPage, pdfManager, newRefs }) {
const xref = this.dict.xref;
const structTreeRoot = this.dict.clone();
const structTreeRootRef = this.ref;
let parentTreeRef = structTreeRoot.getRaw("ParentTree");
let parentTree;
if (parentTreeRef instanceof Ref) {
parentTree = xref.fetch(parentTreeRef);
} else {
parentTree = parentTreeRef;
parentTreeRef = xref.getNewTemporaryRef();
structTreeRoot.set("ParentTree", parentTreeRef);
}
parentTree = parentTree.clone();
let nums = parentTree.getRaw("Nums");
let numsRef = null;
if (nums instanceof Ref) {
numsRef = nums;
nums = xref.fetch(numsRef);
}
nums = nums.slice();
if (!numsRef) {
parentTree.set("Nums", nums);
}
let kids = structTreeRoot.getRaw("K");
let kidsRef = null;
if (kids instanceof Ref) {
kidsRef = kids;
kids = xref.fetch(kidsRef);
} else {
kidsRef = xref.getNewTemporaryRef();
structTreeRoot.set("K", kidsRef);
}
kids = Array.isArray(kids) ? kids.slice() : [kids];
const buffer = [];
const newNextkey = await StructTreeRoot.#writeKids({
newAnnotationsByPage,
structTreeRootRef,
kids,
nums,
xref,
pdfManager,
newRefs,
buffer,
});
structTreeRoot.set("ParentTreeNextKey", newNextkey);
buffer.length = 0;
await writeObject(kidsRef, kids, buffer, xref);
newRefs.push({ ref: kidsRef, data: buffer.join("") });
if (numsRef) {
buffer.length = 0;
await writeObject(numsRef, nums, buffer, xref);
newRefs.push({ ref: numsRef, data: buffer.join("") });
}
buffer.length = 0;
await writeObject(parentTreeRef, parentTree, buffer, xref);
newRefs.push({ ref: parentTreeRef, data: buffer.join("") });
buffer.length = 0;
await writeObject(structTreeRootRef, structTreeRoot, buffer, xref);
newRefs.push({ ref: structTreeRootRef, data: buffer.join("") });
}
static async #writeKids({
newAnnotationsByPage,
structTreeRootRef,
kids,
nums,
xref,
pdfManager,
newRefs,
buffer,
}) {
const objr = Name.get("OBJR");
let nextKey = -Infinity;
for (const [pageIndex, elements] of newAnnotationsByPage) {
const { ref: pageRef } = await pdfManager.getPage(pageIndex);
for (const {
accessibilityData: { type, title, lang, alt, expanded, actualText },
ref,
parentTreeId,
structTreeParent,
} of elements) {
nextKey = Math.max(nextKey, parentTreeId);
const tagRef = xref.getNewTemporaryRef();
const tagDict = new Dict(xref);
// The structure type is required.
tagDict.set("S", Name.get(type));
if (title) {
tagDict.set("T", title);
}
if (lang) {
tagDict.set("Lang", lang);
}
if (alt) {
tagDict.set("Alt", alt);
}
if (expanded) {
tagDict.set("E", expanded);
}
if (actualText) {
tagDict.set("ActualText", actualText);
}
if (structTreeParent) {
await this.#updateParentTag({
structTreeParent,
tagDict,
newTagRef: tagRef,
fallbackRef: structTreeRootRef,
xref,
newRefs,
buffer,
});
} else {
tagDict.set("P", structTreeRootRef);
}
const objDict = new Dict(xref);
tagDict.set("K", objDict);
objDict.set("Type", objr);
objDict.set("Pg", pageRef);
objDict.set("Obj", ref);
buffer.length = 0;
await writeObject(tagRef, tagDict, buffer, xref);
newRefs.push({ ref: tagRef, data: buffer.join("") });
nums.push(parentTreeId, tagRef);
kids.push(tagRef);
}
}
return nextKey + 1;
}
static #collectParents({ elements, xref, pageDict, parentTree }) {
const idToElement = new Map();
for (const element of elements) {
if (element.structTreeParentId) {
const id = parseInt(element.structTreeParentId.split("_mc")[1], 10);
idToElement.set(id, element);
}
}
const id = pageDict.get("StructParents");
const numberTree = new NumberTree(parentTree, xref);
const parentArray = numberTree.get(id);
if (!Array.isArray(parentArray)) {
return;
}
const updateElement = (kid, pageKid, kidRef) => {
const element = idToElement.get(kid);
if (element) {
const parentRef = pageKid.getRaw("P");
const parentDict = xref.fetchIfRef(parentRef);
if (parentRef instanceof Ref && parentDict instanceof Dict) {
// It should always the case, but we check just in case.
element.structTreeParent = { ref: kidRef, dict: pageKid };
}
return true;
}
return false;
};
for (const kidRef of parentArray) {
if (!(kidRef instanceof Ref)) {
continue;
}
const pageKid = xref.fetch(kidRef);
const k = pageKid.get("K");
if (Number.isInteger(k)) {
updateElement(k, pageKid, kidRef);
continue;
}
if (!Array.isArray(k)) {
continue;
}
for (let kid of k) {
kid = xref.fetchIfRef(kid);
if (Number.isInteger(kid) && updateElement(kid, pageKid, kidRef)) {
break;
}
}
}
}
/**
 * Attach a new tag to the parent of an existing structure element, right
 * after that element in the parent's kids, and serialize the objects that
 * had to change (appended to `newRefs` as `{ ref, data }`).
 *
 * @param {Object} params
 * @param {Object} params.structTreeParent - `{ ref, dict }` of the existing
 *   structure element the new tag must follow.
 * @param {Dict} params.tagDict - The new tag; its "P" entry is set here.
 * @param {Ref} params.newTagRef - Reference of the new tag.
 * @param {Ref} params.fallbackRef - Used as "P" when the existing element
 *   cannot be found in its parent's kids array.
 * @param {Object} params.xref
 * @param {Array<Object>} params.newRefs - Output list, mutated in place.
 * @param {Array<string>} params.buffer - Scratch buffer, reset before use.
 */
static async #updateParentTag({
structTreeParent: { ref, dict },
tagDict,
newTagRef,
fallbackRef,
xref,
newRefs,
buffer,
}) {
// We get the parent of the tag.
const parentRef = dict.getRaw("P");
let parentDict = xref.fetchIfRef(parentRef);
tagDict.set("P", parentRef);
// We get the kids in order to insert a new tag at the right position.
let saveParentDict = false;
let parentKids;
let parentKidsRef = parentDict.getRaw("K");
if (!(parentKidsRef instanceof Ref)) {
// Inline kids: they are moved to a new object, so the parent (a copy of
// it) has to be rewritten too in order to point to that object.
parentKids = parentKidsRef;
parentKidsRef = xref.getNewTemporaryRef();
parentDict = parentDict.clone();
parentDict.set("K", parentKidsRef);
saveParentDict = true;
} else {
parentKids = xref.fetch(parentKidsRef);
}
if (Array.isArray(parentKids)) {
const index = parentKids.indexOf(ref);
if (index >= 0) {
// Work on a copy and put the new tag just after the existing element.
parentKids = parentKids.slice();
parentKids.splice(index + 1, 0, newTagRef);
} else {
// Nothing is written in this case: the new tag only gets the fallback
// parent and the function returns without touching `newRefs`.
warn("Cannot update the struct tree: parent kid not found.");
tagDict.set("P", fallbackRef);
return;
}
} else if (parentKids instanceof Dict) {
// Single kid: replace it by a two-entry array stored in a new object.
// NOTE(review): when the kid was inline, `parentKidsRef` is at this point
// the temporary ref allocated above, which nothing in this function
// writes - confirm that this case cannot happen or handle it.
// NOTE(review): when the kids were reached through a Ref, `parentDict` has
// not been cloned, so the `set` below changes the fetched dict - verify
// that this is intended.
parentKids = [parentKidsRef, newTagRef];
parentKidsRef = xref.getNewTemporaryRef();
parentDict.set("K", parentKidsRef);
saveParentDict = true;
}
// NOTE(review): a `parentKids` which is neither an array nor a Dict is
// passed unchanged to `writeObject` - confirm this is acceptable.
buffer.length = 0;
await writeObject(parentKidsRef, parentKids, buffer, xref);
newRefs.push({ ref: parentKidsRef, data: buffer.join("") });
if (!saveParentDict) {
return;
}
// The parent's "K" entry has changed, so the parent itself is rewritten.
buffer.length = 0;
await writeObject(parentRef, parentDict, buffer, xref);
newRefs.push({ ref: parentRef, data: buffer.join("") });
}
}
/**

View File

@ -42,6 +42,7 @@ import { clearGlobalCaches } from "./cleanup_helper.js";
import { incrementalUpdate } from "./writer.js";
import { MessageHandler } from "../shared/message_handler.js";
import { PDFWorkerStream } from "./worker_stream.js";
import { StructTreeRoot } from "./struct_tree.js";
class WorkerTask {
constructor(name) {
@ -542,24 +543,54 @@ class WorkerMessageHandler {
pdfManager.ensureDoc("startXRef"),
pdfManager.ensureDoc("xref"),
pdfManager.ensureDoc("linearization"),
pdfManager.ensureCatalog("structTreeRoot"),
];
const promises = [];
const newAnnotationsByPage = !isPureXfa
? getNewAnnotationsMap(annotationStorage)
: null;
const [stream, acroForm, acroFormRef, startXRef, xref, linearization] =
await Promise.all(globalPromises);
const [
stream,
acroForm,
acroFormRef,
startXRef,
xref,
linearization,
_structTreeRoot,
] = await Promise.all(globalPromises);
const catalogRef = xref.trailer.getRaw("Root") || null;
let structTreeRoot;
if (newAnnotationsByPage) {
if (!_structTreeRoot) {
if (
await StructTreeRoot.canCreateStructureTree({
catalogRef,
pdfManager,
newAnnotationsByPage,
})
) {
structTreeRoot = null;
}
} else if (
await _structTreeRoot.canUpdateStructTree({
pdfManager,
newAnnotationsByPage,
})
) {
structTreeRoot = _structTreeRoot;
}
const imagePromises = AnnotationFactory.generateImages(
annotationStorage.values(),
xref,
pdfManager.evaluatorOptions.isOffscreenCanvasSupported
);
const newAnnotationPromises =
structTreeRoot === undefined ? promises : [];
for (const [pageIndex, annotations] of newAnnotationsByPage) {
promises.push(
newAnnotationPromises.push(
pdfManager.getPage(pageIndex).then(page => {
const task = new WorkerTask(`Save (editor): page ${pageIndex}`);
return page
@ -570,6 +601,32 @@ class WorkerMessageHandler {
})
);
}
if (structTreeRoot === null) {
// No structTreeRoot exists, so we need to create one.
promises.push(
Promise.all(newAnnotationPromises).then(async newRefs => {
await StructTreeRoot.createStructureTree({
newAnnotationsByPage,
xref,
catalogRef,
pdfManager,
newRefs,
});
return newRefs;
})
);
} else if (structTreeRoot) {
promises.push(
Promise.all(newAnnotationPromises).then(async newRefs => {
await structTreeRoot.updateStructureTree({
newAnnotationsByPage,
pdfManager,
newRefs,
});
return newRefs;
})
);
}
}
if (isPureXfa) {
@ -643,7 +700,7 @@ class WorkerMessageHandler {
}
newXrefInfo = {
rootRef: xref.trailer.getRaw("Root") || null,
rootRef: catalogRef,
encryptRef: xref.trailer.getRaw("Encrypt") || null,
newRef: xref.getNewTemporaryRef(),
infoRef: xref.trailer.getRaw("Info") || null,

View File

@ -32,6 +32,8 @@ async function writeObject(ref, obj, buffer, { encrypt = null }) {
await writeDict(obj, buffer, transform);
} else if (obj instanceof BaseStream) {
await writeStream(obj, buffer, transform);
} else if (Array.isArray(obj)) {
await writeArray(obj, buffer, transform);
}
buffer.push("\nendobj\n");
}
@ -233,11 +235,7 @@ async function updateAcroform({
return;
}
// Clone the acroForm.
const dict = new Dict(xref);
for (const key of acroForm.getKeys()) {
dict.set(key, acroForm.getRaw(key));
}
const dict = acroForm.clone();
if (hasXfa && !hasXfaDatasetsEntry) {
// We've a XFA array which doesn't contain a datasets entry.

View File

@ -2297,6 +2297,114 @@ describe("api", function () {
await loadingTask.destroy();
});
// Adds a stamp (with accessibility data and a parent id) to a tagged pdf,
// saves it, reloads the result and checks where the new tag ended up.
it("write a new stamp annotation in a tagged pdf, save and check that the structure tree", async function () {
  if (isNodeJS) {
    pending("Cannot create a bitmap from Node.js.");
  }

  // Load the image used for the stamp.
  const TEST_IMAGES_PATH = "../images/";
  const filename = "firefox_logo.png";
  const imageUrl = new URL(`${TEST_IMAGES_PATH}${filename}`, window.location)
    .href;
  const imageResponse = await fetch(imageUrl);
  const bitmap = await createImageBitmap(await imageResponse.blob());

  // Add the stamp to the original document and save it.
  const originalTask = getDocument(buildGetDocumentParams("bug1823296.pdf"));
  const originalDoc = await originalTask.promise;
  const stamp = {
    annotationType: AnnotationEditorType.STAMP,
    rect: [128, 400, 148, 420],
    rotation: 0,
    bitmap,
    bitmapId: "im1",
    pageIndex: 0,
    structTreeParentId: "p3R_mc12",
    accessibilityData: {
      type: "Figure",
      alt: "Hello World",
    },
  };
  originalDoc.annotationStorage.setValue("pdfjs_internal_editor_0", stamp);
  const savedData = await originalDoc.saveDocument();
  await originalTask.destroy();

  // Reload the saved document and inspect its structure tree.
  const savedTask = getDocument(savedData);
  const savedDoc = await savedTask.promise;
  const firstPage = await savedDoc.getPage(1);
  const structTree = await firstPage.getStructTree();
  const expectedLeaf = {
    role: "Figure",
    children: [
      {
        type: "annotation",
        id: "pdfjs_internal_id_477R",
      },
    ],
    alt: "Hello World",
  };
  expect(structTree.children[0].children[6].children[1]).toEqual(
    expectedLeaf
  );

  await savedTask.destroy();
});
// Adds a stamp (with accessibility data but no parent id) to a pdf loaded
// from "empty.pdf", saves it, reloads the result and checks the whole
// structure tree of the first page.
it("write a new stamp annotation in a non-tagged pdf, save and check that the structure tree", async function () {
  if (isNodeJS) {
    pending("Cannot create a bitmap from Node.js.");
  }

  // Load the image used for the stamp.
  const TEST_IMAGES_PATH = "../images/";
  const filename = "firefox_logo.png";
  const imageUrl = new URL(`${TEST_IMAGES_PATH}${filename}`, window.location)
    .href;
  const imageResponse = await fetch(imageUrl);
  const bitmap = await createImageBitmap(await imageResponse.blob());

  // Add the stamp to the original document and save it.
  const originalTask = getDocument(buildGetDocumentParams("empty.pdf"));
  const originalDoc = await originalTask.promise;
  const stamp = {
    annotationType: AnnotationEditorType.STAMP,
    rect: [128, 400, 148, 420],
    rotation: 0,
    bitmap,
    bitmapId: "im1",
    pageIndex: 0,
    structTreeParentId: null,
    accessibilityData: {
      type: "Figure",
      alt: "Hello World",
    },
  };
  originalDoc.annotationStorage.setValue("pdfjs_internal_editor_0", stamp);
  const savedData = await originalDoc.saveDocument();
  await originalTask.destroy();

  // Reload the saved document and inspect its structure tree.
  const savedTask = getDocument(savedData);
  const savedDoc = await savedTask.promise;
  const firstPage = await savedDoc.getPage(1);
  const structTree = await firstPage.getStructTree();
  const expectedTree = {
    children: [
      {
        role: "Figure",
        children: [
          {
            type: "annotation",
            id: "pdfjs_internal_id_18R",
          },
        ],
        alt: "Hello World",
      },
    ],
    role: "Root",
  };
  expect(structTree).toEqual(expectedTree);

  await savedTask.destroy();
});
describe("Cross-origin", function () {
let loadingTask;
function _checkCanLoad(expectSuccess, filename, options) {