/* Copyright 2021 Mozilla Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import { AnnotationPrefix, stringToPDFString, warn } from "../shared/util.js"; import { Dict, isName, Name, Ref, RefSetCache } from "./primitives.js"; import { NumberTree } from "./name_number_tree.js"; const MAX_DEPTH = 40; const StructElementType = { PAGE_CONTENT: 1, STREAM_CONTENT: 2, OBJECT: 3, ANNOTATION: 4, ELEMENT: 5, }; class StructTreeRoot { constructor(rootDict) { this.dict = rootDict; this.roleMap = new Map(); this.structParentIds = null; } init() { this.readRoleMap(); } #addIdToPage(pageRef, id, type) { if (!(pageRef instanceof Ref) || id < 0) { return; } this.structParentIds ||= new RefSetCache(); let ids = this.structParentIds.get(pageRef); if (!ids) { ids = []; this.structParentIds.put(pageRef, ids); } ids.push([id, type]); } addAnnotationIdToPage(pageRef, id) { this.#addIdToPage(pageRef, id, StructElementType.ANNOTATION); } readRoleMap() { const roleMapDict = this.dict.get("RoleMap"); if (!(roleMapDict instanceof Dict)) { return; } roleMapDict.forEach((key, value) => { if (!(value instanceof Name)) { return; } this.roleMap.set(key, value.name); }); } } /** * Instead of loading the whole tree we load just the page's relevant structure * elements, which means we need a wrapper structure to represent the tree. */ class StructElementNode { constructor(tree, dict) { this.tree = tree; this.dict = dict; this.kids = []; this.parseKids(); } get role() { const nameObj = this.dict.get("S"); const name = nameObj instanceof Name ? nameObj.name : ""; const { root } = this.tree; if (root.roleMap.has(name)) { return root.roleMap.get(name); } return name; } parseKids() { let pageObjId = null; const objRef = this.dict.getRaw("Pg"); if (objRef instanceof Ref) { pageObjId = objRef.toString(); } const kids = this.dict.get("K"); if (Array.isArray(kids)) { for (const kid of kids) { const element = this.parseKid(pageObjId, kid); if (element) { this.kids.push(element); } } } else { const element = this.parseKid(pageObjId, kids); if (element) { this.kids.push(element); } } } parseKid(pageObjId, kid) { // A direct link to content, the integer is an mcid. if (Number.isInteger(kid)) { if (this.tree.pageDict.objId !== pageObjId) { return null; } return new StructElement({ type: StructElementType.PAGE_CONTENT, mcid: kid, pageObjId, }); } // Find the dictionary for the kid. let kidDict = null; if (kid instanceof Ref) { kidDict = this.dict.xref.fetch(kid); } else if (kid instanceof Dict) { kidDict = kid; } if (!kidDict) { return null; } const pageRef = kidDict.getRaw("Pg"); if (pageRef instanceof Ref) { pageObjId = pageRef.toString(); } const type = kidDict.get("Type") instanceof Name ? kidDict.get("Type").name : null; if (type === "MCR") { if (this.tree.pageDict.objId !== pageObjId) { return null; } const kidRef = kidDict.getRaw("Stm"); return new StructElement({ type: StructElementType.STREAM_CONTENT, refObjId: kidRef instanceof Ref ? kidRef.toString() : null, pageObjId, mcid: kidDict.get("MCID"), }); } if (type === "OBJR") { if (this.tree.pageDict.objId !== pageObjId) { return null; } const kidRef = kidDict.getRaw("Obj"); return new StructElement({ type: StructElementType.OBJECT, refObjId: kidRef instanceof Ref ? kidRef.toString() : null, pageObjId, }); } return new StructElement({ type: StructElementType.ELEMENT, dict: kidDict, }); } } class StructElement { constructor({ type, dict = null, mcid = null, pageObjId = null, refObjId = null, }) { this.type = type; this.dict = dict; this.mcid = mcid; this.pageObjId = pageObjId; this.refObjId = refObjId; this.parentNode = null; } } class StructTreePage { constructor(structTreeRoot, pageDict) { this.root = structTreeRoot; this.rootDict = structTreeRoot ? structTreeRoot.dict : null; this.pageDict = pageDict; this.nodes = []; } parse(pageRef) { if (!this.root || !this.rootDict) { return; } const parentTree = this.rootDict.get("ParentTree"); if (!parentTree) { return; } const id = this.pageDict.get("StructParents"); const ids = pageRef instanceof Ref && this.root.structParentIds?.get(pageRef); if (!Number.isInteger(id) && !ids) { return; } const map = new Map(); const numberTree = new NumberTree(parentTree, this.rootDict.xref); if (Number.isInteger(id)) { const parentArray = numberTree.get(id); if (Array.isArray(parentArray)) { for (const ref of parentArray) { if (ref instanceof Ref) { this.addNode(this.rootDict.xref.fetch(ref), map); } } } } if (!ids) { return; } for (const [elemId, type] of ids) { const obj = numberTree.get(elemId); if (obj) { const elem = this.addNode(this.rootDict.xref.fetchIfRef(obj), map); if ( elem?.kids?.length === 1 && elem.kids[0].type === StructElementType.OBJECT ) { // The node in the struct tree is wrapping an object (annotation // or xobject), so we need to update the type of the node to match // the type of the object. elem.kids[0].type = type; } } } } addNode(dict, map, level = 0) { if (level > MAX_DEPTH) { warn("StructTree MAX_DEPTH reached."); return null; } if (map.has(dict)) { return map.get(dict); } const element = new StructElementNode(this, dict); map.set(dict, element); const parent = dict.get("P"); if (!parent || isName(parent.get("Type"), "StructTreeRoot")) { if (!this.addTopLevelNode(dict, element)) { map.delete(dict); } return element; } const parentNode = this.addNode(parent, map, level + 1); if (!parentNode) { return element; } let save = false; for (const kid of parentNode.kids) { if (kid.type === StructElementType.ELEMENT && kid.dict === dict) { kid.parentNode = element; save = true; } } if (!save) { map.delete(dict); } return element; } addTopLevelNode(dict, element) { const obj = this.rootDict.get("K"); if (!obj) { return false; } if (obj instanceof Dict) { if (obj.objId !== dict.objId) { return false; } this.nodes[0] = element; return true; } if (!Array.isArray(obj)) { return true; } let save = false; for (let i = 0; i < obj.length; i++) { const kidRef = obj[i]; if (kidRef?.toString() === dict.objId) { this.nodes[i] = element; save = true; } } return save; } /** * Convert the tree structure into a simplified object literal that can * be sent to the main thread. * @returns {Object} */ get serializable() { function nodeToSerializable(node, parent, level = 0) { if (level > MAX_DEPTH) { warn("StructTree too deep to be fully serialized."); return; } const obj = Object.create(null); obj.role = node.role; obj.children = []; parent.children.push(obj); const alt = node.dict.get("Alt"); if (typeof alt === "string") { obj.alt = stringToPDFString(alt); } const lang = node.dict.get("Lang"); if (typeof lang === "string") { obj.lang = stringToPDFString(lang); } for (const kid of node.kids) { const kidElement = kid.type === StructElementType.ELEMENT ? kid.parentNode : null; if (kidElement) { nodeToSerializable(kidElement, obj, level + 1); continue; } else if ( kid.type === StructElementType.PAGE_CONTENT || kid.type === StructElementType.STREAM_CONTENT ) { obj.children.push({ type: "content", id: `p${kid.pageObjId}_mc${kid.mcid}`, }); } else if (kid.type === StructElementType.OBJECT) { obj.children.push({ type: "object", id: kid.refObjId, }); } else if (kid.type === StructElementType.ANNOTATION) { obj.children.push({ type: "annotation", id: `${AnnotationPrefix}${kid.refObjId}`, }); } } } const root = Object.create(null); root.children = []; root.role = "Root"; for (const child of this.nodes) { if (!child) { continue; } nodeToSerializable(child, root); } return root; } } export { StructTreePage, StructTreeRoot };