Merge pull request #13171 from brendandahl/struct-tree
[api-minor] Add support for basic structure tree for accessibility.
This commit is contained in:
		
						commit
						03c8c89002
					
				| @ -58,6 +58,7 @@ import { calculateMD5 } from "./crypto.js"; | |||||||
| import { Linearization } from "./parser.js"; | import { Linearization } from "./parser.js"; | ||||||
| import { OperatorList } from "./operator_list.js"; | import { OperatorList } from "./operator_list.js"; | ||||||
| import { PartialEvaluator } from "./evaluator.js"; | import { PartialEvaluator } from "./evaluator.js"; | ||||||
|  | import { StructTreePage } from "./struct_tree.js"; | ||||||
| import { XFAFactory } from "./xfa/factory.js"; | import { XFAFactory } from "./xfa/factory.js"; | ||||||
| 
 | 
 | ||||||
| const DEFAULT_USER_UNIT = 1.0; | const DEFAULT_USER_UNIT = 1.0; | ||||||
| @ -104,6 +105,10 @@ class Page { | |||||||
|       static createObjId() { |       static createObjId() { | ||||||
|         return `p${pageIndex}_${++idCounters.obj}`; |         return `p${pageIndex}_${++idCounters.obj}`; | ||||||
|       } |       } | ||||||
|  | 
 | ||||||
|  |       static getPageObjId() { | ||||||
|  |         return `page${ref.toString()}`; | ||||||
|  |       } | ||||||
|     }; |     }; | ||||||
|   } |   } | ||||||
| 
 | 
 | ||||||
| @ -406,6 +411,7 @@ class Page { | |||||||
|     handler, |     handler, | ||||||
|     task, |     task, | ||||||
|     normalizeWhitespace, |     normalizeWhitespace, | ||||||
|  |     includeMarkedContent, | ||||||
|     sink, |     sink, | ||||||
|     combineTextItems, |     combineTextItems, | ||||||
|   }) { |   }) { | ||||||
| @ -437,12 +443,22 @@ class Page { | |||||||
|         task, |         task, | ||||||
|         resources: this.resources, |         resources: this.resources, | ||||||
|         normalizeWhitespace, |         normalizeWhitespace, | ||||||
|  |         includeMarkedContent, | ||||||
|         combineTextItems, |         combineTextItems, | ||||||
|         sink, |         sink, | ||||||
|       }); |       }); | ||||||
|     }); |     }); | ||||||
|   } |   } | ||||||
| 
 | 
 | ||||||
|  |   async getStructTree() { | ||||||
|  |     const structTreeRoot = await this.pdfManager.ensureCatalog( | ||||||
|  |       "structTreeRoot" | ||||||
|  |     ); | ||||||
|  |     const tree = new StructTreePage(structTreeRoot, this.pageDict); | ||||||
|  |     tree.parse(); | ||||||
|  |     return tree; | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|   getAnnotationsData(intent) { |   getAnnotationsData(intent) { | ||||||
|     return this._parsedAnnotations.then(function (annotations) { |     return this._parsedAnnotations.then(function (annotations) { | ||||||
|       const annotationsData = []; |       const annotationsData = []; | ||||||
| @ -604,6 +620,10 @@ class PDFDocument { | |||||||
|       static createObjId() { |       static createObjId() { | ||||||
|         unreachable("Abstract method `createObjId` called."); |         unreachable("Abstract method `createObjId` called."); | ||||||
|       } |       } | ||||||
|  | 
 | ||||||
|  |       static getPageObjId() { | ||||||
|  |         unreachable("Abstract method `getPageObjId` called."); | ||||||
|  |       } | ||||||
|     }; |     }; | ||||||
|   } |   } | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -1913,7 +1913,10 @@ class PartialEvaluator { | |||||||
|               return; |               return; | ||||||
|             } |             } | ||||||
|             // Other marked content types aren't supported yet.
 |             // Other marked content types aren't supported yet.
 | ||||||
|             args = [args[0].name]; |             args = [ | ||||||
|  |               args[0].name, | ||||||
|  |               args[1] instanceof Dict ? args[1].get("MCID") : null, | ||||||
|  |             ]; | ||||||
| 
 | 
 | ||||||
|             break; |             break; | ||||||
|           case OPS.beginMarkedContent: |           case OPS.beginMarkedContent: | ||||||
| @ -1973,6 +1976,7 @@ class PartialEvaluator { | |||||||
|     stateManager = null, |     stateManager = null, | ||||||
|     normalizeWhitespace = false, |     normalizeWhitespace = false, | ||||||
|     combineTextItems = false, |     combineTextItems = false, | ||||||
|  |     includeMarkedContent = false, | ||||||
|     sink, |     sink, | ||||||
|     seenStyles = new Set(), |     seenStyles = new Set(), | ||||||
|   }) { |   }) { | ||||||
| @ -2573,6 +2577,7 @@ class PartialEvaluator { | |||||||
|                     stateManager: xObjStateManager, |                     stateManager: xObjStateManager, | ||||||
|                     normalizeWhitespace, |                     normalizeWhitespace, | ||||||
|                     combineTextItems, |                     combineTextItems, | ||||||
|  |                     includeMarkedContent, | ||||||
|                     sink: sinkWrapper, |                     sink: sinkWrapper, | ||||||
|                     seenStyles, |                     seenStyles, | ||||||
|                   }) |                   }) | ||||||
| @ -2650,6 +2655,38 @@ class PartialEvaluator { | |||||||
|               }) |               }) | ||||||
|             ); |             ); | ||||||
|             return; |             return; | ||||||
|  |           case OPS.beginMarkedContent: | ||||||
|  |             if (includeMarkedContent) { | ||||||
|  |               textContent.items.push({ | ||||||
|  |                 type: "beginMarkedContent", | ||||||
|  |                 tag: isName(args[0]) ? args[0].name : null, | ||||||
|  |               }); | ||||||
|  |             } | ||||||
|  |             break; | ||||||
|  |           case OPS.beginMarkedContentProps: | ||||||
|  |             if (includeMarkedContent) { | ||||||
|  |               flushTextContentItem(); | ||||||
|  |               let mcid = null; | ||||||
|  |               if (isDict(args[1])) { | ||||||
|  |                 mcid = args[1].get("MCID"); | ||||||
|  |               } | ||||||
|  |               textContent.items.push({ | ||||||
|  |                 type: "beginMarkedContentProps", | ||||||
|  |                 id: Number.isInteger(mcid) | ||||||
|  |                   ? `${self.idFactory.getPageObjId()}_mcid${mcid}` | ||||||
|  |                   : null, | ||||||
|  |                 tag: isName(args[0]) ? args[0].name : null, | ||||||
|  |               }); | ||||||
|  |             } | ||||||
|  |             break; | ||||||
|  |           case OPS.endMarkedContent: | ||||||
|  |             if (includeMarkedContent) { | ||||||
|  |               flushTextContentItem(); | ||||||
|  |               textContent.items.push({ | ||||||
|  |                 type: "endMarkedContent", | ||||||
|  |               }); | ||||||
|  |             } | ||||||
|  |             break; | ||||||
|         } // switch
 |         } // switch
 | ||||||
|         if (textContent.items.length >= sink.desiredSize) { |         if (textContent.items.length >= sink.desiredSize) { | ||||||
|           // Wait for ready, if we reach highWaterMark.
 |           // Wait for ready, if we reach highWaterMark.
 | ||||||
|  | |||||||
| @ -60,6 +60,7 @@ import { CipherTransformFactory } from "./crypto.js"; | |||||||
| import { ColorSpace } from "./colorspace.js"; | import { ColorSpace } from "./colorspace.js"; | ||||||
| import { GlobalImageCache } from "./image_utils.js"; | import { GlobalImageCache } from "./image_utils.js"; | ||||||
| import { MetadataParser } from "./metadata_parser.js"; | import { MetadataParser } from "./metadata_parser.js"; | ||||||
|  | import { StructTreeRoot } from "./struct_tree.js"; | ||||||
| 
 | 
 | ||||||
| function fetchDestination(dest) { | function fetchDestination(dest) { | ||||||
|   return isDict(dest) ? dest.get("D") : dest; |   return isDict(dest) ? dest.get("D") : dest; | ||||||
| @ -200,6 +201,32 @@ class Catalog { | |||||||
|     return markInfo; |     return markInfo; | ||||||
|   } |   } | ||||||
| 
 | 
 | ||||||
|  |   get structTreeRoot() { | ||||||
|  |     let structTree = null; | ||||||
|  |     try { | ||||||
|  |       structTree = this._readStructTreeRoot(); | ||||||
|  |     } catch (ex) { | ||||||
|  |       if (ex instanceof MissingDataException) { | ||||||
|  |         throw ex; | ||||||
|  |       } | ||||||
|  |       warn("Unable read to structTreeRoot info."); | ||||||
|  |     } | ||||||
|  |     return shadow(this, "structTreeRoot", structTree); | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   /** | ||||||
|  |    * @private | ||||||
|  |    */ | ||||||
|  |   _readStructTreeRoot() { | ||||||
|  |     const obj = this._catDict.get("StructTreeRoot"); | ||||||
|  |     if (!isDict(obj)) { | ||||||
|  |       return null; | ||||||
|  |     } | ||||||
|  |     const root = new StructTreeRoot(obj); | ||||||
|  |     root.init(); | ||||||
|  |     return root; | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|   get toplevelPagesDict() { |   get toplevelPagesDict() { | ||||||
|     const pagesObj = this._catDict.get("Pages"); |     const pagesObj = this._catDict.get("Pages"); | ||||||
|     if (!isDict(pagesObj)) { |     if (!isDict(pagesObj)) { | ||||||
| @ -2626,4 +2653,4 @@ const ObjectLoader = (function () { | |||||||
|   return ObjectLoader; |   return ObjectLoader; | ||||||
| })(); | })(); | ||||||
| 
 | 
 | ||||||
| export { Catalog, FileSpec, ObjectLoader, XRef }; | export { Catalog, FileSpec, NumberTree, ObjectLoader, XRef }; | ||||||
|  | |||||||
							
								
								
									
										335
									
								
								src/core/struct_tree.js
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										335
									
								
								src/core/struct_tree.js
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,335 @@ | |||||||
|  | /* Copyright 2021 Mozilla Foundation | ||||||
|  |  * | ||||||
|  |  * Licensed under the Apache License, Version 2.0 (the "License"); | ||||||
|  |  * you may not use this file except in compliance with the License. | ||||||
|  |  * You may obtain a copy of the License at | ||||||
|  |  * | ||||||
|  |  *     http://www.apache.org/licenses/LICENSE-2.0
 | ||||||
|  |  * | ||||||
|  |  * Unless required by applicable law or agreed to in writing, software | ||||||
|  |  * distributed under the License is distributed on an "AS IS" BASIS, | ||||||
|  |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||||
|  |  * See the License for the specific language governing permissions and | ||||||
|  |  * limitations under the License. | ||||||
|  |  */ | ||||||
|  | 
 | ||||||
|  | import { isDict, isName, isRef } from "./primitives.js"; | ||||||
|  | import { isString, stringToPDFString, warn } from "../shared/util.js"; | ||||||
|  | import { NumberTree } from "./obj.js"; | ||||||
|  | 
 | ||||||
|  | const MAX_DEPTH = 40; | ||||||
|  | 
 | ||||||
|  | const StructElementType = { | ||||||
|  |   PAGE_CONTENT: "PAGE_CONTENT", | ||||||
|  |   STREAM_CONTENT: "STREAM_CONTENT", | ||||||
|  |   OBJECT: "OBJECT", | ||||||
|  |   ELEMENT: "ELEMENT", | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | class StructTreeRoot { | ||||||
|  |   constructor(rootDict) { | ||||||
|  |     this.dict = rootDict; | ||||||
|  |     this.roleMap = new Map(); | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   init() { | ||||||
|  |     this.readRoleMap(); | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   readRoleMap() { | ||||||
|  |     const roleMapDict = this.dict.get("RoleMap"); | ||||||
|  |     if (!isDict(roleMapDict)) { | ||||||
|  |       return; | ||||||
|  |     } | ||||||
|  |     roleMapDict.forEach((key, value) => { | ||||||
|  |       if (!isName(value)) { | ||||||
|  |         return; | ||||||
|  |       } | ||||||
|  |       this.roleMap.set(key, value.name); | ||||||
|  |     }); | ||||||
|  |   } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /** | ||||||
|  |  * Instead of loading the whole tree we load just the page's relevant structure | ||||||
|  |  * elements, which means we need a wrapper structure to represent the tree. | ||||||
|  |  */ | ||||||
|  | class StructElementNode { | ||||||
|  |   constructor(tree, dict) { | ||||||
|  |     this.tree = tree; | ||||||
|  |     this.dict = dict; | ||||||
|  |     this.kids = []; | ||||||
|  |     this.parseKids(); | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   get role() { | ||||||
|  |     const nameObj = this.dict.get("S"); | ||||||
|  |     const name = isName(nameObj) ? nameObj.name : ""; | ||||||
|  |     const { root } = this.tree; | ||||||
|  |     if (root.roleMap.has(name)) { | ||||||
|  |       return root.roleMap.get(name); | ||||||
|  |     } | ||||||
|  |     return name; | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   parseKids() { | ||||||
|  |     let pageObjId = null; | ||||||
|  |     const objRef = this.dict.getRaw("Pg"); | ||||||
|  |     if (isRef(objRef)) { | ||||||
|  |       pageObjId = objRef.toString(); | ||||||
|  |     } | ||||||
|  |     const kids = this.dict.get("K"); | ||||||
|  |     if (Array.isArray(kids)) { | ||||||
|  |       for (const kid of kids) { | ||||||
|  |         const element = this.parseKid(pageObjId, kid); | ||||||
|  |         if (element) { | ||||||
|  |           this.kids.push(element); | ||||||
|  |         } | ||||||
|  |       } | ||||||
|  |     } else { | ||||||
|  |       const element = this.parseKid(pageObjId, kids); | ||||||
|  |       if (element) { | ||||||
|  |         this.kids.push(element); | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   parseKid(pageObjId, kid) { | ||||||
|  |     // A direct link to content, the integer is an mcid.
 | ||||||
|  |     if (Number.isInteger(kid)) { | ||||||
|  |       if (this.tree.pageDict.objId !== pageObjId) { | ||||||
|  |         return null; | ||||||
|  |       } | ||||||
|  | 
 | ||||||
|  |       return new StructElement({ | ||||||
|  |         type: StructElementType.PAGE_CONTENT, | ||||||
|  |         mcid: kid, | ||||||
|  |         pageObjId, | ||||||
|  |       }); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     // Find the dictionary for the kid.
 | ||||||
|  |     let kidDict = null; | ||||||
|  |     if (isRef(kid)) { | ||||||
|  |       kidDict = this.dict.xref.fetch(kid); | ||||||
|  |     } else if (isDict(kid)) { | ||||||
|  |       kidDict = kid; | ||||||
|  |     } | ||||||
|  |     if (!kidDict) { | ||||||
|  |       return null; | ||||||
|  |     } | ||||||
|  |     const pageRef = kidDict.getRaw("Pg"); | ||||||
|  |     if (isRef(pageRef)) { | ||||||
|  |       pageObjId = pageRef.toString(); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     const type = isName(kidDict.get("Type")) ? kidDict.get("Type").name : null; | ||||||
|  |     if (type === "MCR") { | ||||||
|  |       if (this.tree.pageDict.objId !== pageObjId) { | ||||||
|  |         return null; | ||||||
|  |       } | ||||||
|  |       return new StructElement({ | ||||||
|  |         type: StructElementType.STREAM_CONTENT, | ||||||
|  |         refObjId: isRef(kidDict.getRaw("Stm")) | ||||||
|  |           ? kidDict.getRaw("Stm").toString() | ||||||
|  |           : null, | ||||||
|  |         pageObjId, | ||||||
|  |         mcid: kidDict.get("MCID"), | ||||||
|  |       }); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     if (type === "OBJR") { | ||||||
|  |       if (this.tree.pageDict.objId !== pageObjId) { | ||||||
|  |         return null; | ||||||
|  |       } | ||||||
|  |       return new StructElement({ | ||||||
|  |         type: StructElementType.OBJECT, | ||||||
|  |         refObjId: isRef(kidDict.getRaw("Obj")) | ||||||
|  |           ? kidDict.getRaw("Obj").toString() | ||||||
|  |           : null, | ||||||
|  |         pageObjId, | ||||||
|  |       }); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     return new StructElement({ | ||||||
|  |       type: StructElementType.ELEMENT, | ||||||
|  |       dict: kidDict, | ||||||
|  |     }); | ||||||
|  |   } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | class StructElement { | ||||||
|  |   constructor({ | ||||||
|  |     type, | ||||||
|  |     dict = null, | ||||||
|  |     mcid = null, | ||||||
|  |     pageObjId = null, | ||||||
|  |     refObjId = null, | ||||||
|  |   }) { | ||||||
|  |     this.type = type; | ||||||
|  |     this.dict = dict; | ||||||
|  |     this.mcid = mcid; | ||||||
|  |     this.pageObjId = pageObjId; | ||||||
|  |     this.refObjId = refObjId; | ||||||
|  |     this.parentNode = null; | ||||||
|  |   } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | class StructTreePage { | ||||||
|  |   constructor(structTreeRoot, pageDict) { | ||||||
|  |     this.root = structTreeRoot; | ||||||
|  |     this.rootDict = structTreeRoot ? structTreeRoot.dict : null; | ||||||
|  |     this.pageDict = pageDict; | ||||||
|  |     this.nodes = []; | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   parse() { | ||||||
|  |     if (!this.root || !this.rootDict) { | ||||||
|  |       return; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     const parentTree = this.rootDict.get("ParentTree"); | ||||||
|  |     if (!parentTree) { | ||||||
|  |       return; | ||||||
|  |     } | ||||||
|  |     const id = this.pageDict.get("StructParents"); | ||||||
|  |     if (!Number.isInteger(id)) { | ||||||
|  |       return; | ||||||
|  |     } | ||||||
|  |     const numberTree = new NumberTree(parentTree, this.rootDict.xref); | ||||||
|  |     const parentArray = numberTree.get(id); | ||||||
|  |     if (!Array.isArray(parentArray)) { | ||||||
|  |       return; | ||||||
|  |     } | ||||||
|  |     const map = new Map(); | ||||||
|  |     for (const ref of parentArray) { | ||||||
|  |       if (isRef(ref)) { | ||||||
|  |         this.addNode(this.rootDict.xref.fetch(ref), map); | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   addNode(dict, map, level = 0) { | ||||||
|  |     if (level > MAX_DEPTH) { | ||||||
|  |       warn("StructTree MAX_DEPTH reached."); | ||||||
|  |       return null; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     if (map.has(dict)) { | ||||||
|  |       return map.get(dict); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     const element = new StructElementNode(this, dict); | ||||||
|  |     map.set(dict, element); | ||||||
|  | 
 | ||||||
|  |     const parent = dict.get("P"); | ||||||
|  | 
 | ||||||
|  |     if (!parent || isName(parent.get("Type"), "StructTreeRoot")) { | ||||||
|  |       if (!this.addTopLevelNode(dict, element)) { | ||||||
|  |         map.delete(dict); | ||||||
|  |       } | ||||||
|  |       return element; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     const parentNode = this.addNode(parent, map, level + 1); | ||||||
|  |     if (!parentNode) { | ||||||
|  |       return element; | ||||||
|  |     } | ||||||
|  |     let save = false; | ||||||
|  |     for (const kid of parentNode.kids) { | ||||||
|  |       if (kid.type === StructElementType.ELEMENT && kid.dict === dict) { | ||||||
|  |         kid.parentNode = element; | ||||||
|  |         save = true; | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |     if (!save) { | ||||||
|  |       map.delete(dict); | ||||||
|  |     } | ||||||
|  |     return element; | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   addTopLevelNode(dict, element) { | ||||||
|  |     const obj = this.rootDict.get("K"); | ||||||
|  |     if (!obj) { | ||||||
|  |       return false; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     if (isDict(obj)) { | ||||||
|  |       if (obj.objId !== dict.objId) { | ||||||
|  |         return false; | ||||||
|  |       } | ||||||
|  |       this.nodes[0] = element; | ||||||
|  |       return true; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     if (!Array.isArray(obj)) { | ||||||
|  |       return true; | ||||||
|  |     } | ||||||
|  |     let save = false; | ||||||
|  |     for (let i = 0; i < obj.length; i++) { | ||||||
|  |       const kidRef = obj[i]; | ||||||
|  |       if (kidRef && kidRef.toString() === dict.objId) { | ||||||
|  |         this.nodes[i] = element; | ||||||
|  |         save = true; | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |     return save; | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   /** | ||||||
|  |    * Convert the tree structure into a simplified object literal that can | ||||||
|  |    * be sent to the main thread. | ||||||
|  |    * @returns {Object} | ||||||
|  |    */ | ||||||
|  |   get serializable() { | ||||||
|  |     function nodeToSerializable(node, parent, level = 0) { | ||||||
|  |       if (level > MAX_DEPTH) { | ||||||
|  |         warn("StructTree too deep to be fully serialized."); | ||||||
|  |         return; | ||||||
|  |       } | ||||||
|  |       const obj = Object.create(null); | ||||||
|  |       obj.role = node.role; | ||||||
|  |       obj.children = []; | ||||||
|  |       parent.children.push(obj); | ||||||
|  |       const alt = node.dict.get("Alt"); | ||||||
|  |       if (isString(alt)) { | ||||||
|  |         obj.alt = stringToPDFString(alt); | ||||||
|  |       } | ||||||
|  | 
 | ||||||
|  |       for (const kid of node.kids) { | ||||||
|  |         const kidElement = | ||||||
|  |           kid.type === StructElementType.ELEMENT ? kid.parentNode : null; | ||||||
|  |         if (kidElement) { | ||||||
|  |           nodeToSerializable(kidElement, obj, level + 1); | ||||||
|  |           continue; | ||||||
|  |         } else if ( | ||||||
|  |           kid.type === StructElementType.PAGE_CONTENT || | ||||||
|  |           kid.type === StructElementType.STREAM_CONTENT | ||||||
|  |         ) { | ||||||
|  |           obj.children.push({ | ||||||
|  |             type: "content", | ||||||
|  |             id: `page${kid.pageObjId}_mcid${kid.mcid}`, | ||||||
|  |           }); | ||||||
|  |         } else if (kid.type === StructElementType.OBJECT) { | ||||||
|  |           obj.children.push({ | ||||||
|  |             type: "object", | ||||||
|  |             id: kid.refObjId, | ||||||
|  |           }); | ||||||
|  |         } | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     const root = Object.create(null); | ||||||
|  |     root.children = []; | ||||||
|  |     root.role = "Root"; | ||||||
|  |     for (const child of this.nodes) { | ||||||
|  |       if (!child) { | ||||||
|  |         continue; | ||||||
|  |       } | ||||||
|  |       nodeToSerializable(child, root); | ||||||
|  |     } | ||||||
|  |     return root; | ||||||
|  |   } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | export { StructTreePage, StructTreeRoot }; | ||||||
| @ -717,6 +717,7 @@ class WorkerMessageHandler { | |||||||
|             task, |             task, | ||||||
|             sink, |             sink, | ||||||
|             normalizeWhitespace: data.normalizeWhitespace, |             normalizeWhitespace: data.normalizeWhitespace, | ||||||
|  |             includeMarkedContent: data.includeMarkedContent, | ||||||
|             combineTextItems: data.combineTextItems, |             combineTextItems: data.combineTextItems, | ||||||
|           }) |           }) | ||||||
|           .then( |           .then( | ||||||
| @ -745,6 +746,18 @@ class WorkerMessageHandler { | |||||||
|       }); |       }); | ||||||
|     }); |     }); | ||||||
| 
 | 
 | ||||||
|  |     handler.on("GetStructTree", function wphGetStructTree(data) { | ||||||
|  |       const pageIndex = data.pageIndex; | ||||||
|  |       return pdfManager | ||||||
|  |         .getPage(pageIndex) | ||||||
|  |         .then(function (page) { | ||||||
|  |           return pdfManager.ensure(page, "getStructTree"); | ||||||
|  |         }) | ||||||
|  |         .then(function (structTree) { | ||||||
|  |           return structTree.serializable; | ||||||
|  |         }); | ||||||
|  |     }); | ||||||
|  | 
 | ||||||
|     handler.on("FontFallback", function (data) { |     handler.on("FontFallback", function (data) { | ||||||
|       return pdfManager.fontFallback(data.id, handler); |       return pdfManager.fontFallback(data.id, handler); | ||||||
|     }); |     }); | ||||||
|  | |||||||
| @ -1026,13 +1026,17 @@ class PDFDocumentProxy { | |||||||
|  *   whitespace with standard spaces (0x20). The default value is `false`. |  *   whitespace with standard spaces (0x20). The default value is `false`. | ||||||
|  * @property {boolean} disableCombineTextItems - Do not attempt to combine |  * @property {boolean} disableCombineTextItems - Do not attempt to combine | ||||||
|  *   same line {@link TextItem}'s. The default value is `false`. |  *   same line {@link TextItem}'s. The default value is `false`. | ||||||
|  |  * @property {boolean} [includeMarkedContent] - When true include marked | ||||||
|  |  *   content items in the items array of TextContent. The default is `false`. | ||||||
|  */ |  */ | ||||||
| 
 | 
 | ||||||
| /** | /** | ||||||
|  * Page text content. |  * Page text content. | ||||||
|  * |  * | ||||||
|  * @typedef {Object} TextContent |  * @typedef {Object} TextContent | ||||||
|  * @property {Array<TextItem>} items - Array of {@link TextItem} objects. |  * @property {Array<TextItem | TextMarkedContent>} items - Array of | ||||||
|  |  *   {@link TextItem} and {@link TextMarkedContent} objects. TextMarkedContent | ||||||
|  |  *   items are included when includeMarkedContent is true. | ||||||
|  * @property {Object<string, TextStyle>} styles - {@link TextStyle} objects, |  * @property {Object<string, TextStyle>} styles - {@link TextStyle} objects, | ||||||
|  *   indexed by font name. |  *   indexed by font name. | ||||||
|  */ |  */ | ||||||
| @ -1047,6 +1051,17 @@ class PDFDocumentProxy { | |||||||
|  * @property {number} width - Width in device space. |  * @property {number} width - Width in device space. | ||||||
|  * @property {number} height - Height in device space. |  * @property {number} height - Height in device space. | ||||||
|  * @property {string} fontName - Font name used by PDF.js for converted font. |  * @property {string} fontName - Font name used by PDF.js for converted font. | ||||||
|  |  * | ||||||
|  |  */ | ||||||
|  | 
 | ||||||
|  | /** | ||||||
|  |  * Page text marked content part. | ||||||
|  |  * | ||||||
|  |  * @typedef {Object} TextMarkedContent | ||||||
|  |  * @property {string} type - Either 'beginMarkedContent', | ||||||
|  |  *   'beginMarkedContentProps', or 'endMarkedContent'. | ||||||
|  |  * @property {string} id - The marked content identifier. Only used for type | ||||||
|  |  *   'beginMarkedContentProps'. | ||||||
|  */ |  */ | ||||||
| 
 | 
 | ||||||
| /** | /** | ||||||
| @ -1103,6 +1118,25 @@ class PDFDocumentProxy { | |||||||
|  *   states set. |  *   states set. | ||||||
|  */ |  */ | ||||||
| 
 | 
 | ||||||
|  | /** | ||||||
|  |  * Structure tree node. The root node will have a role "Root". | ||||||
|  |  * | ||||||
|  |  * @typedef {Object} StructTreeNode | ||||||
|  |  * @property {Array<StructTreeNode | StructTreeContent>} children - Array of | ||||||
|  |  *   {@link StructTreeNode} and {@link StructTreeContent} objects. | ||||||
|  |  * @property {string} role - element's role, already mapped if a role map exists | ||||||
|  |  * in the PDF. | ||||||
|  |  */ | ||||||
|  | 
 | ||||||
|  | /** | ||||||
|  |  * Structure tree content. | ||||||
|  |  * | ||||||
|  |  * @typedef {Object} StructTreeContent | ||||||
|  |  * @property {string} type - either "content" for page and stream structure | ||||||
|  |  *   elements or "object" for object references. | ||||||
|  |  * @property {string} id - unique id that will map to the text layer. | ||||||
|  |  */ | ||||||
|  | 
 | ||||||
| /** | /** | ||||||
|  * PDF page operator list. |  * PDF page operator list. | ||||||
|  * |  * | ||||||
| @ -1435,6 +1469,7 @@ class PDFPageProxy { | |||||||
|   streamTextContent({ |   streamTextContent({ | ||||||
|     normalizeWhitespace = false, |     normalizeWhitespace = false, | ||||||
|     disableCombineTextItems = false, |     disableCombineTextItems = false, | ||||||
|  |     includeMarkedContent = false, | ||||||
|   } = {}) { |   } = {}) { | ||||||
|     const TEXT_CONTENT_CHUNK_SIZE = 100; |     const TEXT_CONTENT_CHUNK_SIZE = 100; | ||||||
| 
 | 
 | ||||||
| @ -1444,6 +1479,7 @@ class PDFPageProxy { | |||||||
|         pageIndex: this._pageIndex, |         pageIndex: this._pageIndex, | ||||||
|         normalizeWhitespace: normalizeWhitespace === true, |         normalizeWhitespace: normalizeWhitespace === true, | ||||||
|         combineTextItems: disableCombineTextItems !== true, |         combineTextItems: disableCombineTextItems !== true, | ||||||
|  |         includeMarkedContent: includeMarkedContent === true, | ||||||
|       }, |       }, | ||||||
|       { |       { | ||||||
|         highWaterMark: TEXT_CONTENT_CHUNK_SIZE, |         highWaterMark: TEXT_CONTENT_CHUNK_SIZE, | ||||||
| @ -1484,6 +1520,16 @@ class PDFPageProxy { | |||||||
|     }); |     }); | ||||||
|   } |   } | ||||||
| 
 | 
 | ||||||
|  |   /** | ||||||
|  |    * @returns {Promise<StructTreeNode>} A promise that is resolved with a | ||||||
|  |    *   {@link StructTreeNode} object that represents the page's structure tree. | ||||||
|  |    */ | ||||||
|  |   getStructTree() { | ||||||
|  |     return (this._structTreePromise ||= this._transport.getStructTree( | ||||||
|  |       this._pageIndex | ||||||
|  |     )); | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|   /** |   /** | ||||||
|    * Destroys the page object. |    * Destroys the page object. | ||||||
|    * @private |    * @private | ||||||
| @ -1513,6 +1559,7 @@ class PDFPageProxy { | |||||||
|     this._annotationsPromise = null; |     this._annotationsPromise = null; | ||||||
|     this._jsActionsPromise = null; |     this._jsActionsPromise = null; | ||||||
|     this._xfaPromise = null; |     this._xfaPromise = null; | ||||||
|  |     this._structTreePromise = null; | ||||||
|     this.pendingCleanup = false; |     this.pendingCleanup = false; | ||||||
|     return Promise.all(waitOn); |     return Promise.all(waitOn); | ||||||
|   } |   } | ||||||
| @ -1548,6 +1595,7 @@ class PDFPageProxy { | |||||||
|     this._annotationsPromise = null; |     this._annotationsPromise = null; | ||||||
|     this._jsActionsPromise = null; |     this._jsActionsPromise = null; | ||||||
|     this._xfaPromise = null; |     this._xfaPromise = null; | ||||||
|  |     this._structTreePromise = null; | ||||||
|     if (resetStats && this._stats) { |     if (resetStats && this._stats) { | ||||||
|       this._stats = new StatTimer(); |       this._stats = new StatTimer(); | ||||||
|     } |     } | ||||||
| @ -2773,6 +2821,12 @@ class WorkerTransport { | |||||||
|     }); |     }); | ||||||
|   } |   } | ||||||
| 
 | 
 | ||||||
|  |   getStructTree(pageIndex) { | ||||||
|  |     return this.messageHandler.sendWithPromise("GetStructTree", { | ||||||
|  |       pageIndex, | ||||||
|  |     }); | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|   getOutline() { |   getOutline() { | ||||||
|     return this.messageHandler.sendWithPromise("GetOutline", null); |     return this.messageHandler.sendWithPromise("GetOutline", null); | ||||||
|   } |   } | ||||||
|  | |||||||
| @ -638,6 +638,23 @@ const renderTextLayer = (function renderTextLayerClosure() { | |||||||
| 
 | 
 | ||||||
|     _processItems(items, styleCache) { |     _processItems(items, styleCache) { | ||||||
|       for (let i = 0, len = items.length; i < len; i++) { |       for (let i = 0, len = items.length; i < len; i++) { | ||||||
|  |         if (items[i].str === undefined) { | ||||||
|  |           if ( | ||||||
|  |             items[i].type === "beginMarkedContentProps" || | ||||||
|  |             items[i].type === "beginMarkedContent" | ||||||
|  |           ) { | ||||||
|  |             const parent = this._container; | ||||||
|  |             this._container = document.createElement("span"); | ||||||
|  |             this._container.classList.add("markedContent"); | ||||||
|  |             if (items[i].id !== null) { | ||||||
|  |               this._container.setAttribute("id", `${items[i].id}`); | ||||||
|  |             } | ||||||
|  |             parent.appendChild(this._container); | ||||||
|  |           } else if (items[i].type === "endMarkedContent") { | ||||||
|  |             this._container = this._container.parentNode; | ||||||
|  |           } | ||||||
|  |           continue; | ||||||
|  |         } | ||||||
|         this._textContentItemsStr.push(items[i].str); |         this._textContentItemsStr.push(items[i].str); | ||||||
|         appendText(this, items[i], styleCache, this._layoutTextCtx); |         appendText(this, items[i], styleCache, this._layoutTextCtx); | ||||||
|       } |       } | ||||||
|  | |||||||
| @ -572,6 +572,7 @@ var Driver = (function DriverClosure() { | |||||||
|                 initPromise = page |                 initPromise = page | ||||||
|                   .getTextContent({ |                   .getTextContent({ | ||||||
|                     normalizeWhitespace: true, |                     normalizeWhitespace: true, | ||||||
|  |                     includeMarkedContent: true, | ||||||
|                   }) |                   }) | ||||||
|                   .then(function (textContent) { |                   .then(function (textContent) { | ||||||
|                     return rasterizeTextLayer( |                     return rasterizeTextLayer( | ||||||
|  | |||||||
| @ -24,7 +24,11 @@ async function runTests(results) { | |||||||
|   jasmine.loadConfig({ |   jasmine.loadConfig({ | ||||||
|     random: false, |     random: false, | ||||||
|     spec_dir: "integration", |     spec_dir: "integration", | ||||||
|     spec_files: ["scripting_spec.js", "annotation_spec.js"], |     spec_files: [ | ||||||
|  |       "scripting_spec.js", | ||||||
|  |       "annotation_spec.js", | ||||||
|  |       "accessibility_spec.js", | ||||||
|  |     ], | ||||||
|   }); |   }); | ||||||
| 
 | 
 | ||||||
|   jasmine.addReporter({ |   jasmine.addReporter({ | ||||||
|  | |||||||
							
								
								
									
										69
									
								
								test/integration/accessibility_spec.js
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										69
									
								
								test/integration/accessibility_spec.js
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,69 @@ | |||||||
|  | /* Copyright 2021 Mozilla Foundation | ||||||
|  |  * | ||||||
|  |  * Licensed under the Apache License, Version 2.0 (the "License"); | ||||||
|  |  * you may not use this file except in compliance with the License. | ||||||
|  |  * You may obtain a copy of the License at | ||||||
|  |  * | ||||||
|  |  *     http://www.apache.org/licenses/LICENSE-2.0
 | ||||||
|  |  * | ||||||
|  |  * Unless required by applicable law or agreed to in writing, software | ||||||
|  |  * distributed under the License is distributed on an "AS IS" BASIS, | ||||||
|  |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||||
|  |  * See the License for the specific language governing permissions and | ||||||
|  |  * limitations under the License. | ||||||
|  |  */ | ||||||
|  | 
 | ||||||
|  | const { closePages, loadAndWait } = require("./test_utils.js"); | ||||||
|  | 
 | ||||||
describe("accessibility", () => {
  describe("structure tree", () => {
    let pages;

    beforeAll(async () => {
      pages = await loadAndWait("structure_simple.pdf", ".structTree");
    });

    afterAll(async () => {
      await closePages(pages);
    });

    it("must build structure that maps to text layer", async () => {
      // Evaluated inside the page: follows an element's aria-owns reference
      // and returns the text of the element it points to. It must stay
      // self-contained because it is handed to the page for evaluation.
      const ownedText = el =>
        document.getElementById(el.getAttribute("aria-owns")).textContent;

      const expectedHeadings = [
        [1, "Heading 1"],
        [2, "Heading 2"],
      ];

      const verifyBrowser = async ([browserName, page]) => {
        await page.waitForSelector(".structTree");

        // Check the headings match up.
        for (const [level, title] of expectedHeadings) {
          const headingText = await page.$eval(
            `.structTree [role='heading'][aria-level='${level}'] span`,
            ownedText
          );
          expect(headingText).withContext(`In ${browserName}`).toEqual(title);
        }

        // Check the order of the content.
        const ownedTexts = await page.$$eval(
          ".structTree [aria-owns]",
          nodes =>
            nodes.map(
              el =>
                document.getElementById(el.getAttribute("aria-owns"))
                  .textContent
            )
        );
        expect(ownedTexts)
          .withContext(`In ${browserName}`)
          .toEqual([
            "Heading 1",
            "This paragraph 1.",
            "Heading 2",
            "This paragraph 2.",
          ]);
      };

      await Promise.all(pages.map(verifyBrowser));
    });
  });
});
							
								
								
									
										1
									
								
								test/pdfs/.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										1
									
								
								test/pdfs/.gitignore
									
									
									
									
										vendored
									
									
								
							| @ -71,6 +71,7 @@ | |||||||
| !issue8570.pdf | !issue8570.pdf | ||||||
| !issue8697.pdf | !issue8697.pdf | ||||||
| !issue8702.pdf | !issue8702.pdf | ||||||
|  | !structure_simple.pdf | ||||||
| !issue12823.pdf | !issue12823.pdf | ||||||
| !issue8707.pdf | !issue8707.pdf | ||||||
| !issue8798r.pdf | !issue8798r.pdf | ||||||
|  | |||||||
							
								
								
									
										
											BIN
										
									
								
								test/pdfs/structure_simple.pdf
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								test/pdfs/structure_simple.pdf
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							| @ -23,7 +23,7 @@ | |||||||
|   bottom: 0; |   bottom: 0; | ||||||
|   line-height: 1; |   line-height: 1; | ||||||
| } | } | ||||||
| .textLayer > span { | .textLayer span { | ||||||
|   position: absolute; |   position: absolute; | ||||||
|   white-space: pre; |   white-space: pre; | ||||||
|   -webkit-transform-origin: 0% 0%; |   -webkit-transform-origin: 0% 0%; | ||||||
| @ -37,3 +37,8 @@ | |||||||
|   -moz-box-sizing: border-box; |   -moz-box-sizing: border-box; | ||||||
|   box-sizing: border-box; |   box-sizing: border-box; | ||||||
| } | } | ||||||
|  | 
 | ||||||
|  | .textLayer .markedContent { | ||||||
|  |   border: none; | ||||||
|  |   background-color: transparent; | ||||||
|  | } | ||||||
|  | |||||||
| @ -34,6 +34,7 @@ | |||||||
|     "pdf_history_spec.js", |     "pdf_history_spec.js", | ||||||
|     "primitives_spec.js", |     "primitives_spec.js", | ||||||
|     "stream_spec.js", |     "stream_spec.js", | ||||||
|  |     "struct_tree_spec.js", | ||||||
|     "type1_parser_spec.js", |     "type1_parser_spec.js", | ||||||
|     "ui_utils_spec.js", |     "ui_utils_spec.js", | ||||||
|     "unicode_spec.js", |     "unicode_spec.js", | ||||||
|  | |||||||
| @ -80,6 +80,7 @@ async function initializePDFJS(callback) { | |||||||
|       "pdfjs-test/unit/primitives_spec.js", |       "pdfjs-test/unit/primitives_spec.js", | ||||||
|       "pdfjs-test/unit/scripting_spec.js", |       "pdfjs-test/unit/scripting_spec.js", | ||||||
|       "pdfjs-test/unit/stream_spec.js", |       "pdfjs-test/unit/stream_spec.js", | ||||||
|  |       "pdfjs-test/unit/struct_tree_spec.js", | ||||||
|       "pdfjs-test/unit/type1_parser_spec.js", |       "pdfjs-test/unit/type1_parser_spec.js", | ||||||
|       "pdfjs-test/unit/ui_utils_spec.js", |       "pdfjs-test/unit/ui_utils_spec.js", | ||||||
|       "pdfjs-test/unit/unicode_spec.js", |       "pdfjs-test/unit/unicode_spec.js", | ||||||
|  | |||||||
							
								
								
									
										108
									
								
								test/unit/struct_tree_spec.js
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										108
									
								
								test/unit/struct_tree_spec.js
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,108 @@ | |||||||
|  | /* Copyright 2021 Mozilla Foundation | ||||||
|  |  * | ||||||
|  |  * Licensed under the Apache License, Version 2.0 (the "License"); | ||||||
|  |  * you may not use this file except in compliance with the License. | ||||||
|  |  * You may obtain a copy of the License at | ||||||
|  |  * | ||||||
|  |  *     http://www.apache.org/licenses/LICENSE-2.0
 | ||||||
|  |  * | ||||||
|  |  * Unless required by applicable law or agreed to in writing, software | ||||||
|  |  * distributed under the License is distributed on an "AS IS" BASIS, | ||||||
|  |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||||
|  |  * See the License for the specific language governing permissions and | ||||||
|  |  * limitations under the License. | ||||||
|  |  */ | ||||||
|  | 
 | ||||||
|  | import { buildGetDocumentParams } from "./test_utils.js"; | ||||||
|  | import { getDocument } from "../../src/display/api.js"; | ||||||
|  | 
 | ||||||
/**
 * Recursively asserts (through `expect`) that two structure trees have the
 * same shape: matching `role`, matching `type`, matching presence of a
 * `children` property and, for every pair of children, the same recursively.
 *
 * @param {Object} rootA - Root node of the first tree.
 * @param {Object} rootB - Root node of the second tree.
 */
function equalTrees(rootA, rootB) {
  function walk(a, b) {
    expect(a.role).toEqual(b.role);
    expect(a.type).toEqual(b.type);
    expect("children" in a).toEqual("children" in b);
    if (!a.children || !b.children) {
      // Nothing to descend into; a mismatch was already reported above.
      return;
    }
    expect(a.children.length).toEqual(b.children.length);
    // Iterate over the children of the *current* node (not the root's), and
    // only over the common prefix so a length mismatch is reported by the
    // expectation above rather than by a TypeError on a missing child.
    const count = Math.min(a.children.length, b.children.length);
    for (let i = 0; i < count; i++) {
      walk(a.children[i], b.children[i]);
    }
  }
  return walk(rootA, rootB);
}
|  | 
 | ||||||
describe("struct tree", function () {
  describe("getStructTree", function () {
    // Expected leaf node: only its `type` is compared by `equalTrees`.
    const contentLeaf = () => ({ type: "content" });

    it("parses basic structure", async function () {
      const loadingTask = getDocument(
        buildGetDocumentParams("structure_simple.pdf")
      );
      const pdfDocument = await loadingTask.promise;
      const firstPage = await pdfDocument.getPage(1);
      const actualTree = await firstPage.getStructTree();

      // In this file every heading/paragraph wraps its content in a single
      // NonStruct element.
      const block = role => ({
        role,
        children: [{ role: "NonStruct", children: [contentLeaf()] }],
      });
      const expectedTree = {
        role: "Root",
        children: [
          {
            role: "Document",
            children: ["H1", "P", "H2", "P"].map(block),
          },
        ],
      };

      equalTrees(expectedTree, actualTree);
      await loadingTask.destroy();
    });

    it("parses structure with marked content reference", async function () {
      const loadingTask = getDocument(buildGetDocumentParams("issue6782.pdf"));
      const pdfDocument = await loadingTask.promise;
      const firstPage = await pdfDocument.getPage(1);
      const actualTree = await firstPage.getStructTree();

      const paragraph = {
        role: "P",
        children: Array.from({ length: 27 }, contentLeaf),
      };
      const expectedTree = {
        role: "Root",
        children: [{ role: "Part", children: [paragraph] }],
      };

      equalTrees(expectedTree, actualTree);
      await loadingTask.destroy();
    });
  });
});
| @ -41,6 +41,7 @@ import { AnnotationLayerBuilder } from "./annotation_layer_builder.js"; | |||||||
| import { NullL10n } from "./l10n_utils.js"; | import { NullL10n } from "./l10n_utils.js"; | ||||||
| import { PDFPageView } from "./pdf_page_view.js"; | import { PDFPageView } from "./pdf_page_view.js"; | ||||||
| import { SimpleLinkService } from "./pdf_link_service.js"; | import { SimpleLinkService } from "./pdf_link_service.js"; | ||||||
|  | import { StructTreeLayerBuilder } from "./struct_tree_layer_builder.js"; | ||||||
| import { TextLayerBuilder } from "./text_layer_builder.js"; | import { TextLayerBuilder } from "./text_layer_builder.js"; | ||||||
| import { XfaLayerBuilder } from "./xfa_layer_builder.js"; | import { XfaLayerBuilder } from "./xfa_layer_builder.js"; | ||||||
| 
 | 
 | ||||||
| @ -545,6 +546,7 @@ class BaseViewer { | |||||||
|             textLayerMode: this.textLayerMode, |             textLayerMode: this.textLayerMode, | ||||||
|             annotationLayerFactory: this, |             annotationLayerFactory: this, | ||||||
|             xfaLayerFactory, |             xfaLayerFactory, | ||||||
|  |             structTreeLayerFactory: this, | ||||||
|             imageResourcesPath: this.imageResourcesPath, |             imageResourcesPath: this.imageResourcesPath, | ||||||
|             renderInteractiveForms: this.renderInteractiveForms, |             renderInteractiveForms: this.renderInteractiveForms, | ||||||
|             renderer: this.renderer, |             renderer: this.renderer, | ||||||
| @ -1328,6 +1330,16 @@ class BaseViewer { | |||||||
|     }); |     }); | ||||||
|   } |   } | ||||||
| 
 | 
 | ||||||
|  |   /** | ||||||
|  |    * @param {PDFPage} pdfPage | ||||||
|  |    * @returns {StructTreeLayerBuilder} | ||||||
|  |    */ | ||||||
|  |   createStructTreeLayerBuilder(pdfPage) { | ||||||
|  |     return new StructTreeLayerBuilder({ | ||||||
|  |       pdfPage, | ||||||
|  |     }); | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|   /** |   /** | ||||||
|    * @type {boolean} Whether all pages of the PDF document have identical |    * @type {boolean} Whether all pages of the PDF document have identical | ||||||
|    *   widths and heights. |    *   widths and heights. | ||||||
|  | |||||||
| @ -216,6 +216,17 @@ class IPDFXfaLayerFactory { | |||||||
|   createXfaLayerBuilder(pageDiv, pdfPage) {} |   createXfaLayerBuilder(pageDiv, pdfPage) {} | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | /** | ||||||
|  |  * Contract for objects that can create a structure tree layer builder | ||||||
|  |  * for a page. | ||||||
|  |  * | ||||||
|  |  * @interface | ||||||
|  |  */ | ||||||
|  | class IPDFStructTreeLayerFactory { | ||||||
|  |   /** | ||||||
|  |    * Creates the builder for the structure tree layer of `pdfPage`. | ||||||
|  |    * | ||||||
|  |    * @param {PDFPage} pdfPage | ||||||
|  |    * @returns {StructTreeLayerBuilder} | ||||||
|  |    */ | ||||||
|  |   createStructTreeLayerBuilder(pdfPage) {} | ||||||
|  | } | ||||||
|  | 
 | ||||||
| /** | /** | ||||||
|  * @interface |  * @interface | ||||||
|  */ |  */ | ||||||
| @ -254,6 +265,7 @@ export { | |||||||
|   IPDFAnnotationLayerFactory, |   IPDFAnnotationLayerFactory, | ||||||
|   IPDFHistory, |   IPDFHistory, | ||||||
|   IPDFLinkService, |   IPDFLinkService, | ||||||
|  |   IPDFStructTreeLayerFactory, | ||||||
|   IPDFTextLayerFactory, |   IPDFTextLayerFactory, | ||||||
|   IPDFXfaLayerFactory, |   IPDFXfaLayerFactory, | ||||||
|   IRenderableView, |   IRenderableView, | ||||||
|  | |||||||
| @ -49,6 +49,7 @@ import { viewerCompatibilityParams } from "./viewer_compatibility.js"; | |||||||
|  *   The default value is `TextLayerMode.ENABLE`. |  *   The default value is `TextLayerMode.ENABLE`. | ||||||
|  * @property {IPDFAnnotationLayerFactory} annotationLayerFactory |  * @property {IPDFAnnotationLayerFactory} annotationLayerFactory | ||||||
|  * @property {IPDFXfaLayerFactory} xfaLayerFactory |  * @property {IPDFXfaLayerFactory} xfaLayerFactory | ||||||
|  |  * @property {IPDFStructTreeLayerFactory} structTreeLayerFactory | ||||||
|  * @property {string} [imageResourcesPath] - Path for image resources, mainly |  * @property {string} [imageResourcesPath] - Path for image resources, mainly | ||||||
|  *   for annotation icons. Include trailing slash. |  *   for annotation icons. Include trailing slash. | ||||||
|  * @property {boolean} renderInteractiveForms - Turns on rendering of |  * @property {boolean} renderInteractiveForms - Turns on rendering of | ||||||
| @ -102,6 +103,7 @@ class PDFPageView { | |||||||
|     this.textLayerFactory = options.textLayerFactory; |     this.textLayerFactory = options.textLayerFactory; | ||||||
|     this.annotationLayerFactory = options.annotationLayerFactory; |     this.annotationLayerFactory = options.annotationLayerFactory; | ||||||
|     this.xfaLayerFactory = options.xfaLayerFactory; |     this.xfaLayerFactory = options.xfaLayerFactory; | ||||||
|  |     this.structTreeLayerFactory = options.structTreeLayerFactory; | ||||||
|     this.renderer = options.renderer || RendererType.CANVAS; |     this.renderer = options.renderer || RendererType.CANVAS; | ||||||
|     this.enableWebGL = options.enableWebGL || false; |     this.enableWebGL = options.enableWebGL || false; | ||||||
|     this.l10n = options.l10n || NullL10n; |     this.l10n = options.l10n || NullL10n; | ||||||
| @ -116,6 +118,7 @@ class PDFPageView { | |||||||
|     this.textLayer = null; |     this.textLayer = null; | ||||||
|     this.zoomLayer = null; |     this.zoomLayer = null; | ||||||
|     this.xfaLayer = null; |     this.xfaLayer = null; | ||||||
|  |     this.structTreeLayer = null; | ||||||
| 
 | 
 | ||||||
|     const div = document.createElement("div"); |     const div = document.createElement("div"); | ||||||
|     div.className = "page"; |     div.className = "page"; | ||||||
| @ -354,6 +357,10 @@ class PDFPageView { | |||||||
|       this.annotationLayer.cancel(); |       this.annotationLayer.cancel(); | ||||||
|       this.annotationLayer = null; |       this.annotationLayer = null; | ||||||
|     } |     } | ||||||
|  |     if (this._onTextLayerRendered) { | ||||||
|  |       this.eventBus._off("textlayerrendered", this._onTextLayerRendered); | ||||||
|  |       this._onTextLayerRendered = null; | ||||||
|  |     } | ||||||
|   } |   } | ||||||
| 
 | 
 | ||||||
|   cssTransform(target, redrawAnnotations = false) { |   cssTransform(target, redrawAnnotations = false) { | ||||||
| @ -556,11 +563,12 @@ class PDFPageView { | |||||||
|     this.paintTask = paintTask; |     this.paintTask = paintTask; | ||||||
| 
 | 
 | ||||||
|     const resultPromise = paintTask.promise.then( |     const resultPromise = paintTask.promise.then( | ||||||
|       function () { |       () => { | ||||||
|         return finishPaintTask(null).then(function () { |         return finishPaintTask(null).then(() => { | ||||||
|           if (textLayer) { |           if (textLayer) { | ||||||
|             const readableStream = pdfPage.streamTextContent({ |             const readableStream = pdfPage.streamTextContent({ | ||||||
|               normalizeWhitespace: true, |               normalizeWhitespace: true, | ||||||
|  |               includeMarkedContent: true, | ||||||
|             }); |             }); | ||||||
|             textLayer.setTextContentStream(readableStream); |             textLayer.setTextContentStream(readableStream); | ||||||
|             textLayer.render(); |             textLayer.render(); | ||||||
| @ -599,6 +607,29 @@ class PDFPageView { | |||||||
|       this._renderXfaLayer(); |       this._renderXfaLayer(); | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|  |     // The structure tree is currently only supported when the text layer is
 | ||||||
|  |     // enabled and a canvas is used for rendering.
 | ||||||
|  |     if (this.structTreeLayerFactory && this.textLayer && this.canvas) { | ||||||
|  |       // The structure tree must be generated after the text layer for the
 | ||||||
|  |       // aria-owns to work.
 | ||||||
|  |       this._onTextLayerRendered = event => { | ||||||
|  |         if (event.pageNumber !== this.id) { | ||||||
|  |           return; | ||||||
|  |         } | ||||||
|  |         this.eventBus._off("textlayerrendered", this._onTextLayerRendered); | ||||||
|  |         this._onTextLayerRendered = null; | ||||||
|  |         this.pdfPage.getStructTree().then(tree => { | ||||||
|  |           const treeDom = this.structTreeLayer.render(tree); | ||||||
|  |           treeDom.classList.add("structTree"); | ||||||
|  |           this.canvas.appendChild(treeDom); | ||||||
|  |         }); | ||||||
|  |       }; | ||||||
|  |       this.eventBus._on("textlayerrendered", this._onTextLayerRendered); | ||||||
|  |       this.structTreeLayer = this.structTreeLayerFactory.createStructTreeLayerBuilder( | ||||||
|  |         pdfPage | ||||||
|  |       ); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|     div.setAttribute("data-loaded", true); |     div.setAttribute("data-loaded", true); | ||||||
| 
 | 
 | ||||||
|     this.eventBus.dispatch("pagerender", { |     this.eventBus.dispatch("pagerender", { | ||||||
|  | |||||||
							
								
								
									
										149
									
								
								web/struct_tree_layer_builder.js
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										149
									
								
								web/struct_tree_layer_builder.js
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,149 @@ | |||||||
|  | /* Copyright 2021 Mozilla Foundation | ||||||
|  |  * | ||||||
|  |  * Licensed under the Apache License, Version 2.0 (the "License"); | ||||||
|  |  * you may not use this file except in compliance with the License. | ||||||
|  |  * You may obtain a copy of the License at | ||||||
|  |  * | ||||||
|  |  *     http://www.apache.org/licenses/LICENSE-2.0
 | ||||||
|  |  * | ||||||
|  |  * Unless required by applicable law or agreed to in writing, software | ||||||
|  |  * distributed under the License is distributed on an "AS IS" BASIS, | ||||||
|  |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||||
|  |  * See the License for the specific language governing permissions and | ||||||
|  |  * limitations under the License. | ||||||
|  |  */ | ||||||
|  | 
 | ||||||
// Maps PDF standard structure types to the ARIA role written on the generated
// element. A `null` entry means that no `role` attribute is set.
const PDF_ROLE_TO_HTML_ROLE = {
  // Document level structure types
  Document: null, // There's a "document" role, but it doesn't make sense here.
  DocumentFragment: null,
  // Grouping level structure types
  Part: "group",
  Sect: "group", // XXX: There's a "section" role, but it's abstract.
  Div: "group",
  Aside: "note",
  NonStruct: "none",
  // Block level structure types
  P: null,
  // H<n>,
  H: "heading",
  Title: null,
  FENote: "note",
  // Sub-block level structure type
  Sub: "group",
  // General inline level structure types
  Lbl: null,
  Span: null,
  Em: null,
  Strong: null,
  Link: "link",
  Annot: "note",
  Form: "form",
  // Ruby and Warichu structure types
  Ruby: null,
  RB: null,
  RT: null,
  RP: null,
  Warichu: null,
  WT: null,
  WP: null,
  // List standard structure types
  L: "list",
  LI: "listitem",
  LBody: null,
  // Table standard structure types
  Table: "table",
  TR: "row",
  TH: "columnheader",
  TD: "cell",
  THead: "columnheader",
  TBody: null,
  TFoot: null,
  // Standard structure type Caption
  Caption: null,
  // Standard structure type Figure
  Figure: "figure",
  // Standard structure type Formula
  Formula: null,
  // standard structure type Artifact
  Artifact: null,
};

// Matches numbered heading roles ("H1", "H2", ...) and captures the level.
const HEADING_PATTERN = /^H(\d+)$/;

/**
 * @typedef {Object} StructTreeLayerBuilderOptions
 * @property {PDFPage} pdfPage
 */

/**
 * Converts a structure tree into a tree of `<span>` elements carrying ARIA
 * attributes (`role`, `aria-level`, `aria-label`, `aria-owns`).
 */
class StructTreeLayerBuilder {
  /**
   * @param {StructTreeLayerBuilderOptions} options
   */
  constructor({ pdfPage }) {
    this.pdfPage = pdfPage;
  }

  /**
   * @param {Object|null} structTree - Root node of the structure tree.
   * @returns {HTMLSpanElement|null} The root of the generated DOM tree, or
   *   `null` when `structTree` is missing.
   */
  render(structTree) {
    return this._walk(structTree);
  }

  /**
   * Copies the `alt` and `id` of a structure node, when defined, onto the
   * element as `aria-label` and `aria-owns` respectively.
   */
  _setAttributes(structElement, htmlElement) {
    if (structElement.alt !== undefined) {
      htmlElement.setAttribute("aria-label", structElement.alt);
    }
    if (structElement.id !== undefined) {
      htmlElement.setAttribute("aria-owns", structElement.id);
    }
  }

  /**
   * Builds the element for `node` and, recursively, for its children.
   *
   * @returns {HTMLSpanElement|null} `null` when `node` is missing.
   */
  _walk(node) {
    if (!node) {
      return null;
    }

    const element = document.createElement("span");
    if ("role" in node) {
      const { role } = node;
      const match = role.match(HEADING_PATTERN);
      if (match) {
        element.setAttribute("role", "heading");
        element.setAttribute("aria-level", match[1]);
      } else if (
        // Only consult the table's own entries, so that a role named like an
        // inherited `Object.prototype` member (e.g. "constructor") does not
        // end up as a bogus `role` attribute.
        Object.prototype.hasOwnProperty.call(PDF_ROLE_TO_HTML_ROLE, role) &&
        PDF_ROLE_TO_HTML_ROLE[role]
      ) {
        element.setAttribute("role", PDF_ROLE_TO_HTML_ROLE[role]);
      }
    }

    this._setAttributes(node, element);

    const { children } = node;
    if (children) {
      if (children.length === 1 && children[0] && "id" in children[0]) {
        // Often there is only one content node so just set the values on the
        // parent node to avoid creating an extra span.
        this._setAttributes(children[0], element);
      } else {
        for (const kid of children) {
          const kidElement = this._walk(kid);
          // `_walk` yields `null` for a missing child; `appendChild(null)`
          // would throw, so such entries are skipped.
          if (kidElement) {
            element.appendChild(kidElement);
          }
        }
      }
    }
    return element;
  }
}
|  | 
 | ||||||
/**
 * Default factory producing `StructTreeLayerBuilder` instances.
 *
 * @implements IPDFStructTreeLayerFactory
 */
class DefaultStructTreeLayerFactory {
  /**
   * Creates the structure tree layer builder for the given page.
   *
   * @param {PDFPage} pdfPage
   * @returns {StructTreeLayerBuilder}
   */
  createStructTreeLayerBuilder(pdfPage) {
    const builderOptions = { pdfPage };
    return new StructTreeLayerBuilder(builderOptions);
  }
}
|  | 
 | ||||||
|  | export { DefaultStructTreeLayerFactory, StructTreeLayerBuilder }; | ||||||
| @ -24,7 +24,7 @@ | |||||||
|   line-height: 1; |   line-height: 1; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| .textLayer > span { | .textLayer span { | ||||||
|   color: transparent; |   color: transparent; | ||||||
|   position: absolute; |   position: absolute; | ||||||
|   white-space: pre; |   white-space: pre; | ||||||
|  | |||||||
| @ -175,7 +175,7 @@ select { | |||||||
|   display: none !important; |   display: none !important; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| .pdfViewer.enablePermissions .textLayer > span { | .pdfViewer.enablePermissions .textLayer span { | ||||||
|   user-select: none !important; |   user-select: none !important; | ||||||
|   cursor: not-allowed; |   cursor: not-allowed; | ||||||
| } | } | ||||||
| @ -195,12 +195,12 @@ select { | |||||||
|   display: none; |   display: none; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| .pdfPresentationMode:fullscreen .textLayer > span { | .pdfPresentationMode:fullscreen .textLayer span { | ||||||
|   cursor: none; |   cursor: none; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| .pdfPresentationMode.pdfPresentationModeControls > *, | .pdfPresentationMode.pdfPresentationModeControls > *, | ||||||
| .pdfPresentationMode.pdfPresentationModeControls .textLayer > span { | .pdfPresentationMode.pdfPresentationModeControls .textLayer span { | ||||||
|   cursor: default; |   cursor: default; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| @ -1653,19 +1653,19 @@ html[dir="rtl"] #documentPropertiesOverlay .row > * { | |||||||
|   mix-blend-mode: screen; |   mix-blend-mode: screen; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| #viewer.textLayer-visible .textLayer > span { | #viewer.textLayer-visible .textLayer span { | ||||||
|   background-color: rgba(255, 255, 0, 0.1); |   background-color: rgba(255, 255, 0, 0.1); | ||||||
|   color: rgba(0, 0, 0, 1); |   color: rgba(0, 0, 0, 1); | ||||||
|   border: solid 1px rgba(255, 0, 0, 0.5); |   border: solid 1px rgba(255, 0, 0, 0.5); | ||||||
|   box-sizing: border-box; |   box-sizing: border-box; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| #viewer.textLayer-hover .textLayer > span:hover { | #viewer.textLayer-hover .textLayer span:hover { | ||||||
|   background-color: rgba(255, 255, 255, 1); |   background-color: rgba(255, 255, 255, 1); | ||||||
|   color: rgba(0, 0, 0, 1); |   color: rgba(0, 0, 0, 1); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| #viewer.textLayer-shadow .textLayer > span { | #viewer.textLayer-shadow .textLayer span { | ||||||
|   background-color: rgba(255, 255, 255, 0.6); |   background-color: rgba(255, 255, 255, 0.6); | ||||||
|   color: rgba(0, 0, 0, 1); |   color: rgba(0, 0, 0, 1); | ||||||
| } | } | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user