diff --git a/src/core/document.js b/src/core/document.js index ff5c76cb8..456b92577 100644 --- a/src/core/document.js +++ b/src/core/document.js @@ -58,6 +58,7 @@ import { calculateMD5 } from "./crypto.js"; import { Linearization } from "./parser.js"; import { OperatorList } from "./operator_list.js"; import { PartialEvaluator } from "./evaluator.js"; +import { StructTreePage } from "./struct_tree.js"; import { XFAFactory } from "./xfa/factory.js"; const DEFAULT_USER_UNIT = 1.0; @@ -104,6 +105,10 @@ class Page { static createObjId() { return `p${pageIndex}_${++idCounters.obj}`; } + + static getPageObjId() { + return `page${ref.toString()}`; + } }; } @@ -406,6 +411,7 @@ class Page { handler, task, normalizeWhitespace, + includeMarkedContent, sink, combineTextItems, }) { @@ -437,12 +443,22 @@ class Page { task, resources: this.resources, normalizeWhitespace, + includeMarkedContent, combineTextItems, sink, }); }); } + async getStructTree() { + const structTreeRoot = await this.pdfManager.ensureCatalog( + "structTreeRoot" + ); + const tree = new StructTreePage(structTreeRoot, this.pageDict); + tree.parse(); + return tree; + } + getAnnotationsData(intent) { return this._parsedAnnotations.then(function (annotations) { const annotationsData = []; @@ -604,6 +620,10 @@ class PDFDocument { static createObjId() { unreachable("Abstract method `createObjId` called."); } + + static getPageObjId() { + unreachable("Abstract method `getPageObjId` called."); + } }; } diff --git a/src/core/evaluator.js b/src/core/evaluator.js index aa7360546..3c49f9783 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -1913,7 +1913,10 @@ class PartialEvaluator { return; } // Other marked content types aren't supported yet. - args = [args[0].name]; + args = [ + args[0].name, + args[1] instanceof Dict ? args[1].get("MCID") : null, + ]; break; case OPS.beginMarkedContent: @@ -1973,6 +1976,7 @@ class PartialEvaluator { stateManager = null, normalizeWhitespace = false, combineTextItems = false, + includeMarkedContent = false, sink, seenStyles = new Set(), }) { @@ -2573,6 +2577,7 @@ class PartialEvaluator { stateManager: xObjStateManager, normalizeWhitespace, combineTextItems, + includeMarkedContent, sink: sinkWrapper, seenStyles, }) @@ -2650,6 +2655,38 @@ class PartialEvaluator { }) ); return; + case OPS.beginMarkedContent: + if (includeMarkedContent) { + textContent.items.push({ + type: "beginMarkedContent", + tag: isName(args[0]) ? args[0].name : null, + }); + } + break; + case OPS.beginMarkedContentProps: + if (includeMarkedContent) { + flushTextContentItem(); + let mcid = null; + if (isDict(args[1])) { + mcid = args[1].get("MCID"); + } + textContent.items.push({ + type: "beginMarkedContentProps", + id: Number.isInteger(mcid) + ? `${self.idFactory.getPageObjId()}_mcid${mcid}` + : null, + tag: isName(args[0]) ? args[0].name : null, + }); + } + break; + case OPS.endMarkedContent: + if (includeMarkedContent) { + flushTextContentItem(); + textContent.items.push({ + type: "endMarkedContent", + }); + } + break; } // switch if (textContent.items.length >= sink.desiredSize) { // Wait for ready, if we reach highWaterMark. diff --git a/src/core/obj.js b/src/core/obj.js index 9456e85ad..717404fcb 100644 --- a/src/core/obj.js +++ b/src/core/obj.js @@ -60,6 +60,7 @@ import { CipherTransformFactory } from "./crypto.js"; import { ColorSpace } from "./colorspace.js"; import { GlobalImageCache } from "./image_utils.js"; import { MetadataParser } from "./metadata_parser.js"; +import { StructTreeRoot } from "./struct_tree.js"; function fetchDestination(dest) { return isDict(dest) ? dest.get("D") : dest; @@ -200,6 +201,32 @@ class Catalog { return markInfo; } + get structTreeRoot() { + let structTree = null; + try { + structTree = this._readStructTreeRoot(); + } catch (ex) { + if (ex instanceof MissingDataException) { + throw ex; + } + warn("Unable read to structTreeRoot info."); + } + return shadow(this, "structTreeRoot", structTree); + } + + /** + * @private + */ + _readStructTreeRoot() { + const obj = this._catDict.get("StructTreeRoot"); + if (!isDict(obj)) { + return null; + } + const root = new StructTreeRoot(obj); + root.init(); + return root; + } + get toplevelPagesDict() { const pagesObj = this._catDict.get("Pages"); if (!isDict(pagesObj)) { @@ -2626,4 +2653,4 @@ const ObjectLoader = (function () { return ObjectLoader; })(); -export { Catalog, FileSpec, ObjectLoader, XRef }; +export { Catalog, FileSpec, NumberTree, ObjectLoader, XRef }; diff --git a/src/core/struct_tree.js b/src/core/struct_tree.js new file mode 100644 index 000000000..41587d45c --- /dev/null +++ b/src/core/struct_tree.js @@ -0,0 +1,335 @@ +/* Copyright 2021 Mozilla Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import { isDict, isName, isRef } from "./primitives.js"; +import { isString, stringToPDFString, warn } from "../shared/util.js"; +import { NumberTree } from "./obj.js"; + +const MAX_DEPTH = 40; + +const StructElementType = { + PAGE_CONTENT: "PAGE_CONTENT", + STREAM_CONTENT: "STREAM_CONTENT", + OBJECT: "OBJECT", + ELEMENT: "ELEMENT", +}; + +class StructTreeRoot { + constructor(rootDict) { + this.dict = rootDict; + this.roleMap = new Map(); + } + + init() { + this.readRoleMap(); + } + + readRoleMap() { + const roleMapDict = this.dict.get("RoleMap"); + if (!isDict(roleMapDict)) { + return; + } + roleMapDict.forEach((key, value) => { + if (!isName(value)) { + return; + } + this.roleMap.set(key, value.name); + }); + } +} + +/** + * Instead of loading the whole tree we load just the page's relevant structure + * elements, which means we need a wrapper structure to represent the tree. + */ +class StructElementNode { + constructor(tree, dict) { + this.tree = tree; + this.dict = dict; + this.kids = []; + this.parseKids(); + } + + get role() { + const nameObj = this.dict.get("S"); + const name = isName(nameObj) ? nameObj.name : ""; + const { root } = this.tree; + if (root.roleMap.has(name)) { + return root.roleMap.get(name); + } + return name; + } + + parseKids() { + let pageObjId = null; + const objRef = this.dict.getRaw("Pg"); + if (isRef(objRef)) { + pageObjId = objRef.toString(); + } + const kids = this.dict.get("K"); + if (Array.isArray(kids)) { + for (const kid of kids) { + const element = this.parseKid(pageObjId, kid); + if (element) { + this.kids.push(element); + } + } + } else { + const element = this.parseKid(pageObjId, kids); + if (element) { + this.kids.push(element); + } + } + } + + parseKid(pageObjId, kid) { + // A direct link to content, the integer is an mcid. + if (Number.isInteger(kid)) { + if (this.tree.pageDict.objId !== pageObjId) { + return null; + } + + return new StructElement({ + type: StructElementType.PAGE_CONTENT, + mcid: kid, + pageObjId, + }); + } + + // Find the dictionary for the kid. + let kidDict = null; + if (isRef(kid)) { + kidDict = this.dict.xref.fetch(kid); + } else if (isDict(kid)) { + kidDict = kid; + } + if (!kidDict) { + return null; + } + const pageRef = kidDict.getRaw("Pg"); + if (isRef(pageRef)) { + pageObjId = pageRef.toString(); + } + + const type = isName(kidDict.get("Type")) ? kidDict.get("Type").name : null; + if (type === "MCR") { + if (this.tree.pageDict.objId !== pageObjId) { + return null; + } + return new StructElement({ + type: StructElementType.STREAM_CONTENT, + refObjId: isRef(kidDict.getRaw("Stm")) + ? kidDict.getRaw("Stm").toString() + : null, + pageObjId, + mcid: kidDict.get("MCID"), + }); + } + + if (type === "OBJR") { + if (this.tree.pageDict.objId !== pageObjId) { + return null; + } + return new StructElement({ + type: StructElementType.OBJECT, + refObjId: isRef(kidDict.getRaw("Obj")) + ? kidDict.getRaw("Obj").toString() + : null, + pageObjId, + }); + } + + return new StructElement({ + type: StructElementType.ELEMENT, + dict: kidDict, + }); + } +} + +class StructElement { + constructor({ + type, + dict = null, + mcid = null, + pageObjId = null, + refObjId = null, + }) { + this.type = type; + this.dict = dict; + this.mcid = mcid; + this.pageObjId = pageObjId; + this.refObjId = refObjId; + this.parentNode = null; + } +} + +class StructTreePage { + constructor(structTreeRoot, pageDict) { + this.root = structTreeRoot; + this.rootDict = structTreeRoot ? structTreeRoot.dict : null; + this.pageDict = pageDict; + this.nodes = []; + } + + parse() { + if (!this.root || !this.rootDict) { + return; + } + + const parentTree = this.rootDict.get("ParentTree"); + if (!parentTree) { + return; + } + const id = this.pageDict.get("StructParents"); + if (!Number.isInteger(id)) { + return; + } + const numberTree = new NumberTree(parentTree, this.rootDict.xref); + const parentArray = numberTree.get(id); + if (!Array.isArray(parentArray)) { + return; + } + const map = new Map(); + for (const ref of parentArray) { + if (isRef(ref)) { + this.addNode(this.rootDict.xref.fetch(ref), map); + } + } + } + + addNode(dict, map, level = 0) { + if (level > MAX_DEPTH) { + warn("StructTree MAX_DEPTH reached."); + return null; + } + + if (map.has(dict)) { + return map.get(dict); + } + + const element = new StructElementNode(this, dict); + map.set(dict, element); + + const parent = dict.get("P"); + + if (!parent || isName(parent.get("Type"), "StructTreeRoot")) { + if (!this.addTopLevelNode(dict, element)) { + map.delete(dict); + } + return element; + } + + const parentNode = this.addNode(parent, map, level + 1); + if (!parentNode) { + return element; + } + let save = false; + for (const kid of parentNode.kids) { + if (kid.type === StructElementType.ELEMENT && kid.dict === dict) { + kid.parentNode = element; + save = true; + } + } + if (!save) { + map.delete(dict); + } + return element; + } + + addTopLevelNode(dict, element) { + const obj = this.rootDict.get("K"); + if (!obj) { + return false; + } + + if (isDict(obj)) { + if (obj.objId !== dict.objId) { + return false; + } + this.nodes[0] = element; + return true; + } + + if (!Array.isArray(obj)) { + return true; + } + let save = false; + for (let i = 0; i < obj.length; i++) { + const kidRef = obj[i]; + if (kidRef && kidRef.toString() === dict.objId) { + this.nodes[i] = element; + save = true; + } + } + return save; + } + + /** + * Convert the tree structure into a simplifed object literal that can + * be sent to the main thread. + * @returns {Object} + */ + get serializable() { + function nodeToSerializable(node, parent, level = 0) { + if (level > MAX_DEPTH) { + warn("StructTree too deep to be fully serialized."); + return; + } + const obj = Object.create(null); + obj.role = node.role; + obj.children = []; + parent.children.push(obj); + const alt = node.dict.get("Alt"); + if (isString(alt)) { + obj.alt = stringToPDFString(alt); + } + + for (const kid of node.kids) { + const kidElement = + kid.type === StructElementType.ELEMENT ? kid.parentNode : null; + if (kidElement) { + nodeToSerializable(kidElement, obj, level + 1); + continue; + } else if ( + kid.type === StructElementType.PAGE_CONTENT || + kid.type === StructElementType.STREAM_CONTENT + ) { + obj.children.push({ + type: "content", + id: `page${kid.pageObjId}_mcid${kid.mcid}`, + }); + } else if (kid.type === StructElementType.OBJECT) { + obj.children.push({ + type: "object", + id: kid.refObjId, + }); + } + } + } + + const root = Object.create(null); + root.children = []; + root.role = "Root"; + for (const child of this.nodes) { + if (!child) { + continue; + } + nodeToSerializable(child, root); + } + return root; + } +} + +export { StructTreePage, StructTreeRoot }; diff --git a/src/core/worker.js b/src/core/worker.js index 8f2a23afd..2011deb4c 100644 --- a/src/core/worker.js +++ b/src/core/worker.js @@ -717,6 +717,7 @@ class WorkerMessageHandler { task, sink, normalizeWhitespace: data.normalizeWhitespace, + includeMarkedContent: data.includeMarkedContent, combineTextItems: data.combineTextItems, }) .then( @@ -745,6 +746,18 @@ class WorkerMessageHandler { }); }); + handler.on("GetStructTree", function wphGetStructTree(data) { + const pageIndex = data.pageIndex; + return pdfManager + .getPage(pageIndex) + .then(function (page) { + return pdfManager.ensure(page, "getStructTree"); + }) + .then(function (structTree) { + return structTree.serializable; + }); + }); + handler.on("FontFallback", function (data) { return pdfManager.fontFallback(data.id, handler); }); diff --git a/src/display/api.js b/src/display/api.js index d20931cd3..2712ae616 100644 --- a/src/display/api.js +++ b/src/display/api.js @@ -1013,13 +1013,17 @@ class PDFDocumentProxy { * whitespace with standard spaces (0x20). The default value is `false`. * @property {boolean} disableCombineTextItems - Do not attempt to combine * same line {@link TextItem}'s. The default value is `false`. + * @property {boolean} [includeMarkedContent] - When true include marked + * content items in the items array of TextContent. The default is `false`. */ /** * Page text content. * * @typedef {Object} TextContent - * @property {Array} items - Array of {@link TextItem} objects. + * @property {Array} items - Array of + * {@link TextItem} and {@link TextMarkedContent} objects. TextMarkedContent + * items are included when includeMarkedContent is true. * @property {Object} styles - {@link TextStyle} objects, * indexed by font name. */ @@ -1034,6 +1038,17 @@ class PDFDocumentProxy { * @property {number} width - Width in device space. * @property {number} height - Height in device space. * @property {string} fontName - Font name used by PDF.js for converted font. + * + */ + +/** + * Page text marked content part. + * + * @typedef {Object} TextMarkedContent + * @property {string} type - Either 'beginMarkedContent', + * 'beginMarkedContentProps', or 'endMarkedContent'. + * @property {string} id - The marked content identifier. Only used for type + * 'beginMarkedContentProps'. */ /** @@ -1089,6 +1104,25 @@ class PDFDocumentProxy { * states set. */ +/** + * Structure tree node. The root node will have a role "Root". + * + * @typedef {Object} StructTreeNode + * @property {Array} children - Array of + * {@link StructTreeNode} and {@link StructTreeContent} objects. + * @property {string} role - element's role, already mapped if a role map exists + * in the PDF. + */ + +/** + * Structure tree content. + * + * @typedef {Object} StructTreeContent + * @property {string} type - either "content" for page and stream structure + * elements or "object" for object references. + * @property {string} id - unique id that will map to the text layer. + */ + /** * PDF page operator list. * @@ -1408,6 +1442,7 @@ class PDFPageProxy { streamTextContent({ normalizeWhitespace = false, disableCombineTextItems = false, + includeMarkedContent = false, } = {}) { const TEXT_CONTENT_CHUNK_SIZE = 100; @@ -1417,6 +1452,7 @@ class PDFPageProxy { pageIndex: this._pageIndex, normalizeWhitespace: normalizeWhitespace === true, combineTextItems: disableCombineTextItems !== true, + includeMarkedContent: includeMarkedContent === true, }, { highWaterMark: TEXT_CONTENT_CHUNK_SIZE, @@ -1457,6 +1493,16 @@ class PDFPageProxy { }); } + /** + * @returns {Promise} A promise that is resolved with a + * {@link StructTreeNode} object that represents the page's structure tree. + */ + getStructTree() { + return (this._structTreePromise ||= this._transport.getStructTree( + this._pageIndex + )); + } + /** * Destroys the page object. * @private @@ -1486,6 +1532,7 @@ class PDFPageProxy { this._annotationsPromise = null; this._jsActionsPromise = null; this._xfaPromise = null; + this._structTreePromise = null; this.pendingCleanup = false; return Promise.all(waitOn); } @@ -1521,6 +1568,7 @@ class PDFPageProxy { this._annotationsPromise = null; this._jsActionsPromise = null; this._xfaPromise = null; + this._structTreePromise = null; if (resetStats && this._stats) { this._stats = new StatTimer(); } @@ -2755,6 +2803,12 @@ class WorkerTransport { }); } + getStructTree(pageIndex) { + return this.messageHandler.sendWithPromise("GetStructTree", { + pageIndex, + }); + } + getOutline() { return this.messageHandler.sendWithPromise("GetOutline", null); } diff --git a/src/display/text_layer.js b/src/display/text_layer.js index 6903b1804..c19feae0c 100644 --- a/src/display/text_layer.js +++ b/src/display/text_layer.js @@ -638,6 +638,23 @@ const renderTextLayer = (function renderTextLayerClosure() { _processItems(items, styleCache) { for (let i = 0, len = items.length; i < len; i++) { + if (items[i].str === undefined) { + if ( + items[i].type === "beginMarkedContentProps" || + items[i].type === "beginMarkedContent" + ) { + const parent = this._container; + this._container = document.createElement("span"); + this._container.classList.add("markedContent"); + if (items[i].id !== null) { + this._container.setAttribute("id", `${items[i].id}`); + } + parent.appendChild(this._container); + } else if (items[i].type === "endMarkedContent") { + this._container = this._container.parentNode; + } + continue; + } this._textContentItemsStr.push(items[i].str); appendText(this, items[i], styleCache, this._layoutTextCtx); } diff --git a/test/driver.js b/test/driver.js index a83f64218..801976cb5 100644 --- a/test/driver.js +++ b/test/driver.js @@ -572,6 +572,7 @@ var Driver = (function DriverClosure() { initPromise = page .getTextContent({ normalizeWhitespace: true, + includeMarkedContent: true, }) .then(function (textContent) { return rasterizeTextLayer( diff --git a/test/integration-boot.js b/test/integration-boot.js index e7559ee45..7749c6681 100644 --- a/test/integration-boot.js +++ b/test/integration-boot.js @@ -24,7 +24,11 @@ async function runTests(results) { jasmine.loadConfig({ random: false, spec_dir: "integration", - spec_files: ["scripting_spec.js", "annotation_spec.js"], + spec_files: [ + "scripting_spec.js", + "annotation_spec.js", + "accessibility_spec.js", + ], }); jasmine.addReporter({ diff --git a/test/integration/accessibility_spec.js b/test/integration/accessibility_spec.js new file mode 100644 index 000000000..5db2f98df --- /dev/null +++ b/test/integration/accessibility_spec.js @@ -0,0 +1,69 @@ +/* Copyright 2021 Mozilla Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +const { closePages, loadAndWait } = require("./test_utils.js"); + +describe("accessibility", () => { + describe("structure tree", () => { + let pages; + + beforeAll(async () => { + pages = await loadAndWait("structure_simple.pdf", ".structTree"); + }); + + afterAll(async () => { + await closePages(pages); + }); + + it("must build structure that maps to text layer", async () => { + await Promise.all( + pages.map(async ([browserName, page]) => { + await page.waitForSelector(".structTree"); + + // Check the headings match up. + const head1 = await page.$eval( + ".structTree [role='heading'][aria-level='1'] span", + el => + document.getElementById(el.getAttribute("aria-owns")).textContent + ); + expect(head1).withContext(`In ${browserName}`).toEqual("Heading 1"); + const head2 = await page.$eval( + ".structTree [role='heading'][aria-level='2'] span", + el => + document.getElementById(el.getAttribute("aria-owns")).textContent + ); + expect(head2).withContext(`In ${browserName}`).toEqual("Heading 2"); + + // Check the order of the content. + const texts = await page.$$eval(".structTree [aria-owns]", nodes => + nodes.map( + el => + document.getElementById(el.getAttribute("aria-owns")) + .textContent + ) + ); + expect(texts) + .withContext(`In ${browserName}`) + .toEqual([ + "Heading 1", + "This paragraph 1.", + "Heading 2", + "This paragraph 2.", + ]); + }) + ); + }); + }); +}); diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index f30f9f80e..dbafc86a5 100644 --- a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -71,6 +71,7 @@ !issue8570.pdf !issue8697.pdf !issue8702.pdf +!structure_simple.pdf !issue12823.pdf !issue8707.pdf !issue8798r.pdf diff --git a/test/pdfs/structure_simple.pdf b/test/pdfs/structure_simple.pdf new file mode 100644 index 000000000..4ab57cd18 Binary files /dev/null and b/test/pdfs/structure_simple.pdf differ diff --git a/test/text_layer_test.css b/test/text_layer_test.css index 6b88f80e4..aac1afbce 100644 --- a/test/text_layer_test.css +++ b/test/text_layer_test.css @@ -23,7 +23,7 @@ bottom: 0; line-height: 1; } -.textLayer > span { +.textLayer span { position: absolute; white-space: pre; -webkit-transform-origin: 0% 0%; @@ -37,3 +37,8 @@ -moz-box-sizing: border-box; box-sizing: border-box; } + +.textLayer .markedContent { + border: none; + background-color: transparent; +} diff --git a/test/unit/clitests.json b/test/unit/clitests.json index 07a2502a5..32c2d4977 100644 --- a/test/unit/clitests.json +++ b/test/unit/clitests.json @@ -34,6 +34,7 @@ "pdf_history_spec.js", "primitives_spec.js", "stream_spec.js", + "struct_tree_spec.js", "type1_parser_spec.js", "ui_utils_spec.js", "unicode_spec.js", diff --git a/test/unit/jasmine-boot.js b/test/unit/jasmine-boot.js index 56aed3cf7..022ec220f 100644 --- a/test/unit/jasmine-boot.js +++ b/test/unit/jasmine-boot.js @@ -80,6 +80,7 @@ async function initializePDFJS(callback) { "pdfjs-test/unit/primitives_spec.js", "pdfjs-test/unit/scripting_spec.js", "pdfjs-test/unit/stream_spec.js", + "pdfjs-test/unit/struct_tree_spec.js", "pdfjs-test/unit/type1_parser_spec.js", "pdfjs-test/unit/ui_utils_spec.js", "pdfjs-test/unit/unicode_spec.js", diff --git a/test/unit/struct_tree_spec.js b/test/unit/struct_tree_spec.js new file mode 100644 index 000000000..255a5e51d --- /dev/null +++ b/test/unit/struct_tree_spec.js @@ -0,0 +1,108 @@ +/* Copyright 2021 Mozilla Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import { buildGetDocumentParams } from "./test_utils.js"; +import { getDocument } from "../../src/display/api.js"; + +function equalTrees(rootA, rootB) { + function walk(a, b) { + expect(a.role).toEqual(b.role); + expect(a.type).toEqual(b.type); + expect("children" in a).toEqual("children" in b); + if (!a.children) { + return; + } + expect(a.children.length).toEqual(b.children.length); + for (let i = 0; i < rootA.children.length; i++) { + walk(a.children[i], b.children[i]); + } + } + return walk(rootA, rootB); +} + +describe("struct tree", function () { + describe("getStructTree", function () { + it("parses basic structure", async function () { + const filename = "structure_simple.pdf"; + const params = buildGetDocumentParams(filename); + const loadingTask = getDocument(params); + const doc = await loadingTask.promise; + const page = await doc.getPage(1); + const struct = await page.getStructTree(); + equalTrees( + { + role: "Root", + children: [ + { + role: "Document", + children: [ + { + role: "H1", + children: [ + { role: "NonStruct", children: [{ type: "content" }] }, + ], + }, + { + role: "P", + children: [ + { role: "NonStruct", children: [{ type: "content" }] }, + ], + }, + { + role: "H2", + children: [ + { role: "NonStruct", children: [{ type: "content" }] }, + ], + }, + { + role: "P", + children: [ + { role: "NonStruct", children: [{ type: "content" }] }, + ], + }, + ], + }, + ], + }, + struct + ); + await loadingTask.destroy(); + }); + + it("parses structure with marked content reference", async function () { + const filename = "issue6782.pdf"; + const params = buildGetDocumentParams(filename); + const loadingTask = getDocument(params); + const doc = await loadingTask.promise; + const page = await doc.getPage(1); + const struct = await page.getStructTree(); + equalTrees( + { + role: "Root", + children: [ + { + role: "Part", + children: [ + { role: "P", children: Array(27).fill({ type: "content" }) }, + ], + }, + ], + }, + struct + ); + await loadingTask.destroy(); + }); + }); +}); diff --git a/web/base_viewer.js b/web/base_viewer.js index b8ec64c54..7361de4ab 100644 --- a/web/base_viewer.js +++ b/web/base_viewer.js @@ -41,6 +41,7 @@ import { AnnotationLayerBuilder } from "./annotation_layer_builder.js"; import { NullL10n } from "./l10n_utils.js"; import { PDFPageView } from "./pdf_page_view.js"; import { SimpleLinkService } from "./pdf_link_service.js"; +import { StructTreeLayerBuilder } from "./struct_tree_layer_builder.js"; import { TextLayerBuilder } from "./text_layer_builder.js"; import { XfaLayerBuilder } from "./xfa_layer_builder.js"; @@ -545,6 +546,7 @@ class BaseViewer { textLayerMode: this.textLayerMode, annotationLayerFactory: this, xfaLayerFactory, + structTreeLayerFactory: this, imageResourcesPath: this.imageResourcesPath, renderInteractiveForms: this.renderInteractiveForms, renderer: this.renderer, @@ -1329,6 +1331,16 @@ class BaseViewer { }); } + /** + * @param {PDFPage} pdfPage + * @returns {StructTreeLayerBuilder} + */ + createStructTreeLayerBuilder(pdfPage) { + return new StructTreeLayerBuilder({ + pdfPage, + }); + } + /** * @type {boolean} Whether all pages of the PDF document have identical * widths and heights. diff --git a/web/interfaces.js b/web/interfaces.js index adf9116c2..c8ec18327 100644 --- a/web/interfaces.js +++ b/web/interfaces.js @@ -216,6 +216,17 @@ class IPDFXfaLayerFactory { createXfaLayerBuilder(pageDiv, pdfPage) {} } +/** + * @interface + */ +class IPDFStructTreeLayerFactory { + /** + * @param {PDFPage} pdfPage + * @returns {StructTreeLayerBuilder} + */ + createStructTreeLayerBuilder(pdfPage) {} +} + /** * @interface */ @@ -254,6 +265,7 @@ export { IPDFAnnotationLayerFactory, IPDFHistory, IPDFLinkService, + IPDFStructTreeLayerFactory, IPDFTextLayerFactory, IPDFXfaLayerFactory, IRenderableView, diff --git a/web/pdf_page_view.js b/web/pdf_page_view.js index 64ec61553..6341bc75b 100644 --- a/web/pdf_page_view.js +++ b/web/pdf_page_view.js @@ -49,6 +49,7 @@ import { viewerCompatibilityParams } from "./viewer_compatibility.js"; * The default value is `TextLayerMode.ENABLE`. * @property {IPDFAnnotationLayerFactory} annotationLayerFactory * @property {IPDFXfaLayerFactory} xfaLayerFactory + * @property {IPDFStructTreeLayerFactory} structTreeLayerFactory * @property {string} [imageResourcesPath] - Path for image resources, mainly * for annotation icons. Include trailing slash. * @property {boolean} renderInteractiveForms - Turns on rendering of @@ -104,6 +105,7 @@ class PDFPageView { this.textLayerFactory = options.textLayerFactory; this.annotationLayerFactory = options.annotationLayerFactory; this.xfaLayerFactory = options.xfaLayerFactory; + this.structTreeLayerFactory = options.structTreeLayerFactory; this.renderer = options.renderer || RendererType.CANVAS; this.enableWebGL = options.enableWebGL || false; this.l10n = options.l10n || NullL10n; @@ -119,6 +121,7 @@ class PDFPageView { this.textLayer = null; this.zoomLayer = null; this.xfaLayer = null; + this.structTreeLayer = null; const div = document.createElement("div"); div.className = "page"; @@ -357,6 +360,10 @@ class PDFPageView { this.annotationLayer.cancel(); this.annotationLayer = null; } + if (this._onTextLayerRendered) { + this.eventBus._off("textlayerrendered", this._onTextLayerRendered); + this._onTextLayerRendered = null; + } } cssTransform(target, redrawAnnotations = false) { @@ -559,11 +566,12 @@ class PDFPageView { this.paintTask = paintTask; const resultPromise = paintTask.promise.then( - function () { - return finishPaintTask(null).then(function () { + () => { + return finishPaintTask(null).then(() => { if (textLayer) { const readableStream = pdfPage.streamTextContent({ normalizeWhitespace: true, + includeMarkedContent: true, }); textLayer.setTextContentStream(readableStream); textLayer.render(); @@ -602,6 +610,29 @@ class PDFPageView { this._renderXfaLayer(); } + // The structure tree is currently only supported when the text layer is + // enabled and a canvas is used for rendering. + if (this.structTreeLayerFactory && this.textLayer && this.canvas) { + // The structure tree must be generated after the text layer for the + // aria-owns to work. + this._onTextLayerRendered = event => { + if (event.pageNumber !== this.id) { + return; + } + this.eventBus._off("textlayerrendered", this._onTextLayerRendered); + this._onTextLayerRendered = null; + this.pdfPage.getStructTree().then(tree => { + const treeDom = this.structTreeLayer.render(tree); + treeDom.classList.add("structTree"); + this.canvas.appendChild(treeDom); + }); + }; + this.eventBus._on("textlayerrendered", this._onTextLayerRendered); + this.structTreeLayer = this.structTreeLayerFactory.createStructTreeLayerBuilder( + pdfPage + ); + } + div.setAttribute("data-loaded", true); this.eventBus.dispatch("pagerender", { diff --git a/web/struct_tree_layer_builder.js b/web/struct_tree_layer_builder.js new file mode 100644 index 000000000..86775d70e --- /dev/null +++ b/web/struct_tree_layer_builder.js @@ -0,0 +1,149 @@ +/* Copyright 2021 Mozilla Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +const PDF_ROLE_TO_HTML_ROLE = { + // Document level structure types + Document: null, // There's a "document" role, but it doesn't make sense here. + DocumentFragment: null, + // Grouping level structure types + Part: "group", + Sect: "group", // XXX: There's a "section" role, but it's abstract. + Div: "group", + Aside: "note", + NonStruct: "none", + // Block level structure types + P: null, + // H, + H: "heading", + Title: null, + FENote: "note", + // Sub-block level structure type + Sub: "group", + // General inline level structure types + Lbl: null, + Span: null, + Em: null, + Strong: null, + Link: "link", + Annot: "note", + Form: "form", + // Ruby and Warichu structure types + Ruby: null, + RB: null, + RT: null, + RP: null, + Warichu: null, + WT: null, + WP: null, + // List standard structure types + L: "list", + LI: "listitem", + LBody: null, + // Table standard structure types + Table: "table", + TR: "row", + TH: "columnheader", + TD: "cell", + THead: "columnheader", + TBody: null, + TFoot: null, + // Standard structure type Caption + Caption: null, + // Standard structure type Figure + Figure: "figure", + // Standard structure type Formula + Formula: null, + // standard structure type Artifact + Artifact: null, +}; + +const HEADING_PATTERN = /^H(\d+)$/; + +/** + * @typedef {Object} StructTreeLayerBuilderOptions + * @property {PDFPage} pdfPage + */ + +class StructTreeLayerBuilder { + /** + * @param {StructTreeLayerBuilderOptions} options + */ + constructor({ pdfPage }) { + this.pdfPage = pdfPage; + } + + render(structTree) { + return this._walk(structTree); + } + + _setAttributes(structElement, htmlElement) { + if (structElement.alt !== undefined) { + htmlElement.setAttribute("aria-label", structElement.alt); + } + if (structElement.id !== undefined) { + htmlElement.setAttribute("aria-owns", structElement.id); + } + } + + _walk(node) { + if (!node) { + return null; + } + + const element = document.createElement("span"); + if ("role" in node) { + const { role } = node; + const match = role.match(HEADING_PATTERN); + if (match) { + element.setAttribute("role", "heading"); + element.setAttribute("aria-level", match[1]); + } else if (PDF_ROLE_TO_HTML_ROLE[role]) { + element.setAttribute("role", PDF_ROLE_TO_HTML_ROLE[role]); + } + } + + this._setAttributes(node, element); + + if (node.children) { + if (node.children.length === 1 && "id" in node.children[0]) { + // Often there is only one content node so just set the values on the + // parent node to avoid creating an extra span. + this._setAttributes(node.children[0], element); + } else { + for (const kid of node.children) { + element.appendChild(this._walk(kid)); + } + } + } + return element; + } +} + +/** + * @implements IPDFStructTreeLayerFactory + */ +class DefaultStructTreeLayerFactory { + /** + * @param {PDFPage} pdfPage + * @returns {StructTreeLayerBuilder} + */ + createStructTreeLayerBuilder(pdfPage) { + return new StructTreeLayerBuilder({ + pdfPage, + }); + } +} + +export { DefaultStructTreeLayerFactory, StructTreeLayerBuilder }; diff --git a/web/text_layer_builder.css b/web/text_layer_builder.css index 98e76d5c2..1d453b16e 100644 --- a/web/text_layer_builder.css +++ b/web/text_layer_builder.css @@ -24,7 +24,7 @@ line-height: 1; } -.textLayer > span { +.textLayer span { color: transparent; position: absolute; white-space: pre; diff --git a/web/viewer.css b/web/viewer.css index e982b2577..442f57580 100644 --- a/web/viewer.css +++ b/web/viewer.css @@ -175,7 +175,7 @@ select { display: none !important; } -.pdfViewer.enablePermissions .textLayer > span { +.pdfViewer.enablePermissions .textLayer span { user-select: none !important; cursor: not-allowed; } @@ -195,12 +195,12 @@ select { display: none; } -.pdfPresentationMode:fullscreen .textLayer > span { +.pdfPresentationMode:fullscreen .textLayer span { cursor: none; } .pdfPresentationMode.pdfPresentationModeControls > *, -.pdfPresentationMode.pdfPresentationModeControls .textLayer > span { +.pdfPresentationMode.pdfPresentationModeControls .textLayer span { cursor: default; } @@ -1653,19 +1653,19 @@ html[dir="rtl"] #documentPropertiesOverlay .row > * { mix-blend-mode: screen; } -#viewer.textLayer-visible .textLayer > span { +#viewer.textLayer-visible .textLayer span { background-color: rgba(255, 255, 0, 0.1); color: rgba(0, 0, 0, 1); border: solid 1px rgba(255, 0, 0, 0.5); box-sizing: border-box; } -#viewer.textLayer-hover .textLayer > span:hover { +#viewer.textLayer-hover .textLayer span:hover { background-color: rgba(255, 255, 255, 1); color: rgba(0, 0, 0, 1); } -#viewer.textLayer-shadow .textLayer > span { +#viewer.textLayer-shadow .textLayer span { background-color: rgba(255, 255, 255, 0.6); color: rgba(0, 0, 0, 1); }