Add support for basic structure tree for accessibility.

When a PDF is "marked" we now generate a separate DOM that represents
the structure tree from the PDF.  This DOM is inserted into the <canvas>
element and allows screen readers to walk the tree and have more
information about headings, images, links, etc. To link the structure
tree DOM (which is empty) to the text layer, aria-owns is used. This
required modifying the text layer creation so that marked items are
now tracked.
This commit is contained in:
Brendan Dahl 2021-03-31 15:07:02 -07:00
parent 6429ccc002
commit fc9501a637
22 changed files with 911 additions and 14 deletions

View File

@ -58,6 +58,7 @@ import { calculateMD5 } from "./crypto.js";
import { Linearization } from "./parser.js"; import { Linearization } from "./parser.js";
import { OperatorList } from "./operator_list.js"; import { OperatorList } from "./operator_list.js";
import { PartialEvaluator } from "./evaluator.js"; import { PartialEvaluator } from "./evaluator.js";
import { StructTreePage } from "./struct_tree.js";
import { XFAFactory } from "./xfa/factory.js"; import { XFAFactory } from "./xfa/factory.js";
const DEFAULT_USER_UNIT = 1.0; const DEFAULT_USER_UNIT = 1.0;
@ -104,6 +105,10 @@ class Page {
static createObjId() { static createObjId() {
return `p${pageIndex}_${++idCounters.obj}`; return `p${pageIndex}_${++idCounters.obj}`;
} }
/**
* Builds the page-level object id: the literal prefix "page" followed by
* the string form of `ref` (a variable captured from the enclosing scope).
* @returns {string}
*/
static getPageObjId() {
return `page${ref.toString()}`;
}
}; };
} }
@ -406,6 +411,7 @@ class Page {
handler, handler,
task, task,
normalizeWhitespace, normalizeWhitespace,
includeMarkedContent,
sink, sink,
combineTextItems, combineTextItems,
}) { }) {
@ -437,12 +443,22 @@ class Page {
task, task,
resources: this.resources, resources: this.resources,
normalizeWhitespace, normalizeWhitespace,
includeMarkedContent,
combineTextItems, combineTextItems,
sink, sink,
}); });
}); });
} }
async getStructTree() {
const structTreeRoot = await this.pdfManager.ensureCatalog(
"structTreeRoot"
);
const tree = new StructTreePage(structTreeRoot, this.pageDict);
tree.parse();
return tree;
}
getAnnotationsData(intent) { getAnnotationsData(intent) {
return this._parsedAnnotations.then(function (annotations) { return this._parsedAnnotations.then(function (annotations) {
const annotationsData = []; const annotationsData = [];
@ -604,6 +620,10 @@ class PDFDocument {
static createObjId() { static createObjId() {
unreachable("Abstract method `createObjId` called."); unreachable("Abstract method `createObjId` called.");
} }
/**
* Abstract placeholder: always reports via `unreachable`, so it must be
* overridden by an id factory that actually knows its page.
*/
static getPageObjId() {
unreachable("Abstract method `getPageObjId` called.");
}
}; };
} }

View File

@ -1913,7 +1913,10 @@ class PartialEvaluator {
return; return;
} }
// Other marked content types aren't supported yet. // Other marked content types aren't supported yet.
args = [args[0].name]; args = [
args[0].name,
args[1] instanceof Dict ? args[1].get("MCID") : null,
];
break; break;
case OPS.beginMarkedContent: case OPS.beginMarkedContent:
@ -1973,6 +1976,7 @@ class PartialEvaluator {
stateManager = null, stateManager = null,
normalizeWhitespace = false, normalizeWhitespace = false,
combineTextItems = false, combineTextItems = false,
includeMarkedContent = false,
sink, sink,
seenStyles = new Set(), seenStyles = new Set(),
}) { }) {
@ -2573,6 +2577,7 @@ class PartialEvaluator {
stateManager: xObjStateManager, stateManager: xObjStateManager,
normalizeWhitespace, normalizeWhitespace,
combineTextItems, combineTextItems,
includeMarkedContent,
sink: sinkWrapper, sink: sinkWrapper,
seenStyles, seenStyles,
}) })
@ -2650,6 +2655,38 @@ class PartialEvaluator {
}) })
); );
return; return;
case OPS.beginMarkedContent:
if (includeMarkedContent) {
textContent.items.push({
type: "beginMarkedContent",
tag: isName(args[0]) ? args[0].name : null,
});
}
break;
case OPS.beginMarkedContentProps:
if (includeMarkedContent) {
flushTextContentItem();
let mcid = null;
if (isDict(args[1])) {
mcid = args[1].get("MCID");
}
textContent.items.push({
type: "beginMarkedContentProps",
id: Number.isInteger(mcid)
? `${self.idFactory.getPageObjId()}_mcid${mcid}`
: null,
tag: isName(args[0]) ? args[0].name : null,
});
}
break;
case OPS.endMarkedContent:
if (includeMarkedContent) {
flushTextContentItem();
textContent.items.push({
type: "endMarkedContent",
});
}
break;
} // switch } // switch
if (textContent.items.length >= sink.desiredSize) { if (textContent.items.length >= sink.desiredSize) {
// Wait for ready, if we reach highWaterMark. // Wait for ready, if we reach highWaterMark.

View File

@ -60,6 +60,7 @@ import { CipherTransformFactory } from "./crypto.js";
import { ColorSpace } from "./colorspace.js"; import { ColorSpace } from "./colorspace.js";
import { GlobalImageCache } from "./image_utils.js"; import { GlobalImageCache } from "./image_utils.js";
import { MetadataParser } from "./metadata_parser.js"; import { MetadataParser } from "./metadata_parser.js";
import { StructTreeRoot } from "./struct_tree.js";
function fetchDestination(dest) { function fetchDestination(dest) {
return isDict(dest) ? dest.get("D") : dest; return isDict(dest) ? dest.get("D") : dest;
@ -200,6 +201,32 @@ class Catalog {
return markInfo; return markInfo;
} }
get structTreeRoot() {
let structTree = null;
try {
structTree = this._readStructTreeRoot();
} catch (ex) {
if (ex instanceof MissingDataException) {
throw ex;
}
warn("Unable read to structTreeRoot info.");
}
return shadow(this, "structTreeRoot", structTree);
}
/**
* @private
*/
_readStructTreeRoot() {
const obj = this._catDict.get("StructTreeRoot");
if (!isDict(obj)) {
return null;
}
const root = new StructTreeRoot(obj);
root.init();
return root;
}
get toplevelPagesDict() { get toplevelPagesDict() {
const pagesObj = this._catDict.get("Pages"); const pagesObj = this._catDict.get("Pages");
if (!isDict(pagesObj)) { if (!isDict(pagesObj)) {
@ -2626,4 +2653,4 @@ const ObjectLoader = (function () {
return ObjectLoader; return ObjectLoader;
})(); })();
export { Catalog, FileSpec, ObjectLoader, XRef }; export { Catalog, FileSpec, NumberTree, ObjectLoader, XRef };

335
src/core/struct_tree.js Normal file
View File

@ -0,0 +1,335 @@
/* Copyright 2021 Mozilla Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import { isDict, isName, isRef } from "./primitives.js";
import { isString, stringToPDFString, warn } from "../shared/util.js";
import { NumberTree } from "./obj.js";
// Upper bound on recursion depth, both when walking up parent links while
// building the page tree and when serializing the resulting tree.
const MAX_DEPTH = 40;
// Kinds of kid that a structure element's "K" entry can yield.
const StructElementType = {
// An integer kid: a marked-content id (mcid) on the page.
PAGE_CONTENT: "PAGE_CONTENT",
// A dictionary kid whose Type is "MCR" (marked-content reference).
STREAM_CONTENT: "STREAM_CONTENT",
// A dictionary kid whose Type is "OBJR" (object reference).
OBJECT: "OBJECT",
// Any other dictionary kid, treated as a nested structure element.
ELEMENT: "ELEMENT",
};
/**
 * Wrapper around the structure tree root dictionary that exposes the
 * document's role map as a plain `Map` of key -> mapped role name.
 */
class StructTreeRoot {
  /**
   * @param rootDict - The structure tree root dictionary.
   */
  constructor(rootDict) {
    this.dict = rootDict;
    this.roleMap = new Map();
  }

  /** Populates the data derived from the root dictionary. */
  init() {
    this.readRoleMap();
  }

  /**
   * Copies every "RoleMap" entry whose value is a name into `roleMap`;
   * entries with non-name values are ignored, and nothing happens when
   * "RoleMap" is not a dictionary.
   */
  readRoleMap() {
    const roles = this.dict.get("RoleMap");
    if (isDict(roles)) {
      roles.forEach((customRole, mappedRole) => {
        if (isName(mappedRole)) {
          this.roleMap.set(customRole, mappedRole.name);
        }
      });
    }
  }
}
/**
* Instead of loading the whole tree we load just the page's relevant structure
* elements, which means we need a wrapper structure to represent the tree.
*
* Each node wraps one structure element dictionary and, on construction,
* parses that dictionary's "K" entry into `StructElement` records.
*/
class StructElementNode {
/**
* @param {StructTreePage} tree - Owning page tree; its `root` and
*   `pageDict` are read by this class.
* @param dict - The structure element dictionary wrapped by this node.
*/
constructor(tree, dict) {
this.tree = tree;
this.dict = dict;
this.kids = [];
this.parseKids();
}
/**
* The name stored under "S" (an empty string when "S" is not a name),
* translated through the root's role map when the map has an entry for it.
*/
get role() {
const nameObj = this.dict.get("S");
const name = isName(nameObj) ? nameObj.name : "";
const { root } = this.tree;
if (root.roleMap.has(name)) {
return root.roleMap.get(name);
}
return name;
}
/**
* Fills `this.kids` from the "K" entry, which is handled both as an array
* of kids and as a single kid. Kids that `parseKid` rejects are dropped.
*/
parseKids() {
// The element's own "Pg" reference (if any) is the default page for
// its kids; it is read raw so that the reference itself is kept.
let pageObjId = null;
const objRef = this.dict.getRaw("Pg");
if (isRef(objRef)) {
pageObjId = objRef.toString();
}
const kids = this.dict.get("K");
if (Array.isArray(kids)) {
for (const kid of kids) {
const element = this.parseKid(pageObjId, kid);
if (element) {
this.kids.push(element);
}
}
} else {
const element = this.parseKid(pageObjId, kids);
if (element) {
this.kids.push(element);
}
}
}
/**
* Converts one kid into a `StructElement`.
*
* @param {string|null} pageObjId - Page id inherited from the parent
*   element; a "Pg" reference on the kid's own dictionary overrides it.
* @param kid - An integer mcid, a reference, or a dictionary.
* @returns {StructElement|null} `null` when the kid cannot be resolved to
*   a dictionary, or when it is content/object data for a different page
*   than `this.tree.pageDict`.
*/
parseKid(pageObjId, kid) {
// A direct link to content, the integer is an mcid.
if (Number.isInteger(kid)) {
if (this.tree.pageDict.objId !== pageObjId) {
return null;
}
return new StructElement({
type: StructElementType.PAGE_CONTENT,
mcid: kid,
pageObjId,
});
}
// Find the dictionary for the kid.
let kidDict = null;
if (isRef(kid)) {
kidDict = this.dict.xref.fetch(kid);
} else if (isDict(kid)) {
kidDict = kid;
}
if (!kidDict) {
return null;
}
// A page reference on the kid itself takes precedence over the inherited
// one.
const pageRef = kidDict.getRaw("Pg");
if (isRef(pageRef)) {
pageObjId = pageRef.toString();
}
const type = isName(kidDict.get("Type")) ? kidDict.get("Type").name : null;
// Marked-content reference: content identified by an MCID, optionally
// inside the stream referenced by "Stm".
if (type === "MCR") {
if (this.tree.pageDict.objId !== pageObjId) {
return null;
}
return new StructElement({
type: StructElementType.STREAM_CONTENT,
refObjId: isRef(kidDict.getRaw("Stm"))
? kidDict.getRaw("Stm").toString()
: null,
pageObjId,
mcid: kidDict.get("MCID"),
});
}
// Object reference: points at another object through "Obj".
if (type === "OBJR") {
if (this.tree.pageDict.objId !== pageObjId) {
return null;
}
return new StructElement({
type: StructElementType.OBJECT,
refObjId: isRef(kidDict.getRaw("Obj"))
? kidDict.getRaw("Obj").toString()
: null,
pageObjId,
});
}
// Anything else is kept as a nested structure element; note that no page
// filtering is applied here.
return new StructElement({
type: StructElementType.ELEMENT,
dict: kidDict,
});
}
}
/**
 * Plain record describing one kid of a structure element.
 *
 * Every field except `type` defaults to `null`; `parentNode` always starts
 * out as `null` and is assigned later by the tree builder.
 */
class StructElement {
  /**
   * @param {Object} params
   * @param {string} params.type - One of the `StructElementType` values.
   * @param [params.dict] - Dictionary of a nested structure element.
   * @param [params.mcid] - Marked-content id.
   * @param [params.pageObjId] - Id of the page the kid belongs to.
   * @param [params.refObjId] - Id of a referenced stream or object.
   */
  constructor(params) {
    const {
      type,
      dict = null,
      mcid = null,
      pageObjId = null,
      refObjId = null,
    } = params;
    Object.assign(this, {
      type,
      dict,
      mcid,
      pageObjId,
      refObjId,
      parentNode: null,
    });
  }
}
/**
* The part of the document's structure tree that is relevant to one page.
* `parse()` fills `nodes` with the top-level nodes, and `serializable`
* turns the result into plain objects.
*/
class StructTreePage {
/**
* @param {StructTreeRoot|null} structTreeRoot - Document-level root; may be
*   falsy, in which case `parse()` does nothing.
* @param pageDict - Dictionary of the page this tree is built for.
*/
constructor(structTreeRoot, pageDict) {
this.root = structTreeRoot;
this.rootDict = structTreeRoot ? structTreeRoot.dict : null;
this.pageDict = pageDict;
this.nodes = [];
}
/**
* Looks up this page's entry in the root's "ParentTree" (keyed by the
* page's integer "StructParents" value) and adds a node for every
* reference found in that entry. Returns early, leaving `nodes` empty,
* whenever one of the required pieces is missing or of the wrong type.
*/
parse() {
if (!this.root || !this.rootDict) {
return;
}
const parentTree = this.rootDict.get("ParentTree");
if (!parentTree) {
return;
}
const id = this.pageDict.get("StructParents");
if (!Number.isInteger(id)) {
return;
}
const numberTree = new NumberTree(parentTree, this.rootDict.xref);
const parentArray = numberTree.get(id);
if (!Array.isArray(parentArray)) {
return;
}
// Shared across all addNode calls so each dictionary gets a single node.
const map = new Map();
for (const ref of parentArray) {
if (isRef(ref)) {
this.addNode(this.rootDict.xref.fetch(ref), map);
}
}
}
/**
* Creates (or reuses) the node for `dict` and attaches it to the tree by
* recursively adding its ancestors through the "P" entries.
*
* @param dict - Structure element dictionary.
* @param {Map} map - Cache of dictionary -> node.
* @param {number} [level] - Current recursion depth.
* @returns {StructElementNode|null} `null` only when MAX_DEPTH is exceeded.
*/
addNode(dict, map, level = 0) {
if (level > MAX_DEPTH) {
warn("StructTree MAX_DEPTH reached.");
return null;
}
if (map.has(dict)) {
return map.get(dict);
}
const element = new StructElementNode(this, dict);
map.set(dict, element);
const parent = dict.get("P");
// No parent, or the parent is the tree root: this is a top-level node.
if (!parent || isName(parent.get("Type"), "StructTreeRoot")) {
if (!this.addTopLevelNode(dict, element)) {
map.delete(dict);
}
return element;
}
const parentNode = this.addNode(parent, map, level + 1);
if (!parentNode) {
return element;
}
// Link the new node into its parent: despite the property name, a kid's
// `parentNode` is set to the node that wraps the kid's own dictionary.
let save = false;
for (const kid of parentNode.kids) {
if (kid.type === StructElementType.ELEMENT && kid.dict === dict) {
kid.parentNode = element;
save = true;
}
}
// The node was not found among its parent's kids, so forget it.
if (!save) {
map.delete(dict);
}
return element;
}
/**
* Stores `element` in `nodes` at the position its dictionary occupies in
* the root's "K" entry.
*
* @returns {boolean} Whether the node should stay cached. Note that `true`
*   is also returned, without storing anything, when "K" is neither a
*   dictionary nor an array.
*/
addTopLevelNode(dict, element) {
const obj = this.rootDict.get("K");
if (!obj) {
return false;
}
if (isDict(obj)) {
if (obj.objId !== dict.objId) {
return false;
}
this.nodes[0] = element;
return true;
}
if (!Array.isArray(obj)) {
return true;
}
// `nodes` may end up sparse: only the matching indices are filled.
let save = false;
for (let i = 0; i < obj.length; i++) {
const kidRef = obj[i];
if (kidRef && kidRef.toString() === dict.objId) {
this.nodes[i] = element;
save = true;
}
}
return save;
}
/**
* Convert the tree structure into a simplified object literal that can
* be sent to the main thread.
* @returns {Object}
*/
get serializable() {
// Appends a plain-object version of `node` to `parent.children`, then
// recurses into the node's kids (up to MAX_DEPTH levels).
function nodeToSerializable(node, parent, level = 0) {
if (level > MAX_DEPTH) {
warn("StructTree too deep to be fully serialized.");
return;
}
const obj = Object.create(null);
obj.role = node.role;
obj.children = [];
parent.children.push(obj);
const alt = node.dict.get("Alt");
if (isString(alt)) {
obj.alt = stringToPDFString(alt);
}
for (const kid of node.kids) {
// Element kids are only serialized when a node was linked to them;
// unlinked element kids are skipped entirely.
const kidElement =
kid.type === StructElementType.ELEMENT ? kid.parentNode : null;
if (kidElement) {
nodeToSerializable(kidElement, obj, level + 1);
continue;
} else if (
kid.type === StructElementType.PAGE_CONTENT ||
kid.type === StructElementType.STREAM_CONTENT
) {
obj.children.push({
type: "content",
id: `page${kid.pageObjId}_mcid${kid.mcid}`,
});
} else if (kid.type === StructElementType.OBJECT) {
obj.children.push({
type: "object",
id: kid.refObjId,
});
}
}
}
const root = Object.create(null);
root.children = [];
root.role = "Root";
for (const child of this.nodes) {
// Skip the holes of the sparse `nodes` array.
if (!child) {
continue;
}
nodeToSerializable(child, root);
}
return root;
}
}
export { StructTreePage, StructTreeRoot };

View File

@ -717,6 +717,7 @@ class WorkerMessageHandler {
task, task,
sink, sink,
normalizeWhitespace: data.normalizeWhitespace, normalizeWhitespace: data.normalizeWhitespace,
includeMarkedContent: data.includeMarkedContent,
combineTextItems: data.combineTextItems, combineTextItems: data.combineTextItems,
}) })
.then( .then(
@ -745,6 +746,18 @@ class WorkerMessageHandler {
}); });
}); });
// Resolves with the serializable structure tree of the requested page.
handler.on("GetStructTree", function wphGetStructTree({ pageIndex }) {
  const pagePromise = pdfManager.getPage(pageIndex);
  const treePromise = pagePromise.then(page =>
    pdfManager.ensure(page, "getStructTree")
  );
  return treePromise.then(structTree => structTree.serializable);
});
handler.on("FontFallback", function (data) { handler.on("FontFallback", function (data) {
return pdfManager.fontFallback(data.id, handler); return pdfManager.fontFallback(data.id, handler);
}); });

View File

@ -1013,13 +1013,17 @@ class PDFDocumentProxy {
* whitespace with standard spaces (0x20). The default value is `false`. * whitespace with standard spaces (0x20). The default value is `false`.
* @property {boolean} disableCombineTextItems - Do not attempt to combine * @property {boolean} disableCombineTextItems - Do not attempt to combine
* same line {@link TextItem}'s. The default value is `false`. * same line {@link TextItem}'s. The default value is `false`.
* @property {boolean} [includeMarkedContent] - When true include marked
* content items in the items array of TextContent. The default is `false`.
*/ */
/** /**
* Page text content. * Page text content.
* *
* @typedef {Object} TextContent * @typedef {Object} TextContent
* @property {Array<TextItem>} items - Array of {@link TextItem} objects. * @property {Array<TextItem | TextMarkedContent>} items - Array of
* {@link TextItem} and {@link TextMarkedContent} objects. TextMarkedContent
* items are included when includeMarkedContent is true.
* @property {Object<string, TextStyle>} styles - {@link TextStyle} objects, * @property {Object<string, TextStyle>} styles - {@link TextStyle} objects,
* indexed by font name. * indexed by font name.
*/ */
@ -1034,6 +1038,17 @@ class PDFDocumentProxy {
* @property {number} width - Width in device space. * @property {number} width - Width in device space.
* @property {number} height - Height in device space. * @property {number} height - Height in device space.
* @property {string} fontName - Font name used by PDF.js for converted font. * @property {string} fontName - Font name used by PDF.js for converted font.
*
*/
/**
* Page text marked content part.
*
* @typedef {Object} TextMarkedContent
* @property {string} type - Either 'beginMarkedContent',
* 'beginMarkedContentProps', or 'endMarkedContent'.
* @property {string} id - The marked content identifier. Only used for type
* 'beginMarkedContentProps'.
*/ */
/** /**
@ -1089,6 +1104,25 @@ class PDFDocumentProxy {
* states set. * states set.
*/ */
/**
* Structure tree node. The root node will have a role "Root".
*
* @typedef {Object} StructTreeNode
* @property {Array<StructTreeNode | StructTreeContent>} children - Array of
* {@link StructTreeNode} and {@link StructTreeContent} objects.
* @property {string} role - element's role, already mapped if a role map exists
* in the PDF.
*/
/**
* Structure tree content.
*
* @typedef {Object} StructTreeContent
* @property {string} type - either "content" for page and stream structure
* elements or "object" for object references.
* @property {string} id - unique id that will map to the text layer.
*/
/** /**
* PDF page operator list. * PDF page operator list.
* *
@ -1408,6 +1442,7 @@ class PDFPageProxy {
streamTextContent({ streamTextContent({
normalizeWhitespace = false, normalizeWhitespace = false,
disableCombineTextItems = false, disableCombineTextItems = false,
includeMarkedContent = false,
} = {}) { } = {}) {
const TEXT_CONTENT_CHUNK_SIZE = 100; const TEXT_CONTENT_CHUNK_SIZE = 100;
@ -1417,6 +1452,7 @@ class PDFPageProxy {
pageIndex: this._pageIndex, pageIndex: this._pageIndex,
normalizeWhitespace: normalizeWhitespace === true, normalizeWhitespace: normalizeWhitespace === true,
combineTextItems: disableCombineTextItems !== true, combineTextItems: disableCombineTextItems !== true,
includeMarkedContent: includeMarkedContent === true,
}, },
{ {
highWaterMark: TEXT_CONTENT_CHUNK_SIZE, highWaterMark: TEXT_CONTENT_CHUNK_SIZE,
@ -1457,6 +1493,16 @@ class PDFPageProxy {
}); });
} }
/**
* @returns {Promise<StructTreeNode>} A promise that is resolved with a
* {@link StructTreeNode} object that represents the page's structure tree.
*/
getStructTree() {
return (this._structTreePromise ||= this._transport.getStructTree(
this._pageIndex
));
}
/** /**
* Destroys the page object. * Destroys the page object.
* @private * @private
@ -1486,6 +1532,7 @@ class PDFPageProxy {
this._annotationsPromise = null; this._annotationsPromise = null;
this._jsActionsPromise = null; this._jsActionsPromise = null;
this._xfaPromise = null; this._xfaPromise = null;
this._structTreePromise = null;
this.pendingCleanup = false; this.pendingCleanup = false;
return Promise.all(waitOn); return Promise.all(waitOn);
} }
@ -1521,6 +1568,7 @@ class PDFPageProxy {
this._annotationsPromise = null; this._annotationsPromise = null;
this._jsActionsPromise = null; this._jsActionsPromise = null;
this._xfaPromise = null; this._xfaPromise = null;
this._structTreePromise = null;
if (resetStats && this._stats) { if (resetStats && this._stats) {
this._stats = new StatTimer(); this._stats = new StatTimer();
} }
@ -2755,6 +2803,12 @@ class WorkerTransport {
}); });
} }
getStructTree(pageIndex) {
return this.messageHandler.sendWithPromise("GetStructTree", {
pageIndex,
});
}
getOutline() { getOutline() {
return this.messageHandler.sendWithPromise("GetOutline", null); return this.messageHandler.sendWithPromise("GetOutline", null);
} }

View File

@ -638,6 +638,23 @@ const renderTextLayer = (function renderTextLayerClosure() {
_processItems(items, styleCache) { _processItems(items, styleCache) {
for (let i = 0, len = items.length; i < len; i++) { for (let i = 0, len = items.length; i < len; i++) {
if (items[i].str === undefined) {
if (
items[i].type === "beginMarkedContentProps" ||
items[i].type === "beginMarkedContent"
) {
const parent = this._container;
this._container = document.createElement("span");
this._container.classList.add("markedContent");
if (items[i].id !== null) {
this._container.setAttribute("id", `${items[i].id}`);
}
parent.appendChild(this._container);
} else if (items[i].type === "endMarkedContent") {
this._container = this._container.parentNode;
}
continue;
}
this._textContentItemsStr.push(items[i].str); this._textContentItemsStr.push(items[i].str);
appendText(this, items[i], styleCache, this._layoutTextCtx); appendText(this, items[i], styleCache, this._layoutTextCtx);
} }

View File

@ -572,6 +572,7 @@ var Driver = (function DriverClosure() {
initPromise = page initPromise = page
.getTextContent({ .getTextContent({
normalizeWhitespace: true, normalizeWhitespace: true,
includeMarkedContent: true,
}) })
.then(function (textContent) { .then(function (textContent) {
return rasterizeTextLayer( return rasterizeTextLayer(

View File

@ -24,7 +24,11 @@ async function runTests(results) {
jasmine.loadConfig({ jasmine.loadConfig({
random: false, random: false,
spec_dir: "integration", spec_dir: "integration",
spec_files: ["scripting_spec.js", "annotation_spec.js"], spec_files: [
"scripting_spec.js",
"annotation_spec.js",
"accessibility_spec.js",
],
}); });
jasmine.addReporter({ jasmine.addReporter({

View File

@ -0,0 +1,69 @@
/* Copyright 2021 Mozilla Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
const { closePages, loadAndWait } = require("./test_utils.js");
// Integration test: the structure tree DOM must point, via aria-owns, at
// the text layer elements that hold the corresponding text.
describe("accessibility", () => {
describe("structure tree", () => {
let pages;
beforeAll(async () => {
pages = await loadAndWait("structure_simple.pdf", ".structTree");
});
afterAll(async () => {
await closePages(pages);
});
it("must build structure that maps to text layer", async () => {
await Promise.all(
pages.map(async ([browserName, page]) => {
await page.waitForSelector(".structTree");
// Check the headings match up.
// NOTE: the callbacks given to $eval/$$eval are handed to the page, so
// they are kept self-contained (no references to test-scope helpers).
const head1 = await page.$eval(
".structTree [role='heading'][aria-level='1'] span",
el =>
document.getElementById(el.getAttribute("aria-owns")).textContent
);
expect(head1).withContext(`In ${browserName}`).toEqual("Heading 1");
const head2 = await page.$eval(
".structTree [role='heading'][aria-level='2'] span",
el =>
document.getElementById(el.getAttribute("aria-owns")).textContent
);
expect(head2).withContext(`In ${browserName}`).toEqual("Heading 2");
// Check the order of the content.
const texts = await page.$$eval(".structTree [aria-owns]", nodes =>
nodes.map(
el =>
document.getElementById(el.getAttribute("aria-owns"))
.textContent
)
);
expect(texts)
.withContext(`In ${browserName}`)
.toEqual([
"Heading 1",
"This paragraph 1.",
"Heading 2",
"This paragraph 2.",
]);
})
);
});
});
});

View File

@ -71,6 +71,7 @@
!issue8570.pdf !issue8570.pdf
!issue8697.pdf !issue8697.pdf
!issue8702.pdf !issue8702.pdf
!structure_simple.pdf
!issue12823.pdf !issue12823.pdf
!issue8707.pdf !issue8707.pdf
!issue8798r.pdf !issue8798r.pdf

Binary file not shown.

View File

@ -23,7 +23,7 @@
bottom: 0; bottom: 0;
line-height: 1; line-height: 1;
} }
.textLayer > span { .textLayer span {
position: absolute; position: absolute;
white-space: pre; white-space: pre;
-webkit-transform-origin: 0% 0%; -webkit-transform-origin: 0% 0%;
@ -37,3 +37,8 @@
-moz-box-sizing: border-box; -moz-box-sizing: border-box;
box-sizing: border-box; box-sizing: border-box;
} }
.textLayer .markedContent {
border: none;
background-color: transparent;
}

View File

@ -34,6 +34,7 @@
"pdf_history_spec.js", "pdf_history_spec.js",
"primitives_spec.js", "primitives_spec.js",
"stream_spec.js", "stream_spec.js",
"struct_tree_spec.js",
"type1_parser_spec.js", "type1_parser_spec.js",
"ui_utils_spec.js", "ui_utils_spec.js",
"unicode_spec.js", "unicode_spec.js",

View File

@ -80,6 +80,7 @@ async function initializePDFJS(callback) {
"pdfjs-test/unit/primitives_spec.js", "pdfjs-test/unit/primitives_spec.js",
"pdfjs-test/unit/scripting_spec.js", "pdfjs-test/unit/scripting_spec.js",
"pdfjs-test/unit/stream_spec.js", "pdfjs-test/unit/stream_spec.js",
"pdfjs-test/unit/struct_tree_spec.js",
"pdfjs-test/unit/type1_parser_spec.js", "pdfjs-test/unit/type1_parser_spec.js",
"pdfjs-test/unit/ui_utils_spec.js", "pdfjs-test/unit/ui_utils_spec.js",
"pdfjs-test/unit/unicode_spec.js", "pdfjs-test/unit/unicode_spec.js",

View File

@ -0,0 +1,108 @@
/* Copyright 2021 Mozilla Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import { buildGetDocumentParams } from "./test_utils.js";
import { getDocument } from "../../src/display/api.js";
/**
 * Asserts (through `expect`) that two structure trees have the same shape:
 * matching `role` and `type` on every node, and the same number of
 * children at every level.
 *
 * @param {Object} rootA - First tree.
 * @param {Object} rootB - Second tree.
 */
function equalTrees(rootA, rootB) {
  function walk(a, b) {
    expect(a.role).toEqual(b.role);
    expect(a.type).toEqual(b.type);
    expect("children" in a).toEqual("children" in b);
    if (!a.children) {
      return;
    }
    expect(a.children.length).toEqual(b.children.length);
    // Iterate over the *current* node's children. Using the root's child
    // count here would compare only a prefix of each child list and would
    // index past the end of shorter (e.g. empty) lists.
    for (let i = 0; i < a.children.length; i++) {
      walk(a.children[i], b.children[i]);
    }
  }
  return walk(rootA, rootB);
}
describe("struct tree", function () {
  describe("getStructTree", function () {
    // Loads page 1 of the given test PDF, compares its structure tree with
    // `expectedTree`, then tears the loading task down.
    async function checkFirstPageTree(filename, expectedTree) {
      const loadingTask = getDocument(buildGetDocumentParams(filename));
      const pdfDoc = await loadingTask.promise;
      const firstPage = await pdfDoc.getPage(1);
      const actualTree = await firstPage.getStructTree();
      equalTrees(expectedTree, actualTree);
      await loadingTask.destroy();
    }

    // A block-level element whose only child is a "NonStruct" wrapper
    // around a single content item.
    function wrappedContent(role) {
      return {
        role,
        children: [{ role: "NonStruct", children: [{ type: "content" }] }],
      };
    }

    it("parses basic structure", async function () {
      await checkFirstPageTree("structure_simple.pdf", {
        role: "Root",
        children: [
          {
            role: "Document",
            children: ["H1", "P", "H2", "P"].map(wrappedContent),
          },
        ],
      });
    });

    it("parses structure with marked content reference", async function () {
      const paragraph = {
        role: "P",
        children: Array(27).fill({ type: "content" }),
      };
      await checkFirstPageTree("issue6782.pdf", {
        role: "Root",
        children: [{ role: "Part", children: [paragraph] }],
      });
    });
  });
});

View File

@ -41,6 +41,7 @@ import { AnnotationLayerBuilder } from "./annotation_layer_builder.js";
import { NullL10n } from "./l10n_utils.js"; import { NullL10n } from "./l10n_utils.js";
import { PDFPageView } from "./pdf_page_view.js"; import { PDFPageView } from "./pdf_page_view.js";
import { SimpleLinkService } from "./pdf_link_service.js"; import { SimpleLinkService } from "./pdf_link_service.js";
import { StructTreeLayerBuilder } from "./struct_tree_layer_builder.js";
import { TextLayerBuilder } from "./text_layer_builder.js"; import { TextLayerBuilder } from "./text_layer_builder.js";
import { XfaLayerBuilder } from "./xfa_layer_builder.js"; import { XfaLayerBuilder } from "./xfa_layer_builder.js";
@ -545,6 +546,7 @@ class BaseViewer {
textLayerMode: this.textLayerMode, textLayerMode: this.textLayerMode,
annotationLayerFactory: this, annotationLayerFactory: this,
xfaLayerFactory, xfaLayerFactory,
structTreeLayerFactory: this,
imageResourcesPath: this.imageResourcesPath, imageResourcesPath: this.imageResourcesPath,
renderInteractiveForms: this.renderInteractiveForms, renderInteractiveForms: this.renderInteractiveForms,
renderer: this.renderer, renderer: this.renderer,
@ -1329,6 +1331,16 @@ class BaseViewer {
}); });
} }
/**
* @param {PDFPage} pdfPage
* @returns {StructTreeLayerBuilder}
*/
createStructTreeLayerBuilder(pdfPage) {
return new StructTreeLayerBuilder({
pdfPage,
});
}
/** /**
* @type {boolean} Whether all pages of the PDF document have identical * @type {boolean} Whether all pages of the PDF document have identical
* widths and heights. * widths and heights.

View File

@ -216,6 +216,17 @@ class IPDFXfaLayerFactory {
createXfaLayerBuilder(pageDiv, pdfPage) {} createXfaLayerBuilder(pageDiv, pdfPage) {}
} }
/**
* Factory interface for objects that can create a structure tree layer
* builder for a page. The method body is intentionally empty.
*
* @interface
*/
class IPDFStructTreeLayerFactory {
/**
* @param {PDFPage} pdfPage - Page the builder is created for.
* @returns {StructTreeLayerBuilder}
*/
createStructTreeLayerBuilder(pdfPage) {}
}
/** /**
* @interface * @interface
*/ */
@ -254,6 +265,7 @@ export {
IPDFAnnotationLayerFactory, IPDFAnnotationLayerFactory,
IPDFHistory, IPDFHistory,
IPDFLinkService, IPDFLinkService,
IPDFStructTreeLayerFactory,
IPDFTextLayerFactory, IPDFTextLayerFactory,
IPDFXfaLayerFactory, IPDFXfaLayerFactory,
IRenderableView, IRenderableView,

View File

@ -49,6 +49,7 @@ import { viewerCompatibilityParams } from "./viewer_compatibility.js";
* The default value is `TextLayerMode.ENABLE`. * The default value is `TextLayerMode.ENABLE`.
* @property {IPDFAnnotationLayerFactory} annotationLayerFactory * @property {IPDFAnnotationLayerFactory} annotationLayerFactory
* @property {IPDFXfaLayerFactory} xfaLayerFactory * @property {IPDFXfaLayerFactory} xfaLayerFactory
* @property {IPDFStructTreeLayerFactory} structTreeLayerFactory
* @property {string} [imageResourcesPath] - Path for image resources, mainly * @property {string} [imageResourcesPath] - Path for image resources, mainly
* for annotation icons. Include trailing slash. * for annotation icons. Include trailing slash.
* @property {boolean} renderInteractiveForms - Turns on rendering of * @property {boolean} renderInteractiveForms - Turns on rendering of
@ -104,6 +105,7 @@ class PDFPageView {
this.textLayerFactory = options.textLayerFactory; this.textLayerFactory = options.textLayerFactory;
this.annotationLayerFactory = options.annotationLayerFactory; this.annotationLayerFactory = options.annotationLayerFactory;
this.xfaLayerFactory = options.xfaLayerFactory; this.xfaLayerFactory = options.xfaLayerFactory;
this.structTreeLayerFactory = options.structTreeLayerFactory;
this.renderer = options.renderer || RendererType.CANVAS; this.renderer = options.renderer || RendererType.CANVAS;
this.enableWebGL = options.enableWebGL || false; this.enableWebGL = options.enableWebGL || false;
this.l10n = options.l10n || NullL10n; this.l10n = options.l10n || NullL10n;
@ -119,6 +121,7 @@ class PDFPageView {
this.textLayer = null; this.textLayer = null;
this.zoomLayer = null; this.zoomLayer = null;
this.xfaLayer = null; this.xfaLayer = null;
this.structTreeLayer = null;
const div = document.createElement("div"); const div = document.createElement("div");
div.className = "page"; div.className = "page";
@ -357,6 +360,10 @@ class PDFPageView {
this.annotationLayer.cancel(); this.annotationLayer.cancel();
this.annotationLayer = null; this.annotationLayer = null;
} }
if (this._onTextLayerRendered) {
this.eventBus._off("textlayerrendered", this._onTextLayerRendered);
this._onTextLayerRendered = null;
}
} }
cssTransform(target, redrawAnnotations = false) { cssTransform(target, redrawAnnotations = false) {
@ -559,11 +566,12 @@ class PDFPageView {
this.paintTask = paintTask; this.paintTask = paintTask;
const resultPromise = paintTask.promise.then( const resultPromise = paintTask.promise.then(
function () { () => {
return finishPaintTask(null).then(function () { return finishPaintTask(null).then(() => {
if (textLayer) { if (textLayer) {
const readableStream = pdfPage.streamTextContent({ const readableStream = pdfPage.streamTextContent({
normalizeWhitespace: true, normalizeWhitespace: true,
includeMarkedContent: true,
}); });
textLayer.setTextContentStream(readableStream); textLayer.setTextContentStream(readableStream);
textLayer.render(); textLayer.render();
@ -602,6 +610,29 @@ class PDFPageView {
this._renderXfaLayer(); this._renderXfaLayer();
} }
// The structure tree is currently only supported when the text layer is
// enabled and a canvas is used for rendering.
if (this.structTreeLayerFactory && this.textLayer && this.canvas) {
// The structure tree must be generated after the text layer for the
// aria-owns to work.
this._onTextLayerRendered = event => {
if (event.pageNumber !== this.id) {
return;
}
this.eventBus._off("textlayerrendered", this._onTextLayerRendered);
this._onTextLayerRendered = null;
this.pdfPage.getStructTree().then(tree => {
const treeDom = this.structTreeLayer.render(tree);
treeDom.classList.add("structTree");
this.canvas.appendChild(treeDom);
});
};
this.eventBus._on("textlayerrendered", this._onTextLayerRendered);
this.structTreeLayer = this.structTreeLayerFactory.createStructTreeLayerBuilder(
pdfPage
);
}
div.setAttribute("data-loaded", true); div.setAttribute("data-loaded", true);
this.eventBus.dispatch("pagerender", { this.eventBus.dispatch("pagerender", {

View File

@ -0,0 +1,149 @@
/* Copyright 2021 Mozilla Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Lookup table from PDF structure type names (the `role` of a structure tree
 * node) to the value written into the `role` attribute of the generated
 * element.
 *
 * A `null` entry means the structure type is recognised but no `role`
 * attribute is emitted for it: the consumer in this file only sets the
 * attribute when the looked-up value is truthy.
 *
 * Numbered headings (`H1`, `H2`, ...) are not listed here; they are matched
 * by `HEADING_PATTERN` before this table is consulted.
 */
const PDF_ROLE_TO_HTML_ROLE = {
// Document level structure types
Document: null, // There's a "document" role, but it doesn't make sense here.
DocumentFragment: null,
// Grouping level structure types
Part: "group",
Sect: "group", // XXX: There's a "section" role, but it's abstract.
Div: "group",
Aside: "note",
NonStruct: "none",
// Block level structure types
P: null,
// H<n>,
H: "heading",
Title: null,
FENote: "note",
// Sub-block level structure type
Sub: "group",
// General inline level structure types
Lbl: null,
Span: null,
Em: null,
Strong: null,
Link: "link",
Annot: "note",
Form: "form",
// Ruby and Warichu structure types
Ruby: null,
RB: null,
RT: null,
RP: null,
Warichu: null,
WT: null,
WP: null,
// List standard structure types
L: "list",
LI: "listitem",
LBody: null,
// Table standard structure types
Table: "table",
TR: "row",
TH: "columnheader",
TD: "cell",
THead: "columnheader",
TBody: null,
TFoot: null,
// Standard structure type Caption
Caption: null,
// Standard structure type Figure
Figure: "figure",
// Standard structure type Formula
Formula: null,
// standard structure type Artifact
Artifact: null,
};
// Matches a numbered heading role: the letter "H" followed by one or more
// digits. The digits are captured (group 1) and used as the `aria-level`.
const HEADING_PATTERN = /^H(\d+)$/;
/**
* @typedef {Object} StructTreeLayerBuilderOptions
* @property {PDFPage} pdfPage
*/
/**
 * Converts a page's structure tree into a tree of `<span>` elements carrying
 * ARIA attributes (`role`, `aria-level`, `aria-label`, `aria-owns`).
 *
 * The spans hold no text of their own; a node's `id` is written to
 * `aria-owns` so that it can point at an element rendered elsewhere.
 */
class StructTreeLayerBuilder {
  /**
   * @param {StructTreeLayerBuilderOptions} options
   */
  constructor({ pdfPage }) {
    this.pdfPage = pdfPage;
  }

  /**
   * Builds the DOM for a structure tree.
   *
   * @param {Object} structTree - Root node of the structure tree. Nodes may
   *   carry `role`, `alt`, `id` and `children` properties.
   * @returns {HTMLElement|null} The root `<span>`, or `null` when
   *   `structTree` is falsy. Callers must handle the `null` case.
   */
  render(structTree) {
    return this._walk(structTree);
  }

  /**
   * Copies the accessibility-relevant properties of a structure node onto an
   * element: `alt` becomes `aria-label`, `id` becomes `aria-owns`.
   * Properties that are `undefined` are left untouched on the element.
   *
   * @param {Object} structElement
   * @param {HTMLElement} htmlElement
   */
  _setAttributes(structElement, htmlElement) {
    if (structElement.alt !== undefined) {
      htmlElement.setAttribute("aria-label", structElement.alt);
    }
    if (structElement.id !== undefined) {
      htmlElement.setAttribute("aria-owns", structElement.id);
    }
  }

  /**
   * Recursively converts a structure node and its descendants.
   *
   * @param {Object} node
   * @returns {HTMLElement|null} `null` when `node` is falsy.
   */
  _walk(node) {
    if (!node) {
      return null;
    }

    const element = document.createElement("span");
    if ("role" in node) {
      const { role } = node;
      const match = role.match(HEADING_PATTERN);
      if (match) {
        element.setAttribute("role", "heading");
        element.setAttribute("aria-level", match[1]);
      } else if (
        // The role name comes from the PDF, so only consult the table's own
        // keys. A plain property read would also find inherited members such
        // as "constructor" or "toString", which are truthy and would end up
        // stringified into the `role` attribute.
        Object.prototype.hasOwnProperty.call(PDF_ROLE_TO_HTML_ROLE, role) &&
        PDF_ROLE_TO_HTML_ROLE[role]
      ) {
        element.setAttribute("role", PDF_ROLE_TO_HTML_ROLE[role]);
      }
    }

    this._setAttributes(node, element);

    const { children } = node;
    if (children) {
      const onlyChild = children.length === 1 ? children[0] : null;
      if (onlyChild && "id" in onlyChild) {
        // Often there is only one content node so just set the values on the
        // parent node to avoid creating an extra span.
        this._setAttributes(onlyChild, element);
      } else {
        for (const kid of children) {
          // `_walk` yields null for missing entries; `appendChild(null)`
          // throws, so such entries are skipped.
          const kidElement = this._walk(kid);
          if (kidElement) {
            element.appendChild(kidElement);
          }
        }
      }
    }
    return element;
  }
}
/**
 * Default factory producing `StructTreeLayerBuilder` instances.
 *
 * @implements IPDFStructTreeLayerFactory
 */
class DefaultStructTreeLayerFactory {
  /**
   * Creates the builder for a single page.
   *
   * @param {PDFPage} pdfPage - Page handed to the builder as its `pdfPage`
   *   option.
   * @returns {StructTreeLayerBuilder}
   */
  createStructTreeLayerBuilder(pdfPage) {
    const builderOptions = { pdfPage };
    return new StructTreeLayerBuilder(builderOptions);
  }
}
export { DefaultStructTreeLayerFactory, StructTreeLayerBuilder };

View File

@ -24,7 +24,7 @@
line-height: 1; line-height: 1;
} }
.textLayer > span { .textLayer span {
color: transparent; color: transparent;
position: absolute; position: absolute;
white-space: pre; white-space: pre;

View File

@ -175,7 +175,7 @@ select {
display: none !important; display: none !important;
} }
.pdfViewer.enablePermissions .textLayer > span { .pdfViewer.enablePermissions .textLayer span {
user-select: none !important; user-select: none !important;
cursor: not-allowed; cursor: not-allowed;
} }
@ -195,12 +195,12 @@ select {
display: none; display: none;
} }
.pdfPresentationMode:fullscreen .textLayer > span { .pdfPresentationMode:fullscreen .textLayer span {
cursor: none; cursor: none;
} }
.pdfPresentationMode.pdfPresentationModeControls > *, .pdfPresentationMode.pdfPresentationModeControls > *,
.pdfPresentationMode.pdfPresentationModeControls .textLayer > span { .pdfPresentationMode.pdfPresentationModeControls .textLayer span {
cursor: default; cursor: default;
} }
@ -1653,19 +1653,19 @@ html[dir="rtl"] #documentPropertiesOverlay .row > * {
mix-blend-mode: screen; mix-blend-mode: screen;
} }
#viewer.textLayer-visible .textLayer > span { #viewer.textLayer-visible .textLayer span {
background-color: rgba(255, 255, 0, 0.1); background-color: rgba(255, 255, 0, 0.1);
color: rgba(0, 0, 0, 1); color: rgba(0, 0, 0, 1);
border: solid 1px rgba(255, 0, 0, 0.5); border: solid 1px rgba(255, 0, 0, 0.5);
box-sizing: border-box; box-sizing: border-box;
} }
#viewer.textLayer-hover .textLayer > span:hover { #viewer.textLayer-hover .textLayer span:hover {
background-color: rgba(255, 255, 255, 1); background-color: rgba(255, 255, 255, 1);
color: rgba(0, 0, 0, 1); color: rgba(0, 0, 0, 1);
} }
#viewer.textLayer-shadow .textLayer > span { #viewer.textLayer-shadow .textLayer span {
background-color: rgba(255, 255, 255, 0.6); background-color: rgba(255, 255, 255, 0.6);
color: rgba(0, 0, 0, 1); color: rgba(0, 0, 0, 1);
} }