Merge pull request #13171 from brendandahl/struct-tree

[api-minor] Add support for basic structure tree for accessibility.
This commit is contained in:
Tim van der Meij 2021-04-09 21:32:44 +02:00 committed by GitHub
commit 03c8c89002
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
22 changed files with 911 additions and 14 deletions

View File

@ -58,6 +58,7 @@ import { calculateMD5 } from "./crypto.js";
import { Linearization } from "./parser.js"; import { Linearization } from "./parser.js";
import { OperatorList } from "./operator_list.js"; import { OperatorList } from "./operator_list.js";
import { PartialEvaluator } from "./evaluator.js"; import { PartialEvaluator } from "./evaluator.js";
import { StructTreePage } from "./struct_tree.js";
import { XFAFactory } from "./xfa/factory.js"; import { XFAFactory } from "./xfa/factory.js";
const DEFAULT_USER_UNIT = 1.0; const DEFAULT_USER_UNIT = 1.0;
@ -104,6 +105,10 @@ class Page {
static createObjId() { static createObjId() {
return `p${pageIndex}_${++idCounters.obj}`; return `p${pageIndex}_${++idCounters.obj}`;
} }
static getPageObjId() {
return `page${ref.toString()}`;
}
}; };
} }
@ -406,6 +411,7 @@ class Page {
handler, handler,
task, task,
normalizeWhitespace, normalizeWhitespace,
includeMarkedContent,
sink, sink,
combineTextItems, combineTextItems,
}) { }) {
@ -437,12 +443,22 @@ class Page {
task, task,
resources: this.resources, resources: this.resources,
normalizeWhitespace, normalizeWhitespace,
includeMarkedContent,
combineTextItems, combineTextItems,
sink, sink,
}); });
}); });
} }
async getStructTree() {
const structTreeRoot = await this.pdfManager.ensureCatalog(
"structTreeRoot"
);
const tree = new StructTreePage(structTreeRoot, this.pageDict);
tree.parse();
return tree;
}
getAnnotationsData(intent) { getAnnotationsData(intent) {
return this._parsedAnnotations.then(function (annotations) { return this._parsedAnnotations.then(function (annotations) {
const annotationsData = []; const annotationsData = [];
@ -604,6 +620,10 @@ class PDFDocument {
static createObjId() { static createObjId() {
unreachable("Abstract method `createObjId` called."); unreachable("Abstract method `createObjId` called.");
} }
static getPageObjId() {
unreachable("Abstract method `getPageObjId` called.");
}
}; };
} }

View File

@ -1913,7 +1913,10 @@ class PartialEvaluator {
return; return;
} }
// Other marked content types aren't supported yet. // Other marked content types aren't supported yet.
args = [args[0].name]; args = [
args[0].name,
args[1] instanceof Dict ? args[1].get("MCID") : null,
];
break; break;
case OPS.beginMarkedContent: case OPS.beginMarkedContent:
@ -1973,6 +1976,7 @@ class PartialEvaluator {
stateManager = null, stateManager = null,
normalizeWhitespace = false, normalizeWhitespace = false,
combineTextItems = false, combineTextItems = false,
includeMarkedContent = false,
sink, sink,
seenStyles = new Set(), seenStyles = new Set(),
}) { }) {
@ -2573,6 +2577,7 @@ class PartialEvaluator {
stateManager: xObjStateManager, stateManager: xObjStateManager,
normalizeWhitespace, normalizeWhitespace,
combineTextItems, combineTextItems,
includeMarkedContent,
sink: sinkWrapper, sink: sinkWrapper,
seenStyles, seenStyles,
}) })
@ -2650,6 +2655,38 @@ class PartialEvaluator {
}) })
); );
return; return;
case OPS.beginMarkedContent:
if (includeMarkedContent) {
textContent.items.push({
type: "beginMarkedContent",
tag: isName(args[0]) ? args[0].name : null,
});
}
break;
case OPS.beginMarkedContentProps:
if (includeMarkedContent) {
flushTextContentItem();
let mcid = null;
if (isDict(args[1])) {
mcid = args[1].get("MCID");
}
textContent.items.push({
type: "beginMarkedContentProps",
id: Number.isInteger(mcid)
? `${self.idFactory.getPageObjId()}_mcid${mcid}`
: null,
tag: isName(args[0]) ? args[0].name : null,
});
}
break;
case OPS.endMarkedContent:
if (includeMarkedContent) {
flushTextContentItem();
textContent.items.push({
type: "endMarkedContent",
});
}
break;
} // switch } // switch
if (textContent.items.length >= sink.desiredSize) { if (textContent.items.length >= sink.desiredSize) {
// Wait for ready, if we reach highWaterMark. // Wait for ready, if we reach highWaterMark.

View File

@ -60,6 +60,7 @@ import { CipherTransformFactory } from "./crypto.js";
import { ColorSpace } from "./colorspace.js"; import { ColorSpace } from "./colorspace.js";
import { GlobalImageCache } from "./image_utils.js"; import { GlobalImageCache } from "./image_utils.js";
import { MetadataParser } from "./metadata_parser.js"; import { MetadataParser } from "./metadata_parser.js";
import { StructTreeRoot } from "./struct_tree.js";
function fetchDestination(dest) { function fetchDestination(dest) {
return isDict(dest) ? dest.get("D") : dest; return isDict(dest) ? dest.get("D") : dest;
@ -200,6 +201,32 @@ class Catalog {
return markInfo; return markInfo;
} }
get structTreeRoot() {
let structTree = null;
try {
structTree = this._readStructTreeRoot();
} catch (ex) {
if (ex instanceof MissingDataException) {
throw ex;
}
warn("Unable read to structTreeRoot info.");
}
return shadow(this, "structTreeRoot", structTree);
}
/**
* @private
*/
_readStructTreeRoot() {
const obj = this._catDict.get("StructTreeRoot");
if (!isDict(obj)) {
return null;
}
const root = new StructTreeRoot(obj);
root.init();
return root;
}
get toplevelPagesDict() { get toplevelPagesDict() {
const pagesObj = this._catDict.get("Pages"); const pagesObj = this._catDict.get("Pages");
if (!isDict(pagesObj)) { if (!isDict(pagesObj)) {
@ -2626,4 +2653,4 @@ const ObjectLoader = (function () {
return ObjectLoader; return ObjectLoader;
})(); })();
export { Catalog, FileSpec, ObjectLoader, XRef }; export { Catalog, FileSpec, NumberTree, ObjectLoader, XRef };

335
src/core/struct_tree.js Normal file
View File

@ -0,0 +1,335 @@
/* Copyright 2021 Mozilla Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import { isDict, isName, isRef } from "./primitives.js";
import { isString, stringToPDFString, warn } from "../shared/util.js";
import { NumberTree } from "./obj.js";
// Upper bound on recursion depth, both when walking up the parent chain in
// StructTreePage.addNode and when serializing the tree.
const MAX_DEPTH = 40;
// Kinds of kid a structure element can have, as classified by
// StructElementNode.parseKid:
// - PAGE_CONTENT: an integer kid (a marked content id),
// - STREAM_CONTENT: a dictionary kid whose Type is "MCR",
// - OBJECT: a dictionary kid whose Type is "OBJR",
// - ELEMENT: any other dictionary kid.
const StructElementType = {
PAGE_CONTENT: "PAGE_CONTENT",
STREAM_CONTENT: "STREAM_CONTENT",
OBJECT: "OBJECT",
ELEMENT: "ELEMENT",
};
/**
 * Thin wrapper around the structure tree root dictionary that exposes its
 * role map as a plain `Map` of name -> mapped role name.
 */
class StructTreeRoot {
  /**
   * @param rootDict - the structure tree root dictionary.
   */
  constructor(rootDict) {
    this.dict = rootDict;
    this.roleMap = new Map();
  }

  /** Populates the derived state (currently only the role map). */
  init() {
    this.readRoleMap();
  }

  /**
   * Copies every entry of the "RoleMap" dictionary whose value is a name
   * into `this.roleMap`; a missing or non-dictionary "RoleMap" and any
   * non-name values are ignored.
   */
  readRoleMap() {
    const roles = this.dict.get("RoleMap");
    if (isDict(roles)) {
      roles.forEach((customRole, mappedRole) => {
        if (isName(mappedRole)) {
          this.roleMap.set(customRole, mappedRole.name);
        }
      });
    }
  }
}
/**
 * Rather than loading the whole structure tree, only the elements relevant
 * to one page are loaded; this class is the wrapper that represents one such
 * structure element together with its (classified) kids.
 */
class StructElementNode {
  /**
   * @param tree - the owning page tree; `tree.root.roleMap` and
   *   `tree.pageDict.objId` are read from it.
   * @param dict - the structure element dictionary being wrapped.
   */
  constructor(tree, dict) {
    this.tree = tree;
    this.dict = dict;
    this.kids = [];
    this.parseKids();
  }

  /**
   * The element's "S" name, translated through the root's role map when an
   * entry exists; an empty string when "S" is not a name.
   */
  get role() {
    const structType = this.dict.get("S");
    const rawRole = isName(structType) ? structType.name : "";
    const { roleMap } = this.tree.root;
    return roleMap.has(rawRole) ? roleMap.get(rawRole) : rawRole;
  }

  /**
   * Classifies the "K" entry (a single kid or an array of kids) and stores
   * every kid that `parseKid` accepts in `this.kids`.
   */
  parseKids() {
    const ownPageRef = this.dict.getRaw("Pg");
    const ownPageObjId = isRef(ownPageRef) ? ownPageRef.toString() : null;

    const rawKids = this.dict.get("K");
    const kidList = Array.isArray(rawKids) ? rawKids : [rawKids];
    for (const rawKid of kidList) {
      const parsed = this.parseKid(ownPageObjId, rawKid);
      if (parsed) {
        this.kids.push(parsed);
      }
    }
  }

  /**
   * Turns one raw kid into a `StructElement`.
   *
   * @param pageObjId - page object id inherited from the parent element (or
   *   null); a "Pg" reference on the kid's own dictionary overrides it.
   * @param kid - an integer, a reference, or a dictionary.
   * @returns the classified kid, or null when the kid is of an unsupported
   *   kind or is content that belongs to a different page.
   */
  parseKid(pageObjId, kid) {
    const belongsToThisPage = id => this.tree.pageDict.objId === id;

    // A direct link to content, the integer is an mcid.
    if (Number.isInteger(kid)) {
      return belongsToThisPage(pageObjId)
        ? new StructElement({
            type: StructElementType.PAGE_CONTENT,
            mcid: kid,
            pageObjId,
          })
        : null;
    }

    // Resolve the kid to a dictionary, following a reference if needed.
    let kidDict = null;
    if (isRef(kid)) {
      kidDict = this.dict.xref.fetch(kid);
    } else if (isDict(kid)) {
      kidDict = kid;
    }
    if (!kidDict) {
      return null;
    }

    const kidPageRef = kidDict.getRaw("Pg");
    const kidPageObjId = isRef(kidPageRef) ? kidPageRef.toString() : pageObjId;

    const typeObj = kidDict.get("Type");
    const kidType = isName(typeObj) ? typeObj.name : null;

    switch (kidType) {
      case "MCR": {
        if (!belongsToThisPage(kidPageObjId)) {
          return null;
        }
        const streamRef = kidDict.getRaw("Stm");
        return new StructElement({
          type: StructElementType.STREAM_CONTENT,
          refObjId: isRef(streamRef) ? streamRef.toString() : null,
          pageObjId: kidPageObjId,
          mcid: kidDict.get("MCID"),
        });
      }
      case "OBJR": {
        if (!belongsToThisPage(kidPageObjId)) {
          return null;
        }
        const objectRef = kidDict.getRaw("Obj");
        return new StructElement({
          type: StructElementType.OBJECT,
          refObjId: isRef(objectRef) ? objectRef.toString() : null,
          pageObjId: kidPageObjId,
        });
      }
      default:
        // Any other dictionary is a nested structure element; it is kept
        // regardless of page.
        return new StructElement({
          type: StructElementType.ELEMENT,
          dict: kidDict,
        });
    }
  }
}
/**
 * Plain record describing one kid of a structure element. Every field other
 * than `type` defaults to null; `parentNode` always starts out as null and is
 * filled in later by whoever links the tree together.
 */
class StructElement {
  constructor({
    type,
    dict = null,
    mcid = null,
    pageObjId = null,
    refObjId = null,
  }) {
    Object.assign(this, {
      type,
      dict,
      mcid,
      pageObjId,
      refObjId,
      parentNode: null,
    });
  }
}
/**
 * The part of the document's structure tree that is relevant to one page.
 * `parse()` collects the structure elements referenced from the page, and
 * `serializable` converts the result into plain objects.
 */
class StructTreePage {
/**
 * @param structTreeRoot - the document's StructTreeRoot wrapper; may be
 *   falsy, in which case `parse()` does nothing.
 * @param pageDict - the page dictionary; its "StructParents" entry and its
 *   `objId` are read.
 */
constructor(structTreeRoot, pageDict) {
this.root = structTreeRoot;
this.rootDict = structTreeRoot ? structTreeRoot.dict : null;
this.pageDict = pageDict;
// Top-level nodes, stored at the index they occupy in the root's "K" entry
// (see addTopLevelNode), so this array can contain holes.
this.nodes = [];
}
/**
 * Looks up the page's integer "StructParents" key in the root's "ParentTree"
 * number tree and adds a node for every reference in the resulting array.
 * Returns silently when any of those pieces is missing or of the wrong kind.
 */
parse() {
if (!this.root || !this.rootDict) {
return;
}
const parentTree = this.rootDict.get("ParentTree");
if (!parentTree) {
return;
}
const id = this.pageDict.get("StructParents");
if (!Number.isInteger(id)) {
return;
}
const numberTree = new NumberTree(parentTree, this.rootDict.xref);
const parentArray = numberTree.get(id);
if (!Array.isArray(parentArray)) {
return;
}
// Dictionary -> StructElementNode, shared by all addNode calls so that each
// dictionary is wrapped at most once.
const map = new Map();
for (const ref of parentArray) {
if (isRef(ref)) {
this.addNode(this.rootDict.xref.fetch(ref), map);
}
}
}
/**
 * Wraps `dict` in a StructElementNode and walks up its "P" (parent) chain,
 * linking the new node into its parent's kids.
 *
 * @param dict - structure element dictionary to add.
 * @param map - dictionary -> node cache for the current parse.
 * @param level - current recursion depth (number of parents walked so far).
 * @returns the node for `dict`, or null once MAX_DEPTH has been exceeded.
 */
addNode(dict, map, level = 0) {
if (level > MAX_DEPTH) {
warn("StructTree MAX_DEPTH reached.");
return null;
}
if (map.has(dict)) {
return map.get(dict);
}
const element = new StructElementNode(this, dict);
// Register the node before recursing into the parent so that a dictionary
// reached again further up the chain hits the cache above.
map.set(dict, element);
const parent = dict.get("P");
// No parent, or the parent is the structure tree root: this is a top-level
// node. Forget it again if it could not be placed in `this.nodes`.
if (!parent || isName(parent.get("Type"), "StructTreeRoot")) {
if (!this.addTopLevelNode(dict, element)) {
map.delete(dict);
}
return element;
}
const parentNode = this.addNode(parent, map, level + 1);
if (!parentNode) {
return element;
}
let save = false;
for (const kid of parentNode.kids) {
if (kid.type === StructElementType.ELEMENT && kid.dict === dict) {
// Despite the property name, this attaches the wrapper node for this kid
// to the parent's kid entry; `serializable` follows it downwards.
kid.parentNode = element;
save = true;
}
}
// The parent does not list this dictionary among its kids: drop it from
// the cache.
if (!save) {
map.delete(dict);
}
return element;
}
/**
 * Places `element` in `this.nodes` according to the root's "K" entry.
 *
 * @returns false when "K" is missing, or when "K" is a dictionary or an
 *   array that does not match `dict` by object id; true otherwise. Note that
 *   a "K" that is neither a dictionary nor an array yields true without
 *   storing anything.
 */
addTopLevelNode(dict, element) {
const obj = this.rootDict.get("K");
if (!obj) {
return false;
}
if (isDict(obj)) {
if (obj.objId !== dict.objId) {
return false;
}
this.nodes[0] = element;
return true;
}
if (!Array.isArray(obj)) {
return true;
}
let save = false;
for (let i = 0; i < obj.length; i++) {
const kidRef = obj[i];
// Array entries are compared through their string form against the
// dictionary's object id; the node keeps the entry's index.
if (kidRef && kidRef.toString() === dict.objId) {
this.nodes[i] = element;
save = true;
}
}
return save;
}
/**
 * Convert the tree structure into a simplified object literal that can
 * be sent to the main thread.
 * @returns {Object}
 */
get serializable() {
// Appends the plain-object form of `node` to `parent.children`, recursing
// into element kids; stops (with a warning) beyond MAX_DEPTH levels.
function nodeToSerializable(node, parent, level = 0) {
if (level > MAX_DEPTH) {
warn("StructTree too deep to be fully serialized.");
return;
}
const obj = Object.create(null);
obj.role = node.role;
obj.children = [];
parent.children.push(obj);
const alt = node.dict.get("Alt");
if (isString(alt)) {
obj.alt = stringToPDFString(alt);
}
for (const kid of node.kids) {
// Element kids are only serialized when addNode linked a wrapper node to
// them; unlinked element kids are skipped.
const kidElement =
kid.type === StructElementType.ELEMENT ? kid.parentNode : null;
if (kidElement) {
nodeToSerializable(kidElement, obj, level + 1);
continue;
} else if (
kid.type === StructElementType.PAGE_CONTENT ||
kid.type === StructElementType.STREAM_CONTENT
) {
// Same `page<objId>_mcid<n>` shape as the ids attached to marked content
// items in the text content.
obj.children.push({
type: "content",
id: `page${kid.pageObjId}_mcid${kid.mcid}`,
});
} else if (kid.type === StructElementType.OBJECT) {
obj.children.push({
type: "object",
id: kid.refObjId,
});
}
}
}
const root = Object.create(null);
root.children = [];
root.role = "Root";
for (const child of this.nodes) {
// `this.nodes` is filled by index and may have holes.
if (!child) {
continue;
}
nodeToSerializable(child, root);
}
return root;
}
}
export { StructTreePage, StructTreeRoot };

View File

@ -717,6 +717,7 @@ class WorkerMessageHandler {
task, task,
sink, sink,
normalizeWhitespace: data.normalizeWhitespace, normalizeWhitespace: data.normalizeWhitespace,
includeMarkedContent: data.includeMarkedContent,
combineTextItems: data.combineTextItems, combineTextItems: data.combineTextItems,
}) })
.then( .then(
@ -745,6 +746,18 @@ class WorkerMessageHandler {
}); });
}); });
// Replies to "GetStructTree" with the plain-object form of the requested
// page's structure tree.
handler.on("GetStructTree", function wphGetStructTree(data) {
  const pagePromise = pdfManager.getPage(data.pageIndex);
  const treePromise = pagePromise.then(page =>
    pdfManager.ensure(page, "getStructTree")
  );
  return treePromise.then(structTree => structTree.serializable);
});
handler.on("FontFallback", function (data) { handler.on("FontFallback", function (data) {
return pdfManager.fontFallback(data.id, handler); return pdfManager.fontFallback(data.id, handler);
}); });

View File

@ -1026,13 +1026,17 @@ class PDFDocumentProxy {
* whitespace with standard spaces (0x20). The default value is `false`. * whitespace with standard spaces (0x20). The default value is `false`.
* @property {boolean} disableCombineTextItems - Do not attempt to combine * @property {boolean} disableCombineTextItems - Do not attempt to combine
* same line {@link TextItem}'s. The default value is `false`. * same line {@link TextItem}'s. The default value is `false`.
* @property {boolean} [includeMarkedContent] - When true include marked
* content items in the items array of TextContent. The default is `false`.
*/ */
/** /**
* Page text content. * Page text content.
* *
* @typedef {Object} TextContent * @typedef {Object} TextContent
* @property {Array<TextItem>} items - Array of {@link TextItem} objects. * @property {Array<TextItem | TextMarkedContent>} items - Array of
* {@link TextItem} and {@link TextMarkedContent} objects. TextMarkedContent
* items are included when includeMarkedContent is true.
* @property {Object<string, TextStyle>} styles - {@link TextStyle} objects, * @property {Object<string, TextStyle>} styles - {@link TextStyle} objects,
* indexed by font name. * indexed by font name.
*/ */
@ -1047,6 +1051,17 @@ class PDFDocumentProxy {
* @property {number} width - Width in device space. * @property {number} width - Width in device space.
* @property {number} height - Height in device space. * @property {number} height - Height in device space.
* @property {string} fontName - Font name used by PDF.js for converted font. * @property {string} fontName - Font name used by PDF.js for converted font.
*
*/
/**
* Page text marked content part.
*
* @typedef {Object} TextMarkedContent
* @property {string} type - Either 'beginMarkedContent',
* 'beginMarkedContentProps', or 'endMarkedContent'.
* @property {string} id - The marked content identifier. Only used for type
* 'beginMarkedContentProps'.
*/ */
/** /**
@ -1103,6 +1118,25 @@ class PDFDocumentProxy {
* states set. * states set.
*/ */
/**
* Structure tree node. The root node will have a role "Root".
*
* @typedef {Object} StructTreeNode
* @property {Array<StructTreeNode | StructTreeContent>} children - Array of
* {@link StructTreeNode} and {@link StructTreeContent} objects.
* @property {string} role - element's role, already mapped if a role map exists
* in the PDF.
*/
/**
* Structure tree content.
*
* @typedef {Object} StructTreeContent
* @property {string} type - either "content" for page and stream structure
* elements or "object" for object references.
* @property {string} id - unique id that will map to the text layer.
*/
/** /**
* PDF page operator list. * PDF page operator list.
* *
@ -1435,6 +1469,7 @@ class PDFPageProxy {
streamTextContent({ streamTextContent({
normalizeWhitespace = false, normalizeWhitespace = false,
disableCombineTextItems = false, disableCombineTextItems = false,
includeMarkedContent = false,
} = {}) { } = {}) {
const TEXT_CONTENT_CHUNK_SIZE = 100; const TEXT_CONTENT_CHUNK_SIZE = 100;
@ -1444,6 +1479,7 @@ class PDFPageProxy {
pageIndex: this._pageIndex, pageIndex: this._pageIndex,
normalizeWhitespace: normalizeWhitespace === true, normalizeWhitespace: normalizeWhitespace === true,
combineTextItems: disableCombineTextItems !== true, combineTextItems: disableCombineTextItems !== true,
includeMarkedContent: includeMarkedContent === true,
}, },
{ {
highWaterMark: TEXT_CONTENT_CHUNK_SIZE, highWaterMark: TEXT_CONTENT_CHUNK_SIZE,
@ -1484,6 +1520,16 @@ class PDFPageProxy {
}); });
} }
/**
* @returns {Promise<StructTreeNode>} A promise that is resolved with a
* {@link StructTreeNode} object that represents the page's structure tree.
*/
getStructTree() {
return (this._structTreePromise ||= this._transport.getStructTree(
this._pageIndex
));
}
/** /**
* Destroys the page object. * Destroys the page object.
* @private * @private
@ -1513,6 +1559,7 @@ class PDFPageProxy {
this._annotationsPromise = null; this._annotationsPromise = null;
this._jsActionsPromise = null; this._jsActionsPromise = null;
this._xfaPromise = null; this._xfaPromise = null;
this._structTreePromise = null;
this.pendingCleanup = false; this.pendingCleanup = false;
return Promise.all(waitOn); return Promise.all(waitOn);
} }
@ -1548,6 +1595,7 @@ class PDFPageProxy {
this._annotationsPromise = null; this._annotationsPromise = null;
this._jsActionsPromise = null; this._jsActionsPromise = null;
this._xfaPromise = null; this._xfaPromise = null;
this._structTreePromise = null;
if (resetStats && this._stats) { if (resetStats && this._stats) {
this._stats = new StatTimer(); this._stats = new StatTimer();
} }
@ -2773,6 +2821,12 @@ class WorkerTransport {
}); });
} }
getStructTree(pageIndex) {
return this.messageHandler.sendWithPromise("GetStructTree", {
pageIndex,
});
}
getOutline() { getOutline() {
return this.messageHandler.sendWithPromise("GetOutline", null); return this.messageHandler.sendWithPromise("GetOutline", null);
} }

View File

@ -638,6 +638,23 @@ const renderTextLayer = (function renderTextLayerClosure() {
_processItems(items, styleCache) { _processItems(items, styleCache) {
for (let i = 0, len = items.length; i < len; i++) { for (let i = 0, len = items.length; i < len; i++) {
if (items[i].str === undefined) {
if (
items[i].type === "beginMarkedContentProps" ||
items[i].type === "beginMarkedContent"
) {
const parent = this._container;
this._container = document.createElement("span");
this._container.classList.add("markedContent");
if (items[i].id !== null) {
this._container.setAttribute("id", `${items[i].id}`);
}
parent.appendChild(this._container);
} else if (items[i].type === "endMarkedContent") {
this._container = this._container.parentNode;
}
continue;
}
this._textContentItemsStr.push(items[i].str); this._textContentItemsStr.push(items[i].str);
appendText(this, items[i], styleCache, this._layoutTextCtx); appendText(this, items[i], styleCache, this._layoutTextCtx);
} }

View File

@ -572,6 +572,7 @@ var Driver = (function DriverClosure() {
initPromise = page initPromise = page
.getTextContent({ .getTextContent({
normalizeWhitespace: true, normalizeWhitespace: true,
includeMarkedContent: true,
}) })
.then(function (textContent) { .then(function (textContent) {
return rasterizeTextLayer( return rasterizeTextLayer(

View File

@ -24,7 +24,11 @@ async function runTests(results) {
jasmine.loadConfig({ jasmine.loadConfig({
random: false, random: false,
spec_dir: "integration", spec_dir: "integration",
spec_files: ["scripting_spec.js", "annotation_spec.js"], spec_files: [
"scripting_spec.js",
"annotation_spec.js",
"accessibility_spec.js",
],
}); });
jasmine.addReporter({ jasmine.addReporter({

View File

@ -0,0 +1,69 @@
/* Copyright 2021 Mozilla Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
const { closePages, loadAndWait } = require("./test_utils.js");
// Integration test: loads structure_simple.pdf in every configured browser
// and checks that the generated ".structTree" DOM points (through aria-owns)
// at the matching text layer elements.
describe("accessibility", () => {
describe("structure tree", () => {
let pages;
beforeAll(async () => {
pages = await loadAndWait("structure_simple.pdf", ".structTree");
});
afterAll(async () => {
await closePages(pages);
});
it("must build structure that maps to text layer", async () => {
await Promise.all(
pages.map(async ([browserName, page]) => {
await page.waitForSelector(".structTree");
// Check the headings match up.
// NOTE(review): the callbacks given to $eval/$$eval presumably execute in
// the browser page, which is why each one repeats the aria-owns lookup
// inline instead of sharing a helper - confirm before refactoring.
const head1 = await page.$eval(
".structTree [role='heading'][aria-level='1'] span",
el =>
document.getElementById(el.getAttribute("aria-owns")).textContent
);
expect(head1).withContext(`In ${browserName}`).toEqual("Heading 1");
const head2 = await page.$eval(
".structTree [role='heading'][aria-level='2'] span",
el =>
document.getElementById(el.getAttribute("aria-owns")).textContent
);
expect(head2).withContext(`In ${browserName}`).toEqual("Heading 2");
// Check the order of the content.
// Every element carrying aria-owns is resolved to the text of the element
// it refers to; the resulting list must match the expected reading order.
const texts = await page.$$eval(".structTree [aria-owns]", nodes =>
nodes.map(
el =>
document.getElementById(el.getAttribute("aria-owns"))
.textContent
)
);
expect(texts)
.withContext(`In ${browserName}`)
.toEqual([
"Heading 1",
"This paragraph 1.",
"Heading 2",
"This paragraph 2.",
]);
})
);
});
});
});

View File

@ -71,6 +71,7 @@
!issue8570.pdf !issue8570.pdf
!issue8697.pdf !issue8697.pdf
!issue8702.pdf !issue8702.pdf
!structure_simple.pdf
!issue12823.pdf !issue12823.pdf
!issue8707.pdf !issue8707.pdf
!issue8798r.pdf !issue8798r.pdf

Binary file not shown.

View File

@ -23,7 +23,7 @@
bottom: 0; bottom: 0;
line-height: 1; line-height: 1;
} }
.textLayer > span { .textLayer span {
position: absolute; position: absolute;
white-space: pre; white-space: pre;
-webkit-transform-origin: 0% 0%; -webkit-transform-origin: 0% 0%;
@ -37,3 +37,8 @@
-moz-box-sizing: border-box; -moz-box-sizing: border-box;
box-sizing: border-box; box-sizing: border-box;
} }
.textLayer .markedContent {
border: none;
background-color: transparent;
}

View File

@ -34,6 +34,7 @@
"pdf_history_spec.js", "pdf_history_spec.js",
"primitives_spec.js", "primitives_spec.js",
"stream_spec.js", "stream_spec.js",
"struct_tree_spec.js",
"type1_parser_spec.js", "type1_parser_spec.js",
"ui_utils_spec.js", "ui_utils_spec.js",
"unicode_spec.js", "unicode_spec.js",

View File

@ -80,6 +80,7 @@ async function initializePDFJS(callback) {
"pdfjs-test/unit/primitives_spec.js", "pdfjs-test/unit/primitives_spec.js",
"pdfjs-test/unit/scripting_spec.js", "pdfjs-test/unit/scripting_spec.js",
"pdfjs-test/unit/stream_spec.js", "pdfjs-test/unit/stream_spec.js",
"pdfjs-test/unit/struct_tree_spec.js",
"pdfjs-test/unit/type1_parser_spec.js", "pdfjs-test/unit/type1_parser_spec.js",
"pdfjs-test/unit/ui_utils_spec.js", "pdfjs-test/unit/ui_utils_spec.js",
"pdfjs-test/unit/unicode_spec.js", "pdfjs-test/unit/unicode_spec.js",

View File

@ -0,0 +1,108 @@
/* Copyright 2021 Mozilla Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import { buildGetDocumentParams } from "./test_utils.js";
import { getDocument } from "../../src/display/api.js";
/**
 * Asserts, through Jasmine's `expect`, that two structure trees have the same
 * shape: matching `role` and `type` on every node, `children` present on both
 * or neither, and the same number of children at every level.
 *
 * @param {Object} rootA - first tree (typically the expected one).
 * @param {Object} rootB - second tree (typically the actual one).
 */
function equalTrees(rootA, rootB) {
  function walk(a, b) {
    expect(a.role).toEqual(b.role);
    expect(a.type).toEqual(b.type);
    expect("children" in a).toEqual("children" in b);
    if (!a.children || !b.children) {
      return;
    }
    expect(a.children.length).toEqual(b.children.length);
    // Iterate over the children of the nodes being compared (not the root's),
    // and only over positions both sides have: a length mismatch has already
    // been reported above and must not turn into a TypeError here.
    const sharedLength = Math.min(a.children.length, b.children.length);
    for (let i = 0; i < sharedLength; i++) {
      walk(a.children[i], b.children[i]);
    }
  }
  return walk(rootA, rootB);
}
describe("struct tree", function () {
  /**
   * Opens the given test PDF and fetches the structure tree of its first
   * page. The loading task is returned as well so the caller can destroy it
   * once the assertions have run.
   */
  async function loadFirstPageStructTree(filename) {
    const loadingTask = getDocument(buildGetDocumentParams(filename));
    const doc = await loadingTask.promise;
    const page = await doc.getPage(1);
    const struct = await page.getStructTree();
    return { loadingTask, struct };
  }

  // A block-level element (H1, P, ...) holding one NonStruct wrapper around
  // a single content leaf.
  const wrappedContent = role => ({
    role,
    children: [{ role: "NonStruct", children: [{ type: "content" }] }],
  });

  describe("getStructTree", function () {
    it("parses basic structure", async function () {
      const { loadingTask, struct } = await loadFirstPageStructTree(
        "structure_simple.pdf"
      );
      const expected = {
        role: "Root",
        children: [
          {
            role: "Document",
            children: [
              wrappedContent("H1"),
              wrappedContent("P"),
              wrappedContent("H2"),
              wrappedContent("P"),
            ],
          },
        ],
      };
      equalTrees(expected, struct);
      await loadingTask.destroy();
    });

    it("parses structure with marked content reference", async function () {
      const { loadingTask, struct } = await loadFirstPageStructTree(
        "issue6782.pdf"
      );
      const paragraph = {
        role: "P",
        children: Array(27).fill({ type: "content" }),
      };
      const expected = {
        role: "Root",
        children: [{ role: "Part", children: [paragraph] }],
      };
      equalTrees(expected, struct);
      await loadingTask.destroy();
    });
  });
});

View File

@ -41,6 +41,7 @@ import { AnnotationLayerBuilder } from "./annotation_layer_builder.js";
import { NullL10n } from "./l10n_utils.js"; import { NullL10n } from "./l10n_utils.js";
import { PDFPageView } from "./pdf_page_view.js"; import { PDFPageView } from "./pdf_page_view.js";
import { SimpleLinkService } from "./pdf_link_service.js"; import { SimpleLinkService } from "./pdf_link_service.js";
import { StructTreeLayerBuilder } from "./struct_tree_layer_builder.js";
import { TextLayerBuilder } from "./text_layer_builder.js"; import { TextLayerBuilder } from "./text_layer_builder.js";
import { XfaLayerBuilder } from "./xfa_layer_builder.js"; import { XfaLayerBuilder } from "./xfa_layer_builder.js";
@ -545,6 +546,7 @@ class BaseViewer {
textLayerMode: this.textLayerMode, textLayerMode: this.textLayerMode,
annotationLayerFactory: this, annotationLayerFactory: this,
xfaLayerFactory, xfaLayerFactory,
structTreeLayerFactory: this,
imageResourcesPath: this.imageResourcesPath, imageResourcesPath: this.imageResourcesPath,
renderInteractiveForms: this.renderInteractiveForms, renderInteractiveForms: this.renderInteractiveForms,
renderer: this.renderer, renderer: this.renderer,
@ -1328,6 +1330,16 @@ class BaseViewer {
}); });
} }
/**
* @param {PDFPage} pdfPage
* @returns {StructTreeLayerBuilder}
*/
createStructTreeLayerBuilder(pdfPage) {
return new StructTreeLayerBuilder({
pdfPage,
});
}
/** /**
* @type {boolean} Whether all pages of the PDF document have identical * @type {boolean} Whether all pages of the PDF document have identical
* widths and heights. * widths and heights.

View File

@ -216,6 +216,17 @@ class IPDFXfaLayerFactory {
createXfaLayerBuilder(pageDiv, pdfPage) {} createXfaLayerBuilder(pageDiv, pdfPage) {}
} }
/**
* @interface
*/
class IPDFStructTreeLayerFactory {
/**
* @param {PDFPage} pdfPage
* @returns {StructTreeLayerBuilder}
*/
createStructTreeLayerBuilder(pdfPage) {}
}
/** /**
* @interface * @interface
*/ */
@ -254,6 +265,7 @@ export {
IPDFAnnotationLayerFactory, IPDFAnnotationLayerFactory,
IPDFHistory, IPDFHistory,
IPDFLinkService, IPDFLinkService,
IPDFStructTreeLayerFactory,
IPDFTextLayerFactory, IPDFTextLayerFactory,
IPDFXfaLayerFactory, IPDFXfaLayerFactory,
IRenderableView, IRenderableView,

View File

@ -49,6 +49,7 @@ import { viewerCompatibilityParams } from "./viewer_compatibility.js";
* The default value is `TextLayerMode.ENABLE`. * The default value is `TextLayerMode.ENABLE`.
* @property {IPDFAnnotationLayerFactory} annotationLayerFactory * @property {IPDFAnnotationLayerFactory} annotationLayerFactory
* @property {IPDFXfaLayerFactory} xfaLayerFactory * @property {IPDFXfaLayerFactory} xfaLayerFactory
* @property {IPDFStructTreeLayerFactory} structTreeLayerFactory
* @property {string} [imageResourcesPath] - Path for image resources, mainly * @property {string} [imageResourcesPath] - Path for image resources, mainly
* for annotation icons. Include trailing slash. * for annotation icons. Include trailing slash.
* @property {boolean} renderInteractiveForms - Turns on rendering of * @property {boolean} renderInteractiveForms - Turns on rendering of
@ -102,6 +103,7 @@ class PDFPageView {
this.textLayerFactory = options.textLayerFactory; this.textLayerFactory = options.textLayerFactory;
this.annotationLayerFactory = options.annotationLayerFactory; this.annotationLayerFactory = options.annotationLayerFactory;
this.xfaLayerFactory = options.xfaLayerFactory; this.xfaLayerFactory = options.xfaLayerFactory;
this.structTreeLayerFactory = options.structTreeLayerFactory;
this.renderer = options.renderer || RendererType.CANVAS; this.renderer = options.renderer || RendererType.CANVAS;
this.enableWebGL = options.enableWebGL || false; this.enableWebGL = options.enableWebGL || false;
this.l10n = options.l10n || NullL10n; this.l10n = options.l10n || NullL10n;
@ -116,6 +118,7 @@ class PDFPageView {
this.textLayer = null; this.textLayer = null;
this.zoomLayer = null; this.zoomLayer = null;
this.xfaLayer = null; this.xfaLayer = null;
this.structTreeLayer = null;
const div = document.createElement("div"); const div = document.createElement("div");
div.className = "page"; div.className = "page";
@ -354,6 +357,10 @@ class PDFPageView {
this.annotationLayer.cancel(); this.annotationLayer.cancel();
this.annotationLayer = null; this.annotationLayer = null;
} }
if (this._onTextLayerRendered) {
this.eventBus._off("textlayerrendered", this._onTextLayerRendered);
this._onTextLayerRendered = null;
}
} }
cssTransform(target, redrawAnnotations = false) { cssTransform(target, redrawAnnotations = false) {
@ -556,11 +563,12 @@ class PDFPageView {
this.paintTask = paintTask; this.paintTask = paintTask;
const resultPromise = paintTask.promise.then( const resultPromise = paintTask.promise.then(
function () { () => {
return finishPaintTask(null).then(function () { return finishPaintTask(null).then(() => {
if (textLayer) { if (textLayer) {
const readableStream = pdfPage.streamTextContent({ const readableStream = pdfPage.streamTextContent({
normalizeWhitespace: true, normalizeWhitespace: true,
includeMarkedContent: true,
}); });
textLayer.setTextContentStream(readableStream); textLayer.setTextContentStream(readableStream);
textLayer.render(); textLayer.render();
@ -599,6 +607,29 @@ class PDFPageView {
this._renderXfaLayer(); this._renderXfaLayer();
} }
// The structure tree is currently only supported when the text layer is
// enabled and a canvas is used for rendering.
if (this.structTreeLayerFactory && this.textLayer && this.canvas) {
  // The structure tree must be generated after the text layer for the
  // aria-owns to work.
  this._onTextLayerRendered = event => {
    // The event is dispatched for every page; only react to our own.
    if (event.pageNumber !== this.id) {
      return;
    }
    // One-shot listener: unregister before starting the async work.
    this.eventBus._off("textlayerrendered", this._onTextLayerRendered);
    this._onTextLayerRendered = null;
    this.pdfPage.getStructTree().then(tree => {
      // The tree arrives asynchronously; if the canvas or the layer builder
      // is no longer there by then, there is nothing to attach the tree to.
      // NOTE(review): assumes the page view can drop its canvas while this
      // promise is pending (e.g. on reset) - confirm against the rest of
      // the class.
      if (!this.canvas || !this.structTreeLayer) {
        return;
      }
      const treeDom = this.structTreeLayer.render(tree);
      treeDom.classList.add("structTree");
      this.canvas.appendChild(treeDom);
    });
  };
  this.eventBus._on("textlayerrendered", this._onTextLayerRendered);
  this.structTreeLayer = this.structTreeLayerFactory.createStructTreeLayerBuilder(
    pdfPage
  );
}
div.setAttribute("data-loaded", true); div.setAttribute("data-loaded", true);
this.eventBus.dispatch("pagerender", { this.eventBus.dispatch("pagerender", {

View File

@ -0,0 +1,149 @@
/* Copyright 2021 Mozilla Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Lookup table from PDF structure types (the `role` of a structure tree node)
// to the ARIA role written on the generated element. A `null` entry means the
// structure type is recognised but no `role` attribute is emitted for it.
const PDF_ROLE_TO_HTML_ROLE = {
// Document level structure types
Document: null, // There's a "document" role, but it doesn't make sense here.
DocumentFragment: null,
// Grouping level structure types
Part: "group",
Sect: "group", // XXX: There's a "section" role, but it's abstract.
Div: "group",
Aside: "note",
NonStruct: "none",
// Block level structure types
P: null,
// Numbered headings (H<n>, e.g. H1, H2) are not listed here; they are
// recognised through HEADING_PATTERN instead.
H: "heading",
Title: null,
FENote: "note",
// Sub-block level structure type
Sub: "group",
// General inline level structure types
Lbl: null,
Span: null,
Em: null,
Strong: null,
Link: "link",
Annot: "note",
Form: "form",
// Ruby and Warichu structure types
Ruby: null,
RB: null,
RT: null,
RP: null,
Warichu: null,
WT: null,
WP: null,
// List standard structure types
L: "list",
LI: "listitem",
LBody: null,
// Table standard structure types
Table: "table",
TR: "row",
TH: "columnheader",
TD: "cell",
THead: "columnheader",
TBody: null,
TFoot: null,
// Standard structure type Caption
Caption: null,
// Standard structure type Figure
Figure: "figure",
// Standard structure type Formula
Formula: null,
// Standard structure type Artifact
Artifact: null,
};
// Matches a numbered heading structure type ("H" followed by digits only) and
// captures the digits, which are used as the heading's `aria-level`.
const HEADING_PATTERN = /^H(\d+)$/;
/**
* @typedef {Object} StructTreeLayerBuilderOptions
* @property {PDFPage} pdfPage
*/
class StructTreeLayerBuilder {
  /**
   * @param {StructTreeLayerBuilderOptions} options
   */
  constructor({ pdfPage }) {
    this.pdfPage = pdfPage;
  }

  /**
   * Converts a page's structure tree into a tree of `<span>` elements that
   * carry the matching ARIA attributes.
   *
   * @param {Object} structTree - Root node of the structure tree. Nodes may
   *   have `role`, `alt`, `id` and `children` properties.
   * @returns {HTMLElement|null} The root element, or `null` when `structTree`
   *   is falsy.
   */
  render(structTree) {
    return this._walk(structTree);
  }

  /**
   * Copies the accessibility-related properties of a structure node onto an
   * element: `alt` becomes `aria-label` and `id` becomes `aria-owns`.
   *
   * @param {Object} structElement - Structure tree node to read from.
   * @param {HTMLElement} htmlElement - Element that receives the attributes.
   */
  _setAttributes(structElement, htmlElement) {
    if (structElement.alt !== undefined) {
      htmlElement.setAttribute("aria-label", structElement.alt);
    }
    if (structElement.id !== undefined) {
      htmlElement.setAttribute("aria-owns", structElement.id);
    }
  }

  /**
   * Recursively builds the element for `node` and its descendants.
   *
   * @param {Object} node - Structure tree node.
   * @returns {HTMLElement|null} The generated element, or `null` for a falsy
   *   node.
   */
  _walk(node) {
    if (!node) {
      return null;
    }

    const element = document.createElement("span");
    if ("role" in node) {
      const { role } = node;
      const match = role.match(HEADING_PATTERN);
      if (match) {
        element.setAttribute("role", "heading");
        element.setAttribute("aria-level", match[1]);
      } else {
        // The role name originates from the PDF file, so it may collide with
        // properties inherited from Object.prototype (e.g. "constructor").
        // Only genuine string entries of the table are valid ARIA roles.
        const htmlRole = PDF_ROLE_TO_HTML_ROLE[role];
        if (typeof htmlRole === "string") {
          element.setAttribute("role", htmlRole);
        }
      }
    }

    this._setAttributes(node, element);

    const { children } = node;
    if (children) {
      const onlyKid = children.length === 1 ? children[0] : null;
      if (onlyKid && "id" in onlyKid) {
        // Often there is only one content node so just set the values on the
        // parent node to avoid creating an extra span.
        this._setAttributes(onlyKid, element);
      } else {
        for (const kid of children) {
          // `_walk` yields null for missing kids; appending null would throw.
          const kidElement = this._walk(kid);
          if (kidElement) {
            element.appendChild(kidElement);
          }
        }
      }
    }
    return element;
  }
}
/**
 * Default factory producing the structure tree layer builder of a page.
 *
 * @implements IPDFStructTreeLayerFactory
 */
class DefaultStructTreeLayerFactory {
  /**
   * Creates a builder bound to the given page.
   *
   * @param {PDFPage} pdfPage
   * @returns {StructTreeLayerBuilder}
   */
  createStructTreeLayerBuilder(pdfPage) {
    const builderOptions = { pdfPage };
    return new StructTreeLayerBuilder(builderOptions);
  }
}
export { DefaultStructTreeLayerFactory, StructTreeLayerBuilder };

View File

@ -24,7 +24,7 @@
line-height: 1; line-height: 1;
} }
.textLayer > span { .textLayer span {
color: transparent; color: transparent;
position: absolute; position: absolute;
white-space: pre; white-space: pre;

View File

@ -175,7 +175,7 @@ select {
display: none !important; display: none !important;
} }
.pdfViewer.enablePermissions .textLayer > span { .pdfViewer.enablePermissions .textLayer span {
user-select: none !important; user-select: none !important;
cursor: not-allowed; cursor: not-allowed;
} }
@ -195,12 +195,12 @@ select {
display: none; display: none;
} }
.pdfPresentationMode:fullscreen .textLayer > span { .pdfPresentationMode:fullscreen .textLayer span {
cursor: none; cursor: none;
} }
.pdfPresentationMode.pdfPresentationModeControls > *, .pdfPresentationMode.pdfPresentationModeControls > *,
.pdfPresentationMode.pdfPresentationModeControls .textLayer > span { .pdfPresentationMode.pdfPresentationModeControls .textLayer span {
cursor: default; cursor: default;
} }
@ -1653,19 +1653,19 @@ html[dir="rtl"] #documentPropertiesOverlay .row > * {
mix-blend-mode: screen; mix-blend-mode: screen;
} }
#viewer.textLayer-visible .textLayer > span { #viewer.textLayer-visible .textLayer span {
background-color: rgba(255, 255, 0, 0.1); background-color: rgba(255, 255, 0, 0.1);
color: rgba(0, 0, 0, 1); color: rgba(0, 0, 0, 1);
border: solid 1px rgba(255, 0, 0, 0.5); border: solid 1px rgba(255, 0, 0, 0.5);
box-sizing: border-box; box-sizing: border-box;
} }
#viewer.textLayer-hover .textLayer > span:hover { #viewer.textLayer-hover .textLayer span:hover {
background-color: rgba(255, 255, 255, 1); background-color: rgba(255, 255, 255, 1);
color: rgba(0, 0, 0, 1); color: rgba(0, 0, 0, 1);
} }
#viewer.textLayer-shadow .textLayer > span { #viewer.textLayer-shadow .textLayer span {
background-color: rgba(255, 255, 255, 0.6); background-color: rgba(255, 255, 255, 0.6);
color: rgba(0, 0, 0, 1); color: rgba(0, 0, 0, 1);
} }