Merge pull request #16286 from calixteman/copy_all

Add the ability to copy all of the PDF text, regardless of which pages are rendered (bug 1788035)
This commit is contained in:
calixteman 2023-04-15 19:43:13 +02:00 committed by GitHub
commit 92baf14531
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 253 additions and 3 deletions

View File

@ -26,13 +26,13 @@ async function runTests(results) {
random: false,
spec_dir: "integration",
spec_files: [
"scripting_spec.js",
"annotation_spec.js",
"accessibility_spec.js",
"annotation_spec.js",
"copy_paste_spec.js",
"find_spec.js",
"freetext_editor_spec.js",
"ink_editor_spec.js",
"a11y_spec.js",
"scripting_spec.js",
],
});

View File

@ -0,0 +1,120 @@
/* Copyright 2023 Mozilla Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
const { closePages, loadAndWait, mockClipboard } = require("./test_utils.js");
describe("Copy and paste", () => {
  describe("all text", () => {
    // Every one of these phrases must be present in the copied text.
    const expectedPhrases = [
      "Dynamic languages such as JavaScript",
      "This section provides an overview of our system",
      "are represented by function calls. This makes the LIR used by",
      "When compiling loops, we consult the oracle before",
      "Nested Trace Tree Formation",
      "An important detail is that the call to the inner trace",
      "When trace recording is completed, nanojit",
      "SpiderMonkey, like many VMs, needs to preempt the user program",
      "Using similar computations, we find that trace recording takes",
      "specialization algorithm. We also described our trace compiler",
      "dynamic optimization system. In Proceedings of the ACM SIGPLAN",
    ];

    // Press `key` while Control is held, then pause for 500ms.
    const pressWithControl = async (page, key) => {
      await page.keyboard.down("Control");
      await page.keyboard.press(key);
      await page.keyboard.up("Control");
      await page.waitForTimeout(500);
    };

    let pages;

    beforeAll(async () => {
      pages = await loadAndWait("tracemonkey.pdf", ".textLayer");
      await mockClipboard(pages);
    });

    afterAll(async () => {
      await closePages(pages);
    });

    it("must check that we've all the contents", async () => {
      const checkPage = async ([browserName, page]) => {
        // Select everything, then copy it.
        await pressWithControl(page, "a");
        await pressWithControl(page, "c");

        // The viewer container shows a "wait" cursor while the text is being
        // extracted; wait until it is gone before reading the clipboard.
        await page.waitForFunction(
          `document.querySelector('#viewerContainer').style.cursor !== "wait"`
        );

        const text = await page.evaluate(() => navigator.clipboard.readText());
        const context = `In ${browserName}`;

        expect(Boolean(text)).withContext(context).toEqual(true);
        for (const phrase of expectedPhrases) {
          expect(text.includes(phrase)).withContext(context).toEqual(true);
        }
      };

      await Promise.all(pages.map(checkPage));
    });
  });
});

View File

@ -118,3 +118,19 @@ const waitForSelectedEditor = async (page, selector) => {
);
};
exports.waitForSelectedEditor = waitForSelectedEditor;
/**
 * Install an in-memory replacement for `navigator.clipboard` in each page, so
 * that `writeText`/`readText` round-trip through a local variable.
 *
 * @param {Array} pages - `[name, page]` pairs; only `page.evaluate` is used.
 * @returns {Promise<undefined>} Settles once every page has been patched.
 */
const mockClipboard = async pages => {
  // Runs inside the page: defines `navigator.clipboard` as a stub backed by
  // the `stored` variable (initially `null`).
  const installStub = () => {
    let stored = null;
    Object.defineProperty(navigator, "clipboard", {
      value: {
        async writeText(text) {
          stored = text;
          return stored;
        },
        async readText() {
          return stored;
        },
      },
    });
  };

  const pending = pages.map(async ([, page]) => {
    await page.evaluate(installStub);
  });
  await Promise.all(pending);
};
exports.mockClipboard = mockClipboard;

View File

@ -504,6 +504,7 @@ const PDFViewerApplication = {
this.pdfViewer = new PDFViewer({
container,
viewer,
hiddenCopyElement: appConfig.hiddenCopyElement,
eventBus,
renderingQueue: pdfRenderingQueue,
linkService: pdfLinkService,

View File

@ -45,6 +45,15 @@
transform: rotate(270deg) translateX(-100%);
}
/* Zero-sized, non-displayed marker element. The viewer checks whether this
   element is part of the current selection to detect that everything has been
   selected (e.g. with ctrl+A). */
#hiddenCopyElement {
position: absolute;
top: 0;
left: 0;
width: 0;
height: 0;
display: none;
}
.pdfViewer {
/* Define this variable here and not in :root to avoid to reflow all the UI
when scaling (see #15929). */

View File

@ -82,6 +82,8 @@ function isValidAnnotationEditorMode(mode) {
* @typedef {Object} PDFViewerOptions
* @property {HTMLDivElement} container - The container for the viewer element.
* @property {HTMLDivElement} [viewer] - The viewer element.
* @property {HTMLDivElement} [hiddenCopyElement] - The hidden element used to
* check if all is selected.
* @property {EventBus} eventBus - The application event bus.
* @property {IPDFLinkService} linkService - The navigation/linking service.
* @property {IDownloadManager} [downloadManager] - The download manager
@ -205,8 +207,16 @@ class PDFViewer {
#containerTopLeft = null;
#copyCallbackBound = this.#copyCallback.bind(this);
#enablePermissions = false;
#getAllTextInProgress = false;
#hiddenCopyElement = null;
#interruptCopyCondition = false;
#previousContainerHeight = 0;
#resizeObserver = new ResizeObserver(this.#resizeObserverCallback.bind(this));
@ -230,6 +240,7 @@ class PDFViewer {
}
this.container = options.container;
this.viewer = options.viewer || options.container.firstElementChild;
this.#hiddenCopyElement = options.hiddenCopyElement;
if (
typeof PDFJSDev === "undefined" ||
@ -638,6 +649,89 @@ class PDFViewer {
]);
}
async getAllText() {
const texts = [];
const buffer = [];
for (
let pageNum = 1, pagesCount = this.pdfDocument.numPages;
pageNum <= pagesCount;
++pageNum
) {
if (this.#interruptCopyCondition) {
return null;
}
buffer.length = 0;
const page = await this.pdfDocument.getPage(pageNum);
const { items } = await page.getTextContent();
for (const item of items) {
if (item.str) {
buffer.push(item.str);
}
if (item.hasEOL) {
buffer.push("\n");
}
}
texts.push(buffer.join(""));
}
return texts.join("\n");
}
#copyCallback(event) {
const selection = document.getSelection();
const { focusNode, anchorNode } = selection;
if (
anchorNode &&
focusNode &&
selection.containsNode(this.#hiddenCopyElement)
) {
// About the condition above:
// - having non-null anchorNode and focusNode are here to guaranty that
// we have at least a kind of selection.
// - this.#hiddenCopyElement is an invisible element which is impossible
// to select manually (its display is none) but ctrl+A will select all
// including this element so having it in the selection means that all
// has been selected.
// TODO: if all the pages are rendered we don't need to wait for
// getAllText and we could just get text from the Selection object.
if (this.#getAllTextInProgress) {
return;
}
this.#getAllTextInProgress = true;
// Select all the document.
const savedCursor = this.container.style.cursor;
this.container.style.cursor = "wait";
const interruptCopy = ev =>
(this.#interruptCopyCondition = ev.key === "Escape");
window.addEventListener("keydown", interruptCopy);
this.getAllText()
.then(async text => {
if (text !== null) {
await navigator.clipboard.writeText(text);
}
})
.catch(reason => {
console.warn(
`Something goes wrong when extracting the text: ${reason.message}`
);
})
.finally(() => {
this.#getAllTextInProgress = false;
this.#interruptCopyCondition = false;
window.removeEventListener("keydown", interruptCopy);
this.container.style.cursor = savedCursor;
});
event.preventDefault();
event.stopPropagation();
}
}
/**
* @param {PDFDocumentProxy} pdfDocument
*/
@ -805,6 +899,10 @@ class PDFViewer {
this.findController?.setDocument(pdfDocument); // Enable searching.
this._scriptingManager?.setDocument(pdfDocument); // Enable scripting.
if (this.#hiddenCopyElement) {
document.addEventListener("copy", this.#copyCallbackBound);
}
if (this.#annotationEditorUIManager) {
// Ensure that the Editor buttons, in the toolbar, are updated.
this.eventBus.dispatch("annotationeditormodechanged", {
@ -949,6 +1047,8 @@ class PDFViewer {
this.viewer.removeAttribute("lang");
// Reset all PDF document permissions.
this.viewer.classList.remove(ENABLE_PERMISSIONS_CLASS);
document.removeEventListener("copy", this.#copyCallbackBound);
}
#ensurePageViewVisible() {

View File

@ -82,6 +82,7 @@ See https://github.com/adobe-type-tools/cmap-resources
<div id="mainContainer">
<div id="viewerContainer" tabindex="0">
<div id="hiddenCopyElement"></div>
<div id="viewer" class="pdfViewer"></div>
</div>
</div> <!-- mainContainer -->

View File

@ -41,6 +41,7 @@ function getViewerConfiguration() {
appContainer: document.body,
mainContainer,
viewerContainer: document.getElementById("viewer"),
hiddenCopyElement: document.getElementById("hiddenCopyElement"),
toolbar: {
mainContainer,
container: document.getElementById("floatingToolbar"),

View File

@ -385,6 +385,7 @@ See https://github.com/adobe-type-tools/cmap-resources
</div>
<div id="viewerContainer" tabindex="0">
<div id="hiddenCopyElement"></div>
<div id="viewer" class="pdfViewer"></div>
</div>
</div> <!-- mainContainer -->

View File

@ -41,6 +41,7 @@ function getViewerConfiguration() {
appContainer: document.body,
mainContainer: document.getElementById("viewerContainer"),
viewerContainer: document.getElementById("viewer"),
hiddenCopyElement: document.getElementById("hiddenCopyElement"),
toolbar: {
container: document.getElementById("toolbarViewer"),
numPages: document.getElementById("numPages"),