[api-minor] Remove the disableCombineTextItems
option
*Please note:* This parameter has never been used within the PDF.js library/viewer itself, and it was only ever added for backwards compatibility reasons. This parameter was added in PR 7475, over six years ago, to try and optionally maintain the previous *default* text-extraction behaviour. However as part of the general text-extraction improvements in PR 13257, almost two years ago, the `disableCombineTextItems` functionality was accidentally "broken" in various ways. Note how the only (very basic) unit-test was updated in a way that doesn't really make sense, since generally speaking you'd expect that using the option should result in *more* (or at least the same number of) text-items. Furthermore there's also the recent issue 16209, where the option causes almost all textContent to be concatenated together. Hence this patch proposes that we simply remove the `disableCombineTextItems` option since it's essentially unused/untested functionality, as evident from the fact that it took almost two years for someone to notice that it's broken.
This commit is contained in:
parent
09da8026b6
commit
5063a6f2a9
@ -1010,7 +1010,6 @@ class Annotation {
|
|||||||
task,
|
task,
|
||||||
resources,
|
resources,
|
||||||
includeMarkedContent: true,
|
includeMarkedContent: true,
|
||||||
combineTextItems: true,
|
|
||||||
sink,
|
sink,
|
||||||
viewBox,
|
viewBox,
|
||||||
});
|
});
|
||||||
|
@ -511,13 +511,7 @@ class Page {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
extractTextContent({
|
extractTextContent({ handler, task, includeMarkedContent, sink }) {
|
||||||
handler,
|
|
||||||
task,
|
|
||||||
includeMarkedContent,
|
|
||||||
sink,
|
|
||||||
combineTextItems,
|
|
||||||
}) {
|
|
||||||
const contentStreamPromise = this.getContentStream();
|
const contentStreamPromise = this.getContentStream();
|
||||||
const resourcesPromise = this.loadResources([
|
const resourcesPromise = this.loadResources([
|
||||||
"ExtGState",
|
"ExtGState",
|
||||||
@ -545,7 +539,6 @@ class Page {
|
|||||||
task,
|
task,
|
||||||
resources: this.resources,
|
resources: this.resources,
|
||||||
includeMarkedContent,
|
includeMarkedContent,
|
||||||
combineTextItems,
|
|
||||||
sink,
|
sink,
|
||||||
viewBox: this.view,
|
viewBox: this.view,
|
||||||
});
|
});
|
||||||
|
@ -2236,7 +2236,6 @@ class PartialEvaluator {
|
|||||||
task,
|
task,
|
||||||
resources,
|
resources,
|
||||||
stateManager = null,
|
stateManager = null,
|
||||||
combineTextItems = false,
|
|
||||||
includeMarkedContent = false,
|
includeMarkedContent = false,
|
||||||
sink,
|
sink,
|
||||||
seenStyles = new Set(),
|
seenStyles = new Set(),
|
||||||
@ -2534,11 +2533,7 @@ class PartialEvaluator {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (
|
if (!textState.font || !textContentItem.prevTransform) {
|
||||||
!combineTextItems ||
|
|
||||||
!textState.font ||
|
|
||||||
!textContentItem.prevTransform
|
|
||||||
) {
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -3191,7 +3186,6 @@ class PartialEvaluator {
|
|||||||
task,
|
task,
|
||||||
resources: xobj.dict.get("Resources") || resources,
|
resources: xobj.dict.get("Resources") || resources,
|
||||||
stateManager: xObjStateManager,
|
stateManager: xObjStateManager,
|
||||||
combineTextItems,
|
|
||||||
includeMarkedContent,
|
includeMarkedContent,
|
||||||
sink: sinkWrapper,
|
sink: sinkWrapper,
|
||||||
seenStyles,
|
seenStyles,
|
||||||
|
@ -741,7 +741,7 @@ class WorkerMessageHandler {
|
|||||||
});
|
});
|
||||||
|
|
||||||
handler.on("GetTextContent", function (data, sink) {
|
handler.on("GetTextContent", function (data, sink) {
|
||||||
const pageIndex = data.pageIndex;
|
const { pageIndex, includeMarkedContent } = data;
|
||||||
|
|
||||||
pdfManager.getPage(pageIndex).then(function (page) {
|
pdfManager.getPage(pageIndex).then(function (page) {
|
||||||
const task = new WorkerTask("GetTextContent: page " + pageIndex);
|
const task = new WorkerTask("GetTextContent: page " + pageIndex);
|
||||||
@ -755,8 +755,7 @@ class WorkerMessageHandler {
|
|||||||
handler,
|
handler,
|
||||||
task,
|
task,
|
||||||
sink,
|
sink,
|
||||||
includeMarkedContent: data.includeMarkedContent,
|
includeMarkedContent,
|
||||||
combineTextItems: data.combineTextItems,
|
|
||||||
})
|
})
|
||||||
.then(
|
.then(
|
||||||
function () {
|
function () {
|
||||||
|
@ -1120,8 +1120,6 @@ class PDFDocumentProxy {
|
|||||||
* Page getTextContent parameters.
|
* Page getTextContent parameters.
|
||||||
*
|
*
|
||||||
* @typedef {Object} getTextContentParameters
|
* @typedef {Object} getTextContentParameters
|
||||||
* @property {boolean} disableCombineTextItems - Do not attempt to combine
|
|
||||||
* same line {@link TextItem}'s. The default value is `false`.
|
|
||||||
* @property {boolean} [includeMarkedContent] - When true include marked
|
* @property {boolean} [includeMarkedContent] - When true include marked
|
||||||
* content items in the items array of TextContent. The default is `false`.
|
* content items in the items array of TextContent. The default is `false`.
|
||||||
*/
|
*/
|
||||||
@ -1602,17 +1600,13 @@ class PDFPageProxy {
|
|||||||
* @param {getTextContentParameters} params - getTextContent parameters.
|
* @param {getTextContentParameters} params - getTextContent parameters.
|
||||||
* @returns {ReadableStream} Stream for reading text content chunks.
|
* @returns {ReadableStream} Stream for reading text content chunks.
|
||||||
*/
|
*/
|
||||||
streamTextContent({
|
streamTextContent({ includeMarkedContent = false } = {}) {
|
||||||
disableCombineTextItems = false,
|
|
||||||
includeMarkedContent = false,
|
|
||||||
} = {}) {
|
|
||||||
const TEXT_CONTENT_CHUNK_SIZE = 100;
|
const TEXT_CONTENT_CHUNK_SIZE = 100;
|
||||||
|
|
||||||
return this._transport.messageHandler.sendWithStream(
|
return this._transport.messageHandler.sendWithStream(
|
||||||
"GetTextContent",
|
"GetTextContent",
|
||||||
{
|
{
|
||||||
pageIndex: this._pageIndex,
|
pageIndex: this._pageIndex,
|
||||||
combineTextItems: disableCombineTextItems !== true,
|
|
||||||
includeMarkedContent: includeMarkedContent === true,
|
includeMarkedContent: includeMarkedContent === true,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -21,6 +21,7 @@ import {
|
|||||||
ImageKind,
|
ImageKind,
|
||||||
InvalidPDFException,
|
InvalidPDFException,
|
||||||
MissingPDFException,
|
MissingPDFException,
|
||||||
|
objectSize,
|
||||||
OPS,
|
OPS,
|
||||||
PasswordException,
|
PasswordException,
|
||||||
PasswordResponses,
|
PasswordResponses,
|
||||||
@ -2321,26 +2322,16 @@ describe("api", function () {
|
|||||||
});
|
});
|
||||||
|
|
||||||
it("gets text content", async function () {
|
it("gets text content", async function () {
|
||||||
const defaultPromise = page.getTextContent();
|
const { items, styles } = await page.getTextContent();
|
||||||
const parametersPromise = page.getTextContent({
|
|
||||||
disableCombineTextItems: true,
|
|
||||||
});
|
|
||||||
|
|
||||||
const data = await Promise.all([defaultPromise, parametersPromise]);
|
expect(items.length).toEqual(15);
|
||||||
|
expect(objectSize(styles)).toEqual(5);
|
||||||
|
|
||||||
expect(!!data[0].items).toEqual(true);
|
const text = mergeText(items);
|
||||||
expect(data[0].items.length).toEqual(15);
|
expect(text).toEqual(`Table Of Content
|
||||||
expect(!!data[0].styles).toEqual(true);
|
|
||||||
|
|
||||||
const page1 = mergeText(data[0].items);
|
|
||||||
expect(page1).toEqual(`Table Of Content
|
|
||||||
Chapter 1 .......................................................... 2
|
Chapter 1 .......................................................... 2
|
||||||
Paragraph 1.1 ...................................................... 3
|
Paragraph 1.1 ...................................................... 3
|
||||||
page 1 / 3`);
|
page 1 / 3`);
|
||||||
|
|
||||||
expect(!!data[1].items).toEqual(true);
|
|
||||||
expect(data[1].items.length).toEqual(6);
|
|
||||||
expect(!!data[1].styles).toEqual(true);
|
|
||||||
});
|
});
|
||||||
|
|
||||||
it("gets text content, with correct properties (issue 8276)", async function () {
|
it("gets text content, with correct properties (issue 8276)", async function () {
|
||||||
|
Loading…
Reference in New Issue
Block a user