[api-minor] Remove the disableCombineTextItems option
				
					
				
			*Please note:* This parameter has never been used within the PDF.js library/viewer itself, and it was only ever added for backwards compatibility reasons. This parameter was added in PR 7475, over six years ago, to try and optionally maintain the previous *default* text-extraction behaviour. However as part of the general text-extraction improvements in PR 13257, almost two years ago, the `disableCombineTextItems` functionality was accidentally "broken" in various ways. Note how the only (very basic) unit-test was updated in a way that doesn't really make sense, since generally speaking you'd expect that using the option should result in *more* (or at least the same number of) text-items. Furthermore there's also the recent issue 16209, where the option causes almost all textContent to be concatenated together. Hence this patch proposes that we simply remove the `disableCombineTextItems` option since it's essentially unused/untested functionality, as evident from the fact that it took almost two years for someone to notice that it's broken.
This commit is contained in:
		
							parent
							
								
									09da8026b6
								
							
						
					
					
						commit
						5063a6f2a9
					
				| @ -1010,7 +1010,6 @@ class Annotation { | |||||||
|       task, |       task, | ||||||
|       resources, |       resources, | ||||||
|       includeMarkedContent: true, |       includeMarkedContent: true, | ||||||
|       combineTextItems: true, |  | ||||||
|       sink, |       sink, | ||||||
|       viewBox, |       viewBox, | ||||||
|     }); |     }); | ||||||
|  | |||||||
| @ -511,13 +511,7 @@ class Page { | |||||||
|     }); |     }); | ||||||
|   } |   } | ||||||
| 
 | 
 | ||||||
|   extractTextContent({ |   extractTextContent({ handler, task, includeMarkedContent, sink }) { | ||||||
|     handler, |  | ||||||
|     task, |  | ||||||
|     includeMarkedContent, |  | ||||||
|     sink, |  | ||||||
|     combineTextItems, |  | ||||||
|   }) { |  | ||||||
|     const contentStreamPromise = this.getContentStream(); |     const contentStreamPromise = this.getContentStream(); | ||||||
|     const resourcesPromise = this.loadResources([ |     const resourcesPromise = this.loadResources([ | ||||||
|       "ExtGState", |       "ExtGState", | ||||||
| @ -545,7 +539,6 @@ class Page { | |||||||
|         task, |         task, | ||||||
|         resources: this.resources, |         resources: this.resources, | ||||||
|         includeMarkedContent, |         includeMarkedContent, | ||||||
|         combineTextItems, |  | ||||||
|         sink, |         sink, | ||||||
|         viewBox: this.view, |         viewBox: this.view, | ||||||
|       }); |       }); | ||||||
|  | |||||||
| @ -2236,7 +2236,6 @@ class PartialEvaluator { | |||||||
|     task, |     task, | ||||||
|     resources, |     resources, | ||||||
|     stateManager = null, |     stateManager = null, | ||||||
|     combineTextItems = false, |  | ||||||
|     includeMarkedContent = false, |     includeMarkedContent = false, | ||||||
|     sink, |     sink, | ||||||
|     seenStyles = new Set(), |     seenStyles = new Set(), | ||||||
| @ -2534,11 +2533,7 @@ class PartialEvaluator { | |||||||
|         return false; |         return false; | ||||||
|       } |       } | ||||||
| 
 | 
 | ||||||
|       if ( |       if (!textState.font || !textContentItem.prevTransform) { | ||||||
|         !combineTextItems || |  | ||||||
|         !textState.font || |  | ||||||
|         !textContentItem.prevTransform |  | ||||||
|       ) { |  | ||||||
|         return true; |         return true; | ||||||
|       } |       } | ||||||
| 
 | 
 | ||||||
| @ -3191,7 +3186,6 @@ class PartialEvaluator { | |||||||
|                     task, |                     task, | ||||||
|                     resources: xobj.dict.get("Resources") || resources, |                     resources: xobj.dict.get("Resources") || resources, | ||||||
|                     stateManager: xObjStateManager, |                     stateManager: xObjStateManager, | ||||||
|                     combineTextItems, |  | ||||||
|                     includeMarkedContent, |                     includeMarkedContent, | ||||||
|                     sink: sinkWrapper, |                     sink: sinkWrapper, | ||||||
|                     seenStyles, |                     seenStyles, | ||||||
|  | |||||||
| @ -741,7 +741,7 @@ class WorkerMessageHandler { | |||||||
|     }); |     }); | ||||||
| 
 | 
 | ||||||
|     handler.on("GetTextContent", function (data, sink) { |     handler.on("GetTextContent", function (data, sink) { | ||||||
|       const pageIndex = data.pageIndex; |       const { pageIndex, includeMarkedContent } = data; | ||||||
| 
 | 
 | ||||||
|       pdfManager.getPage(pageIndex).then(function (page) { |       pdfManager.getPage(pageIndex).then(function (page) { | ||||||
|         const task = new WorkerTask("GetTextContent: page " + pageIndex); |         const task = new WorkerTask("GetTextContent: page " + pageIndex); | ||||||
| @ -755,8 +755,7 @@ class WorkerMessageHandler { | |||||||
|             handler, |             handler, | ||||||
|             task, |             task, | ||||||
|             sink, |             sink, | ||||||
|             includeMarkedContent: data.includeMarkedContent, |             includeMarkedContent, | ||||||
|             combineTextItems: data.combineTextItems, |  | ||||||
|           }) |           }) | ||||||
|           .then( |           .then( | ||||||
|             function () { |             function () { | ||||||
|  | |||||||
| @ -1120,8 +1120,6 @@ class PDFDocumentProxy { | |||||||
|  * Page getTextContent parameters. |  * Page getTextContent parameters. | ||||||
|  * |  * | ||||||
|  * @typedef {Object} getTextContentParameters |  * @typedef {Object} getTextContentParameters | ||||||
|  * @property {boolean} disableCombineTextItems - Do not attempt to combine |  | ||||||
|  *   same line {@link TextItem}'s. The default value is `false`. |  | ||||||
|  * @property {boolean} [includeMarkedContent] - When true include marked |  * @property {boolean} [includeMarkedContent] - When true include marked | ||||||
|  *   content items in the items array of TextContent. The default is `false`. |  *   content items in the items array of TextContent. The default is `false`. | ||||||
|  */ |  */ | ||||||
| @ -1602,17 +1600,13 @@ class PDFPageProxy { | |||||||
|    * @param {getTextContentParameters} params - getTextContent parameters. |    * @param {getTextContentParameters} params - getTextContent parameters. | ||||||
|    * @returns {ReadableStream} Stream for reading text content chunks. |    * @returns {ReadableStream} Stream for reading text content chunks. | ||||||
|    */ |    */ | ||||||
|   streamTextContent({ |   streamTextContent({ includeMarkedContent = false } = {}) { | ||||||
|     disableCombineTextItems = false, |  | ||||||
|     includeMarkedContent = false, |  | ||||||
|   } = {}) { |  | ||||||
|     const TEXT_CONTENT_CHUNK_SIZE = 100; |     const TEXT_CONTENT_CHUNK_SIZE = 100; | ||||||
| 
 | 
 | ||||||
|     return this._transport.messageHandler.sendWithStream( |     return this._transport.messageHandler.sendWithStream( | ||||||
|       "GetTextContent", |       "GetTextContent", | ||||||
|       { |       { | ||||||
|         pageIndex: this._pageIndex, |         pageIndex: this._pageIndex, | ||||||
|         combineTextItems: disableCombineTextItems !== true, |  | ||||||
|         includeMarkedContent: includeMarkedContent === true, |         includeMarkedContent: includeMarkedContent === true, | ||||||
|       }, |       }, | ||||||
|       { |       { | ||||||
|  | |||||||
| @ -21,6 +21,7 @@ import { | |||||||
|   ImageKind, |   ImageKind, | ||||||
|   InvalidPDFException, |   InvalidPDFException, | ||||||
|   MissingPDFException, |   MissingPDFException, | ||||||
|  |   objectSize, | ||||||
|   OPS, |   OPS, | ||||||
|   PasswordException, |   PasswordException, | ||||||
|   PasswordResponses, |   PasswordResponses, | ||||||
| @ -2321,26 +2322,16 @@ describe("api", function () { | |||||||
|     }); |     }); | ||||||
| 
 | 
 | ||||||
|     it("gets text content", async function () { |     it("gets text content", async function () { | ||||||
|       const defaultPromise = page.getTextContent(); |       const { items, styles } = await page.getTextContent(); | ||||||
|       const parametersPromise = page.getTextContent({ |  | ||||||
|         disableCombineTextItems: true, |  | ||||||
|       }); |  | ||||||
| 
 | 
 | ||||||
|       const data = await Promise.all([defaultPromise, parametersPromise]); |       expect(items.length).toEqual(15); | ||||||
|  |       expect(objectSize(styles)).toEqual(5); | ||||||
| 
 | 
 | ||||||
|       expect(!!data[0].items).toEqual(true); |       const text = mergeText(items); | ||||||
|       expect(data[0].items.length).toEqual(15); |       expect(text).toEqual(`Table Of Content
 | ||||||
|       expect(!!data[0].styles).toEqual(true); |  | ||||||
| 
 |  | ||||||
|       const page1 = mergeText(data[0].items); |  | ||||||
|       expect(page1).toEqual(`Table Of Content
 |  | ||||||
| Chapter 1 .......................................................... 2 | Chapter 1 .......................................................... 2 | ||||||
| Paragraph 1.1 ...................................................... 3 | Paragraph 1.1 ...................................................... 3 | ||||||
| page 1 / 3`);
 | page 1 / 3`);
 | ||||||
| 
 |  | ||||||
|       expect(!!data[1].items).toEqual(true); |  | ||||||
|       expect(data[1].items.length).toEqual(6); |  | ||||||
|       expect(!!data[1].styles).toEqual(true); |  | ||||||
|     }); |     }); | ||||||
| 
 | 
 | ||||||
|     it("gets text content, with correct properties (issue 8276)", async function () { |     it("gets text content, with correct properties (issue 8276)", async function () { | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user