[api-minor] Add a parameter to PDFPageProxy_getTextContent that controls whether PartialEvaluator_getTextContent will attempt to combine same-line text items
				
					
				
			From the discussion in issue 7445, it appears that there are cases where an API consumer would want to get the text content as is, i.e. without same-line text items being combined.
This commit is contained in:
		
							parent
							
								
									9228a04061
								
							
						
					
					
						commit
						f297e4d17c
					
				| @ -265,7 +265,8 @@ var Page = (function PageClosure() { | ||||
|     }, | ||||
| 
 | ||||
|     extractTextContent: function Page_extractTextContent(task, | ||||
|                                                          normalizeWhitespace) { | ||||
|                                                          normalizeWhitespace, | ||||
|                                                          combineTextItems) { | ||||
|       var handler = { | ||||
|         on: function nullHandlerOn() {}, | ||||
|         send: function nullHandlerSend() {} | ||||
| @ -298,7 +299,8 @@ var Page = (function PageClosure() { | ||||
|                                                task, | ||||
|                                                self.resources, | ||||
|                                                /* stateManager = */ null, | ||||
|                                                normalizeWhitespace); | ||||
|                                                normalizeWhitespace, | ||||
|                                                combineTextItems); | ||||
|       }); | ||||
|     }, | ||||
| 
 | ||||
|  | ||||
| @ -1110,7 +1110,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { | ||||
|     getTextContent: | ||||
|         function PartialEvaluator_getTextContent(stream, task, resources, | ||||
|                                                  stateManager, | ||||
|                                                  normalizeWhitespace) { | ||||
|                                                  normalizeWhitespace, | ||||
|                                                  combineTextItems) { | ||||
| 
 | ||||
|       stateManager = (stateManager || new StateManager(new TextState())); | ||||
| 
 | ||||
| @ -1421,7 +1422,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { | ||||
|               var isSameTextLine = !textState.font ? false : | ||||
|                 ((textState.font.vertical ? args[0] : args[1]) === 0); | ||||
|               advance = args[0] - args[1]; | ||||
|               if (isSameTextLine && textContentItem.initialized && | ||||
|               if (combineTextItems && | ||||
|                   isSameTextLine && textContentItem.initialized && | ||||
|                   advance > 0 && | ||||
|                   advance <= textContentItem.fakeMultiSpaceMax) { | ||||
|                 textState.translateTextLineMatrix(args[0], args[1]); | ||||
| @ -1453,7 +1455,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { | ||||
|               // Optimization to treat same line movement as advance.
 | ||||
|               advance = textState.calcTextLineMatrixAdvance( | ||||
|                 args[0], args[1], args[2], args[3], args[4], args[5]); | ||||
|               if (advance !== null && textContentItem.initialized && | ||||
|               if (combineTextItems && | ||||
|                   advance !== null && textContentItem.initialized && | ||||
|                   advance.value > 0 && | ||||
|                   advance.value <= textContentItem.fakeMultiSpaceMax) { | ||||
|                 textState.translateTextLineMatrix(advance.width, | ||||
| @ -1594,7 +1597,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { | ||||
| 
 | ||||
|               next(self.getTextContent(xobj, task, | ||||
|                    xobj.dict.get('Resources') || resources, stateManager, | ||||
|                    normalizeWhitespace).then(function (formTextContent) { | ||||
|                    normalizeWhitespace, combineTextItems).then( | ||||
|                 function (formTextContent) { | ||||
|                   Util.appendToArray(textContent.items, formTextContent.items); | ||||
|                   Util.extendObj(textContent.styles, formTextContent.styles); | ||||
|                   stateManager.restore(); | ||||
|  | ||||
| @ -891,12 +891,14 @@ var WorkerMessageHandler = { | ||||
|     handler.on('GetTextContent', function wphExtractText(data) { | ||||
|       var pageIndex = data.pageIndex; | ||||
|       var normalizeWhitespace = data.normalizeWhitespace; | ||||
|       var combineTextItems = data.combineTextItems; | ||||
|       return pdfManager.getPage(pageIndex).then(function(page) { | ||||
|         var task = new WorkerTask('GetTextContent: page ' + pageIndex); | ||||
|         startWorkerTask(task); | ||||
|         var pageNum = pageIndex + 1; | ||||
|         var start = Date.now(); | ||||
|         return page.extractTextContent(task, normalizeWhitespace).then( | ||||
|         return page.extractTextContent(task, normalizeWhitespace, | ||||
|                                        combineTextItems).then( | ||||
|             function(textContent) { | ||||
|           finishWorkerTask(task); | ||||
|           info('text indexing: page=' + pageNum + ' - time=' + | ||||
|  | ||||
| @ -600,6 +600,8 @@ var PDFDocumentProxy = (function PDFDocumentProxyClosure() { | ||||
|  * @typedef {Object} getTextContentParameters | ||||
|  * @param {boolean} normalizeWhitespace - replaces all occurrences of | ||||
|  *   whitespace with standard spaces (0x20). The default value is `false`. | ||||
|  * @param {boolean} disableCombineTextItems - do not attempt to combine | ||||
|  *   same line {@link TextItem}'s. The default value is `false`. | ||||
|  */ | ||||
| 
 | ||||
| /** | ||||
| @ -891,11 +893,12 @@ var PDFPageProxy = (function PDFPageProxyClosure() { | ||||
|      * object that represent the page text content. | ||||
|      */ | ||||
|     getTextContent: function PDFPageProxy_getTextContent(params) { | ||||
|       var normalizeWhitespace = (params && params.normalizeWhitespace) || false; | ||||
| 
 | ||||
|       return this.transport.messageHandler.sendWithPromise('GetTextContent', { | ||||
|         pageIndex: this.pageNumber - 1, | ||||
|         normalizeWhitespace: normalizeWhitespace, | ||||
|         normalizeWhitespace: (params && params.normalizeWhitespace === true ? | ||||
|                               true : /* Default */ false), | ||||
|         combineTextItems: (params && params.disableCombineTextItems === true ? | ||||
|                            false : /* Default */ true), | ||||
|       }); | ||||
|     }, | ||||
| 
 | ||||
|  | ||||
| @ -469,9 +469,9 @@ var Driver = (function DriverClosure() { | ||||
|               textLayerContext.clearRect(0, 0, | ||||
|                 textLayerCanvas.width, textLayerCanvas.height); | ||||
|               // The text builder will draw its content on the test canvas
 | ||||
|               initPromise = | ||||
|                 page.getTextContent({ normalizeWhitespace: true }).then( | ||||
|                   function(textContent) { | ||||
|               initPromise = page.getTextContent({ | ||||
|                 normalizeWhitespace: true, | ||||
|               }).then(function(textContent) { | ||||
|                 return rasterizeTextLayer(textLayerContext, viewport, | ||||
|                                           textContent); | ||||
|               }); | ||||
|  | ||||
| @ -771,12 +771,14 @@ describe('api', function() { | ||||
|     }); | ||||
|     it('gets text content', function (done) { | ||||
|       var defaultPromise = page.getTextContent(); | ||||
|       var normalizeWhitespacePromise = page.getTextContent({ | ||||
|         normalizeWhitespace: true }); | ||||
|       var parametersPromise = page.getTextContent({ | ||||
|         normalizeWhitespace: true, | ||||
|         disableCombineTextItems: true, | ||||
|       }); | ||||
| 
 | ||||
|       var promises = [ | ||||
|         defaultPromise, | ||||
|         normalizeWhitespacePromise | ||||
|         parametersPromise, | ||||
|       ]; | ||||
|       Promise.all(promises).then(function (data) { | ||||
|         expect(!!data[0].items).toEqual(true); | ||||
|  | ||||
| @ -503,12 +503,12 @@ var PDFPageView = (function PDFPageViewClosure() { | ||||
|         function pdfPageRenderCallback() { | ||||
|           pageViewDrawCallback(null); | ||||
|           if (textLayer) { | ||||
|             self.pdfPage.getTextContent({ normalizeWhitespace: true }).then( | ||||
|               function textContentResolved(textContent) { | ||||
|             self.pdfPage.getTextContent({ | ||||
|               normalizeWhitespace: true, | ||||
|             }).then(function textContentResolved(textContent) { | ||||
|               textLayer.setTextContent(textContent); | ||||
|               textLayer.render(TEXT_LAYER_RENDER_DELAY); | ||||
|               } | ||||
|             ); | ||||
|             }); | ||||
|           } | ||||
|         }, | ||||
|         function pdfPageRenderError(error) { | ||||
|  | ||||
| @ -784,7 +784,9 @@ var PDFViewer = (function pdfViewer() { | ||||
| 
 | ||||
|     getPageTextContent: function (pageIndex) { | ||||
|       return this.pdfDocument.getPage(pageIndex + 1).then(function (page) { | ||||
|         return page.getTextContent({ normalizeWhitespace: true }); | ||||
|         return page.getTextContent({ | ||||
|           normalizeWhitespace: true, | ||||
|         }); | ||||
|       }); | ||||
|     }, | ||||
| 
 | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user