[api-minor] Add a parameter to PDFPageProxy_getTextContent that enables replacing of all whitespace with standard spaces in the textLayer (issue 6612)
				
					
				
			This patch goes a bit further than issue 6612 requires, and replaces all kinds of whitespace with standard spaces. When testing this locally, it actually seemed to slightly improve two existing test-cases (`tracemonkey-text` and `taro-text`). Fixes issue 6612.
This commit is contained in:
		
							parent
							
								
									c2dfe9e9a9
								
							
						
					
					
						commit
						6dfe53b976
					
				| @ -218,7 +218,8 @@ var Page = (function PageClosure() { | ||||
|       }); | ||||
|     }, | ||||
| 
 | ||||
|     extractTextContent: function Page_extractTextContent(task) { | ||||
|     extractTextContent: function Page_extractTextContent(task, | ||||
|                                                          normalizeWhitespace) { | ||||
|       var handler = { | ||||
|         on: function nullHandlerOn() {}, | ||||
|         send: function nullHandlerSend() {} | ||||
| @ -248,7 +249,9 @@ var Page = (function PageClosure() { | ||||
| 
 | ||||
|         return partialEvaluator.getTextContent(contentStream, | ||||
|                                                task, | ||||
|                                                self.resources); | ||||
|                                                self.resources, | ||||
|                                                /* stateManager = */ null, | ||||
|                                                normalizeWhitespace); | ||||
|       }); | ||||
|     }, | ||||
| 
 | ||||
|  | ||||
| @ -908,12 +908,15 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { | ||||
|       }); | ||||
|     }, | ||||
| 
 | ||||
|     getTextContent: function PartialEvaluator_getTextContent(stream, task, | ||||
|                                                              resources, | ||||
|                                                              stateManager) { | ||||
|     getTextContent: | ||||
|         function PartialEvaluator_getTextContent(stream, task, resources, | ||||
|                                                  stateManager, | ||||
|                                                  normalizeWhitespace) { | ||||
| 
 | ||||
|       stateManager = (stateManager || new StateManager(new TextState())); | ||||
| 
 | ||||
|       var WhitespaceRegexp = /\s/g; | ||||
| 
 | ||||
|       var textContent = { | ||||
|         items: [], | ||||
|         styles: Object.create(null) | ||||
| @ -1027,11 +1030,23 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { | ||||
|         return textContentItem; | ||||
|       } | ||||
| 
 | ||||
|       function replaceWhitespace(str) { | ||||
|         // Replaces all whitespace with standard spaces (0x20), to avoid
 | ||||
|         // alignment issues between the textLayer and the canvas if the text
 | ||||
|         // contains e.g. tabs (fixes issue6612.pdf).
 | ||||
|         var i = 0, ii = str.length, code; | ||||
|         while (i < ii && (code = str.charCodeAt(i)) >= 0x20 && code <= 0x7F) { | ||||
|           i++; | ||||
|         } | ||||
|         return (i < ii ? str.replace(WhitespaceRegexp, ' ') : str); | ||||
|       } | ||||
| 
 | ||||
|       function runBidiTransform(textChunk) { | ||||
|         var str = textChunk.str.join(''); | ||||
|         var bidiResult = PDFJS.bidi(str, -1, textChunk.vertical); | ||||
|         return { | ||||
|           str: bidiResult.str, | ||||
|           str: (normalizeWhitespace ? replaceWhitespace(bidiResult.str) : | ||||
|                                       bidiResult.str), | ||||
|           dir: bidiResult.dir, | ||||
|           width: textChunk.width, | ||||
|           height: textChunk.height, | ||||
| @ -1352,8 +1367,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { | ||||
|               } | ||||
| 
 | ||||
|               return self.getTextContent(xobj, task, | ||||
|                 xobj.dict.get('Resources') || resources, stateManager). | ||||
|                 then(function (formTextContent) { | ||||
|                 xobj.dict.get('Resources') || resources, stateManager, | ||||
|                 normalizeWhitespace).then(function (formTextContent) { | ||||
|                   Util.appendToArray(textContent.items, formTextContent.items); | ||||
|                   Util.extendObj(textContent.styles, formTextContent.styles); | ||||
|                   stateManager.restore(); | ||||
|  | ||||
| @ -517,12 +517,14 @@ var WorkerMessageHandler = PDFJS.WorkerMessageHandler = { | ||||
| 
 | ||||
|     handler.on('GetTextContent', function wphExtractText(data) { | ||||
|       var pageIndex = data.pageIndex; | ||||
|       var normalizeWhitespace = data.normalizeWhitespace; | ||||
|       return pdfManager.getPage(pageIndex).then(function(page) { | ||||
|         var task = new WorkerTask('GetTextContent: page ' + pageIndex); | ||||
|         startWorkerTask(task); | ||||
|         var pageNum = pageIndex + 1; | ||||
|         var start = Date.now(); | ||||
|         return page.extractTextContent(task).then(function(textContent) { | ||||
|         return page.extractTextContent(task, normalizeWhitespace).then( | ||||
|             function(textContent) { | ||||
|           finishWorkerTask(task); | ||||
|           info('text indexing: page=' + pageNum + ' - time=' + | ||||
|                (Date.now() - start) + 'ms'); | ||||
|  | ||||
| @ -708,6 +708,14 @@ var PDFDocumentProxy = (function PDFDocumentProxyClosure() { | ||||
|   return PDFDocumentProxy; | ||||
| })(); | ||||
| 
 | ||||
| /** | ||||
|  * Page getTextContent parameters. | ||||
|  * | ||||
|  * @typedef {Object} getTextContentParameters | ||||
|  * @property {boolean} normalizeWhitespace - replaces all occurrences of | ||||
|  *   whitespace with standard spaces (0x20). The default value is `false`. | ||||
|  */ | ||||
| 
 | ||||
| /** | ||||
|  * Page text content. | ||||
|  * | ||||
| @ -986,12 +994,16 @@ var PDFPageProxy = (function PDFPageProxyClosure() { | ||||
|     }, | ||||
| 
 | ||||
|     /** | ||||
|      * @param {getTextContentParameters} params - getTextContent parameters. | ||||
|      * @return {Promise} That is resolved with a {@link TextContent} | ||||
|      * object that represents the page text content. | ||||
|      */ | ||||
|     getTextContent: function PDFPageProxy_getTextContent() { | ||||
|     getTextContent: function PDFPageProxy_getTextContent(params) { | ||||
|       var normalizeWhitespace = (params && params.normalizeWhitespace) || false; | ||||
| 
 | ||||
|       return this.transport.messageHandler.sendWithPromise('GetTextContent', { | ||||
|         pageIndex: this.pageNumber - 1 | ||||
|         pageIndex: this.pageNumber - 1, | ||||
|         normalizeWhitespace: normalizeWhitespace, | ||||
|       }); | ||||
|     }, | ||||
| 
 | ||||
|  | ||||
| @ -334,10 +334,12 @@ var Driver = (function DriverClosure() { | ||||
|               textLayerContext.clearRect(0, 0, | ||||
|                 textLayerCanvas.width, textLayerCanvas.height); | ||||
|               // The text builder will draw its content on the test canvas
 | ||||
|               initPromise = page.getTextContent().then(function(textContent) { | ||||
|                 return rasterizeTextLayer(textLayerContext, viewport, | ||||
|                                           textContent); | ||||
|               }); | ||||
|               initPromise = | ||||
|                 page.getTextContent({ normalizeWhitespace: true }).then( | ||||
|                   function(textContent) { | ||||
|                     return rasterizeTextLayer(textLayerContext, viewport, | ||||
|                                               textContent); | ||||
|                 }); | ||||
|             } else { | ||||
|               textLayerCanvas = null; | ||||
|               initPromise = Promise.resolve(); | ||||
|  | ||||
							
								
								
									
										1
									
								
								test/pdfs/.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										1
									
								
								test/pdfs/.gitignore
									
									
									
									
										vendored
									
									
								
							| @ -49,6 +49,7 @@ | ||||
| !issue5280.pdf | ||||
| !issue5677.pdf | ||||
| !issue5954.pdf | ||||
| !issue6612.pdf | ||||
| !alphatrans.pdf | ||||
| !devicen.pdf | ||||
| !cmykjpeg.pdf | ||||
|  | ||||
							
								
								
									
										
											BIN
										
									
								
								test/pdfs/issue6612.pdf
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								test/pdfs/issue6612.pdf
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							| @ -1271,6 +1271,13 @@ | ||||
|        "link": false, | ||||
|        "type": "eq" | ||||
|     }, | ||||
|     {  "id": "issue6612-text", | ||||
|        "file": "pdfs/issue6612.pdf", | ||||
|        "md5": "657f33236496916597cd70ef1222509a", | ||||
|        "rounds": 1, | ||||
|        "link": false, | ||||
|        "type": "text" | ||||
|     }, | ||||
|     {  "id": "zerowidthline", | ||||
|       "file": "pdfs/zerowidthline.pdf", | ||||
|       "md5": "295d26e61a85635433f8e4b768953f60", | ||||
|  | ||||
| @ -482,11 +482,21 @@ describe('api', function() { | ||||
|       }); | ||||
|     }); | ||||
|     it('gets text content', function () { | ||||
|       var promise = page.getTextContent(); | ||||
|       waitsForPromiseResolved(promise, function (data) { | ||||
|         expect(!!data.items).toEqual(true); | ||||
|         expect(data.items.length).toEqual(7); | ||||
|         expect(!!data.styles).toEqual(true); | ||||
|       var defaultPromise = page.getTextContent(); | ||||
|       var normalizeWhitespacePromise = page.getTextContent({ | ||||
|         normalizeWhitespace: true }); | ||||
| 
 | ||||
|       var promises = [ | ||||
|         defaultPromise, | ||||
|         normalizeWhitespacePromise | ||||
|       ]; | ||||
|       waitsForPromiseResolved(Promise.all(promises), function (data) { | ||||
|         expect(!!data[0].items).toEqual(true); | ||||
|         expect(data[0].items.length).toEqual(7); | ||||
|         expect(!!data[0].styles).toEqual(true); | ||||
| 
 | ||||
|         // A simple check that ensures the two `textContent` objects match.
 | ||||
|         expect(JSON.stringify(data[0])).toEqual(JSON.stringify(data[1])); | ||||
|       }); | ||||
|     }); | ||||
|     it('gets operator list', function() { | ||||
|  | ||||
| @ -66,7 +66,6 @@ var PDFFindController = (function PDFFindControllerClosure() { | ||||
|       '\u00BC': '1/4', // Vulgar fraction one quarter
 | ||||
|       '\u00BD': '1/2', // Vulgar fraction one half
 | ||||
|       '\u00BE': '3/4', // Vulgar fraction three quarters
 | ||||
|       '\u00A0': ' ' // No-break space
 | ||||
|     }; | ||||
|     this.findBar = options.findBar || null; | ||||
| 
 | ||||
|  | ||||
| @ -489,7 +489,7 @@ var PDFPageView = (function PDFPageViewClosure() { | ||||
|         function pdfPageRenderCallback() { | ||||
|           pageViewDrawCallback(null); | ||||
|           if (textLayer) { | ||||
|             self.pdfPage.getTextContent().then( | ||||
|             self.pdfPage.getTextContent({ normalizeWhitespace: true }).then( | ||||
|               function textContentResolved(textContent) { | ||||
|                 textLayer.setTextContent(textContent); | ||||
|                 textLayer.render(TEXT_LAYER_RENDER_DELAY); | ||||
|  | ||||
| @ -471,7 +471,7 @@ var PDFViewer = (function pdfViewer() { | ||||
|       if (!this.pdfDocument) { | ||||
|         return; | ||||
|       } | ||||
|        | ||||
| 
 | ||||
|       var pageView = this._pages[pageNumber - 1]; | ||||
| 
 | ||||
|       if (this.isInPresentationMode) { | ||||
| @ -729,7 +729,7 @@ var PDFViewer = (function pdfViewer() { | ||||
| 
 | ||||
|     getPageTextContent: function (pageIndex) { | ||||
|       return this.pdfDocument.getPage(pageIndex + 1).then(function (page) { | ||||
|         return page.getTextContent(); | ||||
|         return page.getTextContent({ normalizeWhitespace: true }); | ||||
|       }); | ||||
|     }, | ||||
| 
 | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user