[api-minor] Add a parameter to PDFPageProxy_getTextContent that enables replacing of all whitespace with standard spaces in the textLayer (issue 6612)
				
					
				
			This patch goes a bit further than issue 6612 requires, and replaces all kinds of whitespace with standard spaces. When testing this locally, it actually seemed to slightly improve two existing test cases (`tracemonkey-text` and `taro-text`). Fixes #6612.
This commit is contained in:
		
							parent
							
								
									c2dfe9e9a9
								
							
						
					
					
						commit
						6dfe53b976
					
				| @ -218,7 +218,8 @@ var Page = (function PageClosure() { | |||||||
|       }); |       }); | ||||||
|     }, |     }, | ||||||
| 
 | 
 | ||||||
|     extractTextContent: function Page_extractTextContent(task) { |     extractTextContent: function Page_extractTextContent(task, | ||||||
|  |                                                          normalizeWhitespace) { | ||||||
|       var handler = { |       var handler = { | ||||||
|         on: function nullHandlerOn() {}, |         on: function nullHandlerOn() {}, | ||||||
|         send: function nullHandlerSend() {} |         send: function nullHandlerSend() {} | ||||||
| @ -248,7 +249,9 @@ var Page = (function PageClosure() { | |||||||
| 
 | 
 | ||||||
|         return partialEvaluator.getTextContent(contentStream, |         return partialEvaluator.getTextContent(contentStream, | ||||||
|                                                task, |                                                task, | ||||||
|                                                self.resources); |                                                self.resources, | ||||||
|  |                                                /* stateManager = */ null, | ||||||
|  |                                                normalizeWhitespace); | ||||||
|       }); |       }); | ||||||
|     }, |     }, | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -908,12 +908,15 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { | |||||||
|       }); |       }); | ||||||
|     }, |     }, | ||||||
| 
 | 
 | ||||||
|     getTextContent: function PartialEvaluator_getTextContent(stream, task, |     getTextContent: | ||||||
|                                                              resources, |         function PartialEvaluator_getTextContent(stream, task, resources, | ||||||
|                                                              stateManager) { |                                                  stateManager, | ||||||
|  |                                                  normalizeWhitespace) { | ||||||
| 
 | 
 | ||||||
|       stateManager = (stateManager || new StateManager(new TextState())); |       stateManager = (stateManager || new StateManager(new TextState())); | ||||||
| 
 | 
 | ||||||
|  |       var WhitespaceRegexp = /\s/g; | ||||||
|  | 
 | ||||||
|       var textContent = { |       var textContent = { | ||||||
|         items: [], |         items: [], | ||||||
|         styles: Object.create(null) |         styles: Object.create(null) | ||||||
| @ -1027,11 +1030,23 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { | |||||||
|         return textContentItem; |         return textContentItem; | ||||||
|       } |       } | ||||||
| 
 | 
 | ||||||
|  |       function replaceWhitespace(str) { | ||||||
|  |         // Replaces all whitespaces with standard spaces (0x20), to avoid
 | ||||||
|  |         // alignment issues between the textLayer and the canvas if the text
 | ||||||
|  |         // contains e.g. tabs (fixes issue6612.pdf).
 | ||||||
|  |         var i = 0, ii = str.length, code; | ||||||
|  |         while (i < ii && (code = str.charCodeAt(i)) >= 0x20 && code <= 0x7F) { | ||||||
|  |           i++; | ||||||
|  |         } | ||||||
|  |         return (i < ii ? str.replace(WhitespaceRegexp, ' ') : str); | ||||||
|  |       } | ||||||
|  | 
 | ||||||
|       function runBidiTransform(textChunk) { |       function runBidiTransform(textChunk) { | ||||||
|         var str = textChunk.str.join(''); |         var str = textChunk.str.join(''); | ||||||
|         var bidiResult = PDFJS.bidi(str, -1, textChunk.vertical); |         var bidiResult = PDFJS.bidi(str, -1, textChunk.vertical); | ||||||
|         return { |         return { | ||||||
|           str: bidiResult.str, |           str: (normalizeWhitespace ? replaceWhitespace(bidiResult.str) : | ||||||
|  |                                       bidiResult.str), | ||||||
|           dir: bidiResult.dir, |           dir: bidiResult.dir, | ||||||
|           width: textChunk.width, |           width: textChunk.width, | ||||||
|           height: textChunk.height, |           height: textChunk.height, | ||||||
| @ -1352,8 +1367,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { | |||||||
|               } |               } | ||||||
| 
 | 
 | ||||||
|               return self.getTextContent(xobj, task, |               return self.getTextContent(xobj, task, | ||||||
|                 xobj.dict.get('Resources') || resources, stateManager). |                 xobj.dict.get('Resources') || resources, stateManager, | ||||||
|                 then(function (formTextContent) { |                 normalizeWhitespace).then(function (formTextContent) { | ||||||
|                   Util.appendToArray(textContent.items, formTextContent.items); |                   Util.appendToArray(textContent.items, formTextContent.items); | ||||||
|                   Util.extendObj(textContent.styles, formTextContent.styles); |                   Util.extendObj(textContent.styles, formTextContent.styles); | ||||||
|                   stateManager.restore(); |                   stateManager.restore(); | ||||||
|  | |||||||
| @ -517,12 +517,14 @@ var WorkerMessageHandler = PDFJS.WorkerMessageHandler = { | |||||||
| 
 | 
 | ||||||
|     handler.on('GetTextContent', function wphExtractText(data) { |     handler.on('GetTextContent', function wphExtractText(data) { | ||||||
|       var pageIndex = data.pageIndex; |       var pageIndex = data.pageIndex; | ||||||
|  |       var normalizeWhitespace = data.normalizeWhitespace; | ||||||
|       return pdfManager.getPage(pageIndex).then(function(page) { |       return pdfManager.getPage(pageIndex).then(function(page) { | ||||||
|         var task = new WorkerTask('GetTextContent: page ' + pageIndex); |         var task = new WorkerTask('GetTextContent: page ' + pageIndex); | ||||||
|         startWorkerTask(task); |         startWorkerTask(task); | ||||||
|         var pageNum = pageIndex + 1; |         var pageNum = pageIndex + 1; | ||||||
|         var start = Date.now(); |         var start = Date.now(); | ||||||
|         return page.extractTextContent(task).then(function(textContent) { |         return page.extractTextContent(task, normalizeWhitespace).then( | ||||||
|  |             function(textContent) { | ||||||
|           finishWorkerTask(task); |           finishWorkerTask(task); | ||||||
|           info('text indexing: page=' + pageNum + ' - time=' + |           info('text indexing: page=' + pageNum + ' - time=' + | ||||||
|                (Date.now() - start) + 'ms'); |                (Date.now() - start) + 'ms'); | ||||||
|  | |||||||
| @ -708,6 +708,14 @@ var PDFDocumentProxy = (function PDFDocumentProxyClosure() { | |||||||
|   return PDFDocumentProxy; |   return PDFDocumentProxy; | ||||||
| })(); | })(); | ||||||
| 
 | 
 | ||||||
|  | /** | ||||||
|  |  * Page getTextContent parameters. | ||||||
|  |  * | ||||||
|  |  * @typedef {Object} getTextContentParameters | ||||||
|  |  * @property {boolean} normalizeWhitespace - replaces all occurrences of | ||||||
|  |  *   whitespace with standard spaces (0x20). The default value is `false`. | ||||||
|  |  */ | ||||||
|  | 
 | ||||||
| /** | /** | ||||||
|  * Page text content. |  * Page text content. | ||||||
|  * |  * | ||||||
| @ -986,12 +994,16 @@ var PDFPageProxy = (function PDFPageProxyClosure() { | |||||||
|     }, |     }, | ||||||
| 
 | 
 | ||||||
|     /** |     /** | ||||||
|  |      * @param {getTextContentParameters} params - getTextContent parameters. | ||||||
|      * @return {Promise} That is resolved with a {@link TextContent} |      * @return {Promise} That is resolved with a {@link TextContent} | ||||||
|      * object that represents the page text content. |      * object that represents the page text content. | ||||||
|      */ |      */ | ||||||
|     getTextContent: function PDFPageProxy_getTextContent() { |     getTextContent: function PDFPageProxy_getTextContent(params) { | ||||||
|  |       var normalizeWhitespace = (params && params.normalizeWhitespace) || false; | ||||||
|  | 
 | ||||||
|       return this.transport.messageHandler.sendWithPromise('GetTextContent', { |       return this.transport.messageHandler.sendWithPromise('GetTextContent', { | ||||||
|         pageIndex: this.pageNumber - 1 |         pageIndex: this.pageNumber - 1, | ||||||
|  |         normalizeWhitespace: normalizeWhitespace, | ||||||
|       }); |       }); | ||||||
|     }, |     }, | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -334,10 +334,12 @@ var Driver = (function DriverClosure() { | |||||||
|               textLayerContext.clearRect(0, 0, |               textLayerContext.clearRect(0, 0, | ||||||
|                 textLayerCanvas.width, textLayerCanvas.height); |                 textLayerCanvas.width, textLayerCanvas.height); | ||||||
|               // The text builder will draw its content on the test canvas
 |               // The text builder will draw its content on the test canvas
 | ||||||
|               initPromise = page.getTextContent().then(function(textContent) { |               initPromise = | ||||||
|                 return rasterizeTextLayer(textLayerContext, viewport, |                 page.getTextContent({ normalizeWhitespace: true }).then( | ||||||
|                                           textContent); |                   function(textContent) { | ||||||
|               }); |                     return rasterizeTextLayer(textLayerContext, viewport, | ||||||
|  |                                               textContent); | ||||||
|  |                 }); | ||||||
|             } else { |             } else { | ||||||
|               textLayerCanvas = null; |               textLayerCanvas = null; | ||||||
|               initPromise = Promise.resolve(); |               initPromise = Promise.resolve(); | ||||||
|  | |||||||
							
								
								
									
										1
									
								
								test/pdfs/.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										1
									
								
								test/pdfs/.gitignore
									
									
									
									
										vendored
									
									
								
							| @ -49,6 +49,7 @@ | |||||||
| !issue5280.pdf | !issue5280.pdf | ||||||
| !issue5677.pdf | !issue5677.pdf | ||||||
| !issue5954.pdf | !issue5954.pdf | ||||||
|  | !issue6612.pdf | ||||||
| !alphatrans.pdf | !alphatrans.pdf | ||||||
| !devicen.pdf | !devicen.pdf | ||||||
| !cmykjpeg.pdf | !cmykjpeg.pdf | ||||||
|  | |||||||
							
								
								
									
										
											BIN
										
									
								
								test/pdfs/issue6612.pdf
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								test/pdfs/issue6612.pdf
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							| @ -1271,6 +1271,13 @@ | |||||||
|        "link": false, |        "link": false, | ||||||
|        "type": "eq" |        "type": "eq" | ||||||
|     }, |     }, | ||||||
|  |     {  "id": "issue6612-text", | ||||||
|  |        "file": "pdfs/issue6612.pdf", | ||||||
|  |        "md5": "657f33236496916597cd70ef1222509a", | ||||||
|  |        "rounds": 1, | ||||||
|  |        "link": false, | ||||||
|  |        "type": "text" | ||||||
|  |     }, | ||||||
|     {  "id": "zerowidthline", |     {  "id": "zerowidthline", | ||||||
|       "file": "pdfs/zerowidthline.pdf", |       "file": "pdfs/zerowidthline.pdf", | ||||||
|       "md5": "295d26e61a85635433f8e4b768953f60", |       "md5": "295d26e61a85635433f8e4b768953f60", | ||||||
|  | |||||||
| @ -482,11 +482,21 @@ describe('api', function() { | |||||||
|       }); |       }); | ||||||
|     }); |     }); | ||||||
|     it('gets text content', function () { |     it('gets text content', function () { | ||||||
|       var promise = page.getTextContent(); |       var defaultPromise = page.getTextContent(); | ||||||
|       waitsForPromiseResolved(promise, function (data) { |       var normalizeWhitespacePromise = page.getTextContent({ | ||||||
|         expect(!!data.items).toEqual(true); |         normalizeWhitespace: true }); | ||||||
|         expect(data.items.length).toEqual(7); | 
 | ||||||
|         expect(!!data.styles).toEqual(true); |       var promises = [ | ||||||
|  |         defaultPromise, | ||||||
|  |         normalizeWhitespacePromise | ||||||
|  |       ]; | ||||||
|  |       waitsForPromiseResolved(Promise.all(promises), function (data) { | ||||||
|  |         expect(!!data[0].items).toEqual(true); | ||||||
|  |         expect(data[0].items.length).toEqual(7); | ||||||
|  |         expect(!!data[0].styles).toEqual(true); | ||||||
|  | 
 | ||||||
|  |         // A simple check that ensures the two `textContent` objects match.
 | ||||||
|  |         expect(JSON.stringify(data[0])).toEqual(JSON.stringify(data[1])); | ||||||
|       }); |       }); | ||||||
|     }); |     }); | ||||||
|     it('gets operator list', function() { |     it('gets operator list', function() { | ||||||
|  | |||||||
| @ -66,7 +66,6 @@ var PDFFindController = (function PDFFindControllerClosure() { | |||||||
|       '\u00BC': '1/4', // Vulgar fraction one quarter
 |       '\u00BC': '1/4', // Vulgar fraction one quarter
 | ||||||
|       '\u00BD': '1/2', // Vulgar fraction one half
 |       '\u00BD': '1/2', // Vulgar fraction one half
 | ||||||
|       '\u00BE': '3/4', // Vulgar fraction three quarters
 |       '\u00BE': '3/4', // Vulgar fraction three quarters
 | ||||||
|       '\u00A0': ' ' // No-break space
 |  | ||||||
|     }; |     }; | ||||||
|     this.findBar = options.findBar || null; |     this.findBar = options.findBar || null; | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -489,7 +489,7 @@ var PDFPageView = (function PDFPageViewClosure() { | |||||||
|         function pdfPageRenderCallback() { |         function pdfPageRenderCallback() { | ||||||
|           pageViewDrawCallback(null); |           pageViewDrawCallback(null); | ||||||
|           if (textLayer) { |           if (textLayer) { | ||||||
|             self.pdfPage.getTextContent().then( |             self.pdfPage.getTextContent({ normalizeWhitespace: true }).then( | ||||||
|               function textContentResolved(textContent) { |               function textContentResolved(textContent) { | ||||||
|                 textLayer.setTextContent(textContent); |                 textLayer.setTextContent(textContent); | ||||||
|                 textLayer.render(TEXT_LAYER_RENDER_DELAY); |                 textLayer.render(TEXT_LAYER_RENDER_DELAY); | ||||||
|  | |||||||
| @ -729,7 +729,7 @@ var PDFViewer = (function pdfViewer() { | |||||||
| 
 | 
 | ||||||
|     getPageTextContent: function (pageIndex) { |     getPageTextContent: function (pageIndex) { | ||||||
|       return this.pdfDocument.getPage(pageIndex + 1).then(function (page) { |       return this.pdfDocument.getPage(pageIndex + 1).then(function (page) { | ||||||
|         return page.getTextContent(); |         return page.getTextContent({ normalizeWhitespace: true }); | ||||||
|       }); |       }); | ||||||
|     }, |     }, | ||||||
| 
 | 
 | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user