[api-minor] Add a parameter to PDFPageProxy_getTextContent that enables replacing of all whitespace with standard spaces in the textLayer (issue 6612)
				
					
				
			This patch goes a bit further than issue 6612 requires, and replaces all kinds of whitespace with standard spaces. When testing this locally, it actually seemed to slightly improve two existing test-cases (`tracemonkey-text` and `taro-text`). Fixes 6612.
This commit is contained in:
		
							parent
							
								
									c2dfe9e9a9
								
							
						
					
					
						commit
						6dfe53b976
					
				@ -218,7 +218,8 @@ var Page = (function PageClosure() {
 | 
			
		||||
      });
 | 
			
		||||
    },
 | 
			
		||||
 | 
			
		||||
    extractTextContent: function Page_extractTextContent(task) {
 | 
			
		||||
    extractTextContent: function Page_extractTextContent(task,
 | 
			
		||||
                                                         normalizeWhitespace) {
 | 
			
		||||
      var handler = {
 | 
			
		||||
        on: function nullHandlerOn() {},
 | 
			
		||||
        send: function nullHandlerSend() {}
 | 
			
		||||
@ -248,7 +249,9 @@ var Page = (function PageClosure() {
 | 
			
		||||
 | 
			
		||||
        return partialEvaluator.getTextContent(contentStream,
 | 
			
		||||
                                               task,
 | 
			
		||||
                                               self.resources);
 | 
			
		||||
                                               self.resources,
 | 
			
		||||
                                               /* stateManager = */ null,
 | 
			
		||||
                                               normalizeWhitespace);
 | 
			
		||||
      });
 | 
			
		||||
    },
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -908,12 +908,15 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
 | 
			
		||||
      });
 | 
			
		||||
    },
 | 
			
		||||
 | 
			
		||||
    getTextContent: function PartialEvaluator_getTextContent(stream, task,
 | 
			
		||||
                                                             resources,
 | 
			
		||||
                                                             stateManager) {
 | 
			
		||||
    getTextContent:
 | 
			
		||||
        function PartialEvaluator_getTextContent(stream, task, resources,
 | 
			
		||||
                                                 stateManager,
 | 
			
		||||
                                                 normalizeWhitespace) {
 | 
			
		||||
 | 
			
		||||
      stateManager = (stateManager || new StateManager(new TextState()));
 | 
			
		||||
 | 
			
		||||
      var WhitespaceRegexp = /\s/g;
 | 
			
		||||
 | 
			
		||||
      var textContent = {
 | 
			
		||||
        items: [],
 | 
			
		||||
        styles: Object.create(null)
 | 
			
		||||
@ -1027,11 +1030,23 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
 | 
			
		||||
        return textContentItem;
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      function replaceWhitespace(str) {
 | 
			
		||||
        // Replaces all whitespaces with standard spaces (0x20), to avoid
 | 
			
		||||
        // alignment issues between the textLayer and the canvas if the text
 | 
			
		||||
        // contains e.g. tabs (fixes issue6612.pdf).
 | 
			
		||||
        var i = 0, ii = str.length, code;
 | 
			
		||||
        while (i < ii && (code = str.charCodeAt(i)) >= 0x20 && code <= 0x7F) {
 | 
			
		||||
          i++;
 | 
			
		||||
        }
 | 
			
		||||
        return (i < ii ? str.replace(WhitespaceRegexp, ' ') : str);
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      function runBidiTransform(textChunk) {
 | 
			
		||||
        var str = textChunk.str.join('');
 | 
			
		||||
        var bidiResult = PDFJS.bidi(str, -1, textChunk.vertical);
 | 
			
		||||
        return {
 | 
			
		||||
          str: bidiResult.str,
 | 
			
		||||
          str: (normalizeWhitespace ? replaceWhitespace(bidiResult.str) :
 | 
			
		||||
                                      bidiResult.str),
 | 
			
		||||
          dir: bidiResult.dir,
 | 
			
		||||
          width: textChunk.width,
 | 
			
		||||
          height: textChunk.height,
 | 
			
		||||
@ -1352,8 +1367,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
 | 
			
		||||
              }
 | 
			
		||||
 | 
			
		||||
              return self.getTextContent(xobj, task,
 | 
			
		||||
                xobj.dict.get('Resources') || resources, stateManager).
 | 
			
		||||
                then(function (formTextContent) {
 | 
			
		||||
                xobj.dict.get('Resources') || resources, stateManager,
 | 
			
		||||
                normalizeWhitespace).then(function (formTextContent) {
 | 
			
		||||
                  Util.appendToArray(textContent.items, formTextContent.items);
 | 
			
		||||
                  Util.extendObj(textContent.styles, formTextContent.styles);
 | 
			
		||||
                  stateManager.restore();
 | 
			
		||||
 | 
			
		||||
@ -517,12 +517,14 @@ var WorkerMessageHandler = PDFJS.WorkerMessageHandler = {
 | 
			
		||||
 | 
			
		||||
    handler.on('GetTextContent', function wphExtractText(data) {
 | 
			
		||||
      var pageIndex = data.pageIndex;
 | 
			
		||||
      var normalizeWhitespace = data.normalizeWhitespace;
 | 
			
		||||
      return pdfManager.getPage(pageIndex).then(function(page) {
 | 
			
		||||
        var task = new WorkerTask('GetTextContent: page ' + pageIndex);
 | 
			
		||||
        startWorkerTask(task);
 | 
			
		||||
        var pageNum = pageIndex + 1;
 | 
			
		||||
        var start = Date.now();
 | 
			
		||||
        return page.extractTextContent(task).then(function(textContent) {
 | 
			
		||||
        return page.extractTextContent(task, normalizeWhitespace).then(
 | 
			
		||||
            function(textContent) {
 | 
			
		||||
          finishWorkerTask(task);
 | 
			
		||||
          info('text indexing: page=' + pageNum + ' - time=' +
 | 
			
		||||
               (Date.now() - start) + 'ms');
 | 
			
		||||
 | 
			
		||||
@ -708,6 +708,14 @@ var PDFDocumentProxy = (function PDFDocumentProxyClosure() {
 | 
			
		||||
  return PDFDocumentProxy;
 | 
			
		||||
})();
 | 
			
		||||
 | 
			
		||||
/**
 | 
			
		||||
 * Page getTextContent parameters.
 | 
			
		||||
 *
 | 
			
		||||
 * @typedef {Object} getTextContentParameters
 | 
			
		||||
 * @property {boolean} normalizeWhitespace - replaces all occurrences of
 | 
			
		||||
 *   whitespace with standard spaces (0x20). The default value is `false`.
 | 
			
		||||
 */
 | 
			
		||||
 | 
			
		||||
/**
 | 
			
		||||
 * Page text content.
 | 
			
		||||
 *
 | 
			
		||||
@ -986,12 +994,16 @@ var PDFPageProxy = (function PDFPageProxyClosure() {
 | 
			
		||||
    },
 | 
			
		||||
 | 
			
		||||
    /**
 | 
			
		||||
     * @param {getTextContentParameters} params - getTextContent parameters.
 | 
			
		||||
     * @return {Promise} That is resolved with a {@link TextContent}
 | 
			
		||||
     * object that represents the page text content.
 | 
			
		||||
     */
 | 
			
		||||
    getTextContent: function PDFPageProxy_getTextContent() {
 | 
			
		||||
    getTextContent: function PDFPageProxy_getTextContent(params) {
 | 
			
		||||
      var normalizeWhitespace = (params && params.normalizeWhitespace) || false;
 | 
			
		||||
 | 
			
		||||
      return this.transport.messageHandler.sendWithPromise('GetTextContent', {
 | 
			
		||||
        pageIndex: this.pageNumber - 1
 | 
			
		||||
        pageIndex: this.pageNumber - 1,
 | 
			
		||||
        normalizeWhitespace: normalizeWhitespace,
 | 
			
		||||
      });
 | 
			
		||||
    },
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -334,10 +334,12 @@ var Driver = (function DriverClosure() {
 | 
			
		||||
              textLayerContext.clearRect(0, 0,
 | 
			
		||||
                textLayerCanvas.width, textLayerCanvas.height);
 | 
			
		||||
              // The text builder will draw its content on the test canvas
 | 
			
		||||
              initPromise = page.getTextContent().then(function(textContent) {
 | 
			
		||||
                return rasterizeTextLayer(textLayerContext, viewport,
 | 
			
		||||
                                          textContent);
 | 
			
		||||
              });
 | 
			
		||||
              initPromise =
 | 
			
		||||
                page.getTextContent({ normalizeWhitespace: true }).then(
 | 
			
		||||
                  function(textContent) {
 | 
			
		||||
                    return rasterizeTextLayer(textLayerContext, viewport,
 | 
			
		||||
                                              textContent);
 | 
			
		||||
                });
 | 
			
		||||
            } else {
 | 
			
		||||
              textLayerCanvas = null;
 | 
			
		||||
              initPromise = Promise.resolve();
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										1
									
								
								test/pdfs/.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										1
									
								
								test/pdfs/.gitignore
									
									
									
									
										vendored
									
									
								
							@ -49,6 +49,7 @@
 | 
			
		||||
!issue5280.pdf
 | 
			
		||||
!issue5677.pdf
 | 
			
		||||
!issue5954.pdf
 | 
			
		||||
!issue6612.pdf
 | 
			
		||||
!alphatrans.pdf
 | 
			
		||||
!devicen.pdf
 | 
			
		||||
!cmykjpeg.pdf
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										
											BIN
										
									
								
								test/pdfs/issue6612.pdf
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								test/pdfs/issue6612.pdf
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							@ -1271,6 +1271,13 @@
 | 
			
		||||
       "link": false,
 | 
			
		||||
       "type": "eq"
 | 
			
		||||
    },
 | 
			
		||||
    {  "id": "issue6612-text",
 | 
			
		||||
       "file": "pdfs/issue6612.pdf",
 | 
			
		||||
       "md5": "657f33236496916597cd70ef1222509a",
 | 
			
		||||
       "rounds": 1,
 | 
			
		||||
       "link": false,
 | 
			
		||||
       "type": "text"
 | 
			
		||||
    },
 | 
			
		||||
    {  "id": "zerowidthline",
 | 
			
		||||
      "file": "pdfs/zerowidthline.pdf",
 | 
			
		||||
      "md5": "295d26e61a85635433f8e4b768953f60",
 | 
			
		||||
 | 
			
		||||
@ -482,11 +482,21 @@ describe('api', function() {
 | 
			
		||||
      });
 | 
			
		||||
    });
 | 
			
		||||
    it('gets text content', function () {
 | 
			
		||||
      var promise = page.getTextContent();
 | 
			
		||||
      waitsForPromiseResolved(promise, function (data) {
 | 
			
		||||
        expect(!!data.items).toEqual(true);
 | 
			
		||||
        expect(data.items.length).toEqual(7);
 | 
			
		||||
        expect(!!data.styles).toEqual(true);
 | 
			
		||||
      var defaultPromise = page.getTextContent();
 | 
			
		||||
      var normalizeWhitespacePromise = page.getTextContent({
 | 
			
		||||
        normalizeWhitespace: true });
 | 
			
		||||
 | 
			
		||||
      var promises = [
 | 
			
		||||
        defaultPromise,
 | 
			
		||||
        normalizeWhitespacePromise
 | 
			
		||||
      ];
 | 
			
		||||
      waitsForPromiseResolved(Promise.all(promises), function (data) {
 | 
			
		||||
        expect(!!data[0].items).toEqual(true);
 | 
			
		||||
        expect(data[0].items.length).toEqual(7);
 | 
			
		||||
        expect(!!data[0].styles).toEqual(true);
 | 
			
		||||
 | 
			
		||||
        // A simple check that ensures the two `textContent` objects match.
 | 
			
		||||
        expect(JSON.stringify(data[0])).toEqual(JSON.stringify(data[1]));
 | 
			
		||||
      });
 | 
			
		||||
    });
 | 
			
		||||
    it('gets operator list', function() {
 | 
			
		||||
 | 
			
		||||
@ -66,7 +66,6 @@ var PDFFindController = (function PDFFindControllerClosure() {
 | 
			
		||||
      '\u00BC': '1/4', // Vulgar fraction one quarter
 | 
			
		||||
      '\u00BD': '1/2', // Vulgar fraction one half
 | 
			
		||||
      '\u00BE': '3/4', // Vulgar fraction three quarters
 | 
			
		||||
      '\u00A0': ' ' // No-break space
 | 
			
		||||
    };
 | 
			
		||||
    this.findBar = options.findBar || null;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -489,7 +489,7 @@ var PDFPageView = (function PDFPageViewClosure() {
 | 
			
		||||
        function pdfPageRenderCallback() {
 | 
			
		||||
          pageViewDrawCallback(null);
 | 
			
		||||
          if (textLayer) {
 | 
			
		||||
            self.pdfPage.getTextContent().then(
 | 
			
		||||
            self.pdfPage.getTextContent({ normalizeWhitespace: true }).then(
 | 
			
		||||
              function textContentResolved(textContent) {
 | 
			
		||||
                textLayer.setTextContent(textContent);
 | 
			
		||||
                textLayer.render(TEXT_LAYER_RENDER_DELAY);
 | 
			
		||||
 | 
			
		||||
@ -729,7 +729,7 @@ var PDFViewer = (function pdfViewer() {
 | 
			
		||||
 | 
			
		||||
    getPageTextContent: function (pageIndex) {
 | 
			
		||||
      return this.pdfDocument.getPage(pageIndex + 1).then(function (page) {
 | 
			
		||||
        return page.getTextContent();
 | 
			
		||||
        return page.getTextContent({ normalizeWhitespace: true });
 | 
			
		||||
      });
 | 
			
		||||
    },
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user