diff --git a/src/core/core.js b/src/core/core.js index 984c5e91e..52ac6d58b 100644 --- a/src/core/core.js +++ b/src/core/core.js @@ -218,7 +218,8 @@ var Page = (function PageClosure() { }); }, - extractTextContent: function Page_extractTextContent(task) { + extractTextContent: function Page_extractTextContent(task, + normalizeWhitespace) { var handler = { on: function nullHandlerOn() {}, send: function nullHandlerSend() {} @@ -248,7 +249,9 @@ var Page = (function PageClosure() { return partialEvaluator.getTextContent(contentStream, task, - self.resources); + self.resources, + /* stateManager = */ null, + normalizeWhitespace); }); }, diff --git a/src/core/evaluator.js b/src/core/evaluator.js index 7e80ecf42..20087751d 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -908,12 +908,15 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { }); }, - getTextContent: function PartialEvaluator_getTextContent(stream, task, - resources, - stateManager) { + getTextContent: + function PartialEvaluator_getTextContent(stream, task, resources, + stateManager, + normalizeWhitespace) { stateManager = (stateManager || new StateManager(new TextState())); + var WhitespaceRegexp = /\s/g; + var textContent = { items: [], styles: Object.create(null) @@ -1027,11 +1030,23 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { return textContentItem; } + function replaceWhitespace(str) { + // Replaces all whitespaces with standard spaces (0x20), to avoid + // alignment issues between the textLayer and the canvas if the text + // contains e.g. tabs (fixes issue6612.pdf). + var i = 0, ii = str.length, code; + while (i < ii && (code = str.charCodeAt(i)) >= 0x20 && code <= 0x7F) { + i++; + } + return (i < ii ? str.replace(WhitespaceRegexp, ' ') : str); + } + function runBidiTransform(textChunk) { var str = textChunk.str.join(''); var bidiResult = PDFJS.bidi(str, -1, textChunk.vertical); return { - str: bidiResult.str, + str: (normalizeWhitespace ? 
replaceWhitespace(bidiResult.str) : + bidiResult.str), dir: bidiResult.dir, width: textChunk.width, height: textChunk.height, @@ -1352,8 +1367,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { } return self.getTextContent(xobj, task, - xobj.dict.get('Resources') || resources, stateManager). - then(function (formTextContent) { + xobj.dict.get('Resources') || resources, stateManager, + normalizeWhitespace).then(function (formTextContent) { Util.appendToArray(textContent.items, formTextContent.items); Util.extendObj(textContent.styles, formTextContent.styles); stateManager.restore(); diff --git a/src/core/worker.js b/src/core/worker.js index 08fa18981..c45634004 100644 --- a/src/core/worker.js +++ b/src/core/worker.js @@ -517,12 +517,14 @@ var WorkerMessageHandler = PDFJS.WorkerMessageHandler = { handler.on('GetTextContent', function wphExtractText(data) { var pageIndex = data.pageIndex; + var normalizeWhitespace = data.normalizeWhitespace; return pdfManager.getPage(pageIndex).then(function(page) { var task = new WorkerTask('GetTextContent: page ' + pageIndex); startWorkerTask(task); var pageNum = pageIndex + 1; var start = Date.now(); - return page.extractTextContent(task).then(function(textContent) { + return page.extractTextContent(task, normalizeWhitespace).then( + function(textContent) { finishWorkerTask(task); info('text indexing: page=' + pageNum + ' - time=' + (Date.now() - start) + 'ms'); diff --git a/src/display/api.js b/src/display/api.js index e3aafa0eb..1b8dce16c 100644 --- a/src/display/api.js +++ b/src/display/api.js @@ -708,6 +708,14 @@ var PDFDocumentProxy = (function PDFDocumentProxyClosure() { return PDFDocumentProxy; })(); +/** + * Page getTextContent parameters. + * + * @typedef {Object} getTextContentParameters + * @property {boolean} normalizeWhitespace - replaces all occurrences of + * whitespace with standard spaces (0x20). The default value is `false`. + */ + /** * Page text content.
* @@ -986,12 +994,16 @@ var PDFPageProxy = (function PDFPageProxyClosure() { }, /** + * @param {getTextContentParameters} params - getTextContent parameters. * @return {Promise} That is resolved a {@link TextContent} * object that represent the page text content. */ - getTextContent: function PDFPageProxy_getTextContent() { + getTextContent: function PDFPageProxy_getTextContent(params) { + var normalizeWhitespace = (params && params.normalizeWhitespace) || false; + return this.transport.messageHandler.sendWithPromise('GetTextContent', { - pageIndex: this.pageNumber - 1 + pageIndex: this.pageNumber - 1, + normalizeWhitespace: normalizeWhitespace, }); }, diff --git a/test/driver.js b/test/driver.js index c41ec7012..a61084eb0 100644 --- a/test/driver.js +++ b/test/driver.js @@ -334,10 +334,12 @@ var Driver = (function DriverClosure() { textLayerContext.clearRect(0, 0, textLayerCanvas.width, textLayerCanvas.height); // The text builder will draw its content on the test canvas - initPromise = page.getTextContent().then(function(textContent) { - return rasterizeTextLayer(textLayerContext, viewport, - textContent); - }); + initPromise = + page.getTextContent({ normalizeWhitespace: true }).then( + function(textContent) { + return rasterizeTextLayer(textLayerContext, viewport, + textContent); + }); } else { textLayerCanvas = null; initPromise = Promise.resolve(); diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index 62a7a80a9..38a33eb86 100644 --- a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -49,6 +49,7 @@ !issue5280.pdf !issue5677.pdf !issue5954.pdf +!issue6612.pdf !alphatrans.pdf !devicen.pdf !cmykjpeg.pdf diff --git a/test/pdfs/issue6612.pdf b/test/pdfs/issue6612.pdf new file mode 100644 index 000000000..c9543f12d Binary files /dev/null and b/test/pdfs/issue6612.pdf differ diff --git a/test/test_manifest.json b/test/test_manifest.json index 1bb299cee..178e2f0b5 100644 --- a/test/test_manifest.json +++ b/test/test_manifest.json @@ -1271,6 +1271,13 @@ 
"link": false, "type": "eq" }, + { "id": "issue6612-text", + "file": "pdfs/issue6612.pdf", + "md5": "657f33236496916597cd70ef1222509a", + "rounds": 1, + "link": false, + "type": "text" + }, { "id": "zerowidthline", "file": "pdfs/zerowidthline.pdf", "md5": "295d26e61a85635433f8e4b768953f60", diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js index 9749942ff..714166a12 100644 --- a/test/unit/api_spec.js +++ b/test/unit/api_spec.js @@ -482,11 +482,21 @@ describe('api', function() { }); }); it('gets text content', function () { - var promise = page.getTextContent(); - waitsForPromiseResolved(promise, function (data) { - expect(!!data.items).toEqual(true); - expect(data.items.length).toEqual(7); - expect(!!data.styles).toEqual(true); + var defaultPromise = page.getTextContent(); + var normalizeWhitespacePromise = page.getTextContent({ + normalizeWhitespace: true }); + + var promises = [ + defaultPromise, + normalizeWhitespacePromise + ]; + waitsForPromiseResolved(Promise.all(promises), function (data) { + expect(!!data[0].items).toEqual(true); + expect(data[0].items.length).toEqual(7); + expect(!!data[0].styles).toEqual(true); + + // A simple check that ensures the two `textContent` objects match.
+ expect(JSON.stringify(data[0])).toEqual(JSON.stringify(data[1])); }); }); it('gets operator list', function() { diff --git a/web/pdf_find_controller.js b/web/pdf_find_controller.js index 6f264a86f..183db2d1c 100644 --- a/web/pdf_find_controller.js +++ b/web/pdf_find_controller.js @@ -66,7 +66,6 @@ var PDFFindController = (function PDFFindControllerClosure() { '\u00BC': '1/4', // Vulgar fraction one quarter '\u00BD': '1/2', // Vulgar fraction one half '\u00BE': '3/4', // Vulgar fraction three quarters - '\u00A0': ' ' // No-break space }; this.findBar = options.findBar || null; diff --git a/web/pdf_page_view.js b/web/pdf_page_view.js index bfa5875ae..440b31fb4 100644 --- a/web/pdf_page_view.js +++ b/web/pdf_page_view.js @@ -489,7 +489,7 @@ var PDFPageView = (function PDFPageViewClosure() { function pdfPageRenderCallback() { pageViewDrawCallback(null); if (textLayer) { - self.pdfPage.getTextContent().then( + self.pdfPage.getTextContent({ normalizeWhitespace: true }).then( function textContentResolved(textContent) { textLayer.setTextContent(textContent); textLayer.render(TEXT_LAYER_RENDER_DELAY); diff --git a/web/pdf_viewer.js b/web/pdf_viewer.js index e32d9c671..a9c3d3d73 100644 --- a/web/pdf_viewer.js +++ b/web/pdf_viewer.js @@ -471,7 +471,7 @@ var PDFViewer = (function pdfViewer() { if (!this.pdfDocument) { return; } - + var pageView = this._pages[pageNumber - 1]; if (this.isInPresentationMode) { @@ -729,7 +729,7 @@ var PDFViewer = (function pdfViewer() { getPageTextContent: function (pageIndex) { return this.pdfDocument.getPage(pageIndex + 1).then(function (page) { - return page.getTextContent(); + return page.getTextContent({ normalizeWhitespace: true }); }); },