diff --git a/src/core/document.js b/src/core/document.js index 2aea50098..79e0c2494 100644 --- a/src/core/document.js +++ b/src/core/document.js @@ -270,7 +270,7 @@ var Page = (function PageClosure() { }, extractTextContent({ handler, task, normalizeWhitespace, - combineTextItems, }) { + sink, combineTextItems, }) { var contentStreamPromise = this.pdfManager.ensure(this, 'getContentStream'); var resourcesPromise = this.loadResources([ @@ -298,6 +298,7 @@ var Page = (function PageClosure() { resources: this.resources, normalizeWhitespace, combineTextItems, + sink, }); }); }, diff --git a/src/core/evaluator.js b/src/core/evaluator.js index 41f0c636d..42c505e2d 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -1176,7 +1176,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { }, getTextContent({ stream, task, resources, stateManager = null, - normalizeWhitespace = false, combineTextItems = false, }) { + normalizeWhitespace = false, combineTextItems = false, + sink, seenStyles = Object.create(null), }) { // Ensure that `resources`/`stateManager` is correctly initialized, // even if the provided parameter is e.g. `null`. resources = resources || Dict.empty; @@ -1214,7 +1215,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { // The xobj is parsed iff it's needed, e.g. if there is a `DO` cmd. 
var xobjs = null; - var xobjsCache = Object.create(null); + var skipEmptyXObjs = Object.create(null); var preprocessor = new EvaluatorPreprocessor(stream, xref, stateManager); @@ -1225,7 +1226,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { return textContentItem; } var font = textState.font; - if (!(font.loadedName in textContent.styles)) { + if (!(font.loadedName in seenStyles)) { + seenStyles[font.loadedName] = true; textContent.styles[font.loadedName] = { fontFamily: font.fallbackName, ascent: font.ascent, @@ -1416,11 +1418,21 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { textContentItem.str.length = 0; } + function enqueueChunk() { + let length = textContent.items.length; + if (length > 0) { + sink.enqueue(textContent, length); + textContent.items = []; + textContent.styles = Object.create(null); + } + } + var timeSlotManager = new TimeSlotManager(); return new Promise(function promiseBody(resolve, reject) { - var next = function (promise) { - promise.then(function () { + let next = function (promise) { + enqueueChunk(); + Promise.all([promise, sink.ready]).then(function () { try { promiseBody(resolve, reject); } catch (ex) { @@ -1615,11 +1627,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { } var name = args[0].name; - if (xobjsCache.key === name) { - if (xobjsCache.texts) { - Util.appendToArray(textContent.items, xobjsCache.texts.items); - Util.extendObj(textContent.styles, xobjsCache.texts.styles); - } + if (name in skipEmptyXObjs) { break; } @@ -1633,8 +1641,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { assert(isName(type), 'XObject should have a Name subtype'); if (type.name !== 'Form') { - xobjsCache.key = name; - xobjsCache.texts = null; + skipEmptyXObjs[name] = true; break; } @@ -1650,6 +1657,26 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { xObjStateManager.transform(matrix); } + // Enqueue the `textContent` chunk before parsing the /Form + // XObject. 
+ enqueueChunk(); + let sinkWrapper = { + enqueueInvoked: false, + + enqueue(chunk, size) { + this.enqueueInvoked = true; + sink.enqueue(chunk, size); + }, + + get desiredSize() { + return sink.desiredSize; + }, + + get ready() { + return sink.ready; + }, + }; + next(self.getTextContent({ stream: xobj, task, @@ -1657,12 +1684,12 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { stateManager: xObjStateManager, normalizeWhitespace, combineTextItems, - }).then(function (formTextContent) { - Util.appendToArray(textContent.items, formTextContent.items); - Util.extendObj(textContent.styles, formTextContent.styles); - - xobjsCache.key = name; - xobjsCache.texts = formTextContent; + sink: sinkWrapper, + seenStyles, + }).then(function() { + if (!sinkWrapper.enqueueInvoked) { + skipEmptyXObjs[name] = true; + } })); return; case OPS.setGState: @@ -1686,20 +1713,27 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { } break; } // switch + if (textContent.items.length >= sink.desiredSize) { + // Wait for ready, if we reach highWaterMark. + stop = true; + break; + } } // while if (stop) { next(deferred); return; } flushTextContentItem(); - resolve(textContent); + enqueueChunk(); + resolve(); }).catch((reason) => { if (this.options.ignoreErrors) { // Error(s) in the TextContent -- allow text-extraction to continue. 
warn('getTextContent - ignoring errors during task: ' + task.name); flushTextContentItem(); - return textContent; + enqueueChunk(); + return; } throw reason; }); diff --git a/src/core/worker.js b/src/core/worker.js index 411834072..770ba111b 100644 --- a/src/core/worker.js +++ b/src/core/worker.js @@ -874,30 +874,36 @@ var WorkerMessageHandler = { }); }, this); - handler.on('GetTextContent', function wphExtractText(data) { + handler.on('GetTextContent', function wphExtractText(data, sink) { var pageIndex = data.pageIndex; - return pdfManager.getPage(pageIndex).then(function(page) { + sink.onPull = function (desiredSize) { }; + sink.onCancel = function (reason) { }; + + pdfManager.getPage(pageIndex).then(function(page) { var task = new WorkerTask('GetTextContent: page ' + pageIndex); startWorkerTask(task); var pageNum = pageIndex + 1; var start = Date.now(); - return page.extractTextContent({ + page.extractTextContent({ handler, task, + sink, normalizeWhitespace: data.normalizeWhitespace, combineTextItems: data.combineTextItems, - }).then(function(textContent) { + }).then(function() { finishWorkerTask(task); info('text indexing: page=' + pageNum + ' - time=' + (Date.now() - start) + 'ms'); - return textContent; + sink.close(); }, function (reason) { finishWorkerTask(task); if (task.terminated) { return; // ignoring errors from the terminated thread } - throw reason; + sink.error(reason); + // Already reported via `sink.error`; nothing consumes this promise, + // so re-throwing would only cause an unhandled rejection. }); }); diff --git a/src/display/api.js b/src/display/api.js index 28ea19c77..effecb128 100644 --- a/src/display/api.js +++ b/src/display/api.js @@ -954,6 +954,24 @@ var PDFPageProxy = (function PDFPageProxyClosure() { return intentState.opListReadCapability.promise; }, + /** + * @param {getTextContentParameters} params - getTextContent parameters. + * @return {ReadableStream} ReadableStream to read textContent chunks. 
+ */ + streamTextContent(params = {}) { + const TEXT_CONTENT_CHUNK_SIZE = 100; + return this.transport.messageHandler.sendWithStream('GetTextContent', { + pageIndex: this.pageNumber - 1, + normalizeWhitespace: (params.normalizeWhitespace === true), + combineTextItems: (params.disableCombineTextItems !== true), + }, { + highWaterMark: TEXT_CONTENT_CHUNK_SIZE, + size(textContent) { + return textContent.items.length; + }, + }); + }, + /** * @param {getTextContentParameters} params - getTextContent parameters. * @return {Promise} That is resolved a {@link TextContent} @@ -961,10 +979,28 @@ var PDFPageProxy = (function PDFPageProxyClosure() { */ getTextContent: function PDFPageProxy_getTextContent(params) { params = params || {}; - return this.transport.messageHandler.sendWithPromise('GetTextContent', { - pageIndex: this.pageNumber - 1, - normalizeWhitespace: (params.normalizeWhitespace === true), - combineTextItems: (params.disableCombineTextItems !== true), + let readableStream = this.streamTextContent(params); + + return new Promise(function(resolve, reject) { + function pump() { + reader.read().then(function({ value, done, }) { + if (done) { + resolve(textContent); + return; + } + Util.extendObj(textContent.styles, value.styles); + Util.appendToArray(textContent.items, value.items); + pump(); + }, reject); + } + + let reader = readableStream.getReader(); + let textContent = { + items: [], + styles: Object.create(null), + }; + + pump(); }); }, diff --git a/src/display/text_layer.js b/src/display/text_layer.js index 71727ee43..61c08edaa 100644 --- a/src/display/text_layer.js +++ b/src/display/text_layer.js @@ -20,14 +20,20 @@ import { CustomStyle, getDefaultSetting } from './dom_utils'; * Text layer render parameters. * * @typedef {Object} TextLayerRenderParameters - * @property {TextContent} textContent - Text content to render (the object is - * returned by the page's getTextContent() method). 
+ * @property {TextContent} textContent - (optional) Text content to render + * (the object is returned by the page's getTextContent() method). + * @property {ReadableStream} textContentStream - (optional) Text content + * stream to render (the stream is returned by the page's + * streamTextContent() method). * @property {HTMLElement} container - HTML element that will contain text runs. * @property {PageViewport} viewport - The target viewport to properly * layout the text runs. * @property {Array} textDivs - (optional) HTML elements that are correspond * the text items of the textContent input. This is output and shall be * initially be set to empty array. + * @property {Array} textContentItemsStr - (optional) Strings that correspond + * to the `str` property of the text items of the textContent input. This is + * output and shall initially be set to an empty array. * @property {number} timeout - (optional) Delay in milliseconds before * rendering of the text runs occurs. * @property {boolean} enhanceTextSelection - (optional) Whether to turn on the @@ -122,6 +128,9 @@ var renderTextLayer = (function renderTextLayerClosure() { } } task._textDivProperties.set(textDiv, textDivProperties); + if (task._textContentStream) { + task._layoutText(textDiv); + } if (task._enhanceTextSelection) { var angleCos = 1, angleSin = 0; @@ -157,7 +166,6 @@ var renderTextLayer = (function renderTextLayerClosure() { if (task._canceled) { return; } - var textLayerFrag = task._container; var textDivs = task._textDivs; var capability = task._capability; var textDivsLength = textDivs.length; @@ -170,50 +178,12 @@ var renderTextLayer = (function renderTextLayerClosure() { return; } - // The temporary canvas is used to measure text length in the DOM. 
- var canvas = document.createElement('canvas'); - if (typeof PDFJSDev === 'undefined' || - PDFJSDev.test('FIREFOX || MOZCENTRAL || GENERIC')) { - canvas.mozOpaque = true; + if (!task._textContentStream) { + for (var i = 0; i < textDivsLength; i++) { + task._layoutText(textDivs[i]); + } } - var ctx = canvas.getContext('2d', { alpha: false, }); - var lastFontSize; - var lastFontFamily; - for (var i = 0; i < textDivsLength; i++) { - var textDiv = textDivs[i]; - var textDivProperties = task._textDivProperties.get(textDiv); - if (textDivProperties.isWhitespace) { - continue; - } - - var fontSize = textDiv.style.fontSize; - var fontFamily = textDiv.style.fontFamily; - - // Only build font string and set to context if different from last. - if (fontSize !== lastFontSize || fontFamily !== lastFontFamily) { - ctx.font = fontSize + ' ' + fontFamily; - lastFontSize = fontSize; - lastFontFamily = fontFamily; - } - - var width = ctx.measureText(textDiv.textContent).width; - textLayerFrag.appendChild(textDiv); - - var transform = ''; - if (textDivProperties.canvasWidth !== 0 && width > 0) { - textDivProperties.scale = textDivProperties.canvasWidth / width; - transform = 'scaleX(' + textDivProperties.scale + ')'; - } - if (textDivProperties.angle !== 0) { - transform = 'rotate(' + textDivProperties.angle + 'deg) ' + transform; - } - if (transform !== '') { - textDivProperties.originalTransform = transform; - CustomStyle.setProp('transform', textDiv, transform); - } - task._textDivProperties.set(textDiv, textDivProperties); - } task._renderingDone = true; capability.resolve(); } @@ -499,19 +469,27 @@ var renderTextLayer = (function renderTextLayerClosure() { * @param {boolean} enhanceTextSelection * @private */ - function TextLayerRenderTask(textContent, container, viewport, textDivs, - enhanceTextSelection) { + function TextLayerRenderTask({ textContent, textContentStream, container, + viewport, textDivs, textContentItemsStr, + enhanceTextSelection, }) { this._textContent = 
textContent; + this._textContentStream = textContentStream; this._container = container; this._viewport = viewport; this._textDivs = textDivs || []; + this._textContentItemsStr = textContentItemsStr || []; + this._enhanceTextSelection = !!enhanceTextSelection; + + this._reader = null; + this._layoutTextLastFontSize = null; + this._layoutTextLastFontFamily = null; + this._layoutTextCtx = null; this._textDivProperties = new WeakMap(); this._renderingDone = false; this._canceled = false; this._capability = createPromiseCapability(); this._renderTimer = null; this._bounds = []; - this._enhanceTextSelection = !!enhanceTextSelection; } TextLayerRenderTask.prototype = { get promise() { @@ -519,6 +497,10 @@ var renderTextLayer = (function renderTextLayerClosure() { }, cancel: function TextLayer_cancel() { + if (this._reader) { + this._reader.cancel(); + this._reader = null; + } this._canceled = true; if (this._renderTimer !== null) { clearTimeout(this._renderTimer); @@ -527,21 +509,100 @@ var renderTextLayer = (function renderTextLayerClosure() { this._capability.reject('canceled'); }, - _render: function TextLayer_render(timeout) { - var textItems = this._textContent.items; - var textStyles = this._textContent.styles; - for (var i = 0, len = textItems.length; i < len; i++) { - appendText(this, textItems[i], textStyles); + _processItems(items, styleCache) { + for (let i = 0, len = items.length; i < len; i++) { + this._textContentItemsStr.push(items[i].str); + appendText(this, items[i], styleCache); + } + }, + + _layoutText(textDiv) { + let textLayerFrag = this._container; + + let textDivProperties = this._textDivProperties.get(textDiv); + if (textDivProperties.isWhitespace) { + return; } - if (!timeout) { // Render right away - render(this); - } else { // Schedule - this._renderTimer = setTimeout(() => { - render(this); - this._renderTimer = null; - }, timeout); + let fontSize = textDiv.style.fontSize; + let fontFamily = textDiv.style.fontFamily; + + // Only build font 
string and set to context if different from last. + if (fontSize !== this._layoutTextLastFontSize || + fontFamily !== this._layoutTextLastFontFamily) { + this._layoutTextCtx.font = fontSize + ' ' + fontFamily; + this._layoutTextLastFontSize = fontSize; + this._layoutTextLastFontFamily = fontFamily; + } + + let width = this._layoutTextCtx.measureText(textDiv.textContent).width; + + let transform = ''; + if (textDivProperties.canvasWidth !== 0 && width > 0) { + textDivProperties.scale = textDivProperties.canvasWidth / width; + transform = 'scaleX(' + textDivProperties.scale + ')'; + } + if (textDivProperties.angle !== 0) { + transform = 'rotate(' + textDivProperties.angle + 'deg) ' + transform; + } + if (transform !== '') { + textDivProperties.originalTransform = transform; + CustomStyle.setProp('transform', textDiv, transform); + } + this._textDivProperties.set(textDiv, textDivProperties); + textLayerFrag.appendChild(textDiv); + }, + + _render: function TextLayer_render(timeout) { + let capability = createPromiseCapability(); + let styleCache = Object.create(null); + + // The temporary canvas is used to measure text length in the DOM. 
+ let canvas = document.createElement('canvas'); + if (typeof PDFJSDev === 'undefined' || + PDFJSDev.test('FIREFOX || MOZCENTRAL || GENERIC')) { + canvas.mozOpaque = true; + } + this._layoutTextCtx = canvas.getContext('2d', { alpha: false, }); + + if (this._textContent) { + let textItems = this._textContent.items; + let textStyles = this._textContent.styles; + this._processItems(textItems, textStyles); + capability.resolve(); + } else if (this._textContentStream) { + let pump = () => { + this._reader.read().then(({ value, done, }) => { + if (done) { + capability.resolve(); + return; + } + + Util.extendObj(styleCache, value.styles); + this._processItems(value.items, styleCache); + pump(); + + }, capability.reject); + }; + + this._reader = this._textContentStream.getReader(); + pump(); + } else { + throw new Error('Neither "textContent" nor "textContentStream"' + + ' parameters specified.'); + } + + capability.promise.then(() => { + styleCache = null; + if (!timeout) { // Render right away + render(this); + } else { // Schedule + this._renderTimer = setTimeout(() => { + render(this); + this._renderTimer = null; + }, timeout); + } + }, this._capability.reject); }, expandTextDivs: function TextLayer_expandTextDivs(expandDivs) { @@ -610,11 +671,15 @@ var renderTextLayer = (function renderTextLayerClosure() { * @returns {TextLayerRenderTask} */ function renderTextLayer(renderParameters) { - var task = new TextLayerRenderTask(renderParameters.textContent, - renderParameters.container, - renderParameters.viewport, - renderParameters.textDivs, - renderParameters.enhanceTextSelection); + var task = new TextLayerRenderTask({ + textContent: renderParameters.textContent, + textContentStream: renderParameters.textContentStream, + container: renderParameters.container, + viewport: renderParameters.viewport, + textDivs: renderParameters.textDivs, + textContentItemsStr: renderParameters.textContentItemsStr, + enhanceTextSelection: renderParameters.enhanceTextSelection, + }); 
task._render(renderParameters.timeout); return task; } diff --git a/src/shared/util.js b/src/shared/util.js index e338adbe6..7cd30a027 100644 --- a/src/shared/util.js +++ b/src/shared/util.js @@ -1378,6 +1378,7 @@ MessageHandler.prototype = { this.streamControllers[streamId] = { controller, startCall: startCapability, + isClosed: false, }; this.postMessage({ sourceName, @@ -1409,6 +1410,7 @@ MessageHandler.prototype = { cancel: (reason) => { let cancelCapability = createPromiseCapability(); this.streamControllers[streamId].cancelCall = cancelCapability; + this.streamControllers[streamId].isClosed = true; this.postMessage({ sourceName, targetName, @@ -1532,9 +1534,15 @@ MessageHandler.prototype = { }); break; case 'enqueue': - this.streamControllers[data.streamId].controller.enqueue(data.chunk); + if (!this.streamControllers[data.streamId].isClosed) { + this.streamControllers[data.streamId].controller.enqueue(data.chunk); + } break; case 'close': + if (this.streamControllers[data.streamId].isClosed) { + break; + } + this.streamControllers[data.streamId].isClosed = true; this.streamControllers[data.streamId].controller.close(); deleteStreamController(); break; @@ -1548,6 +1556,9 @@ MessageHandler.prototype = { deleteStreamController(); break; case 'cancel': + if (!this.streamSinks[data.streamId]) { + break; + } resolveCall(this.streamSinks[data.streamId].onCancel, [data.reason]).then(() => { sendStreamResponse({ stream: 'cancel_complete', success: true, }); diff --git a/web/pdf_page_view.js b/web/pdf_page_view.js index f61a4f192..d6b499ecf 100644 --- a/web/pdf_page_view.js +++ b/web/pdf_page_view.js @@ -24,8 +24,6 @@ import { import { getGlobalEventBus } from './dom_events'; import { RenderingStates } from './pdf_rendering_queue'; -const TEXT_LAYER_RENDER_DELAY = 200; // ms - /** * @typedef {Object} PDFPageViewOptions * @property {HTMLDivElement} container - The viewer element. 
@@ -444,12 +442,11 @@ class PDFPageView { let resultPromise = paintTask.promise.then(function() { return finishPaintTask(null).then(function () { if (textLayer) { - pdfPage.getTextContent({ + let readableStream = pdfPage.streamTextContent({ normalizeWhitespace: true, - }).then(function textContentResolved(textContent) { - textLayer.setTextContent(textContent); - textLayer.render(TEXT_LAYER_RENDER_DELAY); }); + textLayer.setTextContentStream(readableStream); + textLayer.render(); } }); }, function(reason) { diff --git a/web/text_layer_builder.js b/web/text_layer_builder.js index 66376d4f8..fa5c752d1 100644 --- a/web/text_layer_builder.js +++ b/web/text_layer_builder.js @@ -41,6 +41,8 @@ var TextLayerBuilder = (function TextLayerBuilderClosure() { this.textLayerDiv = options.textLayerDiv; this.eventBus = options.eventBus || getGlobalEventBus(); this.textContent = null; + this.textContentItemsStr = []; + this.textContentStream = null; this.renderingDone = false; this.pageIdx = options.pageIndex; this.pageNumber = this.pageIdx + 1; @@ -79,7 +81,7 @@ var TextLayerBuilder = (function TextLayerBuilderClosure() { * for specified amount of ms. 
*/ render: function TextLayerBuilder_render(timeout) { - if (!this.textContent || this.renderingDone) { + if (!(this.textContent || this.textContentStream) || this.renderingDone) { return; } this.cancel(); @@ -88,9 +90,11 @@ var TextLayerBuilder = (function TextLayerBuilderClosure() { var textLayerFrag = document.createDocumentFragment(); this.textLayerRenderTask = renderTextLayer({ textContent: this.textContent, + textContentStream: this.textContentStream, container: textLayerFrag, viewport: this.viewport, textDivs: this.textDivs, + textContentItemsStr: this.textContentItemsStr, timeout, enhanceTextSelection: this.enhanceTextSelection, }); @@ -113,6 +117,11 @@ var TextLayerBuilder = (function TextLayerBuilderClosure() { } }, + setTextContentStream(readableStream) { + this.cancel(); + this.textContentStream = readableStream; + }, + setTextContent: function TextLayerBuilder_setTextContent(textContent) { this.cancel(); this.textContent = textContent; @@ -122,8 +131,8 @@ var TextLayerBuilder = (function TextLayerBuilderClosure() { matchesLength) { var i = 0; var iIndex = 0; - var bidiTexts = this.textContent.items; - var end = bidiTexts.length - 1; + let textContentItemsStr = this.textContentItemsStr; + var end = textContentItemsStr.length - 1; var queryLen = (this.findController === null ? 0 : this.findController.state.query.length); var ret = []; @@ -135,12 +144,13 @@ var TextLayerBuilder = (function TextLayerBuilderClosure() { var matchIdx = matches[m]; // Loop over the divIdxs. 
- while (i !== end && matchIdx >= (iIndex + bidiTexts[i].str.length)) { - iIndex += bidiTexts[i].str.length; + while (i !== end && matchIdx >= + (iIndex + textContentItemsStr[i].length)) { + iIndex += textContentItemsStr[i].length; i++; } - if (i === bidiTexts.length) { + if (i === textContentItemsStr.length) { console.error('Could not find a matching mapping'); } @@ -160,8 +170,9 @@ var TextLayerBuilder = (function TextLayerBuilderClosure() { // Somewhat the same array as above, but use > instead of >= to get // the end position right. - while (i !== end && matchIdx > (iIndex + bidiTexts[i].str.length)) { - iIndex += bidiTexts[i].str.length; + while (i !== end && matchIdx > + (iIndex + textContentItemsStr[i].length)) { + iIndex += textContentItemsStr[i].length; i++; } @@ -181,7 +192,7 @@ var TextLayerBuilder = (function TextLayerBuilderClosure() { return; } - var bidiTexts = this.textContent.items; + let textContentItemsStr = this.textContentItemsStr; var textDivs = this.textDivs; var prevEnd = null; var pageIdx = this.pageIdx; @@ -204,7 +215,8 @@ var TextLayerBuilder = (function TextLayerBuilderClosure() { function appendTextToDiv(divIdx, fromOffset, toOffset, className) { var div = textDivs[divIdx]; - var content = bidiTexts[divIdx].str.substring(fromOffset, toOffset); + var content = + textContentItemsStr[divIdx].substring(fromOffset, toOffset); var node = document.createTextNode(content); if (className) { var span = document.createElement('span'); @@ -277,7 +289,7 @@ var TextLayerBuilder = (function TextLayerBuilderClosure() { // Clear all matches. var matches = this.matches; var textDivs = this.textDivs; - var bidiTexts = this.textContent.items; + let textContentItemsStr = this.textContentItemsStr; var clearedUntilDivIdx = -1; // Clear all current matches. 
@@ -286,7 +298,7 @@ var TextLayerBuilder = (function TextLayerBuilderClosure() { var begin = Math.max(clearedUntilDivIdx, match.begin.divIdx); for (var n = begin, end = match.end.divIdx; n <= end; n++) { var div = textDivs[n]; - div.textContent = bidiTexts[n].str; + div.textContent = textContentItemsStr[n]; div.className = ''; } clearedUntilDivIdx = match.end.divIdx + 1;