Adds Streams API support to getTextContent to stream data.

This patch adds Streams API support to getTextContent
so that we can stream the data in chunks instead of fetching
all of it from the worker thread to the main thread at once.
The Streams API is supported without changing the core
functionality of getTextContent.

Enqueues textContent chunks directly from getTextContent in PartialEvaluator.

Adds the desiredSize and ready properties to streamSink.
This commit is contained in:
Mukul Mishra 2017-04-17 18:16:53 +05:30
parent 209751346c
commit 0c13d0ff46
8 changed files with 275 additions and 114 deletions

View File

@ -271,7 +271,7 @@ var Page = (function PageClosure() {
},
extractTextContent({ handler, task, normalizeWhitespace,
combineTextItems, }) {
sink, combineTextItems, }) {
var contentStreamPromise = this.pdfManager.ensure(this,
'getContentStream');
var resourcesPromise = this.loadResources([
@ -299,6 +299,7 @@ var Page = (function PageClosure() {
resources: this.resources,
normalizeWhitespace,
combineTextItems,
sink,
});
});
},

View File

@ -1176,7 +1176,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
},
getTextContent({ stream, task, resources, stateManager = null,
normalizeWhitespace = false, combineTextItems = false, }) {
normalizeWhitespace = false, combineTextItems = false,
sink, seenStyles = Object.create(null), }) {
// Ensure that `resources`/`stateManager` is correctly initialized,
// even if the provided parameter is e.g. `null`.
resources = resources || Dict.empty;
@ -1214,7 +1215,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
// The xobj is parsed iff it's needed, e.g. if there is a `DO` cmd.
var xobjs = null;
var xobjsCache = Object.create(null);
var skipEmptyXObjs = Object.create(null);
var preprocessor = new EvaluatorPreprocessor(stream, xref, stateManager);
@ -1225,7 +1226,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
return textContentItem;
}
var font = textState.font;
if (!(font.loadedName in textContent.styles)) {
if (!(font.loadedName in seenStyles)) {
seenStyles[font.loadedName] = true;
textContent.styles[font.loadedName] = {
fontFamily: font.fallbackName,
ascent: font.ascent,
@ -1416,11 +1418,21 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
textContentItem.str.length = 0;
}
/**
 * Hands the text items collected so far to `sink` as one chunk and then
 * resets the accumulator so that the next chunk starts out empty.
 * Nothing is enqueued while `textContent.items` is empty.
 */
function enqueueChunk() {
  const pendingItemCount = textContent.items.length;
  if (pendingItemCount <= 0) {
    return;
  }
  // The number of items doubles as the chunk size reported to the sink.
  sink.enqueue(textContent, pendingItemCount);
  textContent.items = [];
  textContent.styles = Object.create(null);
}
var timeSlotManager = new TimeSlotManager();
return new Promise(function promiseBody(resolve, reject) {
var next = function (promise) {
promise.then(function () {
let next = function (promise) {
enqueueChunk();
Promise.all([promise, sink.ready]).then(function () {
try {
promiseBody(resolve, reject);
} catch (ex) {
@ -1615,11 +1627,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
}
var name = args[0].name;
if (xobjsCache.key === name) {
if (xobjsCache.texts) {
Util.appendToArray(textContent.items, xobjsCache.texts.items);
Util.extendObj(textContent.styles, xobjsCache.texts.styles);
}
if (name in skipEmptyXObjs) {
break;
}
@ -1633,8 +1641,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
assert(isName(type), 'XObject should have a Name subtype');
if (type.name !== 'Form') {
xobjsCache.key = name;
xobjsCache.texts = null;
skipEmptyXObjs[name] = true;
break;
}
@ -1650,6 +1657,26 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
xObjStateManager.transform(matrix);
}
// Enqueue the `textContent` chunk before parsing the /Form
// XObject.
enqueueChunk();
// Proxy around `sink` that is handed to the nested `getTextContent` call.
// It forwards everything to the real sink and additionally records whether
// anything was enqueued through it.
let sinkWrapper = {
  enqueueInvoked: false,

  enqueue(textChunk, chunkSize) {
    // Remember that at least one chunk went through this wrapper.
    this.enqueueInvoked = true;
    sink.enqueue(textChunk, chunkSize);
  },

  // Both getters read through to the underlying sink on every access.
  get desiredSize() {
    return sink.desiredSize;
  },

  get ready() {
    return sink.ready;
  },
};
next(self.getTextContent({
stream: xobj,
task,
@ -1657,12 +1684,12 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
stateManager: xObjStateManager,
normalizeWhitespace,
combineTextItems,
}).then(function (formTextContent) {
Util.appendToArray(textContent.items, formTextContent.items);
Util.extendObj(textContent.styles, formTextContent.styles);
xobjsCache.key = name;
xobjsCache.texts = formTextContent;
sink: sinkWrapper,
seenStyles,
}).then(function() {
if (!sinkWrapper.enqueueInvoked) {
skipEmptyXObjs[name] = true;
}
}));
return;
case OPS.setGState:
@ -1686,20 +1713,27 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
}
break;
} // switch
if (textContent.items.length >= sink.desiredSize) {
// Wait for ready, if we reach highWaterMark.
stop = true;
break;
}
} // while
if (stop) {
next(deferred);
return;
}
flushTextContentItem();
resolve(textContent);
enqueueChunk();
resolve();
}).catch((reason) => {
if (this.options.ignoreErrors) {
// Error(s) in the TextContent -- allow text-extraction to continue.
warn('getTextContent - ignoring errors during task: ' + task.name);
flushTextContentItem();
return textContent;
enqueueChunk();
return;
}
throw reason;
});

View File

@ -874,30 +874,35 @@ var WorkerMessageHandler = {
});
}, this);
handler.on('GetTextContent', function wphExtractText(data) {
handler.on('GetTextContent', function wphExtractText(data, sink) {
var pageIndex = data.pageIndex;
return pdfManager.getPage(pageIndex).then(function(page) {
sink.onPull = function (desiredSize) { };
sink.onCancel = function (reason) { };
pdfManager.getPage(pageIndex).then(function(page) {
var task = new WorkerTask('GetTextContent: page ' + pageIndex);
startWorkerTask(task);
var pageNum = pageIndex + 1;
var start = Date.now();
return page.extractTextContent({
page.extractTextContent({
handler,
task,
sink,
normalizeWhitespace: data.normalizeWhitespace,
combineTextItems: data.combineTextItems,
}).then(function(textContent) {
}).then(function() {
finishWorkerTask(task);
info('text indexing: page=' + pageNum + ' - time=' +
(Date.now() - start) + 'ms');
return textContent;
sink.close();
}, function (reason) {
finishWorkerTask(task);
if (task.terminated) {
return; // ignoring errors from the terminated thread
}
sink.error(reason);
throw reason;
});
});

View File

@ -950,6 +950,24 @@ var PDFPageProxy = (function PDFPageProxyClosure() {
return intentState.opListReadCapability.promise;
},
/**
* @param {getTextContentParameters} params - getTextContent parameters.
* @return {ReadableStream} ReadableStream to read textContent chunks.
*/
streamTextContent(params = {}) {
const TEXT_CONTENT_CHUNK_SIZE = 100;
return this.transport.messageHandler.sendWithStream('GetTextContent', {
pageIndex: this.pageNumber - 1,
normalizeWhitespace: (params.normalizeWhitespace === true),
combineTextItems: (params.disableCombineTextItems !== true),
}, {
highWaterMark: TEXT_CONTENT_CHUNK_SIZE,
size(textContent) {
return textContent.items.length;
},
});
},
/**
* @param {getTextContentParameters} params - getTextContent parameters.
* @return {Promise} That is resolved a {@link TextContent}
@ -957,10 +975,28 @@ var PDFPageProxy = (function PDFPageProxyClosure() {
*/
getTextContent: function PDFPageProxy_getTextContent(params) {
params = params || {};
return this.transport.messageHandler.sendWithPromise('GetTextContent', {
pageIndex: this.pageNumber - 1,
normalizeWhitespace: (params.normalizeWhitespace === true),
combineTextItems: (params.disableCombineTextItems !== true),
let readableStream = this.streamTextContent(params);
return new Promise(function(resolve, reject) {
function pump() {
reader.read().then(function({ value, done, }) {
if (done) {
resolve(textContent);
return;
}
Util.extendObj(textContent.styles, value.styles);
Util.appendToArray(textContent.items, value.items);
pump();
}, reject);
}
let reader = readableStream.getReader();
let textContent = {
items: [],
styles: Object.create(null),
};
pump();
});
},

View File

@ -20,14 +20,20 @@ import { CustomStyle, getDefaultSetting } from './dom_utils';
* Text layer render parameters.
*
* @typedef {Object} TextLayerRenderParameters
* @property {TextContent} textContent - Text content to render (the object is
* returned by the page's getTextContent() method).
* @property {TextContent} textContent - (optional) Text content to render
* (the object is returned by the page's getTextContent() method).
* @property {ReadableStream} textContentStream - (optional) Text content
* stream to render (the stream is returned by the page's
* streamTextContent() method).
* @property {HTMLElement} container - HTML element that will contain text runs.
* @property {PageViewport} viewport - The target viewport to properly
* layout the text runs.
* @property {Array} textDivs - (optional) HTML elements that correspond to
* the text items of the textContent input. This is output and shall
* initially be set to an empty array.
* @property {Array} textContentItemsStr - (optional) Strings that correspond
* to the `str` property of the text items of the textContent input. This is
* output and shall initially be set to an empty array.
* @property {number} timeout - (optional) Delay in milliseconds before
* rendering of the text runs occurs.
* @property {boolean} enhanceTextSelection - (optional) Whether to turn on the
@ -122,6 +128,9 @@ var renderTextLayer = (function renderTextLayerClosure() {
}
}
task._textDivProperties.set(textDiv, textDivProperties);
if (task._textContentStream) {
task._layoutText(textDiv);
}
if (task._enhanceTextSelection) {
var angleCos = 1, angleSin = 0;
@ -157,7 +166,6 @@ var renderTextLayer = (function renderTextLayerClosure() {
if (task._canceled) {
return;
}
var textLayerFrag = task._container;
var textDivs = task._textDivs;
var capability = task._capability;
var textDivsLength = textDivs.length;
@ -170,50 +178,12 @@ var renderTextLayer = (function renderTextLayerClosure() {
return;
}
// The temporary canvas is used to measure text length in the DOM.
var canvas = document.createElement('canvas');
if (typeof PDFJSDev === 'undefined' ||
PDFJSDev.test('FIREFOX || MOZCENTRAL || GENERIC')) {
canvas.mozOpaque = true;
if (!task._textContentStream) {
for (var i = 0; i < textDivsLength; i++) {
task._layoutText(textDivs[i]);
}
}
var ctx = canvas.getContext('2d', { alpha: false, });
var lastFontSize;
var lastFontFamily;
for (var i = 0; i < textDivsLength; i++) {
var textDiv = textDivs[i];
var textDivProperties = task._textDivProperties.get(textDiv);
if (textDivProperties.isWhitespace) {
continue;
}
var fontSize = textDiv.style.fontSize;
var fontFamily = textDiv.style.fontFamily;
// Only build font string and set to context if different from last.
if (fontSize !== lastFontSize || fontFamily !== lastFontFamily) {
ctx.font = fontSize + ' ' + fontFamily;
lastFontSize = fontSize;
lastFontFamily = fontFamily;
}
var width = ctx.measureText(textDiv.textContent).width;
textLayerFrag.appendChild(textDiv);
var transform = '';
if (textDivProperties.canvasWidth !== 0 && width > 0) {
textDivProperties.scale = textDivProperties.canvasWidth / width;
transform = 'scaleX(' + textDivProperties.scale + ')';
}
if (textDivProperties.angle !== 0) {
transform = 'rotate(' + textDivProperties.angle + 'deg) ' + transform;
}
if (transform !== '') {
textDivProperties.originalTransform = transform;
CustomStyle.setProp('transform', textDiv, transform);
}
task._textDivProperties.set(textDiv, textDivProperties);
}
task._renderingDone = true;
capability.resolve();
}
@ -499,19 +469,27 @@ var renderTextLayer = (function renderTextLayerClosure() {
* @param {boolean} enhanceTextSelection
* @private
*/
function TextLayerRenderTask(textContent, container, viewport, textDivs,
enhanceTextSelection) {
function TextLayerRenderTask({ textContent, textContentStream, container,
viewport, textDivs, textContentItemsStr,
enhanceTextSelection, }) {
this._textContent = textContent;
this._textContentStream = textContentStream;
this._container = container;
this._viewport = viewport;
this._textDivs = textDivs || [];
this._textContentItemsStr = textContentItemsStr || [];
this._enhanceTextSelection = !!enhanceTextSelection;
this._reader = null;
this._layoutTextLastFontSize = null;
this._layoutTextLastFontFamily = null;
this._layoutTextCtx = null;
this._textDivProperties = new WeakMap();
this._renderingDone = false;
this._canceled = false;
this._capability = createPromiseCapability();
this._renderTimer = null;
this._bounds = [];
this._enhanceTextSelection = !!enhanceTextSelection;
}
TextLayerRenderTask.prototype = {
get promise() {
@ -519,6 +497,10 @@ var renderTextLayer = (function renderTextLayerClosure() {
},
cancel: function TextLayer_cancel() {
if (this._reader) {
this._reader.cancel();
this._reader = null;
}
this._canceled = true;
if (this._renderTimer !== null) {
clearTimeout(this._renderTimer);
@ -527,21 +509,100 @@ var renderTextLayer = (function renderTextLayerClosure() {
this._capability.reject('canceled');
},
_render: function TextLayer_render(timeout) {
var textItems = this._textContent.items;
var textStyles = this._textContent.styles;
for (var i = 0, len = textItems.length; i < len; i++) {
appendText(this, textItems[i], textStyles);
_processItems(items, styleCache) {
for (let i = 0, len = items.length; i < len; i++) {
this._textContentItemsStr.push(items[i].str);
appendText(this, items[i], styleCache);
}
},
_layoutText(textDiv) {
let textLayerFrag = this._container;
let textDivProperties = this._textDivProperties.get(textDiv);
if (textDivProperties.isWhitespace) {
return;
}
if (!timeout) { // Render right away
render(this);
} else { // Schedule
this._renderTimer = setTimeout(() => {
render(this);
this._renderTimer = null;
}, timeout);
let fontSize = textDiv.style.fontSize;
let fontFamily = textDiv.style.fontFamily;
// Only build font string and set to context if different from last.
if (fontSize !== this._layoutTextLastFontSize ||
fontFamily !== this._layoutTextLastFontFamily) {
this._layoutTextCtx.font = fontSize + ' ' + fontFamily;
this._lastFontSize = fontSize;
this._lastFontFamily = fontFamily;
}
let width = this._layoutTextCtx.measureText(textDiv.textContent).width;
let transform = '';
if (textDivProperties.canvasWidth !== 0 && width > 0) {
textDivProperties.scale = textDivProperties.canvasWidth / width;
transform = 'scaleX(' + textDivProperties.scale + ')';
}
if (textDivProperties.angle !== 0) {
transform = 'rotate(' + textDivProperties.angle + 'deg) ' + transform;
}
if (transform !== '') {
textDivProperties.originalTransform = transform;
CustomStyle.setProp('transform', textDiv, transform);
}
this._textDivProperties.set(textDiv, textDivProperties);
textLayerFrag.appendChild(textDiv);
},
_render: function TextLayer_render(timeout) {
let capability = createPromiseCapability();
let styleCache = Object.create(null);
// The temporary canvas is used to measure text length in the DOM.
let canvas = document.createElement('canvas');
if (typeof PDFJSDev === 'undefined' ||
PDFJSDev.test('FIREFOX || MOZCENTRAL || GENERIC')) {
canvas.mozOpaque = true;
}
this._layoutTextCtx = canvas.getContext('2d', { alpha: false, });
if (this._textContent) {
let textItems = this._textContent.items;
let textStyles = this._textContent.styles;
this._processItems(textItems, textStyles);
capability.resolve();
} else if (this._textContentStream) {
let pump = () => {
this._reader.read().then(({ value, done, }) => {
if (done) {
capability.resolve();
return;
}
Util.extendObj(styleCache, value.styles);
this._processItems(value.items, styleCache);
pump();
}, capability.reject);
};
this._reader = this._textContentStream.getReader();
pump();
} else {
throw new Error('Neither "textContent" nor "textContentStream"' +
' parameters specified.');
}
capability.promise.then(() => {
styleCache = null;
if (!timeout) { // Render right away
render(this);
} else { // Schedule
this._renderTimer = setTimeout(() => {
render(this);
this._renderTimer = null;
}, timeout);
}
}, this._capability.reject);
},
expandTextDivs: function TextLayer_expandTextDivs(expandDivs) {
@ -610,11 +671,15 @@ var renderTextLayer = (function renderTextLayerClosure() {
* @returns {TextLayerRenderTask}
*/
function renderTextLayer(renderParameters) {
var task = new TextLayerRenderTask(renderParameters.textContent,
renderParameters.container,
renderParameters.viewport,
renderParameters.textDivs,
renderParameters.enhanceTextSelection);
var task = new TextLayerRenderTask({
textContent: renderParameters.textContent,
textContentStream: renderParameters.textContentStream,
container: renderParameters.container,
viewport: renderParameters.viewport,
textDivs: renderParameters.textDivs,
textContentItemsStr: renderParameters.textContentItemsStr,
enhanceTextSelection: renderParameters.enhanceTextSelection,
});
task._render(renderParameters.timeout);
return task;
}

View File

@ -1378,6 +1378,7 @@ MessageHandler.prototype = {
this.streamControllers[streamId] = {
controller,
startCall: startCapability,
isClosed: false,
};
this.postMessage({
sourceName,
@ -1409,6 +1410,7 @@ MessageHandler.prototype = {
cancel: (reason) => {
let cancelCapability = createPromiseCapability();
this.streamControllers[streamId].cancelCall = cancelCapability;
this.streamControllers[streamId].isClosed = true;
this.postMessage({
sourceName,
targetName,
@ -1532,9 +1534,15 @@ MessageHandler.prototype = {
});
break;
case 'enqueue':
this.streamControllers[data.streamId].controller.enqueue(data.chunk);
if (!this.streamControllers[data.streamId].isClosed) {
this.streamControllers[data.streamId].controller.enqueue(data.chunk);
}
break;
case 'close':
if (this.streamControllers[data.streamId].isClosed) {
break;
}
this.streamControllers[data.streamId].isClosed = true;
this.streamControllers[data.streamId].controller.close();
deleteStreamController();
break;
@ -1548,6 +1556,9 @@ MessageHandler.prototype = {
deleteStreamController();
break;
case 'cancel':
if (!this.streamSinks[data.streamId]) {
break;
}
resolveCall(this.streamSinks[data.streamId].onCancel,
[data.reason]).then(() => {
sendStreamResponse({ stream: 'cancel_complete', success: true, });

View File

@ -24,8 +24,6 @@ import {
import { getGlobalEventBus } from './dom_events';
import { RenderingStates } from './pdf_rendering_queue';
const TEXT_LAYER_RENDER_DELAY = 200; // ms
/**
* @typedef {Object} PDFPageViewOptions
* @property {HTMLDivElement} container - The viewer element.
@ -444,12 +442,11 @@ class PDFPageView {
let resultPromise = paintTask.promise.then(function() {
return finishPaintTask(null).then(function () {
if (textLayer) {
pdfPage.getTextContent({
let readableStream = pdfPage.streamTextContent({
normalizeWhitespace: true,
}).then(function textContentResolved(textContent) {
textLayer.setTextContent(textContent);
textLayer.render(TEXT_LAYER_RENDER_DELAY);
});
textLayer.setTextContentStream(readableStream);
textLayer.render();
}
});
}, function(reason) {

View File

@ -41,6 +41,8 @@ var TextLayerBuilder = (function TextLayerBuilderClosure() {
this.textLayerDiv = options.textLayerDiv;
this.eventBus = options.eventBus || getGlobalEventBus();
this.textContent = null;
this.textContentItemsStr = [];
this.textContentStream = null;
this.renderingDone = false;
this.pageIdx = options.pageIndex;
this.pageNumber = this.pageIdx + 1;
@ -79,7 +81,7 @@ var TextLayerBuilder = (function TextLayerBuilderClosure() {
* for specified amount of ms.
*/
render: function TextLayerBuilder_render(timeout) {
if (!this.textContent || this.renderingDone) {
if (!(this.textContent || this.textContentStream) || this.renderingDone) {
return;
}
this.cancel();
@ -88,9 +90,11 @@ var TextLayerBuilder = (function TextLayerBuilderClosure() {
var textLayerFrag = document.createDocumentFragment();
this.textLayerRenderTask = renderTextLayer({
textContent: this.textContent,
textContentStream: this.textContentStream,
container: textLayerFrag,
viewport: this.viewport,
textDivs: this.textDivs,
textContentItemsStr: this.textContentItemsStr,
timeout,
enhanceTextSelection: this.enhanceTextSelection,
});
@ -113,6 +117,11 @@ var TextLayerBuilder = (function TextLayerBuilderClosure() {
}
},
/**
* Calls `this.cancel()` and then stores the given stream in
* `this.textContentStream`, where `render` picks it up.
*
* @param {ReadableStream} readableStream - Stream to read text content from.
*/
setTextContentStream(readableStream) {
this.cancel();
this.textContentStream = readableStream;
},
setTextContent: function TextLayerBuilder_setTextContent(textContent) {
this.cancel();
this.textContent = textContent;
@ -122,8 +131,8 @@ var TextLayerBuilder = (function TextLayerBuilderClosure() {
matchesLength) {
var i = 0;
var iIndex = 0;
var bidiTexts = this.textContent.items;
var end = bidiTexts.length - 1;
let textContentItemsStr = this.textContentItemsStr;
var end = textContentItemsStr.length - 1;
var queryLen = (this.findController === null ?
0 : this.findController.state.query.length);
var ret = [];
@ -135,12 +144,13 @@ var TextLayerBuilder = (function TextLayerBuilderClosure() {
var matchIdx = matches[m];
// Loop over the divIdxs.
while (i !== end && matchIdx >= (iIndex + bidiTexts[i].str.length)) {
iIndex += bidiTexts[i].str.length;
while (i !== end && matchIdx >=
(iIndex + textContentItemsStr[i].length)) {
iIndex += textContentItemsStr[i].length;
i++;
}
if (i === bidiTexts.length) {
if (i === textContentItemsStr.length) {
console.error('Could not find a matching mapping');
}
@ -160,8 +170,9 @@ var TextLayerBuilder = (function TextLayerBuilderClosure() {
// Somewhat the same array as above, but use > instead of >= to get
// the end position right.
while (i !== end && matchIdx > (iIndex + bidiTexts[i].str.length)) {
iIndex += bidiTexts[i].str.length;
while (i !== end && matchIdx >
(iIndex + textContentItemsStr[i].length)) {
iIndex += textContentItemsStr[i].length;
i++;
}
@ -181,7 +192,7 @@ var TextLayerBuilder = (function TextLayerBuilderClosure() {
return;
}
var bidiTexts = this.textContent.items;
let textContentItemsStr = this.textContentItemsStr;
var textDivs = this.textDivs;
var prevEnd = null;
var pageIdx = this.pageIdx;
@ -204,7 +215,8 @@ var TextLayerBuilder = (function TextLayerBuilderClosure() {
function appendTextToDiv(divIdx, fromOffset, toOffset, className) {
var div = textDivs[divIdx];
var content = bidiTexts[divIdx].str.substring(fromOffset, toOffset);
var content =
textContentItemsStr[divIdx].substring(fromOffset, toOffset);
var node = document.createTextNode(content);
if (className) {
var span = document.createElement('span');
@ -277,7 +289,7 @@ var TextLayerBuilder = (function TextLayerBuilderClosure() {
// Clear all matches.
var matches = this.matches;
var textDivs = this.textDivs;
var bidiTexts = this.textContent.items;
let textContentItemsStr = this.textContentItemsStr;
var clearedUntilDivIdx = -1;
// Clear all current matches.
@ -286,7 +298,7 @@ var TextLayerBuilder = (function TextLayerBuilderClosure() {
var begin = Math.max(clearedUntilDivIdx, match.begin.divIdx);
for (var n = begin, end = match.end.divIdx; n <= end; n++) {
var div = textDivs[n];
div.textContent = bidiTexts[n].str;
div.textContent = textContentItemsStr[n];
div.className = '';
}
clearedUntilDivIdx = match.end.divIdx + 1;