Merge pull request #8488 from mukulmishra18/streams-getTextContent

Streams get text content
This commit is contained in:
Yury Delendik 2017-06-23 12:52:13 -05:00 committed by GitHub
commit e2ca894fec
8 changed files with 275 additions and 114 deletions

View File

@ -270,7 +270,7 @@ var Page = (function PageClosure() {
},
extractTextContent({ handler, task, normalizeWhitespace,
combineTextItems, }) {
sink, combineTextItems, }) {
var contentStreamPromise = this.pdfManager.ensure(this,
'getContentStream');
var resourcesPromise = this.loadResources([
@ -298,6 +298,7 @@ var Page = (function PageClosure() {
resources: this.resources,
normalizeWhitespace,
combineTextItems,
sink,
});
});
},

View File

@ -1176,7 +1176,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
},
getTextContent({ stream, task, resources, stateManager = null,
normalizeWhitespace = false, combineTextItems = false, }) {
normalizeWhitespace = false, combineTextItems = false,
sink, seenStyles = Object.create(null), }) {
// Ensure that `resources`/`stateManager` is correctly initialized,
// even if the provided parameter is e.g. `null`.
resources = resources || Dict.empty;
@ -1214,7 +1215,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
// The xobj is parsed iff it's needed, e.g. if there is a `DO` cmd.
var xobjs = null;
var xobjsCache = Object.create(null);
var skipEmptyXObjs = Object.create(null);
var preprocessor = new EvaluatorPreprocessor(stream, xref, stateManager);
@ -1225,7 +1226,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
return textContentItem;
}
var font = textState.font;
if (!(font.loadedName in textContent.styles)) {
if (!(font.loadedName in seenStyles)) {
seenStyles[font.loadedName] = true;
textContent.styles[font.loadedName] = {
fontFamily: font.fallbackName,
ascent: font.ascent,
@ -1416,11 +1418,21 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
textContentItem.str.length = 0;
}
/**
 * Hands the text items collected so far to the sink as a single chunk and
 * starts a fresh, empty buffer. Nothing is sent when no items are pending.
 * The chunk size reported to the sink is the number of buffered items.
 */
function enqueueChunk() {
  const pendingCount = textContent.items.length;
  if (!(pendingCount > 0)) {
    return; // Nothing buffered, so avoid enqueueing an empty chunk.
  }
  sink.enqueue(textContent, pendingCount);
  // Reset the buffer so the next chunk only carries new items and styles.
  textContent.items = [];
  textContent.styles = Object.create(null);
}
var timeSlotManager = new TimeSlotManager();
return new Promise(function promiseBody(resolve, reject) {
var next = function (promise) {
promise.then(function () {
let next = function (promise) {
enqueueChunk();
Promise.all([promise, sink.ready]).then(function () {
try {
promiseBody(resolve, reject);
} catch (ex) {
@ -1615,11 +1627,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
}
var name = args[0].name;
if (xobjsCache.key === name) {
if (xobjsCache.texts) {
Util.appendToArray(textContent.items, xobjsCache.texts.items);
Util.extendObj(textContent.styles, xobjsCache.texts.styles);
}
if (name in skipEmptyXObjs) {
break;
}
@ -1633,8 +1641,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
assert(isName(type), 'XObject should have a Name subtype');
if (type.name !== 'Form') {
xobjsCache.key = name;
xobjsCache.texts = null;
skipEmptyXObjs[name] = true;
break;
}
@ -1650,6 +1657,26 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
xObjStateManager.transform(matrix);
}
// Enqueue the `textContent` chunk before parsing the /Form
// XObject.
enqueueChunk();
// Thin proxy around the outer `sink`: it forwards everything unchanged but
// records whether the nested call ever enqueued a chunk.
let sinkWrapper = {
  enqueueInvoked: false,

  // Forward the chunk to the real sink, remembering that output was produced.
  enqueue(textChunk, chunkSize) {
    this.enqueueInvoked = true;
    sink.enqueue(textChunk, chunkSize);
  },

  // Back-pressure state is read straight from the real sink on every access.
  get desiredSize() {
    return sink.desiredSize;
  },

  get ready() {
    return sink.ready;
  },
};
next(self.getTextContent({
stream: xobj,
task,
@ -1657,12 +1684,12 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
stateManager: xObjStateManager,
normalizeWhitespace,
combineTextItems,
}).then(function (formTextContent) {
Util.appendToArray(textContent.items, formTextContent.items);
Util.extendObj(textContent.styles, formTextContent.styles);
xobjsCache.key = name;
xobjsCache.texts = formTextContent;
sink: sinkWrapper,
seenStyles,
}).then(function() {
if (!sinkWrapper.enqueueInvoked) {
skipEmptyXObjs[name] = true;
}
}));
return;
case OPS.setGState:
@ -1686,20 +1713,27 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
}
break;
} // switch
if (textContent.items.length >= sink.desiredSize) {
// Wait for ready, if we reach highWaterMark.
stop = true;
break;
}
} // while
if (stop) {
next(deferred);
return;
}
flushTextContentItem();
resolve(textContent);
enqueueChunk();
resolve();
}).catch((reason) => {
if (this.options.ignoreErrors) {
// Error(s) in the TextContent -- allow text-extraction to continue.
warn('getTextContent - ignoring errors during task: ' + task.name);
flushTextContentItem();
return textContent;
enqueueChunk();
return;
}
throw reason;
});

View File

@ -874,30 +874,35 @@ var WorkerMessageHandler = {
});
}, this);
handler.on('GetTextContent', function wphExtractText(data) {
handler.on('GetTextContent', function wphExtractText(data, sink) {
var pageIndex = data.pageIndex;
return pdfManager.getPage(pageIndex).then(function(page) {
sink.onPull = function (desiredSize) { };
sink.onCancel = function (reason) { };
pdfManager.getPage(pageIndex).then(function(page) {
var task = new WorkerTask('GetTextContent: page ' + pageIndex);
startWorkerTask(task);
var pageNum = pageIndex + 1;
var start = Date.now();
return page.extractTextContent({
page.extractTextContent({
handler,
task,
sink,
normalizeWhitespace: data.normalizeWhitespace,
combineTextItems: data.combineTextItems,
}).then(function(textContent) {
}).then(function() {
finishWorkerTask(task);
info('text indexing: page=' + pageNum + ' - time=' +
(Date.now() - start) + 'ms');
return textContent;
sink.close();
}, function (reason) {
finishWorkerTask(task);
if (task.terminated) {
return; // ignoring errors from the terminated thread
}
sink.error(reason);
throw reason;
});
});

View File

@ -954,6 +954,24 @@ var PDFPageProxy = (function PDFPageProxyClosure() {
return intentState.opListReadCapability.promise;
},
/**
* @param {getTextContentParameters} params - getTextContent parameters.
* @return {ReadableStream} ReadableStream to read textContent chunks.
*/
streamTextContent(params = {}) {
const TEXT_CONTENT_CHUNK_SIZE = 100;
return this.transport.messageHandler.sendWithStream('GetTextContent', {
pageIndex: this.pageNumber - 1,
normalizeWhitespace: (params.normalizeWhitespace === true),
combineTextItems: (params.disableCombineTextItems !== true),
}, {
highWaterMark: TEXT_CONTENT_CHUNK_SIZE,
size(textContent) {
return textContent.items.length;
},
});
},
/**
* @param {getTextContentParameters} params - getTextContent parameters.
* @return {Promise} That is resolved a {@link TextContent}
@ -961,10 +979,28 @@ var PDFPageProxy = (function PDFPageProxyClosure() {
*/
getTextContent: function PDFPageProxy_getTextContent(params) {
params = params || {};
return this.transport.messageHandler.sendWithPromise('GetTextContent', {
pageIndex: this.pageNumber - 1,
normalizeWhitespace: (params.normalizeWhitespace === true),
combineTextItems: (params.disableCombineTextItems !== true),
let readableStream = this.streamTextContent(params);
return new Promise(function(resolve, reject) {
function pump() {
reader.read().then(function({ value, done, }) {
if (done) {
resolve(textContent);
return;
}
Util.extendObj(textContent.styles, value.styles);
Util.appendToArray(textContent.items, value.items);
pump();
}, reject);
}
let reader = readableStream.getReader();
let textContent = {
items: [],
styles: Object.create(null),
};
pump();
});
},

View File

@ -20,14 +20,20 @@ import { CustomStyle, getDefaultSetting } from './dom_utils';
* Text layer render parameters.
*
* @typedef {Object} TextLayerRenderParameters
* @property {TextContent} textContent - Text content to render (the object is
* returned by the page's getTextContent() method).
* @property {TextContent} textContent - (optional) Text content to render
* (the object is returned by the page's getTextContent() method).
* @property {ReadableStream} textContentStream - (optional) Text content
* stream to render (the stream is returned by the page's
* streamTextContent() method).
* @property {HTMLElement} container - HTML element that will contain text runs.
* @property {PageViewport} viewport - The target viewport to properly
* layout the text runs.
* @property {Array} textDivs - (optional) HTML elements that correspond to
* the text items of the textContent input. This is output and shall
* initially be set to an empty array.
* @property {Array} textContentItemsStr - (optional) Strings that correspond
* to the `str` property of the text items of the textContent input. This is
* output and shall initially be set to an empty array.
* @property {number} timeout - (optional) Delay in milliseconds before
* rendering of the text runs occurs.
* @property {boolean} enhanceTextSelection - (optional) Whether to turn on the
@ -122,6 +128,9 @@ var renderTextLayer = (function renderTextLayerClosure() {
}
}
task._textDivProperties.set(textDiv, textDivProperties);
if (task._textContentStream) {
task._layoutText(textDiv);
}
if (task._enhanceTextSelection) {
var angleCos = 1, angleSin = 0;
@ -157,7 +166,6 @@ var renderTextLayer = (function renderTextLayerClosure() {
if (task._canceled) {
return;
}
var textLayerFrag = task._container;
var textDivs = task._textDivs;
var capability = task._capability;
var textDivsLength = textDivs.length;
@ -170,50 +178,12 @@ var renderTextLayer = (function renderTextLayerClosure() {
return;
}
// The temporary canvas is used to measure text length in the DOM.
var canvas = document.createElement('canvas');
if (typeof PDFJSDev === 'undefined' ||
PDFJSDev.test('FIREFOX || MOZCENTRAL || GENERIC')) {
canvas.mozOpaque = true;
if (!task._textContentStream) {
for (var i = 0; i < textDivsLength; i++) {
task._layoutText(textDivs[i]);
}
}
var ctx = canvas.getContext('2d', { alpha: false, });
var lastFontSize;
var lastFontFamily;
for (var i = 0; i < textDivsLength; i++) {
var textDiv = textDivs[i];
var textDivProperties = task._textDivProperties.get(textDiv);
if (textDivProperties.isWhitespace) {
continue;
}
var fontSize = textDiv.style.fontSize;
var fontFamily = textDiv.style.fontFamily;
// Only build font string and set to context if different from last.
if (fontSize !== lastFontSize || fontFamily !== lastFontFamily) {
ctx.font = fontSize + ' ' + fontFamily;
lastFontSize = fontSize;
lastFontFamily = fontFamily;
}
var width = ctx.measureText(textDiv.textContent).width;
textLayerFrag.appendChild(textDiv);
var transform = '';
if (textDivProperties.canvasWidth !== 0 && width > 0) {
textDivProperties.scale = textDivProperties.canvasWidth / width;
transform = 'scaleX(' + textDivProperties.scale + ')';
}
if (textDivProperties.angle !== 0) {
transform = 'rotate(' + textDivProperties.angle + 'deg) ' + transform;
}
if (transform !== '') {
textDivProperties.originalTransform = transform;
CustomStyle.setProp('transform', textDiv, transform);
}
task._textDivProperties.set(textDiv, textDivProperties);
}
task._renderingDone = true;
capability.resolve();
}
@ -499,19 +469,27 @@ var renderTextLayer = (function renderTextLayerClosure() {
* @param {boolean} enhanceTextSelection
* @private
*/
function TextLayerRenderTask(textContent, container, viewport, textDivs,
enhanceTextSelection) {
function TextLayerRenderTask({ textContent, textContentStream, container,
viewport, textDivs, textContentItemsStr,
enhanceTextSelection, }) {
this._textContent = textContent;
this._textContentStream = textContentStream;
this._container = container;
this._viewport = viewport;
this._textDivs = textDivs || [];
this._textContentItemsStr = textContentItemsStr || [];
this._enhanceTextSelection = !!enhanceTextSelection;
this._reader = null;
this._layoutTextLastFontSize = null;
this._layoutTextLastFontFamily = null;
this._layoutTextCtx = null;
this._textDivProperties = new WeakMap();
this._renderingDone = false;
this._canceled = false;
this._capability = createPromiseCapability();
this._renderTimer = null;
this._bounds = [];
this._enhanceTextSelection = !!enhanceTextSelection;
}
TextLayerRenderTask.prototype = {
get promise() {
@ -519,6 +497,10 @@ var renderTextLayer = (function renderTextLayerClosure() {
},
cancel: function TextLayer_cancel() {
if (this._reader) {
this._reader.cancel();
this._reader = null;
}
this._canceled = true;
if (this._renderTimer !== null) {
clearTimeout(this._renderTimer);
@ -527,21 +509,100 @@ var renderTextLayer = (function renderTextLayerClosure() {
this._capability.reject('canceled');
},
_render: function TextLayer_render(timeout) {
var textItems = this._textContent.items;
var textStyles = this._textContent.styles;
for (var i = 0, len = textItems.length; i < len; i++) {
appendText(this, textItems[i], textStyles);
_processItems(items, styleCache) {
for (let i = 0, len = items.length; i < len; i++) {
this._textContentItemsStr.push(items[i].str);
appendText(this, items[i], styleCache);
}
},
_layoutText(textDiv) {
let textLayerFrag = this._container;
let textDivProperties = this._textDivProperties.get(textDiv);
if (textDivProperties.isWhitespace) {
return;
}
if (!timeout) { // Render right away
render(this);
} else { // Schedule
this._renderTimer = setTimeout(() => {
render(this);
this._renderTimer = null;
}, timeout);
let fontSize = textDiv.style.fontSize;
let fontFamily = textDiv.style.fontFamily;
// Only build the font string and assign it to the measuring context when it
// differs from the last one used. The values must be remembered in the same
// `_layoutTextLast*` fields that the comparison reads (they start as `null`
// in the constructor); otherwise the check never hits and the font is
// rebuilt for every text div.
if (fontSize !== this._layoutTextLastFontSize ||
    fontFamily !== this._layoutTextLastFontFamily) {
  this._layoutTextCtx.font = fontSize + ' ' + fontFamily;
  this._layoutTextLastFontSize = fontSize;
  this._layoutTextLastFontFamily = fontFamily;
}
let width = this._layoutTextCtx.measureText(textDiv.textContent).width;
let transform = '';
if (textDivProperties.canvasWidth !== 0 && width > 0) {
textDivProperties.scale = textDivProperties.canvasWidth / width;
transform = 'scaleX(' + textDivProperties.scale + ')';
}
if (textDivProperties.angle !== 0) {
transform = 'rotate(' + textDivProperties.angle + 'deg) ' + transform;
}
if (transform !== '') {
textDivProperties.originalTransform = transform;
CustomStyle.setProp('transform', textDiv, transform);
}
this._textDivProperties.set(textDiv, textDivProperties);
textLayerFrag.appendChild(textDiv);
},
/**
 * Builds the text divs from either the pre-fetched `_textContent` or the
 * `_textContentStream`, then calls `render(this)` once every item has been
 * processed.
 *
 * @param {number} timeout - (optional) Delay in milliseconds before
 *   `render` is invoked; a falsy value renders right away.
 * @throws {Error} If neither a text content object nor a text content
 *   stream was supplied to the task.
 */
_render: function TextLayer_render(timeout) {
// Resolved once all text items (from the object or the stream) are handled.
let capability = createPromiseCapability();
// Accumulates the styles of all stream chunks seen so far.
let styleCache = Object.create(null);
// The temporary canvas is used to measure text length in the DOM.
let canvas = document.createElement('canvas');
if (typeof PDFJSDev === 'undefined' ||
PDFJSDev.test('FIREFOX || MOZCENTRAL || GENERIC')) {
canvas.mozOpaque = true;
}
// Shared 2d context that `_layoutText` uses for `measureText`.
this._layoutTextCtx = canvas.getContext('2d', { alpha: false, });
if (this._textContent) {
// Complete text content is available: process everything synchronously.
let textItems = this._textContent.items;
let textStyles = this._textContent.styles;
this._processItems(textItems, textStyles);
capability.resolve();
} else if (this._textContentStream) {
// Read the stream chunk by chunk until the reader reports `done`.
let pump = () => {
this._reader.read().then(({ value, done, }) => {
if (done) {
capability.resolve();
return;
}
// Merge this chunk's styles into the cache before processing its items.
Util.extendObj(styleCache, value.styles);
this._processItems(value.items, styleCache);
pump();
}, capability.reject);
};
// The reader is kept on the task so that `cancel` can cancel it.
this._reader = this._textContentStream.getReader();
pump();
} else {
throw new Error('Neither "textContent" nor "textContentStream"' +
' parameters specified.');
}
// A rejection of `capability` is forwarded to the task's own capability.
capability.promise.then(() => {
// Drop the reference to the style cache; it is not used after this point.
styleCache = null;
if (!timeout) { // Render right away
render(this);
} else { // Schedule
this._renderTimer = setTimeout(() => {
render(this);
this._renderTimer = null;
}, timeout);
}
}, this._capability.reject);
},
expandTextDivs: function TextLayer_expandTextDivs(expandDivs) {
@ -610,11 +671,15 @@ var renderTextLayer = (function renderTextLayerClosure() {
* @returns {TextLayerRenderTask}
*/
function renderTextLayer(renderParameters) {
var task = new TextLayerRenderTask(renderParameters.textContent,
renderParameters.container,
renderParameters.viewport,
renderParameters.textDivs,
renderParameters.enhanceTextSelection);
var task = new TextLayerRenderTask({
textContent: renderParameters.textContent,
textContentStream: renderParameters.textContentStream,
container: renderParameters.container,
viewport: renderParameters.viewport,
textDivs: renderParameters.textDivs,
textContentItemsStr: renderParameters.textContentItemsStr,
enhanceTextSelection: renderParameters.enhanceTextSelection,
});
task._render(renderParameters.timeout);
return task;
}

View File

@ -1378,6 +1378,7 @@ MessageHandler.prototype = {
this.streamControllers[streamId] = {
controller,
startCall: startCapability,
isClosed: false,
};
this.postMessage({
sourceName,
@ -1409,6 +1410,7 @@ MessageHandler.prototype = {
cancel: (reason) => {
let cancelCapability = createPromiseCapability();
this.streamControllers[streamId].cancelCall = cancelCapability;
this.streamControllers[streamId].isClosed = true;
this.postMessage({
sourceName,
targetName,
@ -1532,9 +1534,15 @@ MessageHandler.prototype = {
});
break;
case 'enqueue':
this.streamControllers[data.streamId].controller.enqueue(data.chunk);
if (!this.streamControllers[data.streamId].isClosed) {
this.streamControllers[data.streamId].controller.enqueue(data.chunk);
}
break;
case 'close':
if (this.streamControllers[data.streamId].isClosed) {
break;
}
this.streamControllers[data.streamId].isClosed = true;
this.streamControllers[data.streamId].controller.close();
deleteStreamController();
break;
@ -1548,6 +1556,9 @@ MessageHandler.prototype = {
deleteStreamController();
break;
case 'cancel':
if (!this.streamSinks[data.streamId]) {
break;
}
resolveCall(this.streamSinks[data.streamId].onCancel,
[data.reason]).then(() => {
sendStreamResponse({ stream: 'cancel_complete', success: true, });

View File

@ -24,8 +24,6 @@ import {
import { getGlobalEventBus } from './dom_events';
import { RenderingStates } from './pdf_rendering_queue';
const TEXT_LAYER_RENDER_DELAY = 200; // ms
/**
* @typedef {Object} PDFPageViewOptions
* @property {HTMLDivElement} container - The viewer element.
@ -444,12 +442,11 @@ class PDFPageView {
let resultPromise = paintTask.promise.then(function() {
return finishPaintTask(null).then(function () {
if (textLayer) {
pdfPage.getTextContent({
let readableStream = pdfPage.streamTextContent({
normalizeWhitespace: true,
}).then(function textContentResolved(textContent) {
textLayer.setTextContent(textContent);
textLayer.render(TEXT_LAYER_RENDER_DELAY);
});
textLayer.setTextContentStream(readableStream);
textLayer.render();
}
});
}, function(reason) {

View File

@ -41,6 +41,8 @@ var TextLayerBuilder = (function TextLayerBuilderClosure() {
this.textLayerDiv = options.textLayerDiv;
this.eventBus = options.eventBus || getGlobalEventBus();
this.textContent = null;
this.textContentItemsStr = [];
this.textContentStream = null;
this.renderingDone = false;
this.pageIdx = options.pageIndex;
this.pageNumber = this.pageIdx + 1;
@ -79,7 +81,7 @@ var TextLayerBuilder = (function TextLayerBuilderClosure() {
* for specified amount of ms.
*/
render: function TextLayerBuilder_render(timeout) {
if (!this.textContent || this.renderingDone) {
if (!(this.textContent || this.textContentStream) || this.renderingDone) {
return;
}
this.cancel();
@ -88,9 +90,11 @@ var TextLayerBuilder = (function TextLayerBuilderClosure() {
var textLayerFrag = document.createDocumentFragment();
this.textLayerRenderTask = renderTextLayer({
textContent: this.textContent,
textContentStream: this.textContentStream,
container: textLayerFrag,
viewport: this.viewport,
textDivs: this.textDivs,
textContentItemsStr: this.textContentItemsStr,
timeout,
enhanceTextSelection: this.enhanceTextSelection,
});
@ -113,6 +117,11 @@ var TextLayerBuilder = (function TextLayerBuilderClosure() {
}
},
/**
 * Calls `this.cancel()` and then stores the given stream in
 * `this.textContentStream`, which `render` passes to `renderTextLayer`.
 *
 * @param {ReadableStream} readableStream - Stream of text content chunks;
 *   presumably the one returned by the page's streamTextContent() method
 *   (verify against the caller).
 */
setTextContentStream(readableStream) {
this.cancel();
this.textContentStream = readableStream;
},
setTextContent: function TextLayerBuilder_setTextContent(textContent) {
this.cancel();
this.textContent = textContent;
@ -122,8 +131,8 @@ var TextLayerBuilder = (function TextLayerBuilderClosure() {
matchesLength) {
var i = 0;
var iIndex = 0;
var bidiTexts = this.textContent.items;
var end = bidiTexts.length - 1;
let textContentItemsStr = this.textContentItemsStr;
var end = textContentItemsStr.length - 1;
var queryLen = (this.findController === null ?
0 : this.findController.state.query.length);
var ret = [];
@ -135,12 +144,13 @@ var TextLayerBuilder = (function TextLayerBuilderClosure() {
var matchIdx = matches[m];
// Loop over the divIdxs.
while (i !== end && matchIdx >= (iIndex + bidiTexts[i].str.length)) {
iIndex += bidiTexts[i].str.length;
while (i !== end && matchIdx >=
(iIndex + textContentItemsStr[i].length)) {
iIndex += textContentItemsStr[i].length;
i++;
}
if (i === bidiTexts.length) {
if (i === textContentItemsStr.length) {
console.error('Could not find a matching mapping');
}
@ -160,8 +170,9 @@ var TextLayerBuilder = (function TextLayerBuilderClosure() {
// Somewhat the same array as above, but use > instead of >= to get
// the end position right.
while (i !== end && matchIdx > (iIndex + bidiTexts[i].str.length)) {
iIndex += bidiTexts[i].str.length;
while (i !== end && matchIdx >
(iIndex + textContentItemsStr[i].length)) {
iIndex += textContentItemsStr[i].length;
i++;
}
@ -181,7 +192,7 @@ var TextLayerBuilder = (function TextLayerBuilderClosure() {
return;
}
var bidiTexts = this.textContent.items;
let textContentItemsStr = this.textContentItemsStr;
var textDivs = this.textDivs;
var prevEnd = null;
var pageIdx = this.pageIdx;
@ -204,7 +215,8 @@ var TextLayerBuilder = (function TextLayerBuilderClosure() {
function appendTextToDiv(divIdx, fromOffset, toOffset, className) {
var div = textDivs[divIdx];
var content = bidiTexts[divIdx].str.substring(fromOffset, toOffset);
var content =
textContentItemsStr[divIdx].substring(fromOffset, toOffset);
var node = document.createTextNode(content);
if (className) {
var span = document.createElement('span');
@ -277,7 +289,7 @@ var TextLayerBuilder = (function TextLayerBuilderClosure() {
// Clear all matches.
var matches = this.matches;
var textDivs = this.textDivs;
var bidiTexts = this.textContent.items;
let textContentItemsStr = this.textContentItemsStr;
var clearedUntilDivIdx = -1;
// Clear all current matches.
@ -286,7 +298,7 @@ var TextLayerBuilder = (function TextLayerBuilderClosure() {
var begin = Math.max(clearedUntilDivIdx, match.begin.divIdx);
for (var n = begin, end = match.end.divIdx; n <= end; n++) {
var div = textDivs[n];
div.textContent = bidiTexts[n].str;
div.textContent = textContentItemsStr[n];
div.className = '';
}
clearedUntilDivIdx = match.end.divIdx + 1;