Merge pull request #7475 from Snuffleupagus/api-getTextContent-combineTextItems
[api-minor] Add a parameter to `PDFPageProxy_getTextContent` that controls whether `PartialEvaluator_getTextContent` will attempt to combine same line text items
This commit is contained in:
commit
a02e2686b9
@ -265,7 +265,8 @@ var Page = (function PageClosure() {
|
|||||||
},
|
},
|
||||||
|
|
||||||
extractTextContent: function Page_extractTextContent(task,
|
extractTextContent: function Page_extractTextContent(task,
|
||||||
normalizeWhitespace) {
|
normalizeWhitespace,
|
||||||
|
combineTextItems) {
|
||||||
var handler = {
|
var handler = {
|
||||||
on: function nullHandlerOn() {},
|
on: function nullHandlerOn() {},
|
||||||
send: function nullHandlerSend() {}
|
send: function nullHandlerSend() {}
|
||||||
@ -298,7 +299,8 @@ var Page = (function PageClosure() {
|
|||||||
task,
|
task,
|
||||||
self.resources,
|
self.resources,
|
||||||
/* stateManager = */ null,
|
/* stateManager = */ null,
|
||||||
normalizeWhitespace);
|
normalizeWhitespace,
|
||||||
|
combineTextItems);
|
||||||
});
|
});
|
||||||
},
|
},
|
||||||
|
|
||||||
|
@ -1132,7 +1132,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
|
|||||||
getTextContent:
|
getTextContent:
|
||||||
function PartialEvaluator_getTextContent(stream, task, resources,
|
function PartialEvaluator_getTextContent(stream, task, resources,
|
||||||
stateManager,
|
stateManager,
|
||||||
normalizeWhitespace) {
|
normalizeWhitespace,
|
||||||
|
combineTextItems) {
|
||||||
|
|
||||||
stateManager = (stateManager || new StateManager(new TextState()));
|
stateManager = (stateManager || new StateManager(new TextState()));
|
||||||
|
|
||||||
@ -1443,7 +1444,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
|
|||||||
var isSameTextLine = !textState.font ? false :
|
var isSameTextLine = !textState.font ? false :
|
||||||
((textState.font.vertical ? args[0] : args[1]) === 0);
|
((textState.font.vertical ? args[0] : args[1]) === 0);
|
||||||
advance = args[0] - args[1];
|
advance = args[0] - args[1];
|
||||||
if (isSameTextLine && textContentItem.initialized &&
|
if (combineTextItems &&
|
||||||
|
isSameTextLine && textContentItem.initialized &&
|
||||||
advance > 0 &&
|
advance > 0 &&
|
||||||
advance <= textContentItem.fakeMultiSpaceMax) {
|
advance <= textContentItem.fakeMultiSpaceMax) {
|
||||||
textState.translateTextLineMatrix(args[0], args[1]);
|
textState.translateTextLineMatrix(args[0], args[1]);
|
||||||
@ -1475,7 +1477,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
|
|||||||
// Optimization to treat same line movement as advance.
|
// Optimization to treat same line movement as advance.
|
||||||
advance = textState.calcTextLineMatrixAdvance(
|
advance = textState.calcTextLineMatrixAdvance(
|
||||||
args[0], args[1], args[2], args[3], args[4], args[5]);
|
args[0], args[1], args[2], args[3], args[4], args[5]);
|
||||||
if (advance !== null && textContentItem.initialized &&
|
if (combineTextItems &&
|
||||||
|
advance !== null && textContentItem.initialized &&
|
||||||
advance.value > 0 &&
|
advance.value > 0 &&
|
||||||
advance.value <= textContentItem.fakeMultiSpaceMax) {
|
advance.value <= textContentItem.fakeMultiSpaceMax) {
|
||||||
textState.translateTextLineMatrix(advance.width,
|
textState.translateTextLineMatrix(advance.width,
|
||||||
@ -1616,7 +1619,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
|
|||||||
|
|
||||||
next(self.getTextContent(xobj, task,
|
next(self.getTextContent(xobj, task,
|
||||||
xobj.dict.get('Resources') || resources, stateManager,
|
xobj.dict.get('Resources') || resources, stateManager,
|
||||||
normalizeWhitespace).then(function (formTextContent) {
|
normalizeWhitespace, combineTextItems).then(
|
||||||
|
function (formTextContent) {
|
||||||
Util.appendToArray(textContent.items, formTextContent.items);
|
Util.appendToArray(textContent.items, formTextContent.items);
|
||||||
Util.extendObj(textContent.styles, formTextContent.styles);
|
Util.extendObj(textContent.styles, formTextContent.styles);
|
||||||
stateManager.restore();
|
stateManager.restore();
|
||||||
|
@ -891,12 +891,14 @@ var WorkerMessageHandler = {
|
|||||||
handler.on('GetTextContent', function wphExtractText(data) {
|
handler.on('GetTextContent', function wphExtractText(data) {
|
||||||
var pageIndex = data.pageIndex;
|
var pageIndex = data.pageIndex;
|
||||||
var normalizeWhitespace = data.normalizeWhitespace;
|
var normalizeWhitespace = data.normalizeWhitespace;
|
||||||
|
var combineTextItems = data.combineTextItems;
|
||||||
return pdfManager.getPage(pageIndex).then(function(page) {
|
return pdfManager.getPage(pageIndex).then(function(page) {
|
||||||
var task = new WorkerTask('GetTextContent: page ' + pageIndex);
|
var task = new WorkerTask('GetTextContent: page ' + pageIndex);
|
||||||
startWorkerTask(task);
|
startWorkerTask(task);
|
||||||
var pageNum = pageIndex + 1;
|
var pageNum = pageIndex + 1;
|
||||||
var start = Date.now();
|
var start = Date.now();
|
||||||
return page.extractTextContent(task, normalizeWhitespace).then(
|
return page.extractTextContent(task, normalizeWhitespace,
|
||||||
|
combineTextItems).then(
|
||||||
function(textContent) {
|
function(textContent) {
|
||||||
finishWorkerTask(task);
|
finishWorkerTask(task);
|
||||||
info('text indexing: page=' + pageNum + ' - time=' +
|
info('text indexing: page=' + pageNum + ' - time=' +
|
||||||
|
@ -600,6 +600,8 @@ var PDFDocumentProxy = (function PDFDocumentProxyClosure() {
|
|||||||
* @typedef {Object} getTextContentParameters
|
* @typedef {Object} getTextContentParameters
|
||||||
* @param {boolean} normalizeWhitespace - replaces all occurrences of
|
* @param {boolean} normalizeWhitespace - replaces all occurrences of
|
||||||
* whitespace with standard spaces (0x20). The default value is `false`.
|
* whitespace with standard spaces (0x20). The default value is `false`.
|
||||||
|
* @param {boolean} disableCombineTextItems - do not attempt to combine
|
||||||
|
* same line {@link TextItem}s. The default value is `false`.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -891,11 +893,12 @@ var PDFPageProxy = (function PDFPageProxyClosure() {
|
|||||||
* object that represent the page text content.
|
* object that represent the page text content.
|
||||||
*/
|
*/
|
||||||
getTextContent: function PDFPageProxy_getTextContent(params) {
|
getTextContent: function PDFPageProxy_getTextContent(params) {
|
||||||
var normalizeWhitespace = (params && params.normalizeWhitespace) || false;
|
|
||||||
|
|
||||||
return this.transport.messageHandler.sendWithPromise('GetTextContent', {
|
return this.transport.messageHandler.sendWithPromise('GetTextContent', {
|
||||||
pageIndex: this.pageNumber - 1,
|
pageIndex: this.pageNumber - 1,
|
||||||
normalizeWhitespace: normalizeWhitespace,
|
normalizeWhitespace: (params && params.normalizeWhitespace === true ?
|
||||||
|
true : /* Default */ false),
|
||||||
|
combineTextItems: (params && params.disableCombineTextItems === true ?
|
||||||
|
false : /* Default */ true),
|
||||||
});
|
});
|
||||||
},
|
},
|
||||||
|
|
||||||
|
@ -469,12 +469,12 @@ var Driver = (function DriverClosure() {
|
|||||||
textLayerContext.clearRect(0, 0,
|
textLayerContext.clearRect(0, 0,
|
||||||
textLayerCanvas.width, textLayerCanvas.height);
|
textLayerCanvas.width, textLayerCanvas.height);
|
||||||
// The text builder will draw its content on the test canvas
|
// The text builder will draw its content on the test canvas
|
||||||
initPromise =
|
initPromise = page.getTextContent({
|
||||||
page.getTextContent({ normalizeWhitespace: true }).then(
|
normalizeWhitespace: true,
|
||||||
function(textContent) {
|
}).then(function(textContent) {
|
||||||
return rasterizeTextLayer(textLayerContext, viewport,
|
return rasterizeTextLayer(textLayerContext, viewport,
|
||||||
textContent);
|
textContent);
|
||||||
});
|
});
|
||||||
} else {
|
} else {
|
||||||
textLayerCanvas = null;
|
textLayerCanvas = null;
|
||||||
|
|
||||||
|
@ -771,12 +771,14 @@ describe('api', function() {
|
|||||||
});
|
});
|
||||||
it('gets text content', function (done) {
|
it('gets text content', function (done) {
|
||||||
var defaultPromise = page.getTextContent();
|
var defaultPromise = page.getTextContent();
|
||||||
var normalizeWhitespacePromise = page.getTextContent({
|
var parametersPromise = page.getTextContent({
|
||||||
normalizeWhitespace: true });
|
normalizeWhitespace: true,
|
||||||
|
disableCombineTextItems: true,
|
||||||
|
});
|
||||||
|
|
||||||
var promises = [
|
var promises = [
|
||||||
defaultPromise,
|
defaultPromise,
|
||||||
normalizeWhitespacePromise
|
parametersPromise,
|
||||||
];
|
];
|
||||||
Promise.all(promises).then(function (data) {
|
Promise.all(promises).then(function (data) {
|
||||||
expect(!!data[0].items).toEqual(true);
|
expect(!!data[0].items).toEqual(true);
|
||||||
|
@ -503,12 +503,12 @@ var PDFPageView = (function PDFPageViewClosure() {
|
|||||||
function pdfPageRenderCallback() {
|
function pdfPageRenderCallback() {
|
||||||
pageViewDrawCallback(null);
|
pageViewDrawCallback(null);
|
||||||
if (textLayer) {
|
if (textLayer) {
|
||||||
self.pdfPage.getTextContent({ normalizeWhitespace: true }).then(
|
self.pdfPage.getTextContent({
|
||||||
function textContentResolved(textContent) {
|
normalizeWhitespace: true,
|
||||||
textLayer.setTextContent(textContent);
|
}).then(function textContentResolved(textContent) {
|
||||||
textLayer.render(TEXT_LAYER_RENDER_DELAY);
|
textLayer.setTextContent(textContent);
|
||||||
}
|
textLayer.render(TEXT_LAYER_RENDER_DELAY);
|
||||||
);
|
});
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
function pdfPageRenderError(error) {
|
function pdfPageRenderError(error) {
|
||||||
|
@ -784,7 +784,9 @@ var PDFViewer = (function pdfViewer() {
|
|||||||
|
|
||||||
getPageTextContent: function (pageIndex) {
|
getPageTextContent: function (pageIndex) {
|
||||||
return this.pdfDocument.getPage(pageIndex + 1).then(function (page) {
|
return this.pdfDocument.getPage(pageIndex + 1).then(function (page) {
|
||||||
return page.getTextContent({ normalizeWhitespace: true });
|
return page.getTextContent({
|
||||||
|
normalizeWhitespace: true,
|
||||||
|
});
|
||||||
});
|
});
|
||||||
},
|
},
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user