[api-minor] Add a parameter to PDFPageProxy_getTextContent
that enables replacing of all whitespace with standard spaces in the textLayer (issue 6612)
This patch goes a bit further than issue 6612 requires, and replaces all kinds of whitespace with standard spaces. When testing this locally, it actually seemed to slightly improve two existing test-cases (`tracemonkey-text` and `taro-text`). Fixes 6612.
This commit is contained in:
parent
c2dfe9e9a9
commit
6dfe53b976
@ -218,7 +218,8 @@ var Page = (function PageClosure() {
|
|||||||
});
|
});
|
||||||
},
|
},
|
||||||
|
|
||||||
extractTextContent: function Page_extractTextContent(task) {
|
extractTextContent: function Page_extractTextContent(task,
|
||||||
|
normalizeWhitespace) {
|
||||||
var handler = {
|
var handler = {
|
||||||
on: function nullHandlerOn() {},
|
on: function nullHandlerOn() {},
|
||||||
send: function nullHandlerSend() {}
|
send: function nullHandlerSend() {}
|
||||||
@ -248,7 +249,9 @@ var Page = (function PageClosure() {
|
|||||||
|
|
||||||
return partialEvaluator.getTextContent(contentStream,
|
return partialEvaluator.getTextContent(contentStream,
|
||||||
task,
|
task,
|
||||||
self.resources);
|
self.resources,
|
||||||
|
/* stateManager = */ null,
|
||||||
|
normalizeWhitespace);
|
||||||
});
|
});
|
||||||
},
|
},
|
||||||
|
|
||||||
|
@ -908,12 +908,15 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
|
|||||||
});
|
});
|
||||||
},
|
},
|
||||||
|
|
||||||
getTextContent: function PartialEvaluator_getTextContent(stream, task,
|
getTextContent:
|
||||||
resources,
|
function PartialEvaluator_getTextContent(stream, task, resources,
|
||||||
stateManager) {
|
stateManager,
|
||||||
|
normalizeWhitespace) {
|
||||||
|
|
||||||
stateManager = (stateManager || new StateManager(new TextState()));
|
stateManager = (stateManager || new StateManager(new TextState()));
|
||||||
|
|
||||||
|
var WhitespaceRegexp = /\s/g;
|
||||||
|
|
||||||
var textContent = {
|
var textContent = {
|
||||||
items: [],
|
items: [],
|
||||||
styles: Object.create(null)
|
styles: Object.create(null)
|
||||||
@ -1027,11 +1030,23 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
|
|||||||
return textContentItem;
|
return textContentItem;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function replaceWhitespace(str) {
|
||||||
|
// Replaces all whitespace characters with standard spaces (0x20), to avoid
|
||||||
|
// alignment issues between the textLayer and the canvas if the text
|
||||||
|
// contains e.g. tabs (fixes issue6612.pdf).
|
||||||
|
var i = 0, ii = str.length, code;
|
||||||
|
while (i < ii && (code = str.charCodeAt(i)) >= 0x20 && code <= 0x7F) {
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
return (i < ii ? str.replace(WhitespaceRegexp, ' ') : str);
|
||||||
|
}
|
||||||
|
|
||||||
function runBidiTransform(textChunk) {
|
function runBidiTransform(textChunk) {
|
||||||
var str = textChunk.str.join('');
|
var str = textChunk.str.join('');
|
||||||
var bidiResult = PDFJS.bidi(str, -1, textChunk.vertical);
|
var bidiResult = PDFJS.bidi(str, -1, textChunk.vertical);
|
||||||
return {
|
return {
|
||||||
str: bidiResult.str,
|
str: (normalizeWhitespace ? replaceWhitespace(bidiResult.str) :
|
||||||
|
bidiResult.str),
|
||||||
dir: bidiResult.dir,
|
dir: bidiResult.dir,
|
||||||
width: textChunk.width,
|
width: textChunk.width,
|
||||||
height: textChunk.height,
|
height: textChunk.height,
|
||||||
@ -1352,8 +1367,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
return self.getTextContent(xobj, task,
|
return self.getTextContent(xobj, task,
|
||||||
xobj.dict.get('Resources') || resources, stateManager).
|
xobj.dict.get('Resources') || resources, stateManager,
|
||||||
then(function (formTextContent) {
|
normalizeWhitespace).then(function (formTextContent) {
|
||||||
Util.appendToArray(textContent.items, formTextContent.items);
|
Util.appendToArray(textContent.items, formTextContent.items);
|
||||||
Util.extendObj(textContent.styles, formTextContent.styles);
|
Util.extendObj(textContent.styles, formTextContent.styles);
|
||||||
stateManager.restore();
|
stateManager.restore();
|
||||||
|
@ -517,12 +517,14 @@ var WorkerMessageHandler = PDFJS.WorkerMessageHandler = {
|
|||||||
|
|
||||||
handler.on('GetTextContent', function wphExtractText(data) {
|
handler.on('GetTextContent', function wphExtractText(data) {
|
||||||
var pageIndex = data.pageIndex;
|
var pageIndex = data.pageIndex;
|
||||||
|
var normalizeWhitespace = data.normalizeWhitespace;
|
||||||
return pdfManager.getPage(pageIndex).then(function(page) {
|
return pdfManager.getPage(pageIndex).then(function(page) {
|
||||||
var task = new WorkerTask('GetTextContent: page ' + pageIndex);
|
var task = new WorkerTask('GetTextContent: page ' + pageIndex);
|
||||||
startWorkerTask(task);
|
startWorkerTask(task);
|
||||||
var pageNum = pageIndex + 1;
|
var pageNum = pageIndex + 1;
|
||||||
var start = Date.now();
|
var start = Date.now();
|
||||||
return page.extractTextContent(task).then(function(textContent) {
|
return page.extractTextContent(task, normalizeWhitespace).then(
|
||||||
|
function(textContent) {
|
||||||
finishWorkerTask(task);
|
finishWorkerTask(task);
|
||||||
info('text indexing: page=' + pageNum + ' - time=' +
|
info('text indexing: page=' + pageNum + ' - time=' +
|
||||||
(Date.now() - start) + 'ms');
|
(Date.now() - start) + 'ms');
|
||||||
|
@ -708,6 +708,14 @@ var PDFDocumentProxy = (function PDFDocumentProxyClosure() {
|
|||||||
return PDFDocumentProxy;
|
return PDFDocumentProxy;
|
||||||
})();
|
})();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Page getTextContent parameters.
|
||||||
|
*
|
||||||
|
* @typedef {Object} getTextContentParameters
|
||||||
|
* @property {boolean} normalizeWhitespace - replaces all occurrences of
|
||||||
|
* whitespace with standard spaces (0x20). The default value is `false`.
|
||||||
|
*/
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Page text content.
|
* Page text content.
|
||||||
*
|
*
|
||||||
@ -986,12 +994,16 @@ var PDFPageProxy = (function PDFPageProxyClosure() {
|
|||||||
},
|
},
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
* @param {getTextContentParameters} params - getTextContent parameters.
|
||||||
* @return {Promise} That is resolved with a {@link TextContent}
|
* @return {Promise} That is resolved with a {@link TextContent}
|
||||||
* object that represents the page text content.
|
* object that represents the page text content.
|
||||||
*/
|
*/
|
||||||
getTextContent: function PDFPageProxy_getTextContent() {
|
getTextContent: function PDFPageProxy_getTextContent(params) {
|
||||||
|
var normalizeWhitespace = (params && params.normalizeWhitespace) || false;
|
||||||
|
|
||||||
return this.transport.messageHandler.sendWithPromise('GetTextContent', {
|
return this.transport.messageHandler.sendWithPromise('GetTextContent', {
|
||||||
pageIndex: this.pageNumber - 1
|
pageIndex: this.pageNumber - 1,
|
||||||
|
normalizeWhitespace: normalizeWhitespace,
|
||||||
});
|
});
|
||||||
},
|
},
|
||||||
|
|
||||||
|
@ -334,10 +334,12 @@ var Driver = (function DriverClosure() {
|
|||||||
textLayerContext.clearRect(0, 0,
|
textLayerContext.clearRect(0, 0,
|
||||||
textLayerCanvas.width, textLayerCanvas.height);
|
textLayerCanvas.width, textLayerCanvas.height);
|
||||||
// The text builder will draw its content on the test canvas
|
// The text builder will draw its content on the test canvas
|
||||||
initPromise = page.getTextContent().then(function(textContent) {
|
initPromise =
|
||||||
return rasterizeTextLayer(textLayerContext, viewport,
|
page.getTextContent({ normalizeWhitespace: true }).then(
|
||||||
textContent);
|
function(textContent) {
|
||||||
});
|
return rasterizeTextLayer(textLayerContext, viewport,
|
||||||
|
textContent);
|
||||||
|
});
|
||||||
} else {
|
} else {
|
||||||
textLayerCanvas = null;
|
textLayerCanvas = null;
|
||||||
initPromise = Promise.resolve();
|
initPromise = Promise.resolve();
|
||||||
|
1
test/pdfs/.gitignore
vendored
1
test/pdfs/.gitignore
vendored
@ -49,6 +49,7 @@
|
|||||||
!issue5280.pdf
|
!issue5280.pdf
|
||||||
!issue5677.pdf
|
!issue5677.pdf
|
||||||
!issue5954.pdf
|
!issue5954.pdf
|
||||||
|
!issue6612.pdf
|
||||||
!alphatrans.pdf
|
!alphatrans.pdf
|
||||||
!devicen.pdf
|
!devicen.pdf
|
||||||
!cmykjpeg.pdf
|
!cmykjpeg.pdf
|
||||||
|
BIN
test/pdfs/issue6612.pdf
Normal file
BIN
test/pdfs/issue6612.pdf
Normal file
Binary file not shown.
@ -1271,6 +1271,13 @@
|
|||||||
"link": false,
|
"link": false,
|
||||||
"type": "eq"
|
"type": "eq"
|
||||||
},
|
},
|
||||||
|
{ "id": "issue6612-text",
|
||||||
|
"file": "pdfs/issue6612.pdf",
|
||||||
|
"md5": "657f33236496916597cd70ef1222509a",
|
||||||
|
"rounds": 1,
|
||||||
|
"link": false,
|
||||||
|
"type": "text"
|
||||||
|
},
|
||||||
{ "id": "zerowidthline",
|
{ "id": "zerowidthline",
|
||||||
"file": "pdfs/zerowidthline.pdf",
|
"file": "pdfs/zerowidthline.pdf",
|
||||||
"md5": "295d26e61a85635433f8e4b768953f60",
|
"md5": "295d26e61a85635433f8e4b768953f60",
|
||||||
|
@ -482,11 +482,21 @@ describe('api', function() {
|
|||||||
});
|
});
|
||||||
});
|
});
|
||||||
it('gets text content', function () {
|
it('gets text content', function () {
|
||||||
var promise = page.getTextContent();
|
var defaultPromise = page.getTextContent();
|
||||||
waitsForPromiseResolved(promise, function (data) {
|
var normalizeWhitespacePromise = page.getTextContent({
|
||||||
expect(!!data.items).toEqual(true);
|
normalizeWhitespace: true });
|
||||||
expect(data.items.length).toEqual(7);
|
|
||||||
expect(!!data.styles).toEqual(true);
|
var promises = [
|
||||||
|
defaultPromise,
|
||||||
|
normalizeWhitespacePromise
|
||||||
|
];
|
||||||
|
waitsForPromiseResolved(Promise.all(promises), function (data) {
|
||||||
|
expect(!!data[0].items).toEqual(true);
|
||||||
|
expect(data[0].items.length).toEqual(7);
|
||||||
|
expect(!!data[0].styles).toEqual(true);
|
||||||
|
|
||||||
|
// A simple check that ensures the two `textContent` objects match.
|
||||||
|
expect(JSON.stringify(data[0])).toEqual(JSON.stringify(data[1]));
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
it('gets operator list', function() {
|
it('gets operator list', function() {
|
||||||
|
@ -66,7 +66,6 @@ var PDFFindController = (function PDFFindControllerClosure() {
|
|||||||
'\u00BC': '1/4', // Vulgar fraction one quarter
|
'\u00BC': '1/4', // Vulgar fraction one quarter
|
||||||
'\u00BD': '1/2', // Vulgar fraction one half
|
'\u00BD': '1/2', // Vulgar fraction one half
|
||||||
'\u00BE': '3/4', // Vulgar fraction three quarters
|
'\u00BE': '3/4', // Vulgar fraction three quarters
|
||||||
'\u00A0': ' ' // No-break space
|
|
||||||
};
|
};
|
||||||
this.findBar = options.findBar || null;
|
this.findBar = options.findBar || null;
|
||||||
|
|
||||||
|
@ -489,7 +489,7 @@ var PDFPageView = (function PDFPageViewClosure() {
|
|||||||
function pdfPageRenderCallback() {
|
function pdfPageRenderCallback() {
|
||||||
pageViewDrawCallback(null);
|
pageViewDrawCallback(null);
|
||||||
if (textLayer) {
|
if (textLayer) {
|
||||||
self.pdfPage.getTextContent().then(
|
self.pdfPage.getTextContent({ normalizeWhitespace: true }).then(
|
||||||
function textContentResolved(textContent) {
|
function textContentResolved(textContent) {
|
||||||
textLayer.setTextContent(textContent);
|
textLayer.setTextContent(textContent);
|
||||||
textLayer.render(TEXT_LAYER_RENDER_DELAY);
|
textLayer.render(TEXT_LAYER_RENDER_DELAY);
|
||||||
|
@ -471,7 +471,7 @@ var PDFViewer = (function pdfViewer() {
|
|||||||
if (!this.pdfDocument) {
|
if (!this.pdfDocument) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
var pageView = this._pages[pageNumber - 1];
|
var pageView = this._pages[pageNumber - 1];
|
||||||
|
|
||||||
if (this.isInPresentationMode) {
|
if (this.isInPresentationMode) {
|
||||||
@ -729,7 +729,7 @@ var PDFViewer = (function pdfViewer() {
|
|||||||
|
|
||||||
getPageTextContent: function (pageIndex) {
|
getPageTextContent: function (pageIndex) {
|
||||||
return this.pdfDocument.getPage(pageIndex + 1).then(function (page) {
|
return this.pdfDocument.getPage(pageIndex + 1).then(function (page) {
|
||||||
return page.getTextContent();
|
return page.getTextContent({ normalizeWhitespace: true });
|
||||||
});
|
});
|
||||||
},
|
},
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user