Attempt to combine separate beginText/endText sequences in getTextContent (issue 9984)

Please note that while this *improves* issue 9984 slightly (and likely others too), it's not a complete solution.
The remaining issues stem from the more general problems with the existing heuristics that attempt to combine separate text items.
This commit is contained in:
Jonas Jenwald 2018-08-18 13:28:40 +02:00
parent 160ca55163
commit 497b765ede
4 changed files with 47 additions and 7 deletions

View File

@ -1512,6 +1512,17 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
textContentItem.str.length = 0;
}
/**
 * Tells whether a Tf (setFont) operator would re-select the font that the
 * current `textState` already has active.
 *
 * @param {string} name - Font name argument of the setFont operator.
 * @param {number} size - Font size argument of the setFont operator.
 * @returns {boolean} `true` when a font is set and both name and size are
 *   strictly equal to the current ones; a falsy value otherwise (the falsy
 *   `textState.font` value itself when no font is set).
 */
function isIdenticalSetFont(name, size) {
  // Without an active font there is nothing to compare against.
  if (!textState.font) {
    return textState.font;
  }
  if (textState.fontName !== name) {
    return false;
  }
  return textState.fontSize === size;
}
/**
 * Performs the work associated with a BT (beginText) operator: flushes the
 * pending text content item, then resets both the text matrix and the text
 * line matrix of the current `textState` to fresh copies of the identity
 * matrix.
 */
function handleBeginText() {
  // Flush first, so it runs before the matrices are overwritten below.
  flushTextContentItem();
  // Each property gets its own copy, so the two matrices never share an array
  // with each other or with IDENTITY_MATRIX.
  for (const matrixKey of ['textMatrix', 'textLineMatrix']) {
    textState[matrixKey] = IDENTITY_MATRIX.slice();
  }
}
function enqueueChunk() {
let length = textContent.items.length;
if (length > 0) {
@ -1537,6 +1548,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
task.ensureNotTerminated();
timeSlotManager.reset();
var stop, operation = {}, args = [];
let pendingBeginText = false;
while (!(stop = timeSlotManager.check())) {
// The arguments parsed by read() are not used beyond this loop, so
// we can reuse the same array on every iteration, thus avoiding
@ -1547,16 +1559,30 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
break;
}
textState = stateManager.state;
var fn = operation.fn;
var fn = operation.fn | 0;
args = operation.args;
var advance, diff;
switch (fn | 0) {
if (pendingBeginText) {
if (fn === OPS.setFont) {
const fontNameArg = args[0].name, fontSizeArg = args[1];
// For multiple identical Tf (setFont) commands, first check if
// the following command is Tm (setTextMatrix) before continuing.
if (isIdenticalSetFont(fontNameArg, fontSizeArg)) {
continue;
}
}
if (fn !== OPS.setTextMatrix) {
handleBeginText();
}
pendingBeginText = false;
}
switch (fn) {
case OPS.setFont:
// Optimization to ignore multiple identical Tf commands.
var fontNameArg = args[0].name, fontSizeArg = args[1];
if (textState.font && fontNameArg === textState.fontName &&
fontSizeArg === textState.fontSize) {
if (isIdenticalSetFont(fontNameArg, fontSizeArg)) {
break;
}
@ -1644,9 +1670,15 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
textState.wordSpacing = args[0];
break;
case OPS.beginText:
flushTextContentItem();
textState.textMatrix = IDENTITY_MATRIX.slice();
textState.textLineMatrix = IDENTITY_MATRIX.slice();
// Optimization to attempt to combine separate BT/ET sequences,
// by checking the next operator(s) before flushing text content
// and resetting the text/textLine matrices (see above).
if (combineTextItems) {
pendingBeginText = true;
break;
}
handleBeginText();
break;
case OPS.showSpacedText:
var items = args[0];

View File

@ -72,6 +72,7 @@
!issue9458.pdf
!issue9915_reduced.pdf
!issue9940.pdf
!issue9984.pdf
!bad-PageLabels.pdf
!decodeACSuccessive.pdf
!filled-background.pdf

BIN
test/pdfs/issue9984.pdf Normal file

Binary file not shown.

View File

@ -1352,6 +1352,13 @@
"link": false,
"type": "eq"
},
{ "id": "issue9984-text",
"file": "pdfs/issue9984.pdf",
"md5": "41be5f1b43f61892978cfc57c74ccf4c",
"rounds": 1,
"link": false,
"type": "text"
},
{ "id": "issue8570",
"file": "pdfs/issue8570.pdf",
"md5": "0355731adb72df233eaa10464dcc8c51",