[text selection] Add the whitespaces present in the pdf in the text chunk

- it aims to fix issue #14627;
- the basic idea of the recent text refactoring was to only consider the rendered visible whitespaces.
  But sometimes, the heuristics aren't correct and although some whitespaces are in the text stream
  they weren't in the text chunks because they were too small. Hence we added some exceptions, for example,
  we always add a whitespace when it is between two non-whitespace chars but only when in the same Tj.
  So basically, this patch removes the constraint to have the chars in the same Tj
  (by using a circular buffer to save the last two chars) but doesn't add a space when the visible space is really
  too small (hence `NOT_A_SPACE_FACTOR`).
This commit is contained in:
Calixte Denizet 2022-03-21 22:10:46 +01:00
parent db4f3adc5e
commit 18e79e3c0b
6 changed files with 94 additions and 23 deletions

View File

@ -2187,17 +2187,62 @@ class PartialEvaluator {
spaceInFlowMax: 0,
trackingSpaceMin: Infinity,
negativeSpaceMax: -Infinity,
notASpace: -Infinity,
transform: null,
fontName: null,
hasEOL: false,
};
// Use a circular buffer (length === 2) to save the last chars in the
// text stream.
// This implementation of the circular buffer is using a fixed array
// and the position of the next element:
// function addElement(x) {
// buffer[pos] = x;
// pos = (pos + 1) % buffer.length;
// }
// It's way faster than:
// function addElement(x) {
// buffer.push(x);
// buffer.shift();
// }
//
// It's useful to know when we need to add a whitespace in the
// text chunk.
// Two-slot ring holding the most recent chars seen in the text stream.
// `twoLastCharsPos` always points at the slot that will be overwritten next,
// i.e. the slot currently holding the older of the two saved chars.
const twoLastChars = [" ", " "];
let twoLastCharsPos = 0;

/**
 * Push a char into the two-slot ring, evicting the older saved char.
 * @param {string} char - The char to remember.
 * @returns {boolean} true if, before this call, the ring contained a
 *   non-whitespace char followed by a whitespace (" ").
 */
function saveLastChar(char) {
  const olderSlot = twoLastCharsPos;
  // With only two slots, the other index is simply the complement.
  const newerSlot = 1 - olderSlot;
  const older = twoLastChars[olderSlot];
  const newer = twoLastChars[newerSlot];

  // Overwrite the older entry; the previously newer one becomes the older.
  twoLastChars[olderSlot] = char;
  twoLastCharsPos = newerSlot;

  return older !== " " && newer === " ";
}

/**
 * Forget the saved chars: both slots become whitespace again and the write
 * position goes back to the first slot.
 */
function resetLastChars() {
  twoLastChars.fill(" ");
  twoLastCharsPos = 0;
}
// Used in addFakeSpaces.
// A white gap (the advance between two glyphs) that is
// <= fontSize * TRACKING_SPACE_FACTOR is a tracking space, so it doesn't
// count as a space: the advance is simply added to the current item's
// width (or height).
const TRACKING_SPACE_FACTOR = 0.1;
// When a white gap is <= fontSize * NOT_A_SPACE_FACTOR, there is no space
// even if one is present in the text stream: the saved last chars are reset
// so that no whitespace gets inserted in the text chunk for it.
const NOT_A_SPACE_FACTOR = 0.03;
// A negative white < fontSize * NEGATIVE_SPACE_FACTOR induces
// a break (a new chunk of text is created).
// It doesn't change anything when the text is copied but
@ -2299,6 +2344,7 @@ class PartialEvaluator {
textContentItem.trackingSpaceMin =
textState.fontSize * TRACKING_SPACE_FACTOR;
textContentItem.notASpace = textState.fontSize * NOT_A_SPACE_FACTOR;
textContentItem.negativeSpaceMax =
textState.fontSize * NEGATIVE_SPACE_FACTOR;
textContentItem.spaceInFlowMin =
@ -2483,6 +2529,7 @@ class PartialEvaluator {
return true;
}
resetLastChars();
flushTextContentItem();
return true;
}
@ -2491,6 +2538,13 @@ class PartialEvaluator {
appendEOL();
return true;
}
if (advanceY <= textOrientation * textContentItem.notASpace) {
// The real spacing between 2 consecutive chars is thin enough to be
// considered a non-space.
resetLastChars();
}
if (advanceY <= textOrientation * textContentItem.trackingSpaceMin) {
textContentItem.height += advanceY;
} else if (
@ -2501,6 +2555,7 @@ class PartialEvaluator {
)
) {
if (textContentItem.str.length === 0) {
resetLastChars();
textContent.items.push({
str: " ",
dir: "ltr",
@ -2532,6 +2587,10 @@ class PartialEvaluator {
appendEOL();
return true;
}
// We're moving back so in case the last char was a whitespace
// we cancel it: it doesn't make sense to insert it.
resetLastChars();
flushTextContentItem();
return true;
}
@ -2541,12 +2600,19 @@ class PartialEvaluator {
return true;
}
if (advanceX <= textOrientation * textContentItem.notASpace) {
// The real spacing between 2 consecutive chars is thin enough to be
// considered a non-space.
resetLastChars();
}
if (advanceX <= textOrientation * textContentItem.trackingSpaceMin) {
textContentItem.width += advanceX;
} else if (
!addFakeSpaces(advanceX, textContentItem.prevTransform, textOrientation)
) {
if (textContentItem.str.length === 0) {
resetLastChars();
textContent.items.push({
str: " ",
dir: "ltr",
@ -2600,14 +2666,7 @@ class PartialEvaluator {
}
let scaledDim = glyphWidth * scale;
if (
glyph.isWhitespace &&
(i === 0 ||
i + 1 === ii ||
glyphs[i - 1].isWhitespace ||
glyphs[i + 1].isWhitespace ||
extraSpacing)
) {
if (glyph.isWhitespace) {
// Don't push a " " in the textContentItem
// (except when it's between two non-spaces chars),
// it will be done (if required) in next call to
@ -2623,6 +2682,7 @@ class PartialEvaluator {
charSpacing += -scaledDim + textState.wordSpacing;
textState.translateTextMatrix(0, -charSpacing);
}
saveLastChar(" ");
continue;
}
@ -2653,17 +2713,18 @@ class PartialEvaluator {
textChunk.prevTransform = getCurrentTextTransform();
}
if (glyph.isWhitespace) {
let glyphUnicode = glyph.unicode;
glyphUnicode = NormalizedUnicodes[glyphUnicode] || glyphUnicode;
glyphUnicode = reverseIfRtl(glyphUnicode);
if (saveLastChar(glyphUnicode)) {
// The two last chars are a non-whitespace followed by a whitespace
// and then this non-whitespace, so we insert a whitespace here.
// Replaces all whitespaces with standard spaces (0x20), to avoid
// alignment issues between the textLayer and the canvas if the text
// contains e.g. tabs (fixes issue6612.pdf).
textChunk.str.push(" ");
} else {
let glyphUnicode = glyph.unicode;
glyphUnicode = NormalizedUnicodes[glyphUnicode] || glyphUnicode;
glyphUnicode = reverseIfRtl(glyphUnicode);
textChunk.str.push(glyphUnicode);
}
textChunk.str.push(glyphUnicode);
if (charSpacing) {
if (!font.vertical) {
@ -2679,6 +2740,7 @@ class PartialEvaluator {
}
function appendEOL() {
resetLastChars();
if (textContentItem.initialized) {
textContentItem.hasEOL = true;
flushTextContentItem();
@ -2701,6 +2763,7 @@ class PartialEvaluator {
width <= textOrientation * textContentItem.spaceInFlowMax
) {
if (textContentItem.initialized) {
resetLastChars();
textContentItem.str.push(" ");
}
return false;
@ -2715,6 +2778,7 @@ class PartialEvaluator {
}
flushTextContentItem();
resetLastChars();
textContent.items.push({
str: " ",
// TODO: check if using the orientation from last chunk is

View File

@ -515,3 +515,4 @@
!issue14497.pdf
!issue14502.pdf
!issue13211.pdf
!issue14627.pdf

BIN
test/pdfs/issue14627.pdf Executable file

Binary file not shown.

View File

@ -6329,5 +6329,11 @@
"md5": "d193853e8a123dc50eeea593a4150b60",
"rounds": 1,
"type": "eq"
},
{ "id": "issue14627",
"file": "pdfs/issue14627.pdf",
"md5": "5d1bfcc3b3130bfa7e33e43990e2213a",
"rounds": 1,
"type": "text"
}
]

View File

@ -1999,7 +1999,7 @@ describe("api", function () {
const data = await Promise.all([defaultPromise, parametersPromise]);
expect(!!data[0].items).toEqual(true);
expect(data[0].items.length).toEqual(11);
expect(data[0].items.length).toEqual(15);
expect(!!data[0].styles).toEqual(true);
const page1 = mergeText(data[0].items);

View File

@ -579,14 +579,14 @@ describe("pdf_find_controller", function () {
},
pageMatches: [
[
299, 337, 414, 476, 623, 797, 978, 984, 1010, 1058, 1079, 1144, 1152,
1274, 1343, 1391, 1399, 1421, 1497, 1521, 1527, 1684, 1774, 1786,
1857, 1879, 1909, 1946, 2064, 2074, 2161, 2178, 2213, 2227, 2272,
2322, 2359, 2401, 2412, 2423, 2462, 2532, 2538, 2553, 2562, 2576,
2602, 2613, 2638, 2668, 2792, 2805, 2836, 2848, 2859, 2896, 2902,
2916, 2940, 2960, 3091, 3239, 3249, 3339, 3387, 3394, 3468, 3477,
3485, 3502, 3690, 3696, 3711, 3758, 3789, 3865, 3977, 4052, 4058,
4071,
302, 340, 418, 481, 628, 802, 983, 989, 1015, 1063, 1084, 1149, 1157,
1278, 1346, 1394, 1402, 1424, 1500, 1524, 1530, 1686, 1776, 1788,
1859, 1881, 1911, 1948, 2066, 2076, 2163, 2180, 2215, 2229, 2274,
2324, 2360, 2402, 2413, 2424, 2463, 2532, 2538, 2553, 2562, 2576,
2602, 2613, 2638, 2668, 2792, 2805, 2836, 2847, 2858, 2895, 2901,
2915, 2939, 2959, 3089, 3236, 3246, 3336, 3384, 3391, 3465, 3474,
3482, 3499, 3687, 3693, 3708, 3755, 3786, 3862, 3974, 4049, 4055,
4068,
],
],
pageMatchesLength: [