[text selection] Add the whitespaces present in the pdf in the text chunk
- It aims to fix issue #14627. - The basic idea of the recent text refactoring was to only consider the rendered, visible whitespaces. But sometimes the heuristics aren't correct, and although some whitespaces are in the text stream, they weren't in the text chunks because they were too small. Hence we added some exceptions: for example, we always add a whitespace when it is between two non-whitespace chars, but only when they are in the same Tj. So basically, this patch removes the constraint of having the chars in the same Tj (by using a circular buffer to save the last two chars) but doesn't add a space when the visible space is really too small (hence `NOT_A_SPACE_FACTOR`).
This commit is contained in:
parent
db4f3adc5e
commit
18e79e3c0b
@ -2187,17 +2187,62 @@ class PartialEvaluator {
|
||||
spaceInFlowMax: 0,
|
||||
trackingSpaceMin: Infinity,
|
||||
negativeSpaceMax: -Infinity,
|
||||
notASpace: -Infinity,
|
||||
transform: null,
|
||||
fontName: null,
|
||||
hasEOL: false,
|
||||
};
|
||||
|
||||
// Use a circular buffer (length === 2) to save the last chars in the
|
||||
// text stream.
|
||||
// This implementation of the circular buffer is using a fixed array
|
||||
// and the position of the next element:
|
||||
// function addElement(x) {
|
||||
// buffer[pos] = x;
|
||||
// pos = (pos + 1) % buffer.length;
|
||||
// }
|
||||
// It's way faster than:
|
||||
// function addElement(x) {
|
||||
// buffer.push(x);
|
||||
// buffer.shift();
|
||||
// }
|
||||
//
|
||||
// It's useful to know when we need to add a whitespace in the
|
||||
// text chunk.
|
||||
const twoLastChars = [" ", " "];
|
||||
let twoLastCharsPos = 0;
|
||||
|
||||
/**
|
||||
* Save the last char.
|
||||
* @param {string} char
|
||||
* @returns {boolean} true when the two last chars before adding the new one
|
||||
* are a non-whitespace followed by a whitespace.
|
||||
*/
|
||||
function saveLastChar(char) {
|
||||
// The buffer has only two slots, so this is the index of the other slot.
const nextPos = (twoLastCharsPos + 1) % 2;
|
||||
// The slot at `twoLastCharsPos` is the next one to be overwritten, i.e. the
// older of the two saved chars; the slot at `nextPos` was written last.
// Hence `ret` is true when the saved chars are a non-" " followed by a " ".
const ret =
|
||||
twoLastChars[twoLastCharsPos] !== " " && twoLastChars[nextPos] === " ";
|
||||
// Overwrite the older char with the new one, then make the other slot the
// next one to be overwritten.
twoLastChars[twoLastCharsPos] = char;
|
||||
twoLastCharsPos = nextPos;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
function resetLastChars() {
|
||||
// Fill both slots with " " so that saveLastChar can no longer report a
// non-space followed by a space based on chars saved before this reset.
twoLastChars[0] = twoLastChars[1] = " ";
|
||||
// Restart writing from the first slot.
twoLastCharsPos = 0;
|
||||
}
|
||||
|
||||
// Used in addFakeSpaces.
|
||||
|
||||
// A white space <= fontSize * TRACKING_SPACE_FACTOR is a tracking space
|
||||
// so it doesn't count as a space.
|
||||
const TRACKING_SPACE_FACTOR = 0.1;
|
||||
|
||||
// When a white space is <= fontSize * NOT_A_SPACE_FACTOR, there is no space
|
||||
// even if one is present in the text stream.
|
||||
const NOT_A_SPACE_FACTOR = 0.03;
|
||||
|
||||
// A negative white space < fontSize * NEGATIVE_SPACE_FACTOR induces
|
||||
// a break (a new chunk of text is created).
|
||||
// It doesn't change anything when the text is copied but
|
||||
@ -2299,6 +2344,7 @@ class PartialEvaluator {
|
||||
|
||||
textContentItem.trackingSpaceMin =
|
||||
textState.fontSize * TRACKING_SPACE_FACTOR;
|
||||
textContentItem.notASpace = textState.fontSize * NOT_A_SPACE_FACTOR;
|
||||
textContentItem.negativeSpaceMax =
|
||||
textState.fontSize * NEGATIVE_SPACE_FACTOR;
|
||||
textContentItem.spaceInFlowMin =
|
||||
@ -2483,6 +2529,7 @@ class PartialEvaluator {
|
||||
return true;
|
||||
}
|
||||
|
||||
resetLastChars();
|
||||
flushTextContentItem();
|
||||
return true;
|
||||
}
|
||||
@ -2491,6 +2538,13 @@ class PartialEvaluator {
|
||||
appendEOL();
|
||||
return true;
|
||||
}
|
||||
|
||||
if (advanceY <= textOrientation * textContentItem.notASpace) {
|
||||
// The real spacing between 2 consecutive chars is thin enough to be
|
||||
// considered a non-space.
|
||||
resetLastChars();
|
||||
}
|
||||
|
||||
if (advanceY <= textOrientation * textContentItem.trackingSpaceMin) {
|
||||
textContentItem.height += advanceY;
|
||||
} else if (
|
||||
@ -2501,6 +2555,7 @@ class PartialEvaluator {
|
||||
)
|
||||
) {
|
||||
if (textContentItem.str.length === 0) {
|
||||
resetLastChars();
|
||||
textContent.items.push({
|
||||
str: " ",
|
||||
dir: "ltr",
|
||||
@ -2532,6 +2587,10 @@ class PartialEvaluator {
|
||||
appendEOL();
|
||||
return true;
|
||||
}
|
||||
|
||||
// We're moving back so in case the last char was a whitespace
|
||||
// we cancel it: it doesn't make sense to insert it.
|
||||
resetLastChars();
|
||||
flushTextContentItem();
|
||||
return true;
|
||||
}
|
||||
@ -2541,12 +2600,19 @@ class PartialEvaluator {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (advanceX <= textOrientation * textContentItem.notASpace) {
|
||||
// The real spacing between 2 consecutive chars is thin enough to be
|
||||
// considered a non-space.
|
||||
resetLastChars();
|
||||
}
|
||||
|
||||
if (advanceX <= textOrientation * textContentItem.trackingSpaceMin) {
|
||||
textContentItem.width += advanceX;
|
||||
} else if (
|
||||
!addFakeSpaces(advanceX, textContentItem.prevTransform, textOrientation)
|
||||
) {
|
||||
if (textContentItem.str.length === 0) {
|
||||
resetLastChars();
|
||||
textContent.items.push({
|
||||
str: " ",
|
||||
dir: "ltr",
|
||||
@ -2600,14 +2666,7 @@ class PartialEvaluator {
|
||||
}
|
||||
let scaledDim = glyphWidth * scale;
|
||||
|
||||
if (
|
||||
glyph.isWhitespace &&
|
||||
(i === 0 ||
|
||||
i + 1 === ii ||
|
||||
glyphs[i - 1].isWhitespace ||
|
||||
glyphs[i + 1].isWhitespace ||
|
||||
extraSpacing)
|
||||
) {
|
||||
if (glyph.isWhitespace) {
|
||||
// Don't push a " " in the textContentItem
|
||||
// (except when it's between two non-spaces chars),
|
||||
// it will be done (if required) in next call to
|
||||
@ -2623,6 +2682,7 @@ class PartialEvaluator {
|
||||
charSpacing += -scaledDim + textState.wordSpacing;
|
||||
textState.translateTextMatrix(0, -charSpacing);
|
||||
}
|
||||
saveLastChar(" ");
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -2653,17 +2713,18 @@ class PartialEvaluator {
|
||||
textChunk.prevTransform = getCurrentTextTransform();
|
||||
}
|
||||
|
||||
if (glyph.isWhitespace) {
|
||||
let glyphUnicode = glyph.unicode;
|
||||
glyphUnicode = NormalizedUnicodes[glyphUnicode] || glyphUnicode;
|
||||
glyphUnicode = reverseIfRtl(glyphUnicode);
|
||||
if (saveLastChar(glyphUnicode)) {
|
||||
// The two last chars are a non-whitespace followed by a whitespace
|
||||
// and then this non-whitespace, so we insert a whitespace here.
|
||||
// Replaces all whitespaces with standard spaces (0x20), to avoid
|
||||
// alignment issues between the textLayer and the canvas if the text
|
||||
// contains e.g. tabs (fixes issue6612.pdf).
|
||||
textChunk.str.push(" ");
|
||||
} else {
|
||||
let glyphUnicode = glyph.unicode;
|
||||
glyphUnicode = NormalizedUnicodes[glyphUnicode] || glyphUnicode;
|
||||
glyphUnicode = reverseIfRtl(glyphUnicode);
|
||||
textChunk.str.push(glyphUnicode);
|
||||
}
|
||||
textChunk.str.push(glyphUnicode);
|
||||
|
||||
if (charSpacing) {
|
||||
if (!font.vertical) {
|
||||
@ -2679,6 +2740,7 @@ class PartialEvaluator {
|
||||
}
|
||||
|
||||
function appendEOL() {
|
||||
resetLastChars();
|
||||
if (textContentItem.initialized) {
|
||||
textContentItem.hasEOL = true;
|
||||
flushTextContentItem();
|
||||
@ -2701,6 +2763,7 @@ class PartialEvaluator {
|
||||
width <= textOrientation * textContentItem.spaceInFlowMax
|
||||
) {
|
||||
if (textContentItem.initialized) {
|
||||
resetLastChars();
|
||||
textContentItem.str.push(" ");
|
||||
}
|
||||
return false;
|
||||
@ -2715,6 +2778,7 @@ class PartialEvaluator {
|
||||
}
|
||||
|
||||
flushTextContentItem();
|
||||
resetLastChars();
|
||||
textContent.items.push({
|
||||
str: " ",
|
||||
// TODO: check if using the orientation from last chunk is
|
||||
|
1
test/pdfs/.gitignore
vendored
1
test/pdfs/.gitignore
vendored
@ -515,3 +515,4 @@
|
||||
!issue14497.pdf
|
||||
!issue14502.pdf
|
||||
!issue13211.pdf
|
||||
!issue14627.pdf
|
||||
|
BIN
test/pdfs/issue14627.pdf
Executable file
BIN
test/pdfs/issue14627.pdf
Executable file
Binary file not shown.
@ -6329,5 +6329,11 @@
|
||||
"md5": "d193853e8a123dc50eeea593a4150b60",
|
||||
"rounds": 1,
|
||||
"type": "eq"
|
||||
},
|
||||
{ "id": "issue14627",
|
||||
"file": "pdfs/issue14627.pdf",
|
||||
"md5": "5d1bfcc3b3130bfa7e33e43990e2213a",
|
||||
"rounds": 1,
|
||||
"type": "text"
|
||||
}
|
||||
]
|
||||
|
@ -1999,7 +1999,7 @@ describe("api", function () {
|
||||
const data = await Promise.all([defaultPromise, parametersPromise]);
|
||||
|
||||
expect(!!data[0].items).toEqual(true);
|
||||
expect(data[0].items.length).toEqual(11);
|
||||
expect(data[0].items.length).toEqual(15);
|
||||
expect(!!data[0].styles).toEqual(true);
|
||||
|
||||
const page1 = mergeText(data[0].items);
|
||||
|
@ -579,14 +579,14 @@ describe("pdf_find_controller", function () {
|
||||
},
|
||||
pageMatches: [
|
||||
[
|
||||
299, 337, 414, 476, 623, 797, 978, 984, 1010, 1058, 1079, 1144, 1152,
|
||||
1274, 1343, 1391, 1399, 1421, 1497, 1521, 1527, 1684, 1774, 1786,
|
||||
1857, 1879, 1909, 1946, 2064, 2074, 2161, 2178, 2213, 2227, 2272,
|
||||
2322, 2359, 2401, 2412, 2423, 2462, 2532, 2538, 2553, 2562, 2576,
|
||||
2602, 2613, 2638, 2668, 2792, 2805, 2836, 2848, 2859, 2896, 2902,
|
||||
2916, 2940, 2960, 3091, 3239, 3249, 3339, 3387, 3394, 3468, 3477,
|
||||
3485, 3502, 3690, 3696, 3711, 3758, 3789, 3865, 3977, 4052, 4058,
|
||||
4071,
|
||||
302, 340, 418, 481, 628, 802, 983, 989, 1015, 1063, 1084, 1149, 1157,
|
||||
1278, 1346, 1394, 1402, 1424, 1500, 1524, 1530, 1686, 1776, 1788,
|
||||
1859, 1881, 1911, 1948, 2066, 2076, 2163, 2180, 2215, 2229, 2274,
|
||||
2324, 2360, 2402, 2413, 2424, 2463, 2532, 2538, 2553, 2562, 2576,
|
||||
2602, 2613, 2638, 2668, 2792, 2805, 2836, 2847, 2858, 2895, 2901,
|
||||
2915, 2939, 2959, 3089, 3236, 3246, 3336, 3384, 3391, 3465, 3474,
|
||||
3482, 3499, 3687, 3693, 3708, 3755, 3786, 3862, 3974, 4049, 4055,
|
||||
4068,
|
||||
],
|
||||
],
|
||||
pageMatchesLength: [
|
||||
|
Loading…
Reference in New Issue
Block a user