Merge pull request #14703 from calixteman/14627

[text selection] Add the whitespaces present in the pdf in the text chunk
This commit is contained in:
Jonas Jenwald 2022-03-27 15:20:19 +02:00 committed by GitHub
commit 0dd6bc9a85
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 94 additions and 23 deletions

View File

@ -2187,17 +2187,62 @@ class PartialEvaluator {
spaceInFlowMax: 0,
trackingSpaceMin: Infinity,
negativeSpaceMax: -Infinity,
notASpace: -Infinity,
transform: null,
fontName: null,
hasEOL: false,
};
// Use a circular buffer (length === 2) to save the last chars in the
// text stream.
// This implementation of the circular buffer is using a fixed array
// and the position of the next element:
// function addElement(x) {
// buffer[pos] = x;
// pos = (pos + 1) % buffer.length;
// }
// It's way faster than:
// function addElement(x) {
// buffer.push(x);
// buffer.shift();
// }
//
// It's useful to know when we need to add a whitespace in the
// text chunk.
const twoLastChars = [" ", " "];
let twoLastCharsPos = 0;
/**
 * Record a char in the two-slot history of the text stream.
 * @param {string} char - The char to store; it overwrites the older of the
 *   two saved chars.
 * @returns {boolean} true when, before storing `char`, the older saved char
 *   was not " " and the more recent one was " ".
 */
function saveLastChar(char) {
  // `twoLastCharsPos` designates the slot to overwrite next, i.e. the one
  // holding the older char; the other slot holds the more recent one.
  const olderSlot = twoLastCharsPos;
  const newerSlot = (olderSlot + 1) % 2;
  const olderChar = twoLastChars[olderSlot];
  const newerChar = twoLastChars[newerSlot];

  // Drop the older char and move the write position to the other slot.
  twoLastChars[olderSlot] = char;
  twoLastCharsPos = newerSlot;

  return newerChar === " " && olderChar !== " ";
}
/**
 * Forget the saved chars: both slots of the history are set back to " "
 * and the write position goes back to the first slot.
 */
function resetLastChars() {
  twoLastChars.fill(" ");
  twoLastCharsPos = 0;
}
// Used in addFakeSpaces.
// A white <= fontSize * TRACKING_SPACE_FACTOR is a tracking space
// so it doesn't count as a space.
const TRACKING_SPACE_FACTOR = 0.1;
// When a white <= fontSize * NOT_A_SPACE_FACTOR, there is no space
// even if one is present in the text stream.
const NOT_A_SPACE_FACTOR = 0.03;
// A negative white < fontSize * NEGATIVE_SPACE_FACTOR induces
// a break (a new chunk of text is created).
// It doesn't change anything when the text is copied but
@ -2299,6 +2344,7 @@ class PartialEvaluator {
textContentItem.trackingSpaceMin =
textState.fontSize * TRACKING_SPACE_FACTOR;
textContentItem.notASpace = textState.fontSize * NOT_A_SPACE_FACTOR;
textContentItem.negativeSpaceMax =
textState.fontSize * NEGATIVE_SPACE_FACTOR;
textContentItem.spaceInFlowMin =
@ -2483,6 +2529,7 @@ class PartialEvaluator {
return true;
}
resetLastChars();
flushTextContentItem();
return true;
}
@ -2491,6 +2538,13 @@ class PartialEvaluator {
appendEOL();
return true;
}
if (advanceY <= textOrientation * textContentItem.notASpace) {
// The real spacing between 2 consecutive chars is thin enough to be
// considered a non-space.
resetLastChars();
}
if (advanceY <= textOrientation * textContentItem.trackingSpaceMin) {
textContentItem.height += advanceY;
} else if (
@ -2501,6 +2555,7 @@ class PartialEvaluator {
)
) {
if (textContentItem.str.length === 0) {
resetLastChars();
textContent.items.push({
str: " ",
dir: "ltr",
@ -2532,6 +2587,10 @@ class PartialEvaluator {
appendEOL();
return true;
}
// We're moving back so in case the last char was a whitespace
// we cancel it: it doesn't make sense to insert it.
resetLastChars();
flushTextContentItem();
return true;
}
@ -2541,12 +2600,19 @@ class PartialEvaluator {
return true;
}
if (advanceX <= textOrientation * textContentItem.notASpace) {
// The real spacing between 2 consecutive chars is thin enough to be
// considered a non-space.
resetLastChars();
}
if (advanceX <= textOrientation * textContentItem.trackingSpaceMin) {
textContentItem.width += advanceX;
} else if (
!addFakeSpaces(advanceX, textContentItem.prevTransform, textOrientation)
) {
if (textContentItem.str.length === 0) {
resetLastChars();
textContent.items.push({
str: " ",
dir: "ltr",
@ -2600,14 +2666,7 @@ class PartialEvaluator {
}
let scaledDim = glyphWidth * scale;
if (
glyph.isWhitespace &&
(i === 0 ||
i + 1 === ii ||
glyphs[i - 1].isWhitespace ||
glyphs[i + 1].isWhitespace ||
extraSpacing)
) {
if (glyph.isWhitespace) {
// Don't push a " " in the textContentItem
// (except when it's between two non-spaces chars),
// it will be done (if required) in next call to
@ -2623,6 +2682,7 @@ class PartialEvaluator {
charSpacing += -scaledDim + textState.wordSpacing;
textState.translateTextMatrix(0, -charSpacing);
}
saveLastChar(" ");
continue;
}
@ -2653,17 +2713,18 @@ class PartialEvaluator {
textChunk.prevTransform = getCurrentTextTransform();
}
if (glyph.isWhitespace) {
let glyphUnicode = glyph.unicode;
glyphUnicode = NormalizedUnicodes[glyphUnicode] || glyphUnicode;
glyphUnicode = reverseIfRtl(glyphUnicode);
if (saveLastChar(glyphUnicode)) {
// The two last chars are a non-whitespace followed by a whitespace
// and then this non-whitespace, so we insert a whitespace here.
// Replaces all whitespaces with standard spaces (0x20), to avoid
// alignment issues between the textLayer and the canvas if the text
// contains e.g. tabs (fixes issue6612.pdf).
textChunk.str.push(" ");
} else {
let glyphUnicode = glyph.unicode;
glyphUnicode = NormalizedUnicodes[glyphUnicode] || glyphUnicode;
glyphUnicode = reverseIfRtl(glyphUnicode);
textChunk.str.push(glyphUnicode);
}
textChunk.str.push(glyphUnicode);
if (charSpacing) {
if (!font.vertical) {
@ -2679,6 +2740,7 @@ class PartialEvaluator {
}
function appendEOL() {
resetLastChars();
if (textContentItem.initialized) {
textContentItem.hasEOL = true;
flushTextContentItem();
@ -2701,6 +2763,7 @@ class PartialEvaluator {
width <= textOrientation * textContentItem.spaceInFlowMax
) {
if (textContentItem.initialized) {
resetLastChars();
textContentItem.str.push(" ");
}
return false;
@ -2715,6 +2778,7 @@ class PartialEvaluator {
}
flushTextContentItem();
resetLastChars();
textContent.items.push({
str: " ",
// TODO: check if using the orientation from last chunk is

View File

@ -515,3 +515,4 @@
!issue14497.pdf
!issue14502.pdf
!issue13211.pdf
!issue14627.pdf

BIN
test/pdfs/issue14627.pdf Executable file

Binary file not shown.

View File

@ -6329,5 +6329,11 @@
"md5": "d193853e8a123dc50eeea593a4150b60",
"rounds": 1,
"type": "eq"
},
{ "id": "issue14627",
"file": "pdfs/issue14627.pdf",
"md5": "5d1bfcc3b3130bfa7e33e43990e2213a",
"rounds": 1,
"type": "text"
}
]

View File

@ -1999,7 +1999,7 @@ describe("api", function () {
const data = await Promise.all([defaultPromise, parametersPromise]);
expect(!!data[0].items).toEqual(true);
expect(data[0].items.length).toEqual(11);
expect(data[0].items.length).toEqual(15);
expect(!!data[0].styles).toEqual(true);
const page1 = mergeText(data[0].items);

View File

@ -579,14 +579,14 @@ describe("pdf_find_controller", function () {
},
pageMatches: [
[
299, 337, 414, 476, 623, 797, 978, 984, 1010, 1058, 1079, 1144, 1152,
1274, 1343, 1391, 1399, 1421, 1497, 1521, 1527, 1684, 1774, 1786,
1857, 1879, 1909, 1946, 2064, 2074, 2161, 2178, 2213, 2227, 2272,
2322, 2359, 2401, 2412, 2423, 2462, 2532, 2538, 2553, 2562, 2576,
2602, 2613, 2638, 2668, 2792, 2805, 2836, 2848, 2859, 2896, 2902,
2916, 2940, 2960, 3091, 3239, 3249, 3339, 3387, 3394, 3468, 3477,
3485, 3502, 3690, 3696, 3711, 3758, 3789, 3865, 3977, 4052, 4058,
4071,
302, 340, 418, 481, 628, 802, 983, 989, 1015, 1063, 1084, 1149, 1157,
1278, 1346, 1394, 1402, 1424, 1500, 1524, 1530, 1686, 1776, 1788,
1859, 1881, 1911, 1948, 2066, 2076, 2163, 2180, 2215, 2229, 2274,
2324, 2360, 2402, 2413, 2424, 2463, 2532, 2538, 2553, 2562, 2576,
2602, 2613, 2638, 2668, 2792, 2805, 2836, 2847, 2858, 2895, 2901,
2915, 2939, 2959, 3089, 3236, 3246, 3336, 3384, 3391, 3465, 3474,
3482, 3499, 3687, 3693, 3708, 3755, 3786, 3862, 3974, 4049, 4055,
4068,
],
],
pageMatchesLength: [