Merge pull request #14703 from calixteman/14627
[text selection] Add the whitespaces present in the pdf in the text chunk
This commit is contained in:
commit
0dd6bc9a85
@ -2187,17 +2187,62 @@ class PartialEvaluator {
|
||||
spaceInFlowMax: 0,
|
||||
trackingSpaceMin: Infinity,
|
||||
negativeSpaceMax: -Infinity,
|
||||
notASpace: -Infinity,
|
||||
transform: null,
|
||||
fontName: null,
|
||||
hasEOL: false,
|
||||
};
|
||||
|
||||
// Use a circular buffer (length === 2) to save the last chars in the
|
||||
// text stream.
|
||||
// This implementation of the circular buffer is using a fixed array
|
||||
// and the position of the next element:
|
||||
// function addElement(x) {
|
||||
// buffer[pos] = x;
|
||||
// pos = (pos + 1) % buffer.length;
|
||||
// }
|
||||
// It's way faster than:
|
||||
// function addElement(x) {
|
||||
// buffer.push(x);
|
||||
// buffer.shift();
|
||||
// }
|
||||
//
|
||||
// It's useful to know when we need to add a whitespace in the
|
||||
// text chunk.
|
||||
const twoLastChars = [" ", " "];
|
||||
let twoLastCharsPos = 0;
|
||||
|
||||
/**
 * Store a char in the two-slot history of the text stream.
 *
 * The slot at `twoLastCharsPos` holds the older of the two saved chars and
 * is the one overwritten; the other slot holds the most recently saved one.
 *
 * @param {string} char - The char to remember.
 * @returns {boolean} true when, before `char` was stored, the history held
 *   a non-whitespace char followed by a whitespace (" ").
 */
function saveLastChar(char) {
  const olderPos = twoLastCharsPos;
  const newerPos = (olderPos + 1) % 2;
  const olderChar = twoLastChars[olderPos];
  const newerChar = twoLastChars[newerPos];

  // Overwrite the older entry, then make the other slot the next target.
  twoLastChars[olderPos] = char;
  twoLastCharsPos = newerPos;

  return olderChar !== " " && newerChar === " ";
}
|
||||
|
||||
/**
 * Forget the saved chars: both history slots become " " again and the
 * write position goes back to the first slot.
 */
function resetLastChars() {
  twoLastCharsPos = 0;
  twoLastChars.fill(" ");
}
|
||||
|
||||
// Used in addFakeSpaces.
|
||||
|
||||
// A white space <= fontSize * TRACKING_SPACE_FACTOR is a tracking space
|
||||
// so it doesn't count as a space.
|
||||
const TRACKING_SPACE_FACTOR = 0.1;
|
||||
|
||||
// When a white space <= fontSize * NOT_A_SPACE_FACTOR, there is no space
|
||||
// even if one is present in the text stream.
|
||||
const NOT_A_SPACE_FACTOR = 0.03;
|
||||
|
||||
// A negative white space < fontSize * NEGATIVE_SPACE_FACTOR induces
|
||||
// a break (a new chunk of text is created).
|
||||
// It doesn't change anything when the text is copied but
|
||||
@ -2299,6 +2344,7 @@ class PartialEvaluator {
|
||||
|
||||
textContentItem.trackingSpaceMin =
|
||||
textState.fontSize * TRACKING_SPACE_FACTOR;
|
||||
textContentItem.notASpace = textState.fontSize * NOT_A_SPACE_FACTOR;
|
||||
textContentItem.negativeSpaceMax =
|
||||
textState.fontSize * NEGATIVE_SPACE_FACTOR;
|
||||
textContentItem.spaceInFlowMin =
|
||||
@ -2483,6 +2529,7 @@ class PartialEvaluator {
|
||||
return true;
|
||||
}
|
||||
|
||||
resetLastChars();
|
||||
flushTextContentItem();
|
||||
return true;
|
||||
}
|
||||
@ -2491,6 +2538,13 @@ class PartialEvaluator {
|
||||
appendEOL();
|
||||
return true;
|
||||
}
|
||||
|
||||
if (advanceY <= textOrientation * textContentItem.notASpace) {
|
||||
// The real spacing between 2 consecutive chars is thin enough to be
|
||||
// considered a non-space.
|
||||
resetLastChars();
|
||||
}
|
||||
|
||||
if (advanceY <= textOrientation * textContentItem.trackingSpaceMin) {
|
||||
textContentItem.height += advanceY;
|
||||
} else if (
|
||||
@ -2501,6 +2555,7 @@ class PartialEvaluator {
|
||||
)
|
||||
) {
|
||||
if (textContentItem.str.length === 0) {
|
||||
resetLastChars();
|
||||
textContent.items.push({
|
||||
str: " ",
|
||||
dir: "ltr",
|
||||
@ -2532,6 +2587,10 @@ class PartialEvaluator {
|
||||
appendEOL();
|
||||
return true;
|
||||
}
|
||||
|
||||
// We're moving back so in case the last char was a whitespace
|
||||
// we cancel it: it doesn't make sense to insert it.
|
||||
resetLastChars();
|
||||
flushTextContentItem();
|
||||
return true;
|
||||
}
|
||||
@ -2541,12 +2600,19 @@ class PartialEvaluator {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (advanceX <= textOrientation * textContentItem.notASpace) {
|
||||
// The real spacing between 2 consecutive chars is thin enough to be
|
||||
// considered a non-space.
|
||||
resetLastChars();
|
||||
}
|
||||
|
||||
if (advanceX <= textOrientation * textContentItem.trackingSpaceMin) {
|
||||
textContentItem.width += advanceX;
|
||||
} else if (
|
||||
!addFakeSpaces(advanceX, textContentItem.prevTransform, textOrientation)
|
||||
) {
|
||||
if (textContentItem.str.length === 0) {
|
||||
resetLastChars();
|
||||
textContent.items.push({
|
||||
str: " ",
|
||||
dir: "ltr",
|
||||
@ -2600,14 +2666,7 @@ class PartialEvaluator {
|
||||
}
|
||||
let scaledDim = glyphWidth * scale;
|
||||
|
||||
if (
|
||||
glyph.isWhitespace &&
|
||||
(i === 0 ||
|
||||
i + 1 === ii ||
|
||||
glyphs[i - 1].isWhitespace ||
|
||||
glyphs[i + 1].isWhitespace ||
|
||||
extraSpacing)
|
||||
) {
|
||||
if (glyph.isWhitespace) {
|
||||
// Don't push a " " in the textContentItem
|
||||
// (except when it's between two non-spaces chars),
|
||||
// it will be done (if required) in next call to
|
||||
@ -2623,6 +2682,7 @@ class PartialEvaluator {
|
||||
charSpacing += -scaledDim + textState.wordSpacing;
|
||||
textState.translateTextMatrix(0, -charSpacing);
|
||||
}
|
||||
saveLastChar(" ");
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -2653,17 +2713,18 @@ class PartialEvaluator {
|
||||
textChunk.prevTransform = getCurrentTextTransform();
|
||||
}
|
||||
|
||||
if (glyph.isWhitespace) {
|
||||
let glyphUnicode = glyph.unicode;
|
||||
glyphUnicode = NormalizedUnicodes[glyphUnicode] || glyphUnicode;
|
||||
glyphUnicode = reverseIfRtl(glyphUnicode);
|
||||
if (saveLastChar(glyphUnicode)) {
|
||||
// The two last chars are a non-whitespace followed by a whitespace
|
||||
// and then this non-whitespace, so we insert a whitespace here.
|
||||
// Replaces all whitespaces with standard spaces (0x20), to avoid
|
||||
// alignment issues between the textLayer and the canvas if the text
|
||||
// contains e.g. tabs (fixes issue6612.pdf).
|
||||
textChunk.str.push(" ");
|
||||
} else {
|
||||
let glyphUnicode = glyph.unicode;
|
||||
glyphUnicode = NormalizedUnicodes[glyphUnicode] || glyphUnicode;
|
||||
glyphUnicode = reverseIfRtl(glyphUnicode);
|
||||
textChunk.str.push(glyphUnicode);
|
||||
}
|
||||
textChunk.str.push(glyphUnicode);
|
||||
|
||||
if (charSpacing) {
|
||||
if (!font.vertical) {
|
||||
@ -2679,6 +2740,7 @@ class PartialEvaluator {
|
||||
}
|
||||
|
||||
function appendEOL() {
|
||||
resetLastChars();
|
||||
if (textContentItem.initialized) {
|
||||
textContentItem.hasEOL = true;
|
||||
flushTextContentItem();
|
||||
@ -2701,6 +2763,7 @@ class PartialEvaluator {
|
||||
width <= textOrientation * textContentItem.spaceInFlowMax
|
||||
) {
|
||||
if (textContentItem.initialized) {
|
||||
resetLastChars();
|
||||
textContentItem.str.push(" ");
|
||||
}
|
||||
return false;
|
||||
@ -2715,6 +2778,7 @@ class PartialEvaluator {
|
||||
}
|
||||
|
||||
flushTextContentItem();
|
||||
resetLastChars();
|
||||
textContent.items.push({
|
||||
str: " ",
|
||||
// TODO: check if using the orientation from last chunk is
|
||||
|
1
test/pdfs/.gitignore
vendored
1
test/pdfs/.gitignore
vendored
@ -515,3 +515,4 @@
|
||||
!issue14497.pdf
|
||||
!issue14502.pdf
|
||||
!issue13211.pdf
|
||||
!issue14627.pdf
|
||||
|
BIN
test/pdfs/issue14627.pdf
Executable file
BIN
test/pdfs/issue14627.pdf
Executable file
Binary file not shown.
@ -6329,5 +6329,11 @@
|
||||
"md5": "d193853e8a123dc50eeea593a4150b60",
|
||||
"rounds": 1,
|
||||
"type": "eq"
|
||||
},
|
||||
{ "id": "issue14627",
|
||||
"file": "pdfs/issue14627.pdf",
|
||||
"md5": "5d1bfcc3b3130bfa7e33e43990e2213a",
|
||||
"rounds": 1,
|
||||
"type": "text"
|
||||
}
|
||||
]
|
||||
|
@ -1999,7 +1999,7 @@ describe("api", function () {
|
||||
const data = await Promise.all([defaultPromise, parametersPromise]);
|
||||
|
||||
expect(!!data[0].items).toEqual(true);
|
||||
expect(data[0].items.length).toEqual(11);
|
||||
expect(data[0].items.length).toEqual(15);
|
||||
expect(!!data[0].styles).toEqual(true);
|
||||
|
||||
const page1 = mergeText(data[0].items);
|
||||
|
@ -579,14 +579,14 @@ describe("pdf_find_controller", function () {
|
||||
},
|
||||
pageMatches: [
|
||||
[
|
||||
299, 337, 414, 476, 623, 797, 978, 984, 1010, 1058, 1079, 1144, 1152,
|
||||
1274, 1343, 1391, 1399, 1421, 1497, 1521, 1527, 1684, 1774, 1786,
|
||||
1857, 1879, 1909, 1946, 2064, 2074, 2161, 2178, 2213, 2227, 2272,
|
||||
2322, 2359, 2401, 2412, 2423, 2462, 2532, 2538, 2553, 2562, 2576,
|
||||
2602, 2613, 2638, 2668, 2792, 2805, 2836, 2848, 2859, 2896, 2902,
|
||||
2916, 2940, 2960, 3091, 3239, 3249, 3339, 3387, 3394, 3468, 3477,
|
||||
3485, 3502, 3690, 3696, 3711, 3758, 3789, 3865, 3977, 4052, 4058,
|
||||
4071,
|
||||
302, 340, 418, 481, 628, 802, 983, 989, 1015, 1063, 1084, 1149, 1157,
|
||||
1278, 1346, 1394, 1402, 1424, 1500, 1524, 1530, 1686, 1776, 1788,
|
||||
1859, 1881, 1911, 1948, 2066, 2076, 2163, 2180, 2215, 2229, 2274,
|
||||
2324, 2360, 2402, 2413, 2424, 2463, 2532, 2538, 2553, 2562, 2576,
|
||||
2602, 2613, 2638, 2668, 2792, 2805, 2836, 2847, 2858, 2895, 2901,
|
||||
2915, 2939, 2959, 3089, 3236, 3246, 3336, 3384, 3391, 3465, 3474,
|
||||
3482, 3499, 3687, 3693, 3708, 3755, 3786, 3862, 3974, 4049, 4055,
|
||||
4068,
|
||||
],
|
||||
],
|
||||
pageMatchesLength: [
|
||||
|
Loading…
Reference in New Issue
Block a user