Merge pull request #14703 from calixteman/14627
[text selection] Add the whitespaces present in the pdf in the text chunk
This commit is contained in:
commit
0dd6bc9a85
@ -2187,17 +2187,62 @@ class PartialEvaluator {
|
|||||||
spaceInFlowMax: 0,
|
spaceInFlowMax: 0,
|
||||||
trackingSpaceMin: Infinity,
|
trackingSpaceMin: Infinity,
|
||||||
negativeSpaceMax: -Infinity,
|
negativeSpaceMax: -Infinity,
|
||||||
|
notASpace: -Infinity,
|
||||||
transform: null,
|
transform: null,
|
||||||
fontName: null,
|
fontName: null,
|
||||||
hasEOL: false,
|
hasEOL: false,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Use a circular buffer (length === 2) to save the last chars in the
|
||||||
|
// text stream.
|
||||||
|
// This implementation of the circular buffer is using a fixed array
|
||||||
|
// and the position of the next element:
|
||||||
|
// function addElement(x) {
|
||||||
|
// buffer[pos] = x;
|
||||||
|
// pos = (pos + 1) % buffer.length;
|
||||||
|
// }
|
||||||
|
// It's way faster than:
|
||||||
|
// function addElement(x) {
|
||||||
|
// buffer.push(x);
|
||||||
|
// buffer.shift();
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// It's useful to know when we need to add a whitespace in the
|
||||||
|
// text chunk.
|
||||||
|
const twoLastChars = [" ", " "];
|
||||||
|
let twoLastCharsPos = 0;
|
||||||
|
|
||||||
|
/**
 * Record a char in the two-slot circular buffer of the last chars seen.
 *
 * The slot at `twoLastCharsPos` holds the older of the two saved chars and
 * is the one overwritten; the other slot holds the most recent one.
 *
 * @param {string} char - The char to store.
 * @returns {boolean} true when, before storing `char`, the older saved char
 *   was not " " and the most recent saved char was " ".
 */
function saveLastChar(char) {
  const olderIdx = twoLastCharsPos;
  const newerIdx = (olderIdx + 1) % 2;

  // Evaluate the state *before* the buffer is updated.
  const endsWithSpaceAfterChar =
    twoLastChars[newerIdx] === " " && twoLastChars[olderIdx] !== " ";

  // Overwrite the older entry, then make the other slot the next victim.
  twoLastChars[olderIdx] = char;
  twoLastCharsPos = newerIdx;

  return endsWithSpaceAfterChar;
}
|
||||||
|
|
||||||
|
/**
 * Put the last-chars buffer back in its initial state: both slots hold " "
 * and the next write goes to slot 0.
 */
function resetLastChars() {
  twoLastCharsPos = 0;
  twoLastChars.fill(" ");
}
|
||||||
|
|
||||||
// Used in addFakeSpaces.
|
// Used in addFakeSpaces.
|
||||||
|
|
||||||
// A white <= fontSize * TRACKING_SPACE_FACTOR is a tracking space
|
// A white <= fontSize * TRACKING_SPACE_FACTOR is a tracking space
|
||||||
// so it doesn't count as a space.
|
// so it doesn't count as a space.
|
||||||
const TRACKING_SPACE_FACTOR = 0.1;
|
const TRACKING_SPACE_FACTOR = 0.1;
|
||||||
|
|
||||||
|
// When a white <= fontSize * NOT_A_SPACE_FACTOR, there is no space
|
||||||
|
// even if one is present in the text stream.
|
||||||
|
const NOT_A_SPACE_FACTOR = 0.03;
|
||||||
|
|
||||||
// A negative white < fontSize * NEGATIVE_SPACE_FACTOR induces
|
// A negative white < fontSize * NEGATIVE_SPACE_FACTOR induces
|
||||||
// a break (a new chunk of text is created).
|
// a break (a new chunk of text is created).
|
||||||
// It doesn't change anything when the text is copied but
|
// It doesn't change anything when the text is copied but
|
||||||
@ -2299,6 +2344,7 @@ class PartialEvaluator {
|
|||||||
|
|
||||||
textContentItem.trackingSpaceMin =
|
textContentItem.trackingSpaceMin =
|
||||||
textState.fontSize * TRACKING_SPACE_FACTOR;
|
textState.fontSize * TRACKING_SPACE_FACTOR;
|
||||||
|
textContentItem.notASpace = textState.fontSize * NOT_A_SPACE_FACTOR;
|
||||||
textContentItem.negativeSpaceMax =
|
textContentItem.negativeSpaceMax =
|
||||||
textState.fontSize * NEGATIVE_SPACE_FACTOR;
|
textState.fontSize * NEGATIVE_SPACE_FACTOR;
|
||||||
textContentItem.spaceInFlowMin =
|
textContentItem.spaceInFlowMin =
|
||||||
@ -2483,6 +2529,7 @@ class PartialEvaluator {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
resetLastChars();
|
||||||
flushTextContentItem();
|
flushTextContentItem();
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -2491,6 +2538,13 @@ class PartialEvaluator {
|
|||||||
appendEOL();
|
appendEOL();
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (advanceY <= textOrientation * textContentItem.notASpace) {
|
||||||
|
// The real spacing between 2 consecutive chars is thin enough to be
|
||||||
|
// considered a non-space.
|
||||||
|
resetLastChars();
|
||||||
|
}
|
||||||
|
|
||||||
if (advanceY <= textOrientation * textContentItem.trackingSpaceMin) {
|
if (advanceY <= textOrientation * textContentItem.trackingSpaceMin) {
|
||||||
textContentItem.height += advanceY;
|
textContentItem.height += advanceY;
|
||||||
} else if (
|
} else if (
|
||||||
@ -2501,6 +2555,7 @@ class PartialEvaluator {
|
|||||||
)
|
)
|
||||||
) {
|
) {
|
||||||
if (textContentItem.str.length === 0) {
|
if (textContentItem.str.length === 0) {
|
||||||
|
resetLastChars();
|
||||||
textContent.items.push({
|
textContent.items.push({
|
||||||
str: " ",
|
str: " ",
|
||||||
dir: "ltr",
|
dir: "ltr",
|
||||||
@ -2532,6 +2587,10 @@ class PartialEvaluator {
|
|||||||
appendEOL();
|
appendEOL();
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// We're moving back so in case the last char was a whitespace
|
||||||
|
// we cancel it: it doesn't make sense to insert it.
|
||||||
|
resetLastChars();
|
||||||
flushTextContentItem();
|
flushTextContentItem();
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -2541,12 +2600,19 @@ class PartialEvaluator {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (advanceX <= textOrientation * textContentItem.notASpace) {
|
||||||
|
// The real spacing between 2 consecutive chars is thin enough to be
|
||||||
|
// considered a non-space.
|
||||||
|
resetLastChars();
|
||||||
|
}
|
||||||
|
|
||||||
if (advanceX <= textOrientation * textContentItem.trackingSpaceMin) {
|
if (advanceX <= textOrientation * textContentItem.trackingSpaceMin) {
|
||||||
textContentItem.width += advanceX;
|
textContentItem.width += advanceX;
|
||||||
} else if (
|
} else if (
|
||||||
!addFakeSpaces(advanceX, textContentItem.prevTransform, textOrientation)
|
!addFakeSpaces(advanceX, textContentItem.prevTransform, textOrientation)
|
||||||
) {
|
) {
|
||||||
if (textContentItem.str.length === 0) {
|
if (textContentItem.str.length === 0) {
|
||||||
|
resetLastChars();
|
||||||
textContent.items.push({
|
textContent.items.push({
|
||||||
str: " ",
|
str: " ",
|
||||||
dir: "ltr",
|
dir: "ltr",
|
||||||
@ -2600,14 +2666,7 @@ class PartialEvaluator {
|
|||||||
}
|
}
|
||||||
let scaledDim = glyphWidth * scale;
|
let scaledDim = glyphWidth * scale;
|
||||||
|
|
||||||
if (
|
if (glyph.isWhitespace) {
|
||||||
glyph.isWhitespace &&
|
|
||||||
(i === 0 ||
|
|
||||||
i + 1 === ii ||
|
|
||||||
glyphs[i - 1].isWhitespace ||
|
|
||||||
glyphs[i + 1].isWhitespace ||
|
|
||||||
extraSpacing)
|
|
||||||
) {
|
|
||||||
// Don't push a " " in the textContentItem
|
// Don't push a " " in the textContentItem
|
||||||
// (except when it's between two non-spaces chars),
|
// (except when it's between two non-spaces chars),
|
||||||
// it will be done (if required) in next call to
|
// it will be done (if required) in next call to
|
||||||
@ -2623,6 +2682,7 @@ class PartialEvaluator {
|
|||||||
charSpacing += -scaledDim + textState.wordSpacing;
|
charSpacing += -scaledDim + textState.wordSpacing;
|
||||||
textState.translateTextMatrix(0, -charSpacing);
|
textState.translateTextMatrix(0, -charSpacing);
|
||||||
}
|
}
|
||||||
|
saveLastChar(" ");
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2653,17 +2713,18 @@ class PartialEvaluator {
|
|||||||
textChunk.prevTransform = getCurrentTextTransform();
|
textChunk.prevTransform = getCurrentTextTransform();
|
||||||
}
|
}
|
||||||
|
|
||||||
if (glyph.isWhitespace) {
|
let glyphUnicode = glyph.unicode;
|
||||||
|
glyphUnicode = NormalizedUnicodes[glyphUnicode] || glyphUnicode;
|
||||||
|
glyphUnicode = reverseIfRtl(glyphUnicode);
|
||||||
|
if (saveLastChar(glyphUnicode)) {
|
||||||
|
// The two last chars are a non-whitespace followed by a whitespace
|
||||||
|
// and then this non-whitespace, so we insert a whitespace here.
|
||||||
// Replaces all whitespaces with standard spaces (0x20), to avoid
|
// Replaces all whitespaces with standard spaces (0x20), to avoid
|
||||||
// alignment issues between the textLayer and the canvas if the text
|
// alignment issues between the textLayer and the canvas if the text
|
||||||
// contains e.g. tabs (fixes issue6612.pdf).
|
// contains e.g. tabs (fixes issue6612.pdf).
|
||||||
textChunk.str.push(" ");
|
textChunk.str.push(" ");
|
||||||
} else {
|
|
||||||
let glyphUnicode = glyph.unicode;
|
|
||||||
glyphUnicode = NormalizedUnicodes[glyphUnicode] || glyphUnicode;
|
|
||||||
glyphUnicode = reverseIfRtl(glyphUnicode);
|
|
||||||
textChunk.str.push(glyphUnicode);
|
|
||||||
}
|
}
|
||||||
|
textChunk.str.push(glyphUnicode);
|
||||||
|
|
||||||
if (charSpacing) {
|
if (charSpacing) {
|
||||||
if (!font.vertical) {
|
if (!font.vertical) {
|
||||||
@ -2679,6 +2740,7 @@ class PartialEvaluator {
|
|||||||
}
|
}
|
||||||
|
|
||||||
function appendEOL() {
|
function appendEOL() {
|
||||||
|
resetLastChars();
|
||||||
if (textContentItem.initialized) {
|
if (textContentItem.initialized) {
|
||||||
textContentItem.hasEOL = true;
|
textContentItem.hasEOL = true;
|
||||||
flushTextContentItem();
|
flushTextContentItem();
|
||||||
@ -2701,6 +2763,7 @@ class PartialEvaluator {
|
|||||||
width <= textOrientation * textContentItem.spaceInFlowMax
|
width <= textOrientation * textContentItem.spaceInFlowMax
|
||||||
) {
|
) {
|
||||||
if (textContentItem.initialized) {
|
if (textContentItem.initialized) {
|
||||||
|
resetLastChars();
|
||||||
textContentItem.str.push(" ");
|
textContentItem.str.push(" ");
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
@ -2715,6 +2778,7 @@ class PartialEvaluator {
|
|||||||
}
|
}
|
||||||
|
|
||||||
flushTextContentItem();
|
flushTextContentItem();
|
||||||
|
resetLastChars();
|
||||||
textContent.items.push({
|
textContent.items.push({
|
||||||
str: " ",
|
str: " ",
|
||||||
// TODO: check if using the orientation from last chunk is
|
// TODO: check if using the orientation from last chunk is
|
||||||
|
1
test/pdfs/.gitignore
vendored
1
test/pdfs/.gitignore
vendored
@ -515,3 +515,4 @@
|
|||||||
!issue14497.pdf
|
!issue14497.pdf
|
||||||
!issue14502.pdf
|
!issue14502.pdf
|
||||||
!issue13211.pdf
|
!issue13211.pdf
|
||||||
|
!issue14627.pdf
|
||||||
|
BIN
test/pdfs/issue14627.pdf
Executable file
BIN
test/pdfs/issue14627.pdf
Executable file
Binary file not shown.
@ -6329,5 +6329,11 @@
|
|||||||
"md5": "d193853e8a123dc50eeea593a4150b60",
|
"md5": "d193853e8a123dc50eeea593a4150b60",
|
||||||
"rounds": 1,
|
"rounds": 1,
|
||||||
"type": "eq"
|
"type": "eq"
|
||||||
|
},
|
||||||
|
{ "id": "issue14627",
|
||||||
|
"file": "pdfs/issue14627.pdf",
|
||||||
|
"md5": "5d1bfcc3b3130bfa7e33e43990e2213a",
|
||||||
|
"rounds": 1,
|
||||||
|
"type": "text"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
@ -1999,7 +1999,7 @@ describe("api", function () {
|
|||||||
const data = await Promise.all([defaultPromise, parametersPromise]);
|
const data = await Promise.all([defaultPromise, parametersPromise]);
|
||||||
|
|
||||||
expect(!!data[0].items).toEqual(true);
|
expect(!!data[0].items).toEqual(true);
|
||||||
expect(data[0].items.length).toEqual(11);
|
expect(data[0].items.length).toEqual(15);
|
||||||
expect(!!data[0].styles).toEqual(true);
|
expect(!!data[0].styles).toEqual(true);
|
||||||
|
|
||||||
const page1 = mergeText(data[0].items);
|
const page1 = mergeText(data[0].items);
|
||||||
|
@ -579,14 +579,14 @@ describe("pdf_find_controller", function () {
|
|||||||
},
|
},
|
||||||
pageMatches: [
|
pageMatches: [
|
||||||
[
|
[
|
||||||
299, 337, 414, 476, 623, 797, 978, 984, 1010, 1058, 1079, 1144, 1152,
|
302, 340, 418, 481, 628, 802, 983, 989, 1015, 1063, 1084, 1149, 1157,
|
||||||
1274, 1343, 1391, 1399, 1421, 1497, 1521, 1527, 1684, 1774, 1786,
|
1278, 1346, 1394, 1402, 1424, 1500, 1524, 1530, 1686, 1776, 1788,
|
||||||
1857, 1879, 1909, 1946, 2064, 2074, 2161, 2178, 2213, 2227, 2272,
|
1859, 1881, 1911, 1948, 2066, 2076, 2163, 2180, 2215, 2229, 2274,
|
||||||
2322, 2359, 2401, 2412, 2423, 2462, 2532, 2538, 2553, 2562, 2576,
|
2324, 2360, 2402, 2413, 2424, 2463, 2532, 2538, 2553, 2562, 2576,
|
||||||
2602, 2613, 2638, 2668, 2792, 2805, 2836, 2848, 2859, 2896, 2902,
|
2602, 2613, 2638, 2668, 2792, 2805, 2836, 2847, 2858, 2895, 2901,
|
||||||
2916, 2940, 2960, 3091, 3239, 3249, 3339, 3387, 3394, 3468, 3477,
|
2915, 2939, 2959, 3089, 3236, 3246, 3336, 3384, 3391, 3465, 3474,
|
||||||
3485, 3502, 3690, 3696, 3711, 3758, 3789, 3865, 3977, 4052, 4058,
|
3482, 3499, 3687, 3693, 3708, 3755, 3786, 3862, 3974, 4049, 4055,
|
||||||
4071,
|
4068,
|
||||||
],
|
],
|
||||||
],
|
],
|
||||||
pageMatchesLength: [
|
pageMatchesLength: [
|
||||||
|
Loading…
Reference in New Issue
Block a user