diff --git a/src/core/evaluator.js b/src/core/evaluator.js index 9c8f315f0..4caa568eb 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -2187,17 +2187,62 @@ class PartialEvaluator { spaceInFlowMax: 0, trackingSpaceMin: Infinity, negativeSpaceMax: -Infinity, + notASpace: -Infinity, transform: null, fontName: null, hasEOL: false, }; + // Use a circular buffer (length === 2) to save the last chars in the + // text stream. + // This implementation of the circular buffer uses a fixed array + // and the position of the next element: + // function addElement(x) { + // buffer[pos] = x; + // pos = (pos + 1) % buffer.length; + // } + // It's way faster than: + // function addElement(x) { + // buffer.push(x); + // buffer.shift(); + // } + // + // It's useful to know when we need to add a whitespace in the + // text chunk. + const twoLastChars = [" ", " "]; + let twoLastCharsPos = 0; + + /** + * Save the last char. + * @param {string} char + * @returns {boolean} true when the two last chars before adding the new one + * are a non-whitespace followed by a whitespace. + */ + function saveLastChar(char) { + const nextPos = (twoLastCharsPos + 1) % 2; + const ret = + twoLastChars[twoLastCharsPos] !== " " && twoLastChars[nextPos] === " "; + twoLastChars[twoLastCharsPos] = char; + twoLastCharsPos = nextPos; + + return ret; + } + + function resetLastChars() { + twoLastChars[0] = twoLastChars[1] = " "; + twoLastCharsPos = 0; + } + // Used in addFakeSpaces. // A white <= fontSize * TRACKING_SPACE_FACTOR is a tracking space // so it doesn't count as a space. const TRACKING_SPACE_FACTOR = 0.1; + // When a white <= fontSize * NOT_A_SPACE_FACTOR, there is no space + // even if one is present in the text stream. + const NOT_A_SPACE_FACTOR = 0.03; + // A negative white < fontSize * NEGATIVE_SPACE_FACTOR induces // a break (a new chunk of text is created). 
// It doesn't change anything when the text is copied but @@ -2299,6 +2344,7 @@ class PartialEvaluator { textContentItem.trackingSpaceMin = textState.fontSize * TRACKING_SPACE_FACTOR; + textContentItem.notASpace = textState.fontSize * NOT_A_SPACE_FACTOR; textContentItem.negativeSpaceMax = textState.fontSize * NEGATIVE_SPACE_FACTOR; textContentItem.spaceInFlowMin = @@ -2483,6 +2529,7 @@ class PartialEvaluator { return true; } + resetLastChars(); flushTextContentItem(); return true; } @@ -2491,6 +2538,13 @@ class PartialEvaluator { appendEOL(); return true; } + + if (advanceY <= textOrientation * textContentItem.notASpace) { + // The real spacing between 2 consecutive chars is thin enough to be + // considered a non-space. + resetLastChars(); + } + if (advanceY <= textOrientation * textContentItem.trackingSpaceMin) { textContentItem.height += advanceY; } else if ( @@ -2501,6 +2555,7 @@ class PartialEvaluator { ) ) { if (textContentItem.str.length === 0) { + resetLastChars(); textContent.items.push({ str: " ", dir: "ltr", @@ -2532,6 +2587,10 @@ class PartialEvaluator { appendEOL(); return true; } + + // We're moving back, so if the last char was a whitespace + // we cancel it: it doesn't make sense to insert it. + resetLastChars(); flushTextContentItem(); return true; } @@ -2541,12 +2600,19 @@ class PartialEvaluator { return true; } + if (advanceX <= textOrientation * textContentItem.notASpace) { + // The real spacing between 2 consecutive chars is thin enough to be + // considered a non-space. 
+ resetLastChars(); + } + if (advanceX <= textOrientation * textContentItem.trackingSpaceMin) { textContentItem.width += advanceX; } else if ( !addFakeSpaces(advanceX, textContentItem.prevTransform, textOrientation) ) { if (textContentItem.str.length === 0) { + resetLastChars(); textContent.items.push({ str: " ", dir: "ltr", @@ -2600,14 +2666,7 @@ class PartialEvaluator { } let scaledDim = glyphWidth * scale; - if ( - glyph.isWhitespace && - (i === 0 || - i + 1 === ii || - glyphs[i - 1].isWhitespace || - glyphs[i + 1].isWhitespace || - extraSpacing) - ) { + if (glyph.isWhitespace) { // Don't push a " " in the textContentItem // (except when it's between two non-spaces chars), // it will be done (if required) in next call to @@ -2623,6 +2682,7 @@ class PartialEvaluator { charSpacing += -scaledDim + textState.wordSpacing; textState.translateTextMatrix(0, -charSpacing); } + saveLastChar(" "); continue; } @@ -2653,17 +2713,18 @@ class PartialEvaluator { textChunk.prevTransform = getCurrentTextTransform(); } - if (glyph.isWhitespace) { + let glyphUnicode = glyph.unicode; + glyphUnicode = NormalizedUnicodes[glyphUnicode] || glyphUnicode; + glyphUnicode = reverseIfRtl(glyphUnicode); + if (saveLastChar(glyphUnicode)) { + // The two last chars are a non-whitespace followed by a whitespace + // and then this non-whitespace, so we insert a whitespace here. // Replaces all whitespaces with standard spaces (0x20), to avoid // alignment issues between the textLayer and the canvas if the text // contains e.g. tabs (fixes issue6612.pdf). 
textChunk.str.push(" "); - } else { - let glyphUnicode = glyph.unicode; - glyphUnicode = NormalizedUnicodes[glyphUnicode] || glyphUnicode; - glyphUnicode = reverseIfRtl(glyphUnicode); - textChunk.str.push(glyphUnicode); } + textChunk.str.push(glyphUnicode); if (charSpacing) { if (!font.vertical) { @@ -2679,6 +2740,7 @@ class PartialEvaluator { } function appendEOL() { + resetLastChars(); if (textContentItem.initialized) { textContentItem.hasEOL = true; flushTextContentItem(); @@ -2701,6 +2763,7 @@ class PartialEvaluator { width <= textOrientation * textContentItem.spaceInFlowMax ) { if (textContentItem.initialized) { + resetLastChars(); textContentItem.str.push(" "); } return false; @@ -2715,6 +2778,7 @@ class PartialEvaluator { } flushTextContentItem(); + resetLastChars(); textContent.items.push({ str: " ", // TODO: check if using the orientation from last chunk is diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index 83f6f8a9d..71d3bbb2b 100644 --- a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -515,3 +515,4 @@ !issue14497.pdf !issue14502.pdf !issue13211.pdf +!issue14627.pdf diff --git a/test/pdfs/issue14627.pdf b/test/pdfs/issue14627.pdf new file mode 100755 index 000000000..ae6ccd269 Binary files /dev/null and b/test/pdfs/issue14627.pdf differ diff --git a/test/test_manifest.json b/test/test_manifest.json index e3cdf1fec..ddd2ab8e9 100644 --- a/test/test_manifest.json +++ b/test/test_manifest.json @@ -6329,5 +6329,11 @@ "md5": "d193853e8a123dc50eeea593a4150b60", "rounds": 1, "type": "eq" + }, + { "id": "issue14627", + "file": "pdfs/issue14627.pdf", + "md5": "5d1bfcc3b3130bfa7e33e43990e2213a", + "rounds": 1, + "type": "text" } ] diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js index eca1b0c25..081947830 100644 --- a/test/unit/api_spec.js +++ b/test/unit/api_spec.js @@ -1999,7 +1999,7 @@ describe("api", function () { const data = await Promise.all([defaultPromise, parametersPromise]); expect(!!data[0].items).toEqual(true); - 
expect(data[0].items.length).toEqual(11); + expect(data[0].items.length).toEqual(15); expect(!!data[0].styles).toEqual(true); const page1 = mergeText(data[0].items); diff --git a/test/unit/pdf_find_controller_spec.js b/test/unit/pdf_find_controller_spec.js index de8f08507..3e87beb4c 100644 --- a/test/unit/pdf_find_controller_spec.js +++ b/test/unit/pdf_find_controller_spec.js @@ -579,14 +579,14 @@ describe("pdf_find_controller", function () { }, pageMatches: [ [ - 299, 337, 414, 476, 623, 797, 978, 984, 1010, 1058, 1079, 1144, 1152, - 1274, 1343, 1391, 1399, 1421, 1497, 1521, 1527, 1684, 1774, 1786, - 1857, 1879, 1909, 1946, 2064, 2074, 2161, 2178, 2213, 2227, 2272, - 2322, 2359, 2401, 2412, 2423, 2462, 2532, 2538, 2553, 2562, 2576, - 2602, 2613, 2638, 2668, 2792, 2805, 2836, 2848, 2859, 2896, 2902, - 2916, 2940, 2960, 3091, 3239, 3249, 3339, 3387, 3394, 3468, 3477, - 3485, 3502, 3690, 3696, 3711, 3758, 3789, 3865, 3977, 4052, 4058, - 4071, + 302, 340, 418, 481, 628, 802, 983, 989, 1015, 1063, 1084, 1149, 1157, + 1278, 1346, 1394, 1402, 1424, 1500, 1524, 1530, 1686, 1776, 1788, + 1859, 1881, 1911, 1948, 2066, 2076, 2163, 2180, 2215, 2229, 2274, + 2324, 2360, 2402, 2413, 2424, 2463, 2532, 2538, 2553, 2562, 2576, + 2602, 2613, 2638, 2668, 2792, 2805, 2836, 2847, 2858, 2895, 2901, + 2915, 2939, 2959, 3089, 3236, 3246, 3336, 3384, 3391, 3465, 3474, + 3482, 3499, 3687, 3693, 3708, 3755, 3786, 3862, 3974, 4049, 4055, + 4068, ], ], pageMatchesLength: [