diff --git a/src/core/evaluator.js b/src/core/evaluator.js index dbf66f45e..f1d6bf1d2 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -2169,6 +2169,8 @@ class PartialEvaluator { stateManager = stateManager || new StateManager(new TextState()); const WhitespaceRegexp = /\s/g; + const DiacriticRegExp = new RegExp("^\\p{Mn}$", "u"); + const NormalizedUnicodes = getNormalizedUnicodes(); const textContent = { items: [], @@ -2182,34 +2184,37 @@ class PartialEvaluator { width: 0, height: 0, vertical: false, - lastCharSize: 0, prevTransform: null, textAdvanceScale: 0, - spaceWidth: 0, spaceInFlowMin: 0, spaceInFlowMax: 0, trackingSpaceMin: Infinity, + negativeSpaceMax: -Infinity, transform: null, fontName: null, hasEOL: false, - isLastCharWhiteSpace: false, }; // Used in addFakeSpaces. - // wsw stands for whitespace width. - // A white <= wsw * TRACKING_SPACE_FACTOR is a tracking space + // A white <= fontSize * TRACKING_SPACE_FACTOR is a tracking space // so it doesn't count as a space. - const TRACKING_SPACE_FACTOR = 0.3; + const TRACKING_SPACE_FACTOR = 0.1; - // A white with a width in [wsw * MIN_FACTOR; wsw * MAX_FACTOR] + // A negative white < fontSize * NEGATIVE_SPACE_FACTOR induces + // a break (a new chunk of text is created). + // It doesn't change anything when the text is copied but + // it reduces potential mismatches between the text layer and the canvas. + const NEGATIVE_SPACE_FACTOR = -0.2; + + // A white with a width in [fontSize * MIN_FACTOR; fontSize * MAX_FACTOR] // is a space which will be inserted in the current flow of words. // If the width is outside of this range then the flow is broken // (which means a new span in the text layer). // It's useful to adjust the best as possible the span in the layer // to what is displayed in the canvas. 
- const SPACE_IN_FLOW_MIN_FACTOR = 0.3; - const SPACE_IN_FLOW_MAX_FACTOR = 1.3; + const SPACE_IN_FLOW_MIN_FACTOR = 0.1; + const SPACE_IN_FLOW_MAX_FACTOR = 0.6; const self = this; const xref = this.xref; @@ -2294,18 +2299,15 @@ class PartialEvaluator { ); const scaleCtmX = Math.hypot(textState.ctm[0], textState.ctm[1]); textContentItem.textAdvanceScale = scaleCtmX * scaleLineX; - textContentItem.lastCharSize = textContentItem.lastCharSize || 0; - const spaceWidth = (font.spaceWidth / 1000) * textState.fontSize; - if (spaceWidth) { - textContentItem.spaceWidth = spaceWidth; - textContentItem.trackingSpaceMin = spaceWidth * TRACKING_SPACE_FACTOR; - textContentItem.spaceInFlowMin = spaceWidth * SPACE_IN_FLOW_MIN_FACTOR; - textContentItem.spaceInFlowMax = spaceWidth * SPACE_IN_FLOW_MAX_FACTOR; - } else { - textContentItem.spaceWidth = 0; - textContentItem.trackingSpaceMin = Infinity; - } + textContentItem.trackingSpaceMin = + textState.fontSize * TRACKING_SPACE_FACTOR; + textContentItem.negativeSpaceMax = + textState.fontSize * NEGATIVE_SPACE_FACTOR; + textContentItem.spaceInFlowMin = + textState.fontSize * SPACE_IN_FLOW_MIN_FACTOR; + textContentItem.spaceInFlowMax = + textState.fontSize * SPACE_IN_FLOW_MAX_FACTOR; textContentItem.hasEOL = false; @@ -2395,7 +2397,7 @@ class PartialEvaluator { }); } - function compareWithLastPosition(fontSize) { + function compareWithLastPosition() { if ( !combineTextItems || !textState.font || @@ -2405,36 +2407,76 @@ class PartialEvaluator { } const currentTransform = getCurrentTextTransform(); - const posX = currentTransform[4]; - const posY = currentTransform[5]; - const lastPosX = textContentItem.prevTransform[4]; - const lastPosY = textContentItem.prevTransform[5]; + let posX = currentTransform[4]; + let posY = currentTransform[5]; + let lastPosX = textContentItem.prevTransform[4]; + let lastPosY = textContentItem.prevTransform[5]; if (lastPosX === posX && lastPosY === posY) { return; } - const advanceX = (posX - lastPosX) / 
textContentItem.textAdvanceScale; - const advanceY = (posY - lastPosY) / textContentItem.textAdvanceScale; - const HALF_LAST_CHAR = -0.5 * textContentItem.lastCharSize; + let rotate = 0; + // Take into account the rotation of the current transform. + // Only rotations with an angle of 0, 90, 180 or 270 are considered. + if ( + currentTransform[0] && + currentTransform[1] === 0 && + currentTransform[2] === 0 + ) { + rotate = currentTransform[0] > 0 ? 0 : 180; + } else if ( + currentTransform[1] && + currentTransform[0] === 0 && + currentTransform[3] === 0 + ) { + rotate += currentTransform[1] > 0 ? 90 : 270; + } + + if (rotate !== 0) { + switch (rotate) { + case 90: + [posX, posY] = [posY, posX]; + [lastPosX, lastPosY] = [lastPosY, lastPosX]; + break; + case 180: + [posX, posY, lastPosX, lastPosY] = [ + -posX, + -posY, + -lastPosX, + -lastPosY, + ]; + break; + case 270: + [posX, posY] = [-posY, -posX]; + [lastPosX, lastPosY] = [-lastPosY, -lastPosX]; + break; + } + } if (textState.font.vertical) { - if ( - Math.abs(advanceX) > - textContentItem.width / - textContentItem.textAdvanceScale /* not the same column */ - ) { + const advanceY = (lastPosY - posY) / textContentItem.textAdvanceScale; + const advanceX = posX - lastPosX; + if (advanceY < textContentItem.negativeSpaceMax) { + if ( + Math.abs(advanceX) > + 0.5 * textContentItem.width /* not the same column */ + ) { + appendEOL(); + return; + } + + flushTextContentItem(); + return; + } + + if (Math.abs(advanceX) > textContentItem.height) { appendEOL(); return; } - - if (HALF_LAST_CHAR > advanceY) { - return; - } - - if (advanceY > textContentItem.trackingSpaceMin) { + if (advanceY <= textContentItem.trackingSpaceMin) { textContentItem.height += advanceY; - } else if (!addFakeSpaces(advanceY, 0, textContentItem.prevTransform)) { + } else if (!addFakeSpaces(advanceY, textContentItem.prevTransform)) { if (textContentItem.str.length === 0) { textContent.items.push({ str: " ", @@ -2445,7 +2487,6 @@ class 
PartialEvaluator { fontName: textContentItem.fontName, hasEOL: false, }); - textContentItem.isLastCharWhiteSpace = true; } else { textContentItem.height += advanceY; } @@ -2454,22 +2495,28 @@ class PartialEvaluator { return; } - if ( - Math.abs(advanceY) > - textContentItem.height / - textContentItem.textAdvanceScale /* not the same line */ - ) { - appendEOL(); + const advanceX = (posX - lastPosX) / textContentItem.textAdvanceScale; + const advanceY = posY - lastPosY; + if (advanceX < textContentItem.negativeSpaceMax) { + if ( + Math.abs(advanceY) > + 0.5 * textContentItem.height /* not the same line */ + ) { + appendEOL(); + return; + } + flushTextContentItem(); return; } - if (HALF_LAST_CHAR > advanceX) { + if (Math.abs(advanceY) > textContentItem.height) { + appendEOL(); return; } if (advanceX <= textContentItem.trackingSpaceMin) { textContentItem.width += advanceX; - } else if (!addFakeSpaces(advanceX, 0, textContentItem.prevTransform)) { + } else if (!addFakeSpaces(advanceX, textContentItem.prevTransform)) { if (textContentItem.str.length === 0) { textContent.items.push({ str: " ", @@ -2480,14 +2527,13 @@ class PartialEvaluator { fontName: textContentItem.fontName, hasEOL: false, }); - textContentItem.isLastCharWhiteSpace = true; } else { textContentItem.width += advanceX; } } } - function buildTextContentItem({ chars, extraSpacing, isFirstChunk }) { + function buildTextContentItem({ chars, extraSpacing }) { const font = textState.font; if (!chars) { // Just move according to the space we have. 
@@ -2499,87 +2545,91 @@ class PartialEvaluator { 0 ); } else { - textState.translateTextMatrix(0, charSpacing); + textState.translateTextMatrix(0, -charSpacing); } } return; } - const NormalizedUnicodes = getNormalizedUnicodes(); const glyphs = font.charsToGlyphs(chars); const scale = textState.fontMatrix[0] * textState.fontSize; - if (isFirstChunk) { - compareWithLastPosition(scale); - } - - let textChunk = ensureTextContentItem(); - let size = 0; - let lastCharSize = 0; - for (let i = 0, ii = glyphs.length; i < ii; i++) { const glyph = glyphs[i]; let charSpacing = - textState.charSpacing + (i === ii - 1 ? extraSpacing : 0); + textState.charSpacing + (i + 1 === ii ? extraSpacing : 0); + + let glyphWidth = glyph.width; + if (font.vertical) { + glyphWidth = glyph.vmetric ? glyph.vmetric[0] : -glyphWidth; + } + let scaledDim = glyphWidth * scale; let glyphUnicode = glyph.unicode; - if (glyph.isSpace) { - charSpacing += textState.wordSpacing; - textChunk.isLastCharWhiteSpace = true; - } else { - glyphUnicode = NormalizedUnicodes[glyphUnicode] || glyphUnicode; - glyphUnicode = reverseIfRtl(glyphUnicode); - textChunk.isLastCharWhiteSpace = false; + if ( + glyphUnicode === " " && + (i === 0 || + i + 1 === ii || + glyphs[i - 1].unicode === " " || + glyphs[i + 1].unicode === " ") + ) { + // Don't push a " " in the textContentItem + // (except when it's between two non-spaces chars), + // it will be done (if required) in next call to + // compareWithLastPosition. + // This way we can merge real spaces and spaces due to cursor moves. + if (!font.vertical) { + charSpacing += scaledDim + textState.wordSpacing; + textState.translateTextMatrix( + charSpacing * textState.textHScale, + 0 + ); + } else { + charSpacing += -scaledDim + textState.wordSpacing; + textState.translateTextMatrix(0, -charSpacing); + } + continue; } - textChunk.str.push(glyphUnicode); - const glyphWidth = - font.vertical && glyph.vmetric ? 
glyph.vmetric[0] : glyph.width; + compareWithLastPosition(); + + // Must be called after compareWithLastPosition because + // the textContentItem could have been flushed. + const textChunk = ensureTextContentItem(); + if (DiacriticRegExp.test(glyph.unicode)) { + scaledDim = 0; + } - let scaledDim = glyphWidth * scale; if (!font.vertical) { scaledDim *= textState.textHScale; textState.translateTextMatrix(scaledDim, 0); + textChunk.width += scaledDim; } else { textState.translateTextMatrix(0, scaledDim); scaledDim = Math.abs(scaledDim); + textChunk.height += scaledDim; } - size += scaledDim; + + if (scaledDim) { + // Save the position of the last visible character. + textChunk.prevTransform = getCurrentTextTransform(); + } + + glyphUnicode = NormalizedUnicodes[glyphUnicode] || glyphUnicode; + glyphUnicode = reverseIfRtl(glyphUnicode); + textChunk.str.push(glyphUnicode); if (charSpacing) { if (!font.vertical) { - charSpacing *= textState.textHScale; - } - - scaledDim += charSpacing; - const wasSplit = - charSpacing > textContentItem.trackingSpaceMin && - addFakeSpaces(charSpacing, size); - if (!font.vertical) { - textState.translateTextMatrix(charSpacing, 0); + textState.translateTextMatrix( + charSpacing * textState.textHScale, + 0 + ); } else { - textState.translateTextMatrix(0, charSpacing); - } - - if (wasSplit) { - textChunk = ensureTextContentItem(); - size = 0; - } else { - size += charSpacing; + textState.translateTextMatrix(0, -charSpacing); } } - - lastCharSize = scaledDim; } - - textChunk.lastCharSize = lastCharSize; - if (!font.vertical) { - textChunk.width += size; - } else { - textChunk.height += size; - } - - textChunk.prevTransform = getCurrentTextTransform(); } function appendEOL() { @@ -2597,19 +2647,15 @@ class PartialEvaluator { hasEOL: true, }); } - - textContentItem.isLastCharWhiteSpace = false; - textContentItem.lastCharSize = 0; } - function addFakeSpaces(width, size, transf = null) { + function addFakeSpaces(width, transf) { if ( 
textContentItem.spaceInFlowMin <= width && width <= textContentItem.spaceInFlowMax ) { if (textContentItem.initialized) { textContentItem.str.push(" "); - textContentItem.isLastCharWhiteSpace = true; } return false; } @@ -2617,22 +2663,12 @@ class PartialEvaluator { const fontName = textContentItem.fontName; let height = 0; - width *= textContentItem.textAdvanceScale; - if (!textContentItem.vertical) { - textContentItem.width += size; - } else { - textContentItem.height += size; + if (textContentItem.vertical) { height = width; width = 0; } flushTextContentItem(); - - if (textContentItem.isLastCharWhiteSpace) { - return true; - } - - textContentItem.isLastCharWhiteSpace = true; textContent.items.push({ str: " ", // TODO: check if using the orientation from last chunk is @@ -2640,7 +2676,7 @@ class PartialEvaluator { dir: "ltr", width, height, - transform: transf ? transf : getCurrentTextTransform(), + transform: transf || getCurrentTextTransform(), fontName, hasEOL: false, }); @@ -2731,15 +2767,12 @@ class PartialEvaluator { next(handleSetFont(fontNameArg, null)); return; case OPS.setTextRise: - flushTextContentItem(); textState.textRise = args[0]; break; case OPS.setHScale: - flushTextContentItem(); textState.textHScale = args[0] / 100; break; case OPS.setLeading: - flushTextContentItem(); textState.leading = args[0]; break; case OPS.moveText: @@ -2747,13 +2780,11 @@ class PartialEvaluator { textState.textMatrix = textState.textLineMatrix.slice(); break; case OPS.setLeadingMoveText: - flushTextContentItem(); textState.leading = -args[1]; textState.translateTextLineMatrix(args[0], args[1]); textState.textMatrix = textState.textLineMatrix.slice(); break; case OPS.nextLine: - appendEOL(); textState.carriageReturn(); break; case OPS.setTextMatrix: @@ -2782,7 +2813,6 @@ class PartialEvaluator { textState.wordSpacing = args[0]; break; case OPS.beginText: - flushTextContentItem(); textState.textMatrix = IDENTITY_MATRIX.slice(); textState.textLineMatrix = 
IDENTITY_MATRIX.slice(); break; @@ -2795,7 +2825,6 @@ class PartialEvaluator { const spaceFactor = ((textState.font.vertical ? 1 : -1) * textState.fontSize) / 1000; const elements = args[0]; - let isFirstChunk = true; for (let i = 0, ii = elements.length; i < ii - 1; i++) { const item = elements[i]; if (typeof item === "string") { @@ -2814,11 +2843,7 @@ class PartialEvaluator { buildTextContentItem({ chars: str, extraSpacing: item * spaceFactor, - isFirstChunk, }); - if (str && isFirstChunk) { - isFirstChunk = false; - } } } @@ -2833,7 +2858,6 @@ class PartialEvaluator { buildTextContentItem({ chars: str, extraSpacing: 0, - isFirstChunk, }); } break; @@ -2842,11 +2866,9 @@ class PartialEvaluator { self.ensureStateFont(stateManager.state); continue; } - buildTextContentItem({ chars: args[0], extraSpacing: 0, - isFirstChunk: true, }); break; case OPS.nextLineShowText: @@ -2854,13 +2876,10 @@ class PartialEvaluator { self.ensureStateFont(stateManager.state); continue; } - textContentItem.hasEOL = true; - flushTextContentItem(); textState.carriageReturn(); buildTextContentItem({ chars: args[0], extraSpacing: 0, - isFirstChunk: true, }); break; case OPS.nextLineSetSpacingShowText: @@ -2868,15 +2887,12 @@ class PartialEvaluator { self.ensureStateFont(stateManager.state); continue; } - textContentItem.hasEOL = true; - flushTextContentItem(); textState.wordSpacing = args[0]; textState.charSpacing = args[1]; textState.carriageReturn(); buildTextContentItem({ chars: args[2], extraSpacing: 0, - isFirstChunk: true, }); break; case OPS.paintXObject: diff --git a/src/display/text_layer.js b/src/display/text_layer.js index 5ba09adbe..778cfe444 100644 --- a/src/display/text_layer.js +++ b/src/display/text_layer.js @@ -188,7 +188,7 @@ function appendText(task, geom, styles, ctx) { (task._enhanceTextSelection && AllWhitespaceRegexp.test(geom.str)) ) { shouldScaleText = true; - } else if (geom.transform[0] !== geom.transform[3]) { + } else if (geom.str !== " " && geom.transform[0] 
!== geom.transform[3]) { const absScaleX = Math.abs(geom.transform[0]), absScaleY = Math.abs(geom.transform[3]); // When the horizontal/vertical scaling differs significantly, also scale diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index 76c1c6627..e6ed0ac85 100644 --- a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -13,6 +13,7 @@ !issue1155r.pdf !issue2017r.pdf !bug1727053.pdf +!issue11913.pdf !issue2391-1.pdf !issue2391-2.pdf !issue14046.pdf @@ -182,6 +183,7 @@ !issue11931.pdf !issue1655r.pdf !issue6541.pdf +!issue10640.pdf !issue2948.pdf !issue6231_1.pdf !issue10402.pdf @@ -285,6 +287,7 @@ !issue2840.pdf !issue4061.pdf !issue4668.pdf +!issue13226.pdf !PDFJS-7562-reduced.pdf !issue11768_reduced.pdf !issue5039.pdf @@ -440,6 +443,7 @@ !annotation-fileattachment.pdf !annotation-text-widget.pdf !annotation-choice-widget.pdf +!issue10900.pdf !annotation-button-widget.pdf !annotation-polyline-polygon.pdf !annotation-polyline-polygon-without-appearance.pdf @@ -462,6 +466,7 @@ !issue9972-3.pdf !tiling-pattern-box.pdf !tiling-pattern-large-steps.pdf +!issue13201.pdf !issue11555.pdf !issue12337.pdf !pr12564.pdf diff --git a/test/pdfs/issue10640.pdf b/test/pdfs/issue10640.pdf new file mode 100644 index 000000000..40f632a33 Binary files /dev/null and b/test/pdfs/issue10640.pdf differ diff --git a/test/pdfs/issue10900.pdf b/test/pdfs/issue10900.pdf new file mode 100644 index 000000000..6053501dc Binary files /dev/null and b/test/pdfs/issue10900.pdf differ diff --git a/test/pdfs/issue11913.pdf b/test/pdfs/issue11913.pdf new file mode 100644 index 000000000..e7017b7d3 Binary files /dev/null and b/test/pdfs/issue11913.pdf differ diff --git a/test/pdfs/issue13201.pdf b/test/pdfs/issue13201.pdf new file mode 100644 index 000000000..07b77a8da Binary files /dev/null and b/test/pdfs/issue13201.pdf differ diff --git a/test/pdfs/issue13226.pdf b/test/pdfs/issue13226.pdf new file mode 100644 index 000000000..cebe8fe9c --- /dev/null +++ b/test/pdfs/issue13226.pdf @@ -0,0 
+1,86 @@ +%PDF-1.7 +%âãÏÓ +1 0 obj +<< +/Type /Encoding +/BaseEncoding /WinAnsiEncoding +>> +endobj +2 0 obj +<< +/Pages 3 0 R +/Type /Catalog +>> +endobj +3 0 obj +<< +/MediaBox [0 0 400 50] +/Kids [4 0 R] +/Count 1 +/Type /Pages +>> +endobj +4 0 obj +<< +/Parent 3 0 R +/MediaBox [0 0 400 50] +/Resources +<< +/Font +<< +/F1 5 0 R +>> +>> +/Contents 6 0 R +/Type /Page +>> +endobj +5 0 obj +<< +/BaseFont /Times-Italic +/Subtype /Type1 +/Encoding 1 0 R +/Type /Font +>> +endobj +6 0 obj +<< +/Length 278 +>> +stream +BT +/F1 10 Tf +0.005 Tc 1 0 0 1 10 30 Tm +[(M)5 (i)5 (t)]TJ +/Span<>> BDC +14 0 Td +( )Tj +EMC +T* +(arbei)Tj +/Span<>> BDC +( )Tj +EMC +21.2 0 Td +[(terinnen und Mitarbeiter arbeiten in \374ber 100 L\344ndern engagiert im\ + Dienste)5 ( )]TJ +ET +endstream +endobj xref +0 7 +0000000000 65535 f +0000000015 00000 n +0000000085 00000 n +0000000136 00000 n +0000000218 00000 n +0000000347 00000 n +0000000438 00000 n +trailer + +<< +/Root 2 0 R +/Size 7 +>> +startxref +768 +%%EOF diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js index f5a81f243..dabbd96f0 100644 --- a/test/unit/api_spec.js +++ b/test/unit/api_spec.js @@ -73,6 +73,10 @@ describe("api", function () { }, WAIT_TIMEOUT); } + function mergeText(items) { + return items.map(chunk => chunk.str + (chunk.hasEOL ? "\n" : "")).join(""); + } + describe("getDocument", function () { it("creates pdf doc from URL-string", async function () { const urlStr = TEST_PDFS_PATH + basicApiFileName; @@ -1604,11 +1608,17 @@ describe("api", function () { const data = await Promise.all([defaultPromise, parametersPromise]); expect(!!data[0].items).toEqual(true); - expect(data[0].items.length).toEqual(12); + expect(data[0].items.length).toEqual(11); expect(!!data[0].styles).toEqual(true); + const page1 = mergeText(data[0].items); + expect(page1).toEqual(`Table Of Content +Chapter 1 .......................................................... 2 +Paragraph 1.1 ...................................................... 
3 +page 1 / 3`); + expect(!!data[1].items).toEqual(true); - expect(data[1].items.length).toEqual(7); + expect(data[1].items.length).toEqual(6); expect(!!data[1].styles).toEqual(true); }); @@ -1643,6 +1653,107 @@ describe("api", function () { await loadingTask.destroy(); }); + it("gets text content, with no extra spaces (issue 13226)", async function () { + const loadingTask = getDocument(buildGetDocumentParams("issue13226.pdf")); + const pdfDoc = await loadingTask.promise; + const pdfPage = await pdfDoc.getPage(1); + const { items } = await pdfPage.getTextContent(); + const text = mergeText(items); + + expect(text).toEqual( + "Mitarbeiterinnen und Mitarbeiter arbeiten in über 100 Ländern engagiert im Dienste" + ); + + await loadingTask.destroy(); + }); + + it("gets text content, with merged spaces (issue 13201)", async function () { + const loadingTask = getDocument(buildGetDocumentParams("issue13201.pdf")); + const pdfDoc = await loadingTask.promise; + const pdfPage = await pdfDoc.getPage(1); + const { items } = await pdfPage.getTextContent(); + const text = mergeText(items); + + expect( + text.includes( + "Abstract. A purely peer-to-peer version of electronic cash would allow online" + ) + ).toEqual(true); + expect( + text.includes( + "avoid mediating disputes. The cost of mediation increases transaction costs, limiting the" + ) + ).toEqual(true); + expect( + text.includes( + "system is secure as long as honest nodes collectively control more CPU power than any" + ) + ).toEqual(true); + + await loadingTask.destroy(); + }); + + it("gets text content, with no spaces between letters of words (issue 11913)", async function () { + const loadingTask = getDocument(buildGetDocumentParams("issue11913.pdf")); + const pdfDoc = await loadingTask.promise; + const pdfPage = await pdfDoc.getPage(1); + const { items } = await pdfPage.getTextContent(); + const text = mergeText(items); + + expect( + text.includes( + "1. 
The first of these cases arises from the tragic handicap which has blighted the life of the Plaintiff, and from the response of the" + ) + ).toEqual(true); + expect( + text.includes( + "argued in this Court the appeal raises narrower, but important, issues which may be summarised as follows:-" + ) + ).toEqual(true); + await loadingTask.destroy(); + }); + + it("gets text content, with merged spaces (issue 10900)", async function () { + const loadingTask = getDocument(buildGetDocumentParams("issue10900.pdf")); + const pdfDoc = await loadingTask.promise; + const pdfPage = await pdfDoc.getPage(1); + const { items } = await pdfPage.getTextContent(); + const text = mergeText(items); + + expect( + text.includes(`3 3 3 3 +851.5 854.9 839.3 837.5 +633.6 727.8 789.9 796.2 +1,485.1 1,582.7 1,629.2 1,633.7 +114.2 121.7 125.3 130.7 +13.0x 13.0x 13.0x 12.5x`) + ).toEqual(true); + + await loadingTask.destroy(); + }); + + it("gets text content, with spaces (issue 10640)", async function () { + const loadingTask = getDocument(buildGetDocumentParams("issue10640.pdf")); + const pdfDoc = await loadingTask.promise; + const pdfPage = await pdfDoc.getPage(1); + const { items } = await pdfPage.getTextContent(); + const text = mergeText(items); + + expect( + text.includes(`Open Sans is a humanist sans serif typeface designed by Steve Matteson. +Open Sans was designed with an upright stress, open forms and a neu- +tral, yet friendly appearance. It was optimized for print, web, and mobile +interfaces, and has excellent legibility characteristics in its letterforms (see +figure \x81 on the following page). This font is available from the Google Font +Directory [\x81] as TrueType files licensed under the Apache License version \x82.\x80. +This package provides support for this font in LATEX. 
It includes Type \x81 +versions of the fonts, converted for this package using FontForge from its +sources, for full support with Dvips.`) + ).toEqual(true); + + await loadingTask.destroy(); + }); + it("gets empty structure tree", async function () { const tree = await page.getStructTree(); diff --git a/test/unit/pdf_find_controller_spec.js b/test/unit/pdf_find_controller_spec.js index 1b97f47e4..f95daaacc 100644 --- a/test/unit/pdf_find_controller_spec.js +++ b/test/unit/pdf_find_controller_spec.js @@ -268,7 +268,7 @@ describe("pdf_find_controller", function () { pageIndex: 0, matchIndex: 0, }, - pageMatches: [[19, 48, 66]], + pageMatches: [[19, 46, 62]], pageMatchesLength: [[8, 8, 8]], }); });