Merge pull request #16200 from calixteman/dont_normalize

[api-minor] Don't normalize the text used in the text layer.

This commit is contained in commit dbe0c4e60c.
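In short: the worker no longer NFKC-normalizes the strings it sends to the text layer. Instead a disableNormalization option is threaded from the API down to the worker, normalization moves behind a new normalizeUnicode() helper, and the viewer normalizes only where it matters (find and copy). A rough consumer-side sketch of the new option, assuming the library is consumed as the pdfjs-dist package and that "example.pdf" exists (both are placeholders, not part of this commit):

// Hypothetical usage sketch, not code from this commit.
import { getDocument } from "pdfjs-dist";

const pdf = await getDocument("example.pdf").promise;
const page = await pdf.getPage(1);

// Raw text as stored in the PDF (ligatures such as "ﬁ" are kept); this is
// what the text layer now renders.
const raw = await page.getTextContent({ disableNormalization: true });

// NFKC-normalized text, matching the previous default behaviour.
const normalized = await page.getTextContent({ disableNormalization: false });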
					
@@ -147,7 +147,11 @@ function bidi(str, startLevel = -1, vertical = false) {
       if (!charType) {
         warn("Bidi: invalid Unicode character " + charCode.toString(16));
       }
-    } else if (0x0700 <= charCode && charCode <= 0x08ac) {
+    } else if (
+      (0x0700 <= charCode && charCode <= 0x08ac) ||
+      (0xfb50 <= charCode && charCode <= 0xfdff) ||
+      (0xfe70 <= charCode && charCode <= 0xfeff)
+    ) {
       charType = "AL";
     }
     if (charType === "R" || charType === "AL" || charType === "AN") {
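The widened condition above classifies the Arabic presentation-forms blocks as Arabic letters ("AL") during bidi analysis, which matters now that those code points are no longer normalized away before reaching bidi(). A standalone sketch of the same predicate; the helper name is made up for illustration and does not exist in the codebase:

// Mirrors the new check in bidi(); illustrative only.
function isArabicLetterForBidi(charCode) {
  return (
    (0x0700 <= charCode && charCode <= 0x08ac) || // Syriac through Arabic Extended-A
    (0xfb50 <= charCode && charCode <= 0xfdff) || // Arabic Presentation Forms-A
    (0xfe70 <= charCode && charCode <= 0xfeff)    // Arabic Presentation Forms-B
  );
}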
@@ -511,7 +511,13 @@ class Page {
     });
   }

-  extractTextContent({ handler, task, includeMarkedContent, sink }) {
+  extractTextContent({
+    handler,
+    task,
+    includeMarkedContent,
+    disableNormalization,
+    sink,
+  }) {
     const contentStreamPromise = this.getContentStream();
     const resourcesPromise = this.loadResources([
       "ExtGState",
@@ -539,6 +545,7 @@ class Page {
         task,
         resources: this.resources,
         includeMarkedContent,
+        disableNormalization,
         sink,
         viewBox: this.view,
       });
@@ -24,6 +24,7 @@ import {
   IDENTITY_MATRIX,
   info,
   isArrayEqual,
+  normalizeUnicode,
   OPS,
   shadow,
   stringToPDFString,
@@ -2271,6 +2272,7 @@ class PartialEvaluator {
     seenStyles = new Set(),
     viewBox,
     markedContentData = null,
+    disableNormalization = false,
   }) {
     // Ensure that `resources`/`stateManager` is correctly initialized,
     // even if the provided parameter is e.g. `null`.
@@ -2524,7 +2526,10 @@ class PartialEvaluator {
     }

     function runBidiTransform(textChunk) {
-      const text = textChunk.str.join("");
+      let text = textChunk.str.join("");
+      if (!disableNormalization) {
+        text = normalizeUnicode(text);
+      }
       const bidiResult = bidi(text, -1, textChunk.vertical);
       return {
         str: bidiResult.str,
@@ -2859,7 +2864,7 @@ class PartialEvaluator {
           textChunk.prevTransform = getCurrentTextTransform();
         }

-        const glyphUnicode = glyph.normalizedUnicode;
+        const glyphUnicode = glyph.unicode;
         if (saveLastChar(glyphUnicode)) {
           // The two last chars are a non-whitespace followed by a whitespace
           // and then this non-whitespace, so we insert a whitespace here.
@@ -3242,6 +3247,7 @@ class PartialEvaluator {
                     seenStyles,
                     viewBox,
                     markedContentData,
+                    disableNormalization,
                   })
                   .then(function () {
                     if (!sinkWrapper.enqueueInvoked) {
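With these changes the text items are built from glyph.unicode directly, and NFKC normalization happens in runBidiTransform() only while disableNormalization is false. A hedged sketch of the observable difference, assuming page is a PDFPageProxy for a page whose first item is an "ﬁ" ligature (the helper name is illustrative):

// Illustrative only.
async function compareLigatureExtraction(page) {
  const { items: raw } = await page.getTextContent({ disableNormalization: true });
  const { items: norm } = await page.getTextContent({ disableNormalization: false });
  console.log(raw[0].str);  // e.g. "ﬁ", the raw glyph, as rendered in the text layer
  console.log(norm[0].str); // e.g. "fi", normalizeUnicode() was applied in the worker
}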
@@ -33,11 +33,9 @@ import {
 } from "./fonts_utils.js";
 import {
   getCharUnicodeCategory,
-  getNormalizedUnicodes,
   getUnicodeForGlyph,
   getUnicodeRangeFor,
   mapSpecialUnicodeValues,
-  reverseIfRtl,
 } from "./unicode.js";
 import { getDingbatsGlyphsUnicode, getGlyphsUnicode } from "./glyphlist.js";
 import {
@@ -277,24 +275,6 @@ class Glyph {
       /* nonSerializable = */ true
     );
   }
-
-  /**
-   * This property, which is only used by `PartialEvaluator.getTextContent`,
-   * is purposely made non-serializable.
-   * @type {string}
-   */
-  get normalizedUnicode() {
-    return shadow(
-      this,
-      "normalizedUnicode",
-      reverseIfRtl(Glyph._NormalizedUnicodes[this.unicode] || this.unicode),
-      /* nonSerializable = */ true
-    );
-  }
-
-  static get _NormalizedUnicodes() {
-    return shadow(this, "_NormalizedUnicodes", getNormalizedUnicodes());
-  }
 }

 function int16(b0, b1) {
@@ -507,6 +487,9 @@ function adjustMapping(charCodeToGlyphId, hasGlyph, newGlyphZeroId, toUnicode) {
   const privateUseOffetStart = PRIVATE_USE_AREAS[privateUseAreaIndex][0];
   let nextAvailableFontCharCode = privateUseOffetStart;
   let privateUseOffetEnd = PRIVATE_USE_AREAS[privateUseAreaIndex][1];
+  const isInPrivateArea = code =>
+    (PRIVATE_USE_AREAS[0][0] <= code && code <= PRIVATE_USE_AREAS[0][1]) ||
+    (PRIVATE_USE_AREAS[1][0] <= code && code <= PRIVATE_USE_AREAS[1][1]);
   for (let originalCharCode in charCodeToGlyphId) {
     originalCharCode |= 0;
     let glyphId = charCodeToGlyphId[originalCharCode];
@@ -539,11 +522,7 @@ function adjustMapping(charCodeToGlyphId, hasGlyph, newGlyphZeroId, toUnicode) {
     if (typeof unicode === "string") {
       unicode = unicode.codePointAt(0);
     }
-    if (
-      unicode &&
-      unicode < privateUseOffetStart &&
-      !usedGlyphIds.has(glyphId)
-    ) {
+    if (unicode && !isInPrivateArea(unicode) && !usedGlyphIds.has(glyphId)) {
       toUnicodeExtraMap.set(unicode, glyphId);
       usedGlyphIds.add(glyphId);
     }
@@ -785,6 +764,7 @@ function createOS2Table(properties, charstrings, override) {

   let firstCharIndex = null;
   let lastCharIndex = 0;
+  let position = -1;

   if (charstrings) {
     for (let code in charstrings) {
@@ -796,7 +776,7 @@ function createOS2Table(properties, charstrings, override) {
         lastCharIndex = code;
       }

-      const position = getUnicodeRangeFor(code);
+      position = getUnicodeRangeFor(code, position);
       if (position < 32) {
         ulUnicodeRange1 |= 1 << position;
       } else if (position < 64) {
src/core/unicode.js (1685 lines changed)
File diff suppressed because it is too large.
@@ -745,7 +745,7 @@ class WorkerMessageHandler {
     });

     handler.on("GetTextContent", function (data, sink) {
-      const { pageIndex, includeMarkedContent } = data;
+      const { pageIndex, includeMarkedContent, disableNormalization } = data;

       pdfManager.getPage(pageIndex).then(function (page) {
         const task = new WorkerTask("GetTextContent: page " + pageIndex);
@@ -760,6 +760,7 @@ class WorkerMessageHandler {
             task,
             sink,
             includeMarkedContent,
+            disableNormalization,
           })
           .then(
             function () {
@@ -1122,6 +1122,8 @@ class PDFDocumentProxy {
  * @typedef {Object} getTextContentParameters
  * @property {boolean} [includeMarkedContent] - When true include marked
  *   content items in the items array of TextContent. The default is `false`.
+ * @property {boolean} [disableNormalization] - When true the text is *not*
+ *   normalized in the worker-thread. The default is `false`.
  */

 /**
@@ -1598,7 +1600,10 @@ class PDFPageProxy {
    * @param {getTextContentParameters} params - getTextContent parameters.
    * @returns {ReadableStream} Stream for reading text content chunks.
    */
-  streamTextContent({ includeMarkedContent = false } = {}) {
+  streamTextContent({
+    includeMarkedContent = false,
+    disableNormalization = false,
+  } = {}) {
     const TEXT_CONTENT_CHUNK_SIZE = 100;

     return this._transport.messageHandler.sendWithStream(
@@ -1606,6 +1611,7 @@ class PDFPageProxy {
       {
         pageIndex: this._pageIndex,
         includeMarkedContent: includeMarkedContent === true,
         disableNormalization: disableNormalization === true,
       },
       {
         highWaterMark: TEXT_CONTENT_CHUNK_SIZE,
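The text layer consumes the same option through the streaming API. A minimal sketch of reading such a stream, assuming page is a PDFPageProxy (this mirrors how the viewer wires it up in a later hunk):

// Sketch only.
const readableStream = page.streamTextContent({
  includeMarkedContent: true,
  disableNormalization: true,
});
const reader = readableStream.getReader();
for (let chunk = await reader.read(); !chunk.done; chunk = await reader.read()) {
  for (const item of chunk.value.items) {
    // item.str now holds the un-normalized text used to render the text layer.
  }
}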
@@ -35,6 +35,7 @@ import {
   FeatureTest,
   InvalidPDFException,
   MissingPDFException,
+  normalizeUnicode,
   OPS,
   PasswordResponses,
   PermissionFlag,
@@ -100,6 +101,7 @@ export {
   isPdfFile,
   loadScript,
   MissingPDFException,
+  normalizeUnicode,
   OPS,
   PasswordResponses,
   PDFDataRangeTransport,
@@ -1026,6 +1026,25 @@ function createPromiseCapability() {
   return capability;
 }

+let NormalizeRegex = null;
+let NormalizationMap = null;
+function normalizeUnicode(str) {
+  if (!NormalizeRegex) {
+    // In order to generate the following regex:
+    //  - create a PDF containing all the chars in the range 0000-FFFF with
+    //    a NFKC which is different of the char.
+    //  - copy and paste all those chars and get the ones where NFKC is
+    //    required.
+    // It appears that most the chars here contain some ligatures.
+    NormalizeRegex =
+      /([\u00a0\u00b5\u037e\u0eb3\u2000-\u200a\u202f\u2126\ufb00-\ufb04\ufb06\ufb20-\ufb36\ufb38-\ufb3c\ufb3e\ufb40-\ufb41\ufb43-\ufb44\ufb46-\ufba1\ufba4-\ufba9\ufbae-\ufbb1\ufbd3-\ufbdc\ufbde-\ufbe7\ufbea-\ufbf8\ufbfc-\ufbfd\ufc00-\ufc5d\ufc64-\ufcf1\ufcf5-\ufd3d\ufd88\ufdf4\ufdfa-\ufdfb\ufe71\ufe77\ufe79\ufe7b\ufe7d]+)|(\ufb05+)/gu;
+    NormalizationMap = new Map([["ſt", "ſt"]]);
+  }
+  return str.replaceAll(NormalizeRegex, (_, p1, p2) => {
+    return p1 ? p1.normalize("NFKC") : NormalizationMap.get(p2);
+  });
+}
+
 export {
   AbortException,
   AnnotationActionEventType,
@@ -1064,6 +1083,7 @@ export {
   LINE_FACTOR,
   MAX_IMAGE_SIZE_TO_CACHE,
   MissingPDFException,
+  normalizeUnicode,
   objectFromMap,
   objectSize,
   OPS,
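The new normalizeUnicode() helper, re-exported from the public API above, applies NFKC only to the code points matched by the regex, and special-cases U+FB05 because its NFKC form ("st") is not the wanted "ſt". A hedged usage sketch:

// Sketch; assumes normalizeUnicode is imported from the module above.
normalizeUnicode("\ufb03");      // "ffi"  (the ﬃ ligature is expanded via NFKC)
normalizeUnicode("\ufb05");      // "ſt"   (special-cased: long s + t, not "st")
normalizeUnicode("plain text");  // "plain text" (nothing to normalize, returned as-is)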
@@ -693,6 +693,7 @@ class Driver {
               initPromise = page
                 .getTextContent({
                   includeMarkedContent: true,
+                  disableNormalization: true,
                 })
                 .then(function (textContent) {
                   return Rasterize.textLayer(
@@ -28,7 +28,7 @@ describe("Copy and paste", () => {
       await closePages(pages);
     });

-    it("must check that we've all the contents", async () => {
+    it("must check that we've all the contents on copy/paste", async () => {
       await Promise.all(
         pages.map(async ([browserName, page]) => {
           await page.keyboard.down("Control");
@@ -117,4 +117,47 @@ describe("Copy and paste", () => {
       );
     });
   });
+  describe("all text", () => {
+    let pages;
+
+    beforeAll(async () => {
+      pages = await loadAndWait("copy_paste_ligatures.pdf", ".textLayer");
+      await mockClipboard(pages);
+    });
+
+    afterAll(async () => {
+      await closePages(pages);
+    });
+
+    it("must check that we've all the contents on copy/paste", async () => {
+      await Promise.all(
+        pages.map(async ([browserName, page]) => {
+          await page.keyboard.down("Control");
+          await page.keyboard.press("a");
+          await page.keyboard.up("Control");
+
+          await page.waitForTimeout(100);
+
+          await page.keyboard.down("Control");
+          await page.keyboard.press("c");
+          await page.keyboard.up("Control");
+
+          await page.waitForTimeout(100);
+
+          await page.waitForFunction(
+            `document.querySelector('#viewerContainer').style.cursor !== "wait"`
+          );
+
+          const text = await page.evaluate(() =>
+            navigator.clipboard.readText()
+          );
+
+          expect(!!text).withContext(`In ${browserName}`).toEqual(true);
+          expect(text)
+            .withContext(`In ${browserName}`)
+            .toEqual("abcdeffffiflffifflſtstghijklmno");
+        })
+      );
+    });
+  });
 });
test/pdfs/.gitignore (vendored, 1 line changed)
@@ -585,3 +585,4 @@
 !issue16221.pdf
 !issue16224.pdf
 !issue16278.pdf
+!copy_paste_ligatures.pdf
test/pdfs/copy_paste_ligatures.pdf (BIN, new executable file; binary file not shown)
@@ -2340,7 +2340,9 @@ page 1 / 3`);
       );
       const pdfDoc = await loadingTask.promise;
       const pdfPage = await pdfDoc.getPage(1);
-      const { items, styles } = await pdfPage.getTextContent();
+      const { items, styles } = await pdfPage.getTextContent({
+        disableNormalization: true,
+      });
       expect(items.length).toEqual(1);
       // Font name will be a random object id.
       const fontName = items[0].fontName;
@@ -2376,7 +2378,9 @@ page 1 / 3`);
       const loadingTask = getDocument(buildGetDocumentParams("issue13226.pdf"));
       const pdfDoc = await loadingTask.promise;
       const pdfPage = await pdfDoc.getPage(1);
-      const { items } = await pdfPage.getTextContent();
+      const { items } = await pdfPage.getTextContent({
+        disableNormalization: true,
+      });
       const text = mergeText(items);

       expect(text).toEqual(
@@ -2394,7 +2398,9 @@ page 1 / 3`);
       const loadingTask = getDocument(buildGetDocumentParams("issue16119.pdf"));
       const pdfDoc = await loadingTask.promise;
       const pdfPage = await pdfDoc.getPage(1);
-      const { items } = await pdfPage.getTextContent();
+      const { items } = await pdfPage.getTextContent({
+        disableNormalization: true,
+      });
       const text = mergeText(items);

       expect(
@@ -2410,7 +2416,9 @@ page 1 / 3`);
       const loadingTask = getDocument(buildGetDocumentParams("issue13201.pdf"));
       const pdfDoc = await loadingTask.promise;
       const pdfPage = await pdfDoc.getPage(1);
-      const { items } = await pdfPage.getTextContent();
+      const { items } = await pdfPage.getTextContent({
+        disableNormalization: true,
+      });
       const text = mergeText(items);

       expect(
@@ -2436,7 +2444,9 @@ page 1 / 3`);
       const loadingTask = getDocument(buildGetDocumentParams("issue11913.pdf"));
       const pdfDoc = await loadingTask.promise;
       const pdfPage = await pdfDoc.getPage(1);
-      const { items } = await pdfPage.getTextContent();
+      const { items } = await pdfPage.getTextContent({
+        disableNormalization: true,
+      });
       const text = mergeText(items);

       expect(
@@ -2456,7 +2466,9 @@ page 1 / 3`);
       const loadingTask = getDocument(buildGetDocumentParams("issue10900.pdf"));
       const pdfDoc = await loadingTask.promise;
       const pdfPage = await pdfDoc.getPage(1);
-      const { items } = await pdfPage.getTextContent();
+      const { items } = await pdfPage.getTextContent({
+        disableNormalization: true,
+      });
       const text = mergeText(items);

       expect(
@@ -2475,11 +2487,27 @@ page 1 / 3`);
       const loadingTask = getDocument(buildGetDocumentParams("issue10640.pdf"));
       const pdfDoc = await loadingTask.promise;
       const pdfPage = await pdfDoc.getPage(1);
-      const { items } = await pdfPage.getTextContent();
-      const text = mergeText(items);
+      let { items } = await pdfPage.getTextContent({
+        disableNormalization: true,
+      });
+      let text = mergeText(items);
+      let expected = `Open Sans is a humanist sans serif typeface designed by Steve Matteson.
+Open Sans was designed with an upright stress, open forms and a neu-
+tral, yet friendly appearance. It was optimized for print, web, and mobile
+interfaces, and has excellent legibility characteristics in its letterforms (see
+figure \x81 on the following page). This font is available from the Google Font
+Directory [\x81] as TrueType files licensed under the Apache License version \x82.\x80.
+This package provides support for this font in LATEX. It includes Type \x81
+versions of the fonts, converted for this package using FontForge from its
+sources, for full support with Dvips.`;

-      expect(
-        text.includes(`Open Sans is a humanist sans serif typeface designed by Steve Matteson.
+      expect(text.includes(expected)).toEqual(true);
+
+      ({ items } = await pdfPage.getTextContent({
+        disableNormalization: false,
+      }));
+      text = mergeText(items);
+      expected = `Open Sans is a humanist sans serif typeface designed by Steve Matteson.
 Open Sans was designed with an upright stress, open forms and a neu-
 tral, yet friendly appearance. It was optimized for print, web, and mobile
 interfaces, and has excellent legibility characteristics in its letterforms (see
@@ -2487,8 +2515,8 @@ figure \x81 on the following page). This font is available from the Google Font
 Directory [\x81] as TrueType files licensed under the Apache License version \x82.\x80.
 This package provides support for this font in LATEX. It includes Type \x81
 versions of the fonts, converted for this package using FontForge from its
-sources, for full support with Dvips.`)
-      ).toEqual(true);
+sources, for full support with Dvips.`;
+      expect(text.includes(expected)).toEqual(true);

       await loadingTask.destroy();
     });
@@ -2501,7 +2529,9 @@ sources, for full support with Dvips.`)
       const loadingTask = getDocument(buildGetDocumentParams("bug931481.pdf"));
       const pdfDoc = await loadingTask.promise;
       const pdfPage = await pdfDoc.getPage(1);
-      const { items } = await pdfPage.getTextContent();
+      const { items } = await pdfPage.getTextContent({
+        disableNormalization: true,
+      });
       const text = mergeText(items);

       expect(
@@ -2529,7 +2559,9 @@ sozialökonomische Gerechtigkeit.`)
       const loadingTask = getDocument(buildGetDocumentParams("issue9186.pdf"));
       const pdfDoc = await loadingTask.promise;
       const pdfPage = await pdfDoc.getPage(1);
-      const { items } = await pdfPage.getTextContent();
+      const { items } = await pdfPage.getTextContent({
+        disableNormalization: true,
+      });
       const text = mergeText(items);

       expect(
@@ -2550,7 +2582,9 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`)
       );
       const pdfDoc = await loadingTask.promise;
       const pdfPage = await pdfDoc.getPage(1);
-      const { items } = await pdfPage.getTextContent();
+      const { items } = await pdfPage.getTextContent({
+        disableNormalization: true,
+      });
       const text = mergeText(items);

       expect(text).toEqual(
@@ -2568,7 +2602,9 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`)
       const loadingTask = getDocument(buildGetDocumentParams("bug1755201.pdf"));
       const pdfDoc = await loadingTask.promise;
       const pdfPage = await pdfDoc.getPage(6);
-      const { items } = await pdfPage.getTextContent();
+      const { items } = await pdfPage.getTextContent({
+        disableNormalization: true,
+      });
       const text = mergeText(items);

       expect(/win aisle/.test(text)).toEqual(false);
@@ -2586,10 +2622,12 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`)
       const pdfPage = await pdfDoc.getPage(568);
       let { items } = await pdfPage.getTextContent({
         includeMarkedContent: false,
+        disableNormalization: true,
       });
       const textWithoutMC = mergeText(items);
       ({ items } = await pdfPage.getTextContent({
         includeMarkedContent: true,
+        disableNormalization: true,
       }));
       const textWithMC = mergeText(items);

@@ -2607,7 +2645,9 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`)
       );
       const pdfDoc = await loadingTask.promise;
       const pdfPage = await pdfDoc.getPage(1);
-      const { items } = await pdfPage.getTextContent();
+      const { items } = await pdfPage.getTextContent({
+        disableNormalization: true,
+      });
       const text = mergeText(items);

       expect(text).toEqual("𠮷");
@@ -2619,7 +2659,9 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`)
       const loadingTask = getDocument(buildGetDocumentParams("issue16221.pdf"));
       const pdfDoc = await loadingTask.promise;
       const pdfPage = await pdfDoc.getPage(1);
-      const { items } = await pdfPage.getTextContent();
+      const { items } = await pdfPage.getTextContent({
+        disableNormalization: true,
+      });

       expect(items.map(i => i.str)).toEqual(["Hello ", "World"]);
@@ -542,7 +542,7 @@ describe("pdf_find_controller", function () {
         pageIndex: 0,
         matchIndex: 0,
       },
-      pageMatches: [[2743]],
+      pageMatches: [[2734]],
       pageMatchesLength: [[14]],
     });
   });
@@ -561,7 +561,7 @@ describe("pdf_find_controller", function () {
         pageIndex: 1,
         matchIndex: 0,
       },
-      pageMatches: [[], [1493]],
+      pageMatches: [[], [1486]],
       pageMatchesLength: [[], [11]],
     });
   });
@@ -594,7 +594,7 @@ describe("pdf_find_controller", function () {
         [],
         [],
         [],
-        [2087],
+        [2081],
       ],
       pageMatchesLength: [
         [24],
@@ -629,7 +629,7 @@ describe("pdf_find_controller", function () {
         pageIndex: 0,
         matchIndex: 0,
       },
-      pageMatches: [[1501]],
+      pageMatches: [[1497]],
       pageMatchesLength: [[25]],
     });
   });
@@ -670,7 +670,7 @@ describe("pdf_find_controller", function () {
         pageIndex: 0,
         matchIndex: 0,
       },
-      pageMatches: [[1946]],
+      pageMatches: [[1941]],
       pageMatchesLength: [[21]],
     });
   });
@@ -692,7 +692,7 @@ describe("pdf_find_controller", function () {
         pageIndex: 0,
         matchIndex: 0,
      },
-      pageMatches: [[1946]],
+      pageMatches: [[1941]],
       pageMatchesLength: [[23]],
     });
   });
@@ -712,7 +712,7 @@ describe("pdf_find_controller", function () {
         pageIndex: 0,
         matchIndex: 0,
       },
-      pageMatches: [[1946]],
+      pageMatches: [[1941]],
       pageMatchesLength: [[23]],
     });
   });
@@ -976,4 +976,61 @@ describe("pdf_find_controller", function () {
       pageMatchesLength: [[5, 5]],
     });
   });
+
+  it("performs a search in a text with some arabic chars in different unicode ranges but with same normalized form", async function () {
+    const { eventBus, pdfFindController } = await initPdfFindController(
+      "ArabicCIDTrueType.pdf"
+    );
+
+    await testSearch({
+      eventBus,
+      pdfFindController,
+      state: {
+        query: "\u0629",
+      },
+      matchesPerPage: [4],
+      selectedMatch: {
+        pageIndex: 0,
+        matchIndex: 0,
+      },
+      pageMatches: [[6, 25, 44, 63]],
+      pageMatchesLength: [[1, 1, 1, 1]],
+    });
+
+    await testSearch({
+      eventBus,
+      pdfFindController,
+      state: {
+        query: "\ufe94",
+      },
+      matchesPerPage: [4],
+      selectedMatch: {
+        pageIndex: 0,
+        matchIndex: 0,
+      },
+      pageMatches: [[6, 25, 44, 63]],
+      pageMatchesLength: [[1, 1, 1, 1]],
+    });
+  });
+
+  it("performs a search in a text with some f ligatures", async function () {
+    const { eventBus, pdfFindController } = await initPdfFindController(
+      "copy_paste_ligatures.pdf"
+    );
+
+    await testSearch({
+      eventBus,
+      pdfFindController,
+      state: {
+        query: "f",
+      },
+      matchesPerPage: [9],
+      selectedMatch: {
+        pageIndex: 0,
+        matchIndex: 0,
+      },
+      pageMatches: [[5, 6, 6, 7, 8, 9, 9, 10, 10]],
+      pageMatchesLength: [[1, 1, 1, 1, 1, 1, 1, 1, 1]],
+    });
+  });
 });
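Most of the expected offsets above shrink slightly because the extracted page text is now raw: a ligature counts as a single code point instead of its expanded form, so every ligature that occurs before a match shifts that match's index down. A tiny illustrative sketch of the effect (values are made up):

console.log("e\ufb03cient".length); // 7, raw text where "ﬃ" is one char
console.log("efficient".length);    // 9, the previously normalized form
// Hence a match that used to start at 2743 can now start at 2734.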
@@ -15,11 +15,9 @@

 import {
   getCharUnicodeCategory,
-  getNormalizedUnicodes,
   getUnicodeForGlyph,
   getUnicodeRangeFor,
   mapSpecialUnicodeValues,
-  reverseIfRtl,
 } from "../../src/core/unicode.js";
 import {
   getDingbatsGlyphsUnicode,
@@ -152,69 +150,12 @@ describe("unicode", function () {
       expect(getUnicodeRangeFor(0x0041)).toEqual(0);
       // fi (Alphabetic Presentation Forms)
       expect(getUnicodeRangeFor(0xfb01)).toEqual(62);
+      // Combining diacritic (Cyrillic Extended-A)
+      expect(getUnicodeRangeFor(0x2dff)).toEqual(9);
     });

     it("should not get a Unicode range", function () {
-      expect(getUnicodeRangeFor(0x05ff)).toEqual(-1);
-    });
-  });
-
-  describe("getNormalizedUnicodes", function () {
-    let NormalizedUnicodes;
-
-    beforeAll(function () {
-      NormalizedUnicodes = getNormalizedUnicodes();
-    });
-
-    afterAll(function () {
-      NormalizedUnicodes = null;
-    });
-
-    it("should get normalized Unicode values for ligatures", function () {
-      // fi => f + i
-      expect(NormalizedUnicodes["\uFB01"]).toEqual("fi");
-      // Arabic
-      expect(NormalizedUnicodes["\u0675"]).toEqual("\u0627\u0674");
-    });
-
-    it("should not normalize standard characters", function () {
-      expect(NormalizedUnicodes.A).toEqual(undefined);
-    });
-  });
-
-  describe("reverseIfRtl", function () {
-    let NormalizedUnicodes;
-
-    function getGlyphUnicode(char) {
-      if (NormalizedUnicodes[char] !== undefined) {
-        return NormalizedUnicodes[char];
-      }
-      return char;
-    }
-
-    beforeAll(function () {
-      NormalizedUnicodes = getNormalizedUnicodes();
-    });
-
-    afterAll(function () {
-      NormalizedUnicodes = null;
-    });
-
-    it("should not reverse LTR characters", function () {
-      const A = getGlyphUnicode("A");
-      expect(reverseIfRtl(A)).toEqual("A");
-
-      const fi = getGlyphUnicode("\uFB01");
-      expect(reverseIfRtl(fi)).toEqual("fi");
-    });
-
-    it("should reverse RTL characters", function () {
-      // Hebrew (no-op, since it's not a combined character)
-      const heAlef = getGlyphUnicode("\u05D0");
-      expect(reverseIfRtl(heAlef)).toEqual("\u05D0");
-      // Arabic
-      const arAlef = getGlyphUnicode("\u0675");
-      expect(reverseIfRtl(arAlef)).toEqual("\u0674\u0627");
+      expect(getUnicodeRangeFor(0xaa60)).toEqual(-1);
     });
   });
 });
@@ -18,8 +18,8 @@
 /** @typedef {import("./interfaces").IPDFLinkService} IPDFLinkService */

 import { binarySearchFirstItem, scrollIntoView } from "./ui_utils.js";
+import { getCharacterType, getNormalizeWithNFKC } from "./pdf_find_utils.js";
 import { createPromiseCapability } from "pdfjs-lib";
-import { getCharacterType } from "./pdf_find_utils.js";

 const FindState = {
   FOUND: 0,
@@ -126,12 +126,7 @@ function normalize(text) {
   } else {
     // Compile the regular expression for text normalization once.
     const replace = Object.keys(CHARACTERS_TO_NORMALIZE).join("");
-    const toNormalizeWithNFKC =
-      "\u2460-\u2473" + // Circled numbers.
-      "\u24b6-\u24ff" + // Circled letters/numbers.
-      "\u3244-\u32bf" + // Circled ideograms/numbers.
-      "\u32d0-\u32fe" + // Circled ideograms.
-      "\uff00-\uffef"; // Halfwidth, fullwidth forms.
+    const toNormalizeWithNFKC = getNormalizeWithNFKC();

     // 3040-309F: Hiragana
     // 30A0-30FF: Katakana
@@ -840,6 +835,7 @@ class PDFFindController {
     }

     let promise = Promise.resolve();
+    const textOptions = { disableNormalization: true };
     for (let i = 0, ii = this._linkService.pagesCount; i < ii; i++) {
       const extractTextCapability = createPromiseCapability();
       this._extractTextPromises[i] = extractTextCapability.promise;
@@ -848,7 +844,7 @@ class PDFFindController {
         return this._pdfDocument
           .getPage(i + 1)
           .then(pdfPage => {
-            return pdfPage.getTextContent();
+            return pdfPage.getTextContent(textOptions);
           })
           .then(
             textContent => {
@@ -112,4 +112,46 @@ function getCharacterType(charCode) {
   return CharacterType.ALPHA_LETTER;
 }

-export { CharacterType, getCharacterType };
+let NormalizeWithNFKC;
+function getNormalizeWithNFKC() {
+  /* eslint-disable no-irregular-whitespace */
+  NormalizeWithNFKC ||= ` ¨ª¯²-µ¸-º¼-¾IJ-ijĿ-ŀʼnſDŽ-njDZ-dzʰ-ʸ˘-˝ˠ-ˤʹͺ;΄-΅·ϐ-ϖϰ-ϲϴ-ϵϹևٵ-ٸक़-य़ড়-ঢ়য়ਲ਼ਸ਼ਖ਼-ਜ਼ਫ਼ଡ଼-ଢ଼ำຳໜ-ໝ༌གྷཌྷདྷབྷཛྷཀྵჼᴬ-ᴮᴰ-ᴺᴼ-ᵍᵏ-ᵪᵸᶛ-ᶿẚ-ẛάέήίόύώΆ᾽-῁ΈΉ῍-῏ΐΊ῝-῟ΰΎ῭-`ΌΏ´-῾ - ‑‗․-… ″-‴‶-‷‼‾⁇-⁉⁗ ⁰-ⁱ⁴-₎ₐ-ₜ₨℀-℃℅-ℇ℉-ℓℕ-№ℙ-ℝ℠-™ℤΩℨK-ℭℯ-ℱℳ-ℹ℻-⅀ⅅ-ⅉ⅐-ⅿ↉∬-∭∯-∰〈-〉①-⓪⨌⩴-⩶⫝̸ⱼ-ⱽⵯ⺟⻳⼀-⿕ 〶〸-〺゛-゜ゟヿㄱ-ㆎ㆒-㆟㈀-㈞㈠-㉇㉐-㉾㊀-㏿ꚜ-ꚝꝰꟲ-ꟴꟸ-ꟹꭜ-ꭟꭩ豈-嗀塚晴凞-羽蘒諸逸-都飯-舘並-龎ff-stﬓ-ﬗיִײַ-זּטּ-לּמּנּ-סּףּ-פּצּ-ﮱﯓ-ﴽﵐ-ﶏﶒ-ﷇﷰ-﷼︐-︙︰-﹄﹇-﹒﹔-﹦﹨-﹫ﹰ-ﹲﹴﹶ-ﻼ!-하-ᅦᅧ-ᅬᅭ-ᅲᅳ-ᅵ¢-₩`;
+
+  if (typeof PDFJSDev === "undefined" || PDFJSDev.test("TESTING")) {
+    const ranges = [];
+    const range = [];
+    const diacriticsRegex = /^\p{M}$/u;
+    // Some chars must be replaced by their NFKC counterpart during a search.
+    for (let i = 0; i < 65536; i++) {
+      const c = String.fromCharCode(i);
+      if (c.normalize("NFKC") !== c && !diacriticsRegex.test(c)) {
+        if (range.length !== 2) {
+          range[0] = range[1] = i;
+          continue;
+        }
+        if (range[1] + 1 !== i) {
+          if (range[0] === range[1]) {
+            ranges.push(String.fromCharCode(range[0]));
+          } else {
+            ranges.push(
+              `${String.fromCharCode(range[0])}-${String.fromCharCode(
+                range[1]
+              )}`
+            );
+          }
+          range[0] = range[1] = i;
+        } else {
+          range[1] = i;
+        }
+      }
+    }
+    if (ranges.join("") !== NormalizeWithNFKC) {
+      throw new Error(
+        "getNormalizeWithNFKC - update the `NormalizeWithNFKC` string."
+      );
+    }
+  }
+  return NormalizeWithNFKC;
+}
+
+export { CharacterType, getCharacterType, getNormalizeWithNFKC };
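getNormalizeWithNFKC() returns the character ranges that the viewer-side find code must NFKC-normalize itself now that the worker no longer does it; under TESTING builds the loop above re-derives the ranges and fails loudly if the hard-coded string goes stale. A hedged sketch of how such a range string can be folded into query/page-text normalization (the helper below is illustrative; the real logic lives in the larger regex in web/pdf_find_controller.js):

// Illustrative only.
function normalizeForSearch(text) {
  const toNormalizeWithNFKC = getNormalizeWithNFKC();
  const regexp = new RegExp(`([${toNormalizeWithNFKC}])`, "gum");
  return text.replaceAll(regexp, match => match.normalize("NFKC"));
}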
@@ -368,6 +368,7 @@ class PDFPageView {
       if (!textLayer.renderingDone) {
         const readableStream = pdfPage.streamTextContent({
           includeMarkedContent: true,
+          disableNormalization: true,
         });
         textLayer.setTextContentSource(readableStream);
       }
@@ -665,6 +665,8 @@ class PDFViewer {
       }
       buffer.length = 0;
       const page = await this.pdfDocument.getPage(pageNum);
+      // By default getTextContent pass disableNormalization equals to false
+      // which is fine because we want a normalized string.
       const { items } = await page.getTextContent();
       for (const item of items) {
         if (item.str) {
@@ -208,9 +208,20 @@ class TextHighlighter {
       return;
     }

+    let lastDivIdx = -1;
+    let lastOffset = -1;
     for (let i = i0; i < i1; i++) {
       const match = matches[i];
       const begin = match.begin;
+      if (begin.divIdx === lastDivIdx && begin.offset === lastOffset) {
+        // It's possible to be in this situation if we searched for a 'f' and we
+        // have a ligature 'ff' in the text. The 'ff' has to be highlighted two
+        // times.
+        continue;
+      }
+      lastDivIdx = begin.divIdx;
+      lastOffset = begin.offset;
+
       const end = match.end;
       const isSelected = isSelectedPage && i === selectedMatchIdx;
       const highlightSuffix = isSelected ? " selected" : "";
 | 
			
		||||
// eslint-disable-next-line max-len
 | 
			
		||||
/** @typedef {import("./text_accessibility.js").TextAccessibilityManager} TextAccessibilityManager */
 | 
			
		||||
 | 
			
		||||
import { renderTextLayer, updateTextLayer } from "pdfjs-lib";
 | 
			
		||||
import { normalizeUnicode, renderTextLayer, updateTextLayer } from "pdfjs-lib";
 | 
			
		||||
import { removeNullCharacters } from "./ui_utils.js";
 | 
			
		||||
 | 
			
		||||
/**
 | 
			
		||||
 * @typedef {Object} TextLayerBuilderOptions
 | 
			
		||||
@ -212,6 +213,16 @@ class TextLayerBuilder {
 | 
			
		||||
      }
 | 
			
		||||
      end.classList.remove("active");
 | 
			
		||||
    });
 | 
			
		||||
 | 
			
		||||
    div.addEventListener("copy", event => {
 | 
			
		||||
      const selection = document.getSelection();
 | 
			
		||||
      event.clipboardData.setData(
 | 
			
		||||
        "text/plain",
 | 
			
		||||
        removeNullCharacters(normalizeUnicode(selection.toString()))
 | 
			
		||||
      );
 | 
			
		||||
      event.preventDefault();
 | 
			
		||||
      event.stopPropagation();
 | 
			
		||||
    });
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user