From c33b8d76924e6a033a42c7db17b8853a750b9026 Mon Sep 17 00:00:00 2001
From: Jonas Jenwald <jonas.jenwald@gmail.com>
Date: Thu, 3 Nov 2022 10:20:18 +0100
Subject: [PATCH] Cache the normalized unicode-value on the `Glyph`-instance

Currently, during text-extraction, we normalize and (when necessary) reverse the unicode-value of a glyph every time that glyph is encountered. This seems a little unnecessary, since the result won't change, hence this patch moves that computation into the `Glyph`-instance and makes it *lazily* initialized.

Taking the `tracemonkey.pdf` document as an example: When extracting the text-content there's a total of 69236 characters but only 595 unique `Glyph`-instances, which means a 99.1 percent cache hit-rate. Generally speaking, the longer a PDF document is the more beneficial this should be.

*Please note:* The old code is fast enough that it unfortunately seems difficult to measure a (clear) performance improvement with this patch, so I completely understand if it's deemed an unnecessary change.
---
 src/core/evaluator.js | 11 ++---------
 src/core/fonts.js     | 20 ++++++++++++++++++++
 src/shared/util.js    |  4 ++--
 3 files changed, 24 insertions(+), 11 deletions(-)

diff --git a/src/core/evaluator.js b/src/core/evaluator.js
index 2f982b575..84d8950d3 100644
--- a/src/core/evaluator.js
+++ b/src/core/evaluator.js
@@ -51,11 +51,6 @@ import {
   getStdFontMap,
   getSymbolsFonts,
 } from "./standard_fonts.js";
-import {
-  getNormalizedUnicodes,
-  getUnicodeForGlyph,
-  reverseIfRtl,
-} from "./unicode.js";
 import { getTilingPatternIR, Pattern } from "./pattern.js";
 import { getXfaFontDict, getXfaFontName } from "./xfa_fonts.js";
 import { IdentityToUnicodeMap, ToUnicodeMap } from "./to_unicode_map.js";
@@ -75,6 +70,7 @@ import { DecodeStream } from "./decode_stream.js";
 import { getGlyphsUnicode } from "./glyphlist.js";
 import { getLookupTableFactory } from "./core_utils.js";
 import { getMetrics } from "./metrics.js";
+import { getUnicodeForGlyph } from "./unicode.js";
 import { MurmurHash3_64 } from "../shared/murmurhash3.js";
 import { OperatorList } from "./operator_list.js";
 import { PDFImage } from "./image.js";
@@ -2293,7 +2289,6 @@ class PartialEvaluator {
     if (includeMarkedContent) {
       markedContentData = markedContentData || { level: 0 };
     }
-    const NormalizedUnicodes = getNormalizedUnicodes();
 
     const textContent = {
       items: [],
@@ -2839,9 +2834,7 @@ class PartialEvaluator {
           textChunk.prevTransform = getCurrentTextTransform();
         }
 
-        let glyphUnicode = glyph.unicode;
-        glyphUnicode = NormalizedUnicodes[glyphUnicode] || glyphUnicode;
-        glyphUnicode = reverseIfRtl(glyphUnicode);
+        const glyphUnicode = glyph.normalizedUnicode;
         if (saveLastChar(glyphUnicode)) {
           // The two last chars are a non-whitespace followed by a whitespace
           // and then this non-whitespace, so we insert a whitespace here.
diff --git a/src/core/fonts.js b/src/core/fonts.js
index b42486506..e44cfe863 100644
--- a/src/core/fonts.js
+++ b/src/core/fonts.js
@@ -35,9 +35,11 @@ import {
 } from "./fonts_utils.js";
 import {
   getCharUnicodeCategory,
+  getNormalizedUnicodes,
   getUnicodeForGlyph,
   getUnicodeRangeFor,
   mapSpecialUnicodeValues,
+  reverseIfRtl,
 } from "./unicode.js";
 import { getDingbatsGlyphsUnicode, getGlyphsUnicode } from "./glyphlist.js";
 import {
@@ -218,6 +220,24 @@ class Glyph {
     this.isZeroWidthDiacritic = category.isZeroWidthDiacritic;
     this.isInvisibleFormatMark = category.isInvisibleFormatMark;
   }
+
+  /**
+   * This property, which is only used by `PartialEvaluator.getTextContent`,
+   * is purposely made non-serializable.
+   * @type {string}
+   */
+  get normalizedUnicode() {
+    return shadow(
+      this,
+      "normalizedUnicode",
+      reverseIfRtl(Glyph._NormalizedUnicodes[this.unicode] || this.unicode),
+      /* nonSerializable = */ true
+    );
+  }
+
+  static get _NormalizedUnicodes() {
+    return shadow(this, "_NormalizedUnicodes", getNormalizedUnicodes());
+  }
 }
 
 function int16(b0, b1) {
diff --git a/src/shared/util.js b/src/shared/util.js
index bfa0b6bec..723d43129 100644
--- a/src/shared/util.js
+++ b/src/shared/util.js
@@ -498,7 +498,7 @@ function createValidAbsoluteUrl(url, baseUrl = null, options = null) {
   return null;
 }
 
-function shadow(obj, prop, value) {
+function shadow(obj, prop, value, nonSerializable = false) {
   if (
     typeof PDFJSDev === "undefined" ||
     PDFJSDev.test("!PRODUCTION || TESTING")
@@ -510,7 +510,7 @@ function shadow(obj, prop, value) {
   }
   Object.defineProperty(obj, prop, {
     value,
-    enumerable: true,
+    enumerable: !nonSerializable,
     configurable: true,
     writable: false,
   });