Add a (global) cache to the getCharUnicodeCategory function

Given that the regular expression has already become more complex (after the initial patch adding it), it seems to me that it probably cannot hurt to add a global cache to reduce unnecessary re-parsing.
Obviously the `Glyph`-instances are being cached *per* font, however in most documents multiple fonts are being used and in practice there's very often a fair amount of overlap between the /ToUnicode-data in different fonts[1].

Consider for example loading and rendering the entire `tracemonkey.pdf` document (from the test-suite), which isn't a particularly large document. In that case the `getCharUnicodeCategory` function is called a total of `601` times; however, only `106` *unique* Unicode chars are being checked.

*Please note:* In practice I suppose that this won't have a *huge* effect on overall performance, however given the relative simplicity of this patch I figured that it'd not hurt to submit it for review.

---
[1] Consider e.g. how there are usually different fonts used for regular, bold, and italic text, respectively.
This commit is contained in:
Jonas Jenwald 2022-01-24 16:16:54 +01:00
parent 9367d54009
commit 8836593b9e
5 changed files with 66 additions and 28 deletions

View File

@ -13,19 +13,6 @@
* limitations under the License.
*/
import {
clearPrimitiveCaches,
Dict,
isDict,
isName,
isRef,
isRefsEqual,
isStream,
Name,
Ref,
RefSet,
RefSetCache,
} from "./primitives.js";
import {
collectActions,
MissingDataException,
@ -48,8 +35,21 @@ import {
stringToUTF8String,
warn,
} from "../shared/util.js";
import {
Dict,
isDict,
isName,
isRef,
isRefsEqual,
isStream,
Name,
Ref,
RefSet,
RefSetCache,
} from "./primitives.js";
import { NameTree, NumberTree } from "./name_number_tree.js";
import { BaseStream } from "./base_stream.js";
import { clearGlobalCaches } from "./cleanup_helper.js";
import { ColorSpace } from "./colorspace.js";
import { FileSpec } from "./file_spec.js";
import { GlobalImageCache } from "./image_utils.js";
@ -1069,7 +1069,7 @@ class Catalog {
}
cleanup(manuallyTriggered = false) {
clearPrimitiveCaches();
clearGlobalCaches();
this.globalImageCache.clear(/* onlyData = */ manuallyTriggered);
this.pageKidsCountCache.clear();
this.pageIndexCache.clear();

View File

@ -0,0 +1,24 @@
/* Copyright 2022 Mozilla Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import { clearPrimitiveCaches } from "./primitives.js";
import { clearUnicodeCaches } from "./unicode.js";
/**
 * Single entry point for resetting module-level caches: calls
 * `clearPrimitiveCaches` (imported from primitives.js) followed by
 * `clearUnicodeCaches` (imported from unicode.js).
 */
function clearGlobalCaches() {
clearPrimitiveCaches();
clearUnicodeCaches();
}
export { clearGlobalCaches };

View File

@ -35,16 +35,6 @@ import {
Util,
warn,
} from "../shared/util.js";
import {
clearPrimitiveCaches,
Dict,
isDict,
isName,
isRef,
isStream,
Name,
Ref,
} from "./primitives.js";
import {
collectActions,
getInheritableProperty,
@ -54,12 +44,22 @@ import {
XRefEntryException,
XRefParseException,
} from "./core_utils.js";
import {
Dict,
isDict,
isName,
isRef,
isStream,
Name,
Ref,
} from "./primitives.js";
import { getXfaFontDict, getXfaFontName } from "./xfa_fonts.js";
import { NullStream, Stream } from "./stream.js";
import { AnnotationFactory } from "./annotation.js";
import { BaseStream } from "./base_stream.js";
import { calculateMD5 } from "./crypto.js";
import { Catalog } from "./catalog.js";
import { clearGlobalCaches } from "./cleanup_helper.js";
import { Linearization } from "./parser.js";
import { ObjectLoader } from "./object_loader.js";
import { OperatorList } from "./operator_list.js";
@ -1449,7 +1449,7 @@ class PDFDocument {
async cleanup(manuallyTriggered = false) {
return this.catalog
? this.catalog.cleanup(manuallyTriggered)
: clearPrimitiveCaches();
: clearGlobalCaches();
}
/**

View File

@ -1641,16 +1641,29 @@ function reverseIfRtl(chars) {
}
// Three alternatives, each with its own capture group:
//  1. a whitespace character at the start of the string (`^` binds only to
//     this alternative),
//  2. a non-spacing mark (Unicode general category Mn) anywhere in the string,
//  3. a format character (Unicode general category Cf) at the end of the
//     string (`$` binds only to this alternative).
const SpecialCharRegExp = new RegExp("^(\\s)|(\\p{Mn})|(\\p{Cf})$", "u");

// Module-level cache, keyed by the string passed to `getCharUnicodeCategory`,
// so that a given string is matched against the regular expression only once
// until `clearUnicodeCaches` is called.
const CategoryCache = new Map();

/**
 * Classifies a string according to `SpecialCharRegExp`.
 *
 * Results are memoized in `CategoryCache`; repeated calls with the same
 * string return the *same* object instance, so callers should treat the
 * returned object as read-only.
 *
 * @param {string} char - The string to classify.
 * @returns {{isWhitespace: boolean, isZeroWidthDiacritic: boolean,
 *   isInvisibleFormatMark: boolean}} One flag per capture group of
 *   `SpecialCharRegExp`; all flags are `false` when nothing matched.
 */
function getCharUnicodeCategory(char) {
  const cachedCategory = CategoryCache.get(char);
  if (cachedCategory) {
    return cachedCategory;
  }
  // `groups` is `null` when the regular expression does not match at all.
  const groups = char.match(SpecialCharRegExp);
  const category = {
    isWhitespace: !!(groups && groups[1]),
    isZeroWidthDiacritic: !!(groups && groups[2]),
    isInvisibleFormatMark: !!(groups && groups[3]),
  };
  CategoryCache.set(char, category);
  return category;
}

/**
 * Empties `CategoryCache`, so that subsequent `getCharUnicodeCategory` calls
 * recompute (and re-cache) their results.
 */
function clearUnicodeCaches() {
  CategoryCache.clear();
}
export {
clearUnicodeCaches,
getCharUnicodeCategory,
getNormalizedUnicodes,
getUnicodeForGlyph,

View File

@ -32,8 +32,9 @@ import {
VerbosityLevel,
warn,
} from "../shared/util.js";
import { clearPrimitiveCaches, Dict, Ref } from "./primitives.js";
import { Dict, Ref } from "./primitives.js";
import { LocalPdfManager, NetworkPdfManager } from "./pdf_manager.js";
import { clearGlobalCaches } from "./cleanup_helper.js";
import { incrementalUpdate } from "./writer.js";
import { isNodeJS } from "../shared/is_node.js";
import { MessageHandler } from "../shared/message_handler.js";
@ -795,7 +796,7 @@ class WorkerMessageHandler {
pdfManager = null;
} else {
clearPrimitiveCaches();
clearGlobalCaches();
}
if (cancelXHRs) {
cancelXHRs(new AbortException("Worker was terminated."));