From 61ee0de29fc3cd6f142be4b554ca6ab30a77bea7 Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Tue, 28 Mar 2017 17:54:41 +0200 Subject: [PATCH] Use a simple `RefSetCache` to significantly improve the performance of `Catalog.getPageDict` for certain long documents (PR 8105 follow-up) I found that PR 8105 unfortunately causes a *very serious* performance regression in long PDF documents where the `Pages` tree only has one level; my apologies for this! Obviously we cannot revert that PR, since that would cause more issues than it solves. Hence it seems to me that the only viable solution here, is to add a simple `RefSetCache` to reduce the amount of redundant lookups. Previously in PR 8105 caching was thought to be unnecessary, but as it turns out I don't think that we really have a choice in the matter any more. --- src/core/obj.js | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/src/core/obj.js b/src/core/obj.js index 577631801..514f00e70 100644 --- a/src/core/obj.js +++ b/src/core/obj.js @@ -50,6 +50,7 @@ var stringToUTF8String = sharedUtil.stringToUTF8String; var warn = sharedUtil.warn; var createValidAbsoluteUrl = sharedUtil.createValidAbsoluteUrl; var Util = sharedUtil.Util; +var Dict = corePrimitives.Dict; var Ref = corePrimitives.Ref; var RefSet = corePrimitives.RefSet; var RefSetCache = corePrimitives.RefSetCache; @@ -70,10 +71,11 @@ var Catalog = (function CatalogClosure() { this.pdfManager = pdfManager; this.xref = xref; this.catDict = xref.getCatalogObj(); - this.fontCache = new RefSetCache(); - this.builtInCMapCache = Object.create(null); assert(isDict(this.catDict), 'catalog object is not a dictionary'); + this.fontCache = new RefSetCache(); + this.builtInCMapCache = Object.create(null); + this.pageKidsCountCache = new RefSetCache(); // TODO refactor to move getPage() to the PDFDocument. this.pageFactory = pageFactory; this.pagePromises = []; @@ -421,6 +423,8 @@ var Catalog = (function CatalogClosure() { }, cleanup: function Catalog_cleanup() { + this.pageKidsCountCache.clear(); + var promises = []; this.fontCache.forEach(function (promise) { promises.push(promise); @@ -453,17 +457,30 @@ var Catalog = (function CatalogClosure() { getPageDict: function Catalog_getPageDict(pageIndex) { var capability = createPromiseCapability(); var nodesToVisit = [this.catDict.getRaw('Pages')]; - var currentPageIndex = 0; - var xref = this.xref; + var count, currentPageIndex = 0; + var xref = this.xref, pageKidsCountCache = this.pageKidsCountCache; function next() { while (nodesToVisit.length) { var currentNode = nodesToVisit.pop(); if (isRef(currentNode)) { + count = pageKidsCountCache.get(currentNode); + // Skip nodes where the page can't be. + if (count > 0 && currentPageIndex + count < pageIndex) { + currentPageIndex += count; + continue; + } + xref.fetchAsync(currentNode).then(function (obj) { if (isDict(obj, 'Page') || (isDict(obj) && !obj.has('Kids'))) { if (pageIndex === currentPageIndex) { + // Cache the Page reference, since it can *greatly* improve + // performance by reducing redundant lookups in long documents + // where all nodes are found at *one* level of the tree. + if (currentNode && !pageKidsCountCache.has(currentNode)) { + pageKidsCountCache.put(currentNode, 1); + } capability.resolve([obj, currentNode]); } else { currentPageIndex++; @@ -481,7 +498,13 @@ var Catalog = (function CatalogClosure() { assert(isDict(currentNode), 'page dictionary kid reference points to wrong type of object'); - var count = currentNode.get('Count'); + count = currentNode.get('Count'); + // Cache the Kids count, since it can reduce redundant lookups in long + // documents where all nodes are found at *one* level of the tree. + var objId = currentNode.objId; + if (objId && !pageKidsCountCache.has(objId)) { + pageKidsCountCache.put(objId, count); + } // Skip nodes where the page can't be. if (currentPageIndex + count <= pageIndex) { currentPageIndex += count; @@ -1251,7 +1274,7 @@ var XRef = (function XRefClosure() { var cacheEntry = this.cache[num]; // In documents with Object Streams, it's possible that cached `Dict`s // have not been assigned an `objId` yet (see e.g. issue3115r.pdf). - if (isDict(cacheEntry) && !cacheEntry.objId) { + if (cacheEntry instanceof Dict && !cacheEntry.objId) { cacheEntry.objId = ref.toString(); } return cacheEntry;