From 79f34b183c8f9872133d1decf0a74c12cbe56a9e Mon Sep 17 00:00:00 2001 From: Christian Krebs Date: Mon, 3 Mar 2014 18:44:45 +0100 Subject: [PATCH] Treat fonts with the same font descriptor, encoding and unicode map as aliases Different fonts can point to the same font descriptor (see https://github.com/mozilla/pdf.js/issues/4339 for details). With this commit such fonts are treated as aliases if they also have the same encoding and the same toUnicode map. The corresponding info is stored on the font descriptor. This change must also ensure that aliases always use the same font name because translated fonts can get cleared depending on the CLEANUP_TIMEOUT setting. --- make.js | 3 +- src/core/evaluator.js | 94 +++++++++++++++++++++++--- src/core/fonts.js | 7 ++ src/core/murmurhash3.js | 146 ++++++++++++++++++++++++++++++++++++++++ src/core/obj.js | 4 ++ src/worker_loader.js | 1 + 6 files changed, 245 insertions(+), 10 deletions(-) create mode 100644 src/core/murmurhash3.js diff --git a/make.js b/make.js index 62b71ca26..6ba805b02 100644 --- a/make.js +++ b/make.js @@ -349,7 +349,8 @@ target.bundle = function(args) { 'core/jpx.js', 'core/jbig2.js', 'core/bidi.js', - 'core/cmap.js' + 'core/cmap.js', + 'core/murmurhash3.js' ]; if (!defines.SINGLE_FILE) { diff --git a/src/core/evaluator.js b/src/core/evaluator.js index d8ce942d7..d4ee4543e 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -17,11 +17,11 @@ /* globals assert, assertWellFormed, ColorSpace, DecodeStream, Dict, Encodings, error, ErrorFont, Font, FONT_IDENTITY_MATRIX, fontCharsToUnicode, FontFlags, ImageKind, info, isArray, isCmd, isDict, isEOF, isName, - isNum, isStream, isString, JpegStream, Lexer, Metrics, Name, Parser, - Pattern, PDFImage, PDFJS, serifFonts, stdFontMap, symbolsFonts, - getTilingPatternIR, warn, Util, Promise, LegacyPromise, - RefSetCache, isRef, TextRenderingMode, CMapFactory, OPS, - UNSUPPORTED_FEATURES, UnsupportedManager */ + isNum, isStream, isString, JpegStream, Lexer, 
Metrics, + MurmurHash3_64, Name, Parser, Pattern, PDFImage, PDFJS, serifFonts, + stdFontMap, symbolsFonts, getTilingPatternIR, warn, Util, Promise, + LegacyPromise, RefSetCache, isRef, TextRenderingMode, CMapFactory, + OPS, UNSUPPORTED_FEATURES, UnsupportedManager */ 'use strict'; @@ -413,6 +413,36 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { if (!isDict(font)) { return errorFont(); } + + var preEvaluatedFont = this.preEvaluateFont(font, xref); + var descriptor = preEvaluatedFont.descriptor; + var fontID = fontRef.num + '_' + fontRef.gen; + if (isDict(descriptor)) { + if (!descriptor.fontAliases) { + descriptor.fontAliases = Object.create(null); + } + + var fontAliases = descriptor.fontAliases; + var hash = preEvaluatedFont.hash; + if (fontAliases[hash]) { + var aliasFontRef = fontAliases[hash].aliasRef; + if (aliasFontRef && this.fontCache.has(aliasFontRef)) { + this.fontCache.putAlias(fontRef, aliasFontRef); + var cachedFont = this.fontCache.get(fontRef); + return cachedFont; + } + } + + if (!fontAliases[hash]) { + fontAliases[hash] = { + fontID: Font.getFontID() + }; + } + + fontAliases[hash].aliasRef = fontRef; + fontID = fontAliases[hash].fontID; + } + // Workaround for bad PDF generators that don't reference fonts // properly, i.e. by not using an object identifier. // Check if the fontRef is a Dict (as opposed to a standard object), @@ -426,12 +456,12 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { // Keep track of each font we translated so the caller can // load them asynchronously before calling display on a page. font.loadedName = 'g_font_' + (fontRefIsDict ? 
- fontName.replace(/\W/g, '') : (fontRef.num + '_' + fontRef.gen)); + fontName.replace(/\W/g, '') : fontID); if (!font.translated) { var translated; try { - translated = this.translateFont(font, xref); + translated = this.translateFont(preEvaluatedFont, xref); } catch (e) { UnsupportedManager.notify(UNSUPPORTED_FEATURES.font); translated = new ErrorFont(e instanceof Error ? e.message : e); @@ -1127,7 +1157,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { return widths; }, - translateFont: function PartialEvaluator_translateFont(dict, xref) { + preEvaluateFont: function PartialEvaluator_preEvaluateFont(dict, xref) { var baseDict = dict; var type = dict.get('Subtype'); assertWellFormed(isName(type), 'invalid font Subtype'); @@ -1148,9 +1178,55 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { assertWellFormed(isName(type), 'invalid font Subtype'); composite = true; } - var maxCharIndex = (composite ? 0xFFFF : 0xFF); var descriptor = dict.get('FontDescriptor'); + if (descriptor) { + var hash = new MurmurHash3_64(); + var encoding = baseDict.getRaw('Encoding'); + if (isName(encoding)) { + hash.update(encoding.name); + } else if (isRef(encoding)) { + hash.update(encoding.num + '_' + encoding.gen); + } + + var toUnicode = dict.get('ToUnicode') || baseDict.get('ToUnicode'); + if (isStream(toUnicode)) { + var stream = toUnicode.str || toUnicode; + var uint8array = stream.buffer ? + new Uint8Array(stream.buffer.buffer, 0, stream.bufferLength) : + new Uint8Array(stream.bytes.buffer, + stream.start, stream.end - stream.start); + hash.update(uint8array); + + } else if (isName(toUnicode)) { + hash.update(toUnicode.name); + } + + var widths = dict.get('Widths') || baseDict.get('Widths'); + if (widths) { + var uint8array = new Uint8Array(new Uint32Array(widths).buffer); + hash.update(uint8array); + } + } + + return { + descriptor: descriptor, + dict: dict, + baseDict: baseDict, + composite: composite, + hash: hash ? 
hash.hexdigest() : '' + }; + }, + + translateFont: function PartialEvaluator_translateFont(preEvaluatedFont, + xref) { + var baseDict = preEvaluatedFont.baseDict; + var dict = preEvaluatedFont.dict; + var composite = preEvaluatedFont.composite; + var descriptor = preEvaluatedFont.descriptor; + var type = dict.get('Subtype'); + var maxCharIndex = (composite ? 0xFFFF : 0xFF); + if (!descriptor) { if (type.name == 'Type3') { // FontDescriptor is only required for Type3 fonts when the document diff --git a/src/core/fonts.js b/src/core/fonts.js index e8280739e..9a76e8830 100644 --- a/src/core/fonts.js +++ b/src/core/fonts.js @@ -2328,6 +2328,13 @@ var Font = (function FontClosure() { this.loading = true; } + Font.getFontID = (function () { + var ID = 1; + return function Font_getFontID() { + return String(ID++); + }; + })(); + function int16(b0, b1) { return (b0 << 8) + b1; } diff --git a/src/core/murmurhash3.js b/src/core/murmurhash3.js new file mode 100644 index 000000000..c2e716330 --- /dev/null +++ b/src/core/murmurhash3.js @@ -0,0 +1,146 @@ +/* -*- Mode: Java; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set shiftwidth=2 tabstop=2 autoindent cindent expandtab: */ + +/* Copyright 2014 Opera Software ASA + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * + * Based on https://code.google.com/p/smhasher/wiki/MurmurHash3. + * Hashes roughly 100 KB per millisecond on i7 3.4 GHz. 
+ */ + +'use strict'; + +var MurmurHash3_64 = (function MurmurHash3_64Closure (seed) { + // Workaround for missing math precision in JS. + var MASK_HIGH = 0xffff0000; + var MASK_LOW = 0xffff; + + function MurmurHash3_64 (seed) { + var SEED = 0xc3d2e1f0; + this.h1 = seed ? seed & 0xffffffff : SEED; + this.h2 = seed ? seed & 0xffffffff : SEED; + } + + MurmurHash3_64.prototype = { + update: function MurmurHash3_64_update(input) { + if (typeof input == 'string') { + var data = new Uint8Array(input.length * 2); + var length = 0; + for (var i = 0; i < input.length; i++) { + var code = input.charCodeAt(i); + if (code <= 0xff) { + data[length++] = code; + } + else { + data[length++] = code >>> 8; + data[length++] = code & 0xff; + } + } + } else { + if (!(input instanceof Uint8Array)) { + throw new Error('Wrong data format in MurmurHash3_64_update. ' + + 'Input must be a string or Uint8Array'); + } + data = input; + length = data.length; + } + + var blockCounts = length >> 2; + var tailLength = length - blockCounts * 4; + var dataUint32 = new Uint32Array(data.buffer, 0, blockCounts); + var k1 = 0; + var k2 = 0; + var h1 = this.h1; + var h2 = this.h2; + var C1 = 0xcc9e2d51; + var C2 = 0x1b873593; + var C1_LOW = C1 & MASK_LOW; + var C2_LOW = C2 & MASK_LOW; + + for (var i = 0; i < blockCounts; i++) { + if (i & 1) { + k1 = dataUint32[i]; + k1 = (k1 * C1 & MASK_HIGH) | (k1 * C1_LOW & MASK_LOW); + k1 = k1 << 15 | k1 >>> 17; + k1 = (k1 * C2 & MASK_HIGH) | (k1 * C2_LOW & MASK_LOW); + h1 ^= k1; + h1 = h1 << 13 | h1 >>> 19; + h1 = h1 * 5 + 0xe6546b64; + } else { + k2 = dataUint32[i]; + k2 = (k2 * C1 & MASK_HIGH) | (k2 * C1_LOW & MASK_LOW); + k2 = k2 << 15 | k2 >>> 17; + k2 = (k2 * C2 & MASK_HIGH) | (k2 * C2_LOW & MASK_LOW); + h2 ^= k2; + h2 = h2 << 13 | h2 >>> 19; + h2 = h2 * 5 + 0xe6546b64; + } + } + + k1 = 0; + + switch (tailLength) { + case 3: + k1 ^= data[blockCounts * 4 + 2] << 16; + /* falls through */ + case 2: + k1 ^= data[blockCounts * 4 + 1] << 8; + /* falls through */ + 
case 1: + k1 ^= data[blockCounts * 4]; + /* falls through */ + k1 = (k1 * C1 & MASK_HIGH) | (k1 * C1_LOW & MASK_LOW); + k1 = k1 << 15 | k1 >>> 17; + k1 = (k1 * C2 & MASK_HIGH) | (k1 * C2_LOW & MASK_LOW); + if (blockCounts & 1) { + h1 ^= k1; + } else { + h2 ^= k1; + } + } + + this.h1 = h1; + this.h2 = h2; + return this; + }, + + hexdigest: function MurmurHash3_64_hexdigest () { + var h1 = this.h1; + var h2 = this.h2; + + h1 ^= h2 >>> 1; + h1 = (h1 * 0xed558ccd & MASK_HIGH) | (h1 * 0x8ccd & MASK_LOW); + h2 = (h2 * 0xff51afd7 & MASK_HIGH) | + (((h2 << 16 | h1 >>> 16) * 0xafd7ed55 & MASK_HIGH) >>> 16); + h1 ^= h2 >>> 1; + h1 = (h1 * 0x1a85ec53 & MASK_HIGH) | (h1 * 0xec53 & MASK_LOW); + h2 = (h2 * 0xc4ceb9fe & MASK_HIGH) | + (((h2 << 16 | h1 >>> 16) * 0xb9fe1a85 & MASK_HIGH) >>> 16); + h1 ^= h2 >>> 1; + + for (var i = 0, arr = [h1, h2], str = ''; i < arr.length; i++) { + var hex = (arr[i] >>> 0).toString(16); + while (hex.length < 8) { + hex = '0' + hex; + } + str += hex; + } + + return str; + } + }; + + return MurmurHash3_64; +})(); diff --git a/src/core/obj.js b/src/core/obj.js index ab4bb04c7..59c8dbf46 100644 --- a/src/core/obj.js +++ b/src/core/obj.js @@ -271,6 +271,10 @@ var RefSetCache = (function RefSetCacheClosure() { this.dict['R' + ref.num + '.' + ref.gen] = obj; }, + putAlias: function RefSetCache_putAlias(ref, aliasRef) { + this.dict['R' + ref.num + '.' + ref.gen] = this.get(aliasRef); + }, + forEach: function RefSetCache_forEach(fn, thisArg) { for (var i in this.dict) { fn.call(thisArg, this.dict[i]); diff --git a/src/worker_loader.js b/src/worker_loader.js index 3333c3009..7ee5e9d77 100644 --- a/src/worker_loader.js +++ b/src/worker_loader.js @@ -52,6 +52,7 @@ var otherFiles = [ 'core/jpx.js', 'core/jbig2.js', 'core/bidi.js', + 'core/murmurhash3.js', '../external/jpgjs/jpg.js' ];