From aa53319c875205ffe7cd8b41a0280e19445ca5fb Mon Sep 17 00:00:00 2001 From: Jordan Thoms Date: Sun, 3 Aug 2014 01:19:55 +1200 Subject: [PATCH] Improve fingerprinting of documents MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes two issues: - #4456: The first 100 bytes are often not unique as they can be filled with standard PDF headers - so we use the first 1024 bytes (FINGERPRINT_FIRST_BYTES) instead. (This may be overkill) - Some documents we encountered have invalid xref ids, which were always coming out as ‘0000000000000000’ - so we detect that and use the MD5 instead. --- src/core/core.js | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/src/core/core.js b/src/core/core.js index d61aedca6..14cedddc9 100644 --- a/src/core/core.js +++ b/src/core/core.js @@ -280,6 +280,10 @@ var Page = (function PageClosure() { * `PDFDocument` objects on the main thread created. */ var PDFDocument = (function PDFDocumentClosure() { + var FINGERPRINT_FIRST_BYTES = 1024; + + var EMPTY_FINGERPRINT = '\x00\x00\x00\x00\x00\x00\x00' + + '\x00\x00\x00\x00\x00\x00\x00\x00\x00'; function PDFDocument(pdfManager, arg, password) { if (isStream(arg)) { init.call(this, pdfManager, arg, password); @@ -493,14 +497,21 @@ var PDFDocument = (function PDFDocumentClosure() { get fingerprint() { var xref = this.xref, hash, fileID = ''; - if (xref.trailer.has('ID')) { + if (xref.trailer.has('ID') && + xref.trailer.get('ID')[0] !== EMPTY_FINGERPRINT) { hash = stringToBytes(xref.trailer.get('ID')[0]); } else { - hash = calculateMD5(this.stream.bytes.subarray(0, 100), 0, 100); + if (this.stream.ensureRange) { + this.stream.ensureRange(0, + Math.min(FINGERPRINT_FIRST_BYTES, this.stream.end)); + } + hash = calculateMD5(this.stream.bytes.subarray(0, + FINGERPRINT_FIRST_BYTES), 0, FINGERPRINT_FIRST_BYTES); } for (var i = 0, n = hash.length; i < n; i++) { - fileID += hash[i].toString(16); + var hex = hash[i].toString(16); + fileID += hex.length === 1 ? 
'0' + hex : hex; } return shadow(this, 'fingerprint', fileID);