diff --git a/make.js b/make.js index 45d14519b..02142b2b7 100644 --- a/make.js +++ b/make.js @@ -306,7 +306,8 @@ target.bundle = function(args) { 'core/worker.js', 'core/jpx.js', 'core/jbig2.js', - 'core/bidi.js' + 'core/bidi.js', + 'core/cmap.js' ]; var EXT_SRC_FILES = [ diff --git a/src/core/cmap.js b/src/core/cmap.js new file mode 100644 index 000000000..aa76128f7 --- /dev/null +++ b/src/core/cmap.js @@ -0,0 +1,460 @@ +/* -*- Mode: Java; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set shiftwidth=2 tabstop=2 autoindent cindent expandtab: */ +/* Copyright 2012 Mozilla Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/* globals Util, isString, isInt, warn, error, isCmd, isEOF, isName, Lexer, + isStream */ + +'use strict'; + +var CMAP_CODESPACES = { + 'Adobe-CNS1-0': [[], [0, 14335]], + 'Adobe-CNS1-1': [[], [0, 17407]], + 'Adobe-CNS1-2': [[], [0, 17663]], + 'Adobe-CNS1-3': [[], [0, 18943]], + 'Adobe-CNS1-4': [[], [0, 19199]], + 'Adobe-CNS1-5': [[], [0, 19199]], + 'Adobe-CNS1-6': [[], [0, 19199]], + 'Adobe-CNS1-UCS2': [[], [0, 65535]], + 'B5-H': [[0, 128], [41280, 65278]], + 'B5-V': [[0, 128], [41280, 65278]], + 'B5pc-H': [[0, 128, 253, 255], [41280, 64766]], + 'B5pc-V': [[0, 128, 253, 255], [41280, 64766]], + 'CNS-EUC-H': [[0, 128], [41377, 65278], [], + [2392957345, 2392981246, 2393022881, 2393046782, 2393088417, 2393112318]], + 'CNS-EUC-V': [[0, 128], [41377, 65278], [], + [2392957345, 2392981246, 2393022881, 2393046782, 2393088417, 2393112318]], + 'CNS1-H': [[], [8481, 32382]], + 'CNS1-V': [[], [8481, 32382]], + 'CNS2-H': [[], [8481, 32382]], + 'CNS2-V': [[], [8481, 32382]], + 'ETen-B5-H': [[0, 128], [41280, 65278]], + 'ETen-B5-V': [[0, 128], [41280, 65278]], + 'ETenms-B5-H': [[0, 128], [41280, 65278]], + 'ETenms-B5-V': [[0, 128], [41280, 65278]], + 'ETHK-B5-H': [[0, 128], [34624, 65278]], + 'ETHK-B5-V': [[0, 128], [34624, 65278]], + 'HKdla-B5-H': [[0, 128], [41280, 65278]], + 'HKdla-B5-V': [[0, 128], [41280, 65278]], + 'HKdlb-B5-H': [[0, 128], [36416, 65278]], + 'HKdlb-B5-V': [[0, 128], [36416, 65278]], + 'HKgccs-B5-H': [[0, 128], [35392, 65278]], + 'HKgccs-B5-V': [[0, 128], [35392, 65278]], + 'HKm314-B5-H': [[0, 128], [41280, 65278]], + 'HKm314-B5-V': [[0, 128], [41280, 65278]], + 'HKm471-B5-H': [[0, 128], [41280, 65278]], + 'HKm471-B5-V': [[0, 128], [41280, 65278]], + 'HKscs-B5-H': [[0, 128], [34624, 65278]], + 'HKscs-B5-V': [[0, 128], [34624, 65278]], + 'UniCNS-UCS2-H': [[], [0, 55295, 57344, 65535]], + 'UniCNS-UCS2-V': [[], [0, 55295, 57344, 65535]], + 'UniCNS-UTF16-H': [[], [0, 55295, 57344, 65535], [], + [3623934976, 3690979327]], + 'UniCNS-UTF16-V': [[], [0, 
55295, 57344, 65535], [], + [3623934976, 3690979327]], + 'Adobe-GB1-0': [[], [0, 7935]], + 'Adobe-GB1-1': [[], [0, 9983]], + 'Adobe-GB1-2': [[], [0, 22271]], + 'Adobe-GB1-3': [[], [0, 22527]], + 'Adobe-GB1-4': [[], [0, 29183]], + 'Adobe-GB1-5': [[], [0, 30463]], + 'Adobe-GB1-UCS2': [[], [0, 65535]], + 'GB-EUC-H': [[0, 128], [41377, 65278]], + 'GB-EUC-V': [[0, 128], [41377, 65278]], + 'GB-H': [[], [8481, 32382]], + 'GB-V': [[], [8481, 32382]], + 'GBK-EUC-H': [[0, 128], [33088, 65278]], + 'GBK-EUC-V': [[0, 128], [33088, 65278]], + 'GBK2K-H': [[0, 127], [33088, 65278], [], [2167439664, 4265213497]], + 'GBK2K-V': [[0, 127], [33088, 65278], [], [2167439664, 4265213497]], + 'GBKp-EUC-H': [[0, 128], [33088, 65278]], + 'GBKp-EUC-V': [[0, 128], [33088, 65278]], + 'GBpc-EUC-H': [[0, 128, 253, 255], [41377, 64766]], + 'GBpc-EUC-V': [[0, 128, 253, 255], [41377, 64766]], + 'GBT-EUC-H': [[0, 128], [41377, 65278]], + 'GBT-EUC-V': [[0, 128], [41377, 65278]], + 'GBT-H': [[], [8481, 32382]], + 'GBT-V': [[], [8481, 32382]], + 'GBTpc-EUC-H': [[0, 128, 253, 255], [41377, 64766]], + 'GBTpc-EUC-V': [[0, 128, 253, 255], [41377, 64766]], + 'UniGB-UCS2-H': [[], [0, 55295, 57344, 65535]], + 'UniGB-UCS2-V': [[], [0, 55295, 57344, 65535]], + 'UniGB-UTF16-H': [[], [0, 55295, 57344, 65535], [], [3623934976, 3690979327]], + 'UniGB-UTF16-V': [[], [0, 55295, 57344, 65535], [], [3623934976, 3690979327]], + '78-EUC-H': [[0, 128], [36512, 36575, 41377, 65278]], + '78-EUC-V': [[0, 128], [36512, 36575, 41377, 65278]], + '78-H': [[], [8481, 32382]], + '78-RKSJ-H': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]], + '78-RKSJ-V': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]], + '78-V': [[], [8481, 32382]], + '78ms-RKSJ-H': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]], + '78ms-RKSJ-V': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]], + '83pv-RKSJ-H': [[0, 128, 160, 223, 253, 255], [33088, 40956, 57408, 64764]], + '90ms-RKSJ-H': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]], + 
'90ms-RKSJ-V': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]], + '90msp-RKSJ-H': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]], + '90msp-RKSJ-V': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]], + '90pv-RKSJ-H': [[0, 128, 160, 223, 253, 255], [33088, 40956, 57408, 64764]], + '90pv-RKSJ-V': [[0, 128, 160, 223, 253, 255], [33088, 40956, 57408, 64764]], + 'Add-H': [[], [8481, 32382]], + 'Add-RKSJ-H': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]], + 'Add-RKSJ-V': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]], + 'Add-V': [[], [8481, 32382]], + 'Adobe-Japan1-0': [[], [0, 8447]], + 'Adobe-Japan1-1': [[], [0, 8447]], + 'Adobe-Japan1-2': [[], [0, 8959]], + 'Adobe-Japan1-3': [[], [0, 9471]], + 'Adobe-Japan1-4': [[], [0, 15615]], + 'Adobe-Japan1-5': [[], [0, 20479]], + 'Adobe-Japan1-6': [[], [0, 23295]], + 'Adobe-Japan1-UCS2': [[], [0, 65535]], + 'Adobe-Japan2-0': [[], [0, 6143]], + 'EUC-H': [[0, 128], [36512, 36575, 41377, 65278]], + 'EUC-V': [[0, 128], [36512, 36575, 41377, 65278]], + 'Ext-H': [[], [8481, 32382]], + 'Ext-RKSJ-H': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]], + 'Ext-RKSJ-V': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]], + 'Ext-V': [[], [8481, 32382]], + 'H': [[], [8481, 32382]], + 'Hankaku': [[0, 255], []], + 'Hiragana': [[0, 255], []], + 'Hojo-EUC-H': [[], [], [9413025, 9436926], []], + 'Hojo-EUC-V': [[], [], [9413025, 9436926], []], + 'Hojo-H': [[], [8481, 32382]], + 'Hojo-V': [[], [8481, 32382]], + 'Katakana': [[0, 255], []], + 'NWP-H': [[], [8481, 32382]], + 'NWP-V': [[], [8481, 32382]], + 'RKSJ-H': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]], + 'RKSJ-V': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]], + 'Roman': [[0, 255], []], + 'UniHojo-UCS2-H': [[], [0, 55295, 57344, 65535]], + 'UniHojo-UCS2-V': [[], [0, 55295, 57344, 65535]], + 'UniHojo-UTF16-H': [[], [0, 55295, 57344, 65535], [], + [3623934976, 3690979327]], + 'UniHojo-UTF16-V': [[], [0, 55295, 57344, 65535], [], + [3623934976, 3690979327]], + 
// CMap, not to be confused with TrueType's cmap.
//
// Holds the codespace ranges (which byte sequences form a valid character
// code) and the code -> destination map that CMapFactory fills in.
var CMap = (function CMapClosure() {
  // Upper bound on how many codes a single mapRange() call may expand.
  // Range endpoints are read straight from CMap data and can be up to four
  // bytes wide, so an unchecked range could ask for ~2^32 loop iterations.
  var MAX_MAP_RANGE = 0xFFFFFF;

  function CMap() {
    // Codespace ranges are stored as follows:
    // [[1BytePairs], [2BytePairs], [3BytePairs], [4BytePairs]]
    // where nBytePairs are ranges e.g. [low1, high1, low2, high2, ...]
    this.codespaceRanges = [[], [], [], []];
    // Sparse array: source character code -> destination value.
    this.map = [];
    this.vertical = false;
  }
  CMap.prototype = {
    // Appends the inclusive range [low, high] to the list of ranges for codes
    // that are n bytes long (n is used as index n - 1 into codespaceRanges).
    addCodespaceRange: function(n, low, high) {
      this.codespaceRanges[n - 1].push(low, high);
    },

    // Maps every code in [low, high] to consecutive destination strings
    // starting at dstLow; each step increments the last character of the
    // destination by one. Throws when the range is larger than
    // MAX_MAP_RANGE instead of looping over it.
    mapRange: function(low, high, dstLow) {
      if (high - low > MAX_MAP_RANGE) {
        throw new Error('mapRange - ignoring data above MAX_MAP_RANGE.');
      }
      var lastByte = dstLow.length - 1;
      while (low <= high) {
        this.map[low] = dstLow;
        // Only the last byte has to be incremented.
        dstLow = dstLow.substr(0, lastByte) +
                 String.fromCharCode(dstLow.charCodeAt(lastByte) + 1);
        ++low;
      }
    },

    // Maps the codes low, low + 1, ... to array[0], array[1], ...
    // Stops at `high` or at the end of `array`, whichever comes first, so a
    // short array never leaves `undefined` entries in the map (consumers
    // that enumerate the map would otherwise trip over them).
    mapRangeToArray: function(low, high, array) {
      var i = 0;
      var ii = array.length;
      while (low <= high && i < ii) {
        this.map[low] = array[i];
        ++i;
        ++low;
      }
    },

    // Maps the single code `src` to `dst`.
    mapOne: function(src, dst) {
      this.map[src] = dst;
    },

    // Returns the destination stored for `code`, or undefined if unmapped.
    lookup: function(code) {
      return this.map[code];
    },

    // Reads one character code from `str` starting at `offset`.
    // Returns a two-element array [code, lengthInBytes]; when no codespace
    // range matches, returns [0, 1] so the caller still advances.
    readCharCode: function(str, offset) {
      var c = 0;
      var codespaceRanges = this.codespaceRanges;
      var codespaceRangesLen = this.codespaceRanges.length;
      // 9.7.6.2 CMap Mapping
      // The code length is at most 4.
      for (var n = 0; n < codespaceRangesLen; n++) {
        // `>>> 0` keeps the accumulated value unsigned once a fourth byte
        // sets bit 31.
        c = ((c << 8) | str.charCodeAt(offset + n)) >>> 0;
        // Check each codespace range to see if it falls within.
        var codespaceRange = codespaceRanges[n];
        for (var k = 0, kk = codespaceRange.length; k < kk;) {
          var low = codespaceRange[k++];
          var high = codespaceRange[k++];
          if (c >= low && c <= high) {
            return [c, n + 1];
          }
        }
      }

      return [0, 1];
    }

  };
  return CMap;
})();
// CMap used for the Identity-H / Identity-V encodings: a single codespace
// range for n-byte codes covering 0..0xffff, with every code in that range
// mapped to the one-character string carrying the same char code.
var IdentityCMap = (function IdentityCMapClosure() {
  var FIRST_CODE = 0;
  var LAST_CODE = 0xffff;

  function IdentityCMap(vertical, n) {
    // Run the base constructor first so the range/map containers exist.
    CMap.call(this);
    this.vertical = vertical;
    this.addCodespaceRange(n, FIRST_CODE, LAST_CODE);
    this.mapRange(FIRST_CODE, LAST_CODE, '\u0000');
  }
  // No extra prototype members are added on top of CMap's.
  Util.inherit(IdentityCMap, CMap, {});

  return IdentityCMap;
})();
// Creates CMap objects, either from the name of a predefined CMap or by
// parsing an embedded CMap stream.
var CMapFactory = (function CMapFactoryClosure() {
  // Folds the char codes of `str`, most significant first, into an unsigned
  // 32-bit integer (e.g. '\x01\x02' -> 0x0102).
  function strToInt(str) {
    var a = 0;
    for (var i = 0; i < str.length; i++) {
      a = (a << 8) | str.charCodeAt(i);
    }
    return a >>> 0;
  }

  // Reports a malformed CMap through error() unless `obj` is a string.
  function expectString(obj) {
    if (!isString(obj)) {
      error('Malformed CMap: expected string.');
    }
  }

  // Reports a malformed CMap through error() unless `obj` is an integer.
  function expectInt(obj) {
    if (!isInt(obj)) {
      error('Malformed CMap: expected int.');
    }
  }

  // Reads `<src> <dst>` string pairs until 'endbfchar' and maps each source
  // code to its destination string. Returns quietly if the data ends first.
  function parseBfChar(cMap, lexer) {
    while (true) {
      var obj = lexer.getObj();
      if (isEOF(obj)) {
        break;
      }
      if (isCmd(obj, 'endbfchar')) {
        return;
      }
      expectString(obj);
      var src = strToInt(obj);
      obj = lexer.getObj();
      // TODO are /dstName used?
      expectString(obj);
      var dst = obj;
      cMap.mapOne(src, dst);
    }
  }

  // Reads `<low> <high> dst` triples until 'endbfrange'. `dst` is either a
  // string/int (consecutive destinations starting there) or a `[ ... ]`
  // array with one destination per code. Any other token, or running out of
  // data, is reported as 'Invalid bf range.'.
  function parseBfRange(cMap, lexer) {
    while (true) {
      var obj = lexer.getObj();
      if (isEOF(obj)) {
        break;
      }
      if (isCmd(obj, 'endbfrange')) {
        return;
      }
      expectString(obj);
      var low = strToInt(obj);
      obj = lexer.getObj();
      expectString(obj);
      var high = strToInt(obj);
      obj = lexer.getObj();
      if (isInt(obj) || isString(obj)) {
        var dstLow = isInt(obj) ? String.fromCharCode(obj) : obj;
        cMap.mapRange(low, high, dstLow);
      } else if (isCmd(obj, '[')) {
        obj = lexer.getObj();
        var array = [];
        // Array items are stored as the lexer returned them.
        while (!isCmd(obj, ']') && !isEOF(obj)) {
          array.push(obj);
          obj = lexer.getObj();
        }
        cMap.mapRangeToArray(low, high, array);
      } else {
        break;
      }
    }
    error('Invalid bf range.');
  }

  // Reads `<src> cid` pairs until 'endcidchar'; the integer CID is stored as
  // a one-character string. Returns quietly if the data ends first.
  function parseCidChar(cMap, lexer) {
    while (true) {
      var obj = lexer.getObj();
      if (isEOF(obj)) {
        break;
      }
      if (isCmd(obj, 'endcidchar')) {
        return;
      }
      expectString(obj);
      var src = strToInt(obj);
      obj = lexer.getObj();
      expectInt(obj);
      var dst = String.fromCharCode(obj);
      cMap.mapOne(src, dst);
    }
  }

  // Reads `<low> <high> cid` triples until 'endcidrange' and maps the code
  // range to consecutive CIDs. Returns quietly if the data ends first.
  function parseCidRange(cMap, lexer) {
    while (true) {
      var obj = lexer.getObj();
      if (isEOF(obj)) {
        break;
      }
      if (isCmd(obj, 'endcidrange')) {
        return;
      }
      expectString(obj);
      var low = strToInt(obj);
      obj = lexer.getObj();
      expectString(obj);
      var high = strToInt(obj);
      obj = lexer.getObj();
      expectInt(obj);
      var dstLow = String.fromCharCode(obj);
      cMap.mapRange(low, high, dstLow);
    }
  }

  // Reads `<low> <high>` string pairs until 'endcodespacerange'. The byte
  // length of the `high` string selects which codespace list the range goes
  // into. A non-string token, or running out of data, is reported as
  // 'Invalid codespace range.'.
  function parseCodespaceRange(cMap, lexer) {
    while (true) {
      var obj = lexer.getObj();
      if (isEOF(obj)) {
        break;
      }
      if (isCmd(obj, 'endcodespacerange')) {
        return;
      }
      if (!isString(obj)) {
        break;
      }
      var low = strToInt(obj);
      obj = lexer.getObj();
      if (!isString(obj)) {
        break;
      }
      var high = strToInt(obj);
      cMap.addCodespaceRange(obj.length, low, high);
    }
    error('Invalid codespace range.');
  }

  // Top-level loop: dispatches on the begin* operators and stops at
  // 'endcmap' or at the end of the data. Tokens that are not commands, and
  // commands that are not listed, are skipped.
  function parseCmap(cMap, lexer) {
    objLoop: while (true) {
      var obj = lexer.getObj();
      if (isEOF(obj)) {
        break;
      } else if (isCmd(obj)) {
        // Operator names are case-sensitive and all lower-case.
        switch (obj.cmd) {
          case 'endcmap':
            break objLoop;
          case 'usecmap':
            // TODO
            break;
          case 'begincodespacerange':
            parseCodespaceRange(cMap, lexer);
            break;
          case 'beginbfchar':
            parseBfChar(cMap, lexer);
            break;
          case 'begincidchar':
            parseCidChar(cMap, lexer);
            break;
          case 'beginbfrange':
            parseBfRange(cMap, lexer);
            break;
          case 'begincidrange':
            parseCidRange(cMap, lexer);
            break;
        }
      }
    }
  }

  return {
    // Builds a CMap for `encoding`:
    //  - a name: an IdentityCMap for Identity-H/-V, a CMap that only carries
    //    codespace ranges for names listed in CMAP_CODESPACES, otherwise
    //    null;
    //  - a stream: the parsed CMap. Exceptions raised while parsing are
    //    turned into a warning and whatever was parsed so far is returned;
    //  - anything else is reported through error('Encoding required.').
    create: function (encoding) {
      var cMap;
      if (isName(encoding)) {
        switch (encoding.name) {
          case 'Identity-H':
            return new IdentityCMap(false, 2);
          case 'Identity-V':
            return new IdentityCMap(true, 2);
          default:
            // Own-property test: the name comes from the document, and a
            // plain `in` would also accept inherited keys such as
            // 'toString' or 'constructor'.
            if (Object.prototype.hasOwnProperty.call(CMAP_CODESPACES,
                                                     encoding.name)) {
              // XXX: Temporary hack so the correct amount of bytes are read in
              // CMap.readCharCode.
              cMap = new CMap();
              cMap.codespaceRanges = CMAP_CODESPACES[encoding.name];
              return cMap;
            }
            return null;
        }
      } else if (isStream(encoding)) {
        cMap = new CMap();
        var lexer = new Lexer(encoding);
        try {
          parseCmap(cMap, lexer);
        } catch (e) {
          warn('Invalid CMap data. ' + e);
        }
        return cMap;
      }
      error('Encoding required.');
    }
  };
})();
case 'endbfrange': - for (var j = 0, jj = tokens.length; j < jj; j += 3) { - var startRange = tokens[j]; - var endRange = tokens[j + 1]; - var code = tokens[j + 2]; - if (code == 0xFFFF) { - // CMap is broken, assuming code == startRange - code = startRange; - } - if (isArray(code)) { - var codeindex = 0; - while (startRange <= endRange) { - charToUnicode[startRange] = code[codeindex++]; - ++startRange; - } - } else { - while (startRange <= endRange) { - charToUnicode[startRange] = code++; - ++startRange; - } - } - } - break; - - case 'endcidchar': - case 'endbfchar': - for (var j = 0, jj = tokens.length; j < jj; j += 2) { - var index = tokens[j]; - var code = tokens[j + 1]; - charToUnicode[index] = code; - } - break; - - case '': - break; - - default: - if (token[0] >= '0' && token[0] <= '9') - token = parseInt(token, 10); // a number - tokens.push(token); - token = ''; + var cmap = CMapFactory.create(cmapObj).map; + // Convert UTF-16BE + for (var i in cmap) { + var token = cmap[i]; + var str = []; + for (var k = 0; k < token.length; k += 2) { + var w1 = (token.charCodeAt(k) << 8) | token.charCodeAt(k + 1); + if ((w1 & 0xF800) !== 0xD800) { // w1 < 0xD800 || w1 > 0xDFFF + str.push(w1); + continue; } - switch (octet) { - case 0x5B: - // begin list parsing - tokens.push(beginArrayToken); - break; - case 0x5D: - // collect array items - var items = [], item; - while (tokens.length && - (item = tokens.pop()) != beginArrayToken) - items.unshift(item); - tokens.push(items); - break; - } - } else if (octet == 0x3E) { - if (token.length) { - // Heuristic: guessing chars size by checking numbers sizes - // in the CMap entries. 
- if (token.length == 2 && properties.composite) - properties.wideChars = false; - - if (token.length <= 4) { - // parsing hex number - tokens.push(parseInt(token, 16)); - token = ''; - } else { - // parsing hex UTF-16BE numbers - var str = []; - for (var k = 0, kk = token.length; k < kk; k += 4) { - var b = parseInt(token.substr(k, 4), 16); - if (b <= 0x10) { - k += 4; - b = (b << 16) | parseInt(token.substr(k, 4), 16); - b -= 0x10000; - str.push(0xD800 | (b >> 10)); - str.push(0xDC00 | (b & 0x3FF)); - break; - } - str.push(b); - } - tokens.push(String.fromCharCode.apply(String, str)); - token = ''; - } - } - } else { - token += String.fromCharCode(octet); + k += 2; + var w2 = (token.charCodeAt(k) << 8) | token.charCodeAt(k + 1); + str.push(((w1 & 0x3ff) << 10) + (w2 & 0x3ff) + 0x10000); } + cmap[i] = String.fromCharCode.apply(String, str); } + return cmap; } return charToUnicode; }, @@ -1409,6 +1314,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { properties.cidEncoding = cidEncoding.name; properties.vertical = /-V$/.test(cidEncoding.name); } + properties.cmap = CMapFactory.create(cidEncoding); } this.extractWidths(dict, xref, descriptor, properties); this.extractDataStructures(dict, baseDict, xref, properties); diff --git a/src/core/fonts.js b/src/core/fonts.js index 6cc4badd6..154c3f087 100644 --- a/src/core/fonts.js +++ b/src/core/fonts.js @@ -18,7 +18,7 @@ ExpertSubsetCharset, FileReaderSync, GlyphsUnicode, info, isArray, isNum, ISOAdobeCharset, Stream, stringToBytes, TextDecoder, TODO, warn, Lexer, Util, - FONT_IDENTITY_MATRIX, FontRendererFactory, shadow */ + FONT_IDENTITY_MATRIX, FontRendererFactory, shadow, isString */ 'use strict'; @@ -2182,6 +2182,7 @@ var Font = (function FontClosure() { this.composite = properties.composite; this.wideChars = properties.wideChars; this.hasEncoding = properties.hasEncoding; + this.cmap = properties.cmap; this.fontMatrix = properties.fontMatrix; if (properties.type == 'Type3') { @@ -3701,7 +3702,7 @@ var 
Font = (function FontClosure() { var dupFirstEntry = false; if (properties.type == 'CIDFontType2' && properties.toUnicode && - properties.toUnicode[0] > 0) { + properties.toUnicode[0] > '\u0000') { // oracle's defect (see 3427), duplicating first entry dupFirstEntry = true; numGlyphs++; @@ -4250,8 +4251,12 @@ var Font = (function FontClosure() { var unicode = toUnicode[i]; var fontCharCode = typeof unicode === 'object' ? unusedUnicode++ : unicode; - if (typeof unicode !== 'undefined') + if (typeof unicode !== 'undefined') { + if (isString(fontCharCode) && fontCharCode.length === 1) { + fontCharCode = fontCharCode.charCodeAt(0); + } result[i] = fontCharCode; + } } return result; }, @@ -4264,7 +4269,7 @@ var Font = (function FontClosure() { var isIdentityMap = toUnicode.length === 0; for (var i = firstChar, ii = lastChar; i <= ii; i++) { // TODO missing map the character according font's CMap - map[i] = isIdentityMap ? i : toUnicode[i]; + map[i] = isIdentityMap ? String.fromCharCode(i) : toUnicode[i]; } } else { for (var i = firstChar, ii = lastChar; i <= ii; i++) { @@ -4272,7 +4277,7 @@ var Font = (function FontClosure() { if (!glyph) glyph = properties.baseEncoding[i]; if (!!glyph && (glyph in GlyphsUnicode)) - map[i] = GlyphsUnicode[glyph]; + map[i] = String.fromCharCode(GlyphsUnicode[glyph]); } } this.toUnicode = map; @@ -4535,15 +4540,15 @@ var Font = (function FontClosure() { warn('Unsupported CMap: ' + cidEncoding); } } - if (!converter && this.wideChars) { + if (!converter && this.cmap) { + var i = 0; // composite fonts have multi-byte strings convert the string from // single-byte to multi-byte - // XXX assuming CIDFonts are two-byte - later need to extract the - // correct byte encoding according to the PDF spec - var length = chars.length - 1; // looping over two bytes at a time so - // loop should never end on the last byte - for (var i = 0; i < length; i++) { - var charcode = int16([chars.charCodeAt(i++), chars.charCodeAt(i)]); + while (i < chars.length) 
{ + var c = this.cmap.readCharCode(chars, i); + var charcode = c[0]; + var length = c[1]; + i += length; var glyph = this.charToGlyph(charcode); glyphs.push(glyph); // placing null after each word break charcode (ASCII SPACE) diff --git a/src/worker_loader.js b/src/worker_loader.js index e59299e2b..736c6bf96 100644 --- a/src/worker_loader.js +++ b/src/worker_loader.js @@ -34,6 +34,7 @@ var files = [ 'core/cidmaps.js', 'core/crypto.js', 'core/evaluator.js', + 'core/cmap.js', 'core/fonts.js', 'core/font_renderer.js', 'core/glyphlist.js', diff --git a/test/pdfs/bug898853.pdf b/test/pdfs/bug898853.pdf new file mode 100644 index 000000000..ab3bbc8c5 Binary files /dev/null and b/test/pdfs/bug898853.pdf differ diff --git a/test/test_manifest.json b/test/test_manifest.json index 9b5146877..6b245d729 100644 --- a/test/test_manifest.json +++ b/test/test_manifest.json @@ -869,6 +869,13 @@ "link": true, "type": "eq" }, + { "id": "bug898853.pdf", + "file": "pdfs/bug898853.pdf", + "md5": "37c37702bf98d33f9f74e2380c4d1a3f", + "rounds": 1, + "type": "eq", + "about": "Has a multi-byte char codes." 
describe('cmap', function() {
  // Feeds CMap source text to CMapFactory and returns the resulting CMap.
  function parse(lines) {
    return CMapFactory.create(new StringStream(lines.join('\n')));
  }

  // Shorthand for the one-character string with the given char code.
  function chr(code) {
    return String.fromCharCode(code);
  }

  it('parses beginbfchar', function() {
    var cmap = parse([
      '2 beginbfchar',
      '<03> <00>',
      '<04> <01>',
      'endbfchar',
      ''
    ]);
    expect(cmap.lookup(0x03)).toEqual(chr(0x00));
    expect(cmap.lookup(0x04)).toEqual(chr(0x01));
    expect(cmap.lookup(0x05)).toBeUndefined();
  });

  it('parses beginbfrange with range', function() {
    var cmap = parse([
      '1 beginbfrange',
      '<06> <0B> 0',
      'endbfrange',
      ''
    ]);
    expect(cmap.lookup(0x05)).toBeUndefined();
    expect(cmap.lookup(0x06)).toEqual(chr(0x00));
    expect(cmap.lookup(0x0B)).toEqual(chr(0x05));
    expect(cmap.lookup(0x0C)).toBeUndefined();
  });

  it('parses beginbfrange with array', function() {
    var cmap = parse([
      '1 beginbfrange',
      '<0D> <12> [ 0 1 2 3 4 5 ]',
      'endbfrange',
      ''
    ]);
    // Array destinations are stored as parsed (plain integers here).
    expect(cmap.lookup(0x0C)).toBeUndefined();
    expect(cmap.lookup(0x0D)).toEqual(0x00);
    expect(cmap.lookup(0x12)).toEqual(0x05);
    expect(cmap.lookup(0x13)).toBeUndefined();
  });

  it('parses begincidchar', function() {
    var cmap = parse([
      '1 begincidchar',
      '<14> 0',
      'endcidchar',
      ''
    ]);
    expect(cmap.lookup(0x14)).toEqual(chr(0x00));
    expect(cmap.lookup(0x15)).toBeUndefined();
  });

  it('parses begincidrange', function() {
    var cmap = parse([
      '1 begincidrange',
      '<0016> <001B> 0',
      'endcidrange',
      ''
    ]);
    expect(cmap.lookup(0x15)).toBeUndefined();
    expect(cmap.lookup(0x16)).toEqual(chr(0x00));
    expect(cmap.lookup(0x1B)).toEqual(chr(0x05));
    expect(cmap.lookup(0x1C)).toBeUndefined();
  });

  it('decodes codespace ranges', function() {
    var cmap = parse([
      '1 begincodespacerange',
      '<01> <02>',
      '<00000003> <00000004>',
      'endcodespacerange',
      ''
    ]);
    var oneByte = cmap.readCharCode(chr(1), 0);
    expect(oneByte[0]).toEqual(1);
    expect(oneByte[1]).toEqual(1);
    var fourBytes = cmap.readCharCode(String.fromCharCode(0, 0, 0, 3), 0);
    expect(fourBytes[0]).toEqual(3);
    expect(fourBytes[1]).toEqual(4);
  });

  it('decodes 4 byte codespace ranges', function() {
    var cmap = parse([
      '1 begincodespacerange',
      '<8EA1A1A1> <8EA1FEFE>',
      'endcodespacerange',
      ''
    ]);
    var code = cmap.readCharCode(
      String.fromCharCode(0x8E, 0xA1, 0xA1, 0xA1), 0);
    expect(code[0]).toEqual(0x8EA1A1A1);
    expect(code[1]).toEqual(4);
  });
});