diff --git a/make.js b/make.js
index 45d14519b..02142b2b7 100644
--- a/make.js
+++ b/make.js
@@ -306,7 +306,8 @@ target.bundle = function(args) {
'core/worker.js',
'core/jpx.js',
'core/jbig2.js',
- 'core/bidi.js'
+ 'core/bidi.js',
+ 'core/cmap.js'
];
var EXT_SRC_FILES = [
diff --git a/src/core/cmap.js b/src/core/cmap.js
new file mode 100644
index 000000000..aa76128f7
--- /dev/null
+++ b/src/core/cmap.js
@@ -0,0 +1,460 @@
+/* -*- Mode: Java; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim: set shiftwidth=2 tabstop=2 autoindent cindent expandtab: */
+/* Copyright 2012 Mozilla Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/* globals Util, isString, isInt, warn, error, isCmd, isEOF, isName, Lexer,
+ isStream */
+
+'use strict';
+
+// Codespace ranges of the predefined CMaps, keyed by CMap name.
+// Each value is laid out like CMap.codespaceRanges: one array per code
+// length (index 0 holds 1-byte codes, index 1 holds 2-byte codes, ...),
+// each being a flat list of inclusive pairs [low1, high1, low2, high2, ...].
+// An entry only lists as many per-length arrays as it needs, so some
+// entries have two arrays and others four.
+var CMAP_CODESPACES = {
+ 'Adobe-CNS1-0': [[], [0, 14335]],
+ 'Adobe-CNS1-1': [[], [0, 17407]],
+ 'Adobe-CNS1-2': [[], [0, 17663]],
+ 'Adobe-CNS1-3': [[], [0, 18943]],
+ 'Adobe-CNS1-4': [[], [0, 19199]],
+ 'Adobe-CNS1-5': [[], [0, 19199]],
+ 'Adobe-CNS1-6': [[], [0, 19199]],
+ 'Adobe-CNS1-UCS2': [[], [0, 65535]],
+ 'B5-H': [[0, 128], [41280, 65278]],
+ 'B5-V': [[0, 128], [41280, 65278]],
+ 'B5pc-H': [[0, 128, 253, 255], [41280, 64766]],
+ 'B5pc-V': [[0, 128, 253, 255], [41280, 64766]],
+ 'CNS-EUC-H': [[0, 128], [41377, 65278], [],
+ [2392957345, 2392981246, 2393022881, 2393046782, 2393088417, 2393112318]],
+ 'CNS-EUC-V': [[0, 128], [41377, 65278], [],
+ [2392957345, 2392981246, 2393022881, 2393046782, 2393088417, 2393112318]],
+ 'CNS1-H': [[], [8481, 32382]],
+ 'CNS1-V': [[], [8481, 32382]],
+ 'CNS2-H': [[], [8481, 32382]],
+ 'CNS2-V': [[], [8481, 32382]],
+ 'ETen-B5-H': [[0, 128], [41280, 65278]],
+ 'ETen-B5-V': [[0, 128], [41280, 65278]],
+ 'ETenms-B5-H': [[0, 128], [41280, 65278]],
+ 'ETenms-B5-V': [[0, 128], [41280, 65278]],
+ 'ETHK-B5-H': [[0, 128], [34624, 65278]],
+ 'ETHK-B5-V': [[0, 128], [34624, 65278]],
+ 'HKdla-B5-H': [[0, 128], [41280, 65278]],
+ 'HKdla-B5-V': [[0, 128], [41280, 65278]],
+ 'HKdlb-B5-H': [[0, 128], [36416, 65278]],
+ 'HKdlb-B5-V': [[0, 128], [36416, 65278]],
+ 'HKgccs-B5-H': [[0, 128], [35392, 65278]],
+ 'HKgccs-B5-V': [[0, 128], [35392, 65278]],
+ 'HKm314-B5-H': [[0, 128], [41280, 65278]],
+ 'HKm314-B5-V': [[0, 128], [41280, 65278]],
+ 'HKm471-B5-H': [[0, 128], [41280, 65278]],
+ 'HKm471-B5-V': [[0, 128], [41280, 65278]],
+ 'HKscs-B5-H': [[0, 128], [34624, 65278]],
+ 'HKscs-B5-V': [[0, 128], [34624, 65278]],
+ 'UniCNS-UCS2-H': [[], [0, 55295, 57344, 65535]],
+ 'UniCNS-UCS2-V': [[], [0, 55295, 57344, 65535]],
+ 'UniCNS-UTF16-H': [[], [0, 55295, 57344, 65535], [],
+ [3623934976, 3690979327]],
+ 'UniCNS-UTF16-V': [[], [0, 55295, 57344, 65535], [],
+ [3623934976, 3690979327]],
+ 'Adobe-GB1-0': [[], [0, 7935]],
+ 'Adobe-GB1-1': [[], [0, 9983]],
+ 'Adobe-GB1-2': [[], [0, 22271]],
+ 'Adobe-GB1-3': [[], [0, 22527]],
+ 'Adobe-GB1-4': [[], [0, 29183]],
+ 'Adobe-GB1-5': [[], [0, 30463]],
+ 'Adobe-GB1-UCS2': [[], [0, 65535]],
+ 'GB-EUC-H': [[0, 128], [41377, 65278]],
+ 'GB-EUC-V': [[0, 128], [41377, 65278]],
+ 'GB-H': [[], [8481, 32382]],
+ 'GB-V': [[], [8481, 32382]],
+ 'GBK-EUC-H': [[0, 128], [33088, 65278]],
+ 'GBK-EUC-V': [[0, 128], [33088, 65278]],
+ 'GBK2K-H': [[0, 127], [33088, 65278], [], [2167439664, 4265213497]],
+ 'GBK2K-V': [[0, 127], [33088, 65278], [], [2167439664, 4265213497]],
+ 'GBKp-EUC-H': [[0, 128], [33088, 65278]],
+ 'GBKp-EUC-V': [[0, 128], [33088, 65278]],
+ 'GBpc-EUC-H': [[0, 128, 253, 255], [41377, 64766]],
+ 'GBpc-EUC-V': [[0, 128, 253, 255], [41377, 64766]],
+ 'GBT-EUC-H': [[0, 128], [41377, 65278]],
+ 'GBT-EUC-V': [[0, 128], [41377, 65278]],
+ 'GBT-H': [[], [8481, 32382]],
+ 'GBT-V': [[], [8481, 32382]],
+ 'GBTpc-EUC-H': [[0, 128, 253, 255], [41377, 64766]],
+ 'GBTpc-EUC-V': [[0, 128, 253, 255], [41377, 64766]],
+ 'UniGB-UCS2-H': [[], [0, 55295, 57344, 65535]],
+ 'UniGB-UCS2-V': [[], [0, 55295, 57344, 65535]],
+ 'UniGB-UTF16-H': [[], [0, 55295, 57344, 65535], [], [3623934976, 3690979327]],
+ 'UniGB-UTF16-V': [[], [0, 55295, 57344, 65535], [], [3623934976, 3690979327]],
+ '78-EUC-H': [[0, 128], [36512, 36575, 41377, 65278]],
+ '78-EUC-V': [[0, 128], [36512, 36575, 41377, 65278]],
+ '78-H': [[], [8481, 32382]],
+ '78-RKSJ-H': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]],
+ '78-RKSJ-V': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]],
+ '78-V': [[], [8481, 32382]],
+ '78ms-RKSJ-H': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]],
+ '78ms-RKSJ-V': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]],
+ '83pv-RKSJ-H': [[0, 128, 160, 223, 253, 255], [33088, 40956, 57408, 64764]],
+ '90ms-RKSJ-H': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]],
+ '90ms-RKSJ-V': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]],
+ '90msp-RKSJ-H': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]],
+ '90msp-RKSJ-V': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]],
+ '90pv-RKSJ-H': [[0, 128, 160, 223, 253, 255], [33088, 40956, 57408, 64764]],
+ '90pv-RKSJ-V': [[0, 128, 160, 223, 253, 255], [33088, 40956, 57408, 64764]],
+ 'Add-H': [[], [8481, 32382]],
+ 'Add-RKSJ-H': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]],
+ 'Add-RKSJ-V': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]],
+ 'Add-V': [[], [8481, 32382]],
+ 'Adobe-Japan1-0': [[], [0, 8447]],
+ 'Adobe-Japan1-1': [[], [0, 8447]],
+ 'Adobe-Japan1-2': [[], [0, 8959]],
+ 'Adobe-Japan1-3': [[], [0, 9471]],
+ 'Adobe-Japan1-4': [[], [0, 15615]],
+ 'Adobe-Japan1-5': [[], [0, 20479]],
+ 'Adobe-Japan1-6': [[], [0, 23295]],
+ 'Adobe-Japan1-UCS2': [[], [0, 65535]],
+ 'Adobe-Japan2-0': [[], [0, 6143]],
+ 'EUC-H': [[0, 128], [36512, 36575, 41377, 65278]],
+ 'EUC-V': [[0, 128], [36512, 36575, 41377, 65278]],
+ 'Ext-H': [[], [8481, 32382]],
+ 'Ext-RKSJ-H': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]],
+ 'Ext-RKSJ-V': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]],
+ 'Ext-V': [[], [8481, 32382]],
+ 'H': [[], [8481, 32382]],
+ 'Hankaku': [[0, 255], []],
+ 'Hiragana': [[0, 255], []],
+ 'Hojo-EUC-H': [[], [], [9413025, 9436926], []],
+ 'Hojo-EUC-V': [[], [], [9413025, 9436926], []],
+ 'Hojo-H': [[], [8481, 32382]],
+ 'Hojo-V': [[], [8481, 32382]],
+ 'Katakana': [[0, 255], []],
+ 'NWP-H': [[], [8481, 32382]],
+ 'NWP-V': [[], [8481, 32382]],
+ 'RKSJ-H': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]],
+ 'RKSJ-V': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]],
+ 'Roman': [[0, 255], []],
+ 'UniHojo-UCS2-H': [[], [0, 55295, 57344, 65535]],
+ 'UniHojo-UCS2-V': [[], [0, 55295, 57344, 65535]],
+ 'UniHojo-UTF16-H': [[], [0, 55295, 57344, 65535], [],
+ [3623934976, 3690979327]],
+ 'UniHojo-UTF16-V': [[], [0, 55295, 57344, 65535], [],
+ [3623934976, 3690979327]],
+ 'UniJIS-UCS2-H': [[], [0, 55295, 57344, 65535]],
+ 'UniJIS-UCS2-HW-H': [[], [0, 55295, 57344, 65535]],
+ 'UniJIS-UCS2-HW-V': [[], [0, 55295, 57344, 65535]],
+ 'UniJIS-UCS2-V': [[], [0, 55295, 57344, 65535]],
+ 'UniJIS-UTF16-H': [[], [0, 55295, 57344, 65535], [],
+ [3623934976, 3690979327]],
+ 'UniJIS-UTF16-V': [[], [0, 55295, 57344, 65535], [],
+ [3623934976, 3690979327]],
+ 'UniJISPro-UCS2-HW-V': [[], [0, 55295, 57344, 65535]],
+ 'UniJISPro-UCS2-V': [[], [0, 55295, 57344, 65535]],
+ 'V': [[], [8481, 32382]],
+ 'WP-Symbol': [[0, 255], []],
+ 'Adobe-Korea1-0': [[], [0, 9471]],
+ 'Adobe-Korea1-1': [[], [0, 18175]],
+ 'Adobe-Korea1-2': [[], [0, 18431]],
+ 'Adobe-Korea1-UCS2': [[], [0, 65535]],
+ 'KSC-EUC-H': [[0, 128], [41377, 65278]],
+ 'KSC-EUC-V': [[0, 128], [41377, 65278]],
+ 'KSC-H': [[], [8481, 32382]],
+ 'KSC-Johab-H': [[0, 128], [33857, 54270, 55345, 57086, 57393, 63998]],
+ 'KSC-Johab-V': [[0, 128], [33857, 54270, 55345, 57086, 57393, 63998]],
+ 'KSC-V': [[], [8481, 32382]],
+ 'KSCms-UHC-H': [[0, 128], [33089, 65278]],
+ 'KSCms-UHC-HW-H': [[0, 128], [33089, 65278]],
+ 'KSCms-UHC-HW-V': [[0, 128], [33089, 65278]],
+ 'KSCms-UHC-V': [[0, 128], [33089, 65278]],
+ 'KSCpc-EUC-H': [[0, 132, 254, 255], [41281, 65022]],
+ 'KSCpc-EUC-V': [[0, 132, 254, 255], [41281, 65022]],
+ 'UniKS-UCS2-H': [[], [0, 55295, 57344, 65535]],
+ 'UniKS-UCS2-V': [[], [0, 55295, 57344, 65535]],
+ 'UniKS-UTF16-H': [[], [0, 55295, 57344, 65535], [], [3623934976, 3690979327]],
+ 'UniKS-UTF16-V': [[], [0, 55295, 57344, 65535], [], [3623934976, 3690979327]]
+};
+
+// CMap, not to be confused with TrueType's cmap.
+var CMap = (function CMapClosure() {
+  /**
+   * Holds the codespace ranges that say which byte sequences form a valid
+   * character code, plus the mapping from each numeric code to its value.
+   */
+  function CMap() {
+    // Codespace ranges are stored as follows:
+    // [[1BytePairs], [2BytePairs], [3BytePairs], [4BytePairs]]
+    // where nBytePairs are ranges e.g. [low1, high1, low2, high2, ...]
+    this.codespaceRanges = [[], [], [], []];
+    // Sparse array indexed by numeric character code.
+    this.map = [];
+    this.vertical = false;
+  }
+  CMap.prototype = {
+    /**
+     * Registers [low, high] (inclusive) as a valid range of n-byte codes.
+     * @param {number} n Code length in bytes; n - 1 indexes codespaceRanges.
+     * @param {number} low First code of the range.
+     * @param {number} high Last code of the range.
+     */
+    addCodespaceRange: function(n, low, high) {
+      this.codespaceRanges[n - 1].push(low, high);
+    },
+
+    /**
+     * Maps the codes low..high (inclusive) to consecutive destination
+     * strings: dstLow itself for low, then dstLow with its last character
+     * incremented by one for each following code.
+     */
+    mapRange: function(low, high, dstLow) {
+      var lastByte = dstLow.length - 1;
+      while (low <= high) {
+        this.map[low] = dstLow;
+        // Only the last byte has to be incremented.
+        dstLow = dstLow.substr(0, lastByte) +
+                 String.fromCharCode(dstLow.charCodeAt(lastByte) + 1);
+        ++low;
+      }
+    },
+
+    /**
+     * Maps low, low + 1, ... to array[0], array[1], ...
+     * Stops at |high| or at the end of |array|, whichever comes first, so
+     * a short array neither leaves explicit undefined entries in the map
+     * nor makes a bogus huge range iterate over every code in it.
+     */
+    mapRangeToArray: function(low, high, array) {
+      for (var i = 0, ii = array.length; i < ii && low <= high; i++, low++) {
+        this.map[low] = array[i];
+      }
+    },
+
+    /** Maps the single code |src| to |dst|. */
+    mapOne: function(src, dst) {
+      this.map[src] = dst;
+    },
+
+    /** Returns the value mapped to |code|, or undefined when unmapped. */
+    lookup: function(code) {
+      return this.map[code];
+    },
+
+    /**
+     * Reads one character code from |str| starting at |offset|.
+     * Bytes are accumulated one at a time (most significant first) and the
+     * first length whose value falls inside a codespace range of that
+     * length wins.
+     * @return {Array} [code, lengthInBytes]; [0, 1] when nothing matches.
+     */
+    readCharCode: function(str, offset) {
+      var c = 0;
+      var codespaceRanges = this.codespaceRanges;
+      var codespaceRangesLen = this.codespaceRanges.length;
+      // 9.7.6.2 CMap Mapping
+      // The code length is at most 4.
+      for (var n = 0; n < codespaceRangesLen; n++) {
+        // >>> 0 keeps 4-byte codes unsigned after the 32-bit shift.
+        c = ((c << 8) | str.charCodeAt(offset + n)) >>> 0;
+        // Check each codespace range to see if it falls within.
+        var codespaceRange = codespaceRanges[n];
+        for (var k = 0, kk = codespaceRange.length; k < kk;) {
+          var low = codespaceRange[k++];
+          var high = codespaceRange[k++];
+          if (c >= low && c <= high) {
+            return [c, n + 1];
+          }
+        }
+      }
+
+      return [0, 1];
+    }
+
+  };
+  return CMap;
+})();
+
+// CMap in which every code from 0 to 0xffff maps to the one-character
+// string with that same char code (mapRange starting at '\u0000').
+var IdentityCMap = (function IdentityCMapClosure() {
+ // vertical: stored as this.vertical.
+ // n: code length in bytes under which the 0..0xffff codespace range is
+ // registered.
+ function IdentityCMap(vertical, n) {
+ CMap.call(this);
+ this.vertical = vertical;
+ this.addCodespaceRange(n, 0, 0xffff);
+ this.mapRange(0, 0xffff, '\u0000');
+ }
+ // NOTE(review): Util.inherit is defined elsewhere; presumably it chains
+ // IdentityCMap.prototype to CMap.prototype -- confirm.
+ Util.inherit(IdentityCMap, CMap, {});
+
+ return IdentityCMap;
+})();
+
+var CMapFactory = (function CMapFactoryClosure() {
+  // Folds the char codes of |str| into one unsigned 32-bit integer, with
+  // the first character ending up as the most significant byte.
+  function strToInt(str) {
+    var a = 0;
+    for (var i = 0; i < str.length; i++) {
+      a = (a << 8) | str.charCodeAt(i);
+    }
+    return a >>> 0;
+  }
+
+  // Calls error() unless |obj| is a string.
+  function expectString(obj) {
+    if (!isString(obj)) {
+      error('Malformed CMap: expected string.');
+    }
+  }
+
+  // Calls error() unless |obj| is an integer.
+  function expectInt(obj) {
+    if (!isInt(obj)) {
+      error('Malformed CMap: expected int.');
+    }
+  }
+
+  // Reads "srcString dstString" pairs up to 'endbfchar' and records each
+  // one with cMap.mapOne(). Reaching EOF first simply ends the section.
+  function parseBfChar(cMap, lexer) {
+    while (true) {
+      var obj = lexer.getObj();
+      if (isEOF(obj)) {
+        break;
+      }
+      if (isCmd(obj, 'endbfchar')) {
+        return;
+      }
+      expectString(obj);
+      var src = strToInt(obj);
+      obj = lexer.getObj();
+      // TODO are /dstName used?
+      expectString(obj);
+      var dst = obj;
+      cMap.mapOne(src, dst);
+    }
+  }
+
+  // Reads "lowString highString dst" triples up to 'endbfrange'.
+  // |dst| is either an int/string (consecutive mapping via mapRange) or a
+  // '[' ... ']' list whose items are stored as parsed via mapRangeToArray.
+  // EOF or any other |dst| falls through to error('Invalid bf range.').
+  function parseBfRange(cMap, lexer) {
+    while (true) {
+      var obj = lexer.getObj();
+      if (isEOF(obj)) {
+        break;
+      }
+      if (isCmd(obj, 'endbfrange')) {
+        return;
+      }
+      expectString(obj);
+      var low = strToInt(obj);
+      obj = lexer.getObj();
+      expectString(obj);
+      var high = strToInt(obj);
+      obj = lexer.getObj();
+      if (isInt(obj) || isString(obj)) {
+        var dstLow = isInt(obj) ? String.fromCharCode(obj) : obj;
+        cMap.mapRange(low, high, dstLow);
+      } else if (isCmd(obj, '[')) {
+        obj = lexer.getObj();
+        var array = [];
+        while (!isCmd(obj, ']') && !isEOF(obj)) {
+          array.push(obj);
+          obj = lexer.getObj();
+        }
+        cMap.mapRangeToArray(low, high, array);
+      } else {
+        break;
+      }
+    }
+    error('Invalid bf range.');
+  }
+
+  // Reads "srcString cidInt" pairs up to 'endcidchar'; the CID is stored
+  // as a one-character string. Reaching EOF first simply ends the section.
+  function parseCidChar(cMap, lexer) {
+    while (true) {
+      var obj = lexer.getObj();
+      if (isEOF(obj)) {
+        break;
+      }
+      if (isCmd(obj, 'endcidchar')) {
+        return;
+      }
+      expectString(obj);
+      var src = strToInt(obj);
+      obj = lexer.getObj();
+      expectInt(obj);
+      var dst = String.fromCharCode(obj);
+      cMap.mapOne(src, dst);
+    }
+  }
+
+  // Reads "lowString highString cidInt" triples up to 'endcidrange' and
+  // maps each range to consecutive one-character strings starting at the
+  // CID. Reaching EOF first simply ends the section.
+  function parseCidRange(cMap, lexer) {
+    while (true) {
+      var obj = lexer.getObj();
+      if (isEOF(obj)) {
+        break;
+      }
+      if (isCmd(obj, 'endcidrange')) {
+        return;
+      }
+      expectString(obj);
+      var low = strToInt(obj);
+      obj = lexer.getObj();
+      expectString(obj);
+      var high = strToInt(obj);
+      obj = lexer.getObj();
+      expectInt(obj);
+      var dstLow = String.fromCharCode(obj);
+      cMap.mapRange(low, high, dstLow);
+    }
+  }
+
+  // Reads "lowString highString" pairs up to 'endcodespacerange'. The code
+  // length passed to addCodespaceRange is the length of the high string.
+  // EOF or a non-string operand falls through to
+  // error('Invalid codespace range.').
+  function parseCodespaceRange(cMap, lexer) {
+    while (true) {
+      var obj = lexer.getObj();
+      if (isEOF(obj)) {
+        break;
+      }
+      if (isCmd(obj, 'endcodespacerange')) {
+        return;
+      }
+      if (!isString(obj)) {
+        break;
+      }
+      var low = strToInt(obj);
+      obj = lexer.getObj();
+      if (!isString(obj)) {
+        break;
+      }
+      var high = strToInt(obj);
+      cMap.addCodespaceRange(obj.length, low, high);
+    }
+    error('Invalid codespace range.');
+  }
+
+  // Walks the objects produced by |lexer| until 'endcmap' or EOF and hands
+  // each begin* section to its parser. Objects that are not commands, and
+  // commands not listed below, are skipped.
+  function parseCmap(cMap, lexer) {
+    objLoop: while (true) {
+      var obj = lexer.getObj();
+      if (isEOF(obj)) {
+        break;
+      } else if (isCmd(obj)) {
+        // The comparison is case-sensitive and CMap operators are spelled
+        // all-lowercase, hence 'endcmap' / 'usecmap'.
+        switch (obj.cmd) {
+          case 'endcmap':
+            break objLoop;
+          case 'usecmap':
+            // TODO: pull in the mappings of the referenced CMap.
+            break;
+          case 'begincodespacerange':
+            parseCodespaceRange(cMap, lexer);
+            break;
+          case 'beginbfchar':
+            parseBfChar(cMap, lexer);
+            break;
+          case 'begincidchar':
+            parseCidChar(cMap, lexer);
+            break;
+          case 'beginbfrange':
+            parseBfRange(cMap, lexer);
+            break;
+          case 'begincidrange':
+            parseCidRange(cMap, lexer);
+            break;
+        }
+      }
+    }
+  }
+  return {
+    /**
+     * Builds a CMap for |encoding|.
+     *  - Name 'Identity-H' / 'Identity-V': a 2-byte IdentityCMap
+     *    (vertical false / true).
+     *  - Any other name listed in CMAP_CODESPACES: a CMap that carries only
+     *    that entry's codespace ranges; names not listed give null.
+     *  - Stream: a CMap parsed from the stream. A parse failure is reported
+     *    with warn() and whatever was parsed before it is still returned.
+     * Any other |encoding| is reported through error('Encoding required.').
+     */
+    create: function (encoding) {
+      var cMap;
+      if (isName(encoding)) {
+        switch (encoding.name) {
+          case 'Identity-H':
+            return new IdentityCMap(false, 2);
+          case 'Identity-V':
+            return new IdentityCMap(true, 2);
+          default:
+            // Own-property test: `in` would also accept names inherited
+            // from Object.prototype (e.g. 'toString').
+            if (Object.prototype.hasOwnProperty.call(CMAP_CODESPACES,
+                                                     encoding.name)) {
+              // XXX: Temporary hack so the correct amount of bytes are read
+              // in CMap.readCharCode.
+              cMap = new CMap();
+              cMap.codespaceRanges = CMAP_CODESPACES[encoding.name];
+              return cMap;
+            }
+            return null;
+        }
+      } else if (isStream(encoding)) {
+        cMap = new CMap();
+        var lexer = new Lexer(encoding);
+        try {
+          parseCmap(cMap, lexer);
+        } catch (e) {
+          warn('Invalid CMap data. ' + e);
+        }
+        return cMap;
+      }
+      error('Encoding required.');
+    }
+  };
+})();
diff --git a/src/core/evaluator.js b/src/core/evaluator.js
index dafdddb50..c11713323 100644
--- a/src/core/evaluator.js
+++ b/src/core/evaluator.js
@@ -20,7 +20,7 @@
isStream, isString, JpegStream, Lexer, Metrics, Name, Parser,
Pattern, PDFImage, PDFJS, serifFonts, stdFontMap, symbolsFonts,
TilingPattern, TODO, warn, Util, Promise,
- RefSetCache, isRef, TextRenderingMode */
+ RefSetCache, isRef, TextRenderingMode, CMapFactory */
'use strict';
@@ -1010,119 +1010,24 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
if (!isIdentityMap)
error('ToUnicode file cmap translation not implemented');
} else if (isStream(cmapObj)) {
- var tokens = [];
- var token = '';
- var beginArrayToken = {};
-
- var cmap = cmapObj.getBytes(cmapObj.length);
- for (var i = 0, ii = cmap.length; i < ii; i++) {
- var octet = cmap[i];
- if (octet == 0x20 || octet == 0x0D || octet == 0x0A ||
- octet == 0x3C || octet == 0x5B || octet == 0x5D) {
- switch (token) {
- case 'usecmap':
- error('usecmap is not implemented');
- break;
-
- case 'beginbfchar':
- case 'beginbfrange':
- case 'begincidchar':
- case 'begincidrange':
- token = '';
- tokens = [];
- break;
-
- case 'endcidrange':
- case 'endbfrange':
- for (var j = 0, jj = tokens.length; j < jj; j += 3) {
- var startRange = tokens[j];
- var endRange = tokens[j + 1];
- var code = tokens[j + 2];
- if (code == 0xFFFF) {
- // CMap is broken, assuming code == startRange
- code = startRange;
- }
- if (isArray(code)) {
- var codeindex = 0;
- while (startRange <= endRange) {
- charToUnicode[startRange] = code[codeindex++];
- ++startRange;
- }
- } else {
- while (startRange <= endRange) {
- charToUnicode[startRange] = code++;
- ++startRange;
- }
- }
- }
- break;
-
- case 'endcidchar':
- case 'endbfchar':
- for (var j = 0, jj = tokens.length; j < jj; j += 2) {
- var index = tokens[j];
- var code = tokens[j + 1];
- charToUnicode[index] = code;
- }
- break;
-
- case '':
- break;
-
- default:
- if (token[0] >= '0' && token[0] <= '9')
- token = parseInt(token, 10); // a number
- tokens.push(token);
- token = '';
+ var cmap = CMapFactory.create(cmapObj).map;
+ // Convert UTF-16BE
+ for (var i in cmap) {
+ var token = cmap[i];
+ var str = [];
+ for (var k = 0; k < token.length; k += 2) {
+ var w1 = (token.charCodeAt(k) << 8) | token.charCodeAt(k + 1);
+ if ((w1 & 0xF800) !== 0xD800) { // w1 < 0xD800 || w1 > 0xDFFF
+ str.push(w1);
+ continue;
}
- switch (octet) {
- case 0x5B:
- // begin list parsing
- tokens.push(beginArrayToken);
- break;
- case 0x5D:
- // collect array items
- var items = [], item;
- while (tokens.length &&
- (item = tokens.pop()) != beginArrayToken)
- items.unshift(item);
- tokens.push(items);
- break;
- }
- } else if (octet == 0x3E) {
- if (token.length) {
- // Heuristic: guessing chars size by checking numbers sizes
- // in the CMap entries.
- if (token.length == 2 && properties.composite)
- properties.wideChars = false;
-
- if (token.length <= 4) {
- // parsing hex number
- tokens.push(parseInt(token, 16));
- token = '';
- } else {
- // parsing hex UTF-16BE numbers
- var str = [];
- for (var k = 0, kk = token.length; k < kk; k += 4) {
- var b = parseInt(token.substr(k, 4), 16);
- if (b <= 0x10) {
- k += 4;
- b = (b << 16) | parseInt(token.substr(k, 4), 16);
- b -= 0x10000;
- str.push(0xD800 | (b >> 10));
- str.push(0xDC00 | (b & 0x3FF));
- break;
- }
- str.push(b);
- }
- tokens.push(String.fromCharCode.apply(String, str));
- token = '';
- }
- }
- } else {
- token += String.fromCharCode(octet);
+ k += 2;
+ var w2 = (token.charCodeAt(k) << 8) | token.charCodeAt(k + 1);
+ str.push(((w1 & 0x3ff) << 10) + (w2 & 0x3ff) + 0x10000);
}
+ cmap[i] = String.fromCharCode.apply(String, str);
}
+ return cmap;
}
return charToUnicode;
},
@@ -1409,6 +1314,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
properties.cidEncoding = cidEncoding.name;
properties.vertical = /-V$/.test(cidEncoding.name);
}
+ properties.cmap = CMapFactory.create(cidEncoding);
}
this.extractWidths(dict, xref, descriptor, properties);
this.extractDataStructures(dict, baseDict, xref, properties);
diff --git a/src/core/fonts.js b/src/core/fonts.js
index 6cc4badd6..154c3f087 100644
--- a/src/core/fonts.js
+++ b/src/core/fonts.js
@@ -18,7 +18,7 @@
ExpertSubsetCharset, FileReaderSync, GlyphsUnicode,
info, isArray, isNum, ISOAdobeCharset, Stream,
stringToBytes, TextDecoder, TODO, warn, Lexer, Util,
- FONT_IDENTITY_MATRIX, FontRendererFactory, shadow */
+ FONT_IDENTITY_MATRIX, FontRendererFactory, shadow, isString */
'use strict';
@@ -2182,6 +2182,7 @@ var Font = (function FontClosure() {
this.composite = properties.composite;
this.wideChars = properties.wideChars;
this.hasEncoding = properties.hasEncoding;
+ this.cmap = properties.cmap;
this.fontMatrix = properties.fontMatrix;
if (properties.type == 'Type3') {
@@ -3701,7 +3702,7 @@ var Font = (function FontClosure() {
var dupFirstEntry = false;
if (properties.type == 'CIDFontType2' && properties.toUnicode &&
- properties.toUnicode[0] > 0) {
+ properties.toUnicode[0] > '\u0000') {
// oracle's defect (see 3427), duplicating first entry
dupFirstEntry = true;
numGlyphs++;
@@ -4250,8 +4251,12 @@ var Font = (function FontClosure() {
var unicode = toUnicode[i];
var fontCharCode = typeof unicode === 'object' ? unusedUnicode++ :
unicode;
- if (typeof unicode !== 'undefined')
+ if (typeof unicode !== 'undefined') {
+ if (isString(fontCharCode) && fontCharCode.length === 1) {
+ fontCharCode = fontCharCode.charCodeAt(0);
+ }
result[i] = fontCharCode;
+ }
}
return result;
},
@@ -4264,7 +4269,7 @@ var Font = (function FontClosure() {
var isIdentityMap = toUnicode.length === 0;
for (var i = firstChar, ii = lastChar; i <= ii; i++) {
// TODO missing map the character according font's CMap
- map[i] = isIdentityMap ? i : toUnicode[i];
+ map[i] = isIdentityMap ? String.fromCharCode(i) : toUnicode[i];
}
} else {
for (var i = firstChar, ii = lastChar; i <= ii; i++) {
@@ -4272,7 +4277,7 @@ var Font = (function FontClosure() {
if (!glyph)
glyph = properties.baseEncoding[i];
if (!!glyph && (glyph in GlyphsUnicode))
- map[i] = GlyphsUnicode[glyph];
+ map[i] = String.fromCharCode(GlyphsUnicode[glyph]);
}
}
this.toUnicode = map;
@@ -4535,15 +4540,15 @@ var Font = (function FontClosure() {
warn('Unsupported CMap: ' + cidEncoding);
}
}
- if (!converter && this.wideChars) {
+ if (!converter && this.cmap) {
+ var i = 0;
// composite fonts have multi-byte strings convert the string from
// single-byte to multi-byte
- // XXX assuming CIDFonts are two-byte - later need to extract the
- // correct byte encoding according to the PDF spec
- var length = chars.length - 1; // looping over two bytes at a time so
- // loop should never end on the last byte
- for (var i = 0; i < length; i++) {
- var charcode = int16([chars.charCodeAt(i++), chars.charCodeAt(i)]);
+ while (i < chars.length) {
+ var c = this.cmap.readCharCode(chars, i);
+ var charcode = c[0];
+ var length = c[1];
+ i += length;
var glyph = this.charToGlyph(charcode);
glyphs.push(glyph);
// placing null after each word break charcode (ASCII SPACE)
diff --git a/src/worker_loader.js b/src/worker_loader.js
index e59299e2b..736c6bf96 100644
--- a/src/worker_loader.js
+++ b/src/worker_loader.js
@@ -34,6 +34,7 @@ var files = [
'core/cidmaps.js',
'core/crypto.js',
'core/evaluator.js',
+ 'core/cmap.js',
'core/fonts.js',
'core/font_renderer.js',
'core/glyphlist.js',
diff --git a/test/pdfs/bug898853.pdf b/test/pdfs/bug898853.pdf
new file mode 100644
index 000000000..ab3bbc8c5
Binary files /dev/null and b/test/pdfs/bug898853.pdf differ
diff --git a/test/test_manifest.json b/test/test_manifest.json
index 9b5146877..6b245d729 100644
--- a/test/test_manifest.json
+++ b/test/test_manifest.json
@@ -869,6 +869,13 @@
"link": true,
"type": "eq"
},
+ { "id": "bug898853.pdf",
+ "file": "pdfs/bug898853.pdf",
+ "md5": "37c37702bf98d33f9f74e2380c4d1a3f",
+ "rounds": 1,
+ "type": "eq",
+ "about": "Has a multi-byte char codes."
+ },
{ "id": "issue1912",
"file": "pdfs/issue1912.pdf",
"md5": "15305b7c2cba971e7423de3f6ad38fef",
diff --git a/test/unit/cmap_spec.js b/test/unit/cmap_spec.js
new file mode 100644
index 000000000..be0e41268
--- /dev/null
+++ b/test/unit/cmap_spec.js
@@ -0,0 +1,86 @@
+/* -*- Mode: Java; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim: set shiftwidth=2 tabstop=2 autoindent cindent expandtab: */
+/* globals expect, it, describe, StringStream, Lexer, CMapFactory */
+
+'use strict';
+
+// Unit tests for CMapFactory.create() on stream input: each case feeds a
+// small CMap fragment through a StringStream and checks the resulting
+// CMap's lookup() or readCharCode().
+describe('cmap', function() {
+ // bfchar: each source code maps to the destination string that follows.
+ it('parses beginbfchar', function() {
+ var str = '2 beginbfchar\n' +
+ '<03> <00>\n' +
+ '<04> <01>\n' +
+ 'endbfchar\n';
+ var stream = new StringStream(str);
+ var cmap = CMapFactory.create(stream);
+ expect(cmap.lookup(0x03)).toEqual(String.fromCharCode(0x00));
+ expect(cmap.lookup(0x04)).toEqual(String.fromCharCode(0x01));
+ expect(cmap.lookup(0x05)).toBeUndefined();
+ });
+ // bfrange with an integer destination: codes 0x06..0x0B map to the
+ // one-character strings 0x00..0x05; neighbours stay unmapped.
+ it('parses beginbfrange with range', function() {
+ var str = '1 beginbfrange\n' +
+ '<06> <0B> 0\n' +
+ 'endbfrange\n';
+ var stream = new StringStream(str);
+ var cmap = CMapFactory.create(stream);
+ expect(cmap.lookup(0x05)).toBeUndefined();
+ expect(cmap.lookup(0x06)).toEqual(String.fromCharCode(0x00));
+ expect(cmap.lookup(0x0B)).toEqual(String.fromCharCode(0x05));
+ expect(cmap.lookup(0x0C)).toBeUndefined();
+ });
+ // bfrange with an array destination: items are stored as parsed, so the
+ // integer items here are looked up as numbers, not strings.
+ it('parses beginbfrange with array', function() {
+ var str = '1 beginbfrange\n' +
+ '<0D> <12> [ 0 1 2 3 4 5 ]\n' +
+ 'endbfrange\n';
+ var stream = new StringStream(str);
+ var cmap = CMapFactory.create(stream);
+ expect(cmap.lookup(0x0C)).toBeUndefined();
+ expect(cmap.lookup(0x0D)).toEqual(0x00);
+ expect(cmap.lookup(0x12)).toEqual(0x05);
+ expect(cmap.lookup(0x13)).toBeUndefined();
+ });
+ // cidchar: the integer CID is stored as a one-character string.
+ it('parses begincidchar', function() {
+ var str = '1 begincidchar\n' +
+ '<14> 0\n' +
+ 'endcidchar\n';
+ var stream = new StringStream(str);
+ var cmap = CMapFactory.create(stream);
+ expect(cmap.lookup(0x14)).toEqual(String.fromCharCode(0x00));
+ expect(cmap.lookup(0x15)).toBeUndefined();
+ });
+ // cidrange: two-byte source codes 0x16..0x1B map to CIDs 0..5.
+ it('parses begincidrange', function() {
+ var str = '1 begincidrange\n' +
+ '<0016> <001B> 0\n' +
+ 'endcidrange\n';
+ var stream = new StringStream(str);
+ var cmap = CMapFactory.create(stream);
+ expect(cmap.lookup(0x15)).toBeUndefined();
+ expect(cmap.lookup(0x16)).toEqual(String.fromCharCode(0x00));
+ expect(cmap.lookup(0x1B)).toEqual(String.fromCharCode(0x05));
+ expect(cmap.lookup(0x1C)).toBeUndefined();
+ });
+ // A 1-byte and a 4-byte codespace range: readCharCode must report both
+ // the code and how many bytes it consumed. The leading count ('1') does
+ // not match the two ranges listed; the parser only reacts to commands,
+ // so the count operand is presumably just skipped.
+ it('decodes codespace ranges', function() {
+ var str = '1 begincodespacerange\n' +
+ '<01> <02>\n' +
+ '<00000003> <00000004>\n' +
+ 'endcodespacerange\n';
+ var stream = new StringStream(str);
+ var cmap = CMapFactory.create(stream);
+ var c = cmap.readCharCode(String.fromCharCode(1), 0);
+ expect(c[0]).toEqual(1);
+ expect(c[1]).toEqual(1);
+ c = cmap.readCharCode(String.fromCharCode(0, 0, 0, 3), 0);
+ expect(c[0]).toEqual(3);
+ expect(c[1]).toEqual(4);
+ });
+ // 4-byte code with the top bit set: the decoded value must come back as
+ // the unsigned number 0x8EA1A1A1, not a negative 32-bit integer.
+ it('decodes 4 byte codespace ranges', function() {
+ var str = '1 begincodespacerange\n' +
+ '<8EA1A1A1> <8EA1FEFE>\n' +
+ 'endcodespacerange\n';
+ var stream = new StringStream(str);
+ var cmap = CMapFactory.create(stream);
+ var c = cmap.readCharCode(String.fromCharCode(0x8E, 0xA1, 0xA1, 0xA1), 0);
+ expect(c[0]).toEqual(0x8EA1A1A1);
+ expect(c[1]).toEqual(4);
+ });
+});
+
diff --git a/test/unit/unit_test.html b/test/unit/unit_test.html
index 225e873fa..df640b66c 100644
--- a/test/unit/unit_test.html
+++ b/test/unit/unit_test.html
@@ -26,6 +26,7 @@
+
@@ -49,6 +50,7 @@
+