Merge pull request #3674 from brendandahl/cmap-squash

Read multi-byte character codes based on codespace ranges.
2013-09-25 18:04:49 -07:00 · 2013-09-25 18:04:49 -07:00 · 1c7f1cee00
commit 1c7f1cee00
parent 7f49dba87c f32e65b19f
9 changed files with 592 additions and 124 deletions
--- a/make.js
+++ b/make.js
@ -306,7 +306,8 @@ target.bundle = function(args) {
    'core/worker.js',
    'core/jpx.js',
    'core/jbig2.js',
-    'core/bidi.js'
+    'core/bidi.js',
+    'core/cmap.js'
  ];

  var EXT_SRC_FILES = [
--- a/src/core/cmap.js
+++ b/src/core/cmap.js
@ -0,0 +1,460 @@
+/* -*- Mode: Java; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim: set shiftwidth=2 tabstop=2 autoindent cindent expandtab: */
+/* Copyright 2012 Mozilla Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/* globals Util, isString, isInt, warn, error, isCmd, isEOF, isName, Lexer,
+           isStream */
+
+'use strict';
+
+// Codespace ranges of the predefined (named) CMaps, keyed by CMap name.
+// Every value has the layout used by CMap.codespaceRanges: the entry at
+// index n - 1 is a flat list [low1, high1, low2, high2, ...] of inclusive
+// bounds for codes that are n bytes long. Values listing only two entries
+// define no 3-byte or 4-byte ranges. CMapFactory.create installs one of these
+// values on a CMap so that CMap.readCharCode consumes the right number of
+// bytes per character code.
+var CMAP_CODESPACES = {
+  'Adobe-CNS1-0': [[], [0, 14335]],
+  'Adobe-CNS1-1': [[], [0, 17407]],
+  'Adobe-CNS1-2': [[], [0, 17663]],
+  'Adobe-CNS1-3': [[], [0, 18943]],
+  'Adobe-CNS1-4': [[], [0, 19199]],
+  'Adobe-CNS1-5': [[], [0, 19199]],
+  'Adobe-CNS1-6': [[], [0, 19199]],
+  'Adobe-CNS1-UCS2': [[], [0, 65535]],
+  'B5-H': [[0, 128], [41280, 65278]],
+  'B5-V': [[0, 128], [41280, 65278]],
+  'B5pc-H': [[0, 128, 253, 255], [41280, 64766]],
+  'B5pc-V': [[0, 128, 253, 255], [41280, 64766]],
+  'CNS-EUC-H': [[0, 128], [41377, 65278], [],
+    [2392957345, 2392981246, 2393022881, 2393046782, 2393088417, 2393112318]],
+  'CNS-EUC-V': [[0, 128], [41377, 65278], [],
+    [2392957345, 2392981246, 2393022881, 2393046782, 2393088417, 2393112318]],
+  'CNS1-H': [[], [8481, 32382]],
+  'CNS1-V': [[], [8481, 32382]],
+  'CNS2-H': [[], [8481, 32382]],
+  'CNS2-V': [[], [8481, 32382]],
+  'ETen-B5-H': [[0, 128], [41280, 65278]],
+  'ETen-B5-V': [[0, 128], [41280, 65278]],
+  'ETenms-B5-H': [[0, 128], [41280, 65278]],
+  'ETenms-B5-V': [[0, 128], [41280, 65278]],
+  'ETHK-B5-H': [[0, 128], [34624, 65278]],
+  'ETHK-B5-V': [[0, 128], [34624, 65278]],
+  'HKdla-B5-H': [[0, 128], [41280, 65278]],
+  'HKdla-B5-V': [[0, 128], [41280, 65278]],
+  'HKdlb-B5-H': [[0, 128], [36416, 65278]],
+  'HKdlb-B5-V': [[0, 128], [36416, 65278]],
+  'HKgccs-B5-H': [[0, 128], [35392, 65278]],
+  'HKgccs-B5-V': [[0, 128], [35392, 65278]],
+  'HKm314-B5-H': [[0, 128], [41280, 65278]],
+  'HKm314-B5-V': [[0, 128], [41280, 65278]],
+  'HKm471-B5-H': [[0, 128], [41280, 65278]],
+  'HKm471-B5-V': [[0, 128], [41280, 65278]],
+  'HKscs-B5-H': [[0, 128], [34624, 65278]],
+  'HKscs-B5-V': [[0, 128], [34624, 65278]],
+  'UniCNS-UCS2-H': [[], [0, 55295, 57344, 65535]],
+  'UniCNS-UCS2-V': [[], [0, 55295, 57344, 65535]],
+  'UniCNS-UTF16-H': [[], [0, 55295, 57344, 65535], [],
+    [3623934976, 3690979327]],
+  'UniCNS-UTF16-V': [[], [0, 55295, 57344, 65535], [],
+    [3623934976, 3690979327]],
+  'Adobe-GB1-0': [[], [0, 7935]],
+  'Adobe-GB1-1': [[], [0, 9983]],
+  'Adobe-GB1-2': [[], [0, 22271]],
+  'Adobe-GB1-3': [[], [0, 22527]],
+  'Adobe-GB1-4': [[], [0, 29183]],
+  'Adobe-GB1-5': [[], [0, 30463]],
+  'Adobe-GB1-UCS2': [[], [0, 65535]],
+  'GB-EUC-H': [[0, 128], [41377, 65278]],
+  'GB-EUC-V': [[0, 128], [41377, 65278]],
+  'GB-H': [[], [8481, 32382]],
+  'GB-V': [[], [8481, 32382]],
+  'GBK-EUC-H': [[0, 128], [33088, 65278]],
+  'GBK-EUC-V': [[0, 128], [33088, 65278]],
+  'GBK2K-H': [[0, 127], [33088, 65278], [], [2167439664, 4265213497]],
+  'GBK2K-V': [[0, 127], [33088, 65278], [], [2167439664, 4265213497]],
+  'GBKp-EUC-H': [[0, 128], [33088, 65278]],
+  'GBKp-EUC-V': [[0, 128], [33088, 65278]],
+  'GBpc-EUC-H': [[0, 128, 253, 255], [41377, 64766]],
+  'GBpc-EUC-V': [[0, 128, 253, 255], [41377, 64766]],
+  'GBT-EUC-H': [[0, 128], [41377, 65278]],
+  'GBT-EUC-V': [[0, 128], [41377, 65278]],
+  'GBT-H': [[], [8481, 32382]],
+  'GBT-V': [[], [8481, 32382]],
+  'GBTpc-EUC-H': [[0, 128, 253, 255], [41377, 64766]],
+  'GBTpc-EUC-V': [[0, 128, 253, 255], [41377, 64766]],
+  'UniGB-UCS2-H': [[], [0, 55295, 57344, 65535]],
+  'UniGB-UCS2-V': [[], [0, 55295, 57344, 65535]],
+  'UniGB-UTF16-H': [[], [0, 55295, 57344, 65535], [], [3623934976, 3690979327]],
+  'UniGB-UTF16-V': [[], [0, 55295, 57344, 65535], [], [3623934976, 3690979327]],
+  '78-EUC-H': [[0, 128], [36512, 36575, 41377, 65278]],
+  '78-EUC-V': [[0, 128], [36512, 36575, 41377, 65278]],
+  '78-H': [[], [8481, 32382]],
+  '78-RKSJ-H': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]],
+  '78-RKSJ-V': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]],
+  '78-V': [[], [8481, 32382]],
+  '78ms-RKSJ-H': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]],
+  '78ms-RKSJ-V': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]],
+  '83pv-RKSJ-H': [[0, 128, 160, 223, 253, 255], [33088, 40956, 57408, 64764]],
+  '90ms-RKSJ-H': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]],
+  '90ms-RKSJ-V': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]],
+  '90msp-RKSJ-H': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]],
+  '90msp-RKSJ-V': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]],
+  '90pv-RKSJ-H': [[0, 128, 160, 223, 253, 255], [33088, 40956, 57408, 64764]],
+  '90pv-RKSJ-V': [[0, 128, 160, 223, 253, 255], [33088, 40956, 57408, 64764]],
+  'Add-H': [[], [8481, 32382]],
+  'Add-RKSJ-H': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]],
+  'Add-RKSJ-V': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]],
+  'Add-V': [[], [8481, 32382]],
+  'Adobe-Japan1-0': [[], [0, 8447]],
+  'Adobe-Japan1-1': [[], [0, 8447]],
+  'Adobe-Japan1-2': [[], [0, 8959]],
+  'Adobe-Japan1-3': [[], [0, 9471]],
+  'Adobe-Japan1-4': [[], [0, 15615]],
+  'Adobe-Japan1-5': [[], [0, 20479]],
+  'Adobe-Japan1-6': [[], [0, 23295]],
+  'Adobe-Japan1-UCS2': [[], [0, 65535]],
+  'Adobe-Japan2-0': [[], [0, 6143]],
+  'EUC-H': [[0, 128], [36512, 36575, 41377, 65278]],
+  'EUC-V': [[0, 128], [36512, 36575, 41377, 65278]],
+  'Ext-H': [[], [8481, 32382]],
+  'Ext-RKSJ-H': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]],
+  'Ext-RKSJ-V': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]],
+  'Ext-V': [[], [8481, 32382]],
+  'H': [[], [8481, 32382]],
+  'Hankaku': [[0, 255], []],
+  'Hiragana': [[0, 255], []],
+  'Hojo-EUC-H': [[], [], [9413025, 9436926], []],
+  'Hojo-EUC-V': [[], [], [9413025, 9436926], []],
+  'Hojo-H': [[], [8481, 32382]],
+  'Hojo-V': [[], [8481, 32382]],
+  'Katakana': [[0, 255], []],
+  'NWP-H': [[], [8481, 32382]],
+  'NWP-V': [[], [8481, 32382]],
+  'RKSJ-H': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]],
+  'RKSJ-V': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]],
+  'Roman': [[0, 255], []],
+  'UniHojo-UCS2-H': [[], [0, 55295, 57344, 65535]],
+  'UniHojo-UCS2-V': [[], [0, 55295, 57344, 65535]],
+  'UniHojo-UTF16-H': [[], [0, 55295, 57344, 65535], [],
+    [3623934976, 3690979327]],
+  'UniHojo-UTF16-V': [[], [0, 55295, 57344, 65535], [],
+    [3623934976, 3690979327]],
+  'UniJIS-UCS2-H': [[], [0, 55295, 57344, 65535]],
+  'UniJIS-UCS2-HW-H': [[], [0, 55295, 57344, 65535]],
+  'UniJIS-UCS2-HW-V': [[], [0, 55295, 57344, 65535]],
+  'UniJIS-UCS2-V': [[], [0, 55295, 57344, 65535]],
+  'UniJIS-UTF16-H': [[], [0, 55295, 57344, 65535], [],
+    [3623934976, 3690979327]],
+  'UniJIS-UTF16-V': [[], [0, 55295, 57344, 65535], [],
+    [3623934976, 3690979327]],
+  'UniJISPro-UCS2-HW-V': [[], [0, 55295, 57344, 65535]],
+  'UniJISPro-UCS2-V': [[], [0, 55295, 57344, 65535]],
+  'V': [[], [8481, 32382]],
+  'WP-Symbol': [[0, 255], []],
+  'Adobe-Korea1-0': [[], [0, 9471]],
+  'Adobe-Korea1-1': [[], [0, 18175]],
+  'Adobe-Korea1-2': [[], [0, 18431]],
+  'Adobe-Korea1-UCS2': [[], [0, 65535]],
+  'KSC-EUC-H': [[0, 128], [41377, 65278]],
+  'KSC-EUC-V': [[0, 128], [41377, 65278]],
+  'KSC-H': [[], [8481, 32382]],
+  'KSC-Johab-H': [[0, 128], [33857, 54270, 55345, 57086, 57393, 63998]],
+  'KSC-Johab-V': [[0, 128], [33857, 54270, 55345, 57086, 57393, 63998]],
+  'KSC-V': [[], [8481, 32382]],
+  'KSCms-UHC-H': [[0, 128], [33089, 65278]],
+  'KSCms-UHC-HW-H': [[0, 128], [33089, 65278]],
+  'KSCms-UHC-HW-V': [[0, 128], [33089, 65278]],
+  'KSCms-UHC-V': [[0, 128], [33089, 65278]],
+  'KSCpc-EUC-H': [[0, 132, 254, 255], [41281, 65022]],
+  'KSCpc-EUC-V': [[0, 132, 254, 255], [41281, 65022]],
+  'UniKS-UCS2-H': [[], [0, 55295, 57344, 65535]],
+  'UniKS-UCS2-V': [[], [0, 55295, 57344, 65535]],
+  'UniKS-UTF16-H': [[], [0, 55295, 57344, 65535], [], [3623934976, 3690979327]],
+  'UniKS-UTF16-V': [[], [0, 55295, 57344, 65535], [], [3623934976, 3690979327]]
+};
+
+// Character code map (CMap) used for composite fonts; this is unrelated to
+// the 'cmap' table found in TrueType fonts.
+var CMap = (function CMapClosure() {
+  function CMap() {
+    // One flat list of bounds per code length: the entry at index n - 1 holds
+    // [low1, high1, low2, high2, ...] for codes that are n bytes long.
+    this.codespaceRanges = [[], [], [], []];
+    // Array indexed by character code holding the mapped value.
+    this.map = [];
+    this.vertical = false;
+  }
+  CMap.prototype = {
+    // Registers [low, high] as a valid range for codes that are n bytes long.
+    addCodespaceRange: function(n, low, high) {
+      var ranges = this.codespaceRanges[n - 1];
+      ranges.push(low, high);
+    },
+
+    // Maps each code in [low, high] to consecutive destination strings: the
+    // first code gets dstLow and every following code gets the previous
+    // destination with its final character incremented by one.
+    mapRange: function(low, high, dstLow) {
+      var tail = dstLow.length - 1;
+      var dst = dstLow;
+      for (var code = low; code <= high; code++) {
+        this.map[code] = dst;
+        var next = dst.charCodeAt(tail) + 1;
+        dst = dst.substr(0, tail) + String.fromCharCode(next);
+      }
+    },
+
+    // Maps the codes in [low, high], in order, to the elements of array.
+    mapRangeToArray: function(low, high, array) {
+      for (var code = low, index = 0; code <= high; code++, index++) {
+        this.map[code] = array[index];
+      }
+    },
+
+    // Maps the single code src to dst.
+    mapOne: function(src, dst) {
+      this.map[src] = dst;
+    },
+
+    // Returns the value mapped to code, or undefined when there is none.
+    lookup: function(code) {
+      return this.map[code];
+    },
+
+    // Reads one character code from str starting at offset. Bytes are
+    // accumulated one at a time (9.7.6.2 CMap Mapping; a code is at most as
+    // long as there are codespace range lists, i.e. 4 bytes) until the value
+    // falls inside a codespace range registered for that length.
+    // Returns [code, byteLength], or [0, 1] when no range matches.
+    readCharCode: function(str, offset) {
+      var rangesByLength = this.codespaceRanges;
+      var maxLength = rangesByLength.length;
+      var code = 0;
+      for (var i = 0; i < maxLength; i++) {
+        code = ((code << 8) | str.charCodeAt(offset + i)) >>> 0;
+        var bounds = rangesByLength[i];
+        for (var k = 0; k < bounds.length; k += 2) {
+          if (code >= bounds[k] && code <= bounds[k + 1]) {
+            return [code, i + 1];
+          }
+        }
+      }
+
+      return [0, 1];
+    }
+  };
+  return CMap;
+})();
+
+// CMap in which every code from 0 to 0xffff maps to the one-character string
+// with that same character code. Codes are n bytes long.
+var IdentityCMap = (function IdentityCMapClosure() {
+  var FIRST_CODE = 0;
+  var LAST_CODE = 0xffff;
+
+  function IdentityCMap(vertical, n) {
+    CMap.call(this);
+    this.vertical = vertical;
+    this.addCodespaceRange(n, FIRST_CODE, LAST_CODE);
+    // mapRange increments the destination once per code, starting at U+0000.
+    this.mapRange(FIRST_CODE, LAST_CODE, '\u0000');
+  }
+  Util.inherit(IdentityCMap, CMap, {});
+
+  return IdentityCMap;
+})();
+
+var CMapFactory = (function CMapFactoryClosure() {
+  // Interprets the characters of str as big-endian bytes and returns the
+  // resulting value as an unsigned 32-bit integer.
+  function strToInt(str) {
+    var value = 0;
+    var pos = 0;
+    var end = str.length;
+    while (pos < end) {
+      value = (value << 8) | str.charCodeAt(pos);
+      pos++;
+    }
+    return value >>> 0;
+  }
+
+  // Reports a malformed CMap through error() unless obj is a string.
+  function expectString(obj) {
+    if (isString(obj)) {
+      return;
+    }
+    error('Malformed CMap: expected string.');
+  }
+
+  // Reports a malformed CMap through error() unless obj is an integer.
+  function expectInt(obj) {
+    if (isInt(obj)) {
+      return;
+    }
+    error('Malformed CMap: expected int.');
+  }
+
+  // Parses the '<src> <dst>' pairs of a bfchar section, mapping each source
+  // code to its destination string. Stops at 'endbfchar' or at end of input.
+  function parseBfChar(cMap, lexer) {
+    var entry = lexer.getObj();
+    while (!isEOF(entry) && !isCmd(entry, 'endbfchar')) {
+      expectString(entry);
+      var srcCode = strToInt(entry);
+      var dstString = lexer.getObj();
+      // TODO are /dstName used?
+      expectString(dstString);
+      cMap.mapOne(srcCode, dstString);
+      entry = lexer.getObj();
+    }
+  }
+
+  // Parses the entries of a bfrange section. Each entry is '<low> <high>'
+  // followed by either a start destination (integer or string) or a bracketed
+  // array holding one destination per code. Returns at 'endbfrange'; reaching
+  // end of input or an unexpected destination token is reported via error().
+  function parseBfRange(cMap, lexer) {
+    for (var obj = lexer.getObj(); !isEOF(obj); obj = lexer.getObj()) {
+      if (isCmd(obj, 'endbfrange')) {
+        return;
+      }
+      expectString(obj);
+      var low = strToInt(obj);
+      var highObj = lexer.getObj();
+      expectString(highObj);
+      var high = strToInt(highObj);
+      var dst = lexer.getObj();
+      if (isInt(dst)) {
+        // An integer destination is used as a single character code.
+        cMap.mapRange(low, high, String.fromCharCode(dst));
+      } else if (isString(dst)) {
+        cMap.mapRange(low, high, dst);
+      } else if (isCmd(dst, '[')) {
+        var items = [];
+        var item = lexer.getObj();
+        while (!isCmd(item, ']') && !isEOF(item)) {
+          items.push(item);
+          item = lexer.getObj();
+        }
+        cMap.mapRangeToArray(low, high, items);
+      } else {
+        break;
+      }
+    }
+    error('Invalid bf range.');
+  }
+
+  // Parses the '<src> dst' pairs of a cidchar section, mapping each source
+  // code to the one-character string whose character code is the integer dst.
+  // Stops at 'endcidchar' or at end of input.
+  function parseCidChar(cMap, lexer) {
+    var entry = lexer.getObj();
+    while (!isEOF(entry) && !isCmd(entry, 'endcidchar')) {
+      expectString(entry);
+      var srcCode = strToInt(entry);
+      var cid = lexer.getObj();
+      expectInt(cid);
+      cMap.mapOne(srcCode, String.fromCharCode(cid));
+      entry = lexer.getObj();
+    }
+  }
+
+  // Parses the '<low> <high> dst' triples of a cidrange section, mapping the
+  // codes low..high to consecutive values that start at the integer dst.
+  // Stops at 'endcidrange' or at end of input.
+  function parseCidRange(cMap, lexer) {
+    var entry = lexer.getObj();
+    while (!isEOF(entry) && !isCmd(entry, 'endcidrange')) {
+      expectString(entry);
+      var low = strToInt(entry);
+      var highObj = lexer.getObj();
+      expectString(highObj);
+      var high = strToInt(highObj);
+      var firstCid = lexer.getObj();
+      expectInt(firstCid);
+      cMap.mapRange(low, high, String.fromCharCode(firstCid));
+      entry = lexer.getObj();
+    }
+  }
+
+  // Parses the '<low> <high>' pairs of a codespacerange section and registers
+  // each pair on cMap; the code length is the length of the <high> string.
+  // Returns at 'endcodespacerange'; end of input or a non-string token is
+  // reported via error().
+  function parseCodespaceRange(cMap, lexer) {
+    for (;;) {
+      var lowObj = lexer.getObj();
+      if (isEOF(lowObj)) {
+        break;
+      }
+      if (isCmd(lowObj, 'endcodespacerange')) {
+        return;
+      }
+      if (!isString(lowObj)) {
+        break;
+      }
+      var low = strToInt(lowObj);
+      var highObj = lexer.getObj();
+      if (!isString(highObj)) {
+        break;
+      }
+      var high = strToInt(highObj);
+      cMap.addCodespaceRange(highObj.length, low, high);
+    }
+    error('Invalid codespace range.');
+  }
+
+  // Reads objects from lexer until the 'endcmap' operator or end of input and
+  // hands every 'begin...' section to its dedicated parser, which records the
+  // section's entries on cMap. Objects that are not commands, and commands
+  // not listed below, are skipped.
+  function parseCmap(cMap, lexer) {
+    objLoop: while (true) {
+      var obj = lexer.getObj();
+      if (isEOF(obj)) {
+        break;
+      } else if (isCmd(obj)) {
+        switch (obj.cmd) {
+          // Operator names are case-sensitive and entirely lowercase, so the
+          // labels must be spelled 'endcmap' and 'usecmap' to ever match.
+          case 'endcmap':
+            break objLoop;
+          case 'usecmap':
+            // TODO
+            break;
+          case 'begincodespacerange':
+            parseCodespaceRange(cMap, lexer);
+            break;
+          case 'beginbfchar':
+            parseBfChar(cMap, lexer);
+            break;
+          case 'begincidchar':
+            parseCidChar(cMap, lexer);
+            break;
+          case 'beginbfrange':
+            parseBfRange(cMap, lexer);
+            break;
+          case 'begincidrange':
+            parseCidRange(cMap, lexer);
+            break;
+        }
+      }
+    }
+  }
+  return {
+    // Builds the CMap described by encoding, which is either the name of a
+    // predefined CMap or a stream holding CMap data. Returns null for a name
+    // that is neither an Identity CMap nor listed in CMAP_CODESPACES; any
+    // other kind of encoding is reported via error().
+    create: function (encoding) {
+      var cMap;
+      if (isName(encoding)) {
+        var name = encoding.name;
+        if (name === 'Identity-H') {
+          return new IdentityCMap(false, 2);
+        }
+        if (name === 'Identity-V') {
+          return new IdentityCMap(true, 2);
+        }
+        if (!(name in CMAP_CODESPACES)) {
+          return null;
+        }
+        // XXX: Temporary hack so the correct amount of bytes are read in
+        // CMap.readCharCode. Only the codespace ranges are filled in; the
+        // table entry is shared with the CMap, not copied.
+        cMap = new CMap();
+        cMap.codespaceRanges = CMAP_CODESPACES[name];
+        return cMap;
+      }
+      if (isStream(encoding)) {
+        cMap = new CMap();
+        var lexer = new Lexer(encoding);
+        try {
+          parseCmap(cMap, lexer);
+        } catch (e) {
+          // Best effort: keep whatever was parsed before the failure.
+          warn('Invalid CMap data. ' + e);
+        }
+        return cMap;
+      }
+      error('Encoding required.');
+    }
+  };
+})();
--- a/src/core/evaluator.js
+++ b/src/core/evaluator.js
@ -20,7 +20,7 @@
           isStream, isString, JpegStream, Lexer, Metrics, Name, Parser,
           Pattern, PDFImage, PDFJS, serifFonts, stdFontMap, symbolsFonts,
           TilingPattern, TODO, warn, Util, Promise,
-           RefSetCache, isRef, TextRenderingMode */
+           RefSetCache, isRef, TextRenderingMode, CMapFactory */

 'use strict';

@ -1010,119 +1010,24 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
        if (!isIdentityMap)
          error('ToUnicode file cmap translation not implemented');
      } else if (isStream(cmapObj)) {
-        var tokens = [];
-        var token = '';
-        var beginArrayToken = {};
-
-        var cmap = cmapObj.getBytes(cmapObj.length);
-        for (var i = 0, ii = cmap.length; i < ii; i++) {
-          var octet = cmap[i];
-          if (octet == 0x20 || octet == 0x0D || octet == 0x0A ||
-              octet == 0x3C || octet == 0x5B || octet == 0x5D) {
-            switch (token) {
-              case 'usecmap':
-                error('usecmap is not implemented');
-                break;
-
-              case 'beginbfchar':
-              case 'beginbfrange':
-              case 'begincidchar':
-              case 'begincidrange':
-                token = '';
-                tokens = [];
-                break;
-
-              case 'endcidrange':
-              case 'endbfrange':
-                for (var j = 0, jj = tokens.length; j < jj; j += 3) {
-                  var startRange = tokens[j];
-                  var endRange = tokens[j + 1];
-                  var code = tokens[j + 2];
-                  if (code == 0xFFFF) {
-                    // CMap is broken, assuming code == startRange
-                    code = startRange;
-                  }
-                  if (isArray(code)) {
-                    var codeindex = 0;
-                    while (startRange <= endRange) {
-                      charToUnicode[startRange] = code[codeindex++];
-                      ++startRange;
-                    }
-                  } else {
-                    while (startRange <= endRange) {
-                      charToUnicode[startRange] = code++;
-                      ++startRange;
-                    }
-                  }
-                }
-                break;
-
-              case 'endcidchar':
-              case 'endbfchar':
-                for (var j = 0, jj = tokens.length; j < jj; j += 2) {
-                  var index = tokens[j];
-                  var code = tokens[j + 1];
-                  charToUnicode[index] = code;
-                }
-                break;
-
-              case '':
-                break;
-
-              default:
-                if (token[0] >= '0' && token[0] <= '9')
-                  token = parseInt(token, 10); // a number
-                tokens.push(token);
-                token = '';
+        var cmap = CMapFactory.create(cmapObj).map;
+        // Convert UTF-16BE
+        for (var i in cmap) {
+          var token = cmap[i];
+          var str = [];
+          for (var k = 0; k < token.length; k += 2) {
+            var w1 = (token.charCodeAt(k) << 8) | token.charCodeAt(k + 1);
+            if ((w1 & 0xF800) !== 0xD800) { // w1 < 0xD800 || w1 > 0xDFFF
+              str.push(w1);
+              continue;
            }
-            switch (octet) {
-              case 0x5B:
-                // begin list parsing
-                tokens.push(beginArrayToken);
-                break;
-              case 0x5D:
-                // collect array items
-                var items = [], item;
-                while (tokens.length &&
-                       (item = tokens.pop()) != beginArrayToken)
-                  items.unshift(item);
-                tokens.push(items);
-                break;
-            }
-          } else if (octet == 0x3E) {
-            if (token.length) {
-              // Heuristic: guessing chars size by checking numbers sizes
-              // in the CMap entries.
-              if (token.length == 2 && properties.composite)
-                properties.wideChars = false;
-
-              if (token.length <= 4) {
-                // parsing hex number
-                tokens.push(parseInt(token, 16));
-                token = '';
-              } else {
-                // parsing hex UTF-16BE numbers
-                var str = [];
-                for (var k = 0, kk = token.length; k < kk; k += 4) {
-                  var b = parseInt(token.substr(k, 4), 16);
-                  if (b <= 0x10) {
-                    k += 4;
-                    b = (b << 16) | parseInt(token.substr(k, 4), 16);
-                    b -= 0x10000;
-                    str.push(0xD800 | (b >> 10));
-                    str.push(0xDC00 | (b & 0x3FF));
-                    break;
-                  }
-                  str.push(b);
-                }
-                tokens.push(String.fromCharCode.apply(String, str));
-                token = '';
-              }
-            }
-          } else {
-            token += String.fromCharCode(octet);
+            k += 2;
+            var w2 = (token.charCodeAt(k) << 8) | token.charCodeAt(k + 1);
+            str.push(((w1 & 0x3ff) << 10) + (w2 & 0x3ff) + 0x10000);
          }
+          cmap[i] = String.fromCharCode.apply(String, str);
        }
+        return cmap;
      }
      return charToUnicode;
    },
@ -1409,6 +1314,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
          properties.cidEncoding = cidEncoding.name;
          properties.vertical = /-V$/.test(cidEncoding.name);
        }
+        properties.cmap = CMapFactory.create(cidEncoding);
      }
      this.extractWidths(dict, xref, descriptor, properties);
      this.extractDataStructures(dict, baseDict, xref, properties);
--- a/src/core/fonts.js
+++ b/src/core/fonts.js
@ -18,7 +18,7 @@
           ExpertSubsetCharset, FileReaderSync, GlyphsUnicode,
           info, isArray, isNum, ISOAdobeCharset, Stream,
           stringToBytes, TextDecoder, TODO, warn, Lexer, Util,
-           FONT_IDENTITY_MATRIX, FontRendererFactory, shadow */
+           FONT_IDENTITY_MATRIX, FontRendererFactory, shadow, isString */

 'use strict';

@ -2182,6 +2182,7 @@ var Font = (function FontClosure() {
    this.composite = properties.composite;
    this.wideChars = properties.wideChars;
    this.hasEncoding = properties.hasEncoding;
+    this.cmap = properties.cmap;

    this.fontMatrix = properties.fontMatrix;
    if (properties.type == 'Type3') {
@ -3701,7 +3702,7 @@ var Font = (function FontClosure() {

      var dupFirstEntry = false;
      if (properties.type == 'CIDFontType2' && properties.toUnicode &&
-          properties.toUnicode[0] > 0) {
+          properties.toUnicode[0] > '\u0000') {
        // oracle's defect (see 3427), duplicating first entry
        dupFirstEntry = true;
        numGlyphs++;
@ -4250,8 +4251,12 @@ var Font = (function FontClosure() {
        var unicode = toUnicode[i];
        var fontCharCode = typeof unicode === 'object' ? unusedUnicode++ :
          unicode;
-        if (typeof unicode !== 'undefined')
+        if (typeof unicode !== 'undefined') {
+          if (isString(fontCharCode) && fontCharCode.length === 1) {
+            fontCharCode = fontCharCode.charCodeAt(0);
+          }
          result[i] = fontCharCode;
+        }
      }
      return result;
    },
@ -4264,7 +4269,7 @@ var Font = (function FontClosure() {
        var isIdentityMap = toUnicode.length === 0;
        for (var i = firstChar, ii = lastChar; i <= ii; i++) {
          // TODO missing map the character according font's CMap
-          map[i] = isIdentityMap ? i : toUnicode[i];
+          map[i] = isIdentityMap ? String.fromCharCode(i) : toUnicode[i];
        }
      } else {
        for (var i = firstChar, ii = lastChar; i <= ii; i++) {
@ -4272,7 +4277,7 @@ var Font = (function FontClosure() {
          if (!glyph)
            glyph = properties.baseEncoding[i];
          if (!!glyph && (glyph in GlyphsUnicode))
-            map[i] = GlyphsUnicode[glyph];
+            map[i] = String.fromCharCode(GlyphsUnicode[glyph]);
        }
      }
      this.toUnicode = map;
@ -4535,15 +4540,15 @@ var Font = (function FontClosure() {
          warn('Unsupported CMap: ' + cidEncoding);
        }
      }
-      if (!converter && this.wideChars) {
+      if (!converter && this.cmap) {
+        var i = 0;
        // composite fonts have multi-byte strings convert the string from
        // single-byte to multi-byte
-        // XXX assuming CIDFonts are two-byte - later need to extract the
-        // correct byte encoding according to the PDF spec
-        var length = chars.length - 1; // looping over two bytes at a time so
-                                       // loop should never end on the last byte
-        for (var i = 0; i < length; i++) {
-          var charcode = int16([chars.charCodeAt(i++), chars.charCodeAt(i)]);
+        while (i < chars.length) {
+          var c = this.cmap.readCharCode(chars, i);
+          var charcode = c[0];
+          var length = c[1];
+          i += length;
          var glyph = this.charToGlyph(charcode);
          glyphs.push(glyph);
          // placing null after each word break charcode (ASCII SPACE)
--- a/src/worker_loader.js
+++ b/src/worker_loader.js
@ -34,6 +34,7 @@ var files = [
  'core/cidmaps.js',
  'core/crypto.js',
  'core/evaluator.js',
+  'core/cmap.js',
  'core/fonts.js',
  'core/font_renderer.js',
  'core/glyphlist.js',
--- a/test/pdfs/bug898853.pdf
+++ b/test/pdfs/bug898853.pdf
--- a/test/test_manifest.json
+++ b/test/test_manifest.json
@ -869,6 +869,13 @@
      "link": true,
      "type": "eq"
    },
+    { "id": "bug898853.pdf",
+      "file": "pdfs/bug898853.pdf",
+      "md5": "37c37702bf98d33f9f74e2380c4d1a3f",
+      "rounds": 1,
+      "type": "eq",
+      "about": "Has a multi-byte char codes."
+    },
    {  "id": "issue1912",
      "file": "pdfs/issue1912.pdf",
      "md5": "15305b7c2cba971e7423de3f6ad38fef",
--- a/test/unit/cmap_spec.js
+++ b/test/unit/cmap_spec.js
@ -0,0 +1,86 @@
+/* -*- Mode: Java; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim: set shiftwidth=2 tabstop=2 autoindent cindent expandtab: */
+/* globals expect, it, describe, StringStream, Lexer, CMapFactory */
+
+'use strict';
+
+describe('cmap', function() {
+  // Runs the given CMap program text through CMapFactory.
+  function createCMap(str) {
+    return CMapFactory.create(new StringStream(str));
+  }
+
+  it('parses beginbfchar', function() {
+    var cmap = createCMap('2 beginbfchar\n' +
+                          '<03> <00>\n' +
+                          '<04> <01>\n' +
+                          'endbfchar\n');
+    expect(cmap.lookup(0x03)).toEqual(String.fromCharCode(0x00));
+    expect(cmap.lookup(0x04)).toEqual(String.fromCharCode(0x01));
+    expect(cmap.lookup(0x05)).toBeUndefined();
+  });
+  it('parses beginbfrange with range', function() {
+    var cmap = createCMap('1 beginbfrange\n' +
+                          '<06> <0B> 0\n' +
+                          'endbfrange\n');
+    expect(cmap.lookup(0x05)).toBeUndefined();
+    expect(cmap.lookup(0x06)).toEqual(String.fromCharCode(0x00));
+    expect(cmap.lookup(0x0B)).toEqual(String.fromCharCode(0x05));
+    expect(cmap.lookup(0x0C)).toBeUndefined();
+  });
+  it('parses beginbfrange with array', function() {
+    var cmap = createCMap('1 beginbfrange\n' +
+                          '<0D> <12> [ 0 1 2 3 4 5 ]\n' +
+                          'endbfrange\n');
+    expect(cmap.lookup(0x0C)).toBeUndefined();
+    // Array elements are stored as parsed, so the integers come back as-is.
+    expect(cmap.lookup(0x0D)).toEqual(0x00);
+    expect(cmap.lookup(0x12)).toEqual(0x05);
+    expect(cmap.lookup(0x13)).toBeUndefined();
+  });
+  it('parses begincidchar', function() {
+    var cmap = createCMap('1 begincidchar\n' +
+                          '<14> 0\n' +
+                          'endcidchar\n');
+    expect(cmap.lookup(0x14)).toEqual(String.fromCharCode(0x00));
+    expect(cmap.lookup(0x15)).toBeUndefined();
+  });
+  it('parses begincidrange', function() {
+    var cmap = createCMap('1 begincidrange\n' +
+                          '<0016> <001B>   0\n' +
+                          'endcidrange\n');
+    expect(cmap.lookup(0x15)).toBeUndefined();
+    expect(cmap.lookup(0x16)).toEqual(String.fromCharCode(0x00));
+    expect(cmap.lookup(0x1B)).toEqual(String.fromCharCode(0x05));
+    expect(cmap.lookup(0x1C)).toBeUndefined();
+  });
+  it('decodes codespace ranges', function() {
+    var cmap = createCMap('1 begincodespacerange\n' +
+                          '<01> <02>\n' +
+                          '<00000003> <00000004>\n' +
+                          'endcodespacerange\n');
+    var oneByte = cmap.readCharCode(String.fromCharCode(1), 0);
+    expect(oneByte[0]).toEqual(1);
+    expect(oneByte[1]).toEqual(1);
+    var fourBytes = cmap.readCharCode(String.fromCharCode(0, 0, 0, 3), 0);
+    expect(fourBytes[0]).toEqual(3);
+    expect(fourBytes[1]).toEqual(4);
+  });
+  it('decodes 4 byte codespace ranges', function() {
+    var cmap = createCMap('1 begincodespacerange\n' +
+                          '<8EA1A1A1> <8EA1FEFE>\n' +
+                          'endcodespacerange\n');
+    var bytes = String.fromCharCode(0x8E, 0xA1, 0xA1, 0xA1);
+    var charCode = cmap.readCharCode(bytes, 0);
+    expect(charCode[0]).toEqual(0x8EA1A1A1);
+    expect(charCode[1]).toEqual(4);
+  });
+});
+
--- a/test/unit/unit_test.html
+++ b/test/unit/unit_test.html
@ -26,6 +26,7 @@
  <script type="text/javascript" src="../../src/shared/colorspace.js"></script>
  <script type="text/javascript" src="../../src/core/crypto.js"></script>
  <script type="text/javascript" src="../../src/core/evaluator.js"></script>
+  <script type="text/javascript" src="../../src/core/cmap.js"></script>
  <script type="text/javascript" src="../../src/core/fonts.js"></script>
  <script type="text/javascript" src="../../src/core/glyphlist.js"></script>
  <script type="text/javascript" src="../../src/core/image.js"></script>
@ -49,6 +50,7 @@
  <script type="text/javascript" src="api_spec.js"></script>
  <script type="text/javascript" src="metadata_spec.js"></script>
  <script type="text/javascript" src="util_spec.js"></script>
+  <script type="text/javascript" src="cmap_spec.js"></script>
  <script type="text/javascript">
    'use strict';