Merge branch 'master' of git://github.com/mozilla/pdf.js into text-select

Conflicts: src/canvas.js
2011-11-29 14:17:05 -05:00 · 2011-11-29 14:17:05 -05:00 · a544bed57e
commit a544bed57e
parent 4cfc552163 fcc05b08bf
3 changed files with 230 additions and 126 deletions
--- a/src/canvas.js
+++ b/src/canvas.js
@ -619,12 +619,6 @@ var CanvasGraphics = (function canvasGraphics() {
      this.textDivs.push(div);
    },
    showText: function canvasGraphicsShowText(str, skipTextSelection) {
-      function unicodeToChar(unicode) {
-        return (unicode >= 0x10000) ?
-          String.fromCharCode(0xD800 | ((unicode - 0x10000) >> 10),
-          0xDC00 | (unicode & 0x3FF)) : String.fromCharCode(unicode);
-      };
-
      var ctx = this.ctx;
      var current = this.current;
      var font = current.font;
@ -673,7 +667,7 @@ var CanvasGraphics = (function canvasGraphics() {
          ctx.translate(charWidth, 0);
          current.x += charWidth;

-          text.str += unicodeToChar(glyph.unicode);
+          text.str += glyph.fontChar;
          text.length++;
          text.canvasWidth += charWidth;
        }
@ -691,7 +685,7 @@ var CanvasGraphics = (function canvasGraphics() {
            continue;
          }

-          var char = unicodeToChar(glyph.unicode);
+          var char = glyph.fontChar;
          var charWidth = glyph.width * fontSize * 0.001 + charSpacing;
          ctx.fillText(char, width, 0);
          width += charWidth;
--- a/src/evaluator.js
+++ b/src/evaluator.js
@ -512,6 +512,7 @@ var PartialEvaluator = (function partialEvaluator() {
          error('Encoding is not a Name nor a Dict');
        }
      }
+
      properties.differences = differences;
      properties.baseEncoding = baseEncoding;
      properties.hasEncoding = hasEncoding;
@ -554,9 +555,21 @@ var PartialEvaluator = (function partialEvaluator() {
                  var startRange = tokens[j];
                  var endRange = tokens[j + 1];
                  var code = tokens[j + 2];
-                  while (startRange <= endRange) {
-                    charToUnicode[startRange] = code++;
-                    ++startRange;
+                  if (code == 0xFFFF) {
+                    // CMap is broken, assuming code == startRange
+                    code = startRange;
+                  }
+                  if (isArray(code)) {
+                    var codeindex = 0;
+                    while (startRange <= endRange) {
+                      charToUnicode[startRange] = code[codeindex++];
+                      ++startRange;
+                    }
+                  } else {
+                    while (startRange <= endRange) {
+                      charToUnicode[startRange] = code++;
+                      ++startRange;
+                    }
                  }
                }
                break;
@ -595,9 +608,18 @@ var PartialEvaluator = (function partialEvaluator() {
            }
          } else if (byte == 0x3E) {
            if (token.length) {
-              // parsing hex number
-              tokens.push(parseInt(token, 16));
-              token = '';
+              if (token.length <= 4) {
+                // parsing hex number
+                tokens.push(parseInt(token, 16));
+                token = '';
+              } else {
+                // parsing hex UTF-16BE numbers
+                var str = [];
+                for (var i = 0, ii = token.length; i < ii; i += 4)
+                  str.push(parseInt(token.substr(i, 4), 16));
+                tokens.push(String.fromCharCode.apply(String, str));
+                token = '';
+              }
            }
          } else {
            token += String.fromCharCode(byte);
--- a/src/fonts.js
+++ b/src/fonts.js
@ -719,20 +719,10 @@ function getUnicodeRangeFor(value) {
  return -1;
 }

-function adaptUnicode(unicode) {
-  return (unicode <= 0x1F || (unicode >= 127 && unicode < kSizeOfGlyphArea)) ?
-    unicode + kCmapGlyphOffset : unicode;
-}
-
-function isAdaptedUnicode(unicode) {
-  return unicode >= kCmapGlyphOffset &&
-    unicode < kCmapGlyphOffset + kSizeOfGlyphArea;
-}
-
 function isSpecialUnicode(unicode) {
  return (unicode <= 0x1F || (unicode >= 127 && unicode < kSizeOfGlyphArea)) ||
-    unicode >= kCmapGlyphOffset &&
-    unicode < kCmapGlyphOffset + kSizeOfGlyphArea;
+    (unicode >= kCmapGlyphOffset &&
+    unicode < kCmapGlyphOffset + kSizeOfGlyphArea);
 }

 /**
@ -771,7 +761,6 @@ var Font = (function Font() {
    this.widths = properties.widths;
    this.defaultWidth = properties.defaultWidth;
    this.composite = properties.composite;
-    this.toUnicode = properties.toUnicode;
    this.hasEncoding = properties.hasEncoding;

    this.fontMatrix = properties.fontMatrix;
@ -781,6 +770,11 @@ var Font = (function Font() {
    // Trying to fix encoding using glyph CIDSystemInfo.
    this.loadCidToUnicode(properties);

+    if (properties.toUnicode)
+      this.toUnicode = properties.toUnicode;
+    else
+      this.rebuildToUnicode(properties);
+
    if (!file) {
      // The file data is not specified. Trying to fix the font name
      // to be used with the canvas.font.
@ -961,15 +955,15 @@ var Font = (function Font() {
    var ranges = [];
    for (var n = 0; n < length; ) {
      var start = codes[n].unicode;
-      var startCode = codes[n].code;
+      var codeIndices = [codes[n].code];
      ++n;
      var end = start;
      while (n < length && end + 1 == codes[n].unicode) {
+        codeIndices.push(codes[n].code);
        ++end;
        ++n;
      }
-      var endCode = codes[n - 1].code;
-      ranges.push([start, end, startCode, endCode]);
+      ranges.push([start, end, codeIndices]);
    }

    return ranges;
@ -1012,17 +1006,16 @@ var Font = (function Font() {
        idDeltas += string16(0);
        idRangeOffsets += string16(offset);

-        var startCode = range[2];
-        var endCode = range[3];
-        for (var j = startCode; j <= endCode; ++j)
-          glyphsIds += string16(deltas[j]);
+        var codes = range[2];
+        for (var j = 0, jj = codes.length; j < jj; ++j)
+          glyphsIds += string16(deltas[codes[j]]);
      }
    } else {
      for (var i = 0; i < segCount - 1; i++) {
        var range = ranges[i];
        var start = range[0];
        var end = range[1];
-        var startCode = range[2];
+        var startCode = range[2][0];

        startCount += string16(start);
        endCount += string16(end);
@ -1299,7 +1292,7 @@ var Font = (function Font() {
          properties.baseEncoding = encoding;
      }

-      function replaceCMapTable(cmap, font, properties) {
+      function readCMapTable(cmap, font) {
        var start = (font.start ? font.start : 0) + cmap.offset;
        font.pos = start;

@ -1316,7 +1309,7 @@ var Font = (function Font() {
        }

        // Check that table are sorted by platformID then encodingID,
-        records.sort(function fontReplaceCMapTableSort(a, b) {
+        records.sort(function fontReadCMapTableSort(a, b) {
          return ((a.platformID << 16) + a.encodingID) -
                 ((b.platformID << 16) + b.encodingID);
        });
@ -1371,16 +1364,15 @@ var Font = (function Font() {
            for (var j = 0; j < 256; j++) {
              var index = font.getByte();
              if (index) {
-                var unicode = adaptUnicode(j);
-                glyphs.push({ unicode: unicode, code: j });
+                glyphs.push({ unicode: j, code: j });
                ids.push(index);
              }
            }
-
-            properties.hasShortCmap = true;
-
-            createGlyphNameMap(glyphs, ids, properties);
-            return cmap.data = createCMapTable(glyphs, ids);
+            return {
+              glyphs: glyphs,
+              ids: ids,
+              hasShortCmap: true
+            };
          } else if (format == 4) {
            // re-creating the table in format 4 since the encoding
            // might be changed
@ -1432,17 +1424,18 @@ var Font = (function Font() {
                var glyphCode = offsetIndex < 0 ? j :
                  offsets[offsetIndex + j - start];
                glyphCode = (glyphCode + delta) & 0xFFFF;
-                if (glyphCode == 0 || isAdaptedUnicode(j))
+                if (glyphCode == 0)
                  continue;

-                var unicode = adaptUnicode(j);
-                glyphs.push({ unicode: unicode, code: j });
+                glyphs.push({ unicode: j, code: j });
                ids.push(glyphCode);
              }
            }

-            createGlyphNameMap(glyphs, ids, properties);
-            return cmap.data = createCMapTable(glyphs, ids);
+            return {
+              glyphs: glyphs,
+              ids: ids
+            };
          } else if (format == 6) {
            // Format 6 is a 2-bytes dense mapping, which means the font data
            // lives glue together even if they are pretty far in the unicode
@ -1457,19 +1450,18 @@ var Font = (function Font() {
            for (var j = 0; j < entryCount; j++) {
              var glyphCode = int16(font.getBytes(2));
              var code = firstCode + j;
-              if (isAdaptedUnicode(glyphCode))
-                continue;

-              var unicode = adaptUnicode(code);
-              glyphs.push({ unicode: unicode, code: code });
+              glyphs.push({ unicode: code, code: code });
              ids.push(glyphCode);
            }

-            createGlyphNameMap(glyphs, ids, properties);
-            return cmap.data = createCMapTable(glyphs, ids);
+            return {
+              glyphs: glyphs,
+              ids: ids
+            };
          }
        }
-        return cmap.data;
+        error('Unsupported cmap table format');
      };

      function sanitizeMetrics(font, header, metrics, numGlyphs) {
@ -1708,17 +1700,77 @@ var Font = (function Font() {
          tables.push(cmap);
        }

-        var glyphs = [];
-        for (i = 1; i < numGlyphs; i++) {
-          if (isAdaptedUnicode(i))
-            continue;
-
-          glyphs.push({ unicode: adaptUnicode(i) });
+        var cidToGidMap = properties.cidToGidMap || [];
+        var gidToCidMap = [0];
+        for (var j = cidToGidMap.length - 1; j >= 0; j--) {
+          var gid = cidToGidMap[j];
+          if (gid)
+            gidToCidMap[gid] = j;
        }
-        cmap.data = createCMapTable(glyphs);
+
+        var glyphs = [], ids = [];
+        var usedUnicodes = [];
+        var unassignedUnicodeItems = [];
+        for (var i = 1; i < numGlyphs; i++) {
+          var cid = gidToCidMap[i] || i;
+          var unicode = this.toUnicode[cid];
+          if (!unicode || isSpecialUnicode(unicode) ||
+              unicode in usedUnicodes) {
+            unassignedUnicodeItems.push(i);
+            continue;
+          }
+          usedUnicodes[unicode] = true;
+          glyphs.push({ unicode: unicode, code: cid });
+          ids.push(i);
+        }
+        // trying to fit as many unassigned symbols as we can
+        // in the range allocated for the user defined symbols
+        var unusedUnicode = kCmapGlyphOffset;
+        for (var j = 0, jj = unassignedUnicodeItems.length; j < jj; j++) {
+          var i = unassignedUnicodeItems[j];
+          var cid = gidToCidMap[i] || i;
+          while (unusedUnicode in usedUnicodes)
+            unusedUnicode++;
+          if (unusedUnicode >= kCmapGlyphOffset + kSizeOfGlyphArea)
+            break;
+          var unicode = unusedUnicode++;
+          this.toUnicode[cid] = unicode;
+          usedUnicodes[unicode] = true;
+          glyphs.push({ unicode: unicode, code: cid });
+          ids.push(i);
+        }
+        cmap.data = createCMapTable(glyphs, ids);
      } else {
-        replaceCMapTable(cmap, font, properties);
+        var cmapTable = readCMapTable(cmap, font);
+        var glyphs = cmapTable.glyphs;
+        var ids = cmapTable.ids;
+        var hasShortCmap = !!cmapTable.hasShortCmap;
+        var toUnicode = this.toUnicode;
+
+        if (hasShortCmap && toUnicode) {
+          // checking if cmap is just identity map
+          var isIdentity = true;
+          for (var i = 0, ii = glyphs.length; i < ii; i++) {
+            if (glyphs[i].unicode != i + 1) {
+              isIdentity = false;
+              break;
+            }
+          }
+          // if it is, replacing with meaningful toUnicode values
+          if (isIdentity) {
+            for (var i = 0, ii = glyphs.length; i < ii; i++) {
+              var unicode = toUnicode[i + 1] || i + 1;
+              glyphs[i].unicode = unicode;
+            }
+            this.useToUnicode = true;
+          }
+        }
+        properties.hasShortCmap = hasShortCmap;
+
+        createGlyphNameMap(glyphs, ids, properties);
        this.glyphNameMap = properties.glyphNameMap;
+
+        cmap.data = createCMapTable(glyphs, ids);
      }

      // Rewrite the 'post' table if needed
@ -1808,6 +1860,14 @@ var Font = (function Font() {
        }
        properties.baseEncoding = encoding;
      }
+      if (properties.subtype == 'CIDFontType0C') {
+        var toUnicode = [];
+        for (var i = 0; i < charstrings.length; ++i) {
+          var charstring = charstrings[i];
+          toUnicode[charstring.code] = charstring.unicode;
+        }
+        this.toUnicode = toUnicode;
+      }

      var fields = {
        // PostScript Font Program
@ -1868,8 +1928,11 @@ var Font = (function Font() {
        // Horizontal metrics
        'hmtx': (function fontFieldsHmtx() {
          var hmtx = '\x00\x00\x00\x00'; // Fake .notdef
-          for (var i = 0, ii = charstrings.length; i < ii; i++)
-            hmtx += string16(charstrings[i].width) + string16(0);
+          for (var i = 0, ii = charstrings.length; i < ii; i++) {
+            var charstring = charstrings[i];
+            var width = 'width' in charstring ? charstring.width : 0;
+            hmtx += string16(width) + string16(0);
+          }
          return stringToArray(hmtx);
        })(),

@ -1898,17 +1961,35 @@ var Font = (function Font() {
      return stringToArray(otf.file);
    },

-    loadCidToUnicode: function font_loadCidToUnicode(properties) {
-      if (properties.cidToGidMap) {
-        this.cidToUnicode = properties.cidToGidMap;
-        return;
+    rebuildToUnicode: function font_rebuildToUnicode(properties) {
+      var firstChar = properties.firstChar, lastChar = properties.lastChar;
+      var map = [];
+      if (properties.composite) {
+        var isIdentityMap = this.cidToUnicode.length == 0;
+        for (var i = firstChar, ii = lastChar; i <= ii; i++) {
+          // TODO missing map the character according font's CMap
+          var cid = i;
+          map[i] = isIdentityMap ? cid : this.cidToUnicode[cid];
+        }
+      } else {
+        for (var i = firstChar, ii = lastChar; i <= ii; i++) {
+          var glyph = properties.differences[i];
+          if (!glyph)
+            glyph = properties.baseEncoding[i];
+          if (!!glyph && (glyph in GlyphsUnicode))
+            map[i] = GlyphsUnicode[glyph];
+        }
      }
+      this.toUnicode = map;
+    },

+    loadCidToUnicode: function font_loadCidToUnicode(properties) {
      if (!properties.cidSystemInfo)
        return;

-      var cidToUnicodeMap = [];
+      var cidToUnicodeMap = [], unicodeToCIDMap = [];
      this.cidToUnicode = cidToUnicodeMap;
+      this.unicodeToCID = unicodeToCIDMap;

      var cidSystemInfo = properties.cidSystemInfo;
      var cidToUnicode;
@ -1920,28 +2001,34 @@ var Font = (function Font() {
      if (!cidToUnicode)
        return; // identity encoding

-      var glyph = 1, i, j, k, ii;
+      var cid = 1, i, j, k, ii;
      for (i = 0, ii = cidToUnicode.length; i < ii; ++i) {
        var unicode = cidToUnicode[i];
        if (isArray(unicode)) {
          var length = unicode.length;
-          for (j = 0; j < length; j++)
-            cidToUnicodeMap[unicode[j]] = glyph;
-          glyph++;
+          for (j = 0; j < length; j++) {
+            cidToUnicodeMap[cid] = unicode[j];
+            unicodeToCIDMap[unicode[j]] = cid;
+          }
+          cid++;
        } else if (typeof unicode === 'object') {
          var fillLength = unicode.f;
          if (fillLength) {
            k = unicode.c;
            for (j = 0; j < fillLength; ++j) {
-              cidToUnicodeMap[k] = glyph++;
+              cidToUnicodeMap[cid] = k;
+              unicodeToCIDMap[k] = cid;
+              cid++;
              k++;
            }
          } else
-            glyph += unicode.s;
+            cid += unicode.s;
        } else if (unicode) {
-          cidToUnicodeMap[unicode] = glyph++;
+          cidToUnicodeMap[cid] = unicode;
+          unicodeToCIDMap[unicode] = cid;
+          cid++;
        } else
-          glyph++;
+          cid++;
      }
    },

@ -1981,19 +2068,19 @@ var Font = (function Font() {
      switch (this.type) {
        case 'CIDFontType0':
          if (this.noUnicodeAdaptation) {
-            width = this.widths[this.cidToUnicode[charcode]];
+            width = this.widths[this.unicodeToCID[charcode] || charcode];
            unicode = charcode;
            break;
          }
-          unicode = adaptUnicode(this.cidToUnicode[charcode] || charcode);
+          unicode = this.toUnicode[charcode] || charcode;
          break;
        case 'CIDFontType2':
          if (this.noUnicodeAdaptation) {
-            width = this.widths[this.cidToUnicode[charcode]];
+            width = this.widths[this.unicodeToCID[charcode] || charcode];
            unicode = charcode;
            break;
          }
-          unicode = adaptUnicode(this.cidToUnicode[charcode] || charcode);
+          unicode = this.toUnicode[charcode] || charcode;
          break;
        case 'Type1':
          var glyphName = this.differences[charcode] || this.encoding[charcode];
@ -2004,7 +2091,7 @@ var Font = (function Font() {
            break;
          }
          unicode = this.glyphNameMap[glyphName] ||
-            adaptUnicode(GlyphsUnicode[glyphName] || charcode);
+            GlyphsUnicode[glyphName] || charcode;
          break;
        case 'Type3':
          var glyphName = this.differences[charcode] || this.encoding[charcode];
@ -2022,16 +2109,16 @@ var Font = (function Font() {
            break;
          }
          if (!this.hasEncoding) {
-            unicode = adaptUnicode(charcode);
+            unicode = this.useToUnicode ? this.toUnicode[charcode] : charcode;
            break;
          }
-          if (this.hasShortCmap) {
+          if (this.hasShortCmap && false) {
            var j = Encodings.MacRomanEncoding.indexOf(glyphName);
-            unicode = j >= 0 && !isSpecialUnicode(j) ? j :
+            unicode = j >= 0 ? j :
              this.glyphNameMap[glyphName];
          } else {
            unicode = glyphName in GlyphsUnicode ?
-              adaptUnicode(GlyphsUnicode[glyphName]) :
+              GlyphsUnicode[glyphName] :
              this.glyphNameMap[glyphName];
          }
          break;
@ -2039,8 +2126,14 @@ var Font = (function Font() {
          warn('Unsupported font type: ' + this.type);
          break;
      }
+
+      var unicodeChars = this.toUnicode ? this.toUnicode[charcode] : charcode;
+      if (typeof unicodeChars === 'number')
+        unicodeChars = String.fromCharCode(unicodeChars);
+
      return {
-        unicode: unicode,
+        fontChar: String.fromCharCode(unicode),
+        unicode: unicodeChars,
        width: isNum(width) ? width : this.defaultWidth,
        codeIRQueue: codeIRQueue
      };
@ -2753,22 +2846,13 @@ CFF.prototype = {
  getOrderedCharStrings: function cff_getOrderedCharStrings(glyphs,
                                                            properties) {
    var charstrings = [];
-    var reverseMapping = {};
-    var encoding = properties.baseEncoding;
    var i, length, glyphName;
-    for (i = 0, length = encoding.length; i < length; ++i) {
-      glyphName = encoding[i];
-      if (!glyphName || isSpecialUnicode(i))
-        continue;
-      reverseMapping[glyphName] = i;
-    }
-    reverseMapping['.notdef'] = 0;
    var unusedUnicode = kCmapGlyphOffset;
    for (i = 0, length = glyphs.length; i < length; i++) {
      var item = glyphs[i];
      var glyphName = item.glyph;
-      var unicode = glyphName in reverseMapping ?
-        reverseMapping[glyphName] : unusedUnicode++;
+      var unicode = glyphName in GlyphsUnicode ?
+        GlyphsUnicode[glyphName] : unusedUnicode++;
      charstrings.push({
        glyph: glyphName,
        unicode: unicode,
@ -3055,16 +3139,14 @@ var Type2CFF = (function type2CFF() {
      }

      var charStrings = this.parseIndex(topDict.CharStrings);
-      var charset = this.parseCharsets(topDict.charset,
-                                       charStrings.length, strings);
-      var encoding = this.parseEncoding(topDict.Encoding, properties,
-                                             strings, charset);

      var charset, encoding;
      var isCIDFont = properties.subtype == 'CIDFontType0C';
      if (isCIDFont) {
-        charset = [];
-        charset.length = charStrings.length;
+        charset = ['.notdef'];
+        for (var i = 1, ii = charStrings.length; i < ii; ++i)
+          charset.push('glyph' + i);
+
        encoding = this.parseCidMap(topDict.charset,
                                    charStrings.length);
      } else {
@ -3133,38 +3215,44 @@ var Type2CFF = (function type2CFF() {
      var charstrings = [];
      var unicodeUsed = [];
      var unassignedUnicodeItems = [];
+      var inverseEncoding = [];
+      for (var charcode in encoding)
+        inverseEncoding[encoding[charcode]] = charcode | 0;
      for (var i = 0, ii = charsets.length; i < ii; i++) {
        var glyph = charsets[i];
-        var encodingFound = false;
-        for (var charcode in encoding) {
-          if (encoding[charcode] == i) {
-            var code = charcode | 0;
-            charstrings.push({
-              unicode: adaptUnicode(code),
-              code: code,
-              gid: i,
-              glyph: glyph
-            });
-            unicodeUsed[code] = true;
-            encodingFound = true;
-            break;
-          }
+        if (glyph == '.notdef') {
+          charstrings.push({
+            unicode: 0,
+            code: 0,
+            gid: i,
+            glyph: glyph
+          });
+          continue;
        }
-        if (!encodingFound) {
+        var code = inverseEncoding[i];
+        if (!code || isSpecialUnicode(code)) {
          unassignedUnicodeItems.push(i);
+          continue;
        }
+        charstrings.push({
+          unicode: code,
+          code: code,
+          gid: i,
+          glyph: glyph
+        });
+        unicodeUsed[code] = true;
      }

-      var nextUnusedUnicode = 0x21;
+      var nextUnusedUnicode = kCmapGlyphOffset;
      for (var j = 0, jj = unassignedUnicodeItems.length; j < jj; ++j) {
        var i = unassignedUnicodeItems[j];
        // giving unicode value anyway
-        while (unicodeUsed[nextUnusedUnicode])
+        while (nextUnusedUnicode in unicodeUsed)
          nextUnusedUnicode++;
-        var code = nextUnusedUnicode++;
+        var unicode = nextUnusedUnicode++;
        charstrings.push({
-          unicode: adaptUnicode(code),
-          code: code,
+          unicode: unicode,
+          code: inverseEncoding[i] || 0,
          gid: i,
          glyph: charsets[i]
        });