Merge pull request #1218 from mozilla/bidi

bidi characters algorithm; separation of the toFontChar and toUnicode
2012-02-23 07:00:48 -08:00 · 2012-02-23 07:00:48 -08:00 · 676e575a52
commit 676e575a52
parent 92e2723cbc 3b29736882
11 changed files with 580 additions and 44 deletions
--- a/1
+++ b/1
@ -38,6 +38,7 @@ PDF_JS_FILES = \
  worker.js \
  ../external/jpgjs/jpg.js \
  jpx.js \
+  bidi.js \
 	$(NULL)

 # make server
--- a/src/bidi.js
+++ b/src/bidi.js
@ -0,0 +1,433 @@
+/* -*- Mode: Java; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim: set shiftwidth=2 tabstop=2 autoindent cindent expandtab: */
+
+'use strict';
+
+var bidi = (function bidiClosure() {
+  /* Character types for symbols from 0000 to 00FF.
+     The table is indexed directly by the character code; each entry is the
+     abbreviation of a bidirectional character type (for example 'L', 'EN',
+     'ET', 'CS', 'WS', 'ON', 'B', 'S' or 'BN'). */
+  var baseTypes = [
+    'BN', 'BN', 'BN', 'BN', 'BN', 'BN', 'BN', 'BN', 'BN', 'S', 'B', 'S', 'WS',
+    'B', 'BN', 'BN', 'BN', 'BN', 'BN', 'BN', 'BN', 'BN', 'BN', 'BN', 'BN', 'BN',
+    'BN', 'BN', 'B', 'B', 'B', 'S', 'WS', 'ON', 'ON', 'ET', 'ET', 'ET', 'ON',
+    'ON', 'ON', 'ON', 'ON', 'ON', 'CS', 'ON', 'CS', 'ON', 'EN', 'EN', 'EN',
+    'EN', 'EN', 'EN', 'EN', 'EN', 'EN', 'EN', 'ON', 'ON', 'ON', 'ON', 'ON',
+    'ON', 'ON', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L',
+    'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'ON', 'ON',
+    'ON', 'ON', 'ON', 'ON', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L',
+    'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L',
+    'L', 'ON', 'ON', 'ON', 'ON', 'BN', 'BN', 'BN', 'BN', 'BN', 'BN', 'B', 'BN',
+    'BN', 'BN', 'BN', 'BN', 'BN', 'BN', 'BN', 'BN', 'BN', 'BN', 'BN', 'BN',
+    'BN', 'BN', 'BN', 'BN', 'BN', 'BN', 'BN', 'BN', 'BN', 'BN', 'BN', 'BN',
+    'BN', 'CS', 'ON', 'ET', 'ET', 'ET', 'ET', 'ON', 'ON', 'ON', 'ON', 'L', 'ON',
+    'ON', 'ON', 'ON', 'ON', 'ET', 'ET', 'EN', 'EN', 'ON', 'L', 'ON', 'ON', 'ON',
+    'EN', 'L', 'ON', 'ON', 'ON', 'ON', 'ON', 'L', 'L', 'L', 'L', 'L', 'L', 'L',
+    'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L',
+    'L', 'ON', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L',
+    'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L',
+    'L', 'L', 'L', 'ON', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L'
+  ];
+
+  /* Character types for symbols from 0600 to 06FF.
+     The table is indexed by the low byte of the character code
+     (charCode & 0xff); each entry is the abbreviation of a bidirectional
+     character type (for example 'AL', 'AN', 'NSM', 'CS', 'ET' or 'ON'). */
+  var arabicTypes = [
+    'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL',
+    'CS', 'AL', 'ON', 'ON', 'NSM', 'NSM', 'NSM', 'NSM', 'NSM', 'NSM', 'AL',
+    'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL',
+    'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL',
+    'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL',
+    'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL',
+    'AL', 'AL', 'AL', 'AL', 'NSM', 'NSM', 'NSM', 'NSM', 'NSM', 'NSM', 'NSM',
+    'NSM', 'NSM', 'NSM', 'NSM', 'NSM', 'NSM', 'NSM', 'AL', 'AL', 'AL', 'AL',
+    'AL', 'AL', 'AL', 'AN', 'AN', 'AN', 'AN', 'AN', 'AN', 'AN', 'AN', 'AN',
+    'AN', 'ET', 'AN', 'AN', 'AL', 'AL', 'AL', 'NSM', 'AL', 'AL', 'AL', 'AL',
+    'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL',
+    'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL',
+    'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL',
+    'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL',
+    'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL',
+    'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL',
+    'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL',
+    'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL',
+    'AL', 'NSM', 'NSM', 'NSM', 'NSM', 'NSM', 'NSM', 'NSM', 'NSM', 'NSM', 'NSM',
+    'NSM', 'NSM', 'NSM', 'NSM', 'NSM', 'NSM', 'NSM', 'NSM', 'NSM', 'ON', 'NSM',
+    'NSM', 'NSM', 'NSM', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL',
+    'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL'
+  ];
+
+  // True when the lowest bit of i is set, i.e. i is an odd integer.
+  function isOdd(i) {
+    return (i & 1) === 1;
+  }
+
+  // True when the lowest bit of i is clear, i.e. i is an even integer.
+  function isEven(i) {
+    return !(i & 1);
+  }
+
+  // Scans arr from index start and returns the index of the first element
+  // that is not (loosely) equal to value. When no such element exists the
+  // scan position past the last examined element is returned, which is
+  // arr.length for any start inside the array.
+  function findUnequal(arr, start, value) {
+    var pos = start;
+    var limit = arr.length;
+    while (pos < limit && arr[pos] == value)
+      ++pos;
+    return pos;
+  }
+
+  // Writes value into every slot of arr from start up to, but not
+  // including, end.
+  function setValues(arr, start, end, value) {
+    var pos = start;
+    while (pos < end) {
+      arr[pos] = value;
+      ++pos;
+    }
+  }
+
+  // Reverses, in place, the elements of arr from start up to, but not
+  // including, end by swapping from both ends towards the middle.
+  function reverseValues(arr, start, end) {
+    var lo = start;
+    var hi = end - 1;
+    while (lo < hi) {
+      var held = arr[hi];
+      arr[hi] = arr[lo];
+      arr[lo] = held;
+      ++lo;
+      --hi;
+    }
+  }
+
+  // Returns the mirrored counterpart of a single-character string c for the
+  // pairs listed below (taken from BidiMirroring-1.txt); any other value is
+  // returned unchanged.
+  //   0028 <-> 0029  parentheses
+  //   003C <-> 003E  less-than / greater-than signs
+  //   005B <-> 005D  square brackets
+  //   007B <-> 007D  curly brackets
+  //   00AB <-> 00BB  double angle quotation marks
+  function mirrorGlyphs(c) {
+    var counterparts = {
+      '(': ')',
+      ')': '(',
+      '<': '>',
+      '>': '<',
+      ']': '[',
+      '[': ']',
+      '}': '{',
+      '{': '}',
+      '\u00AB': '\u00BB',
+      '\u00BB': '\u00AB'
+    };
+    // only genuine strings that are own keys of the table are mirrored, so
+    // inherited property names and non-string values pass through untouched
+    var known = typeof c === 'string' &&
+                Object.prototype.hasOwnProperty.call(counterparts, c);
+    if (known)
+      return counterparts[c];
+    return c;
+  }
+
+  /*
+   Reorders a string from logical order into visual order using a simplified
+   form of the Unicode Bidirectional Algorithm. Explicit embeddings (X1-X10)
+   and rules L1, L3 and L4 are not applied.
+
+   text       - object whose 'str' property holds the string to reorder. For
+                a non-empty string its 'direction' property is set to 'ltr'
+                when the string has no right-to-left characters, and to 'ltr'
+                or 'rtl' when startLevel is -1; otherwise it is not touched.
+   startLevel - paragraph embedding level (even is left-to-right, odd is
+                right-to-left), or -1 to derive it from the share of
+                right-to-left characters in the string.
+
+   Returns the reordered string. An empty string, or one without any
+   right-to-left character, is returned unchanged.
+   */
+  return (function bidi(text, startLevel) {
+    var str = text.str;
+    var strLength = str.length;
+    if (strLength == 0)
+      return str;
+
+    // get types, fill arrays
+
+    var chars = new Array(strLength);
+    var types = new Array(strLength);
+    var numBidi = 0;
+
+    for (var i = 0; i < strLength; ++i) {
+      chars[i] = str.charAt(i);
+
+      var charCode = str.charCodeAt(i);
+      var charType = 'L';
+      if (charCode <= 0x00ff)
+        charType = baseTypes[charCode];
+      else if (0x0590 <= charCode && charCode <= 0x05f4)
+        charType = 'R';
+      else if (0x0600 <= charCode && charCode <= 0x06ff)
+        charType = arabicTypes[charCode & 0xff];
+      else if (0x0700 <= charCode && charCode <= 0x08AC)
+        charType = 'AL';
+
+      if (charType == 'R' || charType == 'AL' || charType == 'AN')
+        numBidi++;
+
+      types[i] = charType;
+    }
+
+    // detect the bidi method
+    //  if there are no rtl characters then no bidi needed
+    //  if less than 30% chars are rtl then string is primarily ltr
+    //  otherwise the string is primarily rtl
+    if (numBidi == 0) {
+      text.direction = 'ltr';
+      return str;
+    }
+
+    if (startLevel == -1) {
+      // The share of rtl characters is numBidi / strLength. The inverse
+      // ratio is never below 1, so testing it would always select rtl.
+      if ((numBidi / strLength) < 0.3) {
+        text.direction = 'ltr';
+        startLevel = 0;
+      } else {
+        text.direction = 'rtl';
+        startLevel = 1;
+      }
+    }
+
+    var levels = new Array(strLength);
+
+    for (var i = 0; i < strLength; ++i) {
+      levels[i] = startLevel;
+    }
+
+    /*
+     X1-X10: skip most of this, since we are NOT doing the embeddings.
+     */
+
+    var e = isOdd(startLevel) ? 'R' : 'L';
+    var sor = e;
+    var eor = sor;
+
+    /*
+     W1. Examine each non-spacing mark (NSM) in the level run, and change the
+     type of the NSM to the type of the previous character. If the NSM is at the
+     start of the level run, it will get the type of sor.
+     */
+
+    var lastType = sor;
+    for (var i = 0; i < strLength; ++i) {
+      if (types[i] == 'NSM')
+        types[i] = lastType;
+      else
+        lastType = types[i];
+    }
+
+    /*
+     W2. Search backwards from each instance of a European number until the
+     first strong type (R, L, AL, or sor) is found.  If an AL is found, change
+     the type of the European number to Arabic number.
+     */
+
+    var lastType = sor;
+    for (var i = 0; i < strLength; ++i) {
+      var t = types[i];
+      if (t == 'EN')
+        types[i] = (lastType == 'AL') ? 'AN' : 'EN';
+      else if (t == 'R' || t == 'L' || t == 'AL')
+        lastType = t;
+    }
+
+    /*
+     W3. Change all ALs to R.
+     */
+
+    for (var i = 0; i < strLength; ++i) {
+      var t = types[i];
+      if (t == 'AL')
+        types[i] = 'R';
+    }
+
+    /*
+     W4. A single European separator between two European numbers changes to a
+     European number. A single common separator between two numbers of the same
+     type changes to that type:
+     */
+
+    // NOTE(review): neither type table in this file contains an 'ES' entry,
+    // so the first test below looks unreachable with the current tables.
+    for (var i = 1; i < strLength - 1; ++i) {
+      if (types[i] == 'ES' && types[i - 1] == 'EN' && types[i + 1] == 'EN')
+        types[i] = 'EN';
+      if (types[i] == 'CS' && (types[i - 1] == 'EN' || types[i - 1] == 'AN') &&
+          types[i + 1] == types[i - 1])
+        types[i] = types[i - 1];
+    }
+
+    /*
+     W5. A sequence of European terminators adjacent to European numbers changes
+     to all European numbers:
+     */
+
+    for (var i = 0; i < strLength; ++i) {
+      if (types[i] == 'EN') {
+        // do before
+        for (var j = i - 1; j >= 0; --j) {
+          if (types[j] != 'ET')
+            break;
+          types[j] = 'EN';
+        }
+        // do after: walk forward so the whole run of terminators that
+        // follows the number is converted, not just its first character
+        for (var j = i + 1; j < strLength; ++j) {
+          if (types[j] != 'ET')
+            break;
+          types[j] = 'EN';
+        }
+      }
+    }
+
+    /*
+     W6. Otherwise, separators and terminators change to Other Neutral:
+     */
+
+    for (var i = 0; i < strLength; ++i) {
+      var t = types[i];
+      if (t == 'WS' || t == 'ES' || t == 'ET' || t == 'CS')
+        types[i] = 'ON';
+    }
+
+    /*
+     W7. Search backwards from each instance of a European number until the
+     first strong type (R, L, or sor) is found. If an L is found,  then change
+     the type of the European number to L.
+     */
+
+    var lastType = sor;
+    for (var i = 0; i < strLength; ++i) {
+      var t = types[i];
+      if (t == 'EN')
+        types[i] = (lastType == 'L') ? 'L' : 'EN';
+      else if (t == 'R' || t == 'L')
+        lastType = t;
+    }
+
+    /*
+     N1. A sequence of neutrals takes the direction of the surrounding strong
+     text if the text on both sides has the same direction. European and Arabic
+     numbers are treated as though they were R. Start-of-level-run (sor) and
+     end-of-level-run (eor) are used at level run boundaries.
+     */
+
+    for (var i = 0; i < strLength; ++i) {
+      if (types[i] == 'ON') {
+        // the run of neutrals covers the indexes i .. end - 1
+        var end = findUnequal(types, i + 1, 'ON');
+        var before = sor;
+        if (i > 0)
+          before = types[i - 1];
+        var after = eor;
+        // findUnequal already points at the first character after the run
+        // (or at strLength when the run reaches the end of the string)
+        if (end < strLength)
+          after = types[end];
+        if (before != 'L')
+          before = 'R';
+        if (after != 'L')
+          after = 'R';
+        if (before == after)
+          setValues(types, i, end, before);
+        i = end - 1; // reset to end (-1 so next iteration is ok)
+      }
+    }
+
+    /*
+     N2. Any remaining neutrals take the embedding direction.
+     */
+
+    for (var i = 0; i < strLength; ++i) {
+      if (types[i] == 'ON')
+        types[i] = e;
+    }
+
+    /*
+     I1. For all characters with an even (left-to-right) embedding direction,
+     those of type R go up one level and those of type AN or EN go up two
+     levels.
+     I2. For all characters with an odd (right-to-left) embedding direction,
+     those of type L, EN or AN go up one level.
+     */
+
+    for (var i = 0; i < strLength; ++i) {
+      var t = types[i];
+      if (isEven(levels[i])) {
+        if (t == 'R') {
+          levels[i] += 1;
+        } else if (t == 'AN' || t == 'EN') {
+          levels[i] += 2;
+        }
+      } else { // isOdd, so
+        if (t == 'L' || t == 'AN' || t == 'EN') {
+          levels[i] += 1;
+        }
+      }
+    }
+
+    /*
+     L1. On each line, reset the embedding level of the following characters to
+     the paragraph embedding level:
+
+     segment separators,
+     paragraph separators,
+     any sequence of whitespace characters preceding a segment separator or
+     paragraph separator, and any sequence of white space characters at the end
+     of the line.
+     */
+
+    // don't bother as text is only single line
+
+    /*
+     L2. From the highest level found in the text to the lowest odd level on
+     each line, reverse any contiguous sequence of characters that are at that
+     level or higher.
+     */
+
+    // find highest level & lowest odd level
+
+    var highestLevel = -1;
+    var lowestOddLevel = 99;
+    for (var i = 0, ii = levels.length; i < ii; ++i) {
+      var level = levels[i];
+      if (highestLevel < level)
+        highestLevel = level;
+      if (lowestOddLevel > level && isOdd(level))
+        lowestOddLevel = level;
+    }
+
+    // now reverse between those limits
+
+    for (var level = highestLevel; level >= lowestOddLevel; --level) {
+      // find segments to reverse
+      var start = -1;
+      for (var i = 0, ii = levels.length; i < ii; ++i) {
+        if (levels[i] < level) {
+          if (start >= 0) {
+            reverseValues(chars, start, i);
+            start = -1;
+          }
+        } else if (start < 0) {
+          start = i;
+        }
+      }
+      if (start >= 0) {
+        reverseValues(chars, start, levels.length);
+      }
+    }
+
+    /*
+     L3. Combining marks applied to a right-to-left base character will at this
+     point precede their base character. If the rendering engine expects them to
+     follow the base characters in the final display process, then the ordering
+     of the marks and the base character must be reversed.
+     */
+
+    // don't bother for now
+
+    /*
+     L4. A character that possesses the mirrored property as specified by
+     Section 4.7, Mirrored, must be depicted by a mirrored glyph if the resolved
+     directionality of that character is R.
+     */
+
+    // don't mirror as characters are already mirrored in the pdf
+
+    // Finally, return string
+
+    // NOTE(review): '<' and '>' are left out of the result; the reason is
+    // not visible in this file - confirm against the callers.
+    var result = '';
+    for (var i = 0, ii = chars.length; i < ii; ++i) {
+      var ch = chars[i];
+      if (ch != '<' && ch != '>')
+        result += ch;
+    }
+    return result;
+  });
+})();
--- a/src/canvas.js
+++ b/src/canvas.js
@ -778,8 +778,16 @@ var CanvasGraphics = (function CanvasGraphicsClosure() {

          x += charWidth;

-          text.str += glyph.unicode === ' ' ? '\u00A0' : glyph.unicode;
-          text.length++;
+          var glyphUnicode = glyph.unicode === ' ' ? '\u00A0' : glyph.unicode;
+          var glyphUnicodeLength = glyphUnicode.length;
+          //reverse an arabic ligature
+          if (glyphUnicodeLength > 1 &&
+              isRTLRangeFor(glyphUnicode.charCodeAt(0))) {
+            for (var ii = glyphUnicodeLength - 1; ii >= 0; ii--)
+              text.str += glyphUnicode[ii];
+          } else
+            text.str += glyphUnicode;
+          text.length += glyphUnicodeLength;
          text.canvasWidth += charWidth;
        }
        current.x += x * textHScale2;
@ -845,7 +853,7 @@ var CanvasGraphics = (function CanvasGraphicsClosure() {
              text.str += shownText.str;
            }
            text.canvasWidth += shownText.canvasWidth;
-            text.length += e.length;
+            text.length += shownText.length;
          }
        } else {
          malformed('TJ array element ' + e + ' is not string or num');
--- a/src/evaluator.js
+++ b/src/evaluator.js
@ -620,8 +620,18 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
              } else {
                // parsing hex UTF-16BE numbers
                var str = [];
-                for (var i = 0, ii = token.length; i < ii; i += 4)
-                  str.push(parseInt(token.substr(i, 4), 16));
+                for (var k = 0, kk = token.length; k < kk; k += 4) {
+                  var b = parseInt(token.substr(k, 4), 16);
+                  if (b <= 0x10) {
+                    k += 4;
+                    b = (b << 16) | parseInt(token.substr(k, 4), 16);
+                    b -= 0x10000;
+                    str.push(0xD800 | (b >> 10));
+                    str.push(0xDC00 | (b & 0x3FF));
+                    break;
+                  }
+                  str.push(b);
+                }
                tokens.push(String.fromCharCode.apply(String, str));
                token = '';
              }
--- a/src/fonts.js
+++ b/src/fonts.js
@ -736,6 +736,16 @@ function getUnicodeRangeFor(value) {
  return -1;
 }

+// Returns true when value lies in [begin, end) of UnicodeRanges[13] or of
+// UnicodeRanges[11], and false otherwise.
+// NOTE(review): presumably these two entries are the right-to-left script
+// ranges - confirm against the UnicodeRanges table.
+function isRTLRangeFor(value) {
+  var candidates = [13, 11];
+  for (var n = 0; n < candidates.length; n++) {
+    var range = UnicodeRanges[candidates[n]];
+    if (range.begin <= value && value < range.end)
+      return true;
+  }
+  return false;
+}
+
 function isSpecialUnicode(unicode) {
  return (unicode <= 0x1F || (unicode >= 127 && unicode < kSizeOfGlyphArea)) ||
    (unicode >= kCmapGlyphOffset &&
@ -796,6 +806,8 @@ var Font = (function FontClosure() {
    else
      this.rebuildToUnicode(properties);

+    this.toFontChar = this.buildToFontChar(this.toUnicode);
+
    if (!file) {
      // The file data is not specified. Trying to fix the font name
      // to be used with the canvas.font.
@ -1817,9 +1829,9 @@ var Font = (function FontClosure() {
        var unassignedUnicodeItems = [];
        for (var i = 1; i < numGlyphs; i++) {
          var cid = gidToCidMap[i] || i;
-          var unicode = this.toUnicode[cid];
-          if (!unicode || isSpecialUnicode(unicode) ||
-              unicode in usedUnicodes) {
+          var unicode = this.toFontChar[cid];
+          if (!unicode || typeof unicode !== 'number' ||
+              isSpecialUnicode(unicode) || unicode in usedUnicodes) {
            unassignedUnicodeItems.push(i);
            continue;
          }
@ -1838,7 +1850,7 @@ var Font = (function FontClosure() {
          if (unusedUnicode >= kCmapGlyphOffset + kSizeOfGlyphArea)
            break;
          var unicode = unusedUnicode++;
-          this.toUnicode[cid] = unicode;
+          this.toFontChar[cid] = unicode;
          usedUnicodes[unicode] = true;
          glyphs.push({ unicode: unicode, code: cid });
          ids.push(i);
@ -1849,9 +1861,9 @@ var Font = (function FontClosure() {
        var glyphs = cmapTable.glyphs;
        var ids = cmapTable.ids;
        var hasShortCmap = !!cmapTable.hasShortCmap;
-        var toUnicode = this.toUnicode;
+        var toFontChar = this.toFontChar;

-        if (toUnicode && toUnicode.length > 0) {
+        if (toFontChar && toFontChar.length > 0) {
          // checking if cmap is just identity map
          var isIdentity = true;
          for (var i = 0, ii = glyphs.length; i < ii; i++) {
@ -1864,8 +1876,9 @@ var Font = (function FontClosure() {
          if (isIdentity && !this.isSymbolicFont) {
            var usedUnicodes = [], unassignedUnicodeItems = [];
            for (var i = 0, ii = glyphs.length; i < ii; i++) {
-              var unicode = toUnicode[i + 1];
-              if (!unicode || unicode in usedUnicodes) {
+              var unicode = toFontChar[i + 1];
+              if (!unicode || typeof unicode !== 'number' ||
+                  unicode in usedUnicodes) {
                unassignedUnicodeItems.push(i);
                continue;
              }
@ -1879,11 +1892,11 @@ var Font = (function FontClosure() {
                unusedUnicode++;
              var cid = i + 1;
              // override only if unicode mapping is not specified
-              if (!(cid in toUnicode))
-                toUnicode[cid] = unusedUnicode;
+              if (!(cid in toFontChar))
+                toFontChar[cid] = unusedUnicode;
              glyphs[i].unicode = unusedUnicode++;
            }
-            this.useToUnicode = true;
+            this.useToFontChar = true;
          }
        }

@ -1914,13 +1927,13 @@ var Font = (function FontClosure() {
        }

        // Moving all symbolic font glyphs into 0xF000 - 0xF0FF range.
-        this.symbolicGlyphsOffset = 0;
        if (this.isSymbolicFont) {
          for (var i = 0, ii = glyphs.length; i < ii; i++) {
-            var code = glyphs[i].unicode;
-            glyphs[i].unicode = kSymbolicFontGlyphOffset | (code & 0xFF);
+            var code = glyphs[i].unicode & 0xFF;
+            var fontCharCode = kSymbolicFontGlyphOffset | code;
+            glyphs[i].unicode = toFontChar[code] = fontCharCode;
          }
-          this.symbolicGlyphsOffset = kSymbolicFontGlyphOffset;
+          this.useToFontChar = true;
        }

        // remove glyph references outside range of avaialable glyphs
@ -2023,12 +2036,12 @@ var Font = (function FontClosure() {
        properties.baseEncoding = encoding;
      }
      if (properties.subtype == 'CIDFontType0C') {
-        var toUnicode = [];
+        var toFontChar = [];
        for (var i = 0; i < charstrings.length; ++i) {
          var charstring = charstrings[i];
-          toUnicode[charstring.code] = charstring.unicode;
+          toFontChar[charstring.code] = charstring.unicode;
        }
-        this.toUnicode = toUnicode;
+        this.toFontChar = toFontChar;
      }

      var fields = {
@ -2123,6 +2136,19 @@ var Font = (function FontClosure() {
      return stringToArray(otf.file);
    },

+    // Builds the code-to-font-character map from a toUnicode map. An entry
+    // whose typeof is 'object' receives the next code counted up from
+    // kCmapGlyphOffset, any other defined entry is copied as is, and
+    // undefined entries stay unset in the result.
+    buildToFontChar: function font_buildToFontChar(toUnicode) {
+      var fontChars = [];
+      var nextFreeCode = kCmapGlyphOffset;
+      var count = toUnicode.length;
+      for (var code = 0; code < count; code++) {
+        var mapped = toUnicode[code];
+        if (typeof mapped === 'undefined')
+          continue;
+        if (typeof mapped === 'object')
+          fontChars[code] = nextFreeCode++;
+        else
+          fontChars[code] = mapped;
+      }
+      return fontChars;
+    },
+
    rebuildToUnicode: function font_rebuildToUnicode(properties) {
      var firstChar = properties.firstChar, lastChar = properties.lastChar;
      var map = [];
@ -2258,7 +2284,7 @@ var Font = (function FontClosure() {
    },

    charToGlyph: function fonts_charToGlyph(charcode) {
-      var unicode, width, codeIRQueue;
+      var fontCharCode, width, codeIRQueue;

      var width = this.widths[charcode];

@ -2266,38 +2292,39 @@ var Font = (function FontClosure() {
        case 'CIDFontType0':
          if (this.noUnicodeAdaptation) {
            width = this.widths[this.unicodeToCID[charcode] || charcode];
-            unicode = mapPrivateUseChars(charcode);
+            fontCharCode = mapPrivateUseChars(charcode);
            break;
          }
-          unicode = this.toUnicode[charcode] || charcode;
+          fontCharCode = this.toFontChar[charcode] || charcode;
          break;
        case 'CIDFontType2':
          if (this.noUnicodeAdaptation) {
            width = this.widths[this.unicodeToCID[charcode] || charcode];
-            unicode = mapPrivateUseChars(charcode);
+            fontCharCode = mapPrivateUseChars(charcode);
            break;
          }
-          unicode = this.toUnicode[charcode] || charcode;
+          fontCharCode = this.toFontChar[charcode] || charcode;
          break;
        case 'Type1':
          var glyphName = this.differences[charcode] || this.encoding[charcode];
          if (!isNum(width))
            width = this.widths[glyphName];
          if (this.noUnicodeAdaptation) {
-            unicode = mapPrivateUseChars(GlyphsUnicode[glyphName] || charcode);
+            fontCharCode = mapPrivateUseChars(GlyphsUnicode[glyphName] ||
+              charcode);
            break;
          }
-          unicode = this.glyphNameMap[glyphName] ||
+          fontCharCode = this.glyphNameMap[glyphName] ||
            GlyphsUnicode[glyphName] || charcode;
          break;
        case 'Type3':
          var glyphName = this.differences[charcode] || this.encoding[charcode];
          codeIRQueue = this.charProcIRQueues[glyphName];
-          unicode = charcode;
+          fontCharCode = charcode;
          break;
        case 'TrueType':
-          if (this.useToUnicode) {
-            unicode = this.toUnicode[charcode] || charcode;
+          if (this.useToFontChar) {
+            fontCharCode = this.toFontChar[charcode] || charcode;
            break;
          }
          var glyphName = this.differences[charcode] || this.encoding[charcode];
@ -2306,17 +2333,17 @@ var Font = (function FontClosure() {
          if (!isNum(width))
            width = this.widths[glyphName];
          if (this.noUnicodeAdaptation) {
-            unicode = GlyphsUnicode[glyphName] || charcode;
+            fontCharCode = GlyphsUnicode[glyphName] || charcode;
            break;
          }
          if (!this.hasEncoding || this.isSymbolicFont) {
-            unicode = this.useToUnicode ? this.toUnicode[charcode] :
-              (this.symbolicGlyphsOffset + charcode);
+            fontCharCode = this.useToFontChar ? this.toFontChar[charcode] :
+              charcode;
            break;
          }

          // MacRoman encoding address by re-encoding the cmap table
-          unicode = glyphName in this.glyphNameMap ?
+          fontCharCode = glyphName in this.glyphNameMap ?
            this.glyphNameMap[glyphName] : GlyphsUnicode[glyphName];
          break;
        default:
@ -2332,7 +2359,7 @@ var Font = (function FontClosure() {
      width = (isNum(width) ? width : this.defaultWidth) * this.widthMultiplier;

      return {
-        fontChar: String.fromCharCode(unicode),
+        fontChar: String.fromCharCode(fontCharCode),
        unicode: unicodeChars,
        width: width,
        codeIRQueue: codeIRQueue
--- a/src/util.js
+++ b/src/util.js
@ -174,6 +174,57 @@ var Util = (function UtilClosure() {
  return Util;
 })();

+// Resolves (and caches) the possibly vendor-prefixed name of a CSS style
+// property, and sets such a property on an element.
+var CustomStyle = (function CustomStyleClosure() {
+
+  // As noted on: http://www.zachstronaut.com/posts/2009/02/17/
+  //              animate-css-transforms-firefox-webkit.html
+  // in some versions of IE9 it is critical that ms appear in this list
+  // before Moz
+  var prefixes = ['ms', 'Moz', 'Webkit', 'O'];
+  var _cache = { };
+
+  function CustomStyle() {
+  }
+
+  // Stores the resolved name for propName in the cache and hands it back.
+  function remember(propName, resolved) {
+    _cache[propName] = resolved;
+    return resolved;
+  }
+
+  // Returns the name under which propName exists on the style object of
+  // element (or of document.documentElement when no element is passed):
+  // the plain name, a vendor-prefixed name, or the string 'undefined'.
+  CustomStyle.getProp = function get(propName, element) {
+    // the cache is consulted only when no element is given
+    if (arguments.length == 1) {
+      var known = _cache[propName];
+      if (typeof known == 'string')
+        return known;
+    }
+
+    var style = (element || document.documentElement).style;
+
+    // the standard property wins when it exists
+    if (typeof style[propName] == 'string')
+      return remember(propName, propName);
+
+    // otherwise probe each vendor prefix with the capitalized name
+    var capitalized = propName.charAt(0).toUpperCase() + propName.slice(1);
+    for (var p = 0; p < prefixes.length; p++) {
+      var candidate = prefixes[p] + capitalized;
+      if (typeof style[candidate] == 'string')
+        return remember(propName, candidate);
+    }
+
+    // nothing matched: remember the string 'undefined' as the marker
+    return remember(propName, 'undefined');
+  };
+
+  // Assigns str to the resolved style property of element; does nothing
+  // when getProp reported the 'undefined' marker.
+  CustomStyle.setProp = function set(propName, element, str) {
+    // resolved without the element, so a cached name is used when present
+    var resolved = this.getProp(propName);
+    if (resolved != 'undefined')
+      element.style[resolved] = str;
+  };
+
+  return CustomStyle;
+})();
+
 var PDFStringTranslateTable = [
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0x2D8, 0x2C7, 0x2C6, 0x2D9, 0x2DD, 0x2DB, 0x2DA, 0x2DC, 0, 0, 0, 0, 0, 0, 0,
--- a/src/worker_loader.js
+++ b/src/worker_loader.js
@ -24,7 +24,8 @@ var files = [
  'stream.js',
  'worker.js',
  '../external/jpgjs/jpg.js',
-  'jpx.js'
+  'jpx.js',
+  'bidi.js'
 ];

 // Load all the files.
--- a/test/test_slave.html
+++ b/test/test_slave.html
@ -23,6 +23,7 @@
    <script type="text/javascript" src="/src/worker.js"></script>
    <script type="text/javascript" src="/external/jpgjs/jpg.js"></script>
    <script type="text/javascript" src="/src/jpx.js"></script>
+    <script type="text/javascript" src="/src/bidi.js"></script>
    <script type="text/javascript" src="driver.js"></script>

    <script type="text/javascript">
--- a/test/unit/jsTestDriver.conf
+++ b/test/unit/jsTestDriver.conf
@ -22,6 +22,7 @@ load:
  - ../../src/pattern.js
  - ../../src/stream.js
  - ../../src/worker.js
+  - ../../src/bidi.js
  - ../../external/jpgjs/jpg.js
  - ../unit/obj_spec.js
  - ../unit/function_spec.js
--- a/web/viewer.html
+++ b/web/viewer.html
@ -29,6 +29,7 @@
        <script type="text/javascript" src="../src/worker.js"></script>  <!-- PDFJSSCRIPT_REMOVE_CORE -->
        <script type="text/javascript" src="../external/jpgjs/jpg.js"></script>  <!-- PDFJSSCRIPT_REMOVE_CORE -->
        <script type="text/javascript" src="../src/jpx.js"></script>  <!-- PDFJSSCRIPT_REMOVE_CORE -->
+        <script type="text/javascript" src="../src/bidi.js"></script>  <!-- PDFJSSCRIPT_REMOVE_CORE -->
        <script type="text/javascript">PDFJS.workerSrc = '../src/worker_loader.js';</script> <!-- PDFJSSCRIPT_REMOVE_CORE -->
        <script type="text/javascript" src="debugger.js"></script>
        <script type="text/javascript" src="viewer.js"></script>
--- a/web/viewer.js
+++ b/web/viewer.js
@ -1051,12 +1051,13 @@ var TextLayerBuilder = function textLayerBuilder(textLayerDiv) {
        textLayerDiv.appendChild(textDiv);

        if (textDiv.dataset.textLength > 1) { // avoid div by zero
-          // Adjust div width (via letterSpacing) to match canvas text
+          // Adjust div width to match canvas text
          // Due to the .offsetWidth calls, this is slow
          // This needs to come after appending to the DOM
-          textDiv.style.letterSpacing =
-            ((textDiv.dataset.canvasWidth - textDiv.offsetWidth) /
-              (textDiv.dataset.textLength - 1)) + 'px';
+          var textScale = textDiv.dataset.canvasWidth / textDiv.offsetWidth;
+          CustomStyle.setProp('transform' , textDiv,
+            'scale(' + textScale + ', 1)');
+          CustomStyle.setProp('transformOrigin' , textDiv, '0% 0%');
        }
      } // textLength > 0
    }
@ -1096,7 +1097,8 @@ var TextLayerBuilder = function textLayerBuilder(textLayerDiv) {
    textDiv.style.fontSize = fontHeight + 'px';
    textDiv.style.left = text.geom.x + 'px';
    textDiv.style.top = (text.geom.y - fontHeight) + 'px';
-    textDiv.textContent = text.str;
+    textDiv.textContent = bidi(text, -1);
+    textDiv.dir = text.direction;
    textDiv.dataset.textLength = text.length;
    this.textDivs.push(textDiv);
  };