Bidi: skip invalid Unicode character to make indexing work

For Arabic characters, the Unicode character codes are mapped to Unicode
character types using the character codes for indexing. However, the
character code 0x061D is undefined (and therefore invalid) in the
Unicode standard. The imported list does not contain this entry, but not
having it in the list breaks indexing for items after it. Therefore, put
an empty string on its position to make indexing work properly and issue
a warning in the unlikely event that we encounter this character.
This commit is contained in:
Tim van der Meij 2016-11-23 22:49:48 +01:00
parent 11839f018f
commit 995be19378

View File

@ -17,13 +17,14 @@
(function (root, factory) { (function (root, factory) {
if (typeof define === 'function' && define.amd) { if (typeof define === 'function' && define.amd) {
define('pdfjs/core/bidi', ['exports'], factory); define('pdfjs/core/bidi', ['exports', 'pdfjs/shared/util'], factory);
} else if (typeof exports !== 'undefined') { } else if (typeof exports !== 'undefined') {
factory(exports); factory(exports, require('../shared/util.js'));
} else { } else {
factory((root.pdfjsCoreBidi = {})); factory((root.pdfjsCoreBidi = {}), root.pdfjsSharedUtil);
} }
}(this, function (exports) { }(this, function (exports, sharedUtil) {
var warn = sharedUtil.warn;
// Character types for symbols from 0000 to 00FF. // Character types for symbols from 0000 to 00FF.
// Source: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt // Source: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
@ -52,10 +53,14 @@
// Character types for symbols from 0600 to 06FF. // Character types for symbols from 0600 to 06FF.
// Source: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt // Source: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
// Note that 061D does not exist in the Unicode standard (see
// http://unicode.org/charts/PDF/U0600.pdf), so we replace it with an
// empty string and issue a warning if we encounter this character. The
// empty string is required to properly index the items after it.
var arabicTypes = [ var arabicTypes = [
'AN', 'AN', 'AN', 'AN', 'AN', 'AN', 'ON', 'ON', 'AL', 'ET', 'ET', 'AL', 'AN', 'AN', 'AN', 'AN', 'AN', 'AN', 'ON', 'ON', 'AL', 'ET', 'ET', 'AL',
'CS', 'AL', 'ON', 'ON', 'NSM', 'NSM', 'NSM', 'NSM', 'NSM', 'NSM', 'NSM', 'CS', 'AL', 'ON', 'ON', 'NSM', 'NSM', 'NSM', 'NSM', 'NSM', 'NSM', 'NSM',
'NSM', 'NSM', 'NSM', 'NSM', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'NSM', 'NSM', 'NSM', 'NSM', 'AL', 'AL', '', 'AL', 'AL', 'AL', 'AL', 'AL',
'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL',
'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL',
'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL', 'AL',
@ -144,6 +149,9 @@
charType = 'R'; charType = 'R';
} else if (0x0600 <= charCode && charCode <= 0x06ff) { } else if (0x0600 <= charCode && charCode <= 0x06ff) {
charType = arabicTypes[charCode & 0xff]; charType = arabicTypes[charCode & 0xff];
if (!charType) {
warn('Bidi: invalid Unicode character ' + charCode.toString(16));
}
} else if (0x0700 <= charCode && charCode <= 0x08AC) { } else if (0x0700 <= charCode && charCode <= 0x08AC) {
charType = 'AL'; charType = 'AL';
} }