diff --git a/src/core/bidi.js b/src/core/bidi.js index ece975154..2596748e0 100644 --- a/src/core/bidi.js +++ b/src/core/bidi.js @@ -147,7 +147,11 @@ function bidi(str, startLevel = -1, vertical = false) { if (!charType) { warn("Bidi: invalid Unicode character " + charCode.toString(16)); } - } else if (0x0700 <= charCode && charCode <= 0x08ac) { + } else if ( + (0x0700 <= charCode && charCode <= 0x08ac) || + (0xfb50 <= charCode && charCode <= 0xfdff) || + (0xfe70 <= charCode && charCode <= 0xfeff) + ) { charType = "AL"; } if (charType === "R" || charType === "AL" || charType === "AN") { diff --git a/src/core/document.js b/src/core/document.js index 6e4ca7cff..77c552ea4 100644 --- a/src/core/document.js +++ b/src/core/document.js @@ -511,7 +511,13 @@ class Page { }); } - extractTextContent({ handler, task, includeMarkedContent, sink }) { + extractTextContent({ + handler, + task, + includeMarkedContent, + disableNormalization, + sink, + }) { const contentStreamPromise = this.getContentStream(); const resourcesPromise = this.loadResources([ "ExtGState", @@ -539,6 +545,7 @@ class Page { task, resources: this.resources, includeMarkedContent, + disableNormalization, sink, viewBox: this.view, }); diff --git a/src/core/evaluator.js b/src/core/evaluator.js index 4354130dd..ddf6bbbe7 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -24,6 +24,7 @@ import { IDENTITY_MATRIX, info, isArrayEqual, + normalizeUnicode, OPS, shadow, stringToPDFString, @@ -2271,6 +2272,7 @@ class PartialEvaluator { seenStyles = new Set(), viewBox, markedContentData = null, + disableNormalization = false, }) { // Ensure that `resources`/`stateManager` is correctly initialized, // even if the provided parameter is e.g. `null`. 
@@ -2524,7 +2526,10 @@ class PartialEvaluator { } function runBidiTransform(textChunk) { - const text = textChunk.str.join(""); + let text = textChunk.str.join(""); + if (!disableNormalization) { + text = normalizeUnicode(text); + } const bidiResult = bidi(text, -1, textChunk.vertical); return { str: bidiResult.str, @@ -2859,7 +2864,7 @@ class PartialEvaluator { textChunk.prevTransform = getCurrentTextTransform(); } - const glyphUnicode = glyph.normalizedUnicode; + const glyphUnicode = glyph.unicode; if (saveLastChar(glyphUnicode)) { // The two last chars are a non-whitespace followed by a whitespace // and then this non-whitespace, so we insert a whitespace here. @@ -3242,6 +3247,7 @@ class PartialEvaluator { seenStyles, viewBox, markedContentData, + disableNormalization, }) .then(function () { if (!sinkWrapper.enqueueInvoked) { diff --git a/src/core/fonts.js b/src/core/fonts.js index 37b91682a..ad463dce2 100644 --- a/src/core/fonts.js +++ b/src/core/fonts.js @@ -33,11 +33,9 @@ import { } from "./fonts_utils.js"; import { getCharUnicodeCategory, - getNormalizedUnicodes, getUnicodeForGlyph, getUnicodeRangeFor, mapSpecialUnicodeValues, - reverseIfRtl, } from "./unicode.js"; import { getDingbatsGlyphsUnicode, getGlyphsUnicode } from "./glyphlist.js"; import { @@ -277,24 +275,6 @@ class Glyph { /* nonSerializable = */ true ); } - - /** - * This property, which is only used by `PartialEvaluator.getTextContent`, - * is purposely made non-serializable. 
- * @type {string} - */ - get normalizedUnicode() { - return shadow( - this, - "normalizedUnicode", - reverseIfRtl(Glyph._NormalizedUnicodes[this.unicode] || this.unicode), - /* nonSerializable = */ true - ); - } - - static get _NormalizedUnicodes() { - return shadow(this, "_NormalizedUnicodes", getNormalizedUnicodes()); - } } function int16(b0, b1) { @@ -507,6 +487,9 @@ function adjustMapping(charCodeToGlyphId, hasGlyph, newGlyphZeroId, toUnicode) { const privateUseOffetStart = PRIVATE_USE_AREAS[privateUseAreaIndex][0]; let nextAvailableFontCharCode = privateUseOffetStart; let privateUseOffetEnd = PRIVATE_USE_AREAS[privateUseAreaIndex][1]; + const isInPrivateArea = code => + (PRIVATE_USE_AREAS[0][0] <= code && code <= PRIVATE_USE_AREAS[0][1]) || + (PRIVATE_USE_AREAS[1][0] <= code && code <= PRIVATE_USE_AREAS[1][1]); for (let originalCharCode in charCodeToGlyphId) { originalCharCode |= 0; let glyphId = charCodeToGlyphId[originalCharCode]; @@ -539,11 +522,7 @@ function adjustMapping(charCodeToGlyphId, hasGlyph, newGlyphZeroId, toUnicode) { if (typeof unicode === "string") { unicode = unicode.codePointAt(0); } - if ( - unicode && - unicode < privateUseOffetStart && - !usedGlyphIds.has(glyphId) - ) { + if (unicode && !isInPrivateArea(unicode) && !usedGlyphIds.has(glyphId)) { toUnicodeExtraMap.set(unicode, glyphId); usedGlyphIds.add(glyphId); } @@ -785,6 +764,7 @@ function createOS2Table(properties, charstrings, override) { let firstCharIndex = null; let lastCharIndex = 0; + let position = -1; if (charstrings) { for (let code in charstrings) { @@ -796,7 +776,7 @@ function createOS2Table(properties, charstrings, override) { lastCharIndex = code; } - const position = getUnicodeRangeFor(code); + position = getUnicodeRangeFor(code, position); if (position < 32) { ulUnicodeRange1 |= 1 << position; } else if (position < 64) { diff --git a/src/core/unicode.js b/src/core/unicode.js index 6116fab6d..42fe49b85 100644 --- a/src/core/unicode.js +++ b/src/core/unicode.js @@ -14,10 
+14,7 @@ */ /* no-babel-preset */ -import { - getArrayLookupTableFactory, - getLookupTableFactory, -} from "./core_utils.js"; +import { getLookupTableFactory } from "./core_utils.js"; // Some characters, e.g. copyrightserif, are mapped to the private use area // and might not be displayed using standard fonts. Mapping/hacking well-known @@ -94,1552 +91,158 @@ function getUnicodeForGlyph(name, glyphsUnicodeMap) { return -1; } +// See https://learn.microsoft.com/en-us/typography/opentype/spec/os2#ulunicoderange1-bits-031ulunicoderange2-bits-3263ulunicoderange3-bits-6495ulunicoderange4-bits-96127 const UnicodeRanges = [ - { begin: 0x0000, end: 0x007f }, // Basic Latin - { begin: 0x0080, end: 0x00ff }, // Latin-1 Supplement - { begin: 0x0100, end: 0x017f }, // Latin Extended-A - { begin: 0x0180, end: 0x024f }, // Latin Extended-B - { begin: 0x0250, end: 0x02af }, // IPA Extensions - { begin: 0x02b0, end: 0x02ff }, // Spacing Modifier Letters - { begin: 0x0300, end: 0x036f }, // Combining Diacritical Marks - { begin: 0x0370, end: 0x03ff }, // Greek and Coptic - { begin: 0x2c80, end: 0x2cff }, // Coptic - { begin: 0x0400, end: 0x04ff }, // Cyrillic - { begin: 0x0530, end: 0x058f }, // Armenian - { begin: 0x0590, end: 0x05ff }, // Hebrew - { begin: 0xa500, end: 0xa63f }, // Vai - { begin: 0x0600, end: 0x06ff }, // Arabic - { begin: 0x07c0, end: 0x07ff }, // NKo - { begin: 0x0900, end: 0x097f }, // Devanagari - { begin: 0x0980, end: 0x09ff }, // Bengali - { begin: 0x0a00, end: 0x0a7f }, // Gurmukhi - { begin: 0x0a80, end: 0x0aff }, // Gujarati - { begin: 0x0b00, end: 0x0b7f }, // Oriya - { begin: 0x0b80, end: 0x0bff }, // Tamil - { begin: 0x0c00, end: 0x0c7f }, // Telugu - { begin: 0x0c80, end: 0x0cff }, // Kannada - { begin: 0x0d00, end: 0x0d7f }, // Malayalam - { begin: 0x0e00, end: 0x0e7f }, // Thai - { begin: 0x0e80, end: 0x0eff }, // Lao - { begin: 0x10a0, end: 0x10ff }, // Georgian - { begin: 0x1b00, end: 0x1b7f }, // Balinese - { begin: 0x1100, end: 0x11ff }, // 
Hangul Jamo - { begin: 0x1e00, end: 0x1eff }, // Latin Extended Additional - { begin: 0x1f00, end: 0x1fff }, // Greek Extended - { begin: 0x2000, end: 0x206f }, // General Punctuation - { begin: 0x2070, end: 0x209f }, // Superscripts And Subscripts - { begin: 0x20a0, end: 0x20cf }, // Currency Symbol - { begin: 0x20d0, end: 0x20ff }, // Combining Diacritical Marks - { begin: 0x2100, end: 0x214f }, // Letterlike Symbols - { begin: 0x2150, end: 0x218f }, // Number Forms - { begin: 0x2190, end: 0x21ff }, // Arrows - { begin: 0x2200, end: 0x22ff }, // Mathematical Operators - { begin: 0x2300, end: 0x23ff }, // Miscellaneous Technical - { begin: 0x2400, end: 0x243f }, // Control Pictures - { begin: 0x2440, end: 0x245f }, // Optical Character Recognition - { begin: 0x2460, end: 0x24ff }, // Enclosed Alphanumerics - { begin: 0x2500, end: 0x257f }, // Box Drawing - { begin: 0x2580, end: 0x259f }, // Block Elements - { begin: 0x25a0, end: 0x25ff }, // Geometric Shapes - { begin: 0x2600, end: 0x26ff }, // Miscellaneous Symbols - { begin: 0x2700, end: 0x27bf }, // Dingbats - { begin: 0x3000, end: 0x303f }, // CJK Symbols And Punctuation - { begin: 0x3040, end: 0x309f }, // Hiragana - { begin: 0x30a0, end: 0x30ff }, // Katakana - { begin: 0x3100, end: 0x312f }, // Bopomofo - { begin: 0x3130, end: 0x318f }, // Hangul Compatibility Jamo - { begin: 0xa840, end: 0xa87f }, // Phags-pa - { begin: 0x3200, end: 0x32ff }, // Enclosed CJK Letters And Months - { begin: 0x3300, end: 0x33ff }, // CJK Compatibility - { begin: 0xac00, end: 0xd7af }, // Hangul Syllables - { begin: 0xd800, end: 0xdfff }, // Non-Plane 0 * - { begin: 0x10900, end: 0x1091f }, // Phoenicia - { begin: 0x4e00, end: 0x9fff }, // CJK Unified Ideographs - { begin: 0xe000, end: 0xf8ff }, // Private Use Area (plane 0) - { begin: 0x31c0, end: 0x31ef }, // CJK Strokes - { begin: 0xfb00, end: 0xfb4f }, // Alphabetic Presentation Forms - { begin: 0xfb50, end: 0xfdff }, // Arabic Presentation Forms-A - { begin: 0xfe20, end: 
0xfe2f }, // Combining Half Marks - { begin: 0xfe10, end: 0xfe1f }, // Vertical Forms - { begin: 0xfe50, end: 0xfe6f }, // Small Form Variants - { begin: 0xfe70, end: 0xfeff }, // Arabic Presentation Forms-B - { begin: 0xff00, end: 0xffef }, // Halfwidth And Fullwidth Forms - { begin: 0xfff0, end: 0xffff }, // Specials - { begin: 0x0f00, end: 0x0fff }, // Tibetan - { begin: 0x0700, end: 0x074f }, // Syriac - { begin: 0x0780, end: 0x07bf }, // Thaana - { begin: 0x0d80, end: 0x0dff }, // Sinhala - { begin: 0x1000, end: 0x109f }, // Myanmar - { begin: 0x1200, end: 0x137f }, // Ethiopic - { begin: 0x13a0, end: 0x13ff }, // Cherokee - { begin: 0x1400, end: 0x167f }, // Unified Canadian Aboriginal Syllabics - { begin: 0x1680, end: 0x169f }, // Ogham - { begin: 0x16a0, end: 0x16ff }, // Runic - { begin: 0x1780, end: 0x17ff }, // Khmer - { begin: 0x1800, end: 0x18af }, // Mongolian - { begin: 0x2800, end: 0x28ff }, // Braille Patterns - { begin: 0xa000, end: 0xa48f }, // Yi Syllables - { begin: 0x1700, end: 0x171f }, // Tagalog - { begin: 0x10300, end: 0x1032f }, // Old Italic - { begin: 0x10330, end: 0x1034f }, // Gothic - { begin: 0x10400, end: 0x1044f }, // Deseret - { begin: 0x1d000, end: 0x1d0ff }, // Byzantine Musical Symbols - { begin: 0x1d400, end: 0x1d7ff }, // Mathematical Alphanumeric Symbols - { begin: 0xff000, end: 0xffffd }, // Private Use (plane 15) - { begin: 0xfe00, end: 0xfe0f }, // Variation Selectors - { begin: 0xe0000, end: 0xe007f }, // Tags - { begin: 0x1900, end: 0x194f }, // Limbu - { begin: 0x1950, end: 0x197f }, // Tai Le - { begin: 0x1980, end: 0x19df }, // New Tai Lue - { begin: 0x1a00, end: 0x1a1f }, // Buginese - { begin: 0x2c00, end: 0x2c5f }, // Glagolitic - { begin: 0x2d30, end: 0x2d7f }, // Tifinagh - { begin: 0x4dc0, end: 0x4dff }, // Yijing Hexagram Symbols - { begin: 0xa800, end: 0xa82f }, // Syloti Nagri - { begin: 0x10000, end: 0x1007f }, // Linear B Syllabary - { begin: 0x10140, end: 0x1018f }, // Ancient Greek Numbers - { begin: 
0x10380, end: 0x1039f }, // Ugaritic - { begin: 0x103a0, end: 0x103df }, // Old Persian - { begin: 0x10450, end: 0x1047f }, // Shavian - { begin: 0x10480, end: 0x104af }, // Osmanya - { begin: 0x10800, end: 0x1083f }, // Cypriot Syllabary - { begin: 0x10a00, end: 0x10a5f }, // Kharoshthi - { begin: 0x1d300, end: 0x1d35f }, // Tai Xuan Jing Symbols - { begin: 0x12000, end: 0x123ff }, // Cuneiform - { begin: 0x1d360, end: 0x1d37f }, // Counting Rod Numerals - { begin: 0x1b80, end: 0x1bbf }, // Sundanese - { begin: 0x1c00, end: 0x1c4f }, // Lepcha - { begin: 0x1c50, end: 0x1c7f }, // Ol Chiki - { begin: 0xa880, end: 0xa8df }, // Saurashtra - { begin: 0xa900, end: 0xa92f }, // Kayah Li - { begin: 0xa930, end: 0xa95f }, // Rejang - { begin: 0xaa00, end: 0xaa5f }, // Cham - { begin: 0x10190, end: 0x101cf }, // Ancient Symbols - { begin: 0x101d0, end: 0x101ff }, // Phaistos Disc - { begin: 0x102a0, end: 0x102df }, // Carian - { begin: 0x1f030, end: 0x1f09f }, // Domino Tiles + [0x0000, 0x007f], // 0 - Basic Latin + [0x0080, 0x00ff], // 1 - Latin-1 Supplement + [0x0100, 0x017f], // 2 - Latin Extended-A + [0x0180, 0x024f], // 3 - Latin Extended-B + [0x0250, 0x02af, 0x1d00, 0x1d7f, 0x1d80, 0x1dbf], // 4 - IPA Extensions - Phonetic Extensions - Phonetic Extensions Supplement + [0x02b0, 0x02ff, 0xa700, 0xa71f], // 5 - Spacing Modifier Letters - Modifier Tone Letters + [0x0300, 0x036f, 0x1dc0, 0x1dff], // 6 - Combining Diacritical Marks - Combining Diacritical Marks Supplement + [0x0370, 0x03ff], // 7 - Greek and Coptic + [0x2c80, 0x2cff], // 8 - Coptic + [0x0400, 0x04ff, 0x0500, 0x052f, 0x2de0, 0x2dff, 0xa640, 0xa69f], // 9 - Cyrillic - Cyrillic Supplement - Cyrillic Extended-A - Cyrillic Extended-B + [0x0530, 0x058f], // 10 - Armenian + [0x0590, 0x05ff], // 11 - Hebrew + [0xa500, 0xa63f], // 12 - Vai + [0x0600, 0x06ff, 0x0750, 0x077f], // 13 - Arabic - Arabic Supplement + [0x07c0, 0x07ff], // 14 - NKo + [0x0900, 0x097f], // 15 - Devanagari + [0x0980, 0x09ff], // 16 - Bengali 
+ [0x0a00, 0x0a7f], // 17 - Gurmukhi + [0x0a80, 0x0aff], // 18 - Gujarati + [0x0b00, 0x0b7f], // 19 - Oriya + [0x0b80, 0x0bff], // 20 - Tamil + [0x0c00, 0x0c7f], // 21 - Telugu + [0x0c80, 0x0cff], // 22 - Kannada + [0x0d00, 0x0d7f], // 23 - Malayalam + [0x0e00, 0x0e7f], // 24 - Thai + [0x0e80, 0x0eff], // 25 - Lao + [0x10a0, 0x10ff, 0x2d00, 0x2d2f], // 26 - Georgian - Georgian Supplement + [0x1b00, 0x1b7f], // 27 - Balinese + [0x1100, 0x11ff], // 28 - Hangul Jamo + [0x1e00, 0x1eff, 0x2c60, 0x2c7f, 0xa720, 0xa7ff], // 29 - Latin Extended Additional - Latin Extended-C - Latin Extended-D + [0x1f00, 0x1fff], // 30 - Greek Extended + [0x2000, 0x206f, 0x2e00, 0x2e7f], // 31 - General Punctuation - Supplemental Punctuation + [0x2070, 0x209f], // 32 - Superscripts And Subscripts + [0x20a0, 0x20cf], // 33 - Currency Symbol + [0x20d0, 0x20ff], // 34 - Combining Diacritical Marks + [0x2100, 0x214f], // 35 - Letterlike Symbols + [0x2150, 0x218f], // 36 - Number Forms + [0x2190, 0x21ff, 0x27f0, 0x27ff, 0x2900, 0x297f, 0x2b00, 0x2bff], // 37 - Arrows - Supplemental Arrows-A - Supplemental Arrows-B - Miscellaneous Symbols and Arrows + [0x2200, 0x22ff, 0x2a00, 0x2aff, 0x27c0, 0x27ef, 0x2980, 0x29ff], // 38 - Mathematical Operators - Supplemental Mathematical Operators - Miscellaneous Mathematical Symbols-A - Miscellaneous Mathematical Symbols-B + [0x2300, 0x23ff], // 39 - Miscellaneous Technical + [0x2400, 0x243f], // 40 - Control Pictures + [0x2440, 0x245f], // 41 - Optical Character Recognition + [0x2460, 0x24ff], // 42 - Enclosed Alphanumerics + [0x2500, 0x257f], // 43 - Box Drawing + [0x2580, 0x259f], // 44 - Block Elements + [0x25a0, 0x25ff], // 45 - Geometric Shapes + [0x2600, 0x26ff], // 46 - Miscellaneous Symbols + [0x2700, 0x27bf], // 47 - Dingbats + [0x3000, 0x303f], // 48 - CJK Symbols And Punctuation + [0x3040, 0x309f], // 49 - Hiragana + [0x30a0, 0x30ff, 0x31f0, 0x31ff], // 50 - Katakana - Katakana Phonetic Extensions + [0x3100, 0x312f, 0x31a0, 0x31bf], // 51 - 
Bopomofo - Bopomofo Extended + [0x3130, 0x318f], // 52 - Hangul Compatibility Jamo + [0xa840, 0xa87f], // 53 - Phags-pa + [0x3200, 0x32ff], // 54 - Enclosed CJK Letters And Months + [0x3300, 0x33ff], // 55 - CJK Compatibility + [0xac00, 0xd7af], // 56 - Hangul Syllables + [0xd800, 0xdfff], // 57 - Non-Plane 0 * + [0x10900, 0x1091f], // 58 - Phoenicia + [ + 0x4e00, 0x9fff, 0x2e80, 0x2eff, 0x2f00, 0x2fdf, 0x2ff0, 0x2fff, 0x3400, + 0x4dbf, 0x20000, 0x2a6df, 0x3190, 0x319f, + ], // 59 - CJK Unified Ideographs - CJK Radicals Supplement - Kangxi Radicals - Ideographic Description Characters - CJK Unified Ideographs Extension A - CJK Unified Ideographs Extension B - Kanbun + [0xe000, 0xf8ff], // 60 - Private Use Area (plane 0) + [0x31c0, 0x31ef, 0xf900, 0xfaff, 0x2f800, 0x2fa1f], // 61 - CJK Strokes - CJK Compatibility Ideographs - CJK Compatibility Ideographs Supplement + [0xfb00, 0xfb4f], // 62 - Alphabetic Presentation Forms + [0xfb50, 0xfdff], // 63 - Arabic Presentation Forms-A + [0xfe20, 0xfe2f], // 64 - Combining Half Marks + [0xfe10, 0xfe1f], // 65 - Vertical Forms + [0xfe50, 0xfe6f], // 66 - Small Form Variants + [0xfe70, 0xfeff], // 67 - Arabic Presentation Forms-B + [0xff00, 0xffef], // 68 - Halfwidth And Fullwidth Forms + [0xfff0, 0xffff], // 69 - Specials + [0x0f00, 0x0fff], // 70 - Tibetan + [0x0700, 0x074f], // 71 - Syriac + [0x0780, 0x07bf], // 72 - Thaana + [0x0d80, 0x0dff], // 73 - Sinhala + [0x1000, 0x109f], // 74 - Myanmar + [0x1200, 0x137f, 0x1380, 0x139f, 0x2d80, 0x2ddf], // 75 - Ethiopic - Ethiopic Supplement - Ethiopic Extended + [0x13a0, 0x13ff], // 76 - Cherokee + [0x1400, 0x167f], // 77 - Unified Canadian Aboriginal Syllabics + [0x1680, 0x169f], // 78 - Ogham + [0x16a0, 0x16ff], // 79 - Runic + [0x1780, 0x17ff], // 80 - Khmer + [0x1800, 0x18af], // 81 - Mongolian + [0x2800, 0x28ff], // 82 - Braille Patterns + [0xa000, 0xa48f], // 83 - Yi Syllables + [0x1700, 0x171f, 0x1720, 0x173f, 0x1740, 0x175f, 0x1760, 0x177f], // 84 - Tagalog - Hanunoo - 
Buhid - Tagbanwa + [0x10300, 0x1032f], // 85 - Old Italic + [0x10330, 0x1034f], // 86 - Gothic + [0x10400, 0x1044f], // 87 - Deseret + [0x1d000, 0x1d0ff, 0x1d100, 0x1d1ff, 0x1d200, 0x1d24f], // 88 - Byzantine Musical Symbols - Musical Symbols - Ancient Greek Musical Notation + [0x1d400, 0x1d7ff], // 89 - Mathematical Alphanumeric Symbols + [0xff000, 0xffffd], // 90 - Private Use (plane 15) + [0xfe00, 0xfe0f, 0xe0100, 0xe01ef], // 91 - Variation Selectors - Variation Selectors Supplement + [0xe0000, 0xe007f], // 92 - Tags + [0x1900, 0x194f], // 93 - Limbu + [0x1950, 0x197f], // 94 - Tai Le + [0x1980, 0x19df], // 95 - New Tai Lue + [0x1a00, 0x1a1f], // 96 - Buginese + [0x2c00, 0x2c5f], // 97 - Glagolitic + [0x2d30, 0x2d7f], // 98 - Tifinagh + [0x4dc0, 0x4dff], // 99 - Yijing Hexagram Symbols + [0xa800, 0xa82f], // 100 - Syloti Nagri + [0x10000, 0x1007f, 0x10080, 0x100ff, 0x10100, 0x1013f], // 101 - Linear B Syllabary - Linear B Ideograms - Aegean Numbers + [0x10140, 0x1018f], // 102 - Ancient Greek Numbers + [0x10380, 0x1039f], // 103 - Ugaritic + [0x103a0, 0x103df], // 104 - Old Persian + [0x10450, 0x1047f], // 105 - Shavian + [0x10480, 0x104af], // 106 - Osmanya + [0x10800, 0x1083f], // 107 - Cypriot Syllabary + [0x10a00, 0x10a5f], // 108 - Kharoshthi + [0x1d300, 0x1d35f], // 109 - Tai Xuan Jing Symbols + [0x12000, 0x123ff, 0x12400, 0x1247f], // 110 - Cuneiform - Cuneiform Numbers and Punctuation + [0x1d360, 0x1d37f], // 111 - Counting Rod Numerals + [0x1b80, 0x1bbf], // 112 - Sundanese + [0x1c00, 0x1c4f], // 113 - Lepcha + [0x1c50, 0x1c7f], // 114 - Ol Chiki + [0xa880, 0xa8df], // 115 - Saurashtra + [0xa900, 0xa92f], // 116 - Kayah Li + [0xa930, 0xa95f], // 117 - Rejang + [0xaa00, 0xaa5f], // 118 - Cham + [0x10190, 0x101cf], // 119 - Ancient Symbols + [0x101d0, 0x101ff], // 120 - Phaistos Disc + [0x102a0, 0x102df, 0x10280, 0x1029f, 0x10920, 0x1093f], // 121 - Carian - Lycian - Lydian + [0x1f030, 0x1f09f, 0x1f000, 0x1f02f], // 122 - Domino Tiles - Mahjong Tiles ]; 
-function getUnicodeRangeFor(value) { +function getUnicodeRangeFor(value, lastPosition = -1) { + // TODO: create a map range => position, sort the ranges and cache it. + // Then we can make a binary search for finding a range for a given unicode. + if (lastPosition !== -1) { + const range = UnicodeRanges[lastPosition]; + for (let i = 0, ii = range.length; i < ii; i += 2) { + if (value >= range[i] && value <= range[i + 1]) { + return lastPosition; + } + } + } for (let i = 0, ii = UnicodeRanges.length; i < ii; i++) { const range = UnicodeRanges[i]; - if (value >= range.begin && value < range.end) { - return i; + for (let j = 0, jj = range.length; j < jj; j += 2) { + if (value >= range[j] && value <= range[j + 1]) { + return i; + } } } return -1; } -function isRTLRangeFor(value) { - let range = UnicodeRanges[13]; - if (value >= range.begin && value < range.end) { - return true; - } - range = UnicodeRanges[11]; - if (value >= range.begin && value < range.end) { - return true; - } - return false; -} - -// The normalization table is obtained by filtering the Unicode characters -// database with entries. 
-const getNormalizedUnicodes = getArrayLookupTableFactory(function () { - // prettier-ignore - return [ - "\u00A8", "\u0020\u0308", - "\u00AF", "\u0020\u0304", - "\u00B4", "\u0020\u0301", - "\u00B5", "\u03BC", - "\u00B8", "\u0020\u0327", - "\u0132", "\u0049\u004A", - "\u0133", "\u0069\u006A", - "\u013F", "\u004C\u00B7", - "\u0140", "\u006C\u00B7", - "\u0149", "\u02BC\u006E", - "\u017F", "\u0073", - "\u01C4", "\u0044\u017D", - "\u01C5", "\u0044\u017E", - "\u01C6", "\u0064\u017E", - "\u01C7", "\u004C\u004A", - "\u01C8", "\u004C\u006A", - "\u01C9", "\u006C\u006A", - "\u01CA", "\u004E\u004A", - "\u01CB", "\u004E\u006A", - "\u01CC", "\u006E\u006A", - "\u01F1", "\u0044\u005A", - "\u01F2", "\u0044\u007A", - "\u01F3", "\u0064\u007A", - "\u02D8", "\u0020\u0306", - "\u02D9", "\u0020\u0307", - "\u02DA", "\u0020\u030A", - "\u02DB", "\u0020\u0328", - "\u02DC", "\u0020\u0303", - "\u02DD", "\u0020\u030B", - "\u037A", "\u0020\u0345", - "\u0384", "\u0020\u0301", - "\u03D0", "\u03B2", - "\u03D1", "\u03B8", - "\u03D2", "\u03A5", - "\u03D5", "\u03C6", - "\u03D6", "\u03C0", - "\u03F0", "\u03BA", - "\u03F1", "\u03C1", - "\u03F2", "\u03C2", - "\u03F4", "\u0398", - "\u03F5", "\u03B5", - "\u03F9", "\u03A3", - "\u0587", "\u0565\u0582", - "\u0675", "\u0627\u0674", - "\u0676", "\u0648\u0674", - "\u0677", "\u06C7\u0674", - "\u0678", "\u064A\u0674", - "\u0E33", "\u0E4D\u0E32", - "\u0EB3", "\u0ECD\u0EB2", - "\u0EDC", "\u0EAB\u0E99", - "\u0EDD", "\u0EAB\u0EA1", - "\u0F77", "\u0FB2\u0F81", - "\u0F79", "\u0FB3\u0F81", - "\u1E9A", "\u0061\u02BE", - "\u1FBD", "\u0020\u0313", - "\u1FBF", "\u0020\u0313", - "\u1FC0", "\u0020\u0342", - "\u1FFE", "\u0020\u0314", - "\u2002", "\u0020", - "\u2003", "\u0020", - "\u2004", "\u0020", - "\u2005", "\u0020", - "\u2006", "\u0020", - "\u2008", "\u0020", - "\u2009", "\u0020", - "\u200A", "\u0020", - "\u2017", "\u0020\u0333", - "\u2024", "\u002E", - "\u2025", "\u002E\u002E", - "\u2026", "\u002E\u002E\u002E", - "\u2033", "\u2032\u2032", - "\u2034", "\u2032\u2032\u2032", 
- "\u2036", "\u2035\u2035", - "\u2037", "\u2035\u2035\u2035", - "\u203C", "\u0021\u0021", - "\u203E", "\u0020\u0305", - "\u2047", "\u003F\u003F", - "\u2048", "\u003F\u0021", - "\u2049", "\u0021\u003F", - "\u2057", "\u2032\u2032\u2032\u2032", - "\u205F", "\u0020", - "\u20A8", "\u0052\u0073", - "\u2100", "\u0061\u002F\u0063", - "\u2101", "\u0061\u002F\u0073", - "\u2103", "\u00B0\u0043", - "\u2105", "\u0063\u002F\u006F", - "\u2106", "\u0063\u002F\u0075", - "\u2107", "\u0190", - "\u2109", "\u00B0\u0046", - "\u2116", "\u004E\u006F", - "\u2121", "\u0054\u0045\u004C", - "\u2135", "\u05D0", - "\u2136", "\u05D1", - "\u2137", "\u05D2", - "\u2138", "\u05D3", - "\u213B", "\u0046\u0041\u0058", - "\u2160", "\u0049", - "\u2161", "\u0049\u0049", - "\u2162", "\u0049\u0049\u0049", - "\u2163", "\u0049\u0056", - "\u2164", "\u0056", - "\u2165", "\u0056\u0049", - "\u2166", "\u0056\u0049\u0049", - "\u2167", "\u0056\u0049\u0049\u0049", - "\u2168", "\u0049\u0058", - "\u2169", "\u0058", - "\u216A", "\u0058\u0049", - "\u216B", "\u0058\u0049\u0049", - "\u216C", "\u004C", - "\u216D", "\u0043", - "\u216E", "\u0044", - "\u216F", "\u004D", - "\u2170", "\u0069", - "\u2171", "\u0069\u0069", - "\u2172", "\u0069\u0069\u0069", - "\u2173", "\u0069\u0076", - "\u2174", "\u0076", - "\u2175", "\u0076\u0069", - "\u2176", "\u0076\u0069\u0069", - "\u2177", "\u0076\u0069\u0069\u0069", - "\u2178", "\u0069\u0078", - "\u2179", "\u0078", - "\u217A", "\u0078\u0069", - "\u217B", "\u0078\u0069\u0069", - "\u217C", "\u006C", - "\u217D", "\u0063", - "\u217E", "\u0064", - "\u217F", "\u006D", - "\u222C", "\u222B\u222B", - "\u222D", "\u222B\u222B\u222B", - "\u222F", "\u222E\u222E", - "\u2230", "\u222E\u222E\u222E", - "\u2474", "\u0028\u0031\u0029", - "\u2475", "\u0028\u0032\u0029", - "\u2476", "\u0028\u0033\u0029", - "\u2477", "\u0028\u0034\u0029", - "\u2478", "\u0028\u0035\u0029", - "\u2479", "\u0028\u0036\u0029", - "\u247A", "\u0028\u0037\u0029", - "\u247B", "\u0028\u0038\u0029", - "\u247C", "\u0028\u0039\u0029", - 
"\u247D", "\u0028\u0031\u0030\u0029", - "\u247E", "\u0028\u0031\u0031\u0029", - "\u247F", "\u0028\u0031\u0032\u0029", - "\u2480", "\u0028\u0031\u0033\u0029", - "\u2481", "\u0028\u0031\u0034\u0029", - "\u2482", "\u0028\u0031\u0035\u0029", - "\u2483", "\u0028\u0031\u0036\u0029", - "\u2484", "\u0028\u0031\u0037\u0029", - "\u2485", "\u0028\u0031\u0038\u0029", - "\u2486", "\u0028\u0031\u0039\u0029", - "\u2487", "\u0028\u0032\u0030\u0029", - "\u2488", "\u0031\u002E", - "\u2489", "\u0032\u002E", - "\u248A", "\u0033\u002E", - "\u248B", "\u0034\u002E", - "\u248C", "\u0035\u002E", - "\u248D", "\u0036\u002E", - "\u248E", "\u0037\u002E", - "\u248F", "\u0038\u002E", - "\u2490", "\u0039\u002E", - "\u2491", "\u0031\u0030\u002E", - "\u2492", "\u0031\u0031\u002E", - "\u2493", "\u0031\u0032\u002E", - "\u2494", "\u0031\u0033\u002E", - "\u2495", "\u0031\u0034\u002E", - "\u2496", "\u0031\u0035\u002E", - "\u2497", "\u0031\u0036\u002E", - "\u2498", "\u0031\u0037\u002E", - "\u2499", "\u0031\u0038\u002E", - "\u249A", "\u0031\u0039\u002E", - "\u249B", "\u0032\u0030\u002E", - "\u249C", "\u0028\u0061\u0029", - "\u249D", "\u0028\u0062\u0029", - "\u249E", "\u0028\u0063\u0029", - "\u249F", "\u0028\u0064\u0029", - "\u24A0", "\u0028\u0065\u0029", - "\u24A1", "\u0028\u0066\u0029", - "\u24A2", "\u0028\u0067\u0029", - "\u24A3", "\u0028\u0068\u0029", - "\u24A4", "\u0028\u0069\u0029", - "\u24A5", "\u0028\u006A\u0029", - "\u24A6", "\u0028\u006B\u0029", - "\u24A7", "\u0028\u006C\u0029", - "\u24A8", "\u0028\u006D\u0029", - "\u24A9", "\u0028\u006E\u0029", - "\u24AA", "\u0028\u006F\u0029", - "\u24AB", "\u0028\u0070\u0029", - "\u24AC", "\u0028\u0071\u0029", - "\u24AD", "\u0028\u0072\u0029", - "\u24AE", "\u0028\u0073\u0029", - "\u24AF", "\u0028\u0074\u0029", - "\u24B0", "\u0028\u0075\u0029", - "\u24B1", "\u0028\u0076\u0029", - "\u24B2", "\u0028\u0077\u0029", - "\u24B3", "\u0028\u0078\u0029", - "\u24B4", "\u0028\u0079\u0029", - "\u24B5", "\u0028\u007A\u0029", - "\u2A0C", "\u222B\u222B\u222B\u222B", - "\u2A74", 
"\u003A\u003A\u003D", - "\u2A75", "\u003D\u003D", - "\u2A76", "\u003D\u003D\u003D", - "\u2E9F", "\u6BCD", - "\u2EF3", "\u9F9F", - "\u2F00", "\u4E00", - "\u2F01", "\u4E28", - "\u2F02", "\u4E36", - "\u2F03", "\u4E3F", - "\u2F04", "\u4E59", - "\u2F05", "\u4E85", - "\u2F06", "\u4E8C", - "\u2F07", "\u4EA0", - "\u2F08", "\u4EBA", - "\u2F09", "\u513F", - "\u2F0A", "\u5165", - "\u2F0B", "\u516B", - "\u2F0C", "\u5182", - "\u2F0D", "\u5196", - "\u2F0E", "\u51AB", - "\u2F0F", "\u51E0", - "\u2F10", "\u51F5", - "\u2F11", "\u5200", - "\u2F12", "\u529B", - "\u2F13", "\u52F9", - "\u2F14", "\u5315", - "\u2F15", "\u531A", - "\u2F16", "\u5338", - "\u2F17", "\u5341", - "\u2F18", "\u535C", - "\u2F19", "\u5369", - "\u2F1A", "\u5382", - "\u2F1B", "\u53B6", - "\u2F1C", "\u53C8", - "\u2F1D", "\u53E3", - "\u2F1E", "\u56D7", - "\u2F1F", "\u571F", - "\u2F20", "\u58EB", - "\u2F21", "\u5902", - "\u2F22", "\u590A", - "\u2F23", "\u5915", - "\u2F24", "\u5927", - "\u2F25", "\u5973", - "\u2F26", "\u5B50", - "\u2F27", "\u5B80", - "\u2F28", "\u5BF8", - "\u2F29", "\u5C0F", - "\u2F2A", "\u5C22", - "\u2F2B", "\u5C38", - "\u2F2C", "\u5C6E", - "\u2F2D", "\u5C71", - "\u2F2E", "\u5DDB", - "\u2F2F", "\u5DE5", - "\u2F30", "\u5DF1", - "\u2F31", "\u5DFE", - "\u2F32", "\u5E72", - "\u2F33", "\u5E7A", - "\u2F34", "\u5E7F", - "\u2F35", "\u5EF4", - "\u2F36", "\u5EFE", - "\u2F37", "\u5F0B", - "\u2F38", "\u5F13", - "\u2F39", "\u5F50", - "\u2F3A", "\u5F61", - "\u2F3B", "\u5F73", - "\u2F3C", "\u5FC3", - "\u2F3D", "\u6208", - "\u2F3E", "\u6236", - "\u2F3F", "\u624B", - "\u2F40", "\u652F", - "\u2F41", "\u6534", - "\u2F42", "\u6587", - "\u2F43", "\u6597", - "\u2F44", "\u65A4", - "\u2F45", "\u65B9", - "\u2F46", "\u65E0", - "\u2F47", "\u65E5", - "\u2F48", "\u66F0", - "\u2F49", "\u6708", - "\u2F4A", "\u6728", - "\u2F4B", "\u6B20", - "\u2F4C", "\u6B62", - "\u2F4D", "\u6B79", - "\u2F4E", "\u6BB3", - "\u2F4F", "\u6BCB", - "\u2F50", "\u6BD4", - "\u2F51", "\u6BDB", - "\u2F52", "\u6C0F", - "\u2F53", "\u6C14", - "\u2F54", "\u6C34", - 
"\u2F55", "\u706B", - "\u2F56", "\u722A", - "\u2F57", "\u7236", - "\u2F58", "\u723B", - "\u2F59", "\u723F", - "\u2F5A", "\u7247", - "\u2F5B", "\u7259", - "\u2F5C", "\u725B", - "\u2F5D", "\u72AC", - "\u2F5E", "\u7384", - "\u2F5F", "\u7389", - "\u2F60", "\u74DC", - "\u2F61", "\u74E6", - "\u2F62", "\u7518", - "\u2F63", "\u751F", - "\u2F64", "\u7528", - "\u2F65", "\u7530", - "\u2F66", "\u758B", - "\u2F67", "\u7592", - "\u2F68", "\u7676", - "\u2F69", "\u767D", - "\u2F6A", "\u76AE", - "\u2F6B", "\u76BF", - "\u2F6C", "\u76EE", - "\u2F6D", "\u77DB", - "\u2F6E", "\u77E2", - "\u2F6F", "\u77F3", - "\u2F70", "\u793A", - "\u2F71", "\u79B8", - "\u2F72", "\u79BE", - "\u2F73", "\u7A74", - "\u2F74", "\u7ACB", - "\u2F75", "\u7AF9", - "\u2F76", "\u7C73", - "\u2F77", "\u7CF8", - "\u2F78", "\u7F36", - "\u2F79", "\u7F51", - "\u2F7A", "\u7F8A", - "\u2F7B", "\u7FBD", - "\u2F7C", "\u8001", - "\u2F7D", "\u800C", - "\u2F7E", "\u8012", - "\u2F7F", "\u8033", - "\u2F80", "\u807F", - "\u2F81", "\u8089", - "\u2F82", "\u81E3", - "\u2F83", "\u81EA", - "\u2F84", "\u81F3", - "\u2F85", "\u81FC", - "\u2F86", "\u820C", - "\u2F87", "\u821B", - "\u2F88", "\u821F", - "\u2F89", "\u826E", - "\u2F8A", "\u8272", - "\u2F8B", "\u8278", - "\u2F8C", "\u864D", - "\u2F8D", "\u866B", - "\u2F8E", "\u8840", - "\u2F8F", "\u884C", - "\u2F90", "\u8863", - "\u2F91", "\u897E", - "\u2F92", "\u898B", - "\u2F93", "\u89D2", - "\u2F94", "\u8A00", - "\u2F95", "\u8C37", - "\u2F96", "\u8C46", - "\u2F97", "\u8C55", - "\u2F98", "\u8C78", - "\u2F99", "\u8C9D", - "\u2F9A", "\u8D64", - "\u2F9B", "\u8D70", - "\u2F9C", "\u8DB3", - "\u2F9D", "\u8EAB", - "\u2F9E", "\u8ECA", - "\u2F9F", "\u8F9B", - "\u2FA0", "\u8FB0", - "\u2FA1", "\u8FB5", - "\u2FA2", "\u9091", - "\u2FA3", "\u9149", - "\u2FA4", "\u91C6", - "\u2FA5", "\u91CC", - "\u2FA6", "\u91D1", - "\u2FA7", "\u9577", - "\u2FA8", "\u9580", - "\u2FA9", "\u961C", - "\u2FAA", "\u96B6", - "\u2FAB", "\u96B9", - "\u2FAC", "\u96E8", - "\u2FAD", "\u9751", - "\u2FAE", "\u975E", - "\u2FAF", "\u9762", 
- "\u2FB0", "\u9769", - "\u2FB1", "\u97CB", - "\u2FB2", "\u97ED", - "\u2FB3", "\u97F3", - "\u2FB4", "\u9801", - "\u2FB5", "\u98A8", - "\u2FB6", "\u98DB", - "\u2FB7", "\u98DF", - "\u2FB8", "\u9996", - "\u2FB9", "\u9999", - "\u2FBA", "\u99AC", - "\u2FBB", "\u9AA8", - "\u2FBC", "\u9AD8", - "\u2FBD", "\u9ADF", - "\u2FBE", "\u9B25", - "\u2FBF", "\u9B2F", - "\u2FC0", "\u9B32", - "\u2FC1", "\u9B3C", - "\u2FC2", "\u9B5A", - "\u2FC3", "\u9CE5", - "\u2FC4", "\u9E75", - "\u2FC5", "\u9E7F", - "\u2FC6", "\u9EA5", - "\u2FC7", "\u9EBB", - "\u2FC8", "\u9EC3", - "\u2FC9", "\u9ECD", - "\u2FCA", "\u9ED1", - "\u2FCB", "\u9EF9", - "\u2FCC", "\u9EFD", - "\u2FCD", "\u9F0E", - "\u2FCE", "\u9F13", - "\u2FCF", "\u9F20", - "\u2FD0", "\u9F3B", - "\u2FD1", "\u9F4A", - "\u2FD2", "\u9F52", - "\u2FD3", "\u9F8D", - "\u2FD4", "\u9F9C", - "\u2FD5", "\u9FA0", - "\u3036", "\u3012", - "\u3038", "\u5341", - "\u3039", "\u5344", - "\u303A", "\u5345", - "\u309B", "\u0020\u3099", - "\u309C", "\u0020\u309A", - "\u3131", "\u1100", - "\u3132", "\u1101", - "\u3133", "\u11AA", - "\u3134", "\u1102", - "\u3135", "\u11AC", - "\u3136", "\u11AD", - "\u3137", "\u1103", - "\u3138", "\u1104", - "\u3139", "\u1105", - "\u313A", "\u11B0", - "\u313B", "\u11B1", - "\u313C", "\u11B2", - "\u313D", "\u11B3", - "\u313E", "\u11B4", - "\u313F", "\u11B5", - "\u3140", "\u111A", - "\u3141", "\u1106", - "\u3142", "\u1107", - "\u3143", "\u1108", - "\u3144", "\u1121", - "\u3145", "\u1109", - "\u3146", "\u110A", - "\u3147", "\u110B", - "\u3148", "\u110C", - "\u3149", "\u110D", - "\u314A", "\u110E", - "\u314B", "\u110F", - "\u314C", "\u1110", - "\u314D", "\u1111", - "\u314E", "\u1112", - "\u314F", "\u1161", - "\u3150", "\u1162", - "\u3151", "\u1163", - "\u3152", "\u1164", - "\u3153", "\u1165", - "\u3154", "\u1166", - "\u3155", "\u1167", - "\u3156", "\u1168", - "\u3157", "\u1169", - "\u3158", "\u116A", - "\u3159", "\u116B", - "\u315A", "\u116C", - "\u315B", "\u116D", - "\u315C", "\u116E", - "\u315D", "\u116F", - "\u315E", "\u1170", - 
"\u315F", "\u1171", - "\u3160", "\u1172", - "\u3161", "\u1173", - "\u3162", "\u1174", - "\u3163", "\u1175", - "\u3164", "\u1160", - "\u3165", "\u1114", - "\u3166", "\u1115", - "\u3167", "\u11C7", - "\u3168", "\u11C8", - "\u3169", "\u11CC", - "\u316A", "\u11CE", - "\u316B", "\u11D3", - "\u316C", "\u11D7", - "\u316D", "\u11D9", - "\u316E", "\u111C", - "\u316F", "\u11DD", - "\u3170", "\u11DF", - "\u3171", "\u111D", - "\u3172", "\u111E", - "\u3173", "\u1120", - "\u3174", "\u1122", - "\u3175", "\u1123", - "\u3176", "\u1127", - "\u3177", "\u1129", - "\u3178", "\u112B", - "\u3179", "\u112C", - "\u317A", "\u112D", - "\u317B", "\u112E", - "\u317C", "\u112F", - "\u317D", "\u1132", - "\u317E", "\u1136", - "\u317F", "\u1140", - "\u3180", "\u1147", - "\u3181", "\u114C", - "\u3182", "\u11F1", - "\u3183", "\u11F2", - "\u3184", "\u1157", - "\u3185", "\u1158", - "\u3186", "\u1159", - "\u3187", "\u1184", - "\u3188", "\u1185", - "\u3189", "\u1188", - "\u318A", "\u1191", - "\u318B", "\u1192", - "\u318C", "\u1194", - "\u318D", "\u119E", - "\u318E", "\u11A1", - "\u3200", "\u0028\u1100\u0029", - "\u3201", "\u0028\u1102\u0029", - "\u3202", "\u0028\u1103\u0029", - "\u3203", "\u0028\u1105\u0029", - "\u3204", "\u0028\u1106\u0029", - "\u3205", "\u0028\u1107\u0029", - "\u3206", "\u0028\u1109\u0029", - "\u3207", "\u0028\u110B\u0029", - "\u3208", "\u0028\u110C\u0029", - "\u3209", "\u0028\u110E\u0029", - "\u320A", "\u0028\u110F\u0029", - "\u320B", "\u0028\u1110\u0029", - "\u320C", "\u0028\u1111\u0029", - "\u320D", "\u0028\u1112\u0029", - "\u320E", "\u0028\u1100\u1161\u0029", - "\u320F", "\u0028\u1102\u1161\u0029", - "\u3210", "\u0028\u1103\u1161\u0029", - "\u3211", "\u0028\u1105\u1161\u0029", - "\u3212", "\u0028\u1106\u1161\u0029", - "\u3213", "\u0028\u1107\u1161\u0029", - "\u3214", "\u0028\u1109\u1161\u0029", - "\u3215", "\u0028\u110B\u1161\u0029", - "\u3216", "\u0028\u110C\u1161\u0029", - "\u3217", "\u0028\u110E\u1161\u0029", - "\u3218", "\u0028\u110F\u1161\u0029", - "\u3219", 
"\u0028\u1110\u1161\u0029", - "\u321A", "\u0028\u1111\u1161\u0029", - "\u321B", "\u0028\u1112\u1161\u0029", - "\u321C", "\u0028\u110C\u116E\u0029", - "\u321D", "\u0028\u110B\u1169\u110C\u1165\u11AB\u0029", - "\u321E", "\u0028\u110B\u1169\u1112\u116E\u0029", - "\u3220", "\u0028\u4E00\u0029", - "\u3221", "\u0028\u4E8C\u0029", - "\u3222", "\u0028\u4E09\u0029", - "\u3223", "\u0028\u56DB\u0029", - "\u3224", "\u0028\u4E94\u0029", - "\u3225", "\u0028\u516D\u0029", - "\u3226", "\u0028\u4E03\u0029", - "\u3227", "\u0028\u516B\u0029", - "\u3228", "\u0028\u4E5D\u0029", - "\u3229", "\u0028\u5341\u0029", - "\u322A", "\u0028\u6708\u0029", - "\u322B", "\u0028\u706B\u0029", - "\u322C", "\u0028\u6C34\u0029", - "\u322D", "\u0028\u6728\u0029", - "\u322E", "\u0028\u91D1\u0029", - "\u322F", "\u0028\u571F\u0029", - "\u3230", "\u0028\u65E5\u0029", - "\u3231", "\u0028\u682A\u0029", - "\u3232", "\u0028\u6709\u0029", - "\u3233", "\u0028\u793E\u0029", - "\u3234", "\u0028\u540D\u0029", - "\u3235", "\u0028\u7279\u0029", - "\u3236", "\u0028\u8CA1\u0029", - "\u3237", "\u0028\u795D\u0029", - "\u3238", "\u0028\u52B4\u0029", - "\u3239", "\u0028\u4EE3\u0029", - "\u323A", "\u0028\u547C\u0029", - "\u323B", "\u0028\u5B66\u0029", - "\u323C", "\u0028\u76E3\u0029", - "\u323D", "\u0028\u4F01\u0029", - "\u323E", "\u0028\u8CC7\u0029", - "\u323F", "\u0028\u5354\u0029", - "\u3240", "\u0028\u796D\u0029", - "\u3241", "\u0028\u4F11\u0029", - "\u3242", "\u0028\u81EA\u0029", - "\u3243", "\u0028\u81F3\u0029", - "\u32C0", "\u0031\u6708", - "\u32C1", "\u0032\u6708", - "\u32C2", "\u0033\u6708", - "\u32C3", "\u0034\u6708", - "\u32C4", "\u0035\u6708", - "\u32C5", "\u0036\u6708", - "\u32C6", "\u0037\u6708", - "\u32C7", "\u0038\u6708", - "\u32C8", "\u0039\u6708", - "\u32C9", "\u0031\u0030\u6708", - "\u32CA", "\u0031\u0031\u6708", - "\u32CB", "\u0031\u0032\u6708", - "\u3358", "\u0030\u70B9", - "\u3359", "\u0031\u70B9", - "\u335A", "\u0032\u70B9", - "\u335B", "\u0033\u70B9", - "\u335C", "\u0034\u70B9", - "\u335D", 
"\u0035\u70B9", - "\u335E", "\u0036\u70B9", - "\u335F", "\u0037\u70B9", - "\u3360", "\u0038\u70B9", - "\u3361", "\u0039\u70B9", - "\u3362", "\u0031\u0030\u70B9", - "\u3363", "\u0031\u0031\u70B9", - "\u3364", "\u0031\u0032\u70B9", - "\u3365", "\u0031\u0033\u70B9", - "\u3366", "\u0031\u0034\u70B9", - "\u3367", "\u0031\u0035\u70B9", - "\u3368", "\u0031\u0036\u70B9", - "\u3369", "\u0031\u0037\u70B9", - "\u336A", "\u0031\u0038\u70B9", - "\u336B", "\u0031\u0039\u70B9", - "\u336C", "\u0032\u0030\u70B9", - "\u336D", "\u0032\u0031\u70B9", - "\u336E", "\u0032\u0032\u70B9", - "\u336F", "\u0032\u0033\u70B9", - "\u3370", "\u0032\u0034\u70B9", - "\u33E0", "\u0031\u65E5", - "\u33E1", "\u0032\u65E5", - "\u33E2", "\u0033\u65E5", - "\u33E3", "\u0034\u65E5", - "\u33E4", "\u0035\u65E5", - "\u33E5", "\u0036\u65E5", - "\u33E6", "\u0037\u65E5", - "\u33E7", "\u0038\u65E5", - "\u33E8", "\u0039\u65E5", - "\u33E9", "\u0031\u0030\u65E5", - "\u33EA", "\u0031\u0031\u65E5", - "\u33EB", "\u0031\u0032\u65E5", - "\u33EC", "\u0031\u0033\u65E5", - "\u33ED", "\u0031\u0034\u65E5", - "\u33EE", "\u0031\u0035\u65E5", - "\u33EF", "\u0031\u0036\u65E5", - "\u33F0", "\u0031\u0037\u65E5", - "\u33F1", "\u0031\u0038\u65E5", - "\u33F2", "\u0031\u0039\u65E5", - "\u33F3", "\u0032\u0030\u65E5", - "\u33F4", "\u0032\u0031\u65E5", - "\u33F5", "\u0032\u0032\u65E5", - "\u33F6", "\u0032\u0033\u65E5", - "\u33F7", "\u0032\u0034\u65E5", - "\u33F8", "\u0032\u0035\u65E5", - "\u33F9", "\u0032\u0036\u65E5", - "\u33FA", "\u0032\u0037\u65E5", - "\u33FB", "\u0032\u0038\u65E5", - "\u33FC", "\u0032\u0039\u65E5", - "\u33FD", "\u0033\u0030\u65E5", - "\u33FE", "\u0033\u0031\u65E5", - "\uFB00", "\u0066\u0066", - "\uFB01", "\u0066\u0069", - "\uFB02", "\u0066\u006C", - "\uFB03", "\u0066\u0066\u0069", - "\uFB04", "\u0066\u0066\u006C", - "\uFB05", "\u017F\u0074", - "\uFB06", "\u0073\u0074", - "\uFB13", "\u0574\u0576", - "\uFB14", "\u0574\u0565", - "\uFB15", "\u0574\u056B", - "\uFB16", "\u057E\u0576", - "\uFB17", "\u0574\u056D", - "\uFB4F", 
"\u05D0\u05DC", - "\uFB50", "\u0671", - "\uFB51", "\u0671", - "\uFB52", "\u067B", - "\uFB53", "\u067B", - "\uFB54", "\u067B", - "\uFB55", "\u067B", - "\uFB56", "\u067E", - "\uFB57", "\u067E", - "\uFB58", "\u067E", - "\uFB59", "\u067E", - "\uFB5A", "\u0680", - "\uFB5B", "\u0680", - "\uFB5C", "\u0680", - "\uFB5D", "\u0680", - "\uFB5E", "\u067A", - "\uFB5F", "\u067A", - "\uFB60", "\u067A", - "\uFB61", "\u067A", - "\uFB62", "\u067F", - "\uFB63", "\u067F", - "\uFB64", "\u067F", - "\uFB65", "\u067F", - "\uFB66", "\u0679", - "\uFB67", "\u0679", - "\uFB68", "\u0679", - "\uFB69", "\u0679", - "\uFB6A", "\u06A4", - "\uFB6B", "\u06A4", - "\uFB6C", "\u06A4", - "\uFB6D", "\u06A4", - "\uFB6E", "\u06A6", - "\uFB6F", "\u06A6", - "\uFB70", "\u06A6", - "\uFB71", "\u06A6", - "\uFB72", "\u0684", - "\uFB73", "\u0684", - "\uFB74", "\u0684", - "\uFB75", "\u0684", - "\uFB76", "\u0683", - "\uFB77", "\u0683", - "\uFB78", "\u0683", - "\uFB79", "\u0683", - "\uFB7A", "\u0686", - "\uFB7B", "\u0686", - "\uFB7C", "\u0686", - "\uFB7D", "\u0686", - "\uFB7E", "\u0687", - "\uFB7F", "\u0687", - "\uFB80", "\u0687", - "\uFB81", "\u0687", - "\uFB82", "\u068D", - "\uFB83", "\u068D", - "\uFB84", "\u068C", - "\uFB85", "\u068C", - "\uFB86", "\u068E", - "\uFB87", "\u068E", - "\uFB88", "\u0688", - "\uFB89", "\u0688", - "\uFB8A", "\u0698", - "\uFB8B", "\u0698", - "\uFB8C", "\u0691", - "\uFB8D", "\u0691", - "\uFB8E", "\u06A9", - "\uFB8F", "\u06A9", - "\uFB90", "\u06A9", - "\uFB91", "\u06A9", - "\uFB92", "\u06AF", - "\uFB93", "\u06AF", - "\uFB94", "\u06AF", - "\uFB95", "\u06AF", - "\uFB96", "\u06B3", - "\uFB97", "\u06B3", - "\uFB98", "\u06B3", - "\uFB99", "\u06B3", - "\uFB9A", "\u06B1", - "\uFB9B", "\u06B1", - "\uFB9C", "\u06B1", - "\uFB9D", "\u06B1", - "\uFB9E", "\u06BA", - "\uFB9F", "\u06BA", - "\uFBA0", "\u06BB", - "\uFBA1", "\u06BB", - "\uFBA2", "\u06BB", - "\uFBA3", "\u06BB", - "\uFBA4", "\u06C0", - "\uFBA5", "\u06C0", - "\uFBA6", "\u06C1", - "\uFBA7", "\u06C1", - "\uFBA8", "\u06C1", - "\uFBA9", "\u06C1", - 
"\uFBAA", "\u06BE", - "\uFBAB", "\u06BE", - "\uFBAC", "\u06BE", - "\uFBAD", "\u06BE", - "\uFBAE", "\u06D2", - "\uFBAF", "\u06D2", - "\uFBB0", "\u06D3", - "\uFBB1", "\u06D3", - "\uFBD3", "\u06AD", - "\uFBD4", "\u06AD", - "\uFBD5", "\u06AD", - "\uFBD6", "\u06AD", - "\uFBD7", "\u06C7", - "\uFBD8", "\u06C7", - "\uFBD9", "\u06C6", - "\uFBDA", "\u06C6", - "\uFBDB", "\u06C8", - "\uFBDC", "\u06C8", - "\uFBDD", "\u0677", - "\uFBDE", "\u06CB", - "\uFBDF", "\u06CB", - "\uFBE0", "\u06C5", - "\uFBE1", "\u06C5", - "\uFBE2", "\u06C9", - "\uFBE3", "\u06C9", - "\uFBE4", "\u06D0", - "\uFBE5", "\u06D0", - "\uFBE6", "\u06D0", - "\uFBE7", "\u06D0", - "\uFBE8", "\u0649", - "\uFBE9", "\u0649", - "\uFBEA", "\u0626\u0627", - "\uFBEB", "\u0626\u0627", - "\uFBEC", "\u0626\u06D5", - "\uFBED", "\u0626\u06D5", - "\uFBEE", "\u0626\u0648", - "\uFBEF", "\u0626\u0648", - "\uFBF0", "\u0626\u06C7", - "\uFBF1", "\u0626\u06C7", - "\uFBF2", "\u0626\u06C6", - "\uFBF3", "\u0626\u06C6", - "\uFBF4", "\u0626\u06C8", - "\uFBF5", "\u0626\u06C8", - "\uFBF6", "\u0626\u06D0", - "\uFBF7", "\u0626\u06D0", - "\uFBF8", "\u0626\u06D0", - "\uFBF9", "\u0626\u0649", - "\uFBFA", "\u0626\u0649", - "\uFBFB", "\u0626\u0649", - "\uFBFC", "\u06CC", - "\uFBFD", "\u06CC", - "\uFBFE", "\u06CC", - "\uFBFF", "\u06CC", - "\uFC00", "\u0626\u062C", - "\uFC01", "\u0626\u062D", - "\uFC02", "\u0626\u0645", - "\uFC03", "\u0626\u0649", - "\uFC04", "\u0626\u064A", - "\uFC05", "\u0628\u062C", - "\uFC06", "\u0628\u062D", - "\uFC07", "\u0628\u062E", - "\uFC08", "\u0628\u0645", - "\uFC09", "\u0628\u0649", - "\uFC0A", "\u0628\u064A", - "\uFC0B", "\u062A\u062C", - "\uFC0C", "\u062A\u062D", - "\uFC0D", "\u062A\u062E", - "\uFC0E", "\u062A\u0645", - "\uFC0F", "\u062A\u0649", - "\uFC10", "\u062A\u064A", - "\uFC11", "\u062B\u062C", - "\uFC12", "\u062B\u0645", - "\uFC13", "\u062B\u0649", - "\uFC14", "\u062B\u064A", - "\uFC15", "\u062C\u062D", - "\uFC16", "\u062C\u0645", - "\uFC17", "\u062D\u062C", - "\uFC18", "\u062D\u0645", - "\uFC19", "\u062E\u062C", 
- "\uFC1A", "\u062E\u062D", - "\uFC1B", "\u062E\u0645", - "\uFC1C", "\u0633\u062C", - "\uFC1D", "\u0633\u062D", - "\uFC1E", "\u0633\u062E", - "\uFC1F", "\u0633\u0645", - "\uFC20", "\u0635\u062D", - "\uFC21", "\u0635\u0645", - "\uFC22", "\u0636\u062C", - "\uFC23", "\u0636\u062D", - "\uFC24", "\u0636\u062E", - "\uFC25", "\u0636\u0645", - "\uFC26", "\u0637\u062D", - "\uFC27", "\u0637\u0645", - "\uFC28", "\u0638\u0645", - "\uFC29", "\u0639\u062C", - "\uFC2A", "\u0639\u0645", - "\uFC2B", "\u063A\u062C", - "\uFC2C", "\u063A\u0645", - "\uFC2D", "\u0641\u062C", - "\uFC2E", "\u0641\u062D", - "\uFC2F", "\u0641\u062E", - "\uFC30", "\u0641\u0645", - "\uFC31", "\u0641\u0649", - "\uFC32", "\u0641\u064A", - "\uFC33", "\u0642\u062D", - "\uFC34", "\u0642\u0645", - "\uFC35", "\u0642\u0649", - "\uFC36", "\u0642\u064A", - "\uFC37", "\u0643\u0627", - "\uFC38", "\u0643\u062C", - "\uFC39", "\u0643\u062D", - "\uFC3A", "\u0643\u062E", - "\uFC3B", "\u0643\u0644", - "\uFC3C", "\u0643\u0645", - "\uFC3D", "\u0643\u0649", - "\uFC3E", "\u0643\u064A", - "\uFC3F", "\u0644\u062C", - "\uFC40", "\u0644\u062D", - "\uFC41", "\u0644\u062E", - "\uFC42", "\u0644\u0645", - "\uFC43", "\u0644\u0649", - "\uFC44", "\u0644\u064A", - "\uFC45", "\u0645\u062C", - "\uFC46", "\u0645\u062D", - "\uFC47", "\u0645\u062E", - "\uFC48", "\u0645\u0645", - "\uFC49", "\u0645\u0649", - "\uFC4A", "\u0645\u064A", - "\uFC4B", "\u0646\u062C", - "\uFC4C", "\u0646\u062D", - "\uFC4D", "\u0646\u062E", - "\uFC4E", "\u0646\u0645", - "\uFC4F", "\u0646\u0649", - "\uFC50", "\u0646\u064A", - "\uFC51", "\u0647\u062C", - "\uFC52", "\u0647\u0645", - "\uFC53", "\u0647\u0649", - "\uFC54", "\u0647\u064A", - "\uFC55", "\u064A\u062C", - "\uFC56", "\u064A\u062D", - "\uFC57", "\u064A\u062E", - "\uFC58", "\u064A\u0645", - "\uFC59", "\u064A\u0649", - "\uFC5A", "\u064A\u064A", - "\uFC5B", "\u0630\u0670", - "\uFC5C", "\u0631\u0670", - "\uFC5D", "\u0649\u0670", - "\uFC5E", "\u0020\u064C\u0651", - "\uFC5F", "\u0020\u064D\u0651", - "\uFC60", 
"\u0020\u064E\u0651", - "\uFC61", "\u0020\u064F\u0651", - "\uFC62", "\u0020\u0650\u0651", - "\uFC63", "\u0020\u0651\u0670", - "\uFC64", "\u0626\u0631", - "\uFC65", "\u0626\u0632", - "\uFC66", "\u0626\u0645", - "\uFC67", "\u0626\u0646", - "\uFC68", "\u0626\u0649", - "\uFC69", "\u0626\u064A", - "\uFC6A", "\u0628\u0631", - "\uFC6B", "\u0628\u0632", - "\uFC6C", "\u0628\u0645", - "\uFC6D", "\u0628\u0646", - "\uFC6E", "\u0628\u0649", - "\uFC6F", "\u0628\u064A", - "\uFC70", "\u062A\u0631", - "\uFC71", "\u062A\u0632", - "\uFC72", "\u062A\u0645", - "\uFC73", "\u062A\u0646", - "\uFC74", "\u062A\u0649", - "\uFC75", "\u062A\u064A", - "\uFC76", "\u062B\u0631", - "\uFC77", "\u062B\u0632", - "\uFC78", "\u062B\u0645", - "\uFC79", "\u062B\u0646", - "\uFC7A", "\u062B\u0649", - "\uFC7B", "\u062B\u064A", - "\uFC7C", "\u0641\u0649", - "\uFC7D", "\u0641\u064A", - "\uFC7E", "\u0642\u0649", - "\uFC7F", "\u0642\u064A", - "\uFC80", "\u0643\u0627", - "\uFC81", "\u0643\u0644", - "\uFC82", "\u0643\u0645", - "\uFC83", "\u0643\u0649", - "\uFC84", "\u0643\u064A", - "\uFC85", "\u0644\u0645", - "\uFC86", "\u0644\u0649", - "\uFC87", "\u0644\u064A", - "\uFC88", "\u0645\u0627", - "\uFC89", "\u0645\u0645", - "\uFC8A", "\u0646\u0631", - "\uFC8B", "\u0646\u0632", - "\uFC8C", "\u0646\u0645", - "\uFC8D", "\u0646\u0646", - "\uFC8E", "\u0646\u0649", - "\uFC8F", "\u0646\u064A", - "\uFC90", "\u0649\u0670", - "\uFC91", "\u064A\u0631", - "\uFC92", "\u064A\u0632", - "\uFC93", "\u064A\u0645", - "\uFC94", "\u064A\u0646", - "\uFC95", "\u064A\u0649", - "\uFC96", "\u064A\u064A", - "\uFC97", "\u0626\u062C", - "\uFC98", "\u0626\u062D", - "\uFC99", "\u0626\u062E", - "\uFC9A", "\u0626\u0645", - "\uFC9B", "\u0626\u0647", - "\uFC9C", "\u0628\u062C", - "\uFC9D", "\u0628\u062D", - "\uFC9E", "\u0628\u062E", - "\uFC9F", "\u0628\u0645", - "\uFCA0", "\u0628\u0647", - "\uFCA1", "\u062A\u062C", - "\uFCA2", "\u062A\u062D", - "\uFCA3", "\u062A\u062E", - "\uFCA4", "\u062A\u0645", - "\uFCA5", "\u062A\u0647", - "\uFCA6", "\u062B\u0645", 
- "\uFCA7", "\u062C\u062D", - "\uFCA8", "\u062C\u0645", - "\uFCA9", "\u062D\u062C", - "\uFCAA", "\u062D\u0645", - "\uFCAB", "\u062E\u062C", - "\uFCAC", "\u062E\u0645", - "\uFCAD", "\u0633\u062C", - "\uFCAE", "\u0633\u062D", - "\uFCAF", "\u0633\u062E", - "\uFCB0", "\u0633\u0645", - "\uFCB1", "\u0635\u062D", - "\uFCB2", "\u0635\u062E", - "\uFCB3", "\u0635\u0645", - "\uFCB4", "\u0636\u062C", - "\uFCB5", "\u0636\u062D", - "\uFCB6", "\u0636\u062E", - "\uFCB7", "\u0636\u0645", - "\uFCB8", "\u0637\u062D", - "\uFCB9", "\u0638\u0645", - "\uFCBA", "\u0639\u062C", - "\uFCBB", "\u0639\u0645", - "\uFCBC", "\u063A\u062C", - "\uFCBD", "\u063A\u0645", - "\uFCBE", "\u0641\u062C", - "\uFCBF", "\u0641\u062D", - "\uFCC0", "\u0641\u062E", - "\uFCC1", "\u0641\u0645", - "\uFCC2", "\u0642\u062D", - "\uFCC3", "\u0642\u0645", - "\uFCC4", "\u0643\u062C", - "\uFCC5", "\u0643\u062D", - "\uFCC6", "\u0643\u062E", - "\uFCC7", "\u0643\u0644", - "\uFCC8", "\u0643\u0645", - "\uFCC9", "\u0644\u062C", - "\uFCCA", "\u0644\u062D", - "\uFCCB", "\u0644\u062E", - "\uFCCC", "\u0644\u0645", - "\uFCCD", "\u0644\u0647", - "\uFCCE", "\u0645\u062C", - "\uFCCF", "\u0645\u062D", - "\uFCD0", "\u0645\u062E", - "\uFCD1", "\u0645\u0645", - "\uFCD2", "\u0646\u062C", - "\uFCD3", "\u0646\u062D", - "\uFCD4", "\u0646\u062E", - "\uFCD5", "\u0646\u0645", - "\uFCD6", "\u0646\u0647", - "\uFCD7", "\u0647\u062C", - "\uFCD8", "\u0647\u0645", - "\uFCD9", "\u0647\u0670", - "\uFCDA", "\u064A\u062C", - "\uFCDB", "\u064A\u062D", - "\uFCDC", "\u064A\u062E", - "\uFCDD", "\u064A\u0645", - "\uFCDE", "\u064A\u0647", - "\uFCDF", "\u0626\u0645", - "\uFCE0", "\u0626\u0647", - "\uFCE1", "\u0628\u0645", - "\uFCE2", "\u0628\u0647", - "\uFCE3", "\u062A\u0645", - "\uFCE4", "\u062A\u0647", - "\uFCE5", "\u062B\u0645", - "\uFCE6", "\u062B\u0647", - "\uFCE7", "\u0633\u0645", - "\uFCE8", "\u0633\u0647", - "\uFCE9", "\u0634\u0645", - "\uFCEA", "\u0634\u0647", - "\uFCEB", "\u0643\u0644", - "\uFCEC", "\u0643\u0645", - "\uFCED", "\u0644\u0645", - "\uFCEE", 
"\u0646\u0645", - "\uFCEF", "\u0646\u0647", - "\uFCF0", "\u064A\u0645", - "\uFCF1", "\u064A\u0647", - "\uFCF2", "\u0640\u064E\u0651", - "\uFCF3", "\u0640\u064F\u0651", - "\uFCF4", "\u0640\u0650\u0651", - "\uFCF5", "\u0637\u0649", - "\uFCF6", "\u0637\u064A", - "\uFCF7", "\u0639\u0649", - "\uFCF8", "\u0639\u064A", - "\uFCF9", "\u063A\u0649", - "\uFCFA", "\u063A\u064A", - "\uFCFB", "\u0633\u0649", - "\uFCFC", "\u0633\u064A", - "\uFCFD", "\u0634\u0649", - "\uFCFE", "\u0634\u064A", - "\uFCFF", "\u062D\u0649", - "\uFD00", "\u062D\u064A", - "\uFD01", "\u062C\u0649", - "\uFD02", "\u062C\u064A", - "\uFD03", "\u062E\u0649", - "\uFD04", "\u062E\u064A", - "\uFD05", "\u0635\u0649", - "\uFD06", "\u0635\u064A", - "\uFD07", "\u0636\u0649", - "\uFD08", "\u0636\u064A", - "\uFD09", "\u0634\u062C", - "\uFD0A", "\u0634\u062D", - "\uFD0B", "\u0634\u062E", - "\uFD0C", "\u0634\u0645", - "\uFD0D", "\u0634\u0631", - "\uFD0E", "\u0633\u0631", - "\uFD0F", "\u0635\u0631", - "\uFD10", "\u0636\u0631", - "\uFD11", "\u0637\u0649", - "\uFD12", "\u0637\u064A", - "\uFD13", "\u0639\u0649", - "\uFD14", "\u0639\u064A", - "\uFD15", "\u063A\u0649", - "\uFD16", "\u063A\u064A", - "\uFD17", "\u0633\u0649", - "\uFD18", "\u0633\u064A", - "\uFD19", "\u0634\u0649", - "\uFD1A", "\u0634\u064A", - "\uFD1B", "\u062D\u0649", - "\uFD1C", "\u062D\u064A", - "\uFD1D", "\u062C\u0649", - "\uFD1E", "\u062C\u064A", - "\uFD1F", "\u062E\u0649", - "\uFD20", "\u062E\u064A", - "\uFD21", "\u0635\u0649", - "\uFD22", "\u0635\u064A", - "\uFD23", "\u0636\u0649", - "\uFD24", "\u0636\u064A", - "\uFD25", "\u0634\u062C", - "\uFD26", "\u0634\u062D", - "\uFD27", "\u0634\u062E", - "\uFD28", "\u0634\u0645", - "\uFD29", "\u0634\u0631", - "\uFD2A", "\u0633\u0631", - "\uFD2B", "\u0635\u0631", - "\uFD2C", "\u0636\u0631", - "\uFD2D", "\u0634\u062C", - "\uFD2E", "\u0634\u062D", - "\uFD2F", "\u0634\u062E", - "\uFD30", "\u0634\u0645", - "\uFD31", "\u0633\u0647", - "\uFD32", "\u0634\u0647", - "\uFD33", "\u0637\u0645", - "\uFD34", "\u0633\u062C", - 
"\uFD35", "\u0633\u062D", - "\uFD36", "\u0633\u062E", - "\uFD37", "\u0634\u062C", - "\uFD38", "\u0634\u062D", - "\uFD39", "\u0634\u062E", - "\uFD3A", "\u0637\u0645", - "\uFD3B", "\u0638\u0645", - "\uFD3C", "\u0627\u064B", - "\uFD3D", "\u0627\u064B", - "\uFD50", "\u062A\u062C\u0645", - "\uFD51", "\u062A\u062D\u062C", - "\uFD52", "\u062A\u062D\u062C", - "\uFD53", "\u062A\u062D\u0645", - "\uFD54", "\u062A\u062E\u0645", - "\uFD55", "\u062A\u0645\u062C", - "\uFD56", "\u062A\u0645\u062D", - "\uFD57", "\u062A\u0645\u062E", - "\uFD58", "\u062C\u0645\u062D", - "\uFD59", "\u062C\u0645\u062D", - "\uFD5A", "\u062D\u0645\u064A", - "\uFD5B", "\u062D\u0645\u0649", - "\uFD5C", "\u0633\u062D\u062C", - "\uFD5D", "\u0633\u062C\u062D", - "\uFD5E", "\u0633\u062C\u0649", - "\uFD5F", "\u0633\u0645\u062D", - "\uFD60", "\u0633\u0645\u062D", - "\uFD61", "\u0633\u0645\u062C", - "\uFD62", "\u0633\u0645\u0645", - "\uFD63", "\u0633\u0645\u0645", - "\uFD64", "\u0635\u062D\u062D", - "\uFD65", "\u0635\u062D\u062D", - "\uFD66", "\u0635\u0645\u0645", - "\uFD67", "\u0634\u062D\u0645", - "\uFD68", "\u0634\u062D\u0645", - "\uFD69", "\u0634\u062C\u064A", - "\uFD6A", "\u0634\u0645\u062E", - "\uFD6B", "\u0634\u0645\u062E", - "\uFD6C", "\u0634\u0645\u0645", - "\uFD6D", "\u0634\u0645\u0645", - "\uFD6E", "\u0636\u062D\u0649", - "\uFD6F", "\u0636\u062E\u0645", - "\uFD70", "\u0636\u062E\u0645", - "\uFD71", "\u0637\u0645\u062D", - "\uFD72", "\u0637\u0645\u062D", - "\uFD73", "\u0637\u0645\u0645", - "\uFD74", "\u0637\u0645\u064A", - "\uFD75", "\u0639\u062C\u0645", - "\uFD76", "\u0639\u0645\u0645", - "\uFD77", "\u0639\u0645\u0645", - "\uFD78", "\u0639\u0645\u0649", - "\uFD79", "\u063A\u0645\u0645", - "\uFD7A", "\u063A\u0645\u064A", - "\uFD7B", "\u063A\u0645\u0649", - "\uFD7C", "\u0641\u062E\u0645", - "\uFD7D", "\u0641\u062E\u0645", - "\uFD7E", "\u0642\u0645\u062D", - "\uFD7F", "\u0642\u0645\u0645", - "\uFD80", "\u0644\u062D\u0645", - "\uFD81", "\u0644\u062D\u064A", - "\uFD82", "\u0644\u062D\u0649", - "\uFD83", 
"\u0644\u062C\u062C", - "\uFD84", "\u0644\u062C\u062C", - "\uFD85", "\u0644\u062E\u0645", - "\uFD86", "\u0644\u062E\u0645", - "\uFD87", "\u0644\u0645\u062D", - "\uFD88", "\u0644\u0645\u062D", - "\uFD89", "\u0645\u062D\u062C", - "\uFD8A", "\u0645\u062D\u0645", - "\uFD8B", "\u0645\u062D\u064A", - "\uFD8C", "\u0645\u062C\u062D", - "\uFD8D", "\u0645\u062C\u0645", - "\uFD8E", "\u0645\u062E\u062C", - "\uFD8F", "\u0645\u062E\u0645", - "\uFD92", "\u0645\u062C\u062E", - "\uFD93", "\u0647\u0645\u062C", - "\uFD94", "\u0647\u0645\u0645", - "\uFD95", "\u0646\u062D\u0645", - "\uFD96", "\u0646\u062D\u0649", - "\uFD97", "\u0646\u062C\u0645", - "\uFD98", "\u0646\u062C\u0645", - "\uFD99", "\u0646\u062C\u0649", - "\uFD9A", "\u0646\u0645\u064A", - "\uFD9B", "\u0646\u0645\u0649", - "\uFD9C", "\u064A\u0645\u0645", - "\uFD9D", "\u064A\u0645\u0645", - "\uFD9E", "\u0628\u062E\u064A", - "\uFD9F", "\u062A\u062C\u064A", - "\uFDA0", "\u062A\u062C\u0649", - "\uFDA1", "\u062A\u062E\u064A", - "\uFDA2", "\u062A\u062E\u0649", - "\uFDA3", "\u062A\u0645\u064A", - "\uFDA4", "\u062A\u0645\u0649", - "\uFDA5", "\u062C\u0645\u064A", - "\uFDA6", "\u062C\u062D\u0649", - "\uFDA7", "\u062C\u0645\u0649", - "\uFDA8", "\u0633\u062E\u0649", - "\uFDA9", "\u0635\u062D\u064A", - "\uFDAA", "\u0634\u062D\u064A", - "\uFDAB", "\u0636\u062D\u064A", - "\uFDAC", "\u0644\u062C\u064A", - "\uFDAD", "\u0644\u0645\u064A", - "\uFDAE", "\u064A\u062D\u064A", - "\uFDAF", "\u064A\u062C\u064A", - "\uFDB0", "\u064A\u0645\u064A", - "\uFDB1", "\u0645\u0645\u064A", - "\uFDB2", "\u0642\u0645\u064A", - "\uFDB3", "\u0646\u062D\u064A", - "\uFDB4", "\u0642\u0645\u062D", - "\uFDB5", "\u0644\u062D\u0645", - "\uFDB6", "\u0639\u0645\u064A", - "\uFDB7", "\u0643\u0645\u064A", - "\uFDB8", "\u0646\u062C\u062D", - "\uFDB9", "\u0645\u062E\u064A", - "\uFDBA", "\u0644\u062C\u0645", - "\uFDBB", "\u0643\u0645\u0645", - "\uFDBC", "\u0644\u062C\u0645", - "\uFDBD", "\u0646\u062C\u062D", - "\uFDBE", "\u062C\u062D\u064A", - "\uFDBF", "\u062D\u062C\u064A", - 
"\uFDC0", "\u0645\u062C\u064A", - "\uFDC1", "\u0641\u0645\u064A", - "\uFDC2", "\u0628\u062D\u064A", - "\uFDC3", "\u0643\u0645\u0645", - "\uFDC4", "\u0639\u062C\u0645", - "\uFDC5", "\u0635\u0645\u0645", - "\uFDC6", "\u0633\u062E\u064A", - "\uFDC7", "\u0646\u062C\u064A", - "\uFE49", "\u203E", - "\uFE4A", "\u203E", - "\uFE4B", "\u203E", - "\uFE4C", "\u203E", - "\uFE4D", "\u005F", - "\uFE4E", "\u005F", - "\uFE4F", "\u005F", - "\uFE80", "\u0621", - "\uFE81", "\u0622", - "\uFE82", "\u0622", - "\uFE83", "\u0623", - "\uFE84", "\u0623", - "\uFE85", "\u0624", - "\uFE86", "\u0624", - "\uFE87", "\u0625", - "\uFE88", "\u0625", - "\uFE89", "\u0626", - "\uFE8A", "\u0626", - "\uFE8B", "\u0626", - "\uFE8C", "\u0626", - "\uFE8D", "\u0627", - "\uFE8E", "\u0627", - "\uFE8F", "\u0628", - "\uFE90", "\u0628", - "\uFE91", "\u0628", - "\uFE92", "\u0628", - "\uFE93", "\u0629", - "\uFE94", "\u0629", - "\uFE95", "\u062A", - "\uFE96", "\u062A", - "\uFE97", "\u062A", - "\uFE98", "\u062A", - "\uFE99", "\u062B", - "\uFE9A", "\u062B", - "\uFE9B", "\u062B", - "\uFE9C", "\u062B", - "\uFE9D", "\u062C", - "\uFE9E", "\u062C", - "\uFE9F", "\u062C", - "\uFEA0", "\u062C", - "\uFEA1", "\u062D", - "\uFEA2", "\u062D", - "\uFEA3", "\u062D", - "\uFEA4", "\u062D", - "\uFEA5", "\u062E", - "\uFEA6", "\u062E", - "\uFEA7", "\u062E", - "\uFEA8", "\u062E", - "\uFEA9", "\u062F", - "\uFEAA", "\u062F", - "\uFEAB", "\u0630", - "\uFEAC", "\u0630", - "\uFEAD", "\u0631", - "\uFEAE", "\u0631", - "\uFEAF", "\u0632", - "\uFEB0", "\u0632", - "\uFEB1", "\u0633", - "\uFEB2", "\u0633", - "\uFEB3", "\u0633", - "\uFEB4", "\u0633", - "\uFEB5", "\u0634", - "\uFEB6", "\u0634", - "\uFEB7", "\u0634", - "\uFEB8", "\u0634", - "\uFEB9", "\u0635", - "\uFEBA", "\u0635", - "\uFEBB", "\u0635", - "\uFEBC", "\u0635", - "\uFEBD", "\u0636", - "\uFEBE", "\u0636", - "\uFEBF", "\u0636", - "\uFEC0", "\u0636", - "\uFEC1", "\u0637", - "\uFEC2", "\u0637", - "\uFEC3", "\u0637", - "\uFEC4", "\u0637", - "\uFEC5", "\u0638", - "\uFEC6", "\u0638", - "\uFEC7", 
"\u0638", - "\uFEC8", "\u0638", - "\uFEC9", "\u0639", - "\uFECA", "\u0639", - "\uFECB", "\u0639", - "\uFECC", "\u0639", - "\uFECD", "\u063A", - "\uFECE", "\u063A", - "\uFECF", "\u063A", - "\uFED0", "\u063A", - "\uFED1", "\u0641", - "\uFED2", "\u0641", - "\uFED3", "\u0641", - "\uFED4", "\u0641", - "\uFED5", "\u0642", - "\uFED6", "\u0642", - "\uFED7", "\u0642", - "\uFED8", "\u0642", - "\uFED9", "\u0643", - "\uFEDA", "\u0643", - "\uFEDB", "\u0643", - "\uFEDC", "\u0643", - "\uFEDD", "\u0644", - "\uFEDE", "\u0644", - "\uFEDF", "\u0644", - "\uFEE0", "\u0644", - "\uFEE1", "\u0645", - "\uFEE2", "\u0645", - "\uFEE3", "\u0645", - "\uFEE4", "\u0645", - "\uFEE5", "\u0646", - "\uFEE6", "\u0646", - "\uFEE7", "\u0646", - "\uFEE8", "\u0646", - "\uFEE9", "\u0647", - "\uFEEA", "\u0647", - "\uFEEB", "\u0647", - "\uFEEC", "\u0647", - "\uFEED", "\u0648", - "\uFEEE", "\u0648", - "\uFEEF", "\u0649", - "\uFEF0", "\u0649", - "\uFEF1", "\u064A", - "\uFEF2", "\u064A", - "\uFEF3", "\u064A", - "\uFEF4", "\u064A", - "\uFEF5", "\u0644\u0622", - "\uFEF6", "\u0644\u0622", - "\uFEF7", "\u0644\u0623", - "\uFEF8", "\u0644\u0623", - "\uFEF9", "\u0644\u0625", - "\uFEFA", "\u0644\u0625", - "\uFEFB", "\u0644\u0627", - "\uFEFC", "\u0644\u0627", - ]; -}); - -function reverseIfRtl(chars) { - const charsLength = chars.length; - // Reverse an arabic ligature. 
- if (charsLength <= 1 || !isRTLRangeFor(chars.charCodeAt(0))) { - return chars; - } - const buf = []; - for (let ii = charsLength - 1; ii >= 0; ii--) { - buf.push(chars[ii]); - } - return buf.join(""); -} - const SpecialCharRegExp = new RegExp("^(\\s)|(\\p{Mn})|(\\p{Cf})$", "u"); const CategoryCache = new Map(); @@ -1665,9 +268,7 @@ function clearUnicodeCaches() { export { clearUnicodeCaches, getCharUnicodeCategory, - getNormalizedUnicodes, getUnicodeForGlyph, getUnicodeRangeFor, mapSpecialUnicodeValues, - reverseIfRtl, }; diff --git a/src/core/worker.js b/src/core/worker.js index 13a1af4e8..1056c9690 100644 --- a/src/core/worker.js +++ b/src/core/worker.js @@ -745,7 +745,7 @@ class WorkerMessageHandler { }); handler.on("GetTextContent", function (data, sink) { - const { pageIndex, includeMarkedContent } = data; + const { pageIndex, includeMarkedContent, disableNormalization } = data; pdfManager.getPage(pageIndex).then(function (page) { const task = new WorkerTask("GetTextContent: page " + pageIndex); @@ -760,6 +760,7 @@ class WorkerMessageHandler { task, sink, includeMarkedContent, + disableNormalization, }) .then( function () { diff --git a/src/display/api.js b/src/display/api.js index de5ad9e6c..dba9fc826 100644 --- a/src/display/api.js +++ b/src/display/api.js @@ -1122,6 +1122,8 @@ class PDFDocumentProxy { * @typedef {Object} getTextContentParameters * @property {boolean} [includeMarkedContent] - When true include marked * content items in the items array of TextContent. The default is `false`. + * @property {boolean} [disableNormalization] - When true the text is *not* + * normalized in the worker-thread. The default is `false`. */ /** @@ -1598,7 +1600,10 @@ class PDFPageProxy { * @param {getTextContentParameters} params - getTextContent parameters. * @returns {ReadableStream} Stream for reading text content chunks. 
*/ - streamTextContent({ includeMarkedContent = false } = {}) { + streamTextContent({ + includeMarkedContent = false, + disableNormalization = false, + } = {}) { const TEXT_CONTENT_CHUNK_SIZE = 100; return this._transport.messageHandler.sendWithStream( @@ -1606,6 +1611,7 @@ class PDFPageProxy { { pageIndex: this._pageIndex, includeMarkedContent: includeMarkedContent === true, + disableNormalization: disableNormalization === true, }, { highWaterMark: TEXT_CONTENT_CHUNK_SIZE, diff --git a/src/pdf.js b/src/pdf.js index dcad1a231..e2162ee11 100644 --- a/src/pdf.js +++ b/src/pdf.js @@ -35,6 +35,7 @@ import { FeatureTest, InvalidPDFException, MissingPDFException, + normalizeUnicode, OPS, PasswordResponses, PermissionFlag, @@ -100,6 +101,7 @@ export { isPdfFile, loadScript, MissingPDFException, + normalizeUnicode, OPS, PasswordResponses, PDFDataRangeTransport, diff --git a/src/shared/util.js b/src/shared/util.js index 873e26e0c..756cff9aa 100644 --- a/src/shared/util.js +++ b/src/shared/util.js @@ -1026,6 +1026,25 @@ function createPromiseCapability() { return capability; } +let NormalizeRegex = null; +let NormalizationMap = null; +function normalizeUnicode(str) { + if (!NormalizeRegex) { + // In order to generate the following regex: + // - create a PDF containing all the chars in the range 0000-FFFF with + // a NFKC which is different of the char. + // - copy and paste all those chars and get the ones where NFKC is + // required. + // It appears that most the chars here contain some ligatures. + NormalizeRegex = + /([\u00a0\u00b5\u037e\u0eb3\u2000-\u200a\u202f\u2126\ufb00-\ufb04\ufb06\ufb20-\ufb36\ufb38-\ufb3c\ufb3e\ufb40-\ufb41\ufb43-\ufb44\ufb46-\ufba1\ufba4-\ufba9\ufbae-\ufbb1\ufbd3-\ufbdc\ufbde-\ufbe7\ufbea-\ufbf8\ufbfc-\ufbfd\ufc00-\ufc5d\ufc64-\ufcf1\ufcf5-\ufd3d\ufd88\ufdf4\ufdfa-\ufdfb\ufe71\ufe77\ufe79\ufe7b\ufe7d]+)|(\ufb05+)/gu; + NormalizationMap = new Map([["ſt", "ſt"]]); + } + return str.replaceAll(NormalizeRegex, (_, p1, p2) => { + return p1 ? 
p1.normalize("NFKC") : NormalizationMap.get(p2); + }); +} + export { AbortException, AnnotationActionEventType, @@ -1064,6 +1083,7 @@ export { LINE_FACTOR, MAX_IMAGE_SIZE_TO_CACHE, MissingPDFException, + normalizeUnicode, objectFromMap, objectSize, OPS, diff --git a/test/driver.js b/test/driver.js index 56d507dde..124bdf1e9 100644 --- a/test/driver.js +++ b/test/driver.js @@ -693,6 +693,7 @@ class Driver { initPromise = page .getTextContent({ includeMarkedContent: true, + disableNormalization: true, }) .then(function (textContent) { return Rasterize.textLayer( diff --git a/test/integration/copy_paste_spec.js b/test/integration/copy_paste_spec.js index 7de6f34eb..4f7d29bcf 100644 --- a/test/integration/copy_paste_spec.js +++ b/test/integration/copy_paste_spec.js @@ -28,7 +28,7 @@ describe("Copy and paste", () => { await closePages(pages); }); - it("must check that we've all the contents", async () => { + it("must check that we've all the contents on copy/paste", async () => { await Promise.all( pages.map(async ([browserName, page]) => { await page.keyboard.down("Control"); @@ -117,4 +117,47 @@ describe("Copy and paste", () => { ); }); }); + describe("all text", () => { + let pages; + + beforeAll(async () => { + pages = await loadAndWait("copy_paste_ligatures.pdf", ".textLayer"); + await mockClipboard(pages); + }); + + afterAll(async () => { + await closePages(pages); + }); + + it("must check that we've all the contents on copy/paste", async () => { + await Promise.all( + pages.map(async ([browserName, page]) => { + await page.keyboard.down("Control"); + await page.keyboard.press("a"); + await page.keyboard.up("Control"); + + await page.waitForTimeout(100); + + await page.keyboard.down("Control"); + await page.keyboard.press("c"); + await page.keyboard.up("Control"); + + await page.waitForTimeout(100); + + await page.waitForFunction( + `document.querySelector('#viewerContainer').style.cursor !== "wait"` + ); + + const text = await page.evaluate(() => + 
navigator.clipboard.readText() + ); + + expect(!!text).withContext(`In ${browserName}`).toEqual(true); + expect(text) + .withContext(`In ${browserName}`) + .toEqual("abcdeffffiflffifflſtstghijklmno"); + }) + ); + }); + }); }); diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index b9e823515..f8d8573f5 100644 --- a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -585,3 +585,4 @@ !issue16221.pdf !issue16224.pdf !issue16278.pdf +!copy_paste_ligatures.pdf diff --git a/test/pdfs/copy_paste_ligatures.pdf b/test/pdfs/copy_paste_ligatures.pdf new file mode 100755 index 000000000..973593129 Binary files /dev/null and b/test/pdfs/copy_paste_ligatures.pdf differ diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js index ce46d4ef7..cf817b328 100644 --- a/test/unit/api_spec.js +++ b/test/unit/api_spec.js @@ -2340,7 +2340,9 @@ page 1 / 3`); ); const pdfDoc = await loadingTask.promise; const pdfPage = await pdfDoc.getPage(1); - const { items, styles } = await pdfPage.getTextContent(); + const { items, styles } = await pdfPage.getTextContent({ + disableNormalization: true, + }); expect(items.length).toEqual(1); // Font name will be a random object id. 
const fontName = items[0].fontName; @@ -2376,7 +2378,9 @@ page 1 / 3`); const loadingTask = getDocument(buildGetDocumentParams("issue13226.pdf")); const pdfDoc = await loadingTask.promise; const pdfPage = await pdfDoc.getPage(1); - const { items } = await pdfPage.getTextContent(); + const { items } = await pdfPage.getTextContent({ + disableNormalization: true, + }); const text = mergeText(items); expect(text).toEqual( @@ -2394,7 +2398,9 @@ page 1 / 3`); const loadingTask = getDocument(buildGetDocumentParams("issue16119.pdf")); const pdfDoc = await loadingTask.promise; const pdfPage = await pdfDoc.getPage(1); - const { items } = await pdfPage.getTextContent(); + const { items } = await pdfPage.getTextContent({ + disableNormalization: true, + }); const text = mergeText(items); expect( @@ -2410,7 +2416,9 @@ page 1 / 3`); const loadingTask = getDocument(buildGetDocumentParams("issue13201.pdf")); const pdfDoc = await loadingTask.promise; const pdfPage = await pdfDoc.getPage(1); - const { items } = await pdfPage.getTextContent(); + const { items } = await pdfPage.getTextContent({ + disableNormalization: true, + }); const text = mergeText(items); expect( @@ -2436,7 +2444,9 @@ page 1 / 3`); const loadingTask = getDocument(buildGetDocumentParams("issue11913.pdf")); const pdfDoc = await loadingTask.promise; const pdfPage = await pdfDoc.getPage(1); - const { items } = await pdfPage.getTextContent(); + const { items } = await pdfPage.getTextContent({ + disableNormalization: true, + }); const text = mergeText(items); expect( @@ -2456,7 +2466,9 @@ page 1 / 3`); const loadingTask = getDocument(buildGetDocumentParams("issue10900.pdf")); const pdfDoc = await loadingTask.promise; const pdfPage = await pdfDoc.getPage(1); - const { items } = await pdfPage.getTextContent(); + const { items } = await pdfPage.getTextContent({ + disableNormalization: true, + }); const text = mergeText(items); expect( @@ -2475,11 +2487,27 @@ page 1 / 3`); const loadingTask = 
getDocument(buildGetDocumentParams("issue10640.pdf")); const pdfDoc = await loadingTask.promise; const pdfPage = await pdfDoc.getPage(1); - const { items } = await pdfPage.getTextContent(); - const text = mergeText(items); + let { items } = await pdfPage.getTextContent({ + disableNormalization: true, + }); + let text = mergeText(items); + let expected = `Open Sans is a humanist sans serif typeface designed by Steve Matteson. +Open Sans was designed with an upright stress, open forms and a neu- +tral, yet friendly appearance. It was optimized for print, web, and mobile +interfaces, and has excellent legibility characteristics in its letterforms (see +figure \x81 on the following page). This font is available from the Google Font +Directory [\x81] as TrueType files licensed under the Apache License version \x82.\x80. +This package provides support for this font in LATEX. It includes Type \x81 +versions of the fonts, converted for this package using FontForge from its +sources, for full support with Dvips.`; - expect( - text.includes(`Open Sans is a humanist sans serif typeface designed by Steve Matteson. + expect(text.includes(expected)).toEqual(true); + + ({ items } = await pdfPage.getTextContent({ + disableNormalization: false, + })); + text = mergeText(items); + expected = `Open Sans is a humanist sans serif typeface designed by Steve Matteson. Open Sans was designed with an upright stress, open forms and a neu- tral, yet friendly appearance. It was optimized for print, web, and mobile interfaces, and has excellent legibility characteristics in its letterforms (see @@ -2487,8 +2515,8 @@ figure \x81 on the following page). This font is available from the Google Font Directory [\x81] as TrueType files licensed under the Apache License version \x82.\x80. This package provides support for this font in LATEX. 
It includes Type \x81 versions of the fonts, converted for this package using FontForge from its -sources, for full support with Dvips.`) - ).toEqual(true); +sources, for full support with Dvips.`; + expect(text.includes(expected)).toEqual(true); await loadingTask.destroy(); }); @@ -2501,7 +2529,9 @@ sources, for full support with Dvips.`) const loadingTask = getDocument(buildGetDocumentParams("bug931481.pdf")); const pdfDoc = await loadingTask.promise; const pdfPage = await pdfDoc.getPage(1); - const { items } = await pdfPage.getTextContent(); + const { items } = await pdfPage.getTextContent({ + disableNormalization: true, + }); const text = mergeText(items); expect( @@ -2529,7 +2559,9 @@ sozialökonomische Gerechtigkeit.`) const loadingTask = getDocument(buildGetDocumentParams("issue9186.pdf")); const pdfDoc = await loadingTask.promise; const pdfPage = await pdfDoc.getPage(1); - const { items } = await pdfPage.getTextContent(); + const { items } = await pdfPage.getTextContent({ + disableNormalization: true, + }); const text = mergeText(items); expect( @@ -2550,7 +2582,9 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`) ); const pdfDoc = await loadingTask.promise; const pdfPage = await pdfDoc.getPage(1); - const { items } = await pdfPage.getTextContent(); + const { items } = await pdfPage.getTextContent({ + disableNormalization: true, + }); const text = mergeText(items); expect(text).toEqual( @@ -2568,7 +2602,9 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`) const loadingTask = getDocument(buildGetDocumentParams("bug1755201.pdf")); const pdfDoc = await loadingTask.promise; const pdfPage = await pdfDoc.getPage(6); - const { items } = await pdfPage.getTextContent(); + const { items } = await pdfPage.getTextContent({ + disableNormalization: true, + }); const text = mergeText(items); expect(/win aisle/.test(text)).toEqual(false); @@ -2586,10 +2622,12 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`) const pdfPage = await 
pdfDoc.getPage(568); let { items } = await pdfPage.getTextContent({ includeMarkedContent: false, + disableNormalization: true, }); const textWithoutMC = mergeText(items); ({ items } = await pdfPage.getTextContent({ includeMarkedContent: true, + disableNormalization: true, })); const textWithMC = mergeText(items); @@ -2607,7 +2645,9 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`) ); const pdfDoc = await loadingTask.promise; const pdfPage = await pdfDoc.getPage(1); - const { items } = await pdfPage.getTextContent(); + const { items } = await pdfPage.getTextContent({ + disableNormalization: true, + }); const text = mergeText(items); expect(text).toEqual("𠮷"); @@ -2619,7 +2659,9 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`) const loadingTask = getDocument(buildGetDocumentParams("issue16221.pdf")); const pdfDoc = await loadingTask.promise; const pdfPage = await pdfDoc.getPage(1); - const { items } = await pdfPage.getTextContent(); + const { items } = await pdfPage.getTextContent({ + disableNormalization: true, + }); expect(items.map(i => i.str)).toEqual(["Hello ", "World"]); diff --git a/test/unit/pdf_find_controller_spec.js b/test/unit/pdf_find_controller_spec.js index a371ecbe5..c0100bd8b 100644 --- a/test/unit/pdf_find_controller_spec.js +++ b/test/unit/pdf_find_controller_spec.js @@ -542,7 +542,7 @@ describe("pdf_find_controller", function () { pageIndex: 0, matchIndex: 0, }, - pageMatches: [[2743]], + pageMatches: [[2734]], pageMatchesLength: [[14]], }); }); @@ -561,7 +561,7 @@ describe("pdf_find_controller", function () { pageIndex: 1, matchIndex: 0, }, - pageMatches: [[], [1493]], + pageMatches: [[], [1486]], pageMatchesLength: [[], [11]], }); }); @@ -594,7 +594,7 @@ describe("pdf_find_controller", function () { [], [], [], - [2087], + [2081], ], pageMatchesLength: [ [24], @@ -629,7 +629,7 @@ describe("pdf_find_controller", function () { pageIndex: 0, matchIndex: 0, }, - pageMatches: [[1501]], + pageMatches: [[1497]], 
pageMatchesLength: [[25]], }); }); @@ -670,7 +670,7 @@ describe("pdf_find_controller", function () { pageIndex: 0, matchIndex: 0, }, - pageMatches: [[1946]], + pageMatches: [[1941]], pageMatchesLength: [[21]], }); }); @@ -692,7 +692,7 @@ describe("pdf_find_controller", function () { pageIndex: 0, matchIndex: 0, }, - pageMatches: [[1946]], + pageMatches: [[1941]], pageMatchesLength: [[23]], }); }); @@ -712,7 +712,7 @@ describe("pdf_find_controller", function () { pageIndex: 0, matchIndex: 0, }, - pageMatches: [[1946]], + pageMatches: [[1941]], pageMatchesLength: [[23]], }); }); @@ -976,4 +976,61 @@ describe("pdf_find_controller", function () { pageMatchesLength: [[5, 5]], }); }); + + it("performs a search in a text with some arabic chars in different unicode ranges but with same normalized form", async function () { + const { eventBus, pdfFindController } = await initPdfFindController( + "ArabicCIDTrueType.pdf" + ); + + await testSearch({ + eventBus, + pdfFindController, + state: { + query: "\u0629", + }, + matchesPerPage: [4], + selectedMatch: { + pageIndex: 0, + matchIndex: 0, + }, + pageMatches: [[6, 25, 44, 63]], + pageMatchesLength: [[1, 1, 1, 1]], + }); + + await testSearch({ + eventBus, + pdfFindController, + state: { + query: "\ufe94", + }, + matchesPerPage: [4], + selectedMatch: { + pageIndex: 0, + matchIndex: 0, + }, + pageMatches: [[6, 25, 44, 63]], + pageMatchesLength: [[1, 1, 1, 1]], + }); + }); + + it("performs a search in a text with some f ligatures", async function () { + const { eventBus, pdfFindController } = await initPdfFindController( + "copy_paste_ligatures.pdf" + ); + + await testSearch({ + eventBus, + pdfFindController, + state: { + query: "f", + }, + matchesPerPage: [9], + selectedMatch: { + pageIndex: 0, + matchIndex: 0, + }, + pageMatches: [[5, 6, 6, 7, 8, 9, 9, 10, 10]], + pageMatchesLength: [[1, 1, 1, 1, 1, 1, 1, 1, 1]], + }); + }); }); diff --git a/test/unit/unicode_spec.js b/test/unit/unicode_spec.js index 1753ef26b..1f7de5a1d 100644 
--- a/test/unit/unicode_spec.js +++ b/test/unit/unicode_spec.js @@ -15,11 +15,9 @@ import { getCharUnicodeCategory, - getNormalizedUnicodes, getUnicodeForGlyph, getUnicodeRangeFor, mapSpecialUnicodeValues, - reverseIfRtl, } from "../../src/core/unicode.js"; import { getDingbatsGlyphsUnicode, @@ -152,69 +150,12 @@ describe("unicode", function () { expect(getUnicodeRangeFor(0x0041)).toEqual(0); // fi (Alphabetic Presentation Forms) expect(getUnicodeRangeFor(0xfb01)).toEqual(62); + // Combining diacritic (Cyrillic Extended-A) + expect(getUnicodeRangeFor(0x2dff)).toEqual(9); }); it("should not get a Unicode range", function () { - expect(getUnicodeRangeFor(0x05ff)).toEqual(-1); - }); - }); - - describe("getNormalizedUnicodes", function () { - let NormalizedUnicodes; - - beforeAll(function () { - NormalizedUnicodes = getNormalizedUnicodes(); - }); - - afterAll(function () { - NormalizedUnicodes = null; - }); - - it("should get normalized Unicode values for ligatures", function () { - // fi => f + i - expect(NormalizedUnicodes["\uFB01"]).toEqual("fi"); - // Arabic - expect(NormalizedUnicodes["\u0675"]).toEqual("\u0627\u0674"); - }); - - it("should not normalize standard characters", function () { - expect(NormalizedUnicodes.A).toEqual(undefined); - }); - }); - - describe("reverseIfRtl", function () { - let NormalizedUnicodes; - - function getGlyphUnicode(char) { - if (NormalizedUnicodes[char] !== undefined) { - return NormalizedUnicodes[char]; - } - return char; - } - - beforeAll(function () { - NormalizedUnicodes = getNormalizedUnicodes(); - }); - - afterAll(function () { - NormalizedUnicodes = null; - }); - - it("should not reverse LTR characters", function () { - const A = getGlyphUnicode("A"); - expect(reverseIfRtl(A)).toEqual("A"); - - const fi = getGlyphUnicode("\uFB01"); - expect(reverseIfRtl(fi)).toEqual("fi"); - }); - - it("should reverse RTL characters", function () { - // Hebrew (no-op, since it's not a combined character) - const heAlef = 
getGlyphUnicode("\u05D0"); - expect(reverseIfRtl(heAlef)).toEqual("\u05D0"); - // Arabic - const arAlef = getGlyphUnicode("\u0675"); - expect(reverseIfRtl(arAlef)).toEqual("\u0674\u0627"); + expect(getUnicodeRangeFor(0xaa60)).toEqual(-1); }); }); }); diff --git a/web/pdf_find_controller.js b/web/pdf_find_controller.js index 3e5fb4d45..f5f241407 100644 --- a/web/pdf_find_controller.js +++ b/web/pdf_find_controller.js @@ -18,8 +18,8 @@ /** @typedef {import("./interfaces").IPDFLinkService} IPDFLinkService */ import { binarySearchFirstItem, scrollIntoView } from "./ui_utils.js"; +import { getCharacterType, getNormalizeWithNFKC } from "./pdf_find_utils.js"; import { createPromiseCapability } from "pdfjs-lib"; -import { getCharacterType } from "./pdf_find_utils.js"; const FindState = { FOUND: 0, @@ -126,12 +126,7 @@ function normalize(text) { } else { // Compile the regular expression for text normalization once. const replace = Object.keys(CHARACTERS_TO_NORMALIZE).join(""); - const toNormalizeWithNFKC = - "\u2460-\u2473" + // Circled numbers. - "\u24b6-\u24ff" + // Circled letters/numbers. - "\u3244-\u32bf" + // Circled ideograms/numbers. - "\u32d0-\u32fe" + // Circled ideograms. - "\uff00-\uffef"; // Halfwidth, fullwidth forms. 
+ const toNormalizeWithNFKC = getNormalizeWithNFKC(); // 3040-309F: Hiragana // 30A0-30FF: Katakana @@ -840,6 +835,7 @@ class PDFFindController { } let promise = Promise.resolve(); + const textOptions = { disableNormalization: true }; for (let i = 0, ii = this._linkService.pagesCount; i < ii; i++) { const extractTextCapability = createPromiseCapability(); this._extractTextPromises[i] = extractTextCapability.promise; @@ -848,7 +844,7 @@ class PDFFindController { return this._pdfDocument .getPage(i + 1) .then(pdfPage => { - return pdfPage.getTextContent(); + return pdfPage.getTextContent(textOptions); }) .then( textContent => { diff --git a/web/pdf_find_utils.js b/web/pdf_find_utils.js index 24ec4c575..78b747706 100644 --- a/web/pdf_find_utils.js +++ b/web/pdf_find_utils.js @@ -112,4 +112,46 @@ function getCharacterType(charCode) { return CharacterType.ALPHA_LETTER; } -export { CharacterType, getCharacterType }; +let NormalizeWithNFKC; +function getNormalizeWithNFKC() { + /* eslint-disable no-irregular-whitespace */ + NormalizeWithNFKC ||= ` ¨ª¯²-µ¸-º¼-¾IJ-ijĿ-ŀʼnſDŽ-njDZ-dzʰ-ʸ˘-˝ˠ-ˤʹͺ;΄-΅·ϐ-ϖϰ-ϲϴ-ϵϹևٵ-ٸक़-य़ড়-ঢ়য়ਲ਼ਸ਼ਖ਼-ਜ਼ਫ਼ଡ଼-ଢ଼ำຳໜ-ໝ༌གྷཌྷདྷབྷཛྷཀྵჼᴬ-ᴮᴰ-ᴺᴼ-ᵍᵏ-ᵪᵸᶛ-ᶿẚ-ẛάέήίόύώΆ᾽-῁ΈΉ῍-῏ΐΊ῝-῟ΰΎ῭-`ΌΏ´-῾ - ‑‗․-… ″-‴‶-‷‼‾⁇-⁉⁗ ⁰-ⁱ⁴-₎ₐ-ₜ₨℀-℃℅-ℇ℉-ℓℕ-№ℙ-ℝ℠-™ℤΩℨK-ℭℯ-ℱℳ-ℹ℻-⅀ⅅ-ⅉ⅐-ⅿ↉∬-∭∯-∰〈-〉①-⓪⨌⩴-⩶⫝̸ⱼ-ⱽⵯ⺟⻳⼀-⿕ 〶〸-〺゛-゜ゟヿㄱ-ㆎ㆒-㆟㈀-㈞㈠-㉇㉐-㉾㊀-㏿ꚜ-ꚝꝰꟲ-ꟴꟸ-ꟹꭜ-ꭟꭩ豈-嗀塚晴凞-羽蘒諸逸-都飯-舘並-龎ff-stﬓ-ﬗיִײַ-זּטּ-לּמּנּ-סּףּ-פּצּ-ﮱﯓ-ﴽﵐ-ﶏﶒ-ﷇﷰ-﷼︐-︙︰-﹄﹇-﹒﹔-﹦﹨-﹫ﹰ-ﹲﹴﹶ-ﻼ!-하-ᅦᅧ-ᅬᅭ-ᅲᅳ-ᅵ¢-₩`; + + if (typeof PDFJSDev === "undefined" || PDFJSDev.test("TESTING")) { + const ranges = []; + const range = []; + const diacriticsRegex = /^\p{M}$/u; + // Some chars must be replaced by their NFKC counterpart during a search. 
+ for (let i = 0; i < 65536; i++) { + const c = String.fromCharCode(i); + if (c.normalize("NFKC") !== c && !diacriticsRegex.test(c)) { + if (range.length !== 2) { + range[0] = range[1] = i; + continue; + } + if (range[1] + 1 !== i) { + if (range[0] === range[1]) { + ranges.push(String.fromCharCode(range[0])); + } else { + ranges.push( + `${String.fromCharCode(range[0])}-${String.fromCharCode( + range[1] + )}` + ); + } + range[0] = range[1] = i; + } else { + range[1] = i; + } + } + } + if (ranges.join("") !== NormalizeWithNFKC) { + throw new Error( + "getNormalizeWithNFKC - update the `NormalizeWithNFKC` string." + ); + } + } + return NormalizeWithNFKC; +} + +export { CharacterType, getCharacterType, getNormalizeWithNFKC }; diff --git a/web/pdf_page_view.js b/web/pdf_page_view.js index e0b0d5632..ed3f751b5 100644 --- a/web/pdf_page_view.js +++ b/web/pdf_page_view.js @@ -368,6 +368,7 @@ class PDFPageView { if (!textLayer.renderingDone) { const readableStream = pdfPage.streamTextContent({ includeMarkedContent: true, + disableNormalization: true, }); textLayer.setTextContentSource(readableStream); } diff --git a/web/pdf_viewer.js b/web/pdf_viewer.js index 9bd14f39f..e3bee5c15 100644 --- a/web/pdf_viewer.js +++ b/web/pdf_viewer.js @@ -665,6 +665,8 @@ class PDFViewer { } buffer.length = 0; const page = await this.pdfDocument.getPage(pageNum); + // By default, getTextContent uses disableNormalization = false, + // which is fine because we want a normalized string.
+ const { items } = await page.getTextContent(); for (const item of items) { if (item.str) { diff --git a/web/text_highlighter.js b/web/text_highlighter.js index c1b828c40..41721554d 100644 --- a/web/text_highlighter.js +++ b/web/text_highlighter.js @@ -208,9 +208,20 @@ class TextHighlighter { return; } + let lastDivIdx = -1; + let lastOffset = -1; for (let i = i0; i < i1; i++) { const match = matches[i]; const begin = match.begin; + if (begin.divIdx === lastDivIdx && begin.offset === lastOffset) { + // It's possible to be in this situation if we searched for a 'f' and we + // have a ligature 'ff' in the text: both matches start at the same + // place, so the ligature must only be highlighted once. + continue; + } + lastDivIdx = begin.divIdx; + lastOffset = begin.offset; + const end = match.end; const isSelected = isSelectedPage && i === selectedMatchIdx; const highlightSuffix = isSelected ? " selected" : ""; diff --git a/web/text_layer_builder.js b/web/text_layer_builder.js index 20fc7f67d..81a77918c 100644 --- a/web/text_layer_builder.js +++ b/web/text_layer_builder.js @@ -20,7 +20,8 @@ // eslint-disable-next-line max-len /** @typedef {import("./text_accessibility.js").TextAccessibilityManager} TextAccessibilityManager */ -import { renderTextLayer, updateTextLayer } from "pdfjs-lib"; +import { normalizeUnicode, renderTextLayer, updateTextLayer } from "pdfjs-lib"; +import { removeNullCharacters } from "./ui_utils.js"; /** * @typedef {Object} TextLayerBuilderOptions @@ -212,6 +213,16 @@ class TextLayerBuilder { } end.classList.remove("active"); }); + + div.addEventListener("copy", event => { + const selection = document.getSelection(); + event.clipboardData.setData( + "text/plain", + removeNullCharacters(normalizeUnicode(selection.toString())) + ); + event.preventDefault(); + event.stopPropagation(); + }); } }