95bf9fc17f
Now that https://bugzilla.mozilla.org/show_bug.cgi?id=1247687 has landed in Firefox, we're able to use worker-modules during development :-) This removes the final piece of SystemJS usage from the PDF.js library, thus allowing a fair bit of clean-up, and we now use *only* native `import`/`export` statements everywhere in development mode.
274 lines
12 KiB
JavaScript
274 lines
12 KiB
JavaScript
/* Copyright 2016 Mozilla Foundation
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
import { getLookupTableFactory } from "./core_utils.js";
|
|
|
|
// Some characters, e.g. copyrightserif, are mapped to the private use area
|
|
// and might not be displayed using standard fonts. Mapping/hacking well-known
|
|
// chars to the similar equivalents in the normal characters range.
|
|
const getSpecialPUASymbols = getLookupTableFactory(function (t) {
|
|
t[63721] = 0x00a9; // copyrightsans (0xF8E9) => copyright
|
|
t[63193] = 0x00a9; // copyrightserif (0xF6D9) => copyright
|
|
t[63720] = 0x00ae; // registersans (0xF8E8) => registered
|
|
t[63194] = 0x00ae; // registerserif (0xF6DA) => registered
|
|
t[63722] = 0x2122; // trademarksans (0xF8EA) => trademark
|
|
t[63195] = 0x2122; // trademarkserif (0xF6DB) => trademark
|
|
t[63729] = 0x23a7; // bracelefttp (0xF8F1)
|
|
t[63730] = 0x23a8; // braceleftmid (0xF8F2)
|
|
t[63731] = 0x23a9; // braceleftbt (0xF8F3)
|
|
t[63740] = 0x23ab; // bracerighttp (0xF8FC)
|
|
t[63741] = 0x23ac; // bracerightmid (0xF8FD)
|
|
t[63742] = 0x23ad; // bracerightbt (0xF8FE)
|
|
t[63726] = 0x23a1; // bracketlefttp (0xF8EE)
|
|
t[63727] = 0x23a2; // bracketleftex (0xF8EF)
|
|
t[63728] = 0x23a3; // bracketleftbt (0xF8F0)
|
|
t[63737] = 0x23a4; // bracketrighttp (0xF8F9)
|
|
t[63738] = 0x23a5; // bracketrightex (0xF8FA)
|
|
t[63739] = 0x23a6; // bracketrightbt (0xF8FB)
|
|
t[63723] = 0x239b; // parenlefttp (0xF8EB)
|
|
t[63724] = 0x239c; // parenleftex (0xF8EC)
|
|
t[63725] = 0x239d; // parenleftbt (0xF8ED)
|
|
t[63734] = 0x239e; // parenrighttp (0xF8F6)
|
|
t[63735] = 0x239f; // parenrightex (0xF8F7)
|
|
t[63736] = 0x23a0; // parenrightbt (0xF8F8)
|
|
});
|
|
|
|
function mapSpecialUnicodeValues(code) {
|
|
if (code >= 0xfff0 && code <= 0xffff) {
|
|
// Specials unicode block.
|
|
return 0;
|
|
} else if (code >= 0xf600 && code <= 0xf8ff) {
|
|
return getSpecialPUASymbols()[code] || code;
|
|
} else if (code === /* softhyphen = */ 0x00ad) {
|
|
return 0x002d; // hyphen
|
|
}
|
|
return code;
|
|
}
|
|
|
|
function getUnicodeForGlyph(name, glyphsUnicodeMap) {
|
|
let unicode = glyphsUnicodeMap[name];
|
|
if (unicode !== undefined) {
|
|
return unicode;
|
|
}
|
|
if (!name) {
|
|
return -1;
|
|
}
|
|
// Try to recover valid Unicode values from 'uniXXXX'/'uXXXX{XX}' glyphs.
|
|
if (name[0] === "u") {
|
|
const nameLen = name.length;
|
|
let hexStr;
|
|
|
|
if (nameLen === 7 && name[1] === "n" && name[2] === "i") {
|
|
// 'uniXXXX'
|
|
hexStr = name.substring(3);
|
|
} else if (nameLen >= 5 && nameLen <= 7) {
|
|
// 'uXXXX{XX}'
|
|
hexStr = name.substring(1);
|
|
} else {
|
|
return -1;
|
|
}
|
|
// Check for upper-case hexadecimal characters, to avoid false positives.
|
|
if (hexStr === hexStr.toUpperCase()) {
|
|
unicode = parseInt(hexStr, 16);
|
|
if (unicode >= 0) {
|
|
return unicode;
|
|
}
|
|
}
|
|
}
|
|
return -1;
|
|
}
|
|
|
|
// See https://learn.microsoft.com/en-us/typography/opentype/spec/os2#ulunicoderange1-bits-031ulunicoderange2-bits-3263ulunicoderange3-bits-6495ulunicoderange4-bits-96127
|
|
const UnicodeRanges = [
|
|
[0x0000, 0x007f], // 0 - Basic Latin
|
|
[0x0080, 0x00ff], // 1 - Latin-1 Supplement
|
|
[0x0100, 0x017f], // 2 - Latin Extended-A
|
|
[0x0180, 0x024f], // 3 - Latin Extended-B
|
|
[0x0250, 0x02af, 0x1d00, 0x1d7f, 0x1d80, 0x1dbf], // 4 - IPA Extensions - Phonetic Extensions - Phonetic Extensions Supplement
|
|
[0x02b0, 0x02ff, 0xa700, 0xa71f], // 5 - Spacing Modifier Letters - Modifier Tone Letters
|
|
[0x0300, 0x036f, 0x1dc0, 0x1dff], // 6 - Combining Diacritical Marks - Combining Diacritical Marks Supplement
|
|
[0x0370, 0x03ff], // 7 - Greek and Coptic
|
|
[0x2c80, 0x2cff], // 8 - Coptic
|
|
[0x0400, 0x04ff, 0x0500, 0x052f, 0x2de0, 0x2dff, 0xa640, 0xa69f], // 9 - Cyrillic - Cyrillic Supplement - Cyrillic Extended-A - Cyrillic Extended-B
|
|
[0x0530, 0x058f], // 10 - Armenian
|
|
[0x0590, 0x05ff], // 11 - Hebrew
|
|
[0xa500, 0xa63f], // 12 - Vai
|
|
[0x0600, 0x06ff, 0x0750, 0x077f], // 13 - Arabic - Arabic Supplement
|
|
[0x07c0, 0x07ff], // 14 - NKo
|
|
[0x0900, 0x097f], // 15 - Devanagari
|
|
[0x0980, 0x09ff], // 16 - Bengali
|
|
[0x0a00, 0x0a7f], // 17 - Gurmukhi
|
|
[0x0a80, 0x0aff], // 18 - Gujarati
|
|
[0x0b00, 0x0b7f], // 19 - Oriya
|
|
[0x0b80, 0x0bff], // 20 - Tamil
|
|
[0x0c00, 0x0c7f], // 21 - Telugu
|
|
[0x0c80, 0x0cff], // 22 - Kannada
|
|
[0x0d00, 0x0d7f], // 23 - Malayalam
|
|
[0x0e00, 0x0e7f], // 24 - Thai
|
|
[0x0e80, 0x0eff], // 25 - Lao
|
|
[0x10a0, 0x10ff, 0x2d00, 0x2d2f], // 26 - Georgian - Georgian Supplement
|
|
[0x1b00, 0x1b7f], // 27 - Balinese
|
|
[0x1100, 0x11ff], // 28 - Hangul Jamo
|
|
[0x1e00, 0x1eff, 0x2c60, 0x2c7f, 0xa720, 0xa7ff], // 29 - Latin Extended Additional - Latin Extended-C - Latin Extended-D
|
|
[0x1f00, 0x1fff], // 30 - Greek Extended
|
|
[0x2000, 0x206f, 0x2e00, 0x2e7f], // 31 - General Punctuation - Supplemental Punctuation
|
|
[0x2070, 0x209f], // 32 - Superscripts And Subscripts
|
|
[0x20a0, 0x20cf], // 33 - Currency Symbol
|
|
[0x20d0, 0x20ff], // 34 - Combining Diacritical Marks
|
|
[0x2100, 0x214f], // 35 - Letterlike Symbols
|
|
[0x2150, 0x218f], // 36 - Number Forms
|
|
[0x2190, 0x21ff, 0x27f0, 0x27ff, 0x2900, 0x297f, 0x2b00, 0x2bff], // 37 - Arrows - Supplemental Arrows-A - Supplemental Arrows-B - Miscellaneous Symbols and Arrows
|
|
[0x2200, 0x22ff, 0x2a00, 0x2aff, 0x27c0, 0x27ef, 0x2980, 0x29ff], // 38 - Mathematical Operators - Supplemental Mathematical Operators - Miscellaneous Mathematical Symbols-A - Miscellaneous Mathematical Symbols-B
|
|
[0x2300, 0x23ff], // 39 - Miscellaneous Technical
|
|
[0x2400, 0x243f], // 40 - Control Pictures
|
|
[0x2440, 0x245f], // 41 - Optical Character Recognition
|
|
[0x2460, 0x24ff], // 42 - Enclosed Alphanumerics
|
|
[0x2500, 0x257f], // 43 - Box Drawing
|
|
[0x2580, 0x259f], // 44 - Block Elements
|
|
[0x25a0, 0x25ff], // 45 - Geometric Shapes
|
|
[0x2600, 0x26ff], // 46 - Miscellaneous Symbols
|
|
[0x2700, 0x27bf], // 47 - Dingbats
|
|
[0x3000, 0x303f], // 48 - CJK Symbols And Punctuation
|
|
[0x3040, 0x309f], // 49 - Hiragana
|
|
[0x30a0, 0x30ff, 0x31f0, 0x31ff], // 50 - Katakana - Katakana Phonetic Extensions
|
|
[0x3100, 0x312f, 0x31a0, 0x31bf], // 51 - Bopomofo - Bopomofo Extended
|
|
[0x3130, 0x318f], // 52 - Hangul Compatibility Jamo
|
|
[0xa840, 0xa87f], // 53 - Phags-pa
|
|
[0x3200, 0x32ff], // 54 - Enclosed CJK Letters And Months
|
|
[0x3300, 0x33ff], // 55 - CJK Compatibility
|
|
[0xac00, 0xd7af], // 56 - Hangul Syllables
|
|
[0xd800, 0xdfff], // 57 - Non-Plane 0 *
|
|
[0x10900, 0x1091f], // 58 - Phoenicia
|
|
[
|
|
0x4e00, 0x9fff, 0x2e80, 0x2eff, 0x2f00, 0x2fdf, 0x2ff0, 0x2fff, 0x3400,
|
|
0x4dbf, 0x20000, 0x2a6df, 0x3190, 0x319f,
|
|
], // 59 - CJK Unified Ideographs - CJK Radicals Supplement - Kangxi Radicals - Ideographic Description Characters - CJK Unified Ideographs Extension A - CJK Unified Ideographs Extension B - Kanbun
|
|
[0xe000, 0xf8ff], // 60 - Private Use Area (plane 0)
|
|
[0x31c0, 0x31ef, 0xf900, 0xfaff, 0x2f800, 0x2fa1f], // 61 - CJK Strokes - CJK Compatibility Ideographs - CJK Compatibility Ideographs Supplement
|
|
[0xfb00, 0xfb4f], // 62 - Alphabetic Presentation Forms
|
|
[0xfb50, 0xfdff], // 63 - Arabic Presentation Forms-A
|
|
[0xfe20, 0xfe2f], // 64 - Combining Half Marks
|
|
[0xfe10, 0xfe1f], // 65 - Vertical Forms
|
|
[0xfe50, 0xfe6f], // 66 - Small Form Variants
|
|
[0xfe70, 0xfeff], // 67 - Arabic Presentation Forms-B
|
|
[0xff00, 0xffef], // 68 - Halfwidth And Fullwidth Forms
|
|
[0xfff0, 0xffff], // 69 - Specials
|
|
[0x0f00, 0x0fff], // 70 - Tibetan
|
|
[0x0700, 0x074f], // 71 - Syriac
|
|
[0x0780, 0x07bf], // 72 - Thaana
|
|
[0x0d80, 0x0dff], // 73 - Sinhala
|
|
[0x1000, 0x109f], // 74 - Myanmar
|
|
[0x1200, 0x137f, 0x1380, 0x139f, 0x2d80, 0x2ddf], // 75 - Ethiopic - Ethiopic Supplement - Ethiopic Extended
|
|
[0x13a0, 0x13ff], // 76 - Cherokee
|
|
[0x1400, 0x167f], // 77 - Unified Canadian Aboriginal Syllabics
|
|
[0x1680, 0x169f], // 78 - Ogham
|
|
[0x16a0, 0x16ff], // 79 - Runic
|
|
[0x1780, 0x17ff], // 80 - Khmer
|
|
[0x1800, 0x18af], // 81 - Mongolian
|
|
[0x2800, 0x28ff], // 82 - Braille Patterns
|
|
[0xa000, 0xa48f], // 83 - Yi Syllables
|
|
[0x1700, 0x171f, 0x1720, 0x173f, 0x1740, 0x175f, 0x1760, 0x177f], // 84 - Tagalog - Hanunoo - Buhid - Tagbanwa
|
|
[0x10300, 0x1032f], // 85 - Old Italic
|
|
[0x10330, 0x1034f], // 86 - Gothic
|
|
[0x10400, 0x1044f], // 87 - Deseret
|
|
[0x1d000, 0x1d0ff, 0x1d100, 0x1d1ff, 0x1d200, 0x1d24f], // 88 - Byzantine Musical Symbols - Musical Symbols - Ancient Greek Musical Notation
|
|
[0x1d400, 0x1d7ff], // 89 - Mathematical Alphanumeric Symbols
|
|
[0xff000, 0xffffd], // 90 - Private Use (plane 15)
|
|
[0xfe00, 0xfe0f, 0xe0100, 0xe01ef], // 91 - Variation Selectors - Variation Selectors Supplement
|
|
[0xe0000, 0xe007f], // 92 - Tags
|
|
[0x1900, 0x194f], // 93 - Limbu
|
|
[0x1950, 0x197f], // 94 - Tai Le
|
|
[0x1980, 0x19df], // 95 - New Tai Lue
|
|
[0x1a00, 0x1a1f], // 96 - Buginese
|
|
[0x2c00, 0x2c5f], // 97 - Glagolitic
|
|
[0x2d30, 0x2d7f], // 98 - Tifinagh
|
|
[0x4dc0, 0x4dff], // 99 - Yijing Hexagram Symbols
|
|
[0xa800, 0xa82f], // 100 - Syloti Nagri
|
|
[0x10000, 0x1007f, 0x10080, 0x100ff, 0x10100, 0x1013f], // 101 - Linear B Syllabary - Linear B Ideograms - Aegean Numbers
|
|
[0x10140, 0x1018f], // 102 - Ancient Greek Numbers
|
|
[0x10380, 0x1039f], // 103 - Ugaritic
|
|
[0x103a0, 0x103df], // 104 - Old Persian
|
|
[0x10450, 0x1047f], // 105 - Shavian
|
|
[0x10480, 0x104af], // 106 - Osmanya
|
|
[0x10800, 0x1083f], // 107 - Cypriot Syllabary
|
|
[0x10a00, 0x10a5f], // 108 - Kharoshthi
|
|
[0x1d300, 0x1d35f], // 109 - Tai Xuan Jing Symbols
|
|
[0x12000, 0x123ff, 0x12400, 0x1247f], // 110 - Cuneiform - Cuneiform Numbers and Punctuation
|
|
[0x1d360, 0x1d37f], // 111 - Counting Rod Numerals
|
|
[0x1b80, 0x1bbf], // 112 - Sundanese
|
|
[0x1c00, 0x1c4f], // 113 - Lepcha
|
|
[0x1c50, 0x1c7f], // 114 - Ol Chiki
|
|
[0xa880, 0xa8df], // 115 - Saurashtra
|
|
[0xa900, 0xa92f], // 116 - Kayah Li
|
|
[0xa930, 0xa95f], // 117 - Rejang
|
|
[0xaa00, 0xaa5f], // 118 - Cham
|
|
[0x10190, 0x101cf], // 119 - Ancient Symbols
|
|
[0x101d0, 0x101ff], // 120 - Phaistos Disc
|
|
[0x102a0, 0x102df, 0x10280, 0x1029f, 0x10920, 0x1093f], // 121 - Carian - Lycian - Lydian
|
|
[0x1f030, 0x1f09f, 0x1f000, 0x1f02f], // 122 - Domino Tiles - Mahjong Tiles
|
|
];
|
|
|
|
function getUnicodeRangeFor(value, lastPosition = -1) {
|
|
// TODO: create a map range => position, sort the ranges and cache it.
|
|
// Then we can make a binary search for finding a range for a given unicode.
|
|
if (lastPosition !== -1) {
|
|
const range = UnicodeRanges[lastPosition];
|
|
for (let i = 0, ii = range.length; i < ii; i += 2) {
|
|
if (value >= range[i] && value <= range[i + 1]) {
|
|
return lastPosition;
|
|
}
|
|
}
|
|
}
|
|
for (let i = 0, ii = UnicodeRanges.length; i < ii; i++) {
|
|
const range = UnicodeRanges[i];
|
|
for (let j = 0, jj = range.length; j < jj; j += 2) {
|
|
if (value >= range[j] && value <= range[j + 1]) {
|
|
return i;
|
|
}
|
|
}
|
|
}
|
|
return -1;
|
|
}
|
|
|
|
const SpecialCharRegExp = new RegExp("^(\\s)|(\\p{Mn})|(\\p{Cf})$", "u");
|
|
const CategoryCache = new Map();
|
|
|
|
function getCharUnicodeCategory(char) {
|
|
const cachedCategory = CategoryCache.get(char);
|
|
if (cachedCategory) {
|
|
return cachedCategory;
|
|
}
|
|
const groups = char.match(SpecialCharRegExp);
|
|
const category = {
|
|
isWhitespace: !!(groups && groups[1]),
|
|
isZeroWidthDiacritic: !!(groups && groups[2]),
|
|
isInvisibleFormatMark: !!(groups && groups[3]),
|
|
};
|
|
CategoryCache.set(char, category);
|
|
return category;
|
|
}
|
|
|
|
function clearUnicodeCaches() {
|
|
CategoryCache.clear();
|
|
}
|
|
|
|
export {
|
|
clearUnicodeCaches,
|
|
getCharUnicodeCategory,
|
|
getUnicodeForGlyph,
|
|
getUnicodeRangeFor,
|
|
mapSpecialUnicodeValues,
|
|
};
|