Merge pull request #3674 from brendandahl/cmap-squash
Read multi-byte character codes based on codespace ranges.
This commit is contained in:
commit
1c7f1cee00
3
make.js
3
make.js
@ -306,7 +306,8 @@ target.bundle = function(args) {
|
||||
'core/worker.js',
|
||||
'core/jpx.js',
|
||||
'core/jbig2.js',
|
||||
'core/bidi.js'
|
||||
'core/bidi.js',
|
||||
'core/cmap.js'
|
||||
];
|
||||
|
||||
var EXT_SRC_FILES = [
|
||||
|
460
src/core/cmap.js
Normal file
460
src/core/cmap.js
Normal file
@ -0,0 +1,460 @@
|
||||
/* -*- Mode: Java; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
||||
/* vim: set shiftwidth=2 tabstop=2 autoindent cindent expandtab: */
|
||||
/* Copyright 2012 Mozilla Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
/* globals Util, isString, isInt, warn, error, isCmd, isEOF, isName, Lexer,
|
||||
isStream */
|
||||
|
||||
'use strict';
|
||||
|
||||
var CMAP_CODESPACES = {
|
||||
'Adobe-CNS1-0': [[], [0, 14335]],
|
||||
'Adobe-CNS1-1': [[], [0, 17407]],
|
||||
'Adobe-CNS1-2': [[], [0, 17663]],
|
||||
'Adobe-CNS1-3': [[], [0, 18943]],
|
||||
'Adobe-CNS1-4': [[], [0, 19199]],
|
||||
'Adobe-CNS1-5': [[], [0, 19199]],
|
||||
'Adobe-CNS1-6': [[], [0, 19199]],
|
||||
'Adobe-CNS1-UCS2': [[], [0, 65535]],
|
||||
'B5-H': [[0, 128], [41280, 65278]],
|
||||
'B5-V': [[0, 128], [41280, 65278]],
|
||||
'B5pc-H': [[0, 128, 253, 255], [41280, 64766]],
|
||||
'B5pc-V': [[0, 128, 253, 255], [41280, 64766]],
|
||||
'CNS-EUC-H': [[0, 128], [41377, 65278], [],
|
||||
[2392957345, 2392981246, 2393022881, 2393046782, 2393088417, 2393112318]],
|
||||
'CNS-EUC-V': [[0, 128], [41377, 65278], [],
|
||||
[2392957345, 2392981246, 2393022881, 2393046782, 2393088417, 2393112318]],
|
||||
'CNS1-H': [[], [8481, 32382]],
|
||||
'CNS1-V': [[], [8481, 32382]],
|
||||
'CNS2-H': [[], [8481, 32382]],
|
||||
'CNS2-V': [[], [8481, 32382]],
|
||||
'ETen-B5-H': [[0, 128], [41280, 65278]],
|
||||
'ETen-B5-V': [[0, 128], [41280, 65278]],
|
||||
'ETenms-B5-H': [[0, 128], [41280, 65278]],
|
||||
'ETenms-B5-V': [[0, 128], [41280, 65278]],
|
||||
'ETHK-B5-H': [[0, 128], [34624, 65278]],
|
||||
'ETHK-B5-V': [[0, 128], [34624, 65278]],
|
||||
'HKdla-B5-H': [[0, 128], [41280, 65278]],
|
||||
'HKdla-B5-V': [[0, 128], [41280, 65278]],
|
||||
'HKdlb-B5-H': [[0, 128], [36416, 65278]],
|
||||
'HKdlb-B5-V': [[0, 128], [36416, 65278]],
|
||||
'HKgccs-B5-H': [[0, 128], [35392, 65278]],
|
||||
'HKgccs-B5-V': [[0, 128], [35392, 65278]],
|
||||
'HKm314-B5-H': [[0, 128], [41280, 65278]],
|
||||
'HKm314-B5-V': [[0, 128], [41280, 65278]],
|
||||
'HKm471-B5-H': [[0, 128], [41280, 65278]],
|
||||
'HKm471-B5-V': [[0, 128], [41280, 65278]],
|
||||
'HKscs-B5-H': [[0, 128], [34624, 65278]],
|
||||
'HKscs-B5-V': [[0, 128], [34624, 65278]],
|
||||
'UniCNS-UCS2-H': [[], [0, 55295, 57344, 65535]],
|
||||
'UniCNS-UCS2-V': [[], [0, 55295, 57344, 65535]],
|
||||
'UniCNS-UTF16-H': [[], [0, 55295, 57344, 65535], [],
|
||||
[3623934976, 3690979327]],
|
||||
'UniCNS-UTF16-V': [[], [0, 55295, 57344, 65535], [],
|
||||
[3623934976, 3690979327]],
|
||||
'Adobe-GB1-0': [[], [0, 7935]],
|
||||
'Adobe-GB1-1': [[], [0, 9983]],
|
||||
'Adobe-GB1-2': [[], [0, 22271]],
|
||||
'Adobe-GB1-3': [[], [0, 22527]],
|
||||
'Adobe-GB1-4': [[], [0, 29183]],
|
||||
'Adobe-GB1-5': [[], [0, 30463]],
|
||||
'Adobe-GB1-UCS2': [[], [0, 65535]],
|
||||
'GB-EUC-H': [[0, 128], [41377, 65278]],
|
||||
'GB-EUC-V': [[0, 128], [41377, 65278]],
|
||||
'GB-H': [[], [8481, 32382]],
|
||||
'GB-V': [[], [8481, 32382]],
|
||||
'GBK-EUC-H': [[0, 128], [33088, 65278]],
|
||||
'GBK-EUC-V': [[0, 128], [33088, 65278]],
|
||||
'GBK2K-H': [[0, 127], [33088, 65278], [], [2167439664, 4265213497]],
|
||||
'GBK2K-V': [[0, 127], [33088, 65278], [], [2167439664, 4265213497]],
|
||||
'GBKp-EUC-H': [[0, 128], [33088, 65278]],
|
||||
'GBKp-EUC-V': [[0, 128], [33088, 65278]],
|
||||
'GBpc-EUC-H': [[0, 128, 253, 255], [41377, 64766]],
|
||||
'GBpc-EUC-V': [[0, 128, 253, 255], [41377, 64766]],
|
||||
'GBT-EUC-H': [[0, 128], [41377, 65278]],
|
||||
'GBT-EUC-V': [[0, 128], [41377, 65278]],
|
||||
'GBT-H': [[], [8481, 32382]],
|
||||
'GBT-V': [[], [8481, 32382]],
|
||||
'GBTpc-EUC-H': [[0, 128, 253, 255], [41377, 64766]],
|
||||
'GBTpc-EUC-V': [[0, 128, 253, 255], [41377, 64766]],
|
||||
'UniGB-UCS2-H': [[], [0, 55295, 57344, 65535]],
|
||||
'UniGB-UCS2-V': [[], [0, 55295, 57344, 65535]],
|
||||
'UniGB-UTF16-H': [[], [0, 55295, 57344, 65535], [], [3623934976, 3690979327]],
|
||||
'UniGB-UTF16-V': [[], [0, 55295, 57344, 65535], [], [3623934976, 3690979327]],
|
||||
'78-EUC-H': [[0, 128], [36512, 36575, 41377, 65278]],
|
||||
'78-EUC-V': [[0, 128], [36512, 36575, 41377, 65278]],
|
||||
'78-H': [[], [8481, 32382]],
|
||||
'78-RKSJ-H': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]],
|
||||
'78-RKSJ-V': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]],
|
||||
'78-V': [[], [8481, 32382]],
|
||||
'78ms-RKSJ-H': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]],
|
||||
'78ms-RKSJ-V': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]],
|
||||
'83pv-RKSJ-H': [[0, 128, 160, 223, 253, 255], [33088, 40956, 57408, 64764]],
|
||||
'90ms-RKSJ-H': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]],
|
||||
'90ms-RKSJ-V': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]],
|
||||
'90msp-RKSJ-H': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]],
|
||||
'90msp-RKSJ-V': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]],
|
||||
'90pv-RKSJ-H': [[0, 128, 160, 223, 253, 255], [33088, 40956, 57408, 64764]],
|
||||
'90pv-RKSJ-V': [[0, 128, 160, 223, 253, 255], [33088, 40956, 57408, 64764]],
|
||||
'Add-H': [[], [8481, 32382]],
|
||||
'Add-RKSJ-H': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]],
|
||||
'Add-RKSJ-V': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]],
|
||||
'Add-V': [[], [8481, 32382]],
|
||||
'Adobe-Japan1-0': [[], [0, 8447]],
|
||||
'Adobe-Japan1-1': [[], [0, 8447]],
|
||||
'Adobe-Japan1-2': [[], [0, 8959]],
|
||||
'Adobe-Japan1-3': [[], [0, 9471]],
|
||||
'Adobe-Japan1-4': [[], [0, 15615]],
|
||||
'Adobe-Japan1-5': [[], [0, 20479]],
|
||||
'Adobe-Japan1-6': [[], [0, 23295]],
|
||||
'Adobe-Japan1-UCS2': [[], [0, 65535]],
|
||||
'Adobe-Japan2-0': [[], [0, 6143]],
|
||||
'EUC-H': [[0, 128], [36512, 36575, 41377, 65278]],
|
||||
'EUC-V': [[0, 128], [36512, 36575, 41377, 65278]],
|
||||
'Ext-H': [[], [8481, 32382]],
|
||||
'Ext-RKSJ-H': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]],
|
||||
'Ext-RKSJ-V': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]],
|
||||
'Ext-V': [[], [8481, 32382]],
|
||||
'H': [[], [8481, 32382]],
|
||||
'Hankaku': [[0, 255], []],
|
||||
'Hiragana': [[0, 255], []],
|
||||
'Hojo-EUC-H': [[], [], [9413025, 9436926], []],
|
||||
'Hojo-EUC-V': [[], [], [9413025, 9436926], []],
|
||||
'Hojo-H': [[], [8481, 32382]],
|
||||
'Hojo-V': [[], [8481, 32382]],
|
||||
'Katakana': [[0, 255], []],
|
||||
'NWP-H': [[], [8481, 32382]],
|
||||
'NWP-V': [[], [8481, 32382]],
|
||||
'RKSJ-H': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]],
|
||||
'RKSJ-V': [[0, 128, 160, 223], [33088, 40956, 57408, 64764]],
|
||||
'Roman': [[0, 255], []],
|
||||
'UniHojo-UCS2-H': [[], [0, 55295, 57344, 65535]],
|
||||
'UniHojo-UCS2-V': [[], [0, 55295, 57344, 65535]],
|
||||
'UniHojo-UTF16-H': [[], [0, 55295, 57344, 65535], [],
|
||||
[3623934976, 3690979327]],
|
||||
'UniHojo-UTF16-V': [[], [0, 55295, 57344, 65535], [],
|
||||
[3623934976, 3690979327]],
|
||||
'UniJIS-UCS2-H': [[], [0, 55295, 57344, 65535]],
|
||||
'UniJIS-UCS2-HW-H': [[], [0, 55295, 57344, 65535]],
|
||||
'UniJIS-UCS2-HW-V': [[], [0, 55295, 57344, 65535]],
|
||||
'UniJIS-UCS2-V': [[], [0, 55295, 57344, 65535]],
|
||||
'UniJIS-UTF16-H': [[], [0, 55295, 57344, 65535], [],
|
||||
[3623934976, 3690979327]],
|
||||
'UniJIS-UTF16-V': [[], [0, 55295, 57344, 65535], [],
|
||||
[3623934976, 3690979327]],
|
||||
'UniJISPro-UCS2-HW-V': [[], [0, 55295, 57344, 65535]],
|
||||
'UniJISPro-UCS2-V': [[], [0, 55295, 57344, 65535]],
|
||||
'V': [[], [8481, 32382]],
|
||||
'WP-Symbol': [[0, 255], []],
|
||||
'Adobe-Korea1-0': [[], [0, 9471]],
|
||||
'Adobe-Korea1-1': [[], [0, 18175]],
|
||||
'Adobe-Korea1-2': [[], [0, 18431]],
|
||||
'Adobe-Korea1-UCS2': [[], [0, 65535]],
|
||||
'KSC-EUC-H': [[0, 128], [41377, 65278]],
|
||||
'KSC-EUC-V': [[0, 128], [41377, 65278]],
|
||||
'KSC-H': [[], [8481, 32382]],
|
||||
'KSC-Johab-H': [[0, 128], [33857, 54270, 55345, 57086, 57393, 63998]],
|
||||
'KSC-Johab-V': [[0, 128], [33857, 54270, 55345, 57086, 57393, 63998]],
|
||||
'KSC-V': [[], [8481, 32382]],
|
||||
'KSCms-UHC-H': [[0, 128], [33089, 65278]],
|
||||
'KSCms-UHC-HW-H': [[0, 128], [33089, 65278]],
|
||||
'KSCms-UHC-HW-V': [[0, 128], [33089, 65278]],
|
||||
'KSCms-UHC-V': [[0, 128], [33089, 65278]],
|
||||
'KSCpc-EUC-H': [[0, 132, 254, 255], [41281, 65022]],
|
||||
'KSCpc-EUC-V': [[0, 132, 254, 255], [41281, 65022]],
|
||||
'UniKS-UCS2-H': [[], [0, 55295, 57344, 65535]],
|
||||
'UniKS-UCS2-V': [[], [0, 55295, 57344, 65535]],
|
||||
'UniKS-UTF16-H': [[], [0, 55295, 57344, 65535], [], [3623934976, 3690979327]],
|
||||
'UniKS-UTF16-V': [[], [0, 55295, 57344, 65535], [], [3623934976, 3690979327]]
|
||||
};
|
||||
|
||||
// CMap, not to be confused with TrueType's cmap.
|
||||
var CMap = (function CMapClosure() {
|
||||
function CMap() {
|
||||
// Codespace ranges are stored as follows:
|
||||
// [[1BytePairs], [2BytePairs], [3BytePairs], [4BytePairs]]
|
||||
// where nBytePairs are ranges e.g. [low1, high1, low2, high2, ...]
|
||||
this.codespaceRanges = [[], [], [], []];
|
||||
this.map = [];
|
||||
this.vertical = false;
|
||||
}
|
||||
CMap.prototype = {
|
||||
addCodespaceRange: function(n, low, high) {
|
||||
this.codespaceRanges[n - 1].push(low, high);
|
||||
},
|
||||
|
||||
mapRange: function(low, high, dstLow) {
|
||||
var lastByte = dstLow.length - 1;
|
||||
while (low <= high) {
|
||||
this.map[low] = dstLow;
|
||||
// Only the last byte has to be incremented.
|
||||
dstLow = dstLow.substr(0, lastByte) +
|
||||
String.fromCharCode(dstLow.charCodeAt(lastByte) + 1);
|
||||
++low;
|
||||
}
|
||||
},
|
||||
|
||||
mapRangeToArray: function(low, high, array) {
|
||||
var i = 0;
|
||||
while (low <= high) {
|
||||
this.map[low] = array[i++];
|
||||
++low;
|
||||
}
|
||||
},
|
||||
|
||||
mapOne: function(src, dst) {
|
||||
this.map[src] = dst;
|
||||
},
|
||||
|
||||
lookup: function(code) {
|
||||
return this.map[code];
|
||||
},
|
||||
|
||||
readCharCode: function(str, offset) {
|
||||
var c = 0;
|
||||
var codespaceRanges = this.codespaceRanges;
|
||||
var codespaceRangesLen = this.codespaceRanges.length;
|
||||
// 9.7.6.2 CMap Mapping
|
||||
// The code length is at most 4.
|
||||
for (var n = 0; n < codespaceRangesLen; n++) {
|
||||
c = ((c << 8) | str.charCodeAt(offset + n)) >>> 0;
|
||||
// Check each codespace range to see if it falls within.
|
||||
var codespaceRange = codespaceRanges[n];
|
||||
for (var k = 0, kk = codespaceRange.length; k < kk;) {
|
||||
var low = codespaceRange[k++];
|
||||
var high = codespaceRange[k++];
|
||||
if (c >= low && c <= high) {
|
||||
return [c, n + 1];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return [0, 1];
|
||||
}
|
||||
|
||||
};
|
||||
return CMap;
|
||||
})();
|
||||
|
||||
var IdentityCMap = (function IdentityCMapClosure() {
|
||||
function IdentityCMap(vertical, n) {
|
||||
CMap.call(this);
|
||||
this.vertical = vertical;
|
||||
this.addCodespaceRange(n, 0, 0xffff);
|
||||
this.mapRange(0, 0xffff, '\u0000');
|
||||
}
|
||||
Util.inherit(IdentityCMap, CMap, {});
|
||||
|
||||
return IdentityCMap;
|
||||
})();
|
||||
|
||||
var CMapFactory = (function CMapFactoryClosure() {
|
||||
function strToInt(str) {
|
||||
var a = 0;
|
||||
for (var i = 0; i < str.length; i++) {
|
||||
a = (a << 8) | str.charCodeAt(i);
|
||||
}
|
||||
return a >>> 0;
|
||||
}
|
||||
|
||||
function expectString(obj) {
|
||||
if (!isString(obj)) {
|
||||
error('Malformed CMap: expected string.');
|
||||
}
|
||||
}
|
||||
|
||||
function expectInt(obj) {
|
||||
if (!isInt(obj)) {
|
||||
error('Malformed CMap: expected int.');
|
||||
}
|
||||
}
|
||||
|
||||
function parseBfChar(cMap, lexer) {
|
||||
while (true) {
|
||||
var obj = lexer.getObj();
|
||||
if (isEOF(obj)) {
|
||||
break;
|
||||
}
|
||||
if (isCmd(obj, 'endbfchar')) {
|
||||
return;
|
||||
}
|
||||
expectString(obj);
|
||||
var src = strToInt(obj);
|
||||
obj = lexer.getObj();
|
||||
// TODO are /dstName used?
|
||||
expectString(obj);
|
||||
var dst = obj;
|
||||
cMap.mapOne(src, dst);
|
||||
}
|
||||
}
|
||||
|
||||
function parseBfRange(cMap, lexer) {
|
||||
while (true) {
|
||||
var obj = lexer.getObj();
|
||||
if (isEOF(obj)) {
|
||||
break;
|
||||
}
|
||||
if (isCmd(obj, 'endbfrange')) {
|
||||
return;
|
||||
}
|
||||
expectString(obj);
|
||||
var low = strToInt(obj);
|
||||
obj = lexer.getObj();
|
||||
expectString(obj);
|
||||
var high = strToInt(obj);
|
||||
obj = lexer.getObj();
|
||||
if (isInt(obj) || isString(obj)) {
|
||||
var dstLow = isInt(obj) ? String.fromCharCode(obj) : obj;
|
||||
cMap.mapRange(low, high, dstLow);
|
||||
} else if (isCmd(obj, '[')) {
|
||||
obj = lexer.getObj();
|
||||
var array = [];
|
||||
while (!isCmd(obj, ']') && !isEOF(obj)) {
|
||||
array.push(obj);
|
||||
obj = lexer.getObj();
|
||||
}
|
||||
cMap.mapRangeToArray(low, high, array);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
error('Invalid bf range.');
|
||||
}
|
||||
|
||||
function parseCidChar(cMap, lexer) {
|
||||
while (true) {
|
||||
var obj = lexer.getObj();
|
||||
if (isEOF(obj)) {
|
||||
break;
|
||||
}
|
||||
if (isCmd(obj, 'endcidchar')) {
|
||||
return;
|
||||
}
|
||||
expectString(obj);
|
||||
var src = strToInt(obj);
|
||||
obj = lexer.getObj();
|
||||
expectInt(obj);
|
||||
var dst = String.fromCharCode(obj);
|
||||
cMap.mapOne(src, dst);
|
||||
}
|
||||
}
|
||||
|
||||
function parseCidRange(cMap, lexer) {
|
||||
while (true) {
|
||||
var obj = lexer.getObj();
|
||||
if (isEOF(obj)) {
|
||||
break;
|
||||
}
|
||||
if (isCmd(obj, 'endcidrange')) {
|
||||
return;
|
||||
}
|
||||
expectString(obj);
|
||||
var low = strToInt(obj);
|
||||
obj = lexer.getObj();
|
||||
expectString(obj);
|
||||
var high = strToInt(obj);
|
||||
obj = lexer.getObj();
|
||||
expectInt(obj);
|
||||
var dstLow = String.fromCharCode(obj);
|
||||
cMap.mapRange(low, high, dstLow);
|
||||
}
|
||||
}
|
||||
|
||||
function parseCodespaceRange(cMap, lexer) {
|
||||
while (true) {
|
||||
var obj = lexer.getObj();
|
||||
if (isEOF(obj)) {
|
||||
break;
|
||||
}
|
||||
if (isCmd(obj, 'endcodespacerange')) {
|
||||
return;
|
||||
}
|
||||
if (!isString(obj)) {
|
||||
break;
|
||||
}
|
||||
var low = strToInt(obj);
|
||||
obj = lexer.getObj();
|
||||
if (!isString(obj)) {
|
||||
break;
|
||||
}
|
||||
var high = strToInt(obj);
|
||||
cMap.addCodespaceRange(obj.length, low, high);
|
||||
}
|
||||
error('Invalid codespace range.');
|
||||
}
|
||||
|
||||
function parseCmap(cMap, lexer) {
|
||||
objLoop: while (true) {
|
||||
var obj = lexer.getObj();
|
||||
if (isEOF(obj)) {
|
||||
break;
|
||||
} else if (isCmd(obj)) {
|
||||
switch (obj.cmd) {
|
||||
case 'endcMap':
|
||||
break objLoop;
|
||||
case 'usecMap':
|
||||
// TODO
|
||||
break;
|
||||
case 'begincodespacerange':
|
||||
parseCodespaceRange(cMap, lexer);
|
||||
break;
|
||||
case 'beginbfchar':
|
||||
parseBfChar(cMap, lexer);
|
||||
break;
|
||||
case 'begincidchar':
|
||||
parseCidChar(cMap, lexer);
|
||||
break;
|
||||
case 'beginbfrange':
|
||||
parseBfRange(cMap, lexer);
|
||||
break;
|
||||
case 'begincidrange':
|
||||
parseCidRange(cMap, lexer);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return {
|
||||
create: function (encoding) {
|
||||
if (isName(encoding)) {
|
||||
switch (encoding.name) {
|
||||
case 'Identity-H':
|
||||
return new IdentityCMap(false, 2);
|
||||
case 'Identity-V':
|
||||
return new IdentityCMap(true, 2);
|
||||
default:
|
||||
if (encoding.name in CMAP_CODESPACES) {
|
||||
// XXX: Temporary hack so the correct amount of bytes are read in
|
||||
// CMap.readCharCode.
|
||||
var cMap = new CMap();
|
||||
cMap.codespaceRanges = CMAP_CODESPACES[encoding.name];
|
||||
return cMap;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
} else if (isStream(encoding)) {
|
||||
var cMap = new CMap();
|
||||
var lexer = new Lexer(encoding);
|
||||
try {
|
||||
parseCmap(cMap, lexer);
|
||||
} catch (e) {
|
||||
warn('Invalid CMap data. ' + e);
|
||||
}
|
||||
return cMap;
|
||||
}
|
||||
error('Encoding required.');
|
||||
}
|
||||
};
|
||||
})();
|
@ -20,7 +20,7 @@
|
||||
isStream, isString, JpegStream, Lexer, Metrics, Name, Parser,
|
||||
Pattern, PDFImage, PDFJS, serifFonts, stdFontMap, symbolsFonts,
|
||||
TilingPattern, TODO, warn, Util, Promise,
|
||||
RefSetCache, isRef, TextRenderingMode */
|
||||
RefSetCache, isRef, TextRenderingMode, CMapFactory */
|
||||
|
||||
'use strict';
|
||||
|
||||
@ -1010,119 +1010,24 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
|
||||
if (!isIdentityMap)
|
||||
error('ToUnicode file cmap translation not implemented');
|
||||
} else if (isStream(cmapObj)) {
|
||||
var tokens = [];
|
||||
var token = '';
|
||||
var beginArrayToken = {};
|
||||
|
||||
var cmap = cmapObj.getBytes(cmapObj.length);
|
||||
for (var i = 0, ii = cmap.length; i < ii; i++) {
|
||||
var octet = cmap[i];
|
||||
if (octet == 0x20 || octet == 0x0D || octet == 0x0A ||
|
||||
octet == 0x3C || octet == 0x5B || octet == 0x5D) {
|
||||
switch (token) {
|
||||
case 'usecmap':
|
||||
error('usecmap is not implemented');
|
||||
break;
|
||||
|
||||
case 'beginbfchar':
|
||||
case 'beginbfrange':
|
||||
case 'begincidchar':
|
||||
case 'begincidrange':
|
||||
token = '';
|
||||
tokens = [];
|
||||
break;
|
||||
|
||||
case 'endcidrange':
|
||||
case 'endbfrange':
|
||||
for (var j = 0, jj = tokens.length; j < jj; j += 3) {
|
||||
var startRange = tokens[j];
|
||||
var endRange = tokens[j + 1];
|
||||
var code = tokens[j + 2];
|
||||
if (code == 0xFFFF) {
|
||||
// CMap is broken, assuming code == startRange
|
||||
code = startRange;
|
||||
}
|
||||
if (isArray(code)) {
|
||||
var codeindex = 0;
|
||||
while (startRange <= endRange) {
|
||||
charToUnicode[startRange] = code[codeindex++];
|
||||
++startRange;
|
||||
}
|
||||
} else {
|
||||
while (startRange <= endRange) {
|
||||
charToUnicode[startRange] = code++;
|
||||
++startRange;
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case 'endcidchar':
|
||||
case 'endbfchar':
|
||||
for (var j = 0, jj = tokens.length; j < jj; j += 2) {
|
||||
var index = tokens[j];
|
||||
var code = tokens[j + 1];
|
||||
charToUnicode[index] = code;
|
||||
}
|
||||
break;
|
||||
|
||||
case '':
|
||||
break;
|
||||
|
||||
default:
|
||||
if (token[0] >= '0' && token[0] <= '9')
|
||||
token = parseInt(token, 10); // a number
|
||||
tokens.push(token);
|
||||
token = '';
|
||||
var cmap = CMapFactory.create(cmapObj).map;
|
||||
// Convert UTF-16BE
|
||||
for (var i in cmap) {
|
||||
var token = cmap[i];
|
||||
var str = [];
|
||||
for (var k = 0; k < token.length; k += 2) {
|
||||
var w1 = (token.charCodeAt(k) << 8) | token.charCodeAt(k + 1);
|
||||
if ((w1 & 0xF800) !== 0xD800) { // w1 < 0xD800 || w1 > 0xDFFF
|
||||
str.push(w1);
|
||||
continue;
|
||||
}
|
||||
switch (octet) {
|
||||
case 0x5B:
|
||||
// begin list parsing
|
||||
tokens.push(beginArrayToken);
|
||||
break;
|
||||
case 0x5D:
|
||||
// collect array items
|
||||
var items = [], item;
|
||||
while (tokens.length &&
|
||||
(item = tokens.pop()) != beginArrayToken)
|
||||
items.unshift(item);
|
||||
tokens.push(items);
|
||||
break;
|
||||
}
|
||||
} else if (octet == 0x3E) {
|
||||
if (token.length) {
|
||||
// Heuristic: guessing chars size by checking numbers sizes
|
||||
// in the CMap entries.
|
||||
if (token.length == 2 && properties.composite)
|
||||
properties.wideChars = false;
|
||||
|
||||
if (token.length <= 4) {
|
||||
// parsing hex number
|
||||
tokens.push(parseInt(token, 16));
|
||||
token = '';
|
||||
} else {
|
||||
// parsing hex UTF-16BE numbers
|
||||
var str = [];
|
||||
for (var k = 0, kk = token.length; k < kk; k += 4) {
|
||||
var b = parseInt(token.substr(k, 4), 16);
|
||||
if (b <= 0x10) {
|
||||
k += 4;
|
||||
b = (b << 16) | parseInt(token.substr(k, 4), 16);
|
||||
b -= 0x10000;
|
||||
str.push(0xD800 | (b >> 10));
|
||||
str.push(0xDC00 | (b & 0x3FF));
|
||||
break;
|
||||
}
|
||||
str.push(b);
|
||||
}
|
||||
tokens.push(String.fromCharCode.apply(String, str));
|
||||
token = '';
|
||||
}
|
||||
}
|
||||
} else {
|
||||
token += String.fromCharCode(octet);
|
||||
k += 2;
|
||||
var w2 = (token.charCodeAt(k) << 8) | token.charCodeAt(k + 1);
|
||||
str.push(((w1 & 0x3ff) << 10) + (w2 & 0x3ff) + 0x10000);
|
||||
}
|
||||
cmap[i] = String.fromCharCode.apply(String, str);
|
||||
}
|
||||
return cmap;
|
||||
}
|
||||
return charToUnicode;
|
||||
},
|
||||
@ -1409,6 +1314,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
|
||||
properties.cidEncoding = cidEncoding.name;
|
||||
properties.vertical = /-V$/.test(cidEncoding.name);
|
||||
}
|
||||
properties.cmap = CMapFactory.create(cidEncoding);
|
||||
}
|
||||
this.extractWidths(dict, xref, descriptor, properties);
|
||||
this.extractDataStructures(dict, baseDict, xref, properties);
|
||||
|
@ -18,7 +18,7 @@
|
||||
ExpertSubsetCharset, FileReaderSync, GlyphsUnicode,
|
||||
info, isArray, isNum, ISOAdobeCharset, Stream,
|
||||
stringToBytes, TextDecoder, TODO, warn, Lexer, Util,
|
||||
FONT_IDENTITY_MATRIX, FontRendererFactory, shadow */
|
||||
FONT_IDENTITY_MATRIX, FontRendererFactory, shadow, isString */
|
||||
|
||||
'use strict';
|
||||
|
||||
@ -2182,6 +2182,7 @@ var Font = (function FontClosure() {
|
||||
this.composite = properties.composite;
|
||||
this.wideChars = properties.wideChars;
|
||||
this.hasEncoding = properties.hasEncoding;
|
||||
this.cmap = properties.cmap;
|
||||
|
||||
this.fontMatrix = properties.fontMatrix;
|
||||
if (properties.type == 'Type3') {
|
||||
@ -3701,7 +3702,7 @@ var Font = (function FontClosure() {
|
||||
|
||||
var dupFirstEntry = false;
|
||||
if (properties.type == 'CIDFontType2' && properties.toUnicode &&
|
||||
properties.toUnicode[0] > 0) {
|
||||
properties.toUnicode[0] > '\u0000') {
|
||||
// oracle's defect (see 3427), duplicating first entry
|
||||
dupFirstEntry = true;
|
||||
numGlyphs++;
|
||||
@ -4250,8 +4251,12 @@ var Font = (function FontClosure() {
|
||||
var unicode = toUnicode[i];
|
||||
var fontCharCode = typeof unicode === 'object' ? unusedUnicode++ :
|
||||
unicode;
|
||||
if (typeof unicode !== 'undefined')
|
||||
if (typeof unicode !== 'undefined') {
|
||||
if (isString(fontCharCode) && fontCharCode.length === 1) {
|
||||
fontCharCode = fontCharCode.charCodeAt(0);
|
||||
}
|
||||
result[i] = fontCharCode;
|
||||
}
|
||||
}
|
||||
return result;
|
||||
},
|
||||
@ -4264,7 +4269,7 @@ var Font = (function FontClosure() {
|
||||
var isIdentityMap = toUnicode.length === 0;
|
||||
for (var i = firstChar, ii = lastChar; i <= ii; i++) {
|
||||
// TODO missing map the character according font's CMap
|
||||
map[i] = isIdentityMap ? i : toUnicode[i];
|
||||
map[i] = isIdentityMap ? String.fromCharCode(i) : toUnicode[i];
|
||||
}
|
||||
} else {
|
||||
for (var i = firstChar, ii = lastChar; i <= ii; i++) {
|
||||
@ -4272,7 +4277,7 @@ var Font = (function FontClosure() {
|
||||
if (!glyph)
|
||||
glyph = properties.baseEncoding[i];
|
||||
if (!!glyph && (glyph in GlyphsUnicode))
|
||||
map[i] = GlyphsUnicode[glyph];
|
||||
map[i] = String.fromCharCode(GlyphsUnicode[glyph]);
|
||||
}
|
||||
}
|
||||
this.toUnicode = map;
|
||||
@ -4535,15 +4540,15 @@ var Font = (function FontClosure() {
|
||||
warn('Unsupported CMap: ' + cidEncoding);
|
||||
}
|
||||
}
|
||||
if (!converter && this.wideChars) {
|
||||
if (!converter && this.cmap) {
|
||||
var i = 0;
|
||||
// composite fonts have multi-byte strings convert the string from
|
||||
// single-byte to multi-byte
|
||||
// XXX assuming CIDFonts are two-byte - later need to extract the
|
||||
// correct byte encoding according to the PDF spec
|
||||
var length = chars.length - 1; // looping over two bytes at a time so
|
||||
// loop should never end on the last byte
|
||||
for (var i = 0; i < length; i++) {
|
||||
var charcode = int16([chars.charCodeAt(i++), chars.charCodeAt(i)]);
|
||||
while (i < chars.length) {
|
||||
var c = this.cmap.readCharCode(chars, i);
|
||||
var charcode = c[0];
|
||||
var length = c[1];
|
||||
i += length;
|
||||
var glyph = this.charToGlyph(charcode);
|
||||
glyphs.push(glyph);
|
||||
// placing null after each word break charcode (ASCII SPACE)
|
||||
|
@ -34,6 +34,7 @@ var files = [
|
||||
'core/cidmaps.js',
|
||||
'core/crypto.js',
|
||||
'core/evaluator.js',
|
||||
'core/cmap.js',
|
||||
'core/fonts.js',
|
||||
'core/font_renderer.js',
|
||||
'core/glyphlist.js',
|
||||
|
BIN
test/pdfs/bug898853.pdf
Normal file
BIN
test/pdfs/bug898853.pdf
Normal file
Binary file not shown.
@ -869,6 +869,13 @@
|
||||
"link": true,
|
||||
"type": "eq"
|
||||
},
|
||||
{ "id": "bug898853.pdf",
|
||||
"file": "pdfs/bug898853.pdf",
|
||||
"md5": "37c37702bf98d33f9f74e2380c4d1a3f",
|
||||
"rounds": 1,
|
||||
"type": "eq",
|
||||
"about": "Has a multi-byte char codes."
|
||||
},
|
||||
{ "id": "issue1912",
|
||||
"file": "pdfs/issue1912.pdf",
|
||||
"md5": "15305b7c2cba971e7423de3f6ad38fef",
|
||||
|
86
test/unit/cmap_spec.js
Normal file
86
test/unit/cmap_spec.js
Normal file
@ -0,0 +1,86 @@
|
||||
/* -*- Mode: Java; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
||||
/* vim: set shiftwidth=2 tabstop=2 autoindent cindent expandtab: */
|
||||
/* globals expect, it, describe, StringStream, Lexer, CMapFactory */
|
||||
|
||||
'use strict';
|
||||
|
||||
describe('cmap', function() {
|
||||
it('parses beginbfchar', function() {
|
||||
var str = '2 beginbfchar\n' +
|
||||
'<03> <00>\n' +
|
||||
'<04> <01>\n' +
|
||||
'endbfchar\n';
|
||||
var stream = new StringStream(str);
|
||||
var cmap = CMapFactory.create(stream);
|
||||
expect(cmap.lookup(0x03)).toEqual(String.fromCharCode(0x00));
|
||||
expect(cmap.lookup(0x04)).toEqual(String.fromCharCode(0x01));
|
||||
expect(cmap.lookup(0x05)).toBeUndefined();
|
||||
});
|
||||
it('parses beginbfrange with range', function() {
|
||||
var str = '1 beginbfrange\n' +
|
||||
'<06> <0B> 0\n' +
|
||||
'endbfrange\n';
|
||||
var stream = new StringStream(str);
|
||||
var cmap = CMapFactory.create(stream);
|
||||
expect(cmap.lookup(0x05)).toBeUndefined();
|
||||
expect(cmap.lookup(0x06)).toEqual(String.fromCharCode(0x00));
|
||||
expect(cmap.lookup(0x0B)).toEqual(String.fromCharCode(0x05));
|
||||
expect(cmap.lookup(0x0C)).toBeUndefined();
|
||||
});
|
||||
it('parses beginbfrange with array', function() {
|
||||
var str = '1 beginbfrange\n' +
|
||||
'<0D> <12> [ 0 1 2 3 4 5 ]\n' +
|
||||
'endbfrange\n';
|
||||
var stream = new StringStream(str);
|
||||
var cmap = CMapFactory.create(stream);
|
||||
expect(cmap.lookup(0x0C)).toBeUndefined();
|
||||
expect(cmap.lookup(0x0D)).toEqual(0x00);
|
||||
expect(cmap.lookup(0x12)).toEqual(0x05);
|
||||
expect(cmap.lookup(0x13)).toBeUndefined();
|
||||
});
|
||||
it('parses begincidchar', function() {
|
||||
var str = '1 begincidchar\n' +
|
||||
'<14> 0\n' +
|
||||
'endcidchar\n';
|
||||
var stream = new StringStream(str);
|
||||
var cmap = CMapFactory.create(stream);
|
||||
expect(cmap.lookup(0x14)).toEqual(String.fromCharCode(0x00));
|
||||
expect(cmap.lookup(0x15)).toBeUndefined();
|
||||
});
|
||||
it('parses begincidrange', function() {
|
||||
var str = '1 begincidrange\n' +
|
||||
'<0016> <001B> 0\n' +
|
||||
'endcidrange\n';
|
||||
var stream = new StringStream(str);
|
||||
var cmap = CMapFactory.create(stream);
|
||||
expect(cmap.lookup(0x15)).toBeUndefined();
|
||||
expect(cmap.lookup(0x16)).toEqual(String.fromCharCode(0x00));
|
||||
expect(cmap.lookup(0x1B)).toEqual(String.fromCharCode(0x05));
|
||||
expect(cmap.lookup(0x1C)).toBeUndefined();
|
||||
});
|
||||
it('decodes codespace ranges', function() {
|
||||
var str = '1 begincodespacerange\n' +
|
||||
'<01> <02>\n' +
|
||||
'<00000003> <00000004>\n' +
|
||||
'endcodespacerange\n';
|
||||
var stream = new StringStream(str);
|
||||
var cmap = CMapFactory.create(stream);
|
||||
var c = cmap.readCharCode(String.fromCharCode(1), 0);
|
||||
expect(c[0]).toEqual(1);
|
||||
expect(c[1]).toEqual(1);
|
||||
c = cmap.readCharCode(String.fromCharCode(0, 0, 0, 3), 0);
|
||||
expect(c[0]).toEqual(3);
|
||||
expect(c[1]).toEqual(4);
|
||||
});
|
||||
it('decodes 4 byte codespace ranges', function() {
|
||||
var str = '1 begincodespacerange\n' +
|
||||
'<8EA1A1A1> <8EA1FEFE>\n' +
|
||||
'endcodespacerange\n';
|
||||
var stream = new StringStream(str);
|
||||
var cmap = CMapFactory.create(stream);
|
||||
var c = cmap.readCharCode(String.fromCharCode(0x8E, 0xA1, 0xA1, 0xA1), 0);
|
||||
expect(c[0]).toEqual(0x8EA1A1A1);
|
||||
expect(c[1]).toEqual(4);
|
||||
});
|
||||
});
|
||||
|
@ -26,6 +26,7 @@
|
||||
<script type="text/javascript" src="../../src/shared/colorspace.js"></script>
|
||||
<script type="text/javascript" src="../../src/core/crypto.js"></script>
|
||||
<script type="text/javascript" src="../../src/core/evaluator.js"></script>
|
||||
<script type="text/javascript" src="../../src/core/cmap.js"></script>
|
||||
<script type="text/javascript" src="../../src/core/fonts.js"></script>
|
||||
<script type="text/javascript" src="../../src/core/glyphlist.js"></script>
|
||||
<script type="text/javascript" src="../../src/core/image.js"></script>
|
||||
@ -49,6 +50,7 @@
|
||||
<script type="text/javascript" src="api_spec.js"></script>
|
||||
<script type="text/javascript" src="metadata_spec.js"></script>
|
||||
<script type="text/javascript" src="util_spec.js"></script>
|
||||
<script type="text/javascript" src="cmap_spec.js"></script>
|
||||
<script type="text/javascript">
|
||||
'use strict';
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user