For embedded Type1 fonts without included ToUnicode/Encoding data, attempt to improve text selection by using the builtInEncoding to amend the toUnicode map (issue 6901, issue 7182, issue 7217, bug 917796, bug 1242142)

Note that in order to prevent any possible issues, this patch does *not* try to amend the `toUnicode` data for Type1 fonts that contain either `ToUnicode` or `Encoding` entries in the font dictionary.

Fixes, or at least improves, issues/bugs such as 6658, 6901, 7182, 7217, bug 917796, bug 1242142.
This commit is contained in:
Jonas Jenwald 2016-08-17 18:33:06 +02:00
parent bf6f5d1cc9
commit 325f7afcca
5 changed files with 65 additions and 8 deletions

View File

@ -1757,6 +1757,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
properties.differences = differences; properties.differences = differences;
properties.baseEncodingName = baseEncodingName; properties.baseEncodingName = baseEncodingName;
properties.hasEncoding = !!baseEncodingName || differences.length > 0;
properties.dict = dict; properties.dict = dict;
return toUnicodePromise.then(function(toUnicode) { return toUnicodePromise.then(function(toUnicode) {
properties.toUnicode = toUnicode; properties.toUnicode = toUnicode;
@ -1774,8 +1775,10 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
* {ToUnicodeMap|IdentityToUnicodeMap} object. * {ToUnicodeMap|IdentityToUnicodeMap} object.
*/ */
buildToUnicode: function PartialEvaluator_buildToUnicode(properties) { buildToUnicode: function PartialEvaluator_buildToUnicode(properties) {
properties.hasIncludedToUnicodeMap =
!!properties.toUnicode && properties.toUnicode.length > 0;
// Section 9.10.2 Mapping Character Codes to Unicode Values // Section 9.10.2 Mapping Character Codes to Unicode Values
if (properties.toUnicode && properties.toUnicode.length !== 0) { if (properties.hasIncludedToUnicodeMap) {
return Promise.resolve(properties.toUnicode); return Promise.resolve(properties.toUnicode);
} }
// According to the spec if the font is a simple font we should only map // According to the spec if the font is a simple font we should only map

View File

@ -163,6 +163,30 @@ function adjustWidths(properties) {
properties.defaultWidth *= scale; properties.defaultWidth *= scale;
} }
/**
 * Tries to improve the `toUnicode` map of a font by translating the glyph
 * names of its built-in encoding into Unicode characters and amending the
 * existing map with the result.
 *
 * Nothing is changed when the font dictionary already carries `ToUnicode` or
 * `Encoding` data, when the built-in encoding is the very same object as the
 * default encoding, or when `properties.toUnicode` is an
 * `IdentityToUnicodeMap`.
 *
 * @param {Object} properties - Font properties; reads
 *   `hasIncludedToUnicodeMap`, `hasEncoding`, `defaultEncoding` and
 *   `toUnicode`, and calls `toUnicode.amend()`.
 * @param {Object|Array} builtInEncoding - Glyph names keyed by character code.
 */
function adjustToUnicode(properties, builtInEncoding) {
  // The font dictionary has a `ToUnicode` entry or an `Encoding` entry;
  // in either case the existing data is left untouched.
  if (properties.hasIncludedToUnicodeMap || properties.hasEncoding) {
    return;
  }
  // No point in trying to adjust `toUnicode` if the encodings match, and an
  // identity map is never amended.
  if (builtInEncoding === properties.defaultEncoding ||
      properties.toUnicode instanceof IdentityToUnicodeMap) {
    return;
  }

  var glyphsUnicodeMap = getGlyphsUnicode();
  var amendments = [];
  for (var code in builtInEncoding) {
    var unicode = getUnicodeForGlyph(builtInEncoding[code], glyphsUnicodeMap);
    if (unicode === -1) {
      continue; // Glyph name without a known Unicode value; skip it.
    }
    amendments[code] = String.fromCharCode(unicode);
  }
  properties.toUnicode.amend(amendments);
}
function getFontType(type, subtype) { function getFontType(type, subtype) {
switch (type) { switch (type) {
case 'Type1': case 'Type1':
@ -261,7 +285,13 @@ var ToUnicodeMap = (function ToUnicodeMapClosure() {
charCodeOf: function(v) { charCodeOf: function(v) {
return this._map.indexOf(v); return this._map.indexOf(v);
} },
/**
 * Merges the entries of `map` into this object's internal `_map`,
 * overwriting any entry already stored under the same key.
 * @param {Object|Array} map - Replacement values keyed by character code.
 */
amend: function (map) {
// `for...in` only visits keys that are actually present, so holes in a
// sparse array leave the corresponding existing entries untouched.
for (var charCode in map) {
this._map[charCode] = map[charCode];
}
},
}; };
return ToUnicodeMap; return ToUnicodeMap;
@ -297,7 +327,11 @@ var IdentityToUnicodeMap = (function IdentityToUnicodeMapClosure() {
charCodeOf: function (v) { charCodeOf: function (v) {
return (isInt(v) && v >= this.firstChar && v <= this.lastChar) ? v : -1; return (isInt(v) && v >= this.firstChar && v <= this.lastChar) ? v : -1;
} },
/**
 * Amending is not supported here: the body unconditionally calls the
 * `error` helper and never reads `map`.
 * @param {Object|Array} map - Unused.
 */
amend: function (map) {
error('Should not call amend()');
},
}; };
return IdentityToUnicodeMap; return IdentityToUnicodeMap;
@ -765,6 +799,7 @@ var Font = (function FontClosure() {
this.fontMatrix = properties.fontMatrix; this.fontMatrix = properties.fontMatrix;
this.widths = properties.widths; this.widths = properties.widths;
this.defaultWidth = properties.defaultWidth; this.defaultWidth = properties.defaultWidth;
this.toUnicode = properties.toUnicode;
this.encoding = properties.baseEncoding; this.encoding = properties.baseEncoding;
this.seacMap = properties.seacMap; this.seacMap = properties.seacMap;
@ -2386,10 +2421,8 @@ var Font = (function FontClosure() {
} else { } else {
// Most of the following logic in this code branch is based on the // Most of the following logic in this code branch is based on the
// 9.6.6.4 of the PDF spec. // 9.6.6.4 of the PDF spec.
var hasEncoding = var cmapTable = readCmapTable(tables['cmap'], font, this.isSymbolicFont,
properties.differences.length > 0 || !!properties.baseEncodingName; properties.hasEncoding);
var cmapTable =
readCmapTable(tables['cmap'], font, this.isSymbolicFont, hasEncoding);
var cmapPlatformId = cmapTable.platformId; var cmapPlatformId = cmapTable.platformId;
var cmapEncodingId = cmapTable.encodingId; var cmapEncodingId = cmapTable.encodingId;
var cmapMappings = cmapTable.mappings; var cmapMappings = cmapTable.mappings;
@ -2398,7 +2431,7 @@ var Font = (function FontClosure() {
// The spec seems to imply that if the font is symbolic the encoding // The spec seems to imply that if the font is symbolic the encoding
// should be ignored, this doesn't appear to work for 'preistabelle.pdf' // should be ignored, this doesn't appear to work for 'preistabelle.pdf'
// where the font is symbolic and it has an encoding. // where the font is symbolic and it has an encoding.
if (hasEncoding && if (properties.hasEncoding &&
(cmapPlatformId === 3 && cmapEncodingId === 1 || (cmapPlatformId === 3 && cmapEncodingId === 1 ||
cmapPlatformId === 1 && cmapEncodingId === 0) || cmapPlatformId === 1 && cmapEncodingId === 0) ||
(cmapPlatformId === -1 && cmapEncodingId === -1 && // Temporary hack (cmapPlatformId === -1 && cmapEncodingId === -1 && // Temporary hack
@ -2562,6 +2595,12 @@ var Font = (function FontClosure() {
// TODO: Check the charstring widths to determine this. // TODO: Check the charstring widths to determine this.
properties.fixedPitch = false; properties.fixedPitch = false;
if (properties.builtInEncoding) {
// For Type1 fonts that do not include either `ToUnicode` or `Encoding`
// data, attempt to use the `builtInEncoding` to improve text selection.
adjustToUnicode(properties, properties.builtInEncoding);
}
var mapping = font.getGlyphMapping(properties); var mapping = font.getGlyphMapping(properties);
var newMapping = adjustMapping(mapping, properties); var newMapping = adjustMapping(mapping, properties);
this.toFontChar = newMapping.toFontChar; this.toFontChar = newMapping.toFontChar;

View File

@ -22,6 +22,7 @@
!issue5808.pdf !issue5808.pdf
!issue6204.pdf !issue6204.pdf
!issue6782.pdf !issue6782.pdf
!issue6901.pdf
!issue6961.pdf !issue6961.pdf
!issue6962.pdf !issue6962.pdf
!issue7020.pdf !issue7020.pdf

BIN
test/pdfs/issue6901.pdf Normal file

Binary file not shown.

View File

@ -1220,6 +1220,20 @@
"link": false, "link": false,
"type": "text" "type": "text"
}, },
{ "id": "issue6901-eq",
"file": "pdfs/issue6901.pdf",
"md5": "1a0604b1a7a3aaf2162b425a9a84230b",
"rounds": 1,
"link": false,
"type": "eq"
},
{ "id": "issue6901-text",
"file": "pdfs/issue6901.pdf",
"md5": "1a0604b1a7a3aaf2162b425a9a84230b",
"rounds": 1,
"link": false,
"type": "text"
},
{ "id": "issue6962", { "id": "issue6962",
"file": "pdfs/issue6962.pdf", "file": "pdfs/issue6962.pdf",
"md5": "d40e871ecca68baf93114bd28c782148", "md5": "d40e871ecca68baf93114bd28c782148",