Add basic support for non-embedded Calibri fonts (issue 9195)

There are a number of issues with the fonts in the referenced PDF file. First of all, they contain broken `ToUnicode` data (`NUL` bytes all over the place). However, even if you skip those, the `ToUnicode` data appears to contain nothing but an `Identity-H` CMap, which won't help provide a proper glyph mapping.

The real issue actually turns out to be that the PDF file uses the "Calibri" font[1], but doesn't include any font files. Since that one isn't a standard font, and uses a fairly different CID to GID map compared to the standard fonts, we're not able to render the file even remotely correctly.
To work around this, I'm thus proposing that we include an (incomplete) glyph map for Calibri, and fall back to the standard Helvetica font. Obviously this isn't going to look perfect, but it's really the best that we can hope to achieve given that the PDF file is missing the necessary font data.

Finally, please note that none of the PDF readers I've tried (Adobe Reader, PDFium in Chrome) were able to extract the text (which isn't very surprising, given the broken `ToUnicode` data).

Fixes 9195.

---

[1] According to Wikipedia, see https://en.wikipedia.org/wiki/Calibri, Calibri is (primarily) a Windows font.
This commit is contained in:
Jonas Jenwald 2017-12-03 14:02:22 +01:00
parent de0bac727e
commit 08de655177
4 changed files with 44 additions and 1 deletions

View File

@ -28,7 +28,7 @@ import {
} from './encodings';
import {
getGlyphMapForStandardFonts, getNonStdFontMap, getStdFontMap,
getSupplementalGlyphMapForArialBlack
getSupplementalGlyphMapForArialBlack, getSupplementalGlyphMapForCalibri
} from './standard_fonts';
import {
getUnicodeForGlyph, getUnicodeRangeFor, mapSpecialUnicodeValues
@ -1242,7 +1242,14 @@ var Font = (function FontClosure() {
for (charCode in SupplementalGlyphMapForArialBlack) {
map[+charCode] = SupplementalGlyphMapForArialBlack[charCode];
}
} else if (/Calibri/i.test(name)) {
let SupplementalGlyphMapForCalibri =
getSupplementalGlyphMapForCalibri();
for (charCode in SupplementalGlyphMapForCalibri) {
map[+charCode] = SupplementalGlyphMapForCalibri[charCode];
}
}
var isIdentityUnicode = this.toUnicode instanceof IdentityToUnicodeMap;
if (!isIdentityUnicode) {
this.toUnicode.forEach(function(charCode, unicodeCharCode) {

View File

@ -83,6 +83,10 @@ var getStdFontMap = getLookupTableFactory(function (t) {
* a standard fonts without glyph data.
*/
var getNonStdFontMap = getLookupTableFactory(function (t) {
t['Calibri'] = 'Helvetica';
t['Calibri-Bold'] = 'Helvetica-Bold';
t['Calibri-BoldItalic'] = 'Helvetica-BoldOblique';
t['Calibri-Italic'] = 'Helvetica-Oblique';
t['CenturyGothic'] = 'Helvetica';
t['CenturyGothic-Bold'] = 'Helvetica-Bold';
t['CenturyGothic-BoldItalic'] = 'Helvetica-BoldOblique';
@ -355,6 +359,28 @@ var getSupplementalGlyphMapForArialBlack =
t[227] = 322; t[264] = 261; t[291] = 346;
});
// Calibri (a Windows font) does not share the glyph map of the standard
// fonts. For non-embedded Calibri fonts, the (incomplete) table below is
// layered on top of the standard glyph map, translating Calibri CIDs to the
// GIDs that the standard fonts use.
let getSupplementalGlyphMapForCalibri = getLookupTableFactory(function(t) {
  // Each entry is a [CID, GID] pair; the table is deliberately partial.
  const cidToGidPairs = [
    [1, 32],
    [4, 65], [17, 66], [18, 67], [24, 68], [28, 69], [38, 70], [39, 71],
    [44, 72], [47, 73], [58, 74], [60, 75], [62, 76], [68, 77], [69, 78],
    [75, 79], [87, 80], [89, 81], [90, 82], [94, 83], [100, 84], [104, 85],
    [115, 86], [116, 87], [121, 88], [122, 89], [127, 90],
    [258, 97], [268, 261], [271, 98], [272, 99], [273, 263], [282, 100],
    [286, 101], [295, 281], [296, 102], [336, 103], [346, 104], [349, 105],
    [361, 106], [364, 107], [367, 108], [371, 322], [373, 109], [374, 110],
    [381, 111], [383, 243], [393, 112], [395, 113], [396, 114], [400, 115],
    [401, 347], [410, 116], [437, 117], [448, 118], [449, 119], [454, 120],
    [455, 121], [460, 122], [463, 380],
    [853, 44], [855, 58], [856, 46], [876, 47], [878, 45], [882, 45],
    [894, 40], [895, 41], [896, 91], [897, 93], [923, 64],
    [1004, 48], [1005, 49], [1006, 50], [1007, 51], [1008, 52], [1009, 53],
    [1010, 54], [1011, 55], [1012, 56], [1013, 57],
    [1081, 37], [1085, 43], [1086, 45],
  ];
  for (const [cid, gid] of cidToGidPairs) {
    t[cid] = gid;
  }
});
export {
getStdFontMap,
getNonStdFontMap,
@ -362,4 +388,5 @@ export {
getSymbolsFonts,
getGlyphMapForStandardFonts,
getSupplementalGlyphMapForArialBlack,
getSupplementalGlyphMapForCalibri,
};

View File

@ -0,0 +1 @@
https://github.com/mozilla/pdf.js/files/1506940/testfile.pdf

View File

@ -718,6 +718,14 @@
"link": false,
"type": "eq"
},
{ "id": "issue9195",
"file": "pdfs/issue9195.pdf",
"md5": "90e78a11abdc6c5ae79b8b95cfbb1895",
"rounds": 1,
"link": true,
"lastPage": 1,
"type": "eq"
},
{ "id": "issue8707",
"file": "pdfs/issue8707.pdf",
"md5": "d3dc670adde9ec9fb82c974027033029",