From f5be2d62a3e0d02b5f92abb6ebda7d1a70426c2f Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Sun, 29 Sep 2019 23:50:58 +0200 Subject: [PATCH] Improve the heuristics, in `PartialEvaluator._buildSimpleFontToUnicode`, for glyphNames of the Cdd{d}/cdd{d} format (issue 9655) *Please note:* I've been thinking about possible ways of addressing this issue for a while now, but all of the solutions I came up with became too complicated and thus hurt readability of the code. However, it occurred to me that we're essentially trying to add a heuristic *on top* of another heuristic, and that it shouldn't matter how efficient the code is as long as it works. In the PDF file in the issue the Encoding contains glyphNames of the `Cdd` format, which our existing heuristics will treat as base 10 values. However, in this particular file they actually contain base 16 values, which we thus attempt to detect and fix such that text-selection works. --- src/core/evaluator.js | 31 ++++++++++++++++++++++++------- test/pdfs/.gitignore | 1 + test/pdfs/issue9655_reduced.pdf | Bin 0 -> 31518 bytes test/test_manifest.json | 7 +++++++ 4 files changed, 32 insertions(+), 7 deletions(-) create mode 100644 test/pdfs/issue9655_reduced.pdf diff --git a/src/core/evaluator.js b/src/core/evaluator.js index 2c3c80964..9bc1e75ef 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -1977,7 +1977,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { * @returns {ToUnicodeMap} * @private */ - _buildSimpleFontToUnicode(properties) { + _buildSimpleFontToUnicode(properties, forceGlyphs = false) { assert(!properties.composite, 'Must be a simple font.'); let toUnicode = [], charcode, glyphName; @@ -2017,14 +2017,31 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { code = parseInt(glyphName.substring(1), 16); } break; - case 'C': // Cddd glyph - case 'c': // cddd glyph - if (glyphName.length >= 3) { - code = +glyphName.substring(1); + case 'C': // Cdd{d} glyph + case 'c': // 
cdd{d} glyph + if (glyphName.length >= 3 && glyphName.length <= 4) { + const codeStr = glyphName.substring(1); + + if (forceGlyphs) { + code = parseInt(codeStr, 16); + break; + } + // Normally the Cdd{d}/cdd{d} glyphName format will contain + // regular, i.e. base 10, charCodes (see issue4550.pdf)... + code = +codeStr; + + // ... however some PDF generators violate that assumption by + // containing glyph, i.e. base 16, codes instead. + // In that case we need to re-parse the *entire* encoding to + // prevent broken text-selection (fixes issue9655_reduced.pdf). + if (Number.isNaN(code) && + Number.isInteger(parseInt(codeStr, 16))) { + return this._buildSimpleFontToUnicode(properties, + /* forceGlyphs */ true); + } } break; - default: - // 'uniXXXX'/'uXXXX{XX}' glyphs + default: // 'uniXXXX'/'uXXXX{XX}' glyphs let unicode = getUnicodeForGlyph(glyphName, glyphsUnicodeMap); if (unicode !== -1) { code = unicode; diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index 46f0aa1a6..0ca334476 100644 --- a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -73,6 +73,7 @@ !issue9291.pdf !issue9418.pdf !issue9458.pdf +!issue9655_reduced.pdf !issue9915_reduced.pdf !issue9940.pdf !issue10388_reduced.pdf diff --git a/test/pdfs/issue9655_reduced.pdf b/test/pdfs/issue9655_reduced.pdf new file mode 100644 index 0000000000000000000000000000000000000000..535fc1acf55a80cf2204ba5643bb61f167ca4556 GIT binary patch literal 31518 zcmeHQO^@7Gc4ec16wq*7+bjkqHwKc0)x|2-7htFm%Qoy}oH!lGv+* z*FEpP`yL<3SKoj0?bqUK^SxL9`@3)cYcGyQ(b32M9PPdS`riKg2cH~1h{g!|U~m8C zZ|)yP`!^0w4t{;~$=>zrdx!UKizMUyC4YbS_Jiod2`~9*Z~w;8=l4#c*j+5?l`o~7 zzx0EnlY^7PXvBPv4{@Dk{NCa1y9aL^eGz>)g7_>RNAu+viGOhT;OO(?TPUFv_U$8F zqG%Z|_dbbcniU1#KR&wkAew8yFFybHL>iLK zi3@&yV5!AZG!wIZ!>amPoJ6xEnl6y3wmoC__J4Wz_Q|IYqJ^YaQUfdJ%m4Gkd!L+q z8YM~0Dn2+lK0Nqr?~R+tVHu6Cjg#o+oxON8if;Z^G;Y<4Zhp2GFUO3ZPZrmbMRfD_ z-iLqv{@;8Q-Tb?{;CK{YTT}&)kD4Xp>9r}eR!Z`Iv*37gZCMpOY!;l1uT83g$1Q^M zYqLtuC(Uw6T+jJ8&4SadBEEa?L9^(5Qg5Koj}Mwfm(y#rMWvkgZdujey1B=$raFv# 
zuOq>{y5GEe=guMe3_HVzXqf#Q;}Mk4-&;{K=kJ|pHpOoe^F)bc!V_f@iI@>l6l%w5 z8OzCHRVN9rZt^MizR=T(UQr|nDdO%GfIQn zI8k~=>6wTy=h-rfM*^7ZjJeL3>x{Y1M?5j5Ia8W5r8!fQ!p>(=jEdtgUOz`dDCPc* zB{NvgeLr%YwA}mIJ$-YsMbJ-Z^=>dt_J3luR!ln};Zx-0!Y5HICXe!vQR@TwF-qX% zA>)h!ln|ReU_D~(%eWTdKHpJ4qv0e1YYk6GaK$9M%NKl1T)xmg7_Bm(4~T8}Fao~~ z-;N+=_*OJ^`9c6mX*ezf!wt_P5M41ChT>WHEJCp@9P*lP$v!i4GVDXp09uA3(6;!; zLPGDL2LhpVw6jnJe^PQ`Vk*3SrWl$Ln$mw5G045pJzyPS12I`sh5_|Hi7~sJWo;5 zIO?t(C8ff+4!Y zZ5zpC{M%w#T<$*;_J#(Q!R718GD=>`G85`x8OD|?vy5%laf~c3G!71pZ=j2WfGrA0 zE(z%iB;gXHEu&$+Q8cief*nWpz$hLBD1fMPzQ171b10rZPV`aGcr#>YE~&w zl~Y}zS|#iJ3^6;k1;7@Ka$w*AjHW~lkeGtpz%DGSgzSQwyUc3`v=I=XFuWiDH#brY zlur#PNNScibO4=3Z8LifGM3AXfh+4iw9*@i+VON3O1AQjj@uURss@XL*>h~L+_1Q- zts4YrXl+9@rf5gTKIz8ZJcZDKgfxJ&)S@sf>Q$K;h+!7&xDe4&fDk#iA(!k*0oxg90|VYM9#?6`q5P)KrN6E`6RL#eWG|F~tz^ zLKOnBCLXZoi=in1F0Gm{Ub@3_h~*N1lZ1%nV#rROp(X-OQ6K=0d`ipoG)SdBt5cdY z9LQb)a@9J6(o*;UW1)fj#n_3&^>7_AU^1<|m=lC7`au>RhbV*?I2fVhxsDdYI^kJzW*Gae$pe zR96L^b1^kJL(F}yFcHa_{0Vge9bb`II4m0AO(_k zSUC{p-)hJp|5Cxu#lUO_Y>^Q#NV5)0!8N8j%aU1T#^2&nPkOVAl9#&FCe&dmu$*&s zOTotFAqpT+Clf%OOaVio6pVMM;=_xT(G6e&5%|NYfzwoNa|pYTNz}&tg|Gy^7G)sh z-z4eqD+m5yVEA3WD^Z}W2XvijO7hUaMJfP(c>yHlo%}piedlMX6eCzUFwD{o)4`Fh zacZ?VTw|Vi6caIuLnFWgLt3OzX?=`)8iPh#Y)E|J2A@TW% zP2B*F9VI6+Q)xd99%@e$w?(5kF;SGdJ{LM6=uV>m)`9>s2Sl0FujV`WUlSU@I~JkA zS*S@4&Mko8gtRFVa7BT$E43fqDWrqfG|1D$DM0b!mWyaRMILpo;zpi(wNe& znOw{U_CPuCOtoBz1>NQ)5vzSZ(y0bPSw5qE23J;;@HH8Q^Re%njNeJA3(NCBKsgfe& zbO8r6sro4cQ50QU0r}Tw29URgpcBhg%u><;xR8_qw9o=J$iau85=P0D1D%vCyp5G3 zSLe(acU2n3n$fgmASP`~D!2pDZ>*7f6_*(Z6Dp55H3NZLM7<1z^>eghhNk3U_LJ>u zt%Al}Ym_4gLR6kW?1BKwcPW8!N{Za?Xex@~tHayrE3bnxy<;WD8|A0m`A$NZO7RL!WCj;})rz~XU#$t>f zvH~K|g#;^$Dm<{=4m~$5wN6Hrm5Z%=t9l0#yMlV!1xj8zA2Fd0yMVnIz3hUkzl@~v z$8{IFyb{q))N;7HgQB7-;lliD*fw%8s zYfG!^*2_iHJGKd>Ihg;Ek+gV!6qX}M0Zg8Ofdj$HLV_I!aHV8|lkXc{U7~3*FImn?ifSDy(N%g5 zmF!fP4wWj@(Iv1&=cy}78Qv_}bpVR3+n0e0K^zl;HF0Ca z;M9dUu;Ty-uEITskET_<`MA81d)ner&9L&@Z}_0(rNf5_b*PtndV8(vxdtfz{?7f8 
z{sdqSUc5%o8-A_+ZsU^VqCB%`-F4P`W-;kAvoN8Ket=!ikKYfx;?ESpDy;=!M9Boe z+~^PipjHa@N{tf@m&g#Ti3m0Xp<9#={Vk8vhiN8J*~68j7-kPc2V{H<5aL{^_R`%2TwQ&DYmG2S}sH<-#zw#(*XCe&d>up9sC zMnszl=am`?`Z#L|gf;O-M7k<1MVk=`y%F)?e4xD-_CgZ^NF_zAPy*v912qt^DQZMu zQxxD{shJh?ws{lJg!8qflb^0MLFArE%B~o+Zc=&zshI#)R=hc>5QDU49yEA!#U@&R z8!>pu%y|4oyXnEx1=~yE$$U|s-?y0_ERpx)l(6zLPMJ^#r|<~H6XaB?ys2<1ljcJ6 zY8;y0k5ILNDTFdMlrBKiJrvb^=r{%71||cZQ(%oz$>wVTws9$yu&fzMfdI5*Si`X* z0%%7_9ay7=y#=KBQ#Bblh z{yu04$1u<27vOXwD9q2aV*;q#6JnQ)MV+T>B8F@P{NW~de1)errxwPL@n8)^jmc%OQ!7yi?vf7V`tS@%F(ClkevH5P2knb(KL!{u4~EFjvkK5% z$B0tE#5m{)=>#w_0ZfBHHmwQ3sk$Lq9;ILuZx`w2Hq7lpsaWc2=s}1OsXS{afR$%{ zMFW9=dBzZQ8ynBicoKj@QnrM&CbsxEKT0M(@o}xm8HF}PncRA!0D3Xtol*bP zmXC76{wgR__|G1(rZ%B@VE{%))i-lz1nDo7AlO)l<+!~;4l#laCcS^ z1+rXL4)_WdoA&y7+f5r@XGwX(s~D?>jRbgALr-s;l9$><6Y8*ud=0`?ZQ`Rc(d+rK z6y-TpXQC*aI(%FpMB?R2DF7uR$!~F~Q4)D8pAl!Y0kzjEwbBG$2FcajotgC0Jh*xF}U|~~3Fn=^)69WutYG^pR$+TXExrZo#0L~Buuta}8(0fydyb7Z}ZJgmv zLOr%{W20<8b|ku`wtG{D33XUEybG(qS7|GE)6mL`+X0f+eS;bGLs0mjH%#gi1Pz} z5tq8_N^zEe-ui3l2bUJr46x^XgTGlSyj9E>0`_|&wSdt#_OD!Dfse74IMgCjq145N zuJ)y7s8omYp$qX{q%hXkHNmQBZtW0FcOHJS$gT-Xzs?3ZwV0nMR<=KtJf{H65UVOn zi{6>$TB4iNwxbOkiY}pzCg3|%=(vS4Yg9w;DG6D2T{hZ09P=L}QCG-Ew8%}ny{iOl zed6$Wa|y&>LtA?7z=mk6(is#Z0u8DT1<>a7v2LNe?-642q9h5R>J2j_!AuBb`9uJ) zhE#wqeIuv&uV)TEM2;Ce?@RTkq%hUJd#Iz3@z0yT@rKGa>JDwE)h$B#ILZ1sM4Z zTsV-QSxBCLMb&lCmTInP-uWTi+G?6F6kd3qo_mrP%l{NWG4giIeB{NFb;q-|You-F z^CRCivY{SuICp5r0p6XE1J9J%O|^Wpl75W^o>a^MfcjAaGKh!g;0JACh{vGCQiNQ; zqsa(Q!48h9VYZup>44p18R+X3!pz>hAb@#&X1 zu)oM309nGi8jy#eSU#TeV(-ACl{t#RDno)iR`}m)xti9o zr?c{|k=CKJ_pTASJ9ycy5fkc|%z*z_p3L|O9u18gkG>9=;W}@6peVB@p#vO;bYy{4 z9sJ3KoQ4*4YW8J#H2?-P!C~eAh@&B7nM8ru4qOPMS&MHiN{j02<-tpQM}@E~_|?h) z?4~uJcQE~AUixS&-I|u2r_=J>x%JJUz30wJpSiOMbDfY}b1HK73Q+tq+$tI-W*JhSP|P|3wof1~Arp{oHivKU-JN>mVs2dN`h zC^i~VgFK@I4$gU7$-}IkoX5%rMwVNM;^9LUiQi>jWeU=Yix}0JkR&yB5^)miEMLLJ>w$Ro~>e zq~ifCs)W#olk$+_8y16wUJHP<{JPc6R+=;|AlLS2x83^cJzJdgSwJ?S4jc3Xc@R>y 
z>XcBZ1#PctfnE$0kBy}kQg|fc#uTJ455OOT3@0ACIS!+<+vV>V^voWFl-IAL(sGiSXy`i=b~nF+Z`W3vLLY?)C3S z0z-GzP-nrU(pQ=eIJFH5SNK^IuyM0%_1TS<&90n7_q)&hl;3S7TI4Bn$Z51#0j-9N zAV^muq!9qn4cPDs)!oRyhqaa{{0e~4u7Q;c@x}xf>#PB!2qmwWB0)&w z5#kEDr$r$56$orC*#Lb?F>AxqSCC?WD^9UALofgk?hno!I5)6+VPGdW;UNDHy$XfL z$~Dsgz${xq${I4H0kEiOjVu*|7Xq9i1ck{`^Wwpc=zCc(+Q#H%N;AaYNqk zwTlcdl-XRV(d$9dR1$)d5DGwR_&Vy8o%w2JppFAo%O&}ZaBd||tnZTZQ$om>)8%$&K;t!g^RjQ&^K*atc@ti^q#Cxp;x^ zBG%-_$r8`vRj-}Iqb<3~4Bv;W$t4qfzOo*+m~UN+yD?ZGt>!UZ;*~nJu-OzJx2%QD zN6W2i=d;CDSr#$AM_IdeG3Q4Vc@%BGoQ$@^=F56n;!(^GAy%%9M^k)mvlbS|{Geo2 zE}kv760WIMuT4hVWl3gui&sq!ZMsz!wCPrQvjpE|tX(@>;7u2`u=#ka z?~fM=zGGUKTW@fFu!-2Lq# b4V64?ad3PhrfV`9FGqW?zWUZbzP