From 6dfe53b976c10e957f6f5e611a14aa7ebf4a1aed Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Mon, 23 Nov 2015 16:57:43 +0100 Subject: [PATCH] [api-minor] Add a parameter to `PDFPageProxy_getTextContent` that enables replacing of all whitespace with standard spaces in the textLayer (issue 6612) This patch goes a bit further than issue 6612 requires, and replaces all kinds of whitespace with standard spaces. When testing this locally, it actually seemed to slightly improve two existing test-cases (`tracemonkey-text` and `taro-text`). Fixes 6612. --- src/core/core.js | 7 +++++-- src/core/evaluator.js | 27 +++++++++++++++++++++------ src/core/worker.js | 4 +++- src/display/api.js | 16 ++++++++++++++-- test/driver.js | 10 ++++++---- test/pdfs/.gitignore | 1 + test/pdfs/issue6612.pdf | Bin 0 -> 7067 bytes test/test_manifest.json | 7 +++++++ test/unit/api_spec.js | 20 +++++++++++++++----- web/pdf_find_controller.js | 1 - web/pdf_page_view.js | 2 +- web/pdf_viewer.js | 4 ++-- 12 files changed, 75 insertions(+), 24 deletions(-) create mode 100644 test/pdfs/issue6612.pdf diff --git a/src/core/core.js b/src/core/core.js index 984c5e91e..52ac6d58b 100644 --- a/src/core/core.js +++ b/src/core/core.js @@ -218,7 +218,8 @@ var Page = (function PageClosure() { }); }, - extractTextContent: function Page_extractTextContent(task) { + extractTextContent: function Page_extractTextContent(task, + normalizeWhitespace) { var handler = { on: function nullHandlerOn() {}, send: function nullHandlerSend() {} @@ -248,7 +249,9 @@ var Page = (function PageClosure() { return partialEvaluator.getTextContent(contentStream, task, - self.resources); + self.resources, + /* stateManager = */ null, + normalizeWhitespace); }); }, diff --git a/src/core/evaluator.js b/src/core/evaluator.js index 7e80ecf42..20087751d 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -908,12 +908,15 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { }); }, - getTextContent: function PartialEvaluator_getTextContent(stream, task, - resources, - stateManager) { + getTextContent: + function PartialEvaluator_getTextContent(stream, task, resources, + stateManager, + normalizeWhitespace) { stateManager = (stateManager || new StateManager(new TextState())); + var WhitespaceRegexp = /\s/g; + var textContent = { items: [], styles: Object.create(null) @@ -1027,11 +1030,23 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { return textContentItem; } + function replaceWhitespace(str) { + // Replaces all whitespaces with standard spaces (0x20), to avoid + // alignment issues between the textLayer and the canvas if the text + // contains e.g. tabs (fixes issue6612.pdf). + var i = 0, ii = str.length, code; + while (i < ii && (code = str.charCodeAt(i)) >= 0x20 && code <= 0x7F) { + i++; + } + return (i < ii ? str.replace(WhitespaceRegexp, ' ') : str); + } + function runBidiTransform(textChunk) { var str = textChunk.str.join(''); var bidiResult = PDFJS.bidi(str, -1, textChunk.vertical); return { - str: bidiResult.str, + str: (normalizeWhitespace ? replaceWhitespace(bidiResult.str) : + bidiResult.str), dir: bidiResult.dir, width: textChunk.width, height: textChunk.height, @@ -1352,8 +1367,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { } return self.getTextContent(xobj, task, - xobj.dict.get('Resources') || resources, stateManager). - then(function (formTextContent) { + xobj.dict.get('Resources') || resources, stateManager, + normalizeWhitespace).then(function (formTextContent) { Util.appendToArray(textContent.items, formTextContent.items); Util.extendObj(textContent.styles, formTextContent.styles); stateManager.restore(); diff --git a/src/core/worker.js b/src/core/worker.js index 08fa18981..c45634004 100644 --- a/src/core/worker.js +++ b/src/core/worker.js @@ -517,12 +517,14 @@ var WorkerMessageHandler = PDFJS.WorkerMessageHandler = { handler.on('GetTextContent', function wphExtractText(data) { var pageIndex = data.pageIndex; + var normalizeWhitespace = data.normalizeWhitespace; return pdfManager.getPage(pageIndex).then(function(page) { var task = new WorkerTask('GetTextContent: page ' + pageIndex); startWorkerTask(task); var pageNum = pageIndex + 1; var start = Date.now(); - return page.extractTextContent(task).then(function(textContent) { + return page.extractTextContent(task, normalizeWhitespace).then( + function(textContent) { finishWorkerTask(task); info('text indexing: page=' + pageNum + ' - time=' + (Date.now() - start) + 'ms'); diff --git a/src/display/api.js b/src/display/api.js index e3aafa0eb..1b8dce16c 100644 --- a/src/display/api.js +++ b/src/display/api.js @@ -708,6 +708,14 @@ var PDFDocumentProxy = (function PDFDocumentProxyClosure() { return PDFDocumentProxy; })(); +/** + * Page getTextContent parameters. + * + * @typedef {Object} getTextContentParameters + * @param {boolean} normalizeWhitespace - replaces all occurrences of + * whitespace with standard spaces (0x20). The default value is `false`. + */ + /** * Page text content. * @@ -986,12 +994,16 @@ var PDFPageProxy = (function PDFPageProxyClosure() { }, /** + * @param {getTextContentParameters} params - getTextContent parameters. * @return {Promise} That is resolved a {@link TextContent} * object that represent the page text content. */ - getTextContent: function PDFPageProxy_getTextContent() { + getTextContent: function PDFPageProxy_getTextContent(params) { + var normalizeWhitespace = (params && params.normalizeWhitespace) || false; + return this.transport.messageHandler.sendWithPromise('GetTextContent', { - pageIndex: this.pageNumber - 1 + pageIndex: this.pageNumber - 1, + normalizeWhitespace: normalizeWhitespace, }); }, diff --git a/test/driver.js b/test/driver.js index c41ec7012..a61084eb0 100644 --- a/test/driver.js +++ b/test/driver.js @@ -334,10 +334,12 @@ var Driver = (function DriverClosure() { textLayerContext.clearRect(0, 0, textLayerCanvas.width, textLayerCanvas.height); // The text builder will draw its content on the test canvas - initPromise = page.getTextContent().then(function(textContent) { - return rasterizeTextLayer(textLayerContext, viewport, - textContent); - }); + initPromise = + page.getTextContent({ normalizeWhitespace: true }).then( + function(textContent) { + return rasterizeTextLayer(textLayerContext, viewport, + textContent); + }); } else { textLayerCanvas = null; initPromise = Promise.resolve(); diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index 62a7a80a9..38a33eb86 100644 --- a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -49,6 +49,7 @@ !issue5280.pdf !issue5677.pdf !issue5954.pdf +!issue6612.pdf !alphatrans.pdf !devicen.pdf !cmykjpeg.pdf diff --git a/test/pdfs/issue6612.pdf b/test/pdfs/issue6612.pdf new file mode 100644 index 0000000000000000000000000000000000000000..c9543f12d758ffa157f91c1d828ab2c913746c0b GIT binary patch literal 7067 zcmbVQc{J4h_eYitk?d`NSDFGz3Ts_cmAE1PmhXWd}0e3{g-~b6dIKmn20+g1Ml>b`*10fKQ96&_{fI|Dg z9XtV;tdwgs7{-y5MrS8c&L#n?b0OBLVpgdXsc=_*&MyTp&dV^)rcgV!rr5sTNA{s$ za-A``2WjQHLJouBSbn12^iRCJZ@U|GXtkX2My;OhHp zso-&><;R%Sw^FEuEL8tq|7~$ns18W@0&O=BTl!*Ilh3z{a+R40uP0g>SEq8~FY&bE z*||`3bE@kSuZ6xJapDIlwQJXYtuc1@g}2{#Jj&IU{ef;;4f)FV{fKhE?b)Pa?O9e= zUj8R!`@)&^+WRKAb2;M=)Sxq>q&^r#T<;frB02P9N`QXM@Jku_ZH`jqu34@Ia@j)k@an@Pp;yAB$?+x>-wo12BE%FBiZrKHWn_v&#i zL3hm62dmEvsv~}2r4=ceA6~4lx69tf2-;nrKIk#qyY5A=x9>)eZ?HKPKRlu&nQ;Fc ztZ#Cjqt2Z_@v{+G7_?w!^OXHQJpp=~ckeH`UW0N}{pHoq41Qmd%Y7c~Q%cw+U*bfTDZt~WgEewbn6L3@VYRE)~Aq)xK;6SlFV zS9aO#dY{KBW%rw(dS`E%mt9Ru<;W+o)x$is$Yok~5z{Et!VYo{OoH`>48c`LyMhVNB5MU1%iwZiB7+CF_;Go-(QDnd3Nt(|!%(ybbUINP~lZ9vE~_qb4he=S0p)DDrM$Rl{zzZKIm@J3Y-h?lviDoSN$tS(KIpeY@(+16Ya#?X6 z3mItEzKE?@7GMA6^v*SVUC_Mx6^ui=XeRPk{!Q%*)5aN;A|e2qoyp5w)}EdmWn4e^ z4YVztYf)&({H^mg*7)FBq31257Ea+HxQ7O&@=BMmuW?KH#Px(jvqwJZWTK7w0(+A= zXcx?5na(Hkx?!K_<(Nf9Z9x@jO`LNM`x$-E%A}_5w$kr z-&is%ZL=t3-s7I$4d|OPXz1gplDu8o@C3rwMwu`vZE?N6RNlHE2_~#CJmTYm zX1*geUQPo$p$J!xO(pw)dutG}tt zI;=9eA%SlqUqz>dsB}NL!6$Es4Xz{&%%8hB-e9PgoNB;jcQj+zE3u)Z6enJpz94bh z(ER-K?&e9q$31?J?=J+>fy#c|Qt#fa@a680XKnj=Xw}lh&l2#hWmLx8s;;1Ib}_4{ z=<}djSd5lRfRB`}?Yg7;s&K2c^4X3QpJD%2v54F%h8k*$E^&>2#h(LjsyZ=!G;=+kAi`M$m$%Vqg0ap8q~) z3Yy;)Frkp$po%g);>5~rZQ63!T(CBQ)G$vO3GAb=Cq%F_~4z6!$R1_pE2u(X0_2@a`h*90wEHHy9u`*TJ2AY zWTh-!scvw0JmYs6emQH+5ntc&-MLYN)^w(L=#%4+S1;$B*FDT@_bR_4_WX<|X$d0l zy-kIf$Dn1j395CGki`MJnAbwh;M4fl`%iJs&WK2-A5oZ8&dkV*ieE8^O>V zty?Ps2mX{W+Rnih=R!K3cp9!;dA6x#9(1P@r*Hq$%)0$uOd*a`J*VG9F2k;->BUGR ze;0cOr+r;Vy0jv%ul15cojkR?D64^Rt>Asb{cNG_eK*e7QLjS4NX0MpgY;r4><2@ZoWSJD!jG>(M%NQ z^ZV7<|3inwk9-phr}0^3tWaA^z_4oSzYox{(xY+JsQRS49YcYC7O&U(vgMOo=CHkm zFdIx9=bhqPO+8g1xNPONT$Q$%?ytAaF$>ZXjgqz+>`*m2D-?PSTl7%94$cbz`77>r zlUSQVEP9Li$m2bzBFw(aUH4Cob=p}QrT!(aV9~roPidp0Od>jJ<+tkJ;#@UL>tqjU++xk$i@fm4 z=GB2Q3<`Od^ca{+AH)<_jb=oDB-uvf%5?drEWly8P#EN4Xka3@rH=3=aVhzpRj%9A z)ARu$Zwhy)#-pQaj~@HZ%g{gS8UA`W_4Q|^TdQkzq|lLD$EpF9xM>#;l7eD2PWK!7 zHY9lk^Rjvc$s>weWo<;&_{YpGt*@q>itsoLejHL6cI74X=x1GIs4G)Cna^3B=2N8V zOX#;_AL0;!AHIe{bIMq>A`IMKAS2WxRmpGkQhR^L;i(4KsBI&Z_l`hoi7f?+rNz@fBCRnFq zN0I65m#3&>-0++iq;#a*bVXD}#RoQ$KRuHU$eS3Hh~fkUU`d>+I&QMOl-lo0Z8)G2 zV4Ij{ul35T=xJxzgJiL^QJrITkHT|n*x))|?B2G>!4bNTOFa6z+`SP<*A@ zU02JO#=W1zCkIvpI>vLq#Ho$+=IX8N_r;GX-uRiOOn)2iGjiQUZhI}v0KLsYzENXaC5gjk32){QQD^r^cs+#RDXHF}dHd280XD9;?DQVxtL z^=)-H8soXY?!QH2X;qmZXq2QDlXSyo4Q+$8kjUg1WFs%B7ES)-a3$y5YSg`hw$Ut^f+n8!+z7K@ z=KuW`2}(ZGD8ZCHehbyjy0+yup8-iYXFL-wC1rEvjRKhCegAvYCy&U^x9tL0 zQrcsDb12QfE|`OT#1taE)kJO|3HFj5C8yspxseo+C2k#?4OJpjc3L!0?7tmNjl}!cN{nUyS*>Hbr3D8gH!E*!dEgU9YfJD)2y7CQhK&sFEv@!u95NuC^^d z30|59HQ4*2uR6PSi&W?Pn7#c5@+q2(9%uY=7g}JdbJ7{AV2!%5#P)N;%nGM|aeqI= zIMy0;~(#kVvyz-RItI`zun?oGB2C1_HtvTHfgWp~hyt27VeqZ4VRc~t! zrFQ?bGqLdl=-e#p9ESK`)4o$HoxWy$!5^l+<#Z`z1e`BD;yUDJ5zPq`Cf^|CymZ}P z=o;^eRirhu;5V<4HW}AfmBo;>6pOMLy$2d1kFEd!gMj-%=_C@_Eg^MPQfa=%Ahssb=?hRRJzyU%#=?rRkX;yn! z+C*yj!<)DM9`g%QzMFGZkngvGcVKOUvHdY=l=Js6?SKUT@<{r7n#$j3rDZQPs1-b7 zd6^L|LN$q)t?V5-T}tZzycMEh;7bBTSA zwG!!3-z#R_ig8z=3l)Ga##40tx)G}@UdB2Ry9olbytHT>P~7&*F)J{h+#Xpw^W%-N zDb00r%ywmU0p)o7<$2+WO}(qQ)f{s!_gB~-sKgw=dxI2Q-*SUx7DvbeF2ARzXWm8Z zp1L7Tvf;stEu(#UoKs%KYi~`uB|Jr?fIZ;bo$Qtvrn{9_q6?Gty10IljBLubZ~da0 zP$`<^E`CKtPJyE3yUZA#uGksZkG-t;@JPiZUT&5qZTY~I1%)-I(MQ81CimNR9xzYO z8lvXhJ`bk6!=kS!2ceBcaV8x}s~423x-{bVgdV#|-W86%>!|dtDfa=pt(em`nMr)5 z+<{day>fkXHcv>1ySD?)(DMWcd)UyA-UI9VQ;V6YH+VOu2^ zB3p{}_z(I0~aC6ZtE`ZmGnv{IJ=*hhlwgXR>9Zvj*$&{}RW2iKKDGi#c z+P)8d0s4z+cF%a5G;B7 zSHkd`#8c5FhKm9+z1EwGIn&l|8UGf4mKnW^z2|fA^Qz`JPxxegjiI^9d4=(Iaah^k%-&SZH9Bu z)R2T9Z+odRxviJO`^)KOIVIbJm1{|j=ajwkn+?VND7wlL+BVh3EtK%ymO#oQ!!O_{ z?cyM8;8f?AA~ZdB82!d{%u(&UGtM_V=%9Y0z>}GWVpAz>^Yi0vTK=9jdC|9wW}Cqj zo4+=LXjfRtl@ujLMmCSuG)hZgj3Lvc0ITZ)^F5d{}C5T_or<$Q9|;fHm5(sgak z*M#z`@d*oLcC$#T)g72;#^B7;F)WxRbG|IOXN|pO?JTY(k<>$f#x_h&8-NphySvin zc-Yq`CmYJgS+T;L%hlRYJM&(VZ2rPYe$M$48 z4{S_&5n-USZ?>2NJ?I&z*<1v|lLcejpU|h^&307R&+#%lvir=exho-ls)i1zINgeY zOr-3a_%cTHJGorSd9ji#twiTM2;$!RRA|qb_&uXrdv$!fnhtvPJBSR+ z)UCk{1^4`S9h{F`DZ8J&>FikphHkh@`!R`g&REzdA3`9*3w8$k&p_$rT=fOk~+~UBm!;V z;0Xsx7@MjQJ`n>~&zrs|eI!Cp%mnW23y?5H!#yp4G9bb>lweEPh{7d-#3iW-Ktc`W z2uGlSa*|-ep@)Mr3MdVb(18EZib+BUk}lf8!_`p@;p_ni{sjSrA~8T~F=<&K(F7g@6-4q?Um_mz`Ao(I$c1t)p}CH^x3 z-<#;)$m+TpL=$2dNdiwxpp3i>0bf_c)E`AiQWxQb{Im8o_4V@dfO`^iBXA~^!If~` z{eePigu4zG21pqCz~DZv2xp*>E{tI0iuM;K-syi)2NDZSs1hN78B$wUL*Ky*DDk($ z@ob5{eMvb<`aQ1w}d^cOjntv$58P!(1JH7Z^lPOM^g!&Jjv` z)fI*UT0@9P1W6s~OGu5-v_F`?!yHE>+WmuQq3hC?PNMQSCZ%*vQ zpI>0095E^po>(Gv!U+Km1(g4-J;vZT0iw%41pZtom;>SR5u6EW5F1ZSX)$IUd33xApYslf?= z6LC(I_XM=&aWt*t!U>-fK~Kav!R