From a96f10e55d79028588262491a5c25e6d23adef92 Mon Sep 17 00:00:00 2001 From: Calixte Denizet Date: Tue, 28 Mar 2023 12:00:53 +0200 Subject: [PATCH] Create a new chunk when the char is too rised compared to the previouse one --- src/core/evaluator.js | 14 ++++++++++++++ test/pdfs/.gitignore | 1 + test/pdfs/issue16221.pdf | Bin 0 -> 10087 bytes test/test_manifest.json | 7 +++++++ test/unit/api_spec.js | 11 +++++++++++ 5 files changed, 33 insertions(+) create mode 100755 test/pdfs/issue16221.pdf diff --git a/src/core/evaluator.js b/src/core/evaluator.js index 912538af8..748914b76 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -2341,6 +2341,12 @@ class PartialEvaluator { const SPACE_IN_FLOW_MIN_FACTOR = 0.102; const SPACE_IN_FLOW_MAX_FACTOR = 0.6; + // If a char is too high/too low compared to the previous we just create + // a new chunk. + // If the advance isn't in the +/-VERTICAL_SHIFT_RATIO * height range then + // a new chunk is created. + const VERTICAL_SHIFT_RATIO = 0.25; + const self = this; const xref = this.xref; const showSpacedTextBuffer = []; @@ -2649,6 +2655,10 @@ class PartialEvaluator { } } + if (Math.abs(advanceX) > textContentItem.width * VERTICAL_SHIFT_RATIO) { + flushTextContentItem(); + } + return true; } @@ -2706,6 +2716,10 @@ class PartialEvaluator { } } + if (Math.abs(advanceY) > textContentItem.height * VERTICAL_SHIFT_RATIO) { + flushTextContentItem(); + } + return true; } diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index 679f1e97b..7756d4512 100644 --- a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -581,3 +581,4 @@ !issue16063.pdf !issue16067.pdf !bug1820909.1.pdf +!issue16221.pdf diff --git a/test/pdfs/issue16221.pdf b/test/pdfs/issue16221.pdf new file mode 100755 index 0000000000000000000000000000000000000000..1334ac774aa67419b13f1f4c878cf53deebc975d GIT binary patch literal 10087 zcmeHNc{G&m`$rNYOA-=e$<8ch#@Kh6>`cf$X2vp@VGIUEC?adNh-9l|Cy6YT>`PKA zJE@enC~Js*qy77ReV6x~-|w8?AMczq&v~BbzOU_bUHAQ*>$_SXh{c(rNnXBGiP}U@KQDqW017lE z;jr!)GLZxTL)6rOdhVWN90{oBi6-N|kpPi8wko>12)LG_9u%&itqp-e!3d~=f;tkxW(T>$n9K8UO?iLe(&RBLQCub*ST22-DT43)r0y10WDTjDl}_QULV8 zp1bUwcUyvTUtFPyXpbbC4!)>2wO_rf#EY`H^0}CXmVpM`$%xuP>#uNqk0~P@8I47g z(NvhWAvWD*EdR?yZ6bk;BanRoVCYv0 zZ4%L2i%78r$x;1B{lmc!>QzCJdbI}{QR4~yFGr~XNagfhvN_I|=tsifwj)xHDq}k^ zL5?5*YhM6tL;v#r=;x3~zaav8`c%%gBc@j5PW`WLDxJC%?<^JVifBZZMq90$r z<{x7GPCd|0tF472h+)tjoh2K)|0y)}(OhjBxzG^TM8`tYwM(L7Czt;FNYisC`H}(h z&>ztz4?;kwy9MkUHYUz^3$oWQQ{BcJGHlQDcXFKla!eHu&XZrbT*0C*-7OtkqgQ1W znJ3^I3rq^@x?)s{lBev#do6M(9HjC8k)bx(qS%{?lEu3j?2GItw2F$0T-HSR0 zIIG9k7dBU;as<@)1&sp~OdL)j#*>ivvK4k1Bs zIgrW9Ds%DOM^$w_!6-q8MoO>>(6VOmQVZL&oDh}Egb-C=&GV^NpZMTRJffkYu_@4Q zcIwsU*xK6SMrf(YG~*UBIoPGLSG}oWi&DS2@Dh0K&g0Fog=bM4^De7zS1xjUG_f5n zFIN{}Q*T^bTh95i6s2g5$vL+2u=W|7+KcyVQ#;8x=_RrUa{nd=7j&^`RD+6R*bgR?s-7Zb|p?L4DgSaqGdH#V!fhE<3z<;4ZkP7QxL~`@ODUF_>|tx0q!yMy z(|LnZoO_W>p?_dXQ$~mt(*-Z&y1lKpVE@IGTRp931A7yWPZq z_c)fXO(3VuGw1XCX3*6sYER}2(iYP??&zbK68JL@EypX_SURROj6p9q7M{&p)5Rl) zr3bI_^baGe^vi||8@SW$BA@DD-iWS^9Dh0s&-^?v2yMgTJgahUdW_?Q%Ia!{$}mWl z-V>P2CgkoU&w5O`q^8C)>jv;?uj+%KCljM1_59Flz4ctva5@=xApwD0(6 zRpR(EjKbg6!(1v6p&KY4lo)2&rwA?YlK1emh@wE6S+j)#lEpFBEGFmlb5HDvB{k-C z?$*97&K2P55E}k85xvJ;8S9O;L~f#XWgH1t%45c)u=s7 zU)XEFMgR1IR+!?xYl)B#8i4}M(-k$)!<08yF4W|I5IQ>-FNiy5{JfJUPBENTi&>X@ z%6|@Vd^zO43X;|=QbyO8eoi*_t%e#mR7yVffRI35hmDVZsa4N2J=f#&-9c)Rl`#_Q z)hS6ep?kuGU2n|MNC-+4@hD?%L(sZT)|`FP(=Opm@20ayt?&DF)~`xZ7WVmUU1}@< zrq_vov1j=ta;(Msx$a~NqC(}kp3u$e3R$+zG+NTG>vs}-*&b^i>#mNYGc#@UynEHh z0HVLrGF#i4dd?)~DXJ^Y??Y_*#>HEC1)?(5nlJTF#5lLBv>%n!B6U8ljeT}4dPr$$ zmP0z~?rp_~Z~BIwN?h+Yck1&#OIPw%#XSVLHlpHPd$Y0mVc=vD%lx9$?dI~4cOxn( zC5sQYATzL(iPsI0aw~qpU*K$?P@+N2>yi0-p>!Mh#oxS&v7y__Q`mRIG|?lBbkom`wgsEzY!JT3r_*joM)q*2=YpW zOS2Qp8xS^V#~s(1f)+P6NxqIbillZJ8z?uq$jFHV_i;Z!iU3Ouk)S`#hUC}w59cN0vL08 z9`N=OD2>M#uw)A%$Kx-%;3W_ei9m6={L(U8GLK7_8GX5O?(-X2P2pV8^ICam+tjAh zcKUiSeck00VNxzse#stF2<+DmE#T$%D6Vj1+^r)TZaK)r#}fda9ggx}xN&#acK}Qn@s)5dgTkPeZ2m1!ye|R6_%N-qj&edKaf~p;_6Xt^LF4=xI!Rkq{e}aa<$dGNl>bxWmn7DrAFQbbFBXyl8--r;aCnz!5A-=YbH|WAfHJ2pul+?pqYluzc`+2 zj2(R;Y7q{A|eS~D@oCT z7oQ(BSIOXS4^Sx5Nr0@%H#45z_sQ~@cA1K{rr!Mzk_Zn&i*n10nqFtN;`Z|0P%YOm zVo#4brs$!sxvYFgne7~Rv0nu)!1ZLr^S07x$yX(`mQw0u@A^yPhFWJtm5Vn!gDi5& zGE|RFM%XV`gdb-Mj1Nb~y{i>w*H{2&b&PR^=)_E!8XHgTf3!DbM*nWozQN%J&+B&u z>G=r^v5|GM#DlwDltNuNTHCo)nTn@w_C6}}nNM7i;>^@unU~+VP^95h1$spH(1?qF zqlQJLIYLcKPNB|FDmHBY=_rQ*#}{x>+Gs&Lp0U&M2RkVLtG$3oI9VP+==IeLqM zx+ess#za&rkYz4U+&ugfFKRphHG`dCf@O6<@2nIaw~ zirOiy*JwbRhC;xJu|%J%3^1N6qVwsr(I1TaYA;^+q{o>a(ebg4bx1Oq?_>-5)E<8B zS3w_=s#(Gd=(}2%~&@6wJ$?ck-bP4d0a^*6!^dE5L8rJihgFk8s%rZj`$>cXEEBAt9B=hkk zFAMm+*RePA+HsvKS^c~U*w-~IPfgKFj-Pu!$DD;Q;&}b~&)(<7R}Q$ySw^sC4^&zm zatzF(bq?y;Kwg^Ic=TjS^|TvJq5?`TJa9Xa+sH0h*2t@pt$f|V7UNU6=6fg=uhOvF4Nh5 zU}}z+*wD54Apfa#_{q(iR&{M9m+gmk5mfw}?59pVW&qDA2oJ7|B-T$PK49u-B#aEE zGF!@Z)_!@raB6E}s%3dB!zbbMvxvPb>f?>gU%cz@DB5j3{3CEod{uZ<$C<9{on;Vs^?jwll;3m^wi~ztRc)XuJ~*-JVDWcpB4oR?1XEJF*jF zh5v>z7)(w7E&QCx+c~~AQfE6K+|Ktw+P@1tiSjbEu;OV-9~du&>Po^ExrdAQ&*W)C zncs6~Ky;KN!*~0H^c)W=3OEKFw`8K*CAs?yV`=F^;VyEC#i&NU?I@j6#vy!K24d#A zcD6cqC-?sS>8q1y-&Oz6G?}5!&g4avdBumpeZj{2kUB>>M3H?C&FJa+cd)&_6mcNJ zS9dCvH48IOxFoSBIZ5CMw3t;QNe8GS5@8di-eSRYW3pPcYh$)fLXBHCE7}1iene-& z^r1+~1(94~=mTGKBeAloy~>F5*}nCG0c(e}-0M1&{p*L0xcGJn^QP-~yVIxZf@H(5 z>b@IDNl9Upi<^YNU$okE+Y`zjwAy6phFeRWgDbe2xLCa(@6{`@FX^a5_~giJ`SPa) zRPxV{cSe^OF)LBr8#{(h)Vfvs9)bHDl03TRzpt-0ZFceUprO# zQdp;oizzspkThf0dA_=2?NuSahx2>8kD=$KYtotaS7vqylg`=QzmX^<4$HjqitjZ? zdZ!9WMoOgXu%Fqz6*yb&M3g?$46CH+bV=uEQJcE2BsLTJ##?FMB$-hn2|s(mSRkzD zDwcsG5*LXTdzf;Op-&-dvU{($sE*d>T^A`=F3Kd8D$x&G+g}-fFo{y3Eqc;;dcF^> z%~;Fygkpb1C@#ZBqcCvRagmSy4YQd`b9-n)SScv1*4({jxu|M(B|kce1tGPH{*;^FWzbX_1X%QbvNp&UdJKq2WngFl;0FOD4vw~pq;pHiXn_|nH;1Bb-Z^%1hEKlFvziFZCT)`a~+66 zwIJS#$=wc5N3J85LiYxW6SC?LvW&6}WaiHP@xnK}C~3SXKa!_JGGaD4C#^Mm^T3V! z!?aJpB_o4*GKA7ad*^=oyF)6h1u|UDO}ce-$O@mHAoc@cVZQ*!!|F z$O77$wx)Xhy64;RCQQFYOnKjs|MsZSqu6eXwP5YP}3j#6_7Y&#p3D8a}zS=ibU3;dMcA!o)(^@_Wt*y{)3FBKbjE zb!XmG-tIESzEv8gMZQ0rz3b@lPaWW{{^Y?5d)3kthaan}^6%QBh|X?tJJFmuC(*Uj z9*6jE_BaR}_WNdLmuQ4<0kNN5Rce(<)=iv#?!&BS%1`I86jiT!S|-k3Q5(h z!bGh-qMt4X4m2FSv@(o;71pFIqO=5(kUiYz;zO6)Zstzv(Uh8+=OTiA4BXfQ{&?qu zUv|>B-?houJ9OuO|HFZ$+u}>TNb<3SiVb|dYCoZJWsmq10f-Ze=)S4kg9OL5GaC8t z`n&qALR9H{Cmzh;ITkwFZ^nW+UAuQHl^qrJXXNtv7JDIJXNi|*2Srejn9F(4a|V^FVrA~qaHh7ZTt->RU8G%Ku{{9-Q@fxLojMESD zP*;rQ&sFtY$U?=Pez1f(Th;RgA$ zW#!W}yJ$vR#2@Wo-F8Lx`-k>>K~f&9`0L|mZDjb(lpiJBUooSiPVq)#JaA-yGtSkW zpej6nr&$=_j#U-5f*FB~ytQ#|?)pI_oJEi^3KQgkQN#+Xt1+uklqg=_Uetmxfa2vz z@KvIy3ZtpV1~?_^{q}31FyQMHvWKd$=Jo)<+Q<~3O(fv}a5<1H1_XjrtGjYwI1~;= z$N(T92p9+g1HlkkFiZ&sQGzJ`D2)rNZCA-vNLUvoa~-|!;;2um!fs@;w-OK-5D*|2 zATLKGxdN#+D*{0fAOs>yogwQRNFbvrvIJj|Z%lsh(ZTs*NbcTbcOn6>%@^%V^dqYZ z3;!&~Z_EB}mzVcXK?Gm9uQtkIh+aSn+8YR#0|9p;!eV~Xc>9q&zp94C0CAo;ukD&Q zl@7ck9Tid|qo1TZiu3Z?k<$A&JpO@>IzSaC5Bv$hHU?BVN=8JiyGx)Bwd}1%1z%nk zBrgj=pui9%5EXowH1!GssQ~|J<}ZN0^3bN9lTgoecHjY~`tWxizs>w79zP;-CoX@| z?Xj3xu96@P#P94hPVPL>2fAae3@5Rg0s^3Ay4 z$$sOA#VENDNnU8O8rs|2(;b7}jujvkOcmg-q2K0z^_-Fp5#zV*VWf_lpPxHc39X1$ zz{x3A`1n95weQT80sAsq@d`Gg<%z(pfsRqgZsf3q~Ht( zDJWoM!3xe8Stv{%C+m#Hz+|xq7bwQXSssIc!GG}mjrKRbR7;eoxM2M-IMRO$*FVsF z=cNMNUO&IBr*Dh(>s#t-rEV0!pF73Zq3xCRfBO4n%F0{VB_#P(D&M}WBWrmX5NG3hq0SrndQ%q#BSmqAA_k428hXYPDyFY@ob};I<29W<3JzRfk+kztg+^hwr_N)DeW-S^z zF+Cr91~G=VV2$t@mUDx9h0ExU5VC2iczAfw^pDj6_d00UooKG!VK}@4EWdSVfx+;f HJzD<*w--J| literal 0 HcmV?d00001 diff --git a/test/test_manifest.json b/test/test_manifest.json index e88f2975f..ba6324c3c 100644 --- a/test/test_manifest.json +++ b/test/test_manifest.json @@ -7510,5 +7510,12 @@ "md5": "f71e89ebe3d6e75e0c83ce41cd72df1f", "link": true, "type": "other" + }, + { + "id": "issue16221-text", + "file": "pdfs/issue16221.pdf", + "md5": "62d93c9b3aa4ba3af5446504632e78a5", + "rounds": 1, + "type": "text" } ] diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js index 4c157c3b5..0aaa0d47c 100644 --- a/test/unit/api_spec.js +++ b/test/unit/api_spec.js @@ -2624,6 +2624,17 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`) await loadingTask.destroy(); }); + it("gets text content with a rised text", async function () { + const loadingTask = getDocument(buildGetDocumentParams("issue16221.pdf")); + const pdfDoc = await loadingTask.promise; + const pdfPage = await pdfDoc.getPage(1); + const { items } = await pdfPage.getTextContent(); + + expect(items.map(i => i.str)).toEqual(["Hello ", "World"]); + + await loadingTask.destroy(); + }); + it("gets empty structure tree", async function () { const tree = await page.getStructTree();