From 2be64d63e1ce7dd13c76de967db3960578a43234 Mon Sep 17 00:00:00 2001 From: Calixte Denizet Date: Mon, 14 Nov 2022 18:14:08 +0100 Subject: [PATCH] Normalize fullwidth, halfwidth and circled chars when searching --- test/pdfs/.gitignore | 1 + test/pdfs/issue15690.pdf | Bin 0 -> 18794 bytes test/unit/pdf_find_controller_spec.js | 21 ++++++++++ web/pdf_find_controller.js | 53 ++++++++++++++++++-------- 4 files changed, 60 insertions(+), 15 deletions(-) create mode 100755 test/pdfs/issue15690.pdf diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index bbd1f456c..2a1db6fe9 100644 --- a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -554,3 +554,4 @@ !bug1796741.pdf !textfields.pdf !freetext_no_appearance.pdf +!issue15690.pdf diff --git a/test/pdfs/issue15690.pdf b/test/pdfs/issue15690.pdf new file mode 100755 index 0000000000000000000000000000000000000000..fdc09c6ceb61797ec8380511bfd7a35d0b32b23a GIT binary patch literal 18794 zcmeHvWmr{P*EZcE-N*(3rP;7ygLHQ{f^=+Bx&>4~N?J-9Nh#^>Zt0drQd;`kaE?b$ zJnwnF@7MR^-Iv#NuDQmHwdNR;ao=;yMWZMx&IDv;N26(5Jy_k!K1>~I>p)`%umWri zEzo#*0W9JWD@PN1=>Jv*jwYfeMz+Q#02Wyj8#70904Em*fS(`D!O`Btz#7dJwp^q4 zZK5Dn-61}C;yqXR)}uNpo4^BD2D=qg2)@`!^TCZwJaQ|{fT?v$53<6Z_G)@oO^uf? zVU^(Fa?tZSxbul)6eh$1t~9s-KrS`oyKGG~De6~IH<-o310OI>-gZ%3h!W= z3j(Q`UM&+hxSTXbqoK}WXC9<{vVE@{PFA{$AvXc+&GPU&7_mtaSWOA`nvcC&gqgGb zKIJxlbM60n#)7-KTnBFVO40jaaJ2nPs~|Gs)78VP$v|2v>2%XKPwm8-yJcTX9?O&z zZK@f!2VmhJFex#7Dhf8OMSK9K#(Mh;wJgRxhj6n# zm%8b^E9aG=y0yK8)SytA_FzO_3S2KT?RTgS2{8N$&{Ap-e&YS5x|2qD#!TT2>0Iao zcOY!%#ne=0sZOH5RzsO~E71ou2Z1zOQBsqw^%rn&%gVLeRgqO_>I&XiJeUSiN@QiR zNLk{{#G6&y)vgl1CJ<#;Oyj@jlttv^eS+)GD-i9pycR2$-jnrUn%utPgZf7S&w|}) zh0ySe){GO?ht*{AoZ1U4FW?(H3&6&mOit1I4;#1`b069`!rZ6} z5cWU$bRd!`M_xO6k~=*!x1OxVgvNu!S@Kx8v2g^jL2oMqSd>j1Y@O_l zOdJ59-wH%*Z5*NZ9RRmR0~L@nF@_ij+qwdtvO)_uxH$k^AU0hzsG0oU&uzUQ#-(U) zYouc02zUzZOjI1eqH5ym2+#$vh}c@$+N(S_Ffsw$+MtL75WsQQJU>6w5}`fb6|1VU z0l>G#Xe_F#AOQEB1T@PD0Ny?fbTF!_Tmazj15;H6vI2mAH1l&nKu)Oej}gg3pMyo{ z*5qxV_6}eXHF1U*nJ7yL|6%lOoItjJUKV~`3XxXct05nG=?x=vz zJd*ebN`X>7ig5m~X`Mw8#qqG|)Qwe}U?l>mjKuKJU%fP(Ks~@x>}XaJ6PDC8`F^R&nIVuNx>auRjjQb7Q5nWtRzja5+jnBHG+Ng2 z<`FvuWIH=r<7?(;jZ+H`UZ`C0b-7=t?bB`#CJTimYb{u9e`KjAfH|SoP9!gFh*5(n zvqlnJUHqu92Y@+1gejYUfB3-P>RDvl62>F+;O%WI2iO!#W`+&$MaoOto<@J})A-)q z3w53>LIK~0Quyp9Vyju%Su&!1eHfUrgT_~rk6R>t=!O@#Y?8)`}$(WtKj z&1?O5eKo_GjnQ6vk1cu=aXvyOevxE}XYQm2Hp%N@eW?Bh1*ICY_NuW8t+`GaODhvSTV5$d{QF zj#2*}nGgGO_;~=#YadB0_;>Kkl5h|~gik{JlQ3vPz)4u=R~FAy?LFWB<;c+ZdZ4@^VAw@1c_;!)NaI8qd zvmx3XBKlW(X(EbzM-LrIYEX1vou?uX0YFGsuwJjR3PrFB1%6mu6NPkNR5)Cd8Wx+|oe z95a)&I#e>kZUr7GAk83-h@gXbIY299(NNuB+)$czEY2uSU6yc-(-e!{_w3~d!?BVq z%|{;voKZQk6GNHW)b+#kpBBfKz)u=%^N?Y7HW4q8J%3R}q{C*zQHW4^eNrXo}xP7iFlPh~NZ8FVD zE&WAYenhcRe}rm6WAb$M@haIW$!ckjGExhx{&#j|;twHFA+;f4hoH6U!R5gj z&LYUOn>xCfGkP0(&lsv0(-_#8usm5c+cCVcq&(|9g;7=uS_=b_M00 z!6ZG<&B|rKW^qlRd)2wcaqA}EW)w*RNdpNni5f{PsDpzz!6uMXyL{FO6R*r647e(dc1bME)>+hIR%| z2bWQWUC=)N@>AO4z$B?isnBjYmC4b0RsEX5vOx^XHFH`}K1Xe5dhx85)<^O2RPI*Z zF3$|N#4`qTzF^JZB!W2tJ;ohI6{aml3N1P9zz*T_dkfe~V?K(t)hG zweo^orUmE59WR~)-QqcdTKU?+6U-B;3)%}HGGBmeKt1N^q@(lW+SguV*jpQmRRjAW z%gB#B9+iwpWiKRY4bTnT8^}P%ApAx+%0y&8Y`0iNW*@<9%p9WU`aEq^Z6;<}d+WSU zZX$Huxy`z5Zjx{6s_KH_zRdjyiQvxY?5K5;LG)*DVv95)G>R%CPYw4h_Okh(@CWjT z^5Zr*H{^S8t z1&EPZcp2nujcj+7KZCJYuV91Af+H*z`6;0Z#hbf1PlUalFwf|)^$F3!2f!}zc&P=tui*9qx;&H=P zm^Hv!a_U>JcyFi)>4Htr{HRi@QgMU6dFkF-hj6v9(qcW+z*9`Cb}M|VojKFe0*jeg zs%OK;hP_KcOA0izky{H|UK(d}eJ8JX>OY)(xN!O6jN&po_qFCwGu}|^+0;26cm{cKY`rVa8!)Y&oCmrHd>i#hjg^Vahg-I3gn7?kjn_|VxB<}}7L zIswiBM{~A>Ms}1e+jZJreK{8Kz|H@v$evOC9f6CIdw;3(JH=iy2Je(xkXPQl<72Y`@w*cc%X0QY|*$tdLum$1BOJe#ieRbo@bW z@;kll7m%;zd)D<|pO;>}TP}FpfatY;u6w~Ue^TSM`t~M?6h0W~e3!YwKiW>6&>t*)V;1ILCBkwOb5^9# zia3}%$RJ)Oz9~^4F&J_dGUr}$)c&nEBz9!kuaWM$;xuMBKjnqXMZK9}ciT71;jMT9 z>G_-n?(^{z>viLel@S-nURHL{e13z#iSJd~h0#%G#MfwAQ9; z3#YFSFaGY4-}9srjCqMkwwRVEK z&p-f+48+(0>UjU=DeKaRjUTzJ0i_t0A%{U}u^0B#KUN!2vu=aGe2A7e_RSp;P z&U9l}7k+vB$qv@I8s-qX1tMxZ$-@086SaBT4JXI>)6fmY)T=^V@FrXVFog zdw;O-ir;8)u`D;Ypm|LX64O08gbt>^N|@1`J`$AaAPrf622B6=pUz6=B4E_Aq$p@F)iYa=Hkc;l~3Ww z8*qYg2iAO!6lvbKEUuWgOvwm_+M;F`tJ%=L?}z0U%zLdO9A3x1xJ{#=rrSVOZfil+45hU;=_Te$z)T4sKmE7AdH)L5zfK%&bhHI>RF5 zU~~(>q4#d#=x-O$Do_&^F?cR%0x>gpgfdakMpYb5tkrL8$eI`!Lu|~@?u0ad7XfkH zB3oy(J0k&bKn;XN&cOAzt2+a^8x)l8GGPO--C}p;U)F@Xf*%k4oAv?$Y}}k&KVdN| z8+3gC#FqUEiwo3eY{@i;1=joexE&gIbWYe#rg`?QtR8KR*RrG2$t|@xOG!IG(y8x% zi0Lj@n-$LyLPbkQ6wN$%iFO~5L$5}4A4L+YOi1wo7E1&c3#G8|3O<%MjId3kqUAf>m@b7AkYVW#1_J%2q=ejb2EX$A%th7YHnvXWQ3o}K&cCnr(2P1=3E?5N&P zz9e*TLPcKYBiEm|gB%`PMqLs2X)rfBJ0H9sD_JP5wzE!8;ca`dHWu1{xO~b4{p0L< z<$bA;#_;e%9Vya7el4nl?cs3Ce$VslQtztc$kTXlY|G1UyQi3SlzL~&hYHtbTNwq7 zOR7udX$HJ2miH?kM3P5;IcqE-_vUg7p^HKer!;=S)_aWIMY6(WOm2^7Pht;ak7x+sUE#CO5$l3iviz5M027Q6L0RosyI{L!QMQw4yDFMoqIub zq+#oI4j=76a}h`oY0}bc(lnkO+7YRDXFH?$Je7{mp5_1vcC9Re8b0Y2^`E88ZQ=7P z4#FR&7%&BV_~esY^>FFBGe47Jef6NYZ+U+ckYDtnO4N(#vu2IDS$4EBQ&J7Y-ez~k z(vI2YiDu!)FED)=&BJ^Lu#;$5M|F>p)MBW4bA1|GGd$|Eg9miLH^TMFCNfjWG!>4| zDTwyo^Jnhfky@@=e&YkDr=$Fo&L4|Ox(_{%(H^&r@98H(t_!Fj# zsg0ppg8Clr@*s^k62d;J~1IgXs%Vfo-4y0c5E)$u1O;nUs0^xIV+4s=&=H4?+y*coqEZP_)U~s zYK^PnO>zZcREf+Zpm$}{_e-!Yo?{AXpb}oCf2sWHFq#;sRtQ>qS{6ShAa^{+KW>iqlSslNdSj=0Gn(?_mbkD%*RR*Hsu|Sd=AMO-v%2hp?6)^T`~omemre zrv+V$9mEFXq-^{*A8WaG z0&wlr%ak@PJb3(G6czJZKmU0hFxP~7bBrdr*&w&(cUVTQ9PZ2>nb%Djo(T7vy!p&? z8Oi(^vl?XPEJ?hHpMCt>bmooU0>?HVDxU+Fb`qFHdD zWv#87sXmCSWuQ$kabbfgx>o6^R{JRx*5oAG*Y$((ve9c?&*d_*cN49$p(kzIZ}Tp( zQnYQmE7|L=z$Qm=~@D zGs^Y5TXW7t;V%x23v>+*OBt$CDLYld zr}7wzbNa)%n($O@ve|YUjN0av$Ac1`wi!`$*kc(zwX*XJU6T~b!AWL2WymJcbv_ph z?N>^cc{@C|hk$(DeK}y7t2BPOs9D9p(L|k^)}{nFj8UdEqS^n+=dNz1ORC6y|L9F1 ze?hAF8=H9fQi~y@RQ-hdjJzO@UTt5I5r1n%wG5l>56*b63)5bhWRueOCww)967?q$Jofp@z6+sRyhRNZSfK~ zL6iu>2{ENwF4fr(XI|9(rs9fl3qecIZwrKPqNMq@edFC&83kPSl3Y-~$GrRsY~HA& z(P@!JGHB{{8!yvOlDf#XFddKB$2gsde7<{0Bq)v4)X4scdiu*Z(^j9(bgu|8`xh@; zuVytq=owu$6MMlmN5cghHSCIXD)2r07L@!X(JQt>CF>fel8I_XJl_04jBIh0sC9T* zO1Kndbe#^Ki0zFQ4{M0WC+ld&hoTG?*I)6(=R^ouQT`$PB14Y2Y`v4jRWd=-XS;k|Lg~IvvU06 z2eaL7#{Bey!BBkscR%=c&VRVU98etj(+vjQy3v2Q!FQei>IVO|b@YpV{JJyz!wm*; zLVfSw5gi*R_}`UIM&Oal9-B9L6~Y07wK=GBxnti>UP(#h~qBj7S1JW9ddo4Z#~b&M$eBMaSMF zh(ug)zB=+%&rD{^MPM|XfM5QuSa%zDe^;#kW{r?wP%$yHHBpoL`x^24R^?AF0jg+s z?81MyMu7gjM*N^rex1R8SR**V>;NVZR6~H=+}r>l z8#mW~#L7Tc@ULK*jpI*X8GMU<{{YLsFAQjRi1;TyW`{QL$HD;Z@psVvJ3hYMGX4`j z{&f%e$HD;S0{z6t>|hRPumARUWJk;6x$1=5%|OGEwp4!Srjt-juXJ6ZKCD5SUwO58 zIa@K2jBwvJ+(=<2LwM6 zn&Ya9N3^4%i&F%3lY`s5B+VfnWE!E(8;F%M9VD9q~ zRAgp3!5R5MTTE@N)FB{8$-Y4*n#X#%Lhh+t!wSOpdx^UjYvNHwHbQf=6Eh}pyt+bj zlgPf^rG7b<7Ztm%Z7cZ9Bd&;gfXl0Bl=`c)P@d!~++_uHc)iF-rgn)&3^M$-BHlre z&b+3Pp^cEIo+p{Yc^hdNDSr3fJAH~ca8_FX-&85`>h%okH zp|iba5$Ws4Iwo$3r?et&NuZ2-q2-(u=_6GS$#*z#5 z3z;Rmz=iPdPJs$Cp3+MxGM5Q$Mbf;q`wTdV$u!hns!Ile0n~40JSCTC0|}*gAMf)A zo=Vm|UDCyIk(s64w+}=|I7uqg6>QooPeLBIUOy^)?|Fl4*3*-HIU)e>wN=8l`ZP@K z`hY@c&y&b}i!=4&0nD_$cl@W-YrgyHJFYpSd?q8Nsi9Yb^Rn=@*JV2UX z&uH}nQ_9+5ZWdQL8gMHX!Ut_=g6E!SN9J)md_JKdx(I2@+xcw%B0^7LnxD9v;g~`p zbc*>c{m7+E?CMN|&lqmSpzr#JeX*~Hof&S-H+345+Rx$PN(Y8p%QK!IxOW4k zDe}qTQg$iGvHxh8=as20kIJZw#Fm$-F2C5Q%cjW)$BixjCyR55V~4AA3ADB(4vuSM z3t5&6zlp7h85dMc#>Xp^w}Uq1w_GcKsbCtMd|CpWNR`vW5?^}#hUo*EU`=`64^0o zVxEfLbX_+V4W5BrLtkvJH%uJLDq9L{3th}d-D__#wadX#AFub~uQA^fmJYaCYbWV> z<6{OxLe%KeUqJfQwkf;;nbv@PjFPlZe{%;hlz7-MS&&$uG{#&)550c-E>5_`&^7 zEQtf^pIEDNLkomoSQ#-y0E4G+jHPi8RbVZRgL8ZiG;x_=U&A)SOadUUGb6^~Kdu-C z4-Wth<;EBfi*?=woJ=ty?~pU1?^G6B9VH{zAz&e05+bLt+MF^+~itQ^7;L{)TVawF0#3HnhX|b1u_K`ng!l+6yYC=>#}Aw zY?v4e=#FEw@i{pfsq*O;cIXFLDq_|y@YE^d$m59P9^~AUMbGiw^uftNse*6!Z4s1j z!Y)R7JTyMk?P?d-lGI;^KzXkSvUX3K3!=vJ9geBbVR~EwC*(>DPuYe$&Um!j8aawU zwY}MC>!Fl7-RMQk@*C>?=TnJ9dP z1x97)21_?6CFqhO%1xe--w({v)1~#I!uU333btVFI3bUpLY2;QL(~hBTrfwoxi}Gh z`^NpLghQN}3(WNRAw5f+iatp*SA9R~Cj=f`0$(!0tqF;D)l<60#W4Dm`mjyNap*b7 zT(F!z_)SM1q0vAC9UwoLpM#m0FciJxQO89`Z3j=s(~hcr|NR=fX)cq?S=WBoF(US7 zz-J(w4ke>WBDSRtN~57kvu&U=X>Ev2M!%*F9?^{vj<>20!r21hg&TeA>WWs2Tz6Z1 zzF&?t^@?b2m`Xt5V&E8Vsj{B5M}n)?h8;Gsf^PB6z164hvIn#>qVI1xWOcOVqE{f% zXElCk%%Gf>xvx+cJ{(jV4iiEnJ^Rtzqh|i)IVnhR4sxwNHgj}R(B>`L@ipMcczP}1 z8>0%s71^uwWjltB@B#t{*d< zMs2(YuOAp=pK}7xK`WN%y$S%R&b&k40Ga=WB+gEtItm#)286cq3Ci5THNqUCSh zK-$UdWNZ|9fwel5EafGiG>S%1F_ic5I`Hg>GO3C)+WLJEnMAk{;q&7wZ1RkY)C+7H zrJL^wV-p{gLBkGCTVqMN0x~aFThry~zOZaW;oOYsGo>k?)Z&0JX5 z&R|!O`~IGM_r~(|at-5`Q^-EaewK=vu=@A|;kRV#q&JgBM+iHe3rJ^c(5M_jmk_-X zJtZQr51|Uh!j}uZC9f;XS;>~FbC}( zBHO=T&AN8B_#+;!pREn#PM8iRa2vVn7Purx6yzp95zEc{_?X2Z_8lH!H5&Rsl3O+R zlon=fzlDM7Aib*?E6#Ku!z{MSMyP2`^P6P4bgRmARCs%br zKQoh9i{~1oD~gicX!6$Bu-&6mv?#gVxQM%NzHMTV;@8KkR1b77S2cvpt*luPxX`?h zX(o{WfNGz-x15H3|AqgI0Lh%`qE%b%7i3HjTnY-hpBDX zhJ{(wuhTa%NBJHl}A%uc|SuguOVU72llV#^=+9=eJP@_6sf@XBY7#CK3U1I zuItL;5SW?U!BIRp#BMNLw2^EwExg|@==u~3Y%X00`8H?x-5ozI9wN#bg%@RHFA;7Q zFQ0Rzv$q7wq}}XXYLy~%>I7vka@25&R+Xd6({yPQZf>opv{-KeUgG*_rqI8u6{9L~ zwa8HjeMMLS)VD=Xn12jDcQ>Mv)9jmp)0JnnZ(`5-g6z5?_=XU|OVNKwaiv(&A%cIE zr@y=pPcQKKLG`mOY);9SLEOg{yesYI8;oVuS|8(V?Wc@63t3l|HbivkDK_Rq;ugP> z46HOh+Fcr0a4UYINl}JZD}tcugFVBT^@*6tI%o8|V$u7};`by6YWJ*dumL1g7Z4`x zqTtu(b(C%Fd;qwYC*w7VGw!@~%DDHI_rsr;^!l7qvANQ7eD&|vS9nbFNmeLhVB!H0XPB?jbG4=e-4+cWzRzi2rjZ#&dm=s2dRb;@wWu5)a3tc?cnP^A$> zBo&t-Jal=MqsZ)oSt%E!g=n?ZfKDYTT-rfu11?QfXp>0Q%&zvuEm^m-O4_$hi=%hpe|5P^=~sy&P8Y#8Omxyl(Us-;JRpup9rVZgbogPtkT-txb2Z|oe zCB>jH4~wglp6fyR%74$25I3PDq@!Nl9EmNVdkx4ko+s@2B9ePwjXU(vlPzUgY~FB; z1=GPbz|60?^9iu?#%gKmO`Ohdq<&kT=4;f)!G`#%h5j1CKCzac(%zuTcUSf1H+)@X z4xM=w0i?lF(-q?a5L>-#RW#1tlG}sO4#?9x*%5sCDBFcN-WY57K)r6Ue=(Ed(x$sN9_{!bag>40qUA}hn5#qO!}g9LngrRN@QNC` z7Vb|nZg@4X{Wu=gZR>jon;}np4E}O&!gE$E^TBpVh-MrOk>y)ji?lE@4LjmUWl!1= z4)iouCB#>IlN)wzkxV-=%TI=VMyE3c5BRC`3{tZ&H6G>AzxS-K*1_kMpQjG3YIsMkyyuHehV zHYv{Vu3`KULl^@0(IZ!;yrUt@2{G}c*QPi|sisQBs_>d&oZB?}G10YcXTe+e8u{`J zcClQAZg>~0yXRm#%#(+5I8NL8Pgk@%lSo}5RYvtKT8?kMIQeYj71CKR1DDzwVczHP zMd1i}07`}qzsqx(F9h0NYN>u$Hv#5{esuPD8|t~-{$hbE@5A!pItQLx&?c`~@_IpI zjeVc}_NMg3z(QwA#UgWX%Qy1RWZk~r;zMjoyW`_WHtwk|J3%){Xcxyd|G?V(N{ju3 z`u`1UBlVvGK-lggyMGM;Vf#HK>*G83*R z4CXT#_gOdV83`bT30wka!QV;_(4%Alg7Nbj;M)R-uA+>bG2`O!iGF{N?egqq;bulqDcr;Di>Yt)+?jWPkY5Ds;~VlW zKXU}gOBmW>LHM}VJ1!SKp}8p)WhEmYnat#?W@EaQ|*Gr#l=jWz2&mj-sVX8%# zVgJG0e`P@b@626>K@@s)N)uvW!=!9t=41tp;=1G8e<~On`){!)zeIkq|Gvlb-xxj{ z_|I(ZUETjR{D0lixqItB-qX)H{W&kc&C@SCI{!G8e_aC5p58?&F|k2|kAOfZ4a>~| zf}ZdB4-rn#13Et=oY>g^w6nv>#fA1a(iQ;XhVHlgLE3^q(5s)ME%5f{ZEV(Wq%E|) zKa;k11wWqohuwpLxBdS9CxRVHB>%hR?`X+9Cu=9(jZf0uk!C36?084K9Pi{Y&iQc4 zCp5w@WFpeJ0D02AAU>jaYAWCXV5;LgDiJ5vD@znSwE@Zr)m1#@+-G(X)d-km_FuwP zm~(q*a@i0mW2t%Jl0Cf#Ew0|ADaYiyWM|)K+{{7E`^Kfn@q9F%-qMJJz~FuiVSSYB zi=*C+U6vF+YyYZ>Ce2Gi_D5!fB6Ye(R%3ose(3C!g*X1^)p;7Ib|>WLd=7xuRi8+& z-c`4ctc&OrrG>G)e+>S9UgXnBP=6SDHkIO&*je9B@9yd7tFhj#GKQpQM-==q<|Bm% zoqnDiY|MvAd8FP^u)1!z>Jr1(=7lU=_`&&+3x%vsnzPeA7p#D*#r4J+*Q>z=@2JU< z1DRCI#G0nide1pSMmxky7F>_G=-5}SvF`ez@AUDR2GGoz6VlsZDt*MgFv?yO*g{6D zoCPP&CnwD5Yb)R&Xfud}ghR7k3sZ$%u^)ZbEH28u82d4HP^}k~i%O20=)Z)e~BASnSzpZ{S z=3Mg3e7ayqJCEoU^5-Ue>+FhGYJy|>QV0oDL|)_>4V^iAu-@=;da&0dpv^|O9d-iI zLfD3P_hw55nuy9BIt*$}$v(j^(_MHiOI^eP4fihM9X*B(j=s_($nI|3N@rY5k4akxx?ry zLr1csKCi_Ki)~JLqlv60S7ViaSJx4PWy*)F&LkQkd%K+k#v6J@zfZu5WM=fz)J}=|aw5yd1}MT=1WF(#YB^AIDRFn%hDG zUac^T^fMP{pw{iLH*;1?V;a#m?`}iud7!SjT46d-4UDZ0l&!<+&he|tuZghY%+pbu zH;OvxvX^1yr&`Jw5w2$Zi0W?EnI*dYN#8FQl3nQjxe zYhZSv*t}DGzZEIdC0z7S&(jiHPbv9`&!P`yaduILzpo(!3=9GyvTZx85jW88QS}68 zQxf+$x8-fmYx0cbLY8IC7{6~x*FHeKMm!h);w^+$-!)Jo=Beq${a82T{^pd7OsqfSIAhErc9N)h`Q-!X3`dr0P?o5X`Gk6IXCKhl2b1_5&f8U3-t*dd+AW@~udSP!<;B={Gw^$CZ?KQ36doQ$w5+sE zZY+u@@Nu*8f?Qb}`z6b1pTxL7wG`MZs&IDtm-ZEI6zotE_J-OJr$roZyDyR?_TYSs(TJmaj9O^PM3WMdhGmY%z>-hN zM6?oEwGLqtgg{!OZ1^oq(PjgQ9A#*qtn{vCS~Hnn@%8d z^59sEh_h0Maa&LP8}W#w=aQdtvUOcj^P{L*99`AcZM=50Z;UZXT<3yKf-3e<&@j-D zlQ)H)i+xOE8zhv1YYsI-$vnU=eJm|Tni}78p`U0HmN`{ZLsGICWJ;${{7Bz}kxtvK zT%l~gf!0i5hQi*k4YKGZtVQU~-05K2S*N!~(Tvsm>)3s=`$po|(@i7BTOB;|&ol3Vi8=k(y?65BejgJx6 zD~XWMh<=bQ?_qakO8lt0@*!N1+mXlm#JL3{(WfXgAiS2QSRyRmhPZnCB+GQSVr-0r@`oY#X(o@y z-%RF8R)b2j&$J<*xinMdantdgPU5*)~J@tw_hwKd>mcI~d5>7fH+M$>? z%@V6@FUSn1v*Z+BaGTiM|5)U$Ugu{vzkjfe)ZDM@EKe=cAwQP)2s6ioC8?#6zPEFR|xU9kL+UIUgq2LP8Xkd0#BDST!0H6Ut*4 zP5U8Rj34G%v0qICsVv(w;!a{{h(##Xx>x~a6*zdPmExjfQ5+xnEhEX|cS@2x@Z(T} zRK-=tuT~<*_hW|$3c>F|;^$(@{HQW*Ojcz>#s1-Q!fPKlqFD*Z-!?}U&kn&f#RbVf zbkC%0M@rXv&U4faAA*sZH0a*t9=6rnERn&*5!@6s>jZJgduaFgoWPt|Dec=`LG!n*GORB<;>0@}! z+}af3^|#{mJr&akMJ)D((%`IxJ*7;f-@B;9ewlMpg9w#N+*`{%p+>9MV^)7Yu*l^X zt5rEKLrC%+o-`EM!{eRwNG9$WM-QWMG#8-_-Dk8Hl)kh#Wb~A73`fTAU0*~=y^Vg= zJtHZ1#&6k~w$~_ewtr*I_WI?lTtql}nniTrNXrY?Jjp7dRuF7ZK1l*lHcNETe5GtJ zmT&ge+ro4VIaBpl4M?0jeh?4Rie_+8)2>nixus7tPLgBSB`1O}MlvU>^0(zq1M^DX z_?L?m8L1rOUK}VyWRu~Ty2v-eDIteQPic|pGVIo^2o;mHv6bB{LS-#1jUfjWc?*gZ zyO1SyT3h;zSIJ-w9CeMzs!tZ~ccMpis9ik(U1gFrGxEE~~;JP@D^CPpp%ZY`eWn6vA=ndyKX6`sxS z%dRXeMA4s_`*r0Bu>)MZU5ky5I*YPZ9ka@&(+p_cZg5lM`Amxt>ZB-#k0bBH;P+wZ zgLMHJ#7=K|%4d9QN4e&n+bOKmseCo5UkoFBAU=yQ0Ml>peq7INbJN>wMw&a&%0Z9m zRwlIoQps;V=;!#(4PaKHx48Ep7W96pL{=n-vC8=3#pCRg-pv?<6o3*R- zJtpLG97JC|1zNjWF)4|&m)EObS}UC*ZsHk~+qpj=f9U9IR&%?ozn(9F)bOnLbKl3~ zy>KHDFSO|qY$l(=xIqw6q38%+lo0){mq{I5LjBD#ifXDqwq+bdZDhq~guKCScwaSa z@X6u)e&PP+c0A%9UfT`)BNYi$<{WPmz-^-O$BjE-<3DaN-Q`RI%g!&TK1A=jJFx!ZNhe&FyAH|f8>DB(E?fRB~?!UkhqhTxV_}MP1tWG?#k|l zsdQWBber6DcsJI&hj70wyAx5o&57P7;)j>vc|@QeUV)w# z13cESwKt}N1_It*Lu_qCZ@=B}So8@SD;paS2t5V^WaVN5YqPRGg8qZb$=MqJ&mzBk zufo;d#1xGczz#x#_Wb(~z{$bE&H*q5{3ruMw!!5S9&|{ys^sX|Lf3Y#Oy*)np z+l46f7$~m+D+kELlncxO<^} { + (match, p1, p2, p3, p4, p5, p6, p7, i) => { i -= shiftOrigin; if (p1) { // Maybe fractions or quotations mark... - const replacement = CHARACTERS_TO_NORMALIZE[match]; + const replacement = CHARACTERS_TO_NORMALIZE[p1]; const jj = replacement.length; for (let j = 1; j < jj; j++) { positions.push([i - shift + j, shift - j]); @@ -202,8 +210,23 @@ function normalize(text) { } if (p2) { - const hasTrailingDashEOL = p2.endsWith("\n"); - const len = hasTrailingDashEOL ? p2.length - 2 : p2.length; + // Use the NFKC representation to normalize the char. + let replacement = NFKC_CHARS_TO_NORMALIZE.get(p2); + if (!replacement) { + replacement = p2.normalize("NFKC"); + NFKC_CHARS_TO_NORMALIZE.set(p2, replacement); + } + const jj = replacement.length; + for (let j = 1; j < jj; j++) { + positions.push([i - shift + j, shift - j]); + } + shift -= jj - 1; + return replacement; + } + + if (p3) { + const hasTrailingDashEOL = p3.endsWith("\n"); + const len = hasTrailingDashEOL ? p3.length - 2 : p3.length; // Diacritics. hasDiacritics = true; @@ -223,19 +246,19 @@ function normalize(text) { if (hasTrailingDashEOL) { // Diacritics are followed by a -\n. - // See comments in `if (p3)` block. + // See comments in `if (p4)` block. i += len - 1; positions.push([i - shift + 1, 1 + shift]); shift += 1; shiftOrigin += 1; eol += 1; - return p2.slice(0, len); + return p3.slice(0, len); } - return p2; + return p3; } - if (p3) { + if (p4) { // "X-\n" is removed because an hyphen at the end of a line // with not a space before is likely here to mark a break // in a word. @@ -244,19 +267,19 @@ function normalize(text) { shift += 1; shiftOrigin += 1; eol += 1; - return p3.charAt(0); + return p4.charAt(0); } - if (p4) { + if (p5) { // An ideographic at the end of a line doesn't imply adding an extra // white space. positions.push([i - shift + 1, shift]); shiftOrigin += 1; eol += 1; - return p4.charAt(0); + return p5.charAt(0); } - if (p5) { + if (p6) { // eol is replaced by space: "foo\nbar" is likely equivalent to // "foo bar". positions.push([i - shift + 1, shift - 1]); @@ -266,7 +289,7 @@ function normalize(text) { return " "; } - // p6 + // p7 if (i + eol === syllablePositions[syllableIndex]?.[1]) { // A syllable (1 char) is replaced with several chars (n) so // newCharsLen = n - 1. @@ -278,7 +301,7 @@ function normalize(text) { shift -= newCharLen; shiftOrigin += newCharLen; } - return p6; + return p7; } );