From 8a82b103f1875a06990e9b4a6b252d632f4295ff Mon Sep 17 00:00:00 2001 From: Geremia Taglialatela Date: Fri, 27 Sep 2024 09:37:11 +0200 Subject: [PATCH] Add support for specifying OCR language in Tika This change ensure that Tika can specify the language configuration for its internal Tesseract OCR Parser. Close #302 --- config/app.yml | 3 + lib/colore.rb | 1 + lib/config.rb | 4 ++ lib/heathen/processor_methods/libreoffice.rb | 1 + lib/heathen/processor_methods/pdftotext.rb | 1 + lib/tika_config.rb | 61 ++++++++++++++++++ spec/fixtures/heathen/quickfox.ar.jpg | Bin 0 -> 6461 bytes spec/fixtures/heathen/quickfox.ar.pdf | Bin 0 -> 10639 bytes spec/fixtures/heathen/quickfox.ar.txt | 1 + .../processor_methods/pdftotext_spec.rb | 15 ++++- spec/lib/tika_config_spec.rb | 59 +++++++++++++++++ 11 files changed, 145 insertions(+), 1 deletion(-) create mode 100644 lib/tika_config.rb create mode 100644 spec/fixtures/heathen/quickfox.ar.jpg create mode 100644 spec/fixtures/heathen/quickfox.ar.pdf create mode 100644 spec/fixtures/heathen/quickfox.ar.txt create mode 100644 spec/lib/tika_config_spec.rb diff --git a/config/app.yml b/config/app.yml index 0471152..8ad03fc 100644 --- a/config/app.yml +++ b/config/app.yml @@ -29,4 +29,7 @@ libreoffice_path: <%= ENV['LIBREOFFICE_PATH'] %> tesseract_path: <%= ENV['TESSERACT_PATH'] %> tika_path: <%= ENV['TIKA_PATH'] %> wkhtmltopdf_path: <%= ENV['WKHTMLTOPDF_PATH'] %> + +# Other settings +tika_config_directory: <%= ENV['TIKA_CONFIG_DIRECTORY'] %> wkhtmltopdf_params: '-d 100 --encoding UTF-8' diff --git a/lib/colore.rb b/lib/colore.rb index 9b50db9..ae1b295 100644 --- a/lib/colore.rb +++ b/lib/colore.rb @@ -9,3 +9,4 @@ require_relative 'document' require_relative 'heathen' require_relative 'sidekiq_workers' +require_relative 'tika_config' diff --git a/lib/config.rb b/lib/config.rb index c034cca..e94d90b 100644 --- a/lib/config.rb +++ b/lib/config.rb @@ -41,6 +41,8 @@ class C_ attr_accessor :tika_path # @return [String] Path to the wkhtmltopdf binary. Defaults to `"wkhtmltopdf"` attr_accessor :wkhtmltopdf_path + # @return [String] Relative path to the writable tika config directory. Defaults to `"../tmp/tika"` + attr_accessor :tika_config_directory # @return [String] Params for wkhtmltopdf attr_accessor :wkhtmltopdf_params @@ -65,6 +67,8 @@ def self.config c.tesseract_path = yaml['tesseract_path'] || 'tesseract' c.tika_path = yaml['tika_path'] || 'tika' c.wkhtmltopdf_path = yaml['wkhtmltopdf_path'] || 'wkhtmltopdf' + + c.tika_config_directory = yaml['tika_config_directory'] || '../tmp/tika' c.wkhtmltopdf_params = yaml['wkhtmltopdf_params'] || '' c diff --git a/lib/heathen/processor_methods/libreoffice.rb b/lib/heathen/processor_methods/libreoffice.rb index f76f997..a983f59 100644 --- a/lib/heathen/processor_methods/libreoffice.rb +++ b/lib/heathen/processor_methods/libreoffice.rb @@ -55,6 +55,7 @@ def libreoffice(format:) if to_suffix == 'txt' executioner.execute( Colore::C_.tika_path, + "--config=#{Colore::TikaConfig.path_for(job.language)}", '--text', job.content_file, binary: true diff --git a/lib/heathen/processor_methods/pdftotext.rb b/lib/heathen/processor_methods/pdftotext.rb index d1dca7b..54a7c8c 100644 --- a/lib/heathen/processor_methods/pdftotext.rb +++ b/lib/heathen/processor_methods/pdftotext.rb @@ -7,6 +7,7 @@ def pdftotext executioner.execute( Colore::C_.tika_path, + "--config=#{Colore::TikaConfig.path_for(job.language)}", '--text', job.content_file, binary: true diff --git a/lib/tika_config.rb b/lib/tika_config.rb new file mode 100644 index 0000000..f6d2156 --- /dev/null +++ b/lib/tika_config.rb @@ -0,0 +1,61 @@ +# frozen_string_literal: true + +require 'fileutils' +require 'pathname' + +module Colore + # The Colore Tika is a module to help with Tika-related configuration files. + module TikaConfig + # The configuration template version + VERSION = 'v1' + + # The default language to use when the language has not been found + DEFAULT_LANGUAGE = 'eng' + + # Config template + TEMPLATE = <<~XML + + + + + + + %s + + + + + XML + + class << self + private + + def tika_config_path + Pathname.new File.expand_path(Colore::C_.tika_config_directory, __dir__) + end + + def path_for!(language_alpha3) + file = tika_config_path.join('ocr', VERSION, "tika.#{language_alpha3}.xml") + return file if file.file? + + FileUtils.mkdir_p(tika_config_path.join('ocr', VERSION)) + File.write(file, format(TEMPLATE, language_alpha3: language_alpha3)) + file + end + end + + # Returns the file path of the Tika configuration for performing OCR + # detection in a specified language. + # + # @param [String] language The language code in either ISO 639-1 (two-letter) or ISO 639-2 (three-letter) format. + # Supported languages are those with corresponding Tika configuration files. + # + # @return [Pathname] The path to the Tika configuration file for the specified language or + # the configuration file for DEFAULT_LANGUAGE if the language is not found. + def self.path_for(language) + language_alpha3 = Colore::Utils.language_alpha3(language) || DEFAULT_LANGUAGE + + path_for!(language_alpha3) + end + end +end diff --git a/spec/fixtures/heathen/quickfox.ar.jpg b/spec/fixtures/heathen/quickfox.ar.jpg new file mode 100644 index 0000000000000000000000000000000000000000..bb69b2408895477326867eb0862c07d547d8d267 GIT binary patch literal 6461 zcmb_gWmFtZm+leV-QC@T1PIPBxDB2F0fGgB#c}t+A-KDSpdq-25S$=Eh7dFXf&?3u zyx+Uu?)iS~?%Ca^>zwYYTUA}%_x98GshjzmRe%_-rlAIaKmY)`J%F14=z2jJTR z0fWI1DC8DUOz0nAVqyLTtiOT%7jXUn@1OWH3FyyA2n6G{;9+B8|I_wAzPRZDNU(u4 z&^ri-6#$cfAS9rhK7jFdkzfb_`Lp1^fsKQKi3Nt>LjTyw|C8Zw`^^G?54jyh3?aUK z$j+{RANRjJgZf|E;dQ54f^_@KzOf#g35W^dZ(8&1)e+e{4JNL3EQwZXZeGj8Ez#Fn zODbU}wPJDdEi6*;)(McTVvlnxN5o-loVGzt-P%7%+QPFXWm$HXnBbI}inZMXgF3z};9QunWI|8Mm_&f_5l zZleYS0b^kNJue0(6bph420+B5%;XeQ!U1FwcO><0J&JwnXAlPXN+QY!Bbjs6a=1TB33zG_N-^ram-ZOyKWp_w4aOL(Z^p!OL5I*i(D2B3 z*4Nw_uTL|`-hT=a7i3E6L!#|GbVG9*&}YekmlaVch3eQ-C;NCtE2j{9Khh$e^-c#q)&H1(+F_s;K9;crYxv8zrQ)vkSDvS}MLBl)G9@|d3}zQ$Evw%+oq zf#Q(o69HMQ_{ zWht;yHe?BzEJifLN!1Z)@qA+?G!EV=SF^6j6Kwxn6VCTeu|$f1NyedozXvJd3PP*H zuC3qF$30s#U%5kj_RzQ=aW0rn@J@5y)~<#vv7{R}zTPK6qSD27Do+8UUW2WwZ~g|* zsoG_7Fg9JE8dq|ytf<1(!ONY08un|OzR*$j?PZBlh?G&3OYyISRAXQ!zehHgZEd4lyR;scFO*SGiYj zXGT7%t*(8 z)PTQ2@|KnEf$;tRiypU}`gac)F>o8`P%tJo2pfao&n&lW3&kJ-FiBa+M95hc^=$&N zh$)ywl{^yI*oDOm^6MK8C_TLc3dEK542^BQf)d|$_e}i#DPZ0MqqjdPmLXV0f9aNp zer{Ia@ljgiYIhZ$9z-q_3xy4Fp^R&7R_e~oM}Leu>O7}lmCx^EcGJmudf2-aaKtE* z#owN{q8R`-c>Sy^Q`bjGyfUDXF8HCl=gyrIX@yvUQqG20H_;SuBUhT~(-&*TEk_oX z;w~JkHN~0ie0~yDt8VL9}=4BUj|ue-Ud{cTR* zp=xhFrscJm`!;e@+^L^eqk<_9iCQRlXg{bTt`2ODt=q%2g6G7SMg%<5`CxK^epuo_ zt3_k%JT#^4<98w;(~capdt}Bfc6!p92@mdp@67<#ndn7`n~jn zku9AkYN9v5ssD9+O)TG+4$%Y=O_frNle}mPUHg5n&bv<^PF$OZaz&H+KIQ#v@YMX- zrYzC?+D}!Qu(ng9A}4i_Rw2B2hiSKVrnQA3JNz}g*nWP8L0ZPHZGB|vS+3aT3k$Mk zi`^%(j0Q@bW2Nz>u!9Bk^U#~yf7u#ix$)~}9b?2pW^z?{W6IADXb1@(>AEy%Sb(En z{T0gH#O6ddTk#WaFwGpDpO_p2?^^j?a3n>xJInhij7?3^V~}E)M^_e(&`Z28JVaRR$gMXs^mG34 zkC}YfGGVVT?gcsh-F{zpim7Zp-gARuI`-$FCZ2oq^JviSi(K`ijBvv?qlFcnA~Vi! za!s9l^Xrn$cn*(nWc?_|4h3eX{6>%ZtndV6mUVcX`hF=zPWkhpCPq0_pBig=ef$#g z^T#7^sRAXoDJA!LDLeGlUF~q5G&_+CHDnzkzHC#3O-v~l{;=rN`x^4coJI!jojc`{ zcfftQk?IuZRWgknMrFGm^#|2z!_aH6NA6Gb#3S^|CZ8X29qy2?fxL36ecSWCEB!~_ z!RN;ZMR-?;x^IgynOOWH0({|y>(YZRu&I*ekhM4aHeU`=QfAYZhN$Yh{8b>IXPPLA&rMG@Rh3yBviAS(mx$i+P?PGpncX+g>m`B z^JnqUWQAvz;9(+S_?u$raObZK!k~_8RdNFC~bIeG1BrbeuYfWx= z$8v8CLbk3-RIxTLN4Rlo6gZhd>l7t4KxeAeiA%c!X7 zSNCtF8z8k{iFtE<82w7PLRG+r+NUUto{%L!@53*9s5TE$ zY6pDRv_8Cvj?q<-&7Iknl5#TLVK z!GGd91sVv+QHiTxxIDUCThP%8H$Fvai>C+-rA>@Rylq|4-bY0h;H0;)M>)kZAV-vv zcr1r^{pIFWMfvzF!=2iQ=NW8^`D|Pbw6P`$Q7s@_uj8tPw_E3 zJaV=YC|QYHYRbSh!7(AIT>qU&wAW8G@xJL>t9kTo{_{?9+JM%pvg4;O<)$=>O*dr1 zLRhY7cXv<8Xn|OMBStGz+a;bQ)zU&6-8LzRmC+~mpsGMX6+d4zGZD|32sF3XEWc;C zGE6bM;wO8`fE>j!h@MEgQ2Dx`o-yF?iLhRe#HCurS(2dW(4ZutXwTT~b?drN(TU-1 ztbJq4n(`4%`wwUJaTm_M9N(708PCYF!33=`2_Eh8JYtm6wz&rGJsznVhXp*{W)knA z{Wux#Yz>n@wo{ahadD%C?PCpD3Z~q!Xi{0B^Hkn17^g3_UWN9j_(8Vh*}SV>m;Q{P zSb<~mT-|HP%R^YNSagPb86h?ItAKcS3!=ZozGn;-X0!b$UWWX)Z<_jjTe z-`DJ4jI_CB$IZ1!(O5`Dki$iX$czd4WsXA#9aYifl|L`*A||Ga2?CM>=doIKKhltF zNSGIVco4&IO;>KhK(=H;d2kF^v_zPkeCO(VCPrH%52Jo^Vow3L)x*vXfj;RZNVhjQOt zUZy8(O{6#2v#j2C)}N}b`JBrnODOQp;+5w~tI>!Nt*TN3J-k!M1}h?POs4wd?}RX> zP*|`|TCXlufp{{|?i0WaaX)?~gs^$bqaSIgI*qZ#^|&@{JGn%k_AVQo&eaL79>BE1 zcAK7Y6W{8=KS~1%zEy^QC0`^gfXHpmMa=9W9GH+F(5+WLvG(se;~oo|OFcowrGGKW zm`?a(UV2;jh+2zWBXPRPDx}EH8jPza4&tH?dtcza#uK7zr-6f4>3G}<6%11JGA}v| zjKgCsoZj>?l~lZgOt{#}f>tU}hh$+|=d@Ywh#4YCzA=(5R`o%Z&bT3IG@%$E>!V!i zR}`McU&>_kn1~8;*c^Hr%@`20-~&s;F4)}|4%4WI{FVwygP)B)lv4=cgryHVV-Uv$ zoc9xT&(*QGX*x;(dS(xw*F!)(io{}UBO6G&?Dl0G>+A8F3~rrSdKtI6Zco#!dP+tW zb`9*W%79+bTBr`%NfpH+8hc=r|0F3|3C)~Hu1p(ZH-2xd_nBWRDy#!gOm ztvd7-=v5b{ymnpEIP#8CNHQ>y*x=604$a2P@s)7z#5I1t`k@YYKr6*AgXGX-Ut+D> z=m&ga>KaG=s@CR6=Q}@CFJu)ftOwo`t*B%*>maPq`cX1|$m2WfrO8Vk6|&P)5JoYF z!dBu{>xEwOCR&--{Rp-&P28X@n{h^`9p4!DAmgc*32Ms(%**8eDBpr##3|2GADsh8 zL5osv0wft6!#wX-Omr((1QWt~}M!4p*Zz;7jy<(a z5a>Nm6vCYi%krf-w(^|Aok9e%Qk=rMoHK{Nb&t27Mgo9Vlhs+F&JaIotMGcV4tDOg z8513oyS4m%-)tSh?+`?$ST8dD{mYkr9E6Fi(7eZ-lB=IKGL1B;p0H761W6&;^xrTf z28T5&1;`UQOy^!`kPj7v7E;hqDQmhP&F)k|CyQRP`e(6{8$%P46}+mlyMidwL{Kvf27@@Ho%Vmb~%%H2GTL$@S?+Jk1lFaS))Bg=n0IUE_3c|i+ zFF+k24uC2s0030D4oc2~DT^eM`mDkY37`)A*PAf^Ys3Fv{lAd9f5b5He=COn(Swcn zcE9ltBj3{Tf8t|eMP?geeUE@Zy@Y>KF)_LnE}mD+8VNJ60M=T@BqYho;?)OwtdtH? zamsuIl*2#5t5`$P3o%~tTcwPH7&aNX5G{}%FV;~RRz1!C~^_r4SgGFdY1p@I#B+D)CF0WRn!Y-z~eVI`%9NH znP-Ko?(pJ0bW#rwgFJPcv&!Q!aB`k$ci42_OhpCQt2FpU8$ZD^bN_VYZ&AnaduN*B z8VTJ_7=uB_^Vlp(@i{>`--AC1V)l?~$h12quR=g$GEC3L&1NpbRs!|*7t`E0K-^Sd zhDW8)d20zINSaLA%LGtFIF7QIKKa%UHY%SJB}fuQu}a+lIgF<8AeJ-Gw8X(XLS^a3 zyF{AKki3Kuub7CKnQ`M*K{km8ixa{@Ak|RzK|+?&yK!S-Vf zrG-HhrT<_$180if*Q?c1F^EGUzfzLLG~eoP?~E z`b0QhuyS(n`tBuF{#v3a6(gD8r{igqvv*ahdzk+7I4Ob+FCBS+(VN~u%Zqm#fG{#j) z0#7i*ytHan5B7Z6<37o_t$dYYGhA7DjlZJ)=3@tioj`t?izI!7T$2I$BX63us3?N4 zb86Y70(D)5^{3wo)dNMl1I_HXF-IrO@n|_BEhQV5at4kgV@TVBO=C-ncRSz4$Hu8w zGc}7C4wjNH&`#TsD8x&DR)~(QT;OMB-A$!kS92z`gBX6nOG$urw5t_nU2*jFn)$?y zgO5*216z+28!MIl5s zOLjA4EAT`xV^F%nb$K8#YDx-}>7#HbN;9y0(9qEelqerHcrxVMu955${03rl1+n_! zzq2TGf8H{J@9a4>OI35DPzb3kz!G?n&NQS`#)tXawEW#qI%M0A2#CD}-zXd?m-{zSb#@3p7KaooJwcjpE6Xuj;&fnd qu#gw14xP-8b?pZ*_M%^s1QstXo5G(>^!TxKelgAj&>Q|YU;YhRqOqC) literal 0 HcmV?d00001 diff --git a/spec/fixtures/heathen/quickfox.ar.pdf b/spec/fixtures/heathen/quickfox.ar.pdf new file mode 100644 index 0000000000000000000000000000000000000000..9c613b03cd718abec03e4e91dd00ef3295e62eba GIT binary patch literal 10639 zcmb_?1z23m^5+Z;?(Q(S!@%GHf_rcX8gy`%Km-l$uEE_kXmAe+gy0@DXaa;l@Evk< z@4N55{oa4S-7U_?Ij6e2y1S|$Du(P=x zx~M3aTh7+O1Lg+5JD7REWMCG~mM}243e3sc!v@U52L+3Zqq}>!!OR@dy@5J0b=bWS zT3jUX&TxfO$bTQ9Vn9La|-%o39p;SgIg8m3um$A4;&^J)Old<`bL**@+rcb6ezhn%@_+??x7-`y_pJA__ga_vxB)G+gieH$Ojb!b1T4Xt!+HOJp4THwZM1l=Hh&BZm@@&Ck*}X zYx-j?zxu#6#;p#6E6T;g)64X7f?pV@$f$p{{U1Nv^#brv0ja<@h(J030v-?%4|vxPpoT9J z0TF3;rtAl`I569xgwGvIycNpZHzF}hlr^S;QmBcoC`_!`{2OL+7hu5A z_id*6q>@Ml5gTm&kFkDDZyOkc1ENM|*G!)in>UW~pUvui(jx@`&=ro_tFg7dna7{u zYt@i+;eBuA<#dtF))#lRWWLt_+N*xu()=MUmn+4f_GqA?zaQ zFh^3V(8?h<9OiUcGKib$^>^eMf-s-k76s5Vo9Tm+C8ouoXg@(t{y<|y$d6rJ_b}WdGizcx@ z?$**(EjZhxb1GPwOR)p#+M+9Sz1Nb;RCnqVzgJ2I#gdKB^V`&8&m}81I^6=HjgA9( zg7@d-ViM6$!VP2T1d;(aktJFv0FG*+I2Rn1H#dz6AC8xyfeLTq>!WtcG(HbLZt9iI z>zmsr!Q7k>VvwEGu377(F>CvvkM~_I*^vV$w%S2lw9aTn(`{>{mM1DSo!_dgX_k^? z&^LQR+vn=M&gZPi^0w2XW30k{&dYtRaj`Vy69!ASw2kaBOnKfTf0yWMAIJ3OE`rcF zy{6;{pq#4eCx1tLCTtgzbk|DNVfvN@%Is=2_l~U_L;qmMvUHy=9Cs4?O4(4^U^c%{ zWil|vj7#!IN0r^sBElE2$M^dEnB?FZ?{Ke-vf5Um$%U2GrGN_2pryDZD9$t!Nk_P$ z_vUJFj8d0a^|~bME10*k68js;Vj(OV5vzQT-Z*{-;H|vEjcEgAjMt*UssQ=9h0cKP z1y>r@8|683%WC?B;vV$aI?s6i3VZX(TnVH)Mf%GAxjTSr8bDB0{MEWUbPe`h=s8d6p6*CN%W)PW=8;Kz@jDyE`UXs!PZFU9Fh+W3vRSr_>sAy8~ouC~l#M zJ7k!7L#M`Uwf5X#^y{dNsy7jxcwRr)Nj3ZFao>*r2{q4aj`rMDWq$vgvVzW=IVU3>q^3QoPkF^b*5~7s_$Wt6p z(ibu1llBaal8+V?A5`hG)t-Q^Lb55gq77Jj%*#*nz z>d9uz%epwqn7s{=h%yPom%R!|(&LXh-Z>YXvyF}t>c8JJR&4sgs>~2AQq2c!+Hmzb z>#gt*XBat&w`Nvw58DXWy#qYDMi!$d{l@JdF(8>Tf1%MwITYT~^+S&L4shmo6I&h4 zKHtF`&!a3;g7hOd(ohX{=-&0@KP9FTvp;r!U_hgz2882T%+ zSqfjM7!})E*#km?g-D-q>lcr{L`+5SO7Cdl)8kWZFsJI!eR+Fs{-8FCmrP56#Uo4B zw&`7$`t~k ze4L3Uv((uxG>a%0yn#)u59j7?f&0&L0 zWKTB&O}Eh2#)v&mXXFx7x-y?%_H|R**fFvBOE*PwuZ~KjD;KwKMZ=%U-ClDY$I2_k z6$<5RIiA&Cnd=(cFF;RPDHV+5?f@)N1s|%W&0`Lav)OAuHa4hNYFqLWXR=F27e?1f z7fvs5DA++}g56aMd~Sa_A$zS46etB|N7J&fJI7WT^fpGWs^*kr3x%}3`ka!*{F6t$ z%BqZt2}doxC!SRVzolFd6zrzD>5Wy#XoEHXX;)T{{+wA^e!J8paOEaK!o47DSGT}BO=}X#C;T)QJA4q|= zPob#U>H~ROyUrH>ghvQz3c?jkXxp;yPNSm1IL+^xurS}&npFRW_Td^L!MZGo6JkIx zcgHfV61Ak__m+b}?c#o{EAsO1b(@Z}>$`hCD9)>dGuo#++J#NudwxpY0aEgpz}xF1 zw=W>&vYeizo`oTl*tB`Mog+#Yua1_?Gt+G_S$_;gY$`&m<)$V!3=QIPAH~KDIdeec z+$zZ<=R+98W2;g;iyBMJ>nwcBg&oIwsO>Kg-UZ|u#hLdw1 z`TcXAax`n4xk)0KJDS>>YG(aEFC?d+T838jQK888m@W>Jz`>wwnV7nT ztCOp>1yz+$owJQce94@{sS~4N4Xvw>4mTq5(bC!&B5b3n;@(LmvKo!-`-#oT^0KoV zh1#~^&QX~cv70$)?S2|e)r*?6r}@SRrqrRZVJ_&7pTK{i-B?JJlgl>w1BHnEOF_E! zIe21KlR%DvEYZMu-Gn{zyK2tjLQ6fp4qZq7yK7(>`or&VOR$Uc@f#mMl{ZQCap zaeJozY@e2bX}9pwp?H;2e%422xwsory9SEr4_Sq(trjrUn(;h_4`W0;vJ@W&(4TFH z=oB>?nma0r64B&@MBIQcdksBgyAW z0`&scEb`QUw~ z0YZRhhQAJ9c(ec>_?ZhA>)xG>i(H(^e2yprpG&;c-9=23^6_wx^z; zq(EH!<<4tRg#>BPYh=^xHlsbLmM-2FHNj$KKS=7F1u>N@7zt>4l0*7}$W3RyR79Ny zCqJ9ss;^O>N|zkr$VjVO;XxFmUW5gQkwoIqNA1osB2KiBv=KWb+!+6h0i2%MT3RP% z8-9SgzJ+%kBCuBy7fSzbGtM%reHqR4X1qF`MRkT!#HqH&P49IbF|`bXBI=6tF%)xC z{gs<5F`Lp;77~q*vioFghmz-)x6ZSs8!u#nA1C|kf_c%jLA~~yd)YlT@}L*M7v0F> zk7^Ug;{>FFk^mD4_0C`hP!>kE55IF4x{mkf&RXH)!(L zHD)KOUpPSOL7!1VdX;)3C8dmKtRNDt?*(IrUBA#>J$}wALvVHmL@HvG*h#ok!T1hln@7#^UOQ0nXC`H~)9q0C%gi}(anRynq%%t(*YYmWF z?yC^cYyQYS2hMqPJYeveZO*k&cb^x9Zel?rC8m_mrzEO(GjXb45S1nIZJ|-K8&qRl zBQZ}(<9m>qbFLHP8x%*Pe0@C*npD^si7+)Vtd0#F?f^k0GAf61%<9-5_|%Z&TT;nd z;$rId54m7*9tm1zDNMcPp*n<-*I#`GeufQVtkBU9ZkPqhj0=U1TVi+=1cAme?UMT-G*x*0e-9zNrG6< zz@Td>EA#=xXC5e))p|9&Hm#D00aq8VjwC!dI97C zxc(ps2><{DsRa?zB8$f12z``cLG&jL_~#_xe=`36z5ka~_dXfM{U4LzKYGyP!oN4( zM`U<7{#!i8l?0nXG+g}y)Z_mO#kjX6N_@FRbm0mH<$$%85B!1zrLppZy~a{UDQKmh zoYJB1l`83iZx^E6V|PlZhmg$Da}ZU4>TD<{r6|p!FwPZGM*x6Rn)KCIpHC~S-W{)Z z8Mq-d3gH(M$|(T54uER|(Pg1MWIwneT<<(D<0e47Q|OYQ+c@Hx9l-S`D#LU8QW39$ z&(2Ck2e*m6tW@HpZFVw_9}tpq^?E|4`lrjw5!_2&J!|8@S_ThHh5zKW34M63%%>RN zV~aEtcshqlD;1j^nEmC|7#DIczM@FGP10vX;0F;JuW|k9%aGLo^~1$fCq^I(2?CXM z324rgKMp8NAng7aAgOCJN~`CxHGrU9Hp`2Z$h$!&bO*?$)>8_kJqJ$lAHBhr7H)ii zqilzm8~@HdDlBSxT&I{wF$Q~f5lV|QUfWYolA*BtL>?#VXdeCbUl+|yOXFPFO znBpvTr($=2IafjNS@AHR@2!Xg3#?Gp8NwfI-vg(&M#>XH;uU(C6iA#;oP@!fWIC0J zoG!>A?kY91M+cq^F=HZ5t1CkETC1xsF<0d?-ggjLa^|Jl3sQ!OHE9xBdyuU~L|}zn zkcuYe%d1JOKiw*j58&+y(6?kk9{ps1LCzR%B-pr=J$M=(h2Q>4FS@vBzpEiOI!3;V zrkO`;sDyBVe98<@B3AgLL}Yly0tc9GKZSf<&JNoWQEMI}IbNZoU9RBuHDiCDzGuuh zaOoB^Niz(q1n}`IK7JT(v3RSG8gFJMPy$02&9WMwtgvvGO*muw>Bl}~0ooY<>_B2f z@v`pE-{pZxa=G1;p4~me>xxr6XU0ki=}X*9#Cr8K)k8jbm7E&cU$Qh0%E>ZER=I$m!! z-@Y3?WTRfwZ9lSMOGx+}4MRn&CWBgywHjzL3WR}a6__y&O>cF;AD_+StDpfuER~<1 z;jhg~%RdL;)IihH&e9~}5rPq)K&ktloalQ7cX$ve&a@>Rck#z4#5V#TsIwm)D>t$} zr|f=h+LH~?W<1F)STd3XQFVy6pJ3s7aAimwNtgLGk#u!H9QhE7q+H?*cBRFrjnFz! zH)#p;Wru&veds>)r|!Fbju%k0w5+G(HPP!!+cmF~3Ao+#yPN;dN|(^zsz9pnx)rzN zeG$*;zNEt~1AA&~0n?O+m!E!@@puJzc>gTV{ClDAUj-U(U{;Ekw$?mx=y-=<$47iR zFykrtb2RX$2MR1KQ3t2*pSApiUXeYSpgFPBD zvRlhh_GsU#_`k1Y+zjcj&VOm1owxqxGusAGzYYZTdguZD`SSD#zwY-K;S-GZpCgds z0Ao-s%2vgS`V(ICn3WE)2U%C7(+ZQ>y)S298?)>8E1f>~}0qC;R0;0Z5 zBwGkcS`7naymz2?r^&3mAw0aqd1(2SbQ}FO4re9X?Cm3IsZ5)B?{He$v=Nc?M@ zQ)GdR`i|)~@rPBEhllzz+cpJWz~3H|Gs%ovde2M+GQ9D6SI9l@)dK<9T| zA;RbBqhGId9~6^344_veg2H6Cic5->W!}HX#7-TzygA3c+vpay{V>wU@j9oQ=; z9KmPb5SIsixbv+|ReX(mEbsH%FPAhJMiEYHDw1?s+h>@umTrvs9ApKnK@L3#_;Dy6 zx#)ZWTo42fQ!Gk7!c&IfQy)d$fbZ_D*>vP6rTg2OY+PCpT4ql0Zauv{|HhI z6mUT{0zK}=a>2m#mzO{f=~0FNR|0HN1CMZt@+HgThzoJjrDkv`3{am*OR5VIA`VE0 z$KfVOTzq6IC1rb6o+qs?c1rI--iB)Y>M|GQBbX2Q2Iv!nmxeHDhBXIh@}IALK#3~f zJNjA1o9HzPN%!KKQVX#-c5^Su8pRQSBiN)j5mjOs6>&mN50&CrMk-E^0f7~Tm%-G zPuCUeF;_l(iY9=U8O_ZH4e#AWo0_k@-v7PINy?KkC(3n!zsSI5Y#0eIO2P3YfuxK^vDG^$Y4v z5MD2zMnI{W2VDs6vwkCU*1DKE$+wD>Osa{5iXjXnnhdPzjRD$M2s8FIiTj5rAo)stA52-TD*Wu@j-g_T+*o=H5DF4Dv*b(+#EPixCEQ0tKK zQS#xIPlTp^Pu&=)O00;tPu@>`!Kjuro9dNn!j;2pLhGk=TYTMupwT1IqZLq4tD7gc z*4Qf+tSFn2pKKmxo?#wt{%W0nK!fpZLO?<@zFL{YZtW+=B3g2($)abUS{ffUiY&ku zAXM^#g;$e_y91{*I~}KJFh-c+7Rwgp7Wr1gJ5A(Xi0MzLCRtHbVpMxn?03HH*71$; zMS&XICwFgIlP}nu*j^^nC0iu}lVi(ObeyLNr?bi(%hjeJc1(6=cJp>u(_9tN<0a!> z+2dIze0Q7Ip?kG$;m=zx?SAy_iR~qlXOMT2lacF?r}7Q*lVvz%jAU$OeBm2vv@yVV z12Wh$`10lnBsu--du=PmhDhPOs_MfM&I$3tjKq=F#_WRE<-`dW}{{; zFDW%;Zar?38jWvx;u<+sIfY&FU50+|x<7&!Od};TdrI>n^195j%?j2m=*#0PCfEuL z3LXp!Sv0vu9E)!lGS!A>DJCdJk37eH@6p9dMnwadiqs_o-v*k^QCOjkT8Vxiul9oXk{9#2`?gM6m(_IK7= z#*U*lP?%pZ*G(#xtY+ztv5sMl6<}bJ9Fa_MlDbW}uC-9Q#dBG5MVWZJJzhnRMoY?$L%N0A0GETgFo(9kQ_)CMI3_{4Ijx9)Yk*- zX%D2~{$#>nG)0UcbuIlgSQL#P)0pVeos^4%uafEDS&NRhueTS7(_B_Yy1!i^cKFk< zd}K7SGO?H3i2Q);wEQF47qY3c%BgxB`VE_MQ3g@qVliSG3ZfSbPuLf_ z5`Ap^6LuXBN)4(HnvhBDHx9Q@tX3pa+(1y1eDm3}-sB`;vn(ktselz)Hc|G$HmLDu zTFkA=4}~a(OvYFG7!7vy zE7tO`oOc@U@wQJlGd7(@sYbq!T>0`_@q2k%cRsxhx$fcrYL8tNJX3A7*sAJf+|#!9 zqjNvj5$vcie>5sL8V#dZb&6P-(#X}Q?KHJ%__jSL)hea2*1pkh}YTG09vBt$EGygu}k=36$V==`$?pYmPi zo1C+zj|LG2c{58h;Z21Cwl7p(tnT_xW>Jk%aX!qvdwCjlmY&h@&RTgqevIgKrf`mz z?}^l4@Nr}~NvOzU5An9f%}-g?wbd)50jV`lj$7Tz7CWx4by6#=tA&jp_+yC+Lx zyFM#76DVakPaeL1E_7paV{x#N&y23dmRSp@qtW{aZ=|S$S>3Bn!b!Uq%jtN%Qg-ouCP(MfO|l%lk>l$krs5)Ko(M^zdB31GTMVf{OR=^muz{@-C& z_qZufIOGV9GE%m+bcf@t?y+8QseeO83IB&+X)_Nq2WRU)U|{aQ@n8RfQVISZ@TV{^ zx1PQMoc9OR3;yuentASlV1F&z*24h?W^whjwXnBx_U3ZAUp|+a8!LPsH)l&v3pkpN z#nH?{O$)3Gb91+Kc7o%sxFBF77Aa3#2TL$7pClB@Z^R1L@HBJtcn*dG#(p8tyggXq zP%Jpo&chil;FnV1B7ZU9GGy-0_AD|YybxYKh%hgN2g(oOgBU;{%=HWqy{6$~^etsxF*b4k7j~9*``_&3|`jaOtBnSt+-P6B#5C|{aqW|DQ z`TjR!p>PBJqaU9D++qHW#{+@DbCrMRK?Het|5Ki@035FO&$0Y}Tc3xUnXLoN?H9;V z%l0`8zF#=_&)L}neD5i6oS&kTl{5I4Y42ZT;7%pV!)pmK=QV@!T9}&)L!eL#A*h*v pxe!Fi3My!2E@UZ={y%s5!ztZ8;Ku!>XdVbW|3YVElvR^M|6iKfq>cap literal 0 HcmV?d00001 diff --git a/spec/fixtures/heathen/quickfox.ar.txt b/spec/fixtures/heathen/quickfox.ar.txt new file mode 100644 index 0000000..36b2c79 --- /dev/null +++ b/spec/fixtures/heathen/quickfox.ar.txt @@ -0,0 +1 @@ +الثعلب البني السريع مفتون بالكلاب الكسولة diff --git a/spec/heathen/processor_methods/pdftotext_spec.rb b/spec/heathen/processor_methods/pdftotext_spec.rb index a48484d..8895141 100644 --- a/spec/heathen/processor_methods/pdftotext_spec.rb +++ b/spec/heathen/processor_methods/pdftotext_spec.rb @@ -4,7 +4,8 @@ RSpec.describe Heathen::Processor do let(:content) { fixture('heathen/quickfox.pdf').read } - let(:job) { Heathen::Job.new 'foo', content, 'en' } + let(:job) { Heathen::Job.new 'foo', content, language } + let(:language) { 'en' } let(:processor) { described_class.new job: job, logger: spec_logger } after do @@ -14,7 +15,19 @@ describe '#pdftotext' do it 'converts PDF to TXT' do processor.pdftotext + expect(job.content).to eq 'The quick brown fox jumps lazily over the dog' expect(job.content.mime_type).to eq 'text/plain; charset=us-ascii' end + + context 'with Arabic files' do + let(:content) { fixture('heathen/quickfox.ar.pdf').read } + let(:language) { 'ar' } + + it 'extracts Arabic text from images' do + processor.pdftotext + expect(job.content).to eq fixture('heathen/quickfox.ar.txt').read.strip.force_encoding(Encoding::ASCII_8BIT) + expect(job.content.mime_type).to eq 'text/plain; charset=utf-8' + end + end end end diff --git a/spec/lib/tika_config_spec.rb b/spec/lib/tika_config_spec.rb new file mode 100644 index 0000000..e2bf234 --- /dev/null +++ b/spec/lib/tika_config_spec.rb @@ -0,0 +1,59 @@ +# frozen_string_literal: true + +require 'spec_helper' +require 'fileutils' +require 'pathname' + +RSpec.describe Colore::TikaConfig do + let(:tika_config_directory) { '../tmp/tika-test' } + let(:tika_test_config_path) { Pathname.new(File.expand_path('../../tmp/tika-test', __dir__)) } + + before do + allow(Colore::C_.config).to receive(:tika_config_directory).and_return tika_config_directory + FileUtils.mkdir_p tika_test_config_path + FileUtils.rm_rf tika_test_config_path + end + + after do + FileUtils.rm_rf tika_test_config_path + end + + describe '.path_for' do + subject(:path_for) { described_class.path_for(language) } + + context 'when the language is found' do + let(:language) { 'fr' } + + before do + allow(Colore::Utils).to receive(:language_alpha3).with('fr').and_return('fra') + end + + it 'returns the correct configuration file path' do + expect(path_for).to eq tika_test_config_path.join('ocr', described_class::VERSION, 'tika.fra.xml') + end + end + + context 'when the language is not found' do + let(:language) { 'unknown' } + + it 'returns the default configuration file path' do + expect(path_for).to eq tika_test_config_path.join('ocr', described_class::VERSION, "tika.#{described_class::DEFAULT_LANGUAGE}.xml") + end + end + + context 'when the configuration file is already present' do + let(:language) { 'en' } + + before do + allow(File).to receive(:write) + .with(tika_test_config_path.join('ocr', described_class::VERSION, 'tika.eng.xml'), an_instance_of(String)) + .and_call_original + end + + it 'does not overwrite it' do + 2.times { described_class.path_for(language) } + expect(File).to have_received(:write).once + end + end + end +end