From 54bf5d7035ffd00ebbd2d219e5db644855b388fb Mon Sep 17 00:00:00 2001 From: Kirk Wang Date: Tue, 17 Sep 2024 11:04:06 -0700 Subject: [PATCH 1/5] =?UTF-8?q?=F0=9F=8E=81=20Implement=20all=5Ftext=20sea?= =?UTF-8?q?rching=20in=20Valkyrie=20for=20PDF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit will introduce the Hyku::Indexers::FileSetIndexer to add indexing logic for born digital PDFs when using PDF.js. We also change the works' indexing field to match the file sets' indexing field (all_text_tsimv). We also "valkyrized" the logic in the HykuIndexing module to accomplish this. Ref: - https://github.com/scientist-softserv/adventist_knapsack/issues/769 --- app/controllers/catalog_controller.rb | 8 +++- app/indexers/concerns/hyku_indexing.rb | 2 +- .../hyku/indexers/file_set_indexer.rb | 42 +++++++++++++++++++ config/initializers/hyrax.rb | 2 + 4 files changed, 51 insertions(+), 3 deletions(-) create mode 100644 app/indexers/hyku/indexers/file_set_indexer.rb diff --git a/app/controllers/catalog_controller.rb b/app/controllers/catalog_controller.rb index 01940996a..fcf381063 100644 --- a/app/controllers/catalog_controller.rb +++ b/app/controllers/catalog_controller.rb @@ -41,7 +41,7 @@ def self.uploaded_field # IiifPrint index fields config.add_index_field 'all_text_timv' - config.add_index_field 'file_set_text_tsimv', label: "Item contents", highlight: true, helper_method: :render_ocr_snippets + config.add_index_field 'all_text_tsimv', label: "Item contents", highlight: true, helper_method: :render_ocr_snippets, if: :query_present? 
# configuration for Blacklight IIIF Content Search config.iiif_search = { @@ -83,7 +83,7 @@ def self.uploaded_field rows: 10, qf: ( IiifPrint.config.metadata_fields.keys.map { |attribute| "#{attribute}_tesim" } + - ["title_tesim", "description_tesim", "all_text_timv", "file_set_text_tsimv"] + ["title_tesim", "description_tesim", "all_text_timv", "all_text_tsimv"] ).uniq.join(' '), "hl": true, "hl.simple.pre": "", @@ -638,5 +638,9 @@ def show def render_bookmarks_control? false end + + def query_present? + params[:q].present? + end end # rubocop:enable Metrics/ClassLength, Metrics/BlockLength diff --git a/app/indexers/concerns/hyku_indexing.rb b/app/indexers/concerns/hyku_indexing.rb index 41a834a90..5747a01cb 100644 --- a/app/indexers/concerns/hyku_indexing.rb +++ b/app/indexers/concerns/hyku_indexing.rb @@ -22,7 +22,7 @@ module HykuIndexing solr_doc['valkyrie_bsi'] = object.kind_of?(Valkyrie::Resource) solr_doc['member_ids_ssim'] = object.member_ids.map(&:id) if object.kind_of?(Valkyrie::Resource) # TODO: Reinstate once valkyrie fileset work is complete - https://github.com/scientist-softserv/hykuup_knapsack/issues/34 - solr_doc['all_text_tsimv'] = full_text(object.file_sets.first&.id) if object.kind_of?(ActiveFedora::Base) + solr_doc['all_text_tsimv'] = full_text(Hyrax.custom_queries.find_child_file_sets(resource: resource).first.id.to_s) # rubocop:enable Style/ClassCheck solr_doc['title_ssim'] = SortTitle.new(object.title.first).alphabetical solr_doc['depositor_ssi'] = object.depositor diff --git a/app/indexers/hyku/indexers/file_set_indexer.rb b/app/indexers/hyku/indexers/file_set_indexer.rb new file mode 100644 index 000000000..a7cd8f319 --- /dev/null +++ b/app/indexers/hyku/indexers/file_set_indexer.rb @@ -0,0 +1,42 @@ +# frozen_string_literal: true + +module Hyku + module Indexers + class FileSetIndexer < Hyrax::Indexers::FileSetIndexer + include Hyrax::Indexer(:bulkrax_metadata) + include Hyrax::Indexer(:hyku_file_set_metadata) + + def to_solr + return 
super unless Flipflop.default_pdf_viewer? + + super.tap do |solr_doc| + solr_doc['all_text_timv'] = solr_doc['all_text_tsimv'] = pdf_text + end + end + + private + + # rubocop:disable Metrics/MethodLength + def pdf_text + return unless resource.original_file.pdf? + return unless resource.original_file&.content.is_a? String + + begin + text = IO.popen(['pdftotext', '-', '-'], 'r+b') do |pdftotext| + pdftotext.write(resource.original_file.content) + pdftotext.close_write + pdftotext.read + end + + text.tr("\n", ' ') + .squeeze(' ') + .encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '') # remove non-UTF-8 characters + rescue Errno::ENOENT => e + raise e unless e.message.include?("No such file or directory - pdftotext") + Rails.logger.warn("`pdfinfo' is not installed; unable to extract text from the PDF's content") + end + end + # rubocop:enable Metrics/MethodLength + end + end +end diff --git a/config/initializers/hyrax.rb b/config/initializers/hyrax.rb index 01d5fe380..f45a40cb1 100644 --- a/config/initializers/hyrax.rb +++ b/config/initializers/hyrax.rb @@ -242,6 +242,8 @@ # essence a "super" method. original_translator = config.translate_id_to_uri config.translate_id_to_uri = ->(id) { original_translator.call(id.to_s) } + + config.file_set_indexer = Hyku::Indexers::FileSetIndexer end # rubocop:enable Metrics/BlockLength From b666d84b9b33d1fa5f7a41ac11be3c7ba565fc22 Mon Sep 17 00:00:00 2001 From: Kirk Wang Date: Tue, 17 Sep 2024 11:48:58 -0700 Subject: [PATCH 2/5] =?UTF-8?q?=F0=9F=8E=81=20Add=20logic=20for=20snippets?= =?UTF-8?q?=20when=20splitting=20PDFs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit will add logic to add the ability to see search snippets with PDFs that were split through IIIF Print. 
--- app/indexers/concerns/hyku_indexing.rb | 33 +++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/app/indexers/concerns/hyku_indexing.rb b/app/indexers/concerns/hyku_indexing.rb index 5747a01cb..51a7f2f64 100644 --- a/app/indexers/concerns/hyku_indexing.rb +++ b/app/indexers/concerns/hyku_indexing.rb @@ -22,7 +22,7 @@ module HykuIndexing solr_doc['valkyrie_bsi'] = object.kind_of?(Valkyrie::Resource) solr_doc['member_ids_ssim'] = object.member_ids.map(&:id) if object.kind_of?(Valkyrie::Resource) # TODO: Reinstate once valkyrie fileset work is complete - https://github.com/scientist-softserv/hykuup_knapsack/issues/34 - solr_doc['all_text_tsimv'] = full_text(Hyrax.custom_queries.find_child_file_sets(resource: resource).first.id.to_s) + solr_doc['all_text_tsimv'] = full_text(object) # rubocop:enable Style/ClassCheck solr_doc['title_ssim'] = SortTitle.new(object.title.first).alphabetical solr_doc['depositor_ssi'] = object.depositor @@ -38,12 +38,39 @@ module HykuIndexing private - def full_text(file_set_id) - return if !Flipflop.default_pdf_viewer? || file_set_id.blank? + def full_text(object) + child_works = Hyrax.custom_queries.find_child_works(resource: object) + + if child_works.empty? + extract_text_from_pdf_directly(object) + else + file_set_texts = child_works_file_sets(child_works).map { |fs| all_text(fs) }.select(&:present?) + if file_set_texts.join.blank? + extract_text_from_pdf_directly(object) + else + file_set_texts.join("\n---------------------------\n") + end + end + end + + def extract_text_from_pdf_directly(object) + file_set_id = Hyrax.custom_queries.find_child_file_sets(resource: object).first&.id&.to_s + return if file_set_id.blank? 
SolrDocument.find(file_set_id)['all_text_tsimv'] end + def child_works_file_sets(child_works) + child_works.map { |child_work| Hyrax.custom_queries.find_child_file_sets(resource: child_work) }.flatten + end + + def all_text(fs) + text = IiifPrint::Data::WorkDerivatives.data(from: fs, of_type: 'txt') || '' + return text if text.empty? + + text.tr("\n", ' ').squeeze(' ') + end + def add_date(solr_doc) date_string = solr_doc['date_created_tesim']&.first return unless date_string From 9aa21fc2726107ed0cbf73b793baaeaf4a2f4232 Mon Sep 17 00:00:00 2001 From: Kirk Wang Date: Wed, 18 Sep 2024 08:53:35 -0700 Subject: [PATCH 3/5] =?UTF-8?q?=E2=9C=85=20Add=20test=20for=20file=20set?= =?UTF-8?q?=20indexer=20logic?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit will add a simple test for the FileSetIndexer logic to check that the text extraction from a born digital pdf works as expected. --- spec/fixtures/pdf/pdf_sample.pdf | Bin 0 -> 13264 bytes .../hyku/indexers/file_set_indexer_spec.rb | 23 ++++++++++++++++++ 2 files changed, 23 insertions(+) create mode 100644 spec/fixtures/pdf/pdf_sample.pdf create mode 100644 spec/indexers/hyku/indexers/file_set_indexer_spec.rb diff --git a/spec/fixtures/pdf/pdf_sample.pdf b/spec/fixtures/pdf/pdf_sample.pdf new file mode 100644 index 0000000000000000000000000000000000000000..774c2ea70c55104973794121eae56bcad918da97 GIT binary patch literal 13264 zcmaibWmsIxvUW%|5FkJZ7A&~y%m9Oj;I6>~WPrgfxD$eVfZ*=#?hsspJHa(bATYRn zGueBev(G*EKHr+BrK+pDs^6;aH9u<6Dv3$30@ygwX}fZ|TDt1G($Rqw927PN=I8~c_R69-cY5S*jJE@5Wr0JUS6u!J~3#h`{ZMo=LkbbALoD8vfgB}Fh|2>mhOnfS$3 zNV5}8Ox=$fj;C0=UKy*{myZZPRVS|0mqr-HxZAy;()@wxQ}MN`QWAZTXb3Z&Om9W2 zbnA^OWoQbAW|3W^fw#J;YzDato8*`rHQs+@W70D&SyT{wb`SN*3nI z5G%$wJlq932=n{60Eii*9H8dFih2ks?QY=>nAFL=5g^P@#b{YUEHt0S$D7WbX zx%TzvzIK%zpvzLEd9LNr0ch#LFf_(9 zEGt0C9v~%b54vynAc{~;v&2?S(-sTTft@9CABMNFZHtY1W0-99CEbUNfp_yu{LDBz z@8z^$LPN$wX4Hi+dZQs6K3QiKKF0}Nme@EII;;F}IplC(YvT*C3-Oh#(A}e5pIz01 
zyR}D2|ftBF0T=1moHZy}$wS*PSCmSzHQ%x z2tCQQCx4jt7w1cuhY69~eH`31KC4)ZZJ^)f=IabocAkBPa zEeg25yPX&9-i_N(Qiq!I3RDrfx&0t^i)&MSQ1D(w%|%#LTNr>1cPiltAYO;6kBn(B?r11c^Bz~#)z5~~V+*`U)lDFtKbZ|;? z&4wTUtK=KE&uQIWUQv1mDE;LIhXXgx44PMa@%Z<7a& zx45^oYSnei^~%}`?!O-+cgfSmn_c?`=Gmm*Z^I(96ve&$zDs|)r84)IEEiE1kfQ$q zm3km*m1)PjdU9nkk9BTlidI1~M|O~WfP7AUu2T}d>5is9l$<%;7r2&Re06w>W$KM~ zqITBTd=Ln>^crw`_N?{ z;2d_=E0n!*NisQ|XYuX9q3+UcqdA(MC45|>2tz^c6HdZOmXTB?X2Elx@_0f)1z&-gS;UxN`>Ll-kWb0X0 zTrQis=w9sJ(q7k|@|k3SA~DJ@uMXP@4(Mgn+LJC+3F~3NHW71pIzY(aHg~{O+squi zWO_|F>78)L5*gcRXXRD9IzQ(ddSxh}E7(8sC~EYrOz$9BkSMBCkGGO9FuZ{#*mW+h zvwE7d)6Ag=a*R5URs>}qdqb_E6g)kN2Wel;pWe9=hZ)XvRZR!RQg&gxAPGj8J0!gR zrdV<2@MZQ?_Ocbd5@0zI?t>$z3eD80_h^{DI)H5lk`T4lbn8kteH3%fOBH^g26#lLN2&P^s zr&d05GDs)u_8OKzCgNxllk5pLC<2wKmghL{zW%}5^}%S$?d=3OzjaSzT3>uWYikZN z2ZcR7*L|%UMs|u)wMi7#vkN?cxlBcyAM80Tyzzv&zHMF1TH9?Mx5&E57P^)^zE5N| z^foq}!--if$Uj=U6Tc>EM!Pv)e^_SZSdvtQ=@>)(ONejQ!XW8u6>ESl<*s^6cH;Q1 z#n}nL{#|{l}}@td^zNSA;R{`3A&Jjr8L9(3^2FSyZ1W9$%;!XP#N2 z-SAzyRfxtgq^py7_3*GJFO%x_v<`xJ46`~S*IukgQDKfLxzFnS&GYL!1LA{I z!c#{A90{k(b*tUfbgjOH>}{#V;%^O+LUU<*#QkLtWzjho*Kb?Cr&wC38%wxpn}^Wy zG6EpV9x3xioCWA6H6=aE3)%jmZePu#Ji7wy0CmkDZNG`a{J1i-2`Bt&UrFb&<~V$^ zy9i`R1<35M&{mtCz144%v#7LKBTPPApjoV}#W-gDc5cn;A@Mbt#zXUK@J9^vj*ME( zo8(%K{c-KDr8n1-I&Mjn)*i|pF|7l*`fXvo8-z&j{$NOfUPM-xILbX1D29IHp|__B zL*JQ8*7-VrZVY*&$!PiE%zv@osg`qx0M8+w9iy7Az7;HYezs;5NRvrdNM~t@o}5Gc zjagk3Y_>6!Ct;ITqhu3FojJO^(^SG-($M4|frkp?4y-QoSmFcw9Z%(z?eC0kGi9@? 
zm(vAgXU|%!6_)CrnqYL-Hj@B5hA?#8C3G^cjd?0dMSZ!wbe%O4bWvlIG=nwOEInVj zhjzd`Bry8sXBTfIUr+juZH5JyE#7~UQiwR!gmG@wm}aNyo`13xEo)tzP64MWWG|j8 z8u8a2_=C2FdRZ9(eG&Au`@$mY9vvWldP-@wj5@38H0W2V8wnaQO?!)qoS_J=(ieoI zOvH}mkBRh_p1oTW66+?3u-GH2Ex~c=BQiwpJ zJlF7O2PBaCojRRL_mp44*Iq}vcRFpBD>V9M7do5{w&b;4^<_V~Vr{+O_&hz9k5Sm` zq3|%Z(6B5~wz2k0iH-QlafAa>1%ZebdxkR;6SdA?@dK|4Jf8PIO%64Fpw$6RYG2R# zX>Iq(xf`5Xk)79-@;BAQjlWu|w@Ss3sJv3Ew&%lBu-H?vYsC8XPJD!lkv*A~z_-k= zLOaM?B5}$Sf-KF5BWHoB51WFA{GlweQna618{*tqVn)YKUVq?khU_=QER9uW?N17xgAponbjg0W`=>f;sulH3?st)Y_@k$We2-__a>^{E78lUiI13qq!3# zwxMEl75MK1q`~J>ST#?`mUx#vr%-jwpZ+DV;W!0KNkZmO#sK)zt)H@`EQl6RRWhwb z0&E7|fG~@z)wlK1-RsxN#8Gr)D5=xpv=b}=CWPbwz@(9bIhD0Crd-Q>qEo>~Gh{X7 z77AK5>TfF0wK!?7Nx!<5uDy?D{Qg$SEc_R3J9EuH!Z@qmEJ*QRRHd3BPirM6783nv zAnab$>rhdDJ6pO@%Ox(}BYw{Ba<3|=A%Fg5_Hfxj{%CfzZCFO{?%h&=?%CNBvi&p; z(otqN>+5giLLa^*G?xzN30=IgQrV+r7dW4bX;zKtuD)O$UnwAKC?CpkPt{77nUArH ze-jKcCfRrOlp(Q^b&W}mrgt4n%wikNxeSBBE_n>K-IOIzi6!<)xGRYA)wGgqp^s@d46N#krDHPc#9SOgXhI7Vbj?B z%c6@8dCOGPYBoNE#3N7HD^ihbC9*xGm6chu;?fcuv)s01keHHZ1vXl5D;29O7wZBr zyPzyLZHKMtUI%PK+*X2zTFtaDzU1qn(H=hRRj-SoJw7I5i%4b0u=&InEAKgoae-lp zXk0SkjlJ52HruS*1QykTZ&aCN`PbcKuw$1st{peJ@&aF^aR@~{XA@L&YvK%+VU}G4 ze5iuesu&i6=*#nvHbm_v-ZLr5^Ij#|YSAper4XpsH;0x(2h1-tIobIy;0~2a( z!G($SB!iu#P;;hGeI~C`O=-3|d~zoB0!`*JrU-)Ko_X5#kSpy5o^z49RG;{j#l~45 zF?X9Ih4IdviT(8@+q|`BveLTprbESZ6^2I&ew|V3pDXRe9gSyXT)zzqKQ;gCD;p+( zM)2(;YJ%P5)X(N3ZSn>dn6UIcEcvQOXZBn}uD!7V0yXr$f+d@eTSYoquPit2S8cPW zA8t3dX)Cv{0cKF`@e|PP(xS0|z2_R0(P6)#+kC$0^5- z$7Hs|bOQanE z1oJ;uh(dYiDt}mVmtC3&HaGT6-dY429v#ySHJ7V)C8ow=PSmnEI)=b3_RJsU(S*+J zV$p3>RkK?DFvTc;(-T=h!1u~CP!pE=0eSSu#c@N7S0Z57CPg}!5z{QL#`2v?DJDt^ zCGN{0p-&&=)Sb28Xlo;ZXc^CGdwL9prf30uu$y5aPeWD6WIk4%%~DEhTiwOvy!rS% z&3z#DWo2qBA*=M2xIu=_R0sbrmP;Y?_rRa^k}3WYU6n9H^(})Zi-woMKKXfgbab@J zWx3DUr0MLpdDYk_LO8As}d*Z=x^K+uIv#T&SnY6&C$9 zBn1u`G#TBt+n5b%a;Cr0h^sm5Fl^OdxJ^8IebW);DWATq#Ba=#rggj*wNKy5NMzz& zBm`bk9bcSVPJbC`dHrI>o^=LSvTFpT`VAK`x_naOpvS~*l2$1vIk$avBA!|aeZ+7c 
z$_9Zzh>fc4$uX&w@-$VORCscG(B)OA@SPj>BNY3gxkkcPgNi9bE=?&3A4`3ekrdsb zn~`M;p8I>4?@@ZI{9Afv(tC@pp@Oe5BYUw-%&J_WaTBGls)&d8q?t$i<<@=_CNfH! z4H!ww7#gkp_^`bxZaJI9@C+A9x7@E1ZRoG5PL?w3GDi>`8Qq%I+0ygfT78%{Zt#mP zqX0CzaHKn@hAOQsv=^8UbfpuyFnT8Ht++Vmmx$~09!e{5t8fMkEjr~tfIxMlIpr4zGwvEIWKC2`Q#C)c7QF9wet?hE zLKoU?t@nqm=iBc` z8_((*(i(g}7z)3{%SJ!uya{?Ir-2^Fiap*VC4pF@N zpL5F*DG+(taLhdu4DbyAP(0&60n@%?G~hHugBI^-X6@_YOu}8UqwbQ8V`2vwDRLMz z)aRFo+r1f?5idT9xRF`cjgx$a-IpH3AH|bs$emw}d23*3aU0hYNh4(D0o-Z+wIX{d zeann?lzjgsAt62`er@<$`G755?i7tl%CHNgXp}#j>j&S1n5wZ;ofNbI>B2*4L1}@3 zq(LzPqn()w{KBsX!5*a&=dv<}t=R%II;TcQatbnKM7S4Q1PQIoT=^$#=>Y(m{mBYtl5W z6}|l4kxikOcJ`C3o{TSxIi?8|N6sH7Lkhq5qttl@uBTA|-cBluU$hU0&xYKvNidrL z4q>|j76}G1Db23Fa|XlFm%W&jW0h#7B$_FD-ZhqJ5#7i!0ZmCrereX z|Jlf`<1zR2akFe|boWv-r=}kM03o|%$mZA7Of2T99u~e56~6sh$P=yk9f!H6msn)n zvFOLF?W?iqi6fK9C)a42Sgt0kz4#M6 z-UY6451Er~=V;ITs1O-q*>}{;bs74MMZ(Z&=Z{5#q+i@cw^vI#0|Dh~-Dh-tn2I(S zTXXp-bLEG{p0#BbIqIcTM|DWZmr`&br8u)jQ`CR*^+g_fIX%=K+)x}F%Oak-Uh$6nIHUavnNV5M7YffU80QPRD%y>T{bIzn<6Rsy zb6cW6`?0EwSn;uJddPn@`?^Cry2s(6ccP1ykKr!kmDg2~zbTJq@+e(z5N>ZNr|8$j zPi-~ofp7E|Xx1#H+f@UR@AS}iLP!}}dRwf{u!avAq-_hNw#uaoOD{2jo*eRn8$~bDK`h1&ssOC6ekGV38+hU!KR z+kpnSzT;y#o|V2h|F?SY4-z1MFxz0;)@Lk`H>Cj zSl@fR%*@F79;HJcsX%L8_d!%TwmQyi$|n&C{oBMJ9~Xm!@@#lZdz(WB9SgJ#NIC%@ zy+~ZnI|4E`7f@W0Y9I@N7UTs1fTPD-ZiU%Lr2MnP+2h8AGh?(WGVf>h@W-_M>jRkD z(KNxvo(UJ7)o+*t%fCcM10;2XM$1NAFKwhp(c917^io_ynn-yv58IFIF*UJUw*2Ma zm?a-a1yp9B?WxpLzap-c^$HKkX_IfT_W8Lqaltl*A%vZSZWAe`Kv}vjz}>Tc;Hw9T zA+Nc49X&{WDmxY~ReV0YceXdL!$9mTL$Q@_vXIW6I{G=`$KR7jFcE&IsHwnKX;KldV#YL z(xwKAB5cFiz+r6m*5iJvo&E)XQqVWjmA}BfyVS&dm9&Y%$Sp^sW!JE3iI0v(kQHdo zmhWk|gC!e@CFKPv4BE*U;mYo0y}J0J-Fhu!c%v+paQf9+3Ed2EkfPt(D7|Ok#t)^PGr3Y)RGfvO=k;@Xry=Cf3fLCQ# zi`%oCt+vyB-t{iEgI&+2dczmnMXj>EOmSpMuuL8Ob`1$D;fc$wM6j2HH4Q$ zqaoj&M$2sLhpptdJMbs!krJId=iOd}HdP4Lt@yf42OZ{pOoQ4_gShz_sMoWYX}yQd zDQ8(tc7UvTt%`0#?9K!C^J>GpucEnBhnsWg102Z=uzOlwez^q^j7nV$krID#wC}A$ zcRfc2)T5Y~({6@1`{yL-Lzs;miT@C9|1SIFBMK7cz*E;v2H|EStZphjfb5mGMpw{q 
z!pl;Vw772tuvDH4o$;j4u8)@=m+&BIf4Ix(u75P?Q{4Y8^uvpq)mCW(enuQc)hx$B zOY{`_*%~bm%k*x6y;)D8_-yYbMsC8y#1H}89X;M=a#*HT>d*NFf}x$pQ&X?nFtvzA zKH|l8y;frsm|&}<%&*}Yu}Yn0M=Jy8qe%<1qXRR%Nut}Aqr+1pQS*D7Cp`+8Y`RO02p14DyVOmSYlEzZ;9&JzYhtybMZ%e4s zlks=V(+aJ!LK-()3ox`%9c)lx#3#y4{ulL6KpG|&>9`n?Uh#m3G-mZy-3h98Scyja zH^3Pb7?P z+2hAkyvg}g$#)n$Gs2fL19JNOZ|~>Nx(|}lmwesC!>?Y~72mpf4XZ8t^TIwbCk;i0 z+a2ymSZ^=OrtrSH!(y#Vn!8KWk#O7<1-!if+`dDDy18U7wS3k$lIeM}Z0fhYqI)+x zo*o4*S$S|hGf6vL>PaQ(OQ_%eskx-G-FV|dXHbTH<#w@RbeIx9I$d$xqHh`{*&d3y zevlYNk)}w@cuu4A$^DYJsOvO7VBaom@Rx@gb$V5IKJ{Xue16H-1H0j=U0brW-aVRG znWCQRkESBmD^4?a7mB@!jf2>(Hs=Bd-;XX1oEilevb9axB^NhIPLO>jl03S+Rw|fx z&oIsIk(~W!4$zzKF|uSR<@S#;{r;fKup)iDaxz_9JouroY>XHcrN(Mm@UHV?-8bCh zXGfY~7U`rCasv(h-R*ava)^ zF1`BMT*n3xQBTdM?`n&h2Ecf*XXuLo7Zyl_El(v~oh>}mK01$%0a@#uzyiX_g>Bav2XWwH%YekAxU%pBT!p*?%cS#zA zv;^eDC#KZP@7o=^GDc_V8<3w>`*L(+=A#(fcH)dGjqM}Vk_el+c>B`{9xm<>IZ-Zm zLL!-Yf*3nju_(8ZGUd9*K`iofWW+BYFnZF&+a|=yxqV?oUOcG#ulnSR$DMs|e5Tph%WW zVjzE3nMh7+rG!}av)+~;o$#+EHyPX zzOUO?^#)Jh*t^b7pTW+I%f;xy&JMPCO&5RR``BmHX-Mw{qoJp9BjKea$;A9%>-iEZ zvuUBm%0j5UWax~`ue!K6dDdip+zs3f{+qQKqH;9C(1Z@95()-Ew=`BdLh2VS3zI8qYGH&&7m9+vpUc+x8l!i-ATXKhw34XL2;ya_VIQz!OL^)8mtqnb?q=~&^h-$;Zn^HRZ2p(gH z39An;`AWT=i&VP0u&CUe7OYW51Icv=q%Vc7%Zm z_uAp9n}osEUdk2*pV)*i`WRSa-FWtCwGqS-75@K#V0)r;+0(0XVp9vnb7lWiMj!q= z>Zf(ioa@gSwA55Jil$lh)%4U<)$j@HTQU2KwuUUsZA*2O^QTKobak8g0Qb~ROMTW7 zfTF2yF*na6i(lQ*Nq^rPen^0>$$b`K!Kp{FVa-VF`kCiXZg0Vtr}i*rcpny_YOR!} z+?Jiv?dWlT`}o$s9Fxt%%684d7ek-q-Q~jS*I5+8HtvSw+Rp!D=+gVr!gqcYy9K74 z&eClx6f6{1Din;ynjz?XZlJ~W7^A@0wiHIt8$aou;f>MYpU%gUlDwAK*nX0#vHtyl z_C=B+ZkOffY|oR^2>(+IlZCTMFirZMhn>bqzR=38hvJpcM4-@gUYY7_k^G*FW9;5r zc9q4c>C?hd{uS3{MThN*(w!3e05e?bI#SNlo$U&%>((Dz0_JeqbG|}!wI$& z%q2JQ)Vas;i0RYqNXW!CC~QK%u$K$beGI zT2KuzMjus26(zmofK;m2gY%d*o~sHBKA#`RBNc9c*-GLmbgh?*9V;^TBSot2E%~Q5 zl+R!WA_h_JT;+irbJ#Z-tSy-;B^t&&dOSwPV(T!CB)no8Y4sP%k(MD^0P!NL1vK&7 z`3luW2$gkI#Zf>IZT2=m4R&e@d zeo#B=Q|9`w8}%|)f%GBjYO01&Dk5qjm$+#1yia#CE=Sh~88Vdp%|VU}0a6mF@JkhUY&~W3f#rHK-1Qdo 
z>0*z5?#-hQUY}k^X7~1bkI?($-~3#c3mF4Cl@2%|0@1=ARZ z^qlNaN63&>;O_~mmto}?tAhznb}p;GpyIq1Z^yf<_6Ui~cpbbP;uV7W!+ke>wYG-f zPPz2~%UgSs(>vsKFle%uo=WIDYz;BR!doAy)aQ0QCpE_Wz1XK+3Kpr=V_H8w zqzaizn9ALx#?fo-N)_CtENYH*1|ID|x=xa9d#;9~1Wgrcx^8=evrfky*Xj`269~A;kh^O|ewZnM}=SmM7NX=?h#jjLh&1kIT+A z)If4luYo@s+e_L&eRJ$gw1`)>u#efOq=M0iYIPS$GII0z`T56eNxK@~Y%*^~Q&w$1b)jM9Z~kuRc~YX`6r#ySCskW5cq|#a39s;ZiaL~OdEpgu z1k*sKkLZ&?6fAi=)77yKI1xii%)@DG8r}663xkJcwLTj?s`h{GP@_2}`A|;w7zrzk4QOQ*O$(e|M^<`vLD*1^i>Nr*= z+A`y@f{!zLi)ys9OrFM5`Qw0292Ciyq>zC>8(TkG1O;#UUh?#I08kuwpS_vhufJ0v&p^Yr`=^WG7!qVG(8n9u7=J64fr zQq7B|9rzl7s)I_|8UeVp?=cqGILQ}0O(n+^vJz=vFBU9JmG$=DWzi+qCHw@D0a7`M zA`%pmU8+8W{u0{2*^tg&3;I&i`4`{YJe_n8 z{viTJZL?$}#l9w${3mydrW>Z%nY!WXf$HJv5$Zw4F%7^mXWsZ-s&olv31;C*KlH)j z?j?Eika^cI`l>)WJ*ga?%>0HwJm{%<)OP8pdvwMG@fm;Ca`jfy7ixY-sic42*f&ld zJg3(O0~;=Zsp@cdUj@&Zj~#~LX=F5Ws@!Ik0-~(wlbJO6&)S~s6WrAW9lrQ%6+S03 z&P&xJ{;BC%2s%J#uxZy3=Fc}fkwE9(T}QAK9b{FT!L3^PQ~;#X$T|9v&JFq)ru$h|ls zvPxYyWT}V&Dol3#)t6pVE4nIClEq=r++eGcG-tkOW4{n$Ra~3z?`@_gXRUiR`SrhY4K z#>C+t>pNtm>!Zw*;p^qI0|g<)Ob`r0jaN6asw2ZGLT}bMbHnQ$OH8cR7{Rq?=4%&x z2Qe&O`w$~b%fuo>fkgT`PVx=uto@&SdDpIXL)<da|A*x(b?o zdUj^iN+B9%;2{1URo7=%m@r*RJi3fQNO_`AZY;b#tClm;A}NQF#!Y;pMMdh=^fO@9 z>J>Xv^joKJM>M7x=xh!oSLO3JlxVwTn$DPHdGsnkAvB)9d)IE6ZHgd1vd+Z;W1d682CBy4zti z&6;T6!rzSKIy&zKKfAx9J%7q-=Mac{u-_GIYEaZt*`h25Ne?ch`E_c2{pGA<;nVkx z102u6#||N$g5MhA{!rFwaI(;8$S{1DePGc^L~j6?Q$2QMIO09 zPdma#_kX(|;oOau(pX877ac9V4O8x3g{Mdbr6oS)7 zN0v#H_j!bhUNl;q>GrkeA~){;lCg@&Mg5(z%E1HV`d7{>_}@9JZ(VJn>=HKC4q{My zLpw8D2OD@&E}T?=SV7rE-XI?4H+E(aOI8sZOC$NW=!leE6MG6ycn2;fB4XpB!^#Z= zQ?P=-+!R0#4h{+c2LPbUF6{uZG&6i-ZDI+f;6P`8V{ZtxcA((p;6i6ds6r4x005m` z6k;m{H8U}FK+J;+syaZe)G2u2J;eI(G+`)^0+C~@0#BIzJLi_?-}e8NR15?I|34|k zx>2LneiYApj|7nW4k1sp9h-vz^G);Jq7ONB*clw!(IJ2QT3sYWS)>yb_Ual2Um3r5 zw706UJD48HLY73$&Gm=sl|EYND&Uk>VT!eN_p49f6HS<{TU>u{4&#WYh1dwy^E8il ziH`_=$2m8k)y$Q2yDZQluP+AZbND!Yi7Co@fwHnw2pV1bo*=wGx2n7Urt$y1@imz1&#&nK47Nw zT-dLY@^1NHY?5B#-Qf9?`lA_={@NnLpmwJGQG7&oU}0>) 
ziZ`GdjY(jIKi2Q?e+d=de}nq3pkP;ZG;lyf$Xh!{=x?qF#2$)p%>NM^W_I=tqNWf# zgv;e1fAtY=)-W@2FtyhKb8%3Bfj|mw00#vR4=)857d&XdU z(4fLD4>dA_AWjHkeJ)-u3LZ|NF1w_ijiW6*A6^xXD#Y5}7O{k(E4!#F{9rhl8A4Sg zMcAb&9N>rx39*a9v4(4~r$8jq|MLt0{*hTPYU2nu0sub&aQG~$!9>qU@%LGVw1{ZAdD5crj3WAdl2KV62-uIT7sX=aUZ*>8aV1F3(c z_P=p-FtxG!8!9*^U<3>RcoByeFaipAK|lhB5)AqaI)n^@hmeEwxOw0OKK@%C0pZ{C z5o^F{FbEE(DEt!$_$B<8DlYiaV7ME855ql#Py+_S#o(c8`L;d6lqRR~$cn(zq-4};(pf)4`xt=`PWS`7YO27?$MdgtpDP{`vCa4 z{2x3Z5bm@8-~oUj5Zv+q!Gl}N`CoDX0N4M*gTIpgb1nb?;)Y)s|FIqb0Ot6gw!m#h zTnhg~j+YZ2)c?r?0yzIm4hZ1=FTFrc;D6}=a`OJeW(PY6{AFi{I1;L6ZcsR+>?$@k z@FNVDLEL!K*2XpzfZwk|I3Y%%Lm?mm76XGtKw?0k2(JV$kO#;s#>p!o!6gRf5#f;l j@(7{-|3%=32kuUL2Z)`+Z(jm{U>-0!Ev>ks1p5C2Hj`#V literal 0 HcmV?d00001 diff --git a/spec/indexers/hyku/indexers/file_set_indexer_spec.rb b/spec/indexers/hyku/indexers/file_set_indexer_spec.rb new file mode 100644 index 000000000..5d0fef968 --- /dev/null +++ b/spec/indexers/hyku/indexers/file_set_indexer_spec.rb @@ -0,0 +1,23 @@ +# frozen_string_literal: true + +RSpec.describe Hyku::Indexers::FileSetIndexer do + let(:indexer_class) { described_class } + let(:resource) { Hyrax.config.file_set_model.constantize.new } + let(:original_file) { Hyrax::FileMetadata.new } + + it 'is the configured file set indexer' do + expect(Hyrax.config.file_set_indexer).to eq described_class + end + + describe '#to_solr' do + let(:stream) { File.open('spec/fixtures/pdf/pdf_sample.pdf').read } + it 'indexes the text of a pdf that has text already' do + allow(Flipflop).to receive(:default_pdf_viewer?).and_return(true) + allow(resource).to receive(:original_file).and_return(original_file) + allow(original_file).to receive(:pdf?).and_return(true) + allow(original_file).to receive(:content).and_return(stream) + + expect(resource.to_solr['all_text_tsimv']).to include('Dummy PDF file') + end + end +end From 685266d84bf3548e436a9503629d4eca72b1cb4c Mon Sep 17 00:00:00 2001 From: Shana Moore Date: Wed, 18 Sep 2024 15:30:08 -0700 Subject: [PATCH 4/5] 
Update app/indexers/concerns/hyku_indexing.rb --- app/indexers/concerns/hyku_indexing.rb | 1 - 1 file changed, 1 deletion(-) diff --git a/app/indexers/concerns/hyku_indexing.rb b/app/indexers/concerns/hyku_indexing.rb index 51a7f2f64..7fe0d2b8e 100644 --- a/app/indexers/concerns/hyku_indexing.rb +++ b/app/indexers/concerns/hyku_indexing.rb @@ -21,7 +21,6 @@ module HykuIndexing solr_doc['account_institution_name_ssim'] = Site.instance.institution_label solr_doc['valkyrie_bsi'] = object.kind_of?(Valkyrie::Resource) solr_doc['member_ids_ssim'] = object.member_ids.map(&:id) if object.kind_of?(Valkyrie::Resource) - # TODO: Reinstate once valkyrie fileset work is complete - https://github.com/scientist-softserv/hykuup_knapsack/issues/34 solr_doc['all_text_tsimv'] = full_text(object) # rubocop:enable Style/ClassCheck solr_doc['title_ssim'] = SortTitle.new(object.title.first).alphabetical From 617bc248e0c66e638cba5ab625b39cdcb891cf9b Mon Sep 17 00:00:00 2001 From: Kirk Wang Date: Wed, 18 Sep 2024 18:23:46 -0700 Subject: [PATCH 5/5] =?UTF-8?q?=E2=9C=85=20Fix=20indexer=20specs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit will rename the #full_text method to #extract_full_text because it was causing weird issues with super. 
--- app/indexers/concerns/hyku_indexing.rb | 4 ++-- app/indexers/hyku/indexers/file_set_indexer.rb | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/app/indexers/concerns/hyku_indexing.rb b/app/indexers/concerns/hyku_indexing.rb index 7fe0d2b8e..17073eb6e 100644 --- a/app/indexers/concerns/hyku_indexing.rb +++ b/app/indexers/concerns/hyku_indexing.rb @@ -21,7 +21,7 @@ module HykuIndexing solr_doc['account_institution_name_ssim'] = Site.instance.institution_label solr_doc['valkyrie_bsi'] = object.kind_of?(Valkyrie::Resource) solr_doc['member_ids_ssim'] = object.member_ids.map(&:id) if object.kind_of?(Valkyrie::Resource) - solr_doc['all_text_tsimv'] = full_text(object) + solr_doc['all_text_tsimv'] = extract_full_text(object) # rubocop:enable Style/ClassCheck solr_doc['title_ssim'] = SortTitle.new(object.title.first).alphabetical solr_doc['depositor_ssi'] = object.depositor @@ -37,7 +37,7 @@ module HykuIndexing private - def full_text(object) + def extract_full_text(object) child_works = Hyrax.custom_queries.find_child_works(resource: object) if child_works.empty? diff --git a/app/indexers/hyku/indexers/file_set_indexer.rb b/app/indexers/hyku/indexers/file_set_indexer.rb index a7cd8f319..febbb2dd7 100644 --- a/app/indexers/hyku/indexers/file_set_indexer.rb +++ b/app/indexers/hyku/indexers/file_set_indexer.rb @@ -18,7 +18,7 @@ def to_solr # rubocop:disable Metrics/MethodLength def pdf_text - return unless resource.original_file.pdf? + return unless resource.original_file&.pdf? return unless resource.original_file&.content.is_a? String begin