From 699a4e26d6872c83dc08f148824ba7d37561cbdc Mon Sep 17 00:00:00 2001 From: Carolyn Cole Date: Thu, 29 Feb 2024 11:27:11 -0500 Subject: [PATCH] Allowing sort by year and first author The sort field for year was not defined by the PDC indexer, which caused the sort to be random. The Author sort was sorting by the alphabetically first of any of the authors, which made the sort seem random. fixes #572 --- app/controllers/catalog_controller.rb | 6 +- app/lib/date_normalizer.rb | 6 +- lib/traject/dataspace_research_data_config.rb | 2 +- lib/traject/pdc_describe_indexing_config.rb | 20 ++++++- spec/fixtures/files/pdc_describe_data/89.json | 30 ++++------ spec/fixtures/files/sowing_the_seeds.json | 58 +++++++++++-------- spec/lib/date_normalizer_spec.rb | 2 + spec/lib/describe_indexer_spec.rb | 29 ++++++++++ 8 files changed, 105 insertions(+), 48 deletions(-) diff --git a/app/controllers/catalog_controller.rb b/app/controllers/catalog_controller.rb index b378fb67..81b8a35d 100644 --- a/app/controllers/catalog_controller.rb +++ b/app/controllers/catalog_controller.rb @@ -233,10 +233,10 @@ def retry_on_exception # whether the sort is ascending or descending (it must be asc or desc # except in the relevancy case). Add the sort: option to configure a # custom Blacklight url parameter value separate from the Solr sort fields. 
- config.add_sort_field 'relevance', sort: 'score desc, year_available_itsi desc, title_si asc', label: 'relevance' - config.add_sort_field 'year', sort: 'year_available_itsi desc, title_si asc', label: 'year' + config.add_sort_field 'relevance', sort: 'score desc, issue_date_strict_ssi desc, title_si asc', label: 'relevance' + config.add_sort_field 'year', sort: 'issue_date_strict_ssi desc, title_si asc', label: 'year' config.add_sort_field 'author', sort: 'author_si asc, title_si asc', label: 'author' - config.add_sort_field 'title', sort: 'title_si asc, year_available_itsi desc', label: 'title' + config.add_sort_field 'title', sort: 'title_si asc, issue_date_strict_ssi desc', label: 'title' # If there are more than this many search results, no spelling ("did you # mean") suggestion is offered. diff --git a/app/lib/date_normalizer.rb b/app/lib/date_normalizer.rb index 2c55b96f..332d8f6a 100644 --- a/app/lib/date_normalizer.rb +++ b/app/lib/date_normalizer.rb @@ -49,11 +49,15 @@ def self.year_from_date(date_string) Date.strptime(date_string).strftime('%Y').to_i elsif date_string.match?(/\d{4}-\d{2}/) Date.strptime(date_string, '%Y-%m').strftime('%Y').to_i - else + elsif date_string.match?(/^\d{4}/) && date_string.size == 4 date_string.to_i + else + time = Time.zone.parse(date_string) + time.year end rescue ArgumentError # bad formatted date + Rails.logger.warn("Error parsing date #{date_string}") nil end end diff --git a/lib/traject/dataspace_research_data_config.rb b/lib/traject/dataspace_research_data_config.rb index f995cfa3..bf1e3ee9 100644 --- a/lib/traject/dataspace_research_data_config.rb +++ b/lib/traject/dataspace_research_data_config.rb @@ -114,7 +114,7 @@ # single value is used for sorting to_field 'author_si' do |record, accumulator, _c| values = record.xpath("/item/metadata/key[text()='dc.contributor.author']/../value").map(&:text) - accumulator.concat [values.uniq.sort.first] + accumulator.concat [values.first] end # all values as strings for 
faceting diff --git a/lib/traject/pdc_describe_indexing_config.rb b/lib/traject/pdc_describe_indexing_config.rb index 5d410921..229bf473 100644 --- a/lib/traject/pdc_describe_indexing_config.rb +++ b/lib/traject/pdc_describe_indexing_config.rb @@ -89,7 +89,7 @@ # single value is used for sorting to_field 'author_si' do |record, accumulator, _c| author_names = record.xpath("/hash/resource/creators/creator/value").map(&:text) - accumulator.concat [author_names.uniq.sort.first] + accumulator.concat [author_names.first] end # all values as strings for faceting @@ -146,6 +146,24 @@ to_field 'pdc_created_at_dtsi', extract_xpath('/hash/created-at') +to_field "issue_date_strict_ssi" do |record, accumulator, _context| + migrated = record.xpath("/hash/resource/migrated/text()").to_s + date = if migrated == true + pub_year = record.xpath("/hash/resource/publication-year/text()").to_s + "#{pub_year}-01-01" + else + date_value = record.xpath("/hash/created-at/text()").to_s + begin + DateTime.parse(date_value).strftime('%Y-%m-%d') + rescue + nil + end + end + if date + accumulator.concat [date] + end +end + to_field 'pdc_updated_at_dtsi', extract_xpath('/hash/updated-at') # ================== diff --git a/spec/fixtures/files/pdc_describe_data/89.json b/spec/fixtures/files/pdc_describe_data/89.json index 9175ba6e..899a92f8 100644 --- a/spec/fixtures/files/pdc_describe_data/89.json +++ b/spec/fixtures/files/pdc_describe_data/89.json @@ -73,7 +73,7 @@ "resource_type": "Dataset", "resource_type_general": "Dataset", "publisher": "Princeton Plasma Physics Laboratory, Princeton University", - "publication_year": "2022", + "publication_year": "2023", "ark": "ark:/88435/dsp01wh246w38h", "doi": "10.34770/bm4s-t361", "rights_many": [ @@ -150,13 +150,7 @@ "errors": [] } ], - "keywords": [ - "HHFW", - "3D RF modeling", - "Petra-M", - "fast wave", - "NSTX-U" - ], + "keywords": ["HHFW", "3D RF modeling", "Petra-M", "fast wave", "NSTX-U"], "contributors": [], "organizational_contributors": [], 
"funders": [ @@ -167,15 +161,12 @@ "award_uri": "" } ], - "domains": [ - "Natural Sciences" - ], - "communities": [ - "Princeton Plasma Physics Laboratory" - ], + "domains": ["Natural Sciences"], + "communities": ["Princeton Plasma Physics Laboratory"], "subcommunities": [ "NSTX-U", - "Spherical Torus" + "Spherical Torus", + "Tokamak Experimental Sciences" ], "migrated": true }, @@ -321,9 +312,12 @@ ], "group": { "title": "Princeton Plasma Physics Lab (PPPL)", - "description": null, + "description": "", "code": "PPPL", "created_at": "2022-04-28T16:30:00.195-04:00", - "updated_at": "2023-05-18T14:20:45.272-04:00" - } + "updated_at": "2024-02-16T09:06:20.108-05:00" + }, + "embargo_date": null, + "created_at": "2023-08-18T13:47:14Z", + "updated_at": "2024-02-23T11:05:11Z" } \ No newline at end of file diff --git a/spec/fixtures/files/sowing_the_seeds.json b/spec/fixtures/files/sowing_the_seeds.json index 35937a9e..0f4fee3f 100644 --- a/spec/fixtures/files/sowing_the_seeds.json +++ b/spec/fixtures/files/sowing_the_seeds.json @@ -6,9 +6,18 @@ "title_type": null } ], - "description": "In 2017, seven members of the Archive-It Mid-Atlantic Users Group (AITMA) conducted a study of 14 subjects representative of their stakeholder populations to assess the usability of Archive-It, a web archiving subscription service of the Internet Archive. While Archive-It is the most widely-used tool for web archiving, little is known about how users interact with the service.This study intended to teach us what users expect from web archives, which exist as another form of archival material. End-user subjects executed four search tasks using the public Archive-It interface and the Wayback Machine to access archived information on websites from the facilitators’ own harvested collections and provide feedback about their experiences. 
The tasks were designed to have straightforward pass or fail outcomes,\r\n and the facilitators took notes on the subjects’ behavior and commentary during the sessions.Overall, participants reported mildly positive impressions of Archive-It public user interface based on their session. The study identified several key areas of improvement for the Archive-It service pertaining to metadata options, terminology display, indexing of dates, and the site’s search box.\r\n-\r\nDownload the README.txt for a detailed description of this dataset's content.", + "description": "In 2017, seven members of the Archive-It Mid-Atlantic Users Group (AITMA) conducted a study of 14 subjects representative of their stakeholder populations to assess the usability of Archive-It, a web archiving subscription service of the Internet Archive. While Archive-It is the most widely-used tool for web archiving, little is known about how users interact with the service. This study intended to teach us what users expect from web archives, which exist as another form of archival material. End-user subjects executed four search tasks using the public Archive-It interface and the Wayback Machine to access archived information on websites from the facilitators' own harvested collections and provide feedback about their experiences. The tasks were designed to have straightforward pass or fail outcomes,\r\nand the facilitators took notes on the subjects' behavior and commentary during the sessions. Overall, participants reported mildly positive impressions of Archive-It public user interface based on their session. 
The study identified several key areas of improvement for the Archive-It service pertaining to metadata options, terminology display, indexing of dates, and the site's search box.\r\n\r\nDownload the README.txt for a detailed description of this dataset's content.", "collection_tags": [], "creators": [ + { + "value": "Abrams, Samantha", + "name_type": "Personal", + "given_name": "Samantha", + "family_name": "Abrams", + "identifier": null, + "affiliations": [], + "sequence": 0 + }, { "value": "Antracoli, Alexis", "name_type": "Personal", @@ -62,40 +71,41 @@ "identifier": null, "affiliations": [], "sequence": 6 - }, - { - "value": "Abrams, Samantha", - "name_type": "Personal", - "given_name": "Samantha", - "family_name": "Abrams", - "identifier": null, - "affiliations": [], - "sequence": 7 } ], "resource_type": "Dataset", - "resource_type_general": "", + "resource_type_general": "Dataset", "publisher": "Princeton University", "publication_year": "2023", - "ark": null, - "doi": "10.34770/doc-1", - "rights": { - "identifier": "GPLv3", - "uri": "https://www.gnu.org/licenses/gpl-3.0.en.html", - "name": "GNU General Public License" - }, + "ark": "ark:/88435/dsp01d791sj97j", + "doi": "10.34770/00yp-2w12", + "rights_many": [ + { + "identifier": "CC BY", + "uri": "https://creativecommons.org/licenses/by/4.0/", + "name": "Creative Commons Attribution 4.0 International" + } + ], "version_number": "1", "related_objects": [], "keywords": [], "contributors": [], - "funders": [] + "organizational_contributors": [], + "funders": [], + "domains": [], + "communities": [], + "subcommunities": [], + "migrated": true }, "files": [], "group": { - "title": "Research Data", - "description": null, + "title": "Princeton Research Data Service (PRDS)", + "description": "", "code": "RD", - "created_at": "2023-01-05T11:26:07.005-05:00", - "updated_at": "2023-01-05T11:26:07.005-05:00" - } + "created_at": "2022-04-28T16:30:00.190-04:00", + "updated_at": "2024-01-23T11:29:40.724-05:00" + }, + 
"embargo_date": null, + "created_at": "2023-07-11T11:06:10Z", + "updated_at": "2023-09-13T08:23:50Z" } diff --git a/spec/lib/date_normalizer_spec.rb b/spec/lib/date_normalizer_spec.rb index 09a8f473..56c4d754 100644 --- a/spec/lib/date_normalizer_spec.rb +++ b/spec/lib/date_normalizer_spec.rb @@ -4,6 +4,7 @@ let(:years) { ['2015'] } let(:months_and_years) { ['2015-08'] } let(:timestamps) { ['2015-08-18T18:14:22Z'] } + let(:month_year_name) { ['August 2020'] } describe "#format_array_for_display" do it "formats four digit years" do @@ -27,6 +28,7 @@ expect(described_class.years_from_dates(timestamps)).to eq [2015] expect(described_class.years_from_dates(months_and_years)).to eq [2015] expect(described_class.years_from_dates(years)).to eq [2015] + expect(described_class.years_from_dates(month_year_name)).to eq [2020] end it "handles bad dates" do diff --git a/spec/lib/describe_indexer_spec.rb b/spec/lib/describe_indexer_spec.rb index bd864b39..647ccf9e 100644 --- a/spec/lib/describe_indexer_spec.rb +++ b/spec/lib/describe_indexer_spec.rb @@ -189,6 +189,35 @@ expect(response["response"]["numFound"]).to eq 2 end + it "can sort by issue_date_strict_ssi" do + Rails.configuration.pdc_discovery.index_pdc_describe = true + indexer.index + response = Blacklight.default_index.connection.get 'select', params: { q: '*:*', sort: 'issue_date_strict_ssi desc' } + expect(response["response"]["numFound"]).to eq 2 + expect(response["response"]['docs'].first['pdc_created_at_dtsi']).to eq("2023-07-11T11:06:10Z") + expect(response["response"]['docs'].last['pdc_created_at_dtsi']).to eq("2021-12-31T19:00:00Z") + end + + context "works with multiple creators" do + let(:pppl1) { File.read(File.join(fixture_path, 'files', 'pppl1.json')) } + let(:pppl2) { File.read(File.join(fixture_path, 'files', 'pppl2.json')) } + before do + stub_request(:get, "https://pdc-describe-prod.princeton.edu/describe/works/6.json") + .to_return(status: 200, body: pppl1, headers: {}) + stub_request(:get, 
"https://pdc-describe-prod.princeton.edu/describe/works/20.json") + .to_return(status: 200, body: pppl2, headers: {}) + end + + it "can sort by the first author" do + Rails.configuration.pdc_discovery.index_pdc_describe = true + indexer.index + response = Blacklight.default_index.connection.get 'select', params: { q: '*:*', sort: 'author_si desc' } + expect(response["response"]["numFound"]).to eq 2 + expect(response["response"]['docs'].first['author_tesim'].first).to eq("Wang, Yin") + expect(response["response"]['docs'].last['author_tesim'].first).to eq("Schwartz, Jacob A.") + end + end + context "when there are items which are under active embargo" do let(:item_file_fixture) { file_fixture("pdc_describe_active_embargo.json") } let(:embargo_resource) { item_file_fixture.read }