Allowing sort by year and first author

The sort field for year was not defined by the PDC indexer, which caused the sort to be random. The author sort was sorting by the alphabetically first of any of the authors, which made the sort seem random. Fixes #572.
pulibrary · Mar 1, 2024 · 8d631ed · 8d631ed
1 parent 63019ce
commit 8d631ed
Show file tree

Hide file tree

Showing 8 changed files with 181 additions and 48 deletions.
diff --git a/app/controllers/catalog_controller.rb b/app/controllers/catalog_controller.rb
@@ -233,10 +233,10 @@ def retry_on_exception
     # whether the sort is ascending or descending (it must be asc or desc
     # except in the relevancy case). Add the sort: option to configure a
     # custom Blacklight url parameter value separate from the Solr sort fields.
-    config.add_sort_field 'relevance', sort: 'score desc, year_available_itsi desc, title_si asc', label: 'relevance'
-    config.add_sort_field 'year', sort: 'year_available_itsi desc, title_si asc', label: 'year'
+    config.add_sort_field 'relevance', sort: 'score desc, date_available_ssi desc, title_si asc', label: 'relevance'
+    config.add_sort_field 'year', sort: 'date_available_ssi desc, pdc_created_at_dtsi desc, title_si asc', label: 'year'
     config.add_sort_field 'author', sort: 'author_si asc, title_si asc', label: 'author'
-    config.add_sort_field 'title', sort: 'title_si asc, year_available_itsi desc', label: 'title'
+    config.add_sort_field 'title', sort: 'title_si asc, date_available_ssi desc', label: 'title'
 
     # If there are more than this many search results, no spelling ("did you
     # mean") suggestion is offered.

diff --git a/app/lib/date_normalizer.rb b/app/lib/date_normalizer.rb
@@ -20,6 +20,31 @@ def self.format_string_for_display(date_string)
     end
   end
 
+  ##
+  # @param [<String>] date_strings
+  # @return [<String>] An array of strings formatted for sorting
+  def self.format_array_for_sorting(date_strings)
+    date_strings.map { |x| format_string_for_sorting(x) }.compact
+  end
+
+  def self.format_string_for_sorting(date_string)
+    if date_string.match?(/\d{4}-\d{2}-\d{2}/)
+      Date.strptime(date_string).strftime('%Y-%m-%d')
+    elsif date_string.match?(/\d{4}-\d{2}/)
+      Date.strptime(date_string, '%Y-%m').strftime('%Y-%m-%d')
+    elsif date_string.match?(/^\d{4}/) && date_string.size == 4
+      "#{date_string}-01-01"
+    else
+      begin
+        time = Time.zone.parse(date_string)
+        time.strftime('%Y-%m-%d')
+      rescue ArgumentError
+        Rails.logger.warn("Error parsing date #{date_string}")
+        nil
+      end
+    end
+  end
+
   def self.strict_dates(date_strings)
     date_strings.map { |date| strict_date(date) }.compact.sort
   end
@@ -49,11 +74,15 @@ def self.year_from_date(date_string)
       Date.strptime(date_string).strftime('%Y').to_i
     elsif date_string.match?(/\d{4}-\d{2}/)
       Date.strptime(date_string, '%Y-%m').strftime('%Y').to_i
-    else
+    elsif date_string.match?(/^\d{4}/) && date_string.size == 4
       date_string.to_i
+    else
+      time = Time.zone.parse(date_string)
+      time.year
     end
   rescue ArgumentError
     # bad formatted date
+    Rails.logger.warn("Error parsing date #{date_string}")
     nil
   end
 end
diff --git a/lib/traject/dataspace_research_data_config.rb b/lib/traject/dataspace_research_data_config.rb
@@ -114,7 +114,7 @@
 # single value is used for sorting
 to_field 'author_si' do |record, accumulator, _c|
   values = record.xpath("/item/metadata/key[text()='dc.contributor.author']/../value").map(&:text)
-  accumulator.concat [values.uniq.sort.first]
+  accumulator.concat [values.first]
 end
 
 # all values as strings for faceting
@@ -194,6 +194,11 @@
   accumulator.concat DateNormalizer.format_array_for_display(dates)
 end
 
+to_field "date_available_ssi" do |record, accumulator, _context|
+  dates = record.xpath("/item/metadata/key[text()='dc.date.available']/../value").map(&:text)
+  accumulator.concat [DateNormalizer.format_array_for_sorting(dates).first]
+end
+
 to_field "year_available_itsi" do |record, accumulator, _context|
   dates = record.xpath("/item/metadata/key[text()='dc.date.available']/../value").map(&:text)
   accumulator.concat [DateNormalizer.years_from_dates(dates).first]
@@ -234,6 +239,17 @@
   accumulator.concat DateNormalizer.format_array_for_display(issue_dates)
 end
 
+# Add a singular field for sorting
+to_field "issue_date_si" do |record, accumulator, _context|
+  issue_dates = record.xpath("/item/metadata/key[text()='dc.date.issued']/../value").map(&:text)
+  accumulator.concat DateNormalizer.format_array_for_display(issue_dates)
+end
+
+to_field "issue_date_si" do |record, accumulator, _context|
+  issue_dates = record.xpath("/item/metadata/key[text()='dcterms.issued']/../value").map(&:text)
+  accumulator.concat DateNormalizer.format_array_for_display(issue_dates)
+end
+
 # Date in yyyy-mm-dd format so we can sort by it
 to_field "issue_date_strict_ssi" do |record, accumulator, _context|
   dates = record.xpath("/item/metadata/key[text()='dc.date.issued']/../value").map(&:text)

diff --git a/lib/traject/pdc_describe_indexing_config.rb b/lib/traject/pdc_describe_indexing_config.rb
@@ -89,7 +89,7 @@
 # single value is used for sorting
 to_field 'author_si' do |record, accumulator, _c|
   author_names = record.xpath("/hash/resource/creators/creator/value").map(&:text)
-  accumulator.concat [author_names.uniq.sort.first]
+  accumulator.concat [author_names.first]
 end
 
 # all values as strings for faceting
@@ -144,8 +144,39 @@
 
 to_field 'issue_date_ssim', extract_xpath("/hash/resource/publication-year")
 
+to_field 'issue_date_si', extract_xpath("/hash/resource/publication-year")
+
 to_field 'pdc_created_at_dtsi', extract_xpath('/hash/created-at')
 
+to_field "date_available_ssi" do |record, accumulator, _context|
+  date_value = record.xpath("/hash/created-at/text()").to_s
+  date = begin
+          DateTime.parse(date_value)
+         rescue
+           nil
+        end
+  if date
+    accumulator.concat [date.strftime('%Y-%m-%d')]
+  end
+end
+
+to_field "year_available_itsi" do |record, accumulator, _context|
+  year_value = record.xpath("/hash/resource/publication-year/text()")
+  if year_value.present?
+    accumulator.concat [year_value]
+  else
+    date_value = record.xpath("/hash/created-at/text()").to_s
+    date = begin
+            DateTime.parse(date_value)
+           rescue
+             nil
+          end
+    if date
+      accumulator.concat [date.year]
+    end
+  end
+end
+
 to_field 'pdc_updated_at_dtsi', extract_xpath('/hash/updated-at')
 
 # ==================

diff --git a/spec/fixtures/files/pdc_describe_data/89.json b/spec/fixtures/files/pdc_describe_data/89.json
@@ -73,7 +73,7 @@
         "resource_type": "Dataset",
         "resource_type_general": "Dataset",
         "publisher": "Princeton Plasma Physics Laboratory, Princeton University",
-        "publication_year": "2022",
+        "publication_year": "2023",
         "ark": "ark:/88435/dsp01wh246w38h",
         "doi": "10.34770/bm4s-t361",
         "rights_many": [
@@ -150,13 +150,7 @@
                 "errors": []
             }
         ],
-        "keywords": [
-            "HHFW",
-            "3D RF modeling",
-            "Petra-M",
-            "fast wave",
-            "NSTX-U"
-        ],
+        "keywords": ["HHFW", "3D RF modeling", "Petra-M", "fast wave", "NSTX-U"],
         "contributors": [],
         "organizational_contributors": [],
         "funders": [
@@ -167,15 +161,12 @@
                 "award_uri": ""
             }
         ],
-        "domains": [
-            "Natural Sciences"
-        ],
-        "communities": [
-            "Princeton Plasma Physics Laboratory"
-        ],
+        "domains": ["Natural Sciences"],
+        "communities": ["Princeton Plasma Physics Laboratory"],
         "subcommunities": [
             "NSTX-U",
-            "Spherical Torus"
+            "Spherical Torus",
+            "Tokamak Experimental Sciences"
         ],
         "migrated": true
     },
@@ -321,9 +312,12 @@
     ],
     "group": {
         "title": "Princeton Plasma Physics Lab (PPPL)",
-        "description": null,
+        "description": "",
         "code": "PPPL",
         "created_at": "2022-04-28T16:30:00.195-04:00",
-        "updated_at": "2023-05-18T14:20:45.272-04:00"
-    }
+        "updated_at": "2024-02-16T09:06:20.108-05:00"
+    },
+    "embargo_date": null,
+    "created_at": "2023-08-18T13:47:14Z",
+    "updated_at": "2024-02-23T11:05:11Z"
 }
diff --git a/spec/fixtures/files/sowing_the_seeds.json b/spec/fixtures/files/sowing_the_seeds.json
@@ -6,9 +6,18 @@
                 "title_type": null
             }
         ],
-        "description": "In 2017, seven members of the Archive-It Mid-Atlantic Users Group (AITMA) conducted a study of 14 subjects representative of their stakeholder populations to assess the usability of Archive-It, a web archiving subscription service of the Internet Archive. While Archive-It is the most widely-used tool for web archiving, little is known about how users interact with the service.This study intended to teach us what users expect from web archives, which exist as another form of archival material. End-user subjects executed four search tasks using the public Archive-It interface and the Wayback Machine to access archived information on websites from the facilitators’ own harvested collections and provide feedback about their experiences. The tasks were designed to have straightforward pass or fail outcomes,\r\n    and the facilitators took notes on the subjects’ behavior and commentary during the sessions.Overall, participants reported mildly positive impressions of Archive-It public user interface based on their session. The study identified several key areas of improvement for the Archive-It service pertaining to metadata options, terminology display, indexing of dates, and the site’s search box.\r\n-\r\nDownload the README.txt for a detailed description of this dataset's content.",
+        "description": "In 2017, seven members of the Archive-It Mid-Atlantic Users Group (AITMA) conducted a study of 14 subjects representative of their stakeholder populations to assess the usability of Archive-It, a web archiving subscription service of the Internet Archive. While Archive-It is the most widely-used tool for web archiving, little is known about how users interact with the service. This study intended to teach us what users expect from web archives, which exist as another form of archival material. End-user subjects executed four search tasks using the public Archive-It interface and the Wayback Machine to access archived information on websites from the facilitators' own harvested collections and provide feedback about their experiences. The tasks were designed to have straightforward pass or fail outcomes,\r\nand the facilitators took notes on the subjects' behavior and commentary during the sessions. Overall, participants reported mildly positive impressions of Archive-It public user interface based on their session. The study identified several key areas of improvement for the Archive-It service pertaining to metadata options, terminology display, indexing of dates, and the site's search box.\r\n\r\nDownload the README.txt for a detailed description of this dataset's content.",
         "collection_tags": [],
         "creators": [
+            {
+                "value": "Abrams, Samantha",
+                "name_type": "Personal",
+                "given_name": "Samantha",
+                "family_name": "Abrams",
+                "identifier": null,
+                "affiliations": [],
+                "sequence": 0
+            },
             {
                 "value": "Antracoli, Alexis",
                 "name_type": "Personal",
@@ -62,40 +71,41 @@
                 "identifier": null,
                 "affiliations": [],
                 "sequence": 6
-            },
-            {
-                "value": "Abrams, Samantha",
-                "name_type": "Personal",
-                "given_name": "Samantha",
-                "family_name": "Abrams",
-                "identifier": null,
-                "affiliations": [],
-                "sequence": 7
             }
         ],
         "resource_type": "Dataset",
-        "resource_type_general": "",
+        "resource_type_general": "Dataset",
         "publisher": "Princeton University",
         "publication_year": "2023",
-        "ark": null,
-        "doi": "10.34770/doc-1",
-        "rights": {
-            "identifier": "GPLv3",
-            "uri": "https://www.gnu.org/licenses/gpl-3.0.en.html",
-            "name": "GNU General Public License"
-        },
+        "ark": "ark:/88435/dsp01d791sj97j",
+        "doi": "10.34770/00yp-2w12",
+        "rights_many": [
+            {
+                "identifier": "CC BY",
+                "uri": "https://creativecommons.org/licenses/by/4.0/",
+                "name": "Creative Commons Attribution 4.0 International"
+            }
+        ],
         "version_number": "1",
         "related_objects": [],
         "keywords": [],
         "contributors": [],
-        "funders": []
+        "organizational_contributors": [],
+        "funders": [],
+        "domains": [],
+        "communities": [],
+        "subcommunities": [],
+        "migrated": true
     },
     "files": [],
     "group": {
-        "title": "Research Data",
-        "description": null,
+        "title": "Princeton Research Data Service (PRDS)",
+        "description": "",
         "code": "RD",
-        "created_at": "2023-01-05T11:26:07.005-05:00",
-        "updated_at": "2023-01-05T11:26:07.005-05:00"
-    }
+        "created_at": "2022-04-28T16:30:00.190-04:00",
+        "updated_at": "2024-01-23T11:29:40.724-05:00"
+    },
+    "embargo_date": null,
+    "created_at": "2023-07-11T11:06:10Z",
+    "updated_at": "2023-09-13T08:23:50Z"
 }
diff --git a/spec/lib/date_normalizer_spec.rb b/spec/lib/date_normalizer_spec.rb
@@ -4,6 +4,7 @@
   let(:years) { ['2015'] }
   let(:months_and_years) { ['2015-08'] }
   let(:timestamps) { ['2015-08-18T18:14:22Z'] }
+  let(:month_year_name) { ['August 2020'] }
 
   describe "#format_array_for_display" do
     it "formats four digit years" do
@@ -22,11 +23,34 @@
     end
   end
 
+  describe "#format_array_for_sorting" do
+    it "formats four digit years" do
+      formatted_dates = described_class.format_array_for_sorting(years)
+      expect(formatted_dates.first).to eq "2015-01-01"
+    end
+
+    it "formats months and years" do
+      formatted_dates = described_class.format_array_for_sorting(months_and_years)
+      expect(formatted_dates.first).to eq "2015-08-01"
+    end
+
+    it "formats ISO-8601 timestamps" do
+      formatted_dates = described_class.format_array_for_sorting(timestamps)
+      expect(formatted_dates.first).to eq "2015-08-18"
+    end
+
+    it "formats month year name" do
+      formatted_dates = described_class.format_array_for_sorting(month_year_name)
+      expect(formatted_dates.first).to eq "2020-08-01"
+    end
+  end
+
   describe "#years_from_dates" do
     it "gets years correctly" do
       expect(described_class.years_from_dates(timestamps)).to eq [2015]
       expect(described_class.years_from_dates(months_and_years)).to eq [2015]
       expect(described_class.years_from_dates(years)).to eq [2015]
+      expect(described_class.years_from_dates(month_year_name)).to eq [2020]
     end
 
     it "handles bad dates" do

diff --git a/spec/lib/describe_indexer_spec.rb b/spec/lib/describe_indexer_spec.rb
@@ -189,6 +189,35 @@
         expect(response["response"]["numFound"]).to eq 2
       end
 
+      it "can sort by year_available_itsi" do
+        Rails.configuration.pdc_discovery.index_pdc_describe = true
+        indexer.index
+        response = Blacklight.default_index.connection.get 'select', params: { q: '*:*', sort: 'year_available_itsi desc' }
+        expect(response["response"]["numFound"]).to eq 2
+        expect(response["response"]['docs'].first['pdc_created_at_dtsi']).to eq("2023-07-11T11:06:10Z")
+        expect(response["response"]['docs'].last['pdc_created_at_dtsi']).to eq("2021-12-31T19:00:00Z")
+      end
+
+      context "works with multiple creators" do
+        let(:pppl1) { File.read(File.join(fixture_path, 'files', 'pppl1.json')) }
+        let(:pppl2) { File.read(File.join(fixture_path, 'files', 'pppl2.json')) }
+        before do
+          stub_request(:get, "https://pdc-describe-prod.princeton.edu/describe/works/6.json")
+            .to_return(status: 200, body: pppl1, headers: {})
+          stub_request(:get, "https://pdc-describe-prod.princeton.edu/describe/works/20.json")
+            .to_return(status: 200, body: pppl2, headers: {})
+        end
+
+        it "can sort by the first author" do
+          Rails.configuration.pdc_discovery.index_pdc_describe = true
+          indexer.index
+          response = Blacklight.default_index.connection.get 'select', params: { q: '*:*', sort: 'author_si desc' }
+          expect(response["response"]["numFound"]).to eq 2
+          expect(response["response"]['docs'].first['author_tesim'].first).to eq("Wang, Yin")
+          expect(response["response"]['docs'].last['author_tesim'].first).to eq("Schwartz, Jacob A.")
+        end
+      end
+
       context "when there are items which are under active embargo" do
         let(:item_file_fixture) { file_fixture("pdc_describe_active_embargo.json") }
         let(:embargo_resource) { item_file_fixture.read }