Skip to content

Commit

Permalink
Allowing sort by year and first author
Browse files Browse the repository at this point in the history
The sort field for year was not defined by the PDC indexer, which caused the sort to be random
The author sort used whichever author name came first alphabetically, rather than the first listed author, which made the sort seem random
fixes #572
  • Loading branch information
carolyncole committed Mar 1, 2024
1 parent 63019ce commit 8d631ed
Show file tree
Hide file tree
Showing 8 changed files with 181 additions and 48 deletions.
6 changes: 3 additions & 3 deletions app/controllers/catalog_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -233,10 +233,10 @@ def retry_on_exception
# whether the sort is ascending or descending (it must be asc or desc
# except in the relevancy case). Add the sort: option to configure a
# custom Blacklight url parameter value separate from the Solr sort fields.
config.add_sort_field 'relevance', sort: 'score desc, year_available_itsi desc, title_si asc', label: 'relevance'
config.add_sort_field 'year', sort: 'year_available_itsi desc, title_si asc', label: 'year'
config.add_sort_field 'relevance', sort: 'score desc, date_available_ssi desc, title_si asc', label: 'relevance'
config.add_sort_field 'year', sort: 'date_available_ssi desc, pdc_created_at_dtsi desc, title_si asc', label: 'year'
config.add_sort_field 'author', sort: 'author_si asc, title_si asc', label: 'author'
config.add_sort_field 'title', sort: 'title_si asc, year_available_itsi desc', label: 'title'
config.add_sort_field 'title', sort: 'title_si asc, date_available_ssi desc', label: 'title'

# If there are more than this many search results, no spelling ("did you
# mean") suggestion is offered.
Expand Down
31 changes: 30 additions & 1 deletion app/lib/date_normalizer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,31 @@ def self.format_string_for_display(date_string)
end
end

##
# Runs every entry through format_string_for_sorting and keeps only the
# non-nil results, preserving the input order.
# @param [<String>] date_strings
# @return [<String>] An array of strings formatted for sorting
def self.format_array_for_sorting(date_strings)
  date_strings.each_with_object([]) do |raw_date, sortable|
    formatted = format_string_for_sorting(raw_date)
    sortable << formatted unless formatted.nil?
  end
end

##
# Normalizes a single date string to `YYYY-MM-DD`.
#
# * text containing `YYYY-MM-DD` is parsed with Date.strptime's default format
# * text containing `YYYY-MM` is parsed as year and month (day becomes 01)
# * a bare four digit year becomes January 1st of that year
# * anything else is handed to Time.zone.parse
#
# @param [String] date_string
# @return [String, nil] the normalized date, or nil (after logging a warning)
#   when the value cannot be turned into a date
def self.format_string_for_sorting(date_string)
  # to_s lets a nil value fall through to the "cannot parse" path instead of crashing on match?
  text = date_string.to_s
  if text.match?(/\d{4}-\d{2}-\d{2}/)
    Date.strptime(text).strftime('%Y-%m-%d')
  elsif text.match?(/\d{4}-\d{2}/)
    Date.strptime(text, '%Y-%m').strftime('%Y-%m-%d')
  elsif text.match?(/\A\d{4}\z/)
    "#{text}-01-01"
  else
    parsed = Time.zone.parse(text)
    # Time.zone.parse can hand back nil instead of raising; treat that like any other parse failure
    raise ArgumentError, "unparseable date" if parsed.nil?

    parsed.strftime('%Y-%m-%d')
  end
rescue ArgumentError
  # Covers invalid calendar dates from Date.strptime (e.g. "2015-02-30") as well as the fallback parser
  Rails.logger.warn("Error parsing date #{date_string}")
  nil
end

# Applies strict_date to each entry, discards nil results and returns what is left in sorted order.
def self.strict_dates(date_strings)
  normalized = date_strings.map { |raw_date| strict_date(raw_date) }
  normalized.compact!
  normalized.sort
end
Expand Down Expand Up @@ -49,11 +74,15 @@ def self.year_from_date(date_string)
Date.strptime(date_string).strftime('%Y').to_i
elsif date_string.match?(/\d{4}-\d{2}/)
Date.strptime(date_string, '%Y-%m').strftime('%Y').to_i
else
elsif date_string.match?(/^\d{4}/) && date_string.size == 4
date_string.to_i
else
time = Time.zone.parse(date_string)
time.year
end
rescue ArgumentError
# bad formatted date
Rails.logger.warn("Error parsing date #{date_string}")
nil
end
end
18 changes: 17 additions & 1 deletion lib/traject/dataspace_research_data_config.rb
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@
# single value is used for sorting
to_field 'author_si' do |record, accumulator, _c|
values = record.xpath("/item/metadata/key[text()='dc.contributor.author']/../value").map(&:text)
accumulator.concat [values.uniq.sort.first]
accumulator.concat [values.first]
end

# all values as strings for faceting
Expand Down Expand Up @@ -194,6 +194,11 @@
accumulator.concat DateNormalizer.format_array_for_display(dates)
end

# One sortable date per record: the first dc.date.available value that survives
# DateNormalizer.format_array_for_sorting.
to_field "date_available_ssi" do |record, accumulator, _context|
  available_values = record.xpath("/item/metadata/key[text()='dc.date.available']/../value").map(&:text)
  sortable_dates = DateNormalizer.format_array_for_sorting(available_values)
  accumulator.concat [sortable_dates.first]
end

to_field "year_available_itsi" do |record, accumulator, _context|
dates = record.xpath("/item/metadata/key[text()='dc.date.available']/../value").map(&:text)
accumulator.concat [DateNormalizer.years_from_dates(dates).first]
Expand Down Expand Up @@ -234,6 +239,17 @@
accumulator.concat DateNormalizer.format_array_for_display(issue_dates)
end

# Add a singular field for sorting.
# Values are gathered from dc.date.issued first and dcterms.issued second, and only the
# first formatted value is emitted, matching how the other sort fields here take `.first`.
# NOTE(review): the "_si" suffix presumably maps to a single-valued Solr field — confirm against the schema.
to_field "issue_date_si" do |record, accumulator, _context|
  issue_dates = ["dc.date.issued", "dcterms.issued"].flat_map do |key|
    record.xpath("/item/metadata/key[text()='#{key}']/../value").map(&:text)
  end
  accumulator.concat [DateNormalizer.format_array_for_display(issue_dates).first]
end

# Date in yyyy-mm-dd format so we can sort by it
to_field "issue_date_strict_ssi" do |record, accumulator, _context|
dates = record.xpath("/item/metadata/key[text()='dc.date.issued']/../value").map(&:text)
Expand Down
33 changes: 32 additions & 1 deletion lib/traject/pdc_describe_indexing_config.rb
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@
# single value is used for sorting
to_field 'author_si' do |record, accumulator, _c|
author_names = record.xpath("/hash/resource/creators/creator/value").map(&:text)
accumulator.concat [author_names.uniq.sort.first]
accumulator.concat [author_names.first]
end

# all values as strings for faceting
Expand Down Expand Up @@ -144,8 +144,39 @@

# Populated from /hash/resource/publication-year
to_field 'issue_date_ssim', extract_xpath("/hash/resource/publication-year")

# Same publication-year XPath under a second field name
# NOTE(review): the "_si" name suggests a single-valued field intended for sorting — confirm against the Solr schema
to_field 'issue_date_si', extract_xpath("/hash/resource/publication-year")

# Populated from /hash/created-at
to_field 'pdc_created_at_dtsi', extract_xpath('/hash/created-at')

# Sortable YYYY-MM-DD date derived from /hash/created-at; nothing is added when the
# value cannot be parsed.
to_field "date_available_ssi" do |record, accumulator, _context|
  created_at_text = record.xpath("/hash/created-at/text()").to_s
  created_at =
    begin
      DateTime.parse(created_at_text)
    rescue StandardError
      nil
    end
  accumulator.concat [created_at.strftime('%Y-%m-%d')] if created_at
end

# Year for the record: the publication year when one is present, otherwise the year of
# /hash/created-at. Nothing is added when neither yields a value.
to_field "year_available_itsi" do |record, accumulator, _context|
  # Convert the XPath result to a plain string so a String, not a node set, is what gets accumulated
  publication_year = record.xpath("/hash/resource/publication-year/text()").to_s.strip
  if publication_year.present?
    accumulator.concat [publication_year]
  else
    created_at_text = record.xpath("/hash/created-at/text()").to_s
    created_at =
      begin
        DateTime.parse(created_at_text)
      rescue StandardError
        # Missing or malformed created-at: leave the field empty
        nil
      end
    accumulator.concat [created_at.year] if created_at
  end
end

to_field 'pdc_updated_at_dtsi', extract_xpath('/hash/updated-at')

# ==================
Expand Down
30 changes: 12 additions & 18 deletions spec/fixtures/files/pdc_describe_data/89.json
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@
"resource_type": "Dataset",
"resource_type_general": "Dataset",
"publisher": "Princeton Plasma Physics Laboratory, Princeton University",
"publication_year": "2022",
"publication_year": "2023",
"ark": "ark:/88435/dsp01wh246w38h",
"doi": "10.34770/bm4s-t361",
"rights_many": [
Expand Down Expand Up @@ -150,13 +150,7 @@
"errors": []
}
],
"keywords": [
"HHFW",
"3D RF modeling",
"Petra-M",
"fast wave",
"NSTX-U"
],
"keywords": ["HHFW", "3D RF modeling", "Petra-M", "fast wave", "NSTX-U"],
"contributors": [],
"organizational_contributors": [],
"funders": [
Expand All @@ -167,15 +161,12 @@
"award_uri": ""
}
],
"domains": [
"Natural Sciences"
],
"communities": [
"Princeton Plasma Physics Laboratory"
],
"domains": ["Natural Sciences"],
"communities": ["Princeton Plasma Physics Laboratory"],
"subcommunities": [
"NSTX-U",
"Spherical Torus"
"Spherical Torus",
"Tokamak Experimental Sciences"
],
"migrated": true
},
Expand Down Expand Up @@ -321,9 +312,12 @@
],
"group": {
"title": "Princeton Plasma Physics Lab (PPPL)",
"description": null,
"description": "",
"code": "PPPL",
"created_at": "2022-04-28T16:30:00.195-04:00",
"updated_at": "2023-05-18T14:20:45.272-04:00"
}
"updated_at": "2024-02-16T09:06:20.108-05:00"
},
"embargo_date": null,
"created_at": "2023-08-18T13:47:14Z",
"updated_at": "2024-02-23T11:05:11Z"
}
58 changes: 34 additions & 24 deletions spec/fixtures/files/sowing_the_seeds.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,18 @@
"title_type": null
}
],
"description": "In 2017, seven members of the Archive-It Mid-Atlantic Users Group (AITMA) conducted a study of 14 subjects representative of their stakeholder populations to assess the usability of Archive-It, a web archiving subscription service of the Internet Archive. While Archive-It is the most widely-used tool for web archiving, little is known about how users interact with the service.This study intended to teach us what users expect from web archives, which exist as another form of archival material. End-user subjects executed four search tasks using the public Archive-It interface and the Wayback Machine to access archived information on websites from the facilitators’ own harvested collections and provide feedback about their experiences. The tasks were designed to have straightforward pass or fail outcomes,\r\n and the facilitators took notes on the subjects’ behavior and commentary during the sessions.Overall, participants reported mildly positive impressions of Archive-It public user interface based on their session. The study identified several key areas of improvement for the Archive-It service pertaining to metadata options, terminology display, indexing of dates, and the site’s search box.\r\n-\r\nDownload the README.txt for a detailed description of this dataset's content.",
"description": "In 2017, seven members of the Archive-It Mid-Atlantic Users Group (AITMA) conducted a study of 14 subjects representative of their stakeholder populations to assess the usability of Archive-It, a web archiving subscription service of the Internet Archive. While Archive-It is the most widely-used tool for web archiving, little is known about how users interact with the service. This study intended to teach us what users expect from web archives, which exist as another form of archival material. End-user subjects executed four search tasks using the public Archive-It interface and the Wayback Machine to access archived information on websites from the facilitators' own harvested collections and provide feedback about their experiences. The tasks were designed to have straightforward pass or fail outcomes,\r\nand the facilitators took notes on the subjects' behavior and commentary during the sessions. Overall, participants reported mildly positive impressions of Archive-It public user interface based on their session. The study identified several key areas of improvement for the Archive-It service pertaining to metadata options, terminology display, indexing of dates, and the site's search box.\r\n\r\nDownload the README.txt for a detailed description of this dataset's content.",
"collection_tags": [],
"creators": [
{
"value": "Abrams, Samantha",
"name_type": "Personal",
"given_name": "Samantha",
"family_name": "Abrams",
"identifier": null,
"affiliations": [],
"sequence": 0
},
{
"value": "Antracoli, Alexis",
"name_type": "Personal",
Expand Down Expand Up @@ -62,40 +71,41 @@
"identifier": null,
"affiliations": [],
"sequence": 6
},
{
"value": "Abrams, Samantha",
"name_type": "Personal",
"given_name": "Samantha",
"family_name": "Abrams",
"identifier": null,
"affiliations": [],
"sequence": 7
}
],
"resource_type": "Dataset",
"resource_type_general": "",
"resource_type_general": "Dataset",
"publisher": "Princeton University",
"publication_year": "2023",
"ark": null,
"doi": "10.34770/doc-1",
"rights": {
"identifier": "GPLv3",
"uri": "https://www.gnu.org/licenses/gpl-3.0.en.html",
"name": "GNU General Public License"
},
"ark": "ark:/88435/dsp01d791sj97j",
"doi": "10.34770/00yp-2w12",
"rights_many": [
{
"identifier": "CC BY",
"uri": "https://creativecommons.org/licenses/by/4.0/",
"name": "Creative Commons Attribution 4.0 International"
}
],
"version_number": "1",
"related_objects": [],
"keywords": [],
"contributors": [],
"funders": []
"organizational_contributors": [],
"funders": [],
"domains": [],
"communities": [],
"subcommunities": [],
"migrated": true
},
"files": [],
"group": {
"title": "Research Data",
"description": null,
"title": "Princeton Research Data Service (PRDS)",
"description": "",
"code": "RD",
"created_at": "2023-01-05T11:26:07.005-05:00",
"updated_at": "2023-01-05T11:26:07.005-05:00"
}
"created_at": "2022-04-28T16:30:00.190-04:00",
"updated_at": "2024-01-23T11:29:40.724-05:00"
},
"embargo_date": null,
"created_at": "2023-07-11T11:06:10Z",
"updated_at": "2023-09-13T08:23:50Z"
}
24 changes: 24 additions & 0 deletions spec/lib/date_normalizer_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
let(:years) { ['2015'] }
let(:months_and_years) { ['2015-08'] }
let(:timestamps) { ['2015-08-18T18:14:22Z'] }
let(:month_year_name) { ['August 2020'] }

describe "#format_array_for_display" do
it "formats four digit years" do
Expand All @@ -22,11 +23,34 @@
end
end

describe "#format_array_for_sorting" do
  # A bare year is expected to come back as January 1st of that year
  it "formats four digit years" do
    expect(described_class.format_array_for_sorting(years).first).to eq "2015-01-01"
  end

  # Year plus month is expected to come back as the first day of that month
  it "formats months and years" do
    expect(described_class.format_array_for_sorting(months_and_years).first).to eq "2015-08-01"
  end

  # Only the date portion of a timestamp is expected in the result
  it "formats ISO-8601 timestamps" do
    expect(described_class.format_array_for_sorting(timestamps).first).to eq "2015-08-18"
  end

  # Free-text "Month YYYY" input is expected to come back as the first day of that month
  it "formats month year name" do
    expect(described_class.format_array_for_sorting(month_year_name).first).to eq "2020-08-01"
  end
end

describe "#years_from_dates" do
it "gets years correctly" do
expect(described_class.years_from_dates(timestamps)).to eq [2015]
expect(described_class.years_from_dates(months_and_years)).to eq [2015]
expect(described_class.years_from_dates(years)).to eq [2015]
expect(described_class.years_from_dates(month_year_name)).to eq [2020]
end

it "handles bad dates" do
Expand Down
29 changes: 29 additions & 0 deletions spec/lib/describe_indexer_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,35 @@
expect(response["response"]["numFound"]).to eq 2
end

# Indexes the PDC Describe fixtures, then checks the order of the two documents when
# Solr sorts on year_available_itsi descending.
it "can sort by year_available_itsi" do
  Rails.configuration.pdc_discovery.index_pdc_describe = true
  indexer.index
  solr_params = { q: '*:*', sort: 'year_available_itsi desc' }
  response = Blacklight.default_index.connection.get 'select', params: solr_params
  result = response["response"]
  docs = result['docs']
  expect(result["numFound"]).to eq 2
  expect(docs.first['pdc_created_at_dtsi']).to eq("2023-07-11T11:06:10Z")
  expect(docs.last['pdc_created_at_dtsi']).to eq("2021-12-31T19:00:00Z")
end

context "works with multiple creators" do
  # Fixture bodies returned by the stubbed PDC Describe work endpoints below
  let(:pppl1) { File.read(File.join(fixture_path, 'files', 'pppl1.json')) }
  let(:pppl2) { File.read(File.join(fixture_path, 'files', 'pppl2.json')) }

  before do
    stub_request(:get, "https://pdc-describe-prod.princeton.edu/describe/works/6.json").to_return(
      status: 200, body: pppl1, headers: {}
    )
    stub_request(:get, "https://pdc-describe-prod.princeton.edu/describe/works/20.json").to_return(
      status: 200, body: pppl2, headers: {}
    )
  end

  # Sorting on author_si descending should order the documents by their first listed author
  it "can sort by the first author" do
    Rails.configuration.pdc_discovery.index_pdc_describe = true
    indexer.index
    solr_params = { q: '*:*', sort: 'author_si desc' }
    response = Blacklight.default_index.connection.get 'select', params: solr_params
    result = response["response"]
    docs = result['docs']
    expect(result["numFound"]).to eq 2
    expect(docs.first['author_tesim'].first).to eq("Wang, Yin")
    expect(docs.last['author_tesim'].first).to eq("Schwartz, Jacob A.")
  end
end

context "when there are items which are under active embargo" do
let(:item_file_fixture) { file_fixture("pdc_describe_active_embargo.json") }
let(:embargo_resource) { item_file_fixture.read }
Expand Down

0 comments on commit 8d631ed

Please sign in to comment.