Skip to content

Commit

Permalink
Allowing sort by year and first author
Browse files Browse the repository at this point in the history
The sort field for year was not defined by the PDC indexer, which caused the sort to be random
The author sort was sorting by the alphabetically first of any of the authors, which made the sort seem random
fixes #572
  • Loading branch information
carolyncole committed Mar 1, 2024
1 parent 63019ce commit 7c43549
Show file tree
Hide file tree
Showing 8 changed files with 115 additions and 46 deletions.
2 changes: 1 addition & 1 deletion app/controllers/catalog_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ def retry_on_exception
# except in the relevancy case). Add the sort: option to configure a
# custom Blacklight url parameter value separate from the Solr sort fields.
config.add_sort_field 'relevance', sort: 'score desc, year_available_itsi desc, title_si asc', label: 'relevance'
config.add_sort_field 'year', sort: 'year_available_itsi desc, title_si asc', label: 'year'
config.add_sort_field 'year', sort: 'year_available_itsi desc, pdc_created_at_dtsi desc, title_si asc', label: 'year'
config.add_sort_field 'author', sort: 'author_si asc, title_si asc', label: 'author'
config.add_sort_field 'title', sort: 'title_si asc, year_available_itsi desc', label: 'title'

Expand Down
6 changes: 5 additions & 1 deletion app/lib/date_normalizer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,15 @@ def self.year_from_date(date_string)
Date.strptime(date_string).strftime('%Y').to_i
elsif date_string.match?(/\d{4}-\d{2}/)
Date.strptime(date_string, '%Y-%m').strftime('%Y').to_i
elsif date_string.match?(/^\d{4}/) && date_string.size == 4
date_string.to_i
else
date_string.to_i
time = Time.parse(date_string)
time.year
end
rescue ArgumentError
# bad formatted date
Rails.logger.warn("Error parsing date #{date_string}")
nil
end
end
13 changes: 12 additions & 1 deletion lib/traject/dataspace_research_data_config.rb
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@
# single value is used for sorting
to_field 'author_si' do |record, accumulator, _c|
values = record.xpath("/item/metadata/key[text()='dc.contributor.author']/../value").map(&:text)
accumulator.concat [values.uniq.sort.first]
accumulator.concat [values.first]
end

# all values as strings for faceting
Expand Down Expand Up @@ -234,6 +234,17 @@
accumulator.concat DateNormalizer.format_array_for_display(issue_dates)
end

# Add a singular field for sorting
# Collects the text of every dc.date.issued metadata value, runs the list
# through DateNormalizer.format_array_for_display, and appends the result
# to the issue_date_si field.
to_field "issue_date_si" do |record, accumulator, _context|
  issued_nodes = record.xpath("/item/metadata/key[text()='dc.date.issued']/../value")
  raw_dates = issued_nodes.map { |node| node.text }
  accumulator.concat(DateNormalizer.format_array_for_display(raw_dates))
end

# Collects the text of every dcterms.issued metadata value, runs the list
# through DateNormalizer.format_array_for_display, and appends the result
# to the issue_date_si field.
to_field "issue_date_si" do |record, accumulator, _context|
  issued_nodes = record.xpath("/item/metadata/key[text()='dcterms.issued']/../value")
  raw_dates = issued_nodes.map { |node| node.text }
  accumulator.concat(DateNormalizer.format_array_for_display(raw_dates))
end

# Date in yyyy-mm-dd format so we can sort by it
to_field "issue_date_strict_ssi" do |record, accumulator, _context|
dates = record.xpath("/item/metadata/key[text()='dc.date.issued']/../value").map(&:text)
Expand Down
21 changes: 20 additions & 1 deletion lib/traject/pdc_describe_indexing_config.rb
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@
# single value is used for sorting
to_field 'author_si' do |record, accumulator, _c|
author_names = record.xpath("/hash/resource/creators/creator/value").map(&:text)
accumulator.concat [author_names.uniq.sort.first]
accumulator.concat [author_names.first]
end

# all values as strings for faceting
Expand Down Expand Up @@ -144,8 +144,27 @@

# Publication year of the resource, indexed under issue_date_ssim.
to_field 'issue_date_ssim', extract_xpath("/hash/resource/publication-year")

# Same publication-year value indexed under issue_date_si.
# NOTE(review): presumably the single-valued copy intended for sorting - confirm against the Solr schema.
to_field 'issue_date_si', extract_xpath("/hash/resource/publication-year")

# Record-level created-at value; the catalog's 'year' sort references pdc_created_at_dtsi.
to_field 'pdc_created_at_dtsi', extract_xpath('/hash/created-at')

# Populates year_available_itsi, the field the catalog's year sort is built on.
#
# The resource's publication-year is preferred. When that element is missing or
# blank, the year of the record's created-at timestamp is used instead. When
# neither yields a usable year, nothing is added for the record.
to_field "year_available_itsi" do |record, accumulator, _context|
  # Reduce the xpath result to a plain string so the accumulator receives a
  # value rather than the raw result object (same treatment as created-at below).
  publication_year = record.xpath("/hash/resource/publication-year/text()").to_s.strip
  if publication_year.present?
    accumulator << publication_year
  else
    created_at = record.xpath("/hash/created-at/text()").to_s
    created_year = begin
      # Only the year is needed, so Date is sufficient.
      Date.parse(created_at).year
    rescue ArgumentError
      # created-at is absent or unparseable (Date::Error is an ArgumentError):
      # leave the field empty rather than fail indexing of the record.
      nil
    end
    accumulator << created_year if created_year
  end
end

# Record-level updated-at value, indexed under pdc_updated_at_dtsi.
to_field 'pdc_updated_at_dtsi', extract_xpath('/hash/updated-at')

# ==================
Expand Down
30 changes: 12 additions & 18 deletions spec/fixtures/files/pdc_describe_data/89.json
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@
"resource_type": "Dataset",
"resource_type_general": "Dataset",
"publisher": "Princeton Plasma Physics Laboratory, Princeton University",
"publication_year": "2022",
"publication_year": "2023",
"ark": "ark:/88435/dsp01wh246w38h",
"doi": "10.34770/bm4s-t361",
"rights_many": [
Expand Down Expand Up @@ -150,13 +150,7 @@
"errors": []
}
],
"keywords": [
"HHFW",
"3D RF modeling",
"Petra-M",
"fast wave",
"NSTX-U"
],
"keywords": ["HHFW", "3D RF modeling", "Petra-M", "fast wave", "NSTX-U"],
"contributors": [],
"organizational_contributors": [],
"funders": [
Expand All @@ -167,15 +161,12 @@
"award_uri": ""
}
],
"domains": [
"Natural Sciences"
],
"communities": [
"Princeton Plasma Physics Laboratory"
],
"domains": ["Natural Sciences"],
"communities": ["Princeton Plasma Physics Laboratory"],
"subcommunities": [
"NSTX-U",
"Spherical Torus"
"Spherical Torus",
"Tokamak Experimental Sciences"
],
"migrated": true
},
Expand Down Expand Up @@ -321,9 +312,12 @@
],
"group": {
"title": "Princeton Plasma Physics Lab (PPPL)",
"description": null,
"description": "",
"code": "PPPL",
"created_at": "2022-04-28T16:30:00.195-04:00",
"updated_at": "2023-05-18T14:20:45.272-04:00"
}
"updated_at": "2024-02-16T09:06:20.108-05:00"
},
"embargo_date": null,
"created_at": "2023-08-18T13:47:14Z",
"updated_at": "2024-02-23T11:05:11Z"
}
58 changes: 34 additions & 24 deletions spec/fixtures/files/sowing_the_seeds.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,18 @@
"title_type": null
}
],
"description": "In 2017, seven members of the Archive-It Mid-Atlantic Users Group (AITMA) conducted a study of 14 subjects representative of their stakeholder populations to assess the usability of Archive-It, a web archiving subscription service of the Internet Archive. While Archive-It is the most widely-used tool for web archiving, little is known about how users interact with the service.This study intended to teach us what users expect from web archives, which exist as another form of archival material. End-user subjects executed four search tasks using the public Archive-It interface and the Wayback Machine to access archived information on websites from the facilitators’ own harvested collections and provide feedback about their experiences. The tasks were designed to have straightforward pass or fail outcomes,\r\n and the facilitators took notes on the subjects’ behavior and commentary during the sessions.Overall, participants reported mildly positive impressions of Archive-It public user interface based on their session. The study identified several key areas of improvement for the Archive-It service pertaining to metadata options, terminology display, indexing of dates, and the site’s search box.\r\n-\r\nDownload the README.txt for a detailed description of this dataset's content.",
"description": "In 2017, seven members of the Archive-It Mid-Atlantic Users Group (AITMA) conducted a study of 14 subjects representative of their stakeholder populations to assess the usability of Archive-It, a web archiving subscription service of the Internet Archive. While Archive-It is the most widely-used tool for web archiving, little is known about how users interact with the service. This study intended to teach us what users expect from web archives, which exist as another form of archival material. End-user subjects executed four search tasks using the public Archive-It interface and the Wayback Machine to access archived information on websites from the facilitators' own harvested collections and provide feedback about their experiences. The tasks were designed to have straightforward pass or fail outcomes,\r\nand the facilitators took notes on the subjects' behavior and commentary during the sessions. Overall, participants reported mildly positive impressions of Archive-It public user interface based on their session. The study identified several key areas of improvement for the Archive-It service pertaining to metadata options, terminology display, indexing of dates, and the site's search box.\r\n\r\nDownload the README.txt for a detailed description of this dataset's content.",
"collection_tags": [],
"creators": [
{
"value": "Abrams, Samantha",
"name_type": "Personal",
"given_name": "Samantha",
"family_name": "Abrams",
"identifier": null,
"affiliations": [],
"sequence": 0
},
{
"value": "Antracoli, Alexis",
"name_type": "Personal",
Expand Down Expand Up @@ -62,40 +71,41 @@
"identifier": null,
"affiliations": [],
"sequence": 6
},
{
"value": "Abrams, Samantha",
"name_type": "Personal",
"given_name": "Samantha",
"family_name": "Abrams",
"identifier": null,
"affiliations": [],
"sequence": 7
}
],
"resource_type": "Dataset",
"resource_type_general": "",
"resource_type_general": "Dataset",
"publisher": "Princeton University",
"publication_year": "2023",
"ark": null,
"doi": "10.34770/doc-1",
"rights": {
"identifier": "GPLv3",
"uri": "https://www.gnu.org/licenses/gpl-3.0.en.html",
"name": "GNU General Public License"
},
"ark": "ark:/88435/dsp01d791sj97j",
"doi": "10.34770/00yp-2w12",
"rights_many": [
{
"identifier": "CC BY",
"uri": "https://creativecommons.org/licenses/by/4.0/",
"name": "Creative Commons Attribution 4.0 International"
}
],
"version_number": "1",
"related_objects": [],
"keywords": [],
"contributors": [],
"funders": []
"organizational_contributors": [],
"funders": [],
"domains": [],
"communities": [],
"subcommunities": [],
"migrated": true
},
"files": [],
"group": {
"title": "Research Data",
"description": null,
"title": "Princeton Research Data Service (PRDS)",
"description": "",
"code": "RD",
"created_at": "2023-01-05T11:26:07.005-05:00",
"updated_at": "2023-01-05T11:26:07.005-05:00"
}
"created_at": "2022-04-28T16:30:00.190-04:00",
"updated_at": "2024-01-23T11:29:40.724-05:00"
},
"embargo_date": null,
"created_at": "2023-07-11T11:06:10Z",
"updated_at": "2023-09-13T08:23:50Z"
}
2 changes: 2 additions & 0 deletions spec/lib/date_normalizer_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
let(:years) { ['2015'] }
let(:months_and_years) { ['2015-08'] }
let(:timestamps) { ['2015-08-18T18:14:22Z'] }
let(:month_year_name) { ['August 2020'] }

describe "#format_array_for_display" do
it "formats four digit years" do
Expand All @@ -27,6 +28,7 @@
expect(described_class.years_from_dates(timestamps)).to eq [2015]
expect(described_class.years_from_dates(months_and_years)).to eq [2015]
expect(described_class.years_from_dates(years)).to eq [2015]
expect(described_class.years_from_dates(month_year_name)).to eq [2020]
end

it "handles bad dates" do
Expand Down
29 changes: 29 additions & 0 deletions spec/lib/describe_indexer_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,35 @@
expect(response["response"]["numFound"]).to eq 2
end

# Indexes the PDC Describe fixtures, queries Solr sorted by year_available_itsi
# descending, and checks which record comes first and which comes last.
it "can sort by year_available_itsi" do
  Rails.configuration.pdc_discovery.index_pdc_describe = true
  indexer.index
  solr = Blacklight.default_index.connection
  result = solr.get('select', params: { q: '*:*', sort: 'year_available_itsi desc' })["response"]
  expect(result["numFound"]).to eq 2
  docs = result['docs']
  expect(docs.first['pdc_created_at_dtsi']).to eq("2023-07-11T11:06:10Z")
  expect(docs.last['pdc_created_at_dtsi']).to eq("2021-12-31T19:00:00Z")
end

# Uses two fixtures (pppl1.json, pppl2.json) served from stubbed PDC Describe
# work URLs to check ordering on the author_si sort field.
context "works with multiple creators" do
  let(:pppl1) { File.read(File.join(fixture_path, 'files', 'pppl1.json')) }
  let(:pppl2) { File.read(File.join(fixture_path, 'files', 'pppl2.json')) }

  before do
    # Serve each fixture from the work URL the indexer will request.
    stub_request(:get, "https://pdc-describe-prod.princeton.edu/describe/works/6.json")
      .to_return(status: 200, body: pppl1, headers: {})
    stub_request(:get, "https://pdc-describe-prod.princeton.edu/describe/works/20.json")
      .to_return(status: 200, body: pppl2, headers: {})
  end

  it "can sort by the first author" do
    Rails.configuration.pdc_discovery.index_pdc_describe = true
    indexer.index
    solr = Blacklight.default_index.connection
    result = solr.get('select', params: { q: '*:*', sort: 'author_si desc' })["response"]
    expect(result["numFound"]).to eq 2
    docs = result['docs']
    # Descending sort on author_si: "Wang, Yin" is expected ahead of "Schwartz, Jacob A."
    expect(docs.first['author_tesim'].first).to eq("Wang, Yin")
    expect(docs.last['author_tesim'].first).to eq("Schwartz, Jacob A.")
  end
end

context "when there are items which are under active embargo" do
let(:item_file_fixture) { file_fixture("pdc_describe_active_embargo.json") }
let(:embargo_resource) { item_file_fixture.read }
Expand Down

0 comments on commit 7c43549

Please sign in to comment.