Skip to content

Commit 92d093f

Browse files
committed
Improve Fuzzy Search Algorithm And Fix Undo Bug
- Added utility functions to ConferenceUtils for acronym candidate generation and string normalization
- Updated the fuzzy search algorithm in ConferenceRepository to combine Longest Common Substring similarity with Levenshtein similarity
- Fixed a bug in ICORERankingEditor that prevented Undo from working
- Updated ICORERankingEditorViewModel to use the leaner ConferenceRepository API
- Added the LCSSimilarity method to StringSimilarity to compute a Longest Common Substring similarity rating
- Added Javadoc to various non-trivial methods
- Added tests to cover the updated functionality

Part of #13476
1 parent c63e2da commit 92d093f

File tree

12 files changed

+650
-160
lines changed

12 files changed

+650
-160
lines changed

jabgui/src/main/java/org/jabref/gui/fieldeditors/ICORERankingEditor.java

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,8 +65,6 @@ public ICORERankingEditor(Field field,
6565
new Tooltip(Localization.lang("Visit ICORE conference page"))
6666
);
6767
visitICOREConferencePageButton.disableProperty().bind(textField.textProperty().isEmpty());
68-
69-
new EditorValidator(preferences).configureValidation(viewModel.getFieldValidator().getValidationStatus(), textField);
7068
}
7169

7270
@Override

jabgui/src/main/java/org/jabref/gui/fieldeditors/ICORERankingEditorViewModel.java

Lines changed: 1 addition & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
import org.jabref.gui.autocompleter.SuggestionProvider;
1010
import org.jabref.gui.desktop.os.NativeDesktop;
1111
import org.jabref.gui.preferences.GuiPreferences;
12-
import org.jabref.logic.icore.ConferenceAcronymExtractor;
1312
import org.jabref.logic.icore.ConferenceRepository;
1413
import org.jabref.logic.integrity.FieldCheckers;
1514
import org.jabref.logic.l10n.Localization;
@@ -56,18 +55,7 @@ public void lookupIdentifier(BibEntry bibEntry) {
5655
return;
5756
}
5857

59-
Optional<ConferenceEntry> conference;
60-
Optional<String> acronym = ConferenceAcronymExtractor.extract(bookTitle.get());
61-
if (acronym.isPresent()) {
62-
conference = repo.getConferenceFromAcronym(acronym.get());
63-
if (conference.isPresent()) {
64-
entry.setField(field, conference.get().rank());
65-
matchedConference = conference.get();
66-
return;
67-
}
68-
}
69-
70-
conference = repo.getConferenceFromBookTitle(bookTitle.get());
58+
Optional<ConferenceEntry> conference = repo.getConferenceFromBookTitle(bookTitle.get());
7159
if (conference.isPresent()) {
7260
entry.setField(field, conference.get().rank());
7361
matchedConference = conference.get();

jabgui/src/main/java/org/jabref/migrations/PreferencesMigrations.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -566,9 +566,10 @@ static void moveApiKeysToKeyring(JabRefCliPreferences preferences) {
566566
/**
567567
* Updates the default preferences for the editor fields under the "General" tab to include the ICORE Ranking Field
568568
* if it is missing.
569-
*<p>
569+
* <p>
570570
* The function first ensures that the current preferences match the previous default (before the ICORE field was added)
571571
* and only then does the update.
572+
* </p>
572573
*
573574
* @implNote The default fields for the "General" tab are defined by {@link FieldFactory#getDefaultGeneralFields()}.
574575
* @param preferences the user's current preferences

jablib/src/main/java/org/jabref/logic/icore/ConferenceAcronymExtractor.java

Lines changed: 0 additions & 52 deletions
This file was deleted.

jablib/src/main/java/org/jabref/logic/icore/ConferenceRepository.java

Lines changed: 124 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,15 @@
77
import java.util.HashMap;
88
import java.util.Map;
99
import java.util.Optional;
10+
import java.util.Set;
1011

1112
import org.jabref.logic.JabRefException;
1213
import org.jabref.logic.util.strings.StringSimilarity;
1314
import org.jabref.model.icore.ConferenceEntry;
1415

1516
import org.apache.commons.csv.CSVFormat;
1617
import org.apache.commons.csv.CSVRecord;
18+
import org.jspecify.annotations.NonNull;
1719
import org.slf4j.Logger;
1820
import org.slf4j.LoggerFactory;
1921

@@ -25,15 +27,20 @@
2527
* Since the website does not expose an API endpoint to fetch this data programmatically, it must be manually exported
2628
* from the website and stored as a resource. This means that when new ranking data is released, the old data must be
2729
* replaced and the <code>ICORE_RANK_DATA_FILE</code> variable must be modified to point to the new data file.
30+
* </p>
2831
*/
2932
public class ConferenceRepository {
3033
private static final Logger LOGGER = LoggerFactory.getLogger(ConferenceRepository.class);
3134
private static final String ICORE_RANK_DATA_FILE = "/icore/ICORE2023.csv";
32-
private static final double FUZZY_SEARCH_THRESHOLD = 0.9;
33-
private static final StringSimilarity MATCHER = new StringSimilarity();
35+
private static final double LEVENSHTEIN_THRESHOLD = 0.9;
36+
private static final double COMBINED_LCS_LEV_THRESHOLD = 0.75;
37+
private static final double EPSILON = 1e-6;
38+
private static final StringSimilarity LEVENSHTEIN_MATCHER = new StringSimilarity();
3439

3540
private final Map<String, ConferenceEntry> acronymToConference = new HashMap<>();
3641
private final Map<String, ConferenceEntry> titleToConference = new HashMap<>();
42+
private final Map<String, ConferenceEntry> normalizedTitleToConference = new HashMap<>();
43+
private int maxAcronymLength = 0;
3744

3845
public ConferenceRepository() throws JabRefException {
3946
InputStream inputStream = getClass().getResourceAsStream(ICORE_RANK_DATA_FILE);
@@ -63,74 +70,158 @@ private void loadConferenceDataFromInputStream(InputStream inputStream) throws J
6370
for (CSVRecord record : records) {
6471
String id = record.get("Id").strip();
6572
String title = record.get("Title").strip().toLowerCase();
66-
String acronym = record.get("Acronym").strip().toUpperCase();
73+
String acronym = record.get("Acronym").strip().toLowerCase();
6774
String rank = record.get("Rank").strip();
6875

6976
if (id.isEmpty() || title.isEmpty() || acronym.isEmpty() || rank.isEmpty()) {
7077
LOGGER.warn("Missing fields in row in ICORE rank data: {}", record);
7178
continue;
7279
}
7380

81+
if (title.indexOf('(') >= 0) {
82+
// remove any extra alias strings in parentheses
83+
title = ConferenceUtils.removeAllParenthesesWithContent(title);
84+
}
7485
ConferenceEntry conferenceEntry = new ConferenceEntry(id, title, acronym, rank);
7586
acronymToConference.put(acronym, conferenceEntry);
7687
titleToConference.put(title, conferenceEntry);
88+
normalizedTitleToConference.put(ConferenceUtils.normalize(title), conferenceEntry);
89+
if (acronym.length() >= maxAcronymLength) {
90+
maxAcronymLength = acronym.length();
91+
}
7792
}
7893
} catch (IOException e) {
7994
throw new JabRefException("I/O Error while reading ICORE data from resource", e);
8095
}
96+
LOGGER.debug("Max acronym length seen in data: {}", maxAcronymLength);
8197
}
8298

83-
public Optional<ConferenceEntry> getConferenceFromAcronym(String acronym) {
84-
String query = acronym.strip().toUpperCase();
85-
86-
ConferenceEntry conference = acronymToConference.get(query);
99+
/**
100+
* Searches the given query against the ICORE conference ranking data and returns a match, if found.
101+
* <p>
102+
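* While searching, the stripped, lower-cased query is first looked up as an exact acronym and then as an exact title. Failing that, we look for a conference acronym present inside parentheses, like <code>(ICSE)</code>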
103+
* or <code>(ICSE 2022)</code>. If acronym lookup fails, the query is processed further and matched against the list
104+
* of conference titles.
105+
* </p>
106+
*
107+
* @implNote see {@link ConferenceRepository#fuzzySearchConferenceTitles} for more details on matching
108+
* @param bookTitle the string to search, must not be {@code null}
109+
* @return an {@code Optional} conference entry, if found
110+
* or {@code Optional.empty()} if no conference entry is found
111+
*/
112+
public Optional<ConferenceEntry> getConferenceFromBookTitle(@NonNull String bookTitle) {
113+
String query = bookTitle.strip().toLowerCase();
114+
ConferenceEntry conference;
87115

88-
if (conference == null) {
89-
return Optional.empty();
116+
conference = acronymToConference.get(query);
117+
if (conference != null) {
118+
return Optional.of(conference);
90119
}
91120

92-
return Optional.of(conference);
93-
}
94-
95-
public Optional<ConferenceEntry> getConferenceFromBookTitle(String bookTitle) {
96-
String query = bookTitle.strip().toLowerCase();
97-
98-
ConferenceEntry conference = titleToConference.get(query);
121+
conference = titleToConference.get(query);
99122
if (conference != null) {
100123
return Optional.of(conference);
101124
}
102125

103-
String bestMatch = fuzzySearchConferenceTitles(query);
104-
if (bestMatch.isEmpty()) {
105-
return Optional.empty();
126+
Optional<ConferenceEntry> acronymConference = getConferenceFromAcronym(query);
127+
128+
if (acronymConference.isPresent()) {
129+
return acronymConference;
106130
}
107131

108-
conference = titleToConference.get(bestMatch);
132+
return fuzzySearchConferenceTitles(query);
133+
}
109134

110-
return Optional.of(conference);
135+
private Optional<ConferenceEntry> getConferenceFromAcronym(String query) {
136+
Optional<String> acronym = ConferenceUtils.extractStringFromParentheses(query);
137+
138+
if (acronym.isPresent()) {
139+
ConferenceEntry conference;
140+
Set<String> acronymCandidates = ConferenceUtils.generateAcronymCandidates(acronym.get(), maxAcronymLength);
141+
LOGGER.debug("Extracted acronym string: {}, Acronym candidates: {}", acronym.get(), acronymCandidates);
142+
for (String candidate : acronymCandidates) {
143+
conference = acronymToConference.get(candidate);
144+
if (conference != null) {
145+
return Optional.of(conference);
146+
}
147+
}
148+
}
149+
return Optional.empty();
111150
}
112151

113152
/**
114-
* Searches all conference titles for the given query string using {@link StringSimilarity#similarity} as a MATCHER.
153+
* Searches the conference data for the given query string using a combination of Levenshtein similarity
154+
* {@link StringSimilarity#similarity} and Longest Common Substring (LCS) similarity {@link StringSimilarity#LCSSimilarity}.
155+
* <p>
156+
* The input query is first fed through the normalizer at {@link ConferenceUtils#normalize} which strips away much of the
157+
* noise.
158+
* </p>
115159
* <p>
116-
* The threshold for matching is set at 0.9. This function will always return the conference title with the highest
117-
* similarity rating.
160+
* While searching, the function computes Levenshtein similarity and LCS similarity between the query and the current conference
161+
* title (also normalized) and prioritizes them in the following order:
162+
* <ol>
163+
* <li>Whenever LCS similarity returns <code>1.0</code>, i.e., a conference title is found entirely as a substring in the query.</li>
164+
* <li>Whenever Levenshtein similarity exceeds the threshold defined by the <code>LEVENSHTEIN_THRESHOLD</code> constant.</li>
165+
* <li>The combined weighted score of both LCS and Levenshtein similarities exceeds the <code>COMBINED_LCS_LEV_THRESHOLD</code>.</li>
166+
* </ol>
167+
*
168+
* The combined score is calculated as follows:
169+
* <code>(0.6 * Levenshtein similarity) + (0.4 * LCS similarity)</code>
170+
* </p>
118171
*
119172
* @param query The query string to be searched
120-
* @return The conference title, if found. Otherwise, an empty string is returned.
173+
* @return an {@code Optional} conference entry, if found
174+
* or {@code Optional.empty()} if no conference entry is found
121175
*/
122-
private String fuzzySearchConferenceTitles(String query) {
176+
private Optional<ConferenceEntry> fuzzySearchConferenceTitles(String query) {
123177
String bestMatch = "";
124-
double bestSimilarity = 0.0;
178+
double bestScore = 0.0;
179+
String normalizedQuery = ConferenceUtils.normalize(query);
180+
181+
if (normalizedQuery.isEmpty()) {
182+
return Optional.empty();
183+
}
184+
185+
ConferenceEntry acronymConference = acronymToConference.get(normalizedQuery);
186+
if (acronymConference != null) {
187+
return Optional.of(acronymConference);
188+
}
125189

126-
for (String conferenceTitle : titleToConference.keySet()) {
127-
double similarity = MATCHER.similarity(query, conferenceTitle);
128-
if (similarity >= FUZZY_SEARCH_THRESHOLD && similarity > bestSimilarity) {
129-
bestMatch = conferenceTitle;
130-
bestSimilarity = similarity;
190+
acronymConference = normalizedTitleToConference.get(normalizedQuery);
191+
if (acronymConference != null) {
192+
return Optional.of(acronymConference);
193+
}
194+
195+
for (String conferenceTitle : normalizedTitleToConference.keySet()) {
196+
// only match for queries longer than the current conference title
197+
// this will safeguard against overfitting common prefixes
198+
if (normalizedQuery.length() >= conferenceTitle.length()) {
199+
double levSimilarity = LEVENSHTEIN_MATCHER.similarity(normalizedQuery, conferenceTitle);
200+
double LCSSimilarity = StringSimilarity.LCSSimilarity(normalizedQuery, conferenceTitle);
201+
double combinedScore = levSimilarity * 0.6 + LCSSimilarity * 0.4;
202+
boolean exactSubstringMatch = (Math.abs(LCSSimilarity - 1.0) <= EPSILON);
203+
204+
if (exactSubstringMatch) {
205+
return Optional.of(normalizedTitleToConference.get(conferenceTitle));
206+
}
207+
208+
if (levSimilarity >= LEVENSHTEIN_THRESHOLD) {
209+
return Optional.of(normalizedTitleToConference.get(conferenceTitle));
210+
}
211+
212+
if (combinedScore >= COMBINED_LCS_LEV_THRESHOLD && combinedScore >= bestScore) {
213+
bestMatch = conferenceTitle;
214+
bestScore = combinedScore;
215+
LOGGER.debug("Matched query: {} with title: {} with combinedScore: {} and LEV: {} and LCS: {}",
216+
normalizedQuery, conferenceTitle, combinedScore, levSimilarity, LCSSimilarity);
217+
}
131218
}
132219
}
133220

134-
return bestMatch;
221+
if (!bestMatch.isEmpty()) {
222+
return Optional.of(normalizedTitleToConference.get(bestMatch));
223+
}
224+
225+
return Optional.empty();
135226
}
136227
}

0 commit comments

Comments
 (0)