|
7 | 7 | import java.util.HashMap; |
8 | 8 | import java.util.Map; |
9 | 9 | import java.util.Optional; |
| 10 | +import java.util.Set; |
10 | 11 |
|
11 | 12 | import org.jabref.logic.JabRefException; |
12 | 13 | import org.jabref.logic.util.strings.StringSimilarity; |
13 | 14 | import org.jabref.model.icore.ConferenceEntry; |
14 | 15 |
|
15 | 16 | import org.apache.commons.csv.CSVFormat; |
16 | 17 | import org.apache.commons.csv.CSVRecord; |
| 18 | +import org.jspecify.annotations.NonNull; |
17 | 19 | import org.slf4j.Logger; |
18 | 20 | import org.slf4j.LoggerFactory; |
19 | 21 |
|
|
25 | 27 | * Since the website does not expose an API endpoint to fetch this data programmatically, it must be manually exported |
26 | 28 | * from the website and stored as a resource. This means that when new ranking data is released, the old data must be |
27 | 29 | * replaced and the <code>ICORE_RANK_DATA_FILE</code> variable must be modified to point to the new data file. |
| 30 | + * </p> |
28 | 31 | */ |
29 | 32 | public class ConferenceRepository { |
30 | 33 | private static final Logger LOGGER = LoggerFactory.getLogger(ConferenceRepository.class); |
31 | 34 | private static final String ICORE_RANK_DATA_FILE = "/icore/ICORE2023.csv"; |
32 | | - private static final double FUZZY_SEARCH_THRESHOLD = 0.9; |
33 | | - private static final StringSimilarity MATCHER = new StringSimilarity(); |
| 35 | + private static final double LEVENSHTEIN_THRESHOLD = 0.9; |
| 36 | + private static final double COMBINED_LCS_LEV_THRESHOLD = 0.75; |
| 37 | + private static final double EPSILON = 1e-6; |
| 38 | + private static final StringSimilarity LEVENSHTEIN_MATCHER = new StringSimilarity(); |
34 | 39 |
|
35 | 40 | private final Map<String, ConferenceEntry> acronymToConference = new HashMap<>(); |
36 | 41 | private final Map<String, ConferenceEntry> titleToConference = new HashMap<>(); |
| 42 | + private final Map<String, ConferenceEntry> normalizedTitleToConference = new HashMap<>(); |
| 43 | + private int maxAcronymLength = 0; |
37 | 44 |
|
38 | 45 | public ConferenceRepository() throws JabRefException { |
39 | 46 | InputStream inputStream = getClass().getResourceAsStream(ICORE_RANK_DATA_FILE); |
@@ -63,74 +70,158 @@ private void loadConferenceDataFromInputStream(InputStream inputStream) throws J |
63 | 70 | for (CSVRecord record : records) { |
64 | 71 | String id = record.get("Id").strip(); |
65 | 72 | String title = record.get("Title").strip().toLowerCase(); |
66 | | - String acronym = record.get("Acronym").strip().toUpperCase(); |
| 73 | + String acronym = record.get("Acronym").strip().toLowerCase(); |
67 | 74 | String rank = record.get("Rank").strip(); |
68 | 75 |
|
69 | 76 | if (id.isEmpty() || title.isEmpty() || acronym.isEmpty() || rank.isEmpty()) { |
70 | 77 | LOGGER.warn("Missing fields in row in ICORE rank data: {}", record); |
71 | 78 | continue; |
72 | 79 | } |
73 | 80 |
|
| 81 | + if (title.indexOf('(') >= 0) { |
| 82 | + // remove any extra alias strings in parentheses |
| 83 | + title = ConferenceUtils.removeAllParenthesesWithContent(title); |
| 84 | + } |
74 | 85 | ConferenceEntry conferenceEntry = new ConferenceEntry(id, title, acronym, rank); |
75 | 86 | acronymToConference.put(acronym, conferenceEntry); |
76 | 87 | titleToConference.put(title, conferenceEntry); |
| 88 | + normalizedTitleToConference.put(ConferenceUtils.normalize(title), conferenceEntry); |
| 89 | + if (acronym.length() >= maxAcronymLength) { |
| 90 | + maxAcronymLength = acronym.length(); |
| 91 | + } |
77 | 92 | } |
78 | 93 | } catch (IOException e) { |
79 | 94 | throw new JabRefException("I/O Error while reading ICORE data from resource", e); |
80 | 95 | } |
| 96 | + LOGGER.debug("Max acronym length seen in data: {}", maxAcronymLength); |
81 | 97 | } |
82 | 98 |
|
83 | | - public Optional<ConferenceEntry> getConferenceFromAcronym(String acronym) { |
84 | | - String query = acronym.strip().toUpperCase(); |
85 | | - |
86 | | - ConferenceEntry conference = acronymToConference.get(query); |
| 99 | + /** |
| 100 | + * Searches the given query against the ICORE conference ranking data and returns a match, if found. |
| 101 | + * <p> |
| 102 | + * The stripped, lower-cased query is first looked up directly as an acronym and then as a title. If both fail, we |
| 103 | + * look for a conference acronym inside parentheses, like <code>(ICSE)</code> or <code>(ICSE 2022)</code>. If that |
| 104 | + * lookup also fails, the query is normalized and fuzzily matched against the list of conference titles. |
| 105 | + * </p> |
| 106 | + * |
| 107 | + * @implNote see {@link ConferenceRepository#fuzzySearchConferenceTitles} for more details on matching |
| 108 | + * @param bookTitle the string to search, must not be {@code null} |
| 109 | + * @return an {@code Optional} conference entry, if found |
| 110 | + * or {@code Optional.empty()} if no conference entry is found |
| 111 | + */ |
| 112 | + public Optional<ConferenceEntry> getConferenceFromBookTitle(@NonNull String bookTitle) { |
| 113 | + String query = bookTitle.strip().toLowerCase(); |
| 114 | + ConferenceEntry conference; |
87 | 115 |
|
88 | | - if (conference == null) { |
89 | | - return Optional.empty(); |
| 116 | + conference = acronymToConference.get(query); |
| 117 | + if (conference != null) { |
| 118 | + return Optional.of(conference); |
90 | 119 | } |
91 | 120 |
|
92 | | - return Optional.of(conference); |
93 | | - } |
94 | | - |
95 | | - public Optional<ConferenceEntry> getConferenceFromBookTitle(String bookTitle) { |
96 | | - String query = bookTitle.strip().toLowerCase(); |
97 | | - |
98 | | - ConferenceEntry conference = titleToConference.get(query); |
| 121 | + conference = titleToConference.get(query); |
99 | 122 | if (conference != null) { |
100 | 123 | return Optional.of(conference); |
101 | 124 | } |
102 | 125 |
|
103 | | - String bestMatch = fuzzySearchConferenceTitles(query); |
104 | | - if (bestMatch.isEmpty()) { |
105 | | - return Optional.empty(); |
| 126 | + Optional<ConferenceEntry> acronymConference = getConferenceFromAcronym(query); |
| 127 | + |
| 128 | + if (acronymConference.isPresent()) { |
| 129 | + return acronymConference; |
106 | 130 | } |
107 | 131 |
|
108 | | - conference = titleToConference.get(bestMatch); |
| 132 | + return fuzzySearchConferenceTitles(query); |
| 133 | + } |
109 | 134 |
|
110 | | - return Optional.of(conference); |
| 135 | + private Optional<ConferenceEntry> getConferenceFromAcronym(String query) { |
| 136 | + Optional<String> acronym = ConferenceUtils.extractStringFromParentheses(query); |
| 137 | + |
| 138 | + if (acronym.isPresent()) { |
| 139 | + ConferenceEntry conference; |
| 140 | + Set<String> acronymCandidates = ConferenceUtils.generateAcronymCandidates(acronym.get(), maxAcronymLength); |
| 141 | + LOGGER.debug("Extracted acronym string: {}, Acronym candidates: {}", acronym.get(), acronymCandidates); |
| 142 | + for (String candidate : acronymCandidates) { |
| 143 | + conference = acronymToConference.get(candidate); |
| 144 | + if (conference != null) { |
| 145 | + return Optional.of(conference); |
| 146 | + } |
| 147 | + } |
| 148 | + } |
| 149 | + return Optional.empty(); |
111 | 150 | } |
112 | 151 |
|
113 | 152 | /** |
114 | | - * Searches all conference titles for the given query string using {@link StringSimilarity#similarity} as a MATCHER. |
| 153 | + * Searches the conference data for the given query string using a combination of Levenshtein similarity |
| 154 | + * {@link StringSimilarity#similarity} and Longest Common Substring (LCS) similarity {@link StringSimilarity#LCSSimilarity}. |
| 155 | + * <p> |
| 156 | + * The input query is first fed through the normalizer at {@link ConferenceUtils#normalize} which strips away much of the |
| 157 | + * noise. |
| 158 | + * </p> |
115 | 159 | * <p> |
116 | | - * The threshold for matching is set at 0.9. This function will always return the conference title with the highest |
117 | | - * similarity rating. |
| 160 | + * While searching, the function computes Levenshtein similarity and LCS similarity between the query and the current conference |
| 161 | + * title (also normalized) and prioritizes them in the following order: |
| 162 | + * <ol> |
| 163 | + * <li>Whenever LCS similarity returns <code>1.0</code>, i.e., a conference title is found entirely as a substring in the query.</li> |
| 164 | + * <li>Whenever Levenshtein similarity is at least the threshold defined by the <code>LEVENSHTEIN_THRESHOLD</code> constant.</li> |
| 165 | + * <li>Whenever the combined weighted score of both LCS and Levenshtein similarities is at least <code>COMBINED_LCS_LEV_THRESHOLD</code>; among such candidates the highest-scoring title is returned.</li> |
| 166 | + * </ol> |
| 167 | + * |
| 168 | + * The combined score is calculated as follows: |
| 169 | + * <code>(0.6 * Levenshtein similarity) + (0.4 * LCS similarity)</code> |
| 170 | + * </p> |
118 | 171 | * |
119 | 172 | * @param query The query string to be searched |
120 | | - * @return The conference title, if found. Otherwise, an empty string is returned. |
| 173 | + * @return an {@code Optional} conference entry, if found |
| 174 | + * or {@code Optional.empty()} if no conference entry is found |
121 | 175 | */ |
122 | | - private String fuzzySearchConferenceTitles(String query) { |
| 176 | + private Optional<ConferenceEntry> fuzzySearchConferenceTitles(String query) { |
123 | 177 | String bestMatch = ""; |
124 | | - double bestSimilarity = 0.0; |
| 178 | + double bestScore = 0.0; |
| 179 | + String normalizedQuery = ConferenceUtils.normalize(query); |
| 180 | + |
| 181 | + if (normalizedQuery.isEmpty()) { |
| 182 | + return Optional.empty(); |
| 183 | + } |
| 184 | + |
| 185 | + ConferenceEntry acronymConference = acronymToConference.get(normalizedQuery); |
| 186 | + if (acronymConference != null) { |
| 187 | + return Optional.of(acronymConference); |
| 188 | + } |
125 | 189 |
|
126 | | - for (String conferenceTitle : titleToConference.keySet()) { |
127 | | - double similarity = MATCHER.similarity(query, conferenceTitle); |
128 | | - if (similarity >= FUZZY_SEARCH_THRESHOLD && similarity > bestSimilarity) { |
129 | | - bestMatch = conferenceTitle; |
130 | | - bestSimilarity = similarity; |
| 190 | + acronymConference = normalizedTitleToConference.get(normalizedQuery); |
| 191 | + if (acronymConference != null) { |
| 192 | + return Optional.of(acronymConference); |
| 193 | + } |
| 194 | + |
| 195 | + for (String conferenceTitle : normalizedTitleToConference.keySet()) { |
| 196 | + // only match queries that are at least as long as the current conference title |
| 197 | + // this will safeguard against overfitting common prefixes |
| 198 | + if (normalizedQuery.length() >= conferenceTitle.length()) { |
| 199 | + double levSimilarity = LEVENSHTEIN_MATCHER.similarity(normalizedQuery, conferenceTitle); |
| 200 | + double LCSSimilarity = StringSimilarity.LCSSimilarity(normalizedQuery, conferenceTitle); |
| 201 | + double combinedScore = levSimilarity * 0.6 + LCSSimilarity * 0.4; |
| 202 | + boolean exactSubstringMatch = (Math.abs(LCSSimilarity - 1.0) <= EPSILON); |
| 203 | + |
| 204 | + if (exactSubstringMatch) { |
| 205 | + return Optional.of(normalizedTitleToConference.get(conferenceTitle)); |
| 206 | + } |
| 207 | + |
| 208 | + if (levSimilarity >= LEVENSHTEIN_THRESHOLD) { |
| 209 | + return Optional.of(normalizedTitleToConference.get(conferenceTitle)); |
| 210 | + } |
| 211 | + |
| 212 | + if (combinedScore >= COMBINED_LCS_LEV_THRESHOLD && combinedScore >= bestScore) { |
| 213 | + bestMatch = conferenceTitle; |
| 214 | + bestScore = combinedScore; |
| 215 | + LOGGER.debug("Matched query: {} with title: {} with combinedScore: {} and LEV: {} and LCS: {}", |
| 216 | + normalizedQuery, conferenceTitle, combinedScore, levSimilarity, LCSSimilarity); |
| 217 | + } |
131 | 218 | } |
132 | 219 | } |
133 | 220 |
|
134 | | - return bestMatch; |
| 221 | + if (!bestMatch.isEmpty()) { |
| 222 | + return Optional.of(normalizedTitleToConference.get(bestMatch)); |
| 223 | + } |
| 224 | + |
| 225 | + return Optional.empty(); |
135 | 226 | } |
136 | 227 | } |
0 commit comments