-
-
Notifications
You must be signed in to change notification settings - Fork 2.6k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #6664 from daniel-price/new-fetcher-collection-of-computer-science-bibliographies
Add Collection of Comp Sci Bibliographies fetcher
- Loading branch information
Showing
20 changed files
with
665 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
39 changes: 39 additions & 0 deletions
39
src/main/java/org/jabref/logic/formatter/bibtexfields/RemoveDigitsFormatter.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
package org.jabref.logic.formatter.bibtexfields; | ||
|
||
import java.util.Objects; | ||
import java.util.regex.Pattern; | ||
|
||
import org.jabref.logic.l10n.Localization; | ||
import org.jabref.model.cleanup.Formatter; | ||
|
||
public class RemoveDigitsFormatter extends Formatter { | ||
|
||
private static final Pattern DIGITS = Pattern.compile("[ ]\\d+"); | ||
|
||
@Override | ||
public String getName() { | ||
return Localization.lang("Remove digits"); | ||
} | ||
|
||
@Override | ||
public String getKey() { | ||
return "remove_digits"; | ||
} | ||
|
||
@Override | ||
public String format(String value) { | ||
Objects.requireNonNull(value); | ||
|
||
return DIGITS.matcher(value).replaceAll(""); | ||
} | ||
|
||
@Override | ||
public String getDescription() { | ||
return Localization.lang("Removes digits."); | ||
} | ||
|
||
@Override | ||
public String getExampleInput() { | ||
return "In 012 CDMA"; | ||
} | ||
} |
41 changes: 41 additions & 0 deletions
41
src/main/java/org/jabref/logic/formatter/bibtexfields/RemoveRedundantSpacesFormatter.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
package org.jabref.logic.formatter.bibtexfields; | ||
|
||
import java.util.Objects; | ||
import java.util.regex.Pattern; | ||
|
||
import org.jabref.logic.l10n.Localization; | ||
import org.jabref.model.cleanup.Formatter; | ||
|
||
/** | ||
* Finds any occurrence of consecutive spaces and replaces it with a single space | ||
*/ | ||
public class RemoveRedundantSpacesFormatter extends Formatter { | ||
|
||
private static final Pattern MULTIPLE_SPACES = Pattern.compile(" {2,}"); | ||
|
||
@Override | ||
public String getName() { | ||
return Localization.lang("Remove redundant spaces"); | ||
} | ||
|
||
@Override | ||
public String getKey() { | ||
return "remove_redundant_spaces"; | ||
} | ||
|
||
@Override | ||
public String format(String value) { | ||
Objects.requireNonNull(value); | ||
return MULTIPLE_SPACES.matcher(value).replaceAll(" "); | ||
} | ||
|
||
@Override | ||
public String getDescription() { | ||
return Localization.lang("Replaces consecutive spaces with a single space in the field content."); | ||
} | ||
|
||
@Override | ||
public String getExampleInput() { | ||
return "In CDMA"; | ||
} | ||
} |
41 changes: 41 additions & 0 deletions
41
src/main/java/org/jabref/logic/formatter/bibtexfields/ReplaceTabsBySpaceFormater.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
package org.jabref.logic.formatter.bibtexfields; | ||
|
||
import java.util.Objects; | ||
import java.util.regex.Pattern; | ||
|
||
import org.jabref.logic.l10n.Localization; | ||
import org.jabref.model.cleanup.Formatter; | ||
|
||
/** | ||
* Replaces any tab with a space | ||
*/ | ||
public class ReplaceTabsBySpaceFormater extends Formatter { | ||
|
||
private static final Pattern TAB = Pattern.compile("\t+"); | ||
|
||
@Override | ||
public String getName() { | ||
return Localization.lang("Replace tabs with space"); | ||
} | ||
|
||
@Override | ||
public String getKey() { | ||
return "remove_tabs"; | ||
} | ||
|
||
@Override | ||
public String format(String value) { | ||
Objects.requireNonNull(value); | ||
return TAB.matcher(value).replaceAll(" "); | ||
} | ||
|
||
@Override | ||
public String getDescription() { | ||
return Localization.lang("Replace tabs with space in the field content."); | ||
} | ||
|
||
@Override | ||
public String getExampleInput() { | ||
return "In \t\t CDMA"; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
57 changes: 57 additions & 0 deletions
57
...a/org/jabref/logic/importer/fetcher/CollectionOfComputerScienceBibliographiesFetcher.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
package org.jabref.logic.importer.fetcher; | ||
|
||
import java.net.MalformedURLException; | ||
import java.net.URISyntaxException; | ||
import java.net.URL; | ||
|
||
import org.jabref.logic.formatter.bibtexfields.RemoveDigitsFormatter; | ||
import org.jabref.logic.formatter.bibtexfields.RemoveNewlinesFormatter; | ||
import org.jabref.logic.formatter.bibtexfields.RemoveRedundantSpacesFormatter; | ||
import org.jabref.logic.formatter.bibtexfields.ReplaceTabsBySpaceFormater; | ||
import org.jabref.logic.importer.FetcherException; | ||
import org.jabref.logic.importer.ImportFormatPreferences; | ||
import org.jabref.logic.importer.Parser; | ||
import org.jabref.logic.importer.SearchBasedParserFetcher; | ||
import org.jabref.model.cleanup.FieldFormatterCleanup; | ||
import org.jabref.model.entry.BibEntry; | ||
import org.jabref.model.entry.field.StandardField; | ||
|
||
import org.apache.http.client.utils.URIBuilder; | ||
|
||
public class CollectionOfComputerScienceBibliographiesFetcher implements SearchBasedParserFetcher { | ||
|
||
private static final String BASIC_SEARCH_URL = "http://liinwww.ira.uka.de/bibliography/rss?"; | ||
|
||
private final CollectionOfComputerScienceBibliographiesParser parser; | ||
|
||
public CollectionOfComputerScienceBibliographiesFetcher(ImportFormatPreferences importFormatPreferences) { | ||
this.parser = new CollectionOfComputerScienceBibliographiesParser(importFormatPreferences); | ||
} | ||
|
||
@Override | ||
public URL getURLForQuery(String query) throws URISyntaxException, MalformedURLException, FetcherException { | ||
return new URIBuilder(BASIC_SEARCH_URL) | ||
.addParameter("query", query) | ||
.addParameter("sort", "score") | ||
.build() | ||
.toURL(); | ||
} | ||
|
||
@Override | ||
public Parser getParser() { | ||
return parser; | ||
} | ||
|
||
@Override | ||
public String getName() { | ||
return "Collection of Computer Science Bibliographies"; | ||
} | ||
|
||
@Override | ||
public void doPostCleanup(BibEntry entry) { | ||
new FieldFormatterCleanup(StandardField.ABSTRACT, new RemoveNewlinesFormatter()).cleanup(entry); | ||
new FieldFormatterCleanup(StandardField.ABSTRACT, new ReplaceTabsBySpaceFormater()).cleanup(entry); | ||
new FieldFormatterCleanup(StandardField.ABSTRACT, new RemoveRedundantSpacesFormatter()).cleanup(entry); | ||
new FieldFormatterCleanup(StandardField.EDITOR, new RemoveDigitsFormatter()).cleanup(entry); | ||
} | ||
} |
67 changes: 67 additions & 0 deletions
67
...va/org/jabref/logic/importer/fetcher/CollectionOfComputerScienceBibliographiesParser.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
package org.jabref.logic.importer.fetcher; | ||
|
||
import java.io.IOException; | ||
import java.io.InputStream; | ||
import java.util.ArrayList; | ||
import java.util.List; | ||
import java.util.Scanner; | ||
import java.util.regex.Pattern; | ||
import java.util.stream.Collectors; | ||
|
||
import org.jabref.logic.formatter.bibtexfields.HtmlToUnicodeFormatter; | ||
import org.jabref.logic.importer.ImportFormatPreferences; | ||
import org.jabref.logic.importer.ParseException; | ||
import org.jabref.logic.importer.Parser; | ||
import org.jabref.logic.importer.fileformat.BibtexParser; | ||
import org.jabref.logic.net.URLDownload; | ||
import org.jabref.model.entry.BibEntry; | ||
import org.jabref.model.util.DummyFileUpdateMonitor; | ||
|
||
public class CollectionOfComputerScienceBibliographiesParser implements Parser { | ||
|
||
final static Pattern REGEX_FOR_LINKS = Pattern.compile("<item>[\\s\\S]*?<link>([\\s\\S]*?)<\\/link>[\\s\\S]*?<\\/item>"); | ||
final static Pattern REGEX_FOR_BIBTEX = Pattern.compile("<pre class=\"bibtex\">([\\s\\S]*?)<\\/pre>"); | ||
|
||
final BibtexParser bibtexParser; | ||
final HtmlToUnicodeFormatter htmlToUnicodeFormatter; | ||
|
||
public CollectionOfComputerScienceBibliographiesParser(ImportFormatPreferences importFormatPreferences) { | ||
this.bibtexParser = new BibtexParser(importFormatPreferences, new DummyFileUpdateMonitor()); | ||
this.htmlToUnicodeFormatter = new HtmlToUnicodeFormatter(); | ||
} | ||
|
||
@Override | ||
public List<BibEntry> parseEntries(InputStream inputStream) throws ParseException { | ||
try { | ||
List<String> links = matchRegexFromInputStreamHtml(inputStream, REGEX_FOR_LINKS); | ||
String bibtexDataString = parseBibtexStringsFromLinks(links) | ||
.stream() | ||
.collect(Collectors.joining()); | ||
|
||
return bibtexParser.parseEntries(bibtexDataString); | ||
} catch (IOException e) { | ||
throw new ParseException(e); | ||
} | ||
} | ||
|
||
private List<String> matchRegexFromInputStreamHtml(InputStream inputStream, Pattern pattern) { | ||
try (Scanner scanner = new Scanner(inputStream)) { | ||
return scanner.findAll(pattern) | ||
.map(match -> htmlToUnicodeFormatter.format(match.group(1))) | ||
.collect(Collectors.toList()); | ||
} | ||
} | ||
|
||
private List<String> parseBibtexStringsFromLinks(List<String> links) throws IOException { | ||
List<String> bibtexStringsFromAllLinks = new ArrayList(); | ||
for (String link : links) { | ||
try (InputStream inputStream = new URLDownload(link).asInputStream()) { | ||
List<String> bibtexStringsFromLink = matchRegexFromInputStreamHtml(inputStream, REGEX_FOR_BIBTEX); | ||
bibtexStringsFromAllLinks.addAll(bibtexStringsFromLink); | ||
} | ||
} | ||
|
||
return bibtexStringsFromAllLinks; | ||
} | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
31 changes: 31 additions & 0 deletions
31
src/test/java/org/jabref/logic/formatter/bibtexfields/RemoveDigitsFormatterTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
package org.jabref.logic.formatter.bibtexfields; | ||
|
||
import org.junit.jupiter.api.BeforeEach; | ||
import org.junit.jupiter.api.Test; | ||
|
||
import static org.junit.jupiter.api.Assertions.assertEquals; | ||
|
||
public class RemoveDigitsFormatterTest { | ||
|
||
private RemoveDigitsFormatter formatter; | ||
|
||
@BeforeEach | ||
public void setUp() { | ||
formatter = new RemoveDigitsFormatter(); | ||
} | ||
|
||
@Test | ||
public void doNothingIfSingleSpace() { | ||
assertEquals("one digit", formatter.format("one 1 digit")); | ||
} | ||
|
||
@Test | ||
public void doNothingIfNoSpace() { | ||
assertEquals("two digits", formatter.format("two 01 digits")); | ||
} | ||
|
||
@Test | ||
public void removeAllButOneSpacesIfTwo() { | ||
assertEquals("no digits", formatter.format("no digits")); | ||
} | ||
} |
36 changes: 36 additions & 0 deletions
36
...test/java/org/jabref/logic/formatter/bibtexfields/RemoveRedundantSpacesFormatterTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
package org.jabref.logic.formatter.bibtexfields; | ||
|
||
import org.junit.jupiter.api.BeforeEach; | ||
import org.junit.jupiter.api.Test; | ||
|
||
import static org.junit.jupiter.api.Assertions.assertEquals; | ||
|
||
public class RemoveRedundantSpacesFormatterTest { | ||
|
||
private RemoveRedundantSpacesFormatter formatter; | ||
|
||
@BeforeEach | ||
public void setUp() { | ||
formatter = new RemoveRedundantSpacesFormatter(); | ||
} | ||
|
||
@Test | ||
public void doNothingIfSingleSpace() { | ||
assertEquals("single space", formatter.format("single space")); | ||
} | ||
|
||
@Test | ||
public void doNothingIfNoSpace() { | ||
assertEquals("nospace", formatter.format("nospace")); | ||
} | ||
|
||
@Test | ||
public void removeAllButOneSpacesIfTwo() { | ||
assertEquals("two spaces", formatter.format("two spaces")); | ||
} | ||
|
||
@Test | ||
public void removeAllButOneSpacesIfThree() { | ||
assertEquals("three spaces", formatter.format("three spaces")); | ||
} | ||
} |
32 changes: 32 additions & 0 deletions
32
src/test/java/org/jabref/logic/formatter/bibtexfields/ReplaceTabsBySpaceFormaterTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
|
||
package org.jabref.logic.formatter.bibtexfields; | ||
|
||
import org.junit.jupiter.api.BeforeEach; | ||
import org.junit.jupiter.api.Test; | ||
|
||
import static org.junit.jupiter.api.Assertions.assertEquals; | ||
|
||
public class ReplaceTabsBySpaceFormaterTest { | ||
|
||
private ReplaceTabsBySpaceFormater formatter; | ||
|
||
@BeforeEach | ||
public void setUp() { | ||
formatter = new ReplaceTabsBySpaceFormater(); | ||
} | ||
|
||
@Test | ||
public void removeSingleTab() { | ||
assertEquals("single tab", formatter.format("single\ttab")); | ||
} | ||
|
||
@Test | ||
public void removeMultipleTabs() { | ||
assertEquals("multiple tabs", formatter.format("multiple\t\ttabs")); | ||
} | ||
|
||
@Test | ||
public void doNothingIfNoTab() { | ||
assertEquals("notab", formatter.format("notab")); | ||
} | ||
} |
Oops, something went wrong.