Skip to content

Commit

Permalink
Merge pull request #6664 from daniel-price/new-fetcher-collection-of-…
Browse files Browse the repository at this point in the history
…computer-science-bibliographies

Add Collection of Comp Sci Bibliographies fetcher
  • Loading branch information
Siedlerchr authored Jul 8, 2020
2 parents e7bc5d1 + 8375029 commit 558c810
Show file tree
Hide file tree
Showing 20 changed files with 665 additions and 0 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ Note that this project **does not** adhere to [Semantic Versioning](http://semve

### Added

- We added a new fetcher to enable users to search "[Collection of Computer Science Bibliographies](https://liinwww.ira.uka.de/bibliography/index.html)". [#6638](https://github.com/JabRef/jabref/issues/6638)
- We added default values for delimiters in Add Subgroup window [#6624](https://github.com/JabRef/jabref/issues/6624)
- We improved responsiveness of general fields specification dialog window. [#6643](https://github.com/JabRef/jabref/issues/6604)
- We added support for importing ris file and load DOI [#6530](https://github.com/JabRef/jabref/issues/6530)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
package org.jabref.logic.formatter.bibtexfields;

import java.util.Objects;
import java.util.regex.Pattern;

import org.jabref.logic.l10n.Localization;
import org.jabref.model.cleanup.Formatter;

/**
 * Formatter that deletes every run of digits which directly follows a space character,
 * together with that leading space. Digits that are not preceded by a space are kept.
 */
public class RemoveDigitsFormatter extends Formatter {

    /** A single space followed by one or more digits. */
    private static final Pattern SPACE_THEN_DIGITS = Pattern.compile("[ ]\\d+");

    @Override
    public String getName() {
        return Localization.lang("Remove digits");
    }

    @Override
    public String getKey() {
        return "remove_digits";
    }

    @Override
    public String getDescription() {
        return Localization.lang("Removes digits.");
    }

    @Override
    public String getExampleInput() {
        return "In 012 CDMA";
    }

    /**
     * Removes all " <digits>" occurrences from the given text.
     *
     * @param value the text to clean; must not be null
     * @return the text without the matched space-plus-digits sequences
     * @throws NullPointerException if {@code value} is null
     */
    @Override
    public String format(String value) {
        final String input = Objects.requireNonNull(value);
        return SPACE_THEN_DIGITS.matcher(input).replaceAll("");
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
package org.jabref.logic.formatter.bibtexfields;

import java.util.Objects;
import java.util.regex.Pattern;

import org.jabref.logic.l10n.Localization;
import org.jabref.model.cleanup.Formatter;

/**
 * Finds any occurrence of two or more consecutive space characters and replaces it with a single space.
 * Only the plain space character is considered; tabs and other whitespace are left untouched.
 */
public class RemoveRedundantSpacesFormatter extends Formatter {

    /** Two or more consecutive space characters. */
    private static final Pattern MULTIPLE_SPACES = Pattern.compile(" {2,}");

    @Override
    public String getName() {
        return Localization.lang("Remove redundant spaces");
    }

    @Override
    public String getKey() {
        return "remove_redundant_spaces";
    }

    /**
     * Collapses every run of consecutive spaces in the given text into one space.
     *
     * @param value the text to clean; must not be null
     * @return the text with each run of spaces replaced by a single space
     * @throws NullPointerException if {@code value} is null
     */
    @Override
    public String format(String value) {
        Objects.requireNonNull(value);
        return MULTIPLE_SPACES.matcher(value).replaceAll(" ");
    }

    @Override
    public String getDescription() {
        return Localization.lang("Replaces consecutive spaces with a single space in the field content.");
    }

    @Override
    public String getExampleInput() {
        // The example must contain a run of spaces, otherwise it does not show what the formatter does.
        return "In   CDMA";
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
package org.jabref.logic.formatter.bibtexfields;

import java.util.Objects;
import java.util.regex.Pattern;

import org.jabref.logic.l10n.Localization;
import org.jabref.model.cleanup.Formatter;

/**
 * Formatter that turns each run of one or more consecutive tab characters into a single space.
 */
public class ReplaceTabsBySpaceFormater extends Formatter {

    /** One or more consecutive tab characters. */
    private static final Pattern TAB_RUN = Pattern.compile("\t+");

    @Override
    public String getName() {
        return Localization.lang("Replace tabs with space");
    }

    @Override
    public String getKey() {
        return "remove_tabs";
    }

    @Override
    public String getDescription() {
        return Localization.lang("Replace tabs with space in the field content.");
    }

    @Override
    public String getExampleInput() {
        return "In \t\t CDMA";
    }

    /**
     * Substitutes a single space for every run of tabs in the given text.
     *
     * @param value the text to clean; must not be null
     * @return the text with each tab run replaced by one space
     * @throws NullPointerException if {@code value} is null
     */
    @Override
    public String format(String value) {
        final String input = Objects.requireNonNull(value);
        return TAB_RUN.matcher(input).replaceAll(" ");
    }
}
2 changes: 2 additions & 0 deletions src/main/java/org/jabref/logic/importer/WebFetchers.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import org.jabref.logic.importer.fetcher.ArXiv;
import org.jabref.logic.importer.fetcher.AstrophysicsDataSystem;
import org.jabref.logic.importer.fetcher.CiteSeer;
import org.jabref.logic.importer.fetcher.CollectionOfComputerScienceBibliographiesFetcher;
import org.jabref.logic.importer.fetcher.CompositeSearchBasedFetcher;
import org.jabref.logic.importer.fetcher.CrossRef;
import org.jabref.logic.importer.fetcher.DBLPFetcher;
Expand Down Expand Up @@ -101,6 +102,7 @@ public static SortedSet<SearchBasedFetcher> getSearchBasedFetchers(ImportFormatP
set.add(new DOAJFetcher(importFormatPreferences));
set.add(new IEEE(importFormatPreferences));
set.add(new CompositeSearchBasedFetcher(set, 30));
set.add(new CollectionOfComputerScienceBibliographiesFetcher(importFormatPreferences));
return set;
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
package org.jabref.logic.importer.fetcher;

import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;

import org.jabref.logic.formatter.bibtexfields.RemoveDigitsFormatter;
import org.jabref.logic.formatter.bibtexfields.RemoveNewlinesFormatter;
import org.jabref.logic.formatter.bibtexfields.RemoveRedundantSpacesFormatter;
import org.jabref.logic.formatter.bibtexfields.ReplaceTabsBySpaceFormater;
import org.jabref.logic.importer.FetcherException;
import org.jabref.logic.importer.ImportFormatPreferences;
import org.jabref.logic.importer.Parser;
import org.jabref.logic.importer.SearchBasedParserFetcher;
import org.jabref.model.cleanup.FieldFormatterCleanup;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.field.StandardField;

import org.apache.http.client.utils.URIBuilder;

/**
 * Search-based fetcher for the "Collection of Computer Science Bibliographies".
 * It builds a query URL against the RSS endpoint and delegates parsing of the response to
 * {@link CollectionOfComputerScienceBibliographiesParser}.
 */
public class CollectionOfComputerScienceBibliographiesFetcher implements SearchBasedParserFetcher {

    private static final String BASIC_SEARCH_URL = "http://liinwww.ira.uka.de/bibliography/rss?";

    private final CollectionOfComputerScienceBibliographiesParser parser;

    public CollectionOfComputerScienceBibliographiesFetcher(ImportFormatPreferences importFormatPreferences) {
        this.parser = new CollectionOfComputerScienceBibliographiesParser(importFormatPreferences);
    }

    @Override
    public String getName() {
        return "Collection of Computer Science Bibliographies";
    }

    @Override
    public Parser getParser() {
        return parser;
    }

    /**
     * Builds the search URL for the given query, asking for results sorted by score.
     *
     * @param query the search string, added as the "query" parameter
     * @return the URL to request
     */
    @Override
    public URL getURLForQuery(String query) throws URISyntaxException, MalformedURLException, FetcherException {
        URIBuilder uriBuilder = new URIBuilder(BASIC_SEARCH_URL);
        uriBuilder.addParameter("query", query);
        uriBuilder.addParameter("sort", "score");
        return uriBuilder.build().toURL();
    }

    /**
     * Tidies a fetched entry: the abstract has newlines removed, tabs replaced and redundant spaces
     * collapsed (in that order), and the editor field is run through the digit-removing formatter.
     *
     * @param entry the entry to clean up in place
     */
    @Override
    public void doPostCleanup(BibEntry entry) {
        // Order matters for the abstract: tabs become spaces before runs of spaces are collapsed.
        FieldFormatterCleanup[] cleanups = {
                new FieldFormatterCleanup(StandardField.ABSTRACT, new RemoveNewlinesFormatter()),
                new FieldFormatterCleanup(StandardField.ABSTRACT, new ReplaceTabsBySpaceFormater()),
                new FieldFormatterCleanup(StandardField.ABSTRACT, new RemoveRedundantSpacesFormatter()),
                new FieldFormatterCleanup(StandardField.EDITOR, new RemoveDigitsFormatter())
        };
        for (FieldFormatterCleanup step : cleanups) {
            step.cleanup(entry);
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
package org.jabref.logic.importer.fetcher;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import org.jabref.logic.formatter.bibtexfields.HtmlToUnicodeFormatter;
import org.jabref.logic.importer.ImportFormatPreferences;
import org.jabref.logic.importer.ParseException;
import org.jabref.logic.importer.Parser;
import org.jabref.logic.importer.fileformat.BibtexParser;
import org.jabref.logic.net.URLDownload;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.util.DummyFileUpdateMonitor;

/**
 * Parser for search results of the "Collection of Computer Science Bibliographies".
 * <p>
 * The search response is scanned for {@code <item>} elements; the {@code <link>} of each item is
 * opened and every {@code <pre class="bibtex">} section found there is collected. The collected
 * BibTeX text is then handed to a {@link BibtexParser}.
 */
public class CollectionOfComputerScienceBibliographiesParser implements Parser {

    /** Captures the content of the {@code <link>} element inside each {@code <item>}. */
    static final Pattern REGEX_FOR_LINKS = Pattern.compile("<item>[\\s\\S]*?<link>([\\s\\S]*?)<\\/link>[\\s\\S]*?<\\/item>");
    /** Captures the content of each {@code <pre class="bibtex">} element. */
    static final Pattern REGEX_FOR_BIBTEX = Pattern.compile("<pre class=\"bibtex\">([\\s\\S]*?)<\\/pre>");

    final BibtexParser bibtexParser;
    final HtmlToUnicodeFormatter htmlToUnicodeFormatter;

    public CollectionOfComputerScienceBibliographiesParser(ImportFormatPreferences importFormatPreferences) {
        this.bibtexParser = new BibtexParser(importFormatPreferences, new DummyFileUpdateMonitor());
        this.htmlToUnicodeFormatter = new HtmlToUnicodeFormatter();
    }

    /**
     * Extracts the item links from the given stream, collects the BibTeX sections from each linked
     * page and parses the concatenated BibTeX text.
     *
     * @param inputStream the search response; it is consumed and closed by this method
     * @return the entries parsed from all collected BibTeX sections
     * @throws ParseException if a linked page cannot be read or the BibTeX parser fails
     */
    @Override
    public List<BibEntry> parseEntries(InputStream inputStream) throws ParseException {
        try {
            List<String> links = matchRegexFromInputStreamHtml(inputStream, REGEX_FOR_LINKS);
            // Sections are concatenated without a separator, exactly as they were extracted.
            String bibtexDataString = String.join("", parseBibtexStringsFromLinks(links));

            return bibtexParser.parseEntries(bibtexDataString);
        } catch (IOException e) {
            throw new ParseException(e);
        }
    }

    /**
     * Returns the first capture group of every match of {@code pattern} in the stream, passed
     * through the HTML-to-Unicode formatter. Closing the scanner also closes the stream.
     */
    private List<String> matchRegexFromInputStreamHtml(InputStream inputStream, Pattern pattern) {
        // NOTE(review): this Scanner constructor decodes with the platform default charset - confirm
        // that this matches the encoding the server actually sends.
        try (Scanner scanner = new Scanner(inputStream)) {
            return scanner.findAll(pattern)
                          .map(match -> htmlToUnicodeFormatter.format(match.group(1)))
                          .collect(Collectors.toList());
        }
    }

    /**
     * Opens each link in turn and gathers all BibTeX sections found behind it, in link order.
     *
     * @throws IOException if a link cannot be opened or read
     */
    private List<String> parseBibtexStringsFromLinks(List<String> links) throws IOException {
        List<String> bibtexStringsFromAllLinks = new ArrayList<>();
        for (String link : links) {
            try (InputStream inputStream = new URLDownload(link).asInputStream()) {
                bibtexStringsFromAllLinks.addAll(matchRegexFromInputStreamHtml(inputStream, REGEX_FOR_BIBTEX));
            }
        }

        return bibtexStringsFromAllLinks;
    }
}

7 changes: 7 additions & 0 deletions src/main/resources/l10n/JabRef_en.properties
Original file line number Diff line number Diff line change
Expand Up @@ -2255,3 +2255,10 @@ Reveal\ in\ file\ explorer=Reveal in file explorer
Reset=Reset
Reset\ entry\ types\ and\ fields\ to\ defaults=Reset entry types and fields to defaults
This\ will\ reset\ all\ entry\ types\ to\ their\ default\ values\ and\ remove\ all\ custom\ entry\ types=This will reset all entry types to their default values and remove all custom entry types
Replace\ tabs\ with\ space=Replace tabs with space
Replace\ tabs\ with\ space\ in\ the\ field\ content.=Replace tabs with space in the field content.
Remove\ redundant\ spaces=Remove redundant spaces
Replaces\ consecutive\ spaces\ with\ a\ single\ space\ in\ the\ field\ content.=Replaces consecutive spaces with a single space in the field content.
Remove\ digits=Remove digits
Removes\ digits.=Removes digits.
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
package org.jabref.logic.formatter.bibtexfields;

import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.assertEquals;

/**
 * Tests for {@link RemoveDigitsFormatter}: digit runs preceded by a space are removed together
 * with that space, and text without digits is returned unchanged.
 */
public class RemoveDigitsFormatterTest {

    private RemoveDigitsFormatter formatter;

    @BeforeEach
    public void setUp() {
        formatter = new RemoveDigitsFormatter();
    }

    @Test
    public void removeSingleDigit() {
        assertEquals("one digit", formatter.format("one 1 digit"));
    }

    @Test
    public void removeMultipleDigits() {
        assertEquals("two digits", formatter.format("two 01 digits"));
    }

    @Test
    public void doNothingIfNoDigits() {
        assertEquals("no digits", formatter.format("no digits"));
    }

    @Test
    public void formatExample() {
        assertEquals("In CDMA", formatter.format(formatter.getExampleInput()));
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
package org.jabref.logic.formatter.bibtexfields;

import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.assertEquals;

/**
 * Tests for {@link RemoveRedundantSpacesFormatter}: runs of two or more spaces collapse to one
 * space, and input without such runs is returned unchanged.
 */
public class RemoveRedundantSpacesFormatterTest {

    private RemoveRedundantSpacesFormatter formatter;

    @BeforeEach
    public void setUp() {
        formatter = new RemoveRedundantSpacesFormatter();
    }

    @Test
    public void doNothingIfSingleSpace() {
        assertEquals("single space", formatter.format("single space"));
    }

    @Test
    public void doNothingIfNoSpace() {
        assertEquals("nospace", formatter.format("nospace"));
    }

    @Test
    public void removeAllButOneSpacesIfTwo() {
        // The input really contains two consecutive spaces, so the collapsing path is exercised.
        assertEquals("two spaces", formatter.format("two  spaces"));
    }

    @Test
    public void removeAllButOneSpacesIfThree() {
        // The input really contains three consecutive spaces.
        assertEquals("three spaces", formatter.format("three   spaces"));
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@

package org.jabref.logic.formatter.bibtexfields;

import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.assertEquals;

/**
 * Tests for {@link ReplaceTabsBySpaceFormater}: each run of tabs becomes one space, and input
 * without tabs is returned unchanged.
 */
public class ReplaceTabsBySpaceFormaterTest {

    private ReplaceTabsBySpaceFormater formatter;

    @BeforeEach
    public void setUp() {
        formatter = new ReplaceTabsBySpaceFormater();
    }

    @Test
    public void removeSingleTab() {
        String formatted = formatter.format("single\ttab");
        assertEquals("single tab", formatted);
    }

    @Test
    public void removeMultipleTabs() {
        String formatted = formatter.format("multiple\t\ttabs");
        assertEquals("multiple tabs", formatted);
    }

    @Test
    public void doNothingIfNoTab() {
        String formatted = formatter.format("notab");
        assertEquals("notab", formatted);
    }
}
Loading

0 comments on commit 558c810

Please sign in to comment.