Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Collection of Comp Sci Bibliographies fetcher #6664

Merged
merged 18 commits into from
Jul 8, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ Note that this project **does not** adhere to [Semantic Versioning](http://semve

### Added

- We added a new fetcher to enable users to search "[Collection of Computer Science Bibliographies](https://liinwww.ira.uka.de/bibliography/index.html)". [#6638](https://github.com/JabRef/jabref/issues/6638)
- We added default values for delimiters in Add Subgroup window [#6624](https://github.com/JabRef/jabref/issues/6624)
- We improved responsiveness of general fields specification dialog window. [#6643](https://github.com/JabRef/jabref/issues/6604)
- We added support for importing ris file and load DOI [#6530](https://github.com/JabRef/jabref/issues/6530)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
package org.jabref.logic.formatter.bibtexfields;

import java.util.Objects;
import java.util.regex.Pattern;

import org.jabref.logic.l10n.Localization;
import org.jabref.model.cleanup.Formatter;

/**
 * Strips every run of digits that directly follows a single space character,
 * together with that leading space. Digits not preceded by a space are left untouched.
 */
public class RemoveDigitsFormatter extends Formatter {

    /** One space followed by one or more digits. */
    private static final Pattern SPACE_THEN_DIGITS = Pattern.compile("[ ]\\d+");

    @Override
    public String getKey() {
        return "remove_digits";
    }

    @Override
    public String getName() {
        return Localization.lang("Remove digits");
    }

    @Override
    public String getDescription() {
        return Localization.lang("Removes digits.");
    }

    @Override
    public String getExampleInput() {
        return "In 012 CDMA";
    }

    /**
     * Deletes each " &lt;digits&gt;" occurrence from the given text.
     *
     * @param value the text to clean; must not be {@code null}
     * @return the text with all space-prefixed digit runs removed
     * @throws NullPointerException if {@code value} is {@code null}
     */
    @Override
    public String format(String value) {
        return SPACE_THEN_DIGITS.matcher(Objects.requireNonNull(value)).replaceAll("");
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
package org.jabref.logic.formatter.bibtexfields;

import java.util.Objects;
import java.util.regex.Pattern;

import org.jabref.logic.l10n.Localization;
import org.jabref.model.cleanup.Formatter;

/**
* Finds any occurrence of consecutive spaces and replaces it with a single space
*/
public class RemoveRedundantSpacesFormatter extends Formatter {

private static final Pattern MULTIPLE_SPACES = Pattern.compile(" {2,}");

@Override
public String getName() {
return Localization.lang("Remove redundant spaces");
}

@Override
public String getKey() {
return "remove_redundant_spaces";
}

@Override
public String format(String value) {
Objects.requireNonNull(value);
return MULTIPLE_SPACES.matcher(value).replaceAll(" ");
}

@Override
public String getDescription() {
return Localization.lang("Replaces consecutive spaces with a single space in the field content.");
daniel-price marked this conversation as resolved.
Show resolved Hide resolved
}

@Override
public String getExampleInput() {
return "In CDMA";
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
package org.jabref.logic.formatter.bibtexfields;

import java.util.Objects;
import java.util.regex.Pattern;

import org.jabref.logic.l10n.Localization;
import org.jabref.model.cleanup.Formatter;

/**
* Replaces any tab with a space
*/
public class ReplaceTabsBySpaceFormater extends Formatter {

private static final Pattern TAB = Pattern.compile("\t+");

@Override
public String getName() {
return Localization.lang("Replace tabs with space");
daniel-price marked this conversation as resolved.
Show resolved Hide resolved
}

@Override
public String getKey() {
return "remove_tabs";
}

@Override
public String format(String value) {
Objects.requireNonNull(value);
return TAB.matcher(value).replaceAll(" ");
}

@Override
public String getDescription() {
return Localization.lang("Replace tabs with space in the field content.");
daniel-price marked this conversation as resolved.
Show resolved Hide resolved
}

@Override
public String getExampleInput() {
return "In \t\t CDMA";
}
}
2 changes: 2 additions & 0 deletions src/main/java/org/jabref/logic/importer/WebFetchers.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import org.jabref.logic.importer.fetcher.ArXiv;
import org.jabref.logic.importer.fetcher.AstrophysicsDataSystem;
import org.jabref.logic.importer.fetcher.CiteSeer;
import org.jabref.logic.importer.fetcher.CollectionOfComputerScienceBibliographiesFetcher;
import org.jabref.logic.importer.fetcher.CompositeSearchBasedFetcher;
import org.jabref.logic.importer.fetcher.CrossRef;
import org.jabref.logic.importer.fetcher.DBLPFetcher;
Expand Down Expand Up @@ -101,6 +102,7 @@ public static SortedSet<SearchBasedFetcher> getSearchBasedFetchers(ImportFormatP
set.add(new DOAJFetcher(importFormatPreferences));
set.add(new IEEE(importFormatPreferences));
set.add(new CompositeSearchBasedFetcher(set, 30));
set.add(new CollectionOfComputerScienceBibliographiesFetcher(importFormatPreferences));
return set;
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
package org.jabref.logic.importer.fetcher;

import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;

import org.jabref.logic.formatter.bibtexfields.RemoveDigitsFormatter;
import org.jabref.logic.formatter.bibtexfields.RemoveNewlinesFormatter;
import org.jabref.logic.formatter.bibtexfields.RemoveRedundantSpacesFormatter;
import org.jabref.logic.formatter.bibtexfields.ReplaceTabsBySpaceFormater;
import org.jabref.logic.importer.FetcherException;
import org.jabref.logic.importer.ImportFormatPreferences;
import org.jabref.logic.importer.Parser;
import org.jabref.logic.importer.SearchBasedParserFetcher;
import org.jabref.model.cleanup.FieldFormatterCleanup;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.field.StandardField;

import org.apache.http.client.utils.URIBuilder;

public class CollectionOfComputerScienceBibliographiesFetcher implements SearchBasedParserFetcher {

private static final String BASIC_SEARCH_URL = "http://liinwww.ira.uka.de/bibliography/rss?";
daniel-price marked this conversation as resolved.
Show resolved Hide resolved

private final CollectionOfComputerScienceBibliographiesParser parser;

public CollectionOfComputerScienceBibliographiesFetcher(ImportFormatPreferences importFormatPreferences) {
this.parser = new CollectionOfComputerScienceBibliographiesParser(importFormatPreferences);
}

@Override
public URL getURLForQuery(String query) throws URISyntaxException, MalformedURLException, FetcherException {
return new URIBuilder(BASIC_SEARCH_URL)
.addParameter("query", query)
.addParameter("sort", "score")
.build()
.toURL();
}

@Override
public Parser getParser() {
return parser;
}

@Override
public String getName() {
return "Collection of Computer Science Bibliographies";
}

@Override
public void doPostCleanup(BibEntry entry) {
new FieldFormatterCleanup(StandardField.ABSTRACT, new RemoveNewlinesFormatter()).cleanup(entry);
new FieldFormatterCleanup(StandardField.ABSTRACT, new ReplaceTabsBySpaceFormater()).cleanup(entry);
new FieldFormatterCleanup(StandardField.ABSTRACT, new RemoveRedundantSpacesFormatter()).cleanup(entry);
new FieldFormatterCleanup(StandardField.EDITOR, new RemoveDigitsFormatter()).cleanup(entry);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
package org.jabref.logic.importer.fetcher;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import org.jabref.logic.formatter.bibtexfields.HtmlToUnicodeFormatter;
import org.jabref.logic.importer.ImportFormatPreferences;
import org.jabref.logic.importer.ParseException;
import org.jabref.logic.importer.Parser;
import org.jabref.logic.importer.fileformat.BibtexParser;
import org.jabref.logic.net.URLDownload;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.util.DummyFileUpdateMonitor;

/**
 * Parses the search response of the "Collection of Computer Science Bibliographies".
 * The response is scanned for {@code <item>} links; every linked page is then opened and
 * the contents of its {@code <pre class="bibtex">} blocks are handed to a {@link BibtexParser}.
 */
public class CollectionOfComputerScienceBibliographiesParser implements Parser {

    /** Captures the text of the {@code <link>} element inside each {@code <item>}. */
    static final Pattern REGEX_FOR_LINKS = Pattern.compile("<item>[\\s\\S]*?<link>([\\s\\S]*?)<\\/link>[\\s\\S]*?<\\/item>");
    /** Captures the contents of each {@code <pre class="bibtex">} block. */
    static final Pattern REGEX_FOR_BIBTEX = Pattern.compile("<pre class=\"bibtex\">([\\s\\S]*?)<\\/pre>");

    final BibtexParser bibtexParser;
    final HtmlToUnicodeFormatter htmlToUnicodeFormatter;

    /**
     * @param importFormatPreferences preferences handed on to the underlying {@link BibtexParser}
     */
    public CollectionOfComputerScienceBibliographiesParser(ImportFormatPreferences importFormatPreferences) {
        this.bibtexParser = new BibtexParser(importFormatPreferences, new DummyFileUpdateMonitor());
        this.htmlToUnicodeFormatter = new HtmlToUnicodeFormatter();
    }

    /**
     * Extracts all result links from the search response, collects the BibTeX snippets found
     * behind each link and parses their concatenation.
     *
     * @param inputStream the search response to scan for result links
     * @return the entries parsed from the collected BibTeX text
     * @throws ParseException if opening or reading one of the linked pages fails with an
     *                        {@link IOException}, or if the BibTeX parser itself fails
     */
    @Override
    public List<BibEntry> parseEntries(InputStream inputStream) throws ParseException {
        try {
            List<String> links = matchRegexFromInputStreamHtml(inputStream, REGEX_FOR_LINKS);
            // Snippets are concatenated without a separator, exactly as they were extracted.
            String bibtexDataString = String.join("", parseBibtexStringsFromLinks(links));

            return bibtexParser.parseEntries(bibtexDataString);
        } catch (IOException e) {
            throw new ParseException(e);
        }
    }

    /**
     * Returns the first capture group of every match of {@code pattern} in the stream,
     * each passed through the HTML-to-Unicode formatter. The stream is closed afterwards
     * because closing the scanner closes its source.
     */
    private List<String> matchRegexFromInputStreamHtml(InputStream inputStream, Pattern pattern) {
        // NOTE(review): Scanner decodes with the platform default charset here — confirm this matches the site's encoding.
        try (Scanner scanner = new Scanner(inputStream)) {
            return scanner.findAll(pattern)
                          .map(match -> htmlToUnicodeFormatter.format(match.group(1)))
                          .collect(Collectors.toList());
        }
    }

    /**
     * Opens each link in turn and gathers every BibTeX snippet found on the linked page,
     * preserving link order.
     *
     * @throws IOException if a linked page cannot be opened or its stream fails on close
     */
    private List<String> parseBibtexStringsFromLinks(List<String> links) throws IOException {
        List<String> bibtexStringsFromAllLinks = new ArrayList<>();
        for (String link : links) {
            try (InputStream inputStream = new URLDownload(link).asInputStream()) {
                List<String> bibtexStringsFromLink = matchRegexFromInputStreamHtml(inputStream, REGEX_FOR_BIBTEX);
                bibtexStringsFromAllLinks.addAll(bibtexStringsFromLink);
            }
        }

        return bibtexStringsFromAllLinks;
    }
}

7 changes: 7 additions & 0 deletions src/main/resources/l10n/JabRef_en.properties
Original file line number Diff line number Diff line change
Expand Up @@ -2255,3 +2255,10 @@ Reveal\ in\ file\ explorer=Reveal in file explorer
Reset=Reset
Reset\ entry\ types\ and\ fields\ to\ defaults=Reset entry types and fields to defaults
This\ will\ reset\ all\ entry\ types\ to\ their\ default\ values\ and\ remove\ all\ custom\ entry\ types=This will reset all entry types to their default values and remove all custom entry types

Replace\ tabs\ with\ space=Replace tabs with space
Replace\ tabs\ with\ space\ in\ the\ field\ content.=Replace tabs with space in the field content.
Remove\ redundant\ spaces=Remove redundant spaces
Replaces\ consecutive\ spaces\ with\ a\ single\ space\ in\ the\ field\ content.=Replaces consecutive spaces with a single space in the field content.
Remove\ digits=Remove digits
Removes\ digits.=Removes digits.
daniel-price marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
package org.jabref.logic.formatter.bibtexfields;

import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.assertEquals;

/**
 * Tests for {@link RemoveDigitsFormatter}: space-prefixed digit runs are removed,
 * text without digits is left unchanged.
 */
public class RemoveDigitsFormatterTest {

    private RemoveDigitsFormatter formatter;

    @BeforeEach
    public void setUp() {
        formatter = new RemoveDigitsFormatter();
    }

    /** A single digit preceded by a space is removed together with that space. */
    @Test
    public void removeSingleDigit() {
        assertEquals("one digit", formatter.format("one 1 digit"));
    }

    /** A run of several digits preceded by a space is removed as a whole. */
    @Test
    public void removeMultipleDigits() {
        assertEquals("two digits", formatter.format("two 01 digits"));
    }

    /** Input without any digits comes back unchanged. */
    @Test
    public void doNothingIfNoDigits() {
        assertEquals("no digits", formatter.format("no digits"));
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
package org.jabref.logic.formatter.bibtexfields;

import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.assertEquals;

/**
 * Tests for {@link RemoveRedundantSpacesFormatter}: runs of spaces collapse to one space,
 * while single spaces and space-free text are left unchanged.
 */
public class RemoveRedundantSpacesFormatterTest {

    private RemoveRedundantSpacesFormatter formatter;

    @BeforeEach
    public void setUp() {
        formatter = new RemoveRedundantSpacesFormatter();
    }

    @Test
    public void doNothingIfSingleSpace() {
        assertEquals("single space", formatter.format("single space"));
    }

    @Test
    public void doNothingIfNoSpace() {
        assertEquals("nospace", formatter.format("nospace"));
    }

    /** The input must really contain two spaces, otherwise nothing is collapsed. */
    @Test
    public void removeAllButOneSpacesIfTwo() {
        assertEquals("two spaces", formatter.format("two  spaces"));
    }

    /** The input must really contain three spaces, otherwise nothing is collapsed. */
    @Test
    public void removeAllButOneSpacesIfThree() {
        assertEquals("three spaces", formatter.format("three   spaces"));
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@

package org.jabref.logic.formatter.bibtexfields;

import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.assertEquals;

/**
 * Tests for {@link ReplaceTabsBySpaceFormater}: a tab or a run of tabs becomes one space,
 * and tab-free text is returned unchanged.
 */
public class ReplaceTabsBySpaceFormaterTest {

    private ReplaceTabsBySpaceFormater tabFormatter;

    @BeforeEach
    public void setUp() {
        this.tabFormatter = new ReplaceTabsBySpaceFormater();
    }

    /** Text without tabs passes through untouched. */
    @Test
    public void doNothingIfNoTab() {
        String formatted = tabFormatter.format("notab");
        assertEquals("notab", formatted);
    }

    /** One tab between words turns into one space. */
    @Test
    public void removeSingleTab() {
        String formatted = tabFormatter.format("single\ttab");
        assertEquals("single tab", formatted);
    }

    /** Two adjacent tabs turn into a single space, not two. */
    @Test
    public void removeMultipleTabs() {
        String formatted = tabFormatter.format("multiple\t\ttabs");
        assertEquals("multiple tabs", formatted);
    }
}
Loading