Skip to content

Commit

Permalink
Merge pull request #5206 from JabRef/bibtexextractor
Browse files Browse the repository at this point in the history
Bibtexextractor
  • Loading branch information
Siedlerchr committed Aug 24, 2019
2 parents dd0f304 + 74a3ac5 commit ad3e811
Show file tree
Hide file tree
Showing 10 changed files with 305 additions and 0 deletions.
1 change: 1 addition & 0 deletions AUTHORS
Original file line number Diff line number Diff line change
Expand Up @@ -210,3 +210,4 @@ Yang Zongze
Yara Grassi Gouffon
Yifan Peng
Zhang Liang
Nikita Borovikov
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ We refer to [GitHub issues](https://github.com/JabRef/jabref/issues) by using `#
- The Medline fetcher now normalizes the author names according to the BibTeX-Standard [#4345](https://github.com/JabRef/jabref/issues/4345)
- We added an option in the Linked File Viewer to rename the attached file of an entry directly in JabRef. [#4844](https://github.com/JabRef/jabref/issues/4844)
- We added an option in the preference dialog box that allows users to enable helpful tooltips. [#3599](https://github.com/JabRef/jabref/issues/3599)
- We added a tool for extracting BibTeX entries from plain text. [#5206](https://github.com/JabRef/jabref/pull/5206)
- We moved the dropdown menu for selecting the push-application from the toolbar into the external application preferences. [#674](https://github.com/JabRef/jabref/issues/674)
- We removed the alphabetical ordering of the custom tabs and updated the error message when trying to create a general field with a name containing an illegal character. [#5019](https://github.com/JabRef/jabref/issues/5019)
- We added a context menu to the bib(la)tex-source-editor to copy'n'paste. [#5007](https://github.com/JabRef/jabref/pull/5007)
Expand Down
2 changes: 2 additions & 0 deletions src/main/java/org/jabref/gui/JabRefFrame.java
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
import org.jabref.gui.actions.SimpleCommand;
import org.jabref.gui.actions.StandardActions;
import org.jabref.gui.auximport.NewSubLibraryAction;
import org.jabref.gui.bibtexextractor.ExtractBibtexAction;
import org.jabref.gui.bibtexkeypattern.BibtexKeyPatternAction;
import org.jabref.gui.contentselector.ManageContentSelectorAction;
import org.jabref.gui.copyfiles.CopyFilesAction;
Expand Down Expand Up @@ -772,6 +773,7 @@ private MenuBar createMenu() {
factory.createMenuItem(StandardActions.FIND_UNLINKED_FILES, new FindUnlinkedFilesAction(this, stateManager)),
factory.createMenuItem(StandardActions.WRITE_XMP, new OldDatabaseCommandWrapper(Actions.WRITE_XMP, this, stateManager)),
factory.createMenuItem(StandardActions.COPY_LINKED_FILES, new CopyFilesAction(stateManager, this.getDialogService())),
factory.createMenuItem(StandardActions.EXTRACT_BIBTEX, new ExtractBibtexAction(stateManager)),

new SeparatorMenuItem(),

Expand Down
1 change: 1 addition & 0 deletions src/main/java/org/jabref/gui/actions/StandardActions.java
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ public enum StandardActions implements Action {
DOWNLOAD_FULL_TEXT(Localization.lang("Search full text documents online"), IconTheme.JabRefIcons.FILE_SEARCH, KeyBinding.DOWNLOAD_FULL_TEXT),
CLEANUP_ENTRIES(Localization.lang("Cleanup entries"), IconTheme.JabRefIcons.CLEANUP_ENTRIES, KeyBinding.CLEANUP),
SET_FILE_LINKS(Localization.lang("Automatically set file links"), KeyBinding.AUTOMATICALLY_LINK_FILES),
EXTRACT_BIBTEX(Localization.lang("Extract BibTeX from plain text")),

HELP(Localization.lang("Online help"), IconTheme.JabRefIcons.HELP, KeyBinding.HELP),
HELP_KEY_PATTERNS(Localization.lang("Help on key patterns"), IconTheme.JabRefIcons.HELP, KeyBinding.HELP),
Expand Down
170 changes: 170 additions & 0 deletions src/main/java/org/jabref/gui/bibtexextractor/BibtexExtractor.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
package org.jabref.gui.bibtexextractor;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Calendar;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.field.StandardField;
import org.jabref.model.entry.types.EntryType;
import org.jabref.model.entry.types.StandardEntryType;

public class BibtexExtractor {

// Placeholder tokens that are written into the working text in place of each
// recognized part (authors, URLs, year, pages) once it has been extracted.
private static final String AUTHOR_TAG = "[author_tag]";
private static final String URL_TAG = "[url_tag]";
private static final String YEAR_TAG = "[year_tag]";
private static final String PAGES_TAG = "[pages_tag]";

// Names of the named capturing groups shared by both author patterns.
private static final String INITIALS_GROUP = "INITIALS";
private static final String LASTNAME_GROUP = "LASTNAME";

// A URL introduced by http://, https://, ftp://, ftps:// or "www.", preceded by
// the start of input/line or a single non-word character (which is part of the match).
// Group 1 is the scheme / "www." prefix; findUrls uses its start as the URL start.
private static final Pattern URL_PATTERN = Pattern.compile(
"(?:^|[\\W])((ht|f)tp(s?):\\/\\/|www\\.)" +
"(([\\w\\-]+\\.)+?([\\w\\-.~]+\\/?)*" +
"[\\p{Alnum}.,%_=?&#\\-+()\\[\\]\\*$~@!:/{};']*)",
Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.DOTALL);

// Any four consecutive digits; the plausibility check (range of years) is done in findYear.
private static final Pattern YEAR_PATTERN = Pattern.compile(
"\\d{4}",
Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.DOTALL);

// Author written as "Lastname, I. J. ": last name, optional comma, one whitespace,
// then one or two initials (each followed by a dot and a whitespace character),
// followed by optional separators ("and", "," or ".").
private static final Pattern AUTHOR_PATTERN = Pattern.compile(
"(?<" + LASTNAME_GROUP + ">\\p{Lu}\\w+),?\\s(?<" + INITIALS_GROUP + ">(\\p{Lu}\\.\\s){1,2})" +
"\\s*(and|,|\\.)*",
Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.DOTALL);

// Author written as "I. J. Lastname": same building blocks as AUTHOR_PATTERN,
// but with the initials in front of the last name.
private static final Pattern AUTHOR_PATTERN_2 = Pattern.compile(
"(?<" + INITIALS_GROUP + ">(\\p{Lu}\\.\\s){1,2})(?<" + LASTNAME_GROUP + ">\\p{Lu}\\w+)" +
"\\s*(and|,|\\.)*",
Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.DOTALL);

// A number or a number range ("12" or "12-34"), optionally preceded by "p" plus one
// further character and an optional whitespace.
// NOTE(review): the '.' in "(p.)" is not escaped, so it matches any character after
// the 'p' (e.g. "pp"), not only a literal dot - confirm whether this is intended.
private static final Pattern PAGES_PATTERN = Pattern.compile(
"(p.)?\\s?\\d+(-\\d+)?",
Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.DOTALL);

// Extraction results; filled by the find* methods and read by generateEntity.
private final List<String> urls = new ArrayList<>();
private final List<String> authors = new ArrayList<>();
private String year = "";
private String pages = "";
private String title = "";
// true -> the entry is created as an article, false -> as a book (see generateEntity)
private boolean isArticle = true;
// Stored as the journal for articles and as the publisher for books.
private String journalOrPublisher = "";

/**
 * Extracts a BibTeX entry from a plain-text reference.
 * <p>
 * The text is processed in stages (URLs, authors, year, pages, title/journal); each stage
 * stores what it recognized and replaces it in the working text by a placeholder tag.
 * The text that remains after all stages is stored in the comment field of the entry.
 *
 * @param input the plain-text reference to parse; must not be {@code null}
 * @return a new entry populated with the recognized fields
 */
public BibEntry extract(String input) {
    // The intermediate results live in instance fields. Reset them first so that a second
    // call on the same instance does not inherit authors, URLs etc. from the previous call.
    urls.clear();
    authors.clear();
    year = "";
    pages = "";
    title = "";
    isArticle = true;
    journalOrPublisher = "";

    String inputWithoutUrls = findUrls(input);
    String inputWithoutAuthors = findAuthors(inputWithoutUrls);
    String inputWithoutYear = findYear(inputWithoutAuthors);
    String inputWithoutPages = findPages(inputWithoutYear);
    String nonParsed = findParts(inputWithoutPages);
    return generateEntity(nonParsed);
}

/**
 * Builds the resulting entry from the values collected by the find* methods.
 *
 * @param input text that is stored verbatim in the comment field of the entry
 * @return an article entry if {@code isArticle} is set, otherwise a book entry
 */
private BibEntry generateEntity(String input) {
    final EntryType entryType;
    if (isArticle) {
        entryType = StandardEntryType.Article;
    } else {
        entryType = StandardEntryType.Book;
    }

    BibEntry result = new BibEntry(entryType);

    String joinedAuthors = String.join(" and ", authors);
    result.setField(StandardField.AUTHOR, joinedAuthors);

    String joinedUrls = String.join(", ", urls);
    result.setField(StandardField.URL, joinedUrls);

    result.setField(StandardField.YEAR, year);
    result.setField(StandardField.PAGES, pages);
    result.setField(StandardField.TITLE, title);

    // The same value is the journal of an article or the publisher of a book.
    result.setField(isArticle ? StandardField.JOURNAL : StandardField.PUBLISHER, journalOrPublisher);

    result.setField(StandardField.COMMENT, input);
    return result;
}

/**
 * Collects every URL found in the text and replaces each match by the URL placeholder.
 *
 * @param input the text to scan
 * @return the text with all URL matches replaced by the placeholder, with spacing normalized
 */
private String findUrls(String input) {
    Matcher urlMatcher = URL_PATTERN.matcher(input);
    while (urlMatcher.find()) {
        // The match may begin with one leading non-word character; group 1 marks where
        // the URL itself starts, and the URL runs to the end of the whole match.
        int urlStart = urlMatcher.start(1);
        int urlEnd = urlMatcher.end();
        urls.add(input.substring(urlStart, urlEnd));
    }
    String tagged = urlMatcher.replaceAll(URL_TAG);
    return fixSpaces(tagged);
}

/**
 * Finds the first four-digit number that is a plausible publication year (after 1700 and not
 * in the future), stores it and replaces that occurrence by the year placeholder.
 *
 * @param input the text to scan
 * @return the text with the matched year replaced by the placeholder and spacing normalized,
 *         or the unchanged text if no plausible year was found
 */
private String findYear(String input) {
    // Loop-invariant, so determine it once instead of once per candidate.
    int currentYear = Calendar.getInstance().get(Calendar.YEAR);

    Matcher matcher = YEAR_PATTERN.matcher(input);
    while (matcher.find()) {
        String yearCandidate = matcher.group();
        int intYearCandidate = Integer.parseInt(yearCandidate);
        if ((intYearCandidate > 1700) && (intYearCandidate <= currentYear)) {
            year = yearCandidate;
            // Replace only the matched occurrence. Replacing the digits everywhere in the text
            // would also rewrite unrelated numbers that merely contain the same digits
            // (e.g. a page range or an identifier).
            String tagged = input.substring(0, matcher.start()) + YEAR_TAG + input.substring(matcher.end());
            return fixSpaces(tagged);
        }
    }
    return input;
}

/**
 * Extracts authors in both supported notations: first "Lastname, I." and then, on the
 * already tagged text, "I. Lastname".
 *
 * @param input the text to scan
 * @return the text with every recognized author replaced by the author placeholder
 */
private String findAuthors(String input) {
    return findAuthorsByPattern(findAuthorsByPattern(input, AUTHOR_PATTERN), AUTHOR_PATTERN_2);
}

/**
 * Collects all authors matched by the given pattern and replaces every match by the
 * author placeholder.
 *
 * @param input   the text to scan
 * @param pattern an author pattern that defines the last-name and initials named groups
 * @return the text with all matches replaced by the placeholder, with spacing normalized
 */
private String findAuthorsByPattern(String input, Pattern pattern) {
    Matcher matcher = pattern.matcher(input);
    while (matcher.find()) {
        authors.add(generateAuthor(matcher.group(LASTNAME_GROUP), matcher.group(INITIALS_GROUP)));
    }
    return fixSpaces(matcher.replaceAll(AUTHOR_TAG));
}

/**
 * Formats an author as "Lastname, Initials".
 *
 * @param lastName the matched last name
 * @param initials the matched initials; may carry surrounding whitespace
 * @return the formatted author without leading or trailing whitespace in the initials
 */
private String generateAuthor(String lastName, String initials) {
    // The initials group of the author patterns always ends with a whitespace character;
    // without trimming, the stored author ends with a space and the joined author list
    // contains doubled spaces.
    return lastName + ", " + initials.trim();
}

/**
 * Stores the first match of the pages pattern and replaces it by the pages placeholder.
 *
 * @param input the text to scan
 * @return the text with the first match replaced by the placeholder (if any), with spacing
 *         normalized
 */
private String findPages(String input) {
    Matcher pagesMatcher = PAGES_PATTERN.matcher(input);
    boolean found = pagesMatcher.find();
    if (found) {
        int from = pagesMatcher.start();
        int to = pagesMatcher.end();
        pages = input.substring(from, to);
    }
    // replaceFirst resets the matcher itself, so the find() above does not affect it.
    String tagged = pagesMatcher.replaceFirst(PAGES_TAG);
    return fixSpaces(tagged);
}

/**
 * Normalizes the spacing of the working text.
 *
 * @param input the text to normalize
 * @return the text with a space after each punctuation mark, a space in front of each
 *         Unicode titlecase letter, whitespace runs collapsed to one space, and trimmed
 */
private String fixSpaces(String input) {
    // 1. make sure every punctuation mark is followed by a space
    String afterPunctuation = input.replaceAll("[,.!?;:]", "$0 ");
    // 2. put a space in front of every titlecase letter (Unicode category Lt)
    String beforeTitlecase = afterPunctuation.replaceAll("\\p{Lt}", " $0");
    // 3. collapse the whitespace runs created above (and any others) to a single space
    String collapsed = beforeTitlecase.replaceAll("\\s+", " ");
    return collapsed.trim();
}

/**
 * Derives title, journal/publisher and the entry kind from the text that follows the last
 * author placeholder.
 * <p>
 * If the text after the authors contains "//", everything up to the last "//" is treated as
 * one part and the remainder is split at commas and dots; otherwise the whole remainder is
 * split at commas and dots. The leading parts that contain no digit are then interpreted:
 * the first one is the title, the second one the journal or publisher, and more than two
 * such parts mark the entry as a book instead of an article.
 *
 * @param input the text in which URLs, authors, year and pages were already replaced by
 *              placeholders
 * @return the unchanged text if it contains no author placeholder, otherwise the text with
 *         spacing normalized
 */
private String findParts(String input) {
    int afterAuthorsIndex = input.lastIndexOf(AUTHOR_TAG);
    if (afterAuthorsIndex == -1) {
        return input;
    }
    afterAuthorsIndex += AUTHOR_TAG.length();

    List<String> lastParts = new ArrayList<>();
    int delimiterIndex = input.lastIndexOf("//");
    // Only a "//" located after the authors separates title and source. A "//" in front of
    // the last author placeholder must be ignored; using it as the end of the substring
    // below would throw a StringIndexOutOfBoundsException.
    if (delimiterIndex >= afterAuthorsIndex) {
        lastParts.add(input.substring(afterAuthorsIndex, delimiterIndex)
                           .replace(YEAR_TAG, "")
                           .replace(PAGES_TAG, ""));
        lastParts.addAll(Arrays.asList(input.substring(delimiterIndex + 2).split(",|\\.")));
    } else {
        lastParts.addAll(Arrays.asList(input.substring(afterAuthorsIndex).split(",|\\.")));
    }

    // Count the leading parts without any digit; the first part containing a digit ends
    // the title/journal/publisher section.
    int nonDigitParts = 0;
    for (String part : lastParts) {
        if (part.matches(".*\\d.*")) {
            break;
        }
        nonDigitParts++;
    }

    // The parts are cut out between separators and therefore usually carry surrounding
    // blanks, which do not belong into the field values.
    if (nonDigitParts > 0) {
        title = lastParts.get(0).trim();
    }
    if (nonDigitParts > 1) {
        journalOrPublisher = lastParts.get(1).trim();
    }
    if (nonDigitParts > 2) {
        isArticle = false;
    }
    return fixSpaces(input);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
package org.jabref.gui.bibtexextractor;

import java.util.HashMap;
import java.util.Map;

import javafx.beans.property.SimpleStringProperty;
import javafx.beans.property.StringProperty;

import org.jabref.Globals;
import org.jabref.model.database.BibDatabaseContext;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.types.EntryType;
import org.jabref.model.entry.types.StandardEntryType;

/**
 * View model of the "Extract BibTeX from plain text" dialog: holds the text entered by the
 * user and inserts the entry extracted from it into the given database.
 */
public class BibtexExtractorViewModel {

    private final StringProperty inputTextProperty = new SimpleStringProperty("");
    private final BibDatabaseContext bibdatabaseContext;

    /**
     * @param bibdatabaseContext the database context that receives the extracted entries
     */
    public BibtexExtractorViewModel(BibDatabaseContext bibdatabaseContext) {
        this.bibdatabaseContext = bibdatabaseContext;
    }

    /**
     * @return the property holding the plain text to parse
     */
    public StringProperty inputTextProperty() {
        return this.inputTextProperty;
    }

    /**
     * Extracts an entry from the current input text, inserts it into the database and reports
     * a "NewEntry" telemetry event.
     * <p>
     * The event carries the type of the entry that was actually created. It used to report
     * {@link StandardEntryType#Article} unconditionally, which was wrong whenever the extractor
     * produced a different type such as {@link StandardEntryType#Book}.
     */
    public void startExtraction() {
        BibtexExtractor extractor = new BibtexExtractor();
        BibEntry entity = extractor.extract(inputTextProperty.getValue());
        this.bibdatabaseContext.getDatabase().insertEntry(entity);
        trackNewEntry(entity.getType());
    }

    /**
     * Sends a "NewEntry" telemetry event for the given entry type, if a telemetry client is
     * present.
     *
     * @param type the type of the newly created entry
     */
    private void trackNewEntry(EntryType type) {
        Map<String, String> properties = new HashMap<>();
        properties.put("EntryType", type.getName());

        Globals.getTelemetryClient().ifPresent(client -> client.trackEvent("NewEntry", properties, new HashMap<>()));
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
package org.jabref.gui.bibtexextractor;

import org.jabref.gui.StateManager;
import org.jabref.gui.actions.SimpleCommand;

import static org.jabref.gui.actions.ActionHelper.needsDatabase;

/**
 * Command that opens the {@link ExtractBibtexDialog}.
 */
public class ExtractBibtexAction extends SimpleCommand {

    /**
     * Binds the executable state of this command to the "needs database" condition of the
     * given state manager.
     *
     * @param stateManager the state manager the condition is derived from
     */
    public ExtractBibtexAction(StateManager stateManager) {
        this.executable.bind(needsDatabase(stateManager));
    }

    /**
     * Creates the extraction dialog, shows it and waits until it is closed.
     */
    @Override
    public void execute() {
        ExtractBibtexDialog extractionDialog = new ExtractBibtexDialog();
        extractionDialog.showAndWait();
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
<?xml version="1.0" encoding="UTF-8"?>

<?import javafx.scene.control.ButtonType?>
<?import javafx.scene.control.DialogPane?>
<?import javafx.scene.control.TextArea?>

<!--
  Layout of the "Extract BibTeX from plain text" dialog (controller: ExtractBibtexDialog).
  The content is a single text area (fx:id "input") for the text to parse. The pane offers
  two buttons: an "Extract" button (fx:id "extractButtonType", button data OK_DONE, label
  taken from the resource key "Extract") and the standard CANCEL button.
-->
<DialogPane prefHeight="430.0" prefWidth="586.0" xmlns="http://javafx.com/javafx/8.0.171"
xmlns:fx="http://javafx.com/fxml/1" fx:controller="org.jabref.gui.bibtexextractor.ExtractBibtexDialog">
<content>
<TextArea fx:id="input" minHeight="-Infinity" prefHeight="350.0" prefWidth="586.0"/>
</content>
<ButtonType fx:id="extractButtonType" buttonData="OK_DONE" text="%Extract"/>
<ButtonType fx:constant="CANCEL"/>
</DialogPane>
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
package org.jabref.gui.bibtexextractor;

import javax.inject.Inject;

import javafx.fxml.FXML;
import javafx.scene.control.Button;
import javafx.scene.control.ButtonType;
import javafx.scene.control.TextArea;
import javafx.scene.control.Tooltip;

import org.jabref.gui.StateManager;
import org.jabref.gui.util.BaseDialog;
import org.jabref.logic.l10n.Localization;
import org.jabref.model.database.BibDatabaseContext;

import com.airhacks.afterburner.views.ViewLoader;

/**
 * Dialog of the feature "Extract BibTeX from plain text": the user enters a plain-text
 * reference and starts the extraction with the "Extract" button.
 */
public class ExtractBibtexDialog extends BaseDialog<Void> {

    private final Button buttonExtract;
    @FXML private TextArea input;
    @FXML private ButtonType extractButtonType;
    private BibtexExtractorViewModel viewModel;

    @Inject private StateManager stateManager;

    /**
     * Loads the view, sets the dialog title and wires the "Extract" button to the view model.
     */
    public ExtractBibtexDialog() {
        // NOTE(review): the code below relies on the FXML fields and viewModel being set once
        // the view is loaded, i.e. on initialize() having run during load() - confirm.
        ViewLoader.view(this)
                  .load()
                  .setAsDialogPane(this);

        setTitle(Localization.lang("Input text to parse"));

        buttonExtract = (Button) getDialogPane().lookupButton(extractButtonType);

        Tooltip extractTooltip = new Tooltip(Localization.lang("Starts the extraction of the BibTeX entry"));
        buttonExtract.setTooltip(extractTooltip);
        buttonExtract.setOnAction(event -> viewModel.startExtraction());

        // The button stays disabled for as long as the input text is empty.
        buttonExtract.disableProperty().bind(viewModel.inputTextProperty().isEmpty());
    }

    /**
     * Creates the view model for the active database and binds the text area to its input
     * property.
     *
     * @throws NullPointerException if there is no active database
     */
    @FXML
    private void initialize() {
        BibDatabaseContext activeDatabase = stateManager.getActiveDatabase()
                                                        .orElseThrow(() -> new NullPointerException("Database null"));
        viewModel = new BibtexExtractorViewModel(activeDatabase);

        input.textProperty().bindBidirectional(viewModel.inputTextProperty());
    }
}
5 changes: 5 additions & 0 deletions src/main/resources/l10n/JabRef_en.properties
Original file line number Diff line number Diff line change
Expand Up @@ -2067,6 +2067,11 @@ Accept\ changes=Accept changes
Dismiss\ changes=Dismiss changes
The\ library\ has\ been\ modified\ by\ another\ program.=The library has been modified by another program.
Extract=Extract
Extract\ BibTeX\ from\ plain\ text=Extract BibTeX from plain text
Input\ text\ to\ parse=Input text to parse
Starts\ the\ extraction\ of\ the\ BibTeX\ entry=Starts the extraction of the BibTeX entry
Execute\ command=Execute command
Open\ File\ Browser=Open File Browser
Use\ default\ file\ browser=Use default file browser
Expand Down

0 comments on commit ad3e811

Please sign in to comment.