Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bibtexextractor #4985

Closed
wants to merge 8 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions AUTHORS
Original file line number Diff line number Diff line change
Expand Up @@ -210,3 +210,4 @@ Yang Zongze
Yara Grassi Gouffon
Yifan Peng
Zhang Liang
Nikita Borovikov
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ We refer to [GitHub issues](https://github.com/JabRef/jabref/issues) by using `#
- The Medline fetcher now normalizes the author names according to the BibTeX-Standard [#4345](https://github.com/JabRef/jabref/issues/4345)
- We added an option on the Linked File Viewer to rename the attached file of an entry directly on the JabRef. [#4844](https://github.com/JabRef/jabref/issues/4844)
- We added an option in the preference dialog box that allows user to enable helpful tooltips.[#3599](https://github.com/JabRef/jabref/issues/3599)
- We added a tool for extracting a BibTeX entry from plain text.
- We moved the dropdown menu for selecting the push-application from the toolbar into the external application preferences. [#674](https://github.com/JabRef/jabref/issues/674)
- We removed the alphabetical ordering of the custom tabs and updated the error message when trying to create a general field with a name containing an illegal character. [#5019](https://github.com/JabRef/jabref/issues/5019)
- We added a context menu to the bib(la)tex-source-editor to copy'n'paste. [#5007](https://github.com/JabRef/jabref/pull/5007)
Expand Down
2 changes: 2 additions & 0 deletions src/main/java/org/jabref/gui/JabRefFrame.java
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
import org.jabref.gui.actions.OldDatabaseCommandWrapper;
import org.jabref.gui.actions.SimpleCommand;
import org.jabref.gui.actions.StandardActions;
import org.jabref.gui.bibtexextractor.ExtractBibtexAction;
import org.jabref.gui.auximport.NewSubLibraryAction;
import org.jabref.gui.bibtexkeypattern.BibtexKeyPatternAction;
import org.jabref.gui.contentselector.ManageContentSelectorAction;
Expand Down Expand Up @@ -774,6 +775,7 @@ private MenuBar createMenu() {
factory.createMenuItem(StandardActions.FIND_UNLINKED_FILES, new FindUnlinkedFilesAction(this, stateManager)),
factory.createMenuItem(StandardActions.WRITE_XMP, new OldDatabaseCommandWrapper(Actions.WRITE_XMP, this, stateManager)),
factory.createMenuItem(StandardActions.COPY_LINKED_FILES, new CopyFilesAction(stateManager, this.getDialogService())),
factory.createMenuItem(StandardActions.EXTRACT_BIBTEX, new ExtractBibtexAction(this)),

new SeparatorMenuItem(),

Expand Down
1 change: 1 addition & 0 deletions src/main/java/org/jabref/gui/actions/StandardActions.java
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ public enum StandardActions implements Action {
DOWNLOAD_FULL_TEXT(Localization.lang("Search full text documents online"), IconTheme.JabRefIcons.FILE_SEARCH, KeyBinding.DOWNLOAD_FULL_TEXT),
CLEANUP_ENTRIES(Localization.lang("Cleanup entries"), IconTheme.JabRefIcons.CLEANUP_ENTRIES, KeyBinding.CLEANUP),
SET_FILE_LINKS(Localization.lang("Automatically set file links"), KeyBinding.AUTOMATICALLY_LINK_FILES),
EXTRACT_BIBTEX(Localization.lang("Extract BibTeX from plain text")),

HELP(Localization.lang("Online help"), IconTheme.JabRefIcons.HELP, KeyBinding.HELP),
HELP_KEY_PATTERNS(Localization.lang("Help on key patterns"), IconTheme.JabRefIcons.HELP, KeyBinding.HELP),
Expand Down
181 changes: 181 additions & 0 deletions src/main/java/org/jabref/gui/bibtexextractor/BibtexExtractor.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
package org.jabref.gui.bibtexextractor;

import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.BiblatexEntryType;
import org.jabref.model.entry.BiblatexEntryTypes;
import org.jabref.model.entry.FieldName;

import java.lang.reflect.Array;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Calendar;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Heuristic extractor that turns a plain-text citation into a {@link BibEntry}.
 * <p>
 * The input is processed in stages (URLs, authors, year, pages, title and journal/publisher). Each stage
 * stores what it recognised and replaces it in the text with a placeholder tag such as {@code [year_tag]}.
 * The text that is left over after all stages is stored in the entry's comment field.
 * <p>
 * Intermediate results are kept in instance fields; they are reset at the start of every
 * {@link #extract(String)} call so that an instance can be reused.
 */
public class BibtexExtractor {

    private static final String AUTHOR_TAG = "[author_tag]";
    private static final String URL_TAG = "[url_tag]";
    private static final String YEAR_TAG = "[year_tag]";
    private static final String PAGES_TAG = "[pages_tag]";
    private static final String TITLE_TAG = "[title_tag]";
    private static final String JOURNAL_TAG = "[journal_tag]";

    private static final String INITIALS_GROUP = "INITIALS";
    private static final String LASTNAME_GROUP = "LASTNAME";

    private static final Pattern URL_PATTERN = Pattern.compile(
            "(?:^|[\\W])((ht|f)tp(s?):\\/\\/|www\\.)" +
            "(([\\w\\-]+\\.)+?([\\w\\-.~]+\\/?)*" +
            "[\\p{Alnum}.,%_=?&#\\-+()\\[\\]\\*$~@!:/{};']*)",
            Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.DOTALL);

    private static final Pattern YEAR_PATTERN = Pattern.compile(
            "\\d{4}",
            Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.DOTALL);

    /** Matches "Lastname, I. I." style authors. */
    private static final Pattern AUTHOR_PATTERN_1 = Pattern.compile(
            "(?<" + LASTNAME_GROUP + ">\\p{Lu}\\w+),?\\s(?<" + INITIALS_GROUP + ">(\\p{Lu}\\.\\s){1,2})" +
            "\\s*(and|,|\\.)*",
            Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.DOTALL);

    /** Matches "I. I. Lastname" style authors. */
    private static final Pattern AUTHOR_PATTERN_2 = Pattern.compile(
            "(?<" + INITIALS_GROUP + ">(\\p{Lu}\\.\\s){1,2})(?<" + LASTNAME_GROUP + ">\\p{Lu}\\w+)" +
            "\\s*(and|,|\\.)*",
            Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.DOTALL);

    private static final Pattern PAGES_PATTERN = Pattern.compile(
            "(p.)?\\s?\\d+(-\\d+)?",
            Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.DOTALL);

    /** Used to detect the first remaining part that contains a digit. */
    private static final Pattern DIGIT_PATTERN = Pattern.compile("\\d");

    private final ArrayList<String> urls = new ArrayList<>();
    private final ArrayList<String> authors = new ArrayList<>();
    private String year = "";
    private String pages = "";
    private String title = "";
    private boolean isArticle = true;
    private String journalOrPublisher = "";

    /**
     * Extracts a BibTeX entry from the given plain text.
     *
     * @param input the plain-text reference; must not be {@code null}
     * @return an article or book entry whose author, url, year, pages, title and journal/publisher fields
     *         hold whatever was recognised (possibly empty strings), and whose comment field holds the
     *         remaining text with placeholder tags
     */
    public BibEntry extract(String input) {
        // Results live in fields, so clear anything left over from a previous call on this instance.
        resetState();
        String inputWithoutUrls = findUrls(input);
        String inputWithoutAuthors = findAuthors(inputWithoutUrls);
        String inputWithoutYear = findYear(inputWithoutAuthors);
        String inputWithoutPages = findPages(inputWithoutYear);
        String nonparsed = findParts(inputWithoutPages);
        return generateEntity(nonparsed);
    }

    /** Restores all intermediate results to their initial values. */
    private void resetState() {
        urls.clear();
        authors.clear();
        year = "";
        pages = "";
        title = "";
        journalOrPublisher = "";
        isArticle = true;
    }

    /** Builds the entry from the collected fields; {@code input} becomes the comment field. */
    private BibEntry generateEntity(String input) {
        BiblatexEntryType type = isArticle ? BiblatexEntryTypes.ARTICLE : BiblatexEntryTypes.BOOK;
        BibEntry extractedEntity = new BibEntry(type);
        extractedEntity.setField(FieldName.AUTHOR, String.join(" and ", authors));
        extractedEntity.setField(FieldName.URL, String.join(", ", urls));
        extractedEntity.setField(FieldName.YEAR, year);
        extractedEntity.setField(FieldName.PAGES, pages);
        extractedEntity.setField(FieldName.TITLE, title);
        if (isArticle) {
            extractedEntity.setField(FieldName.JOURNAL, journalOrPublisher);
        } else {
            extractedEntity.setField(FieldName.PUBLISHER, journalOrPublisher);
        }
        extractedEntity.setField(FieldName.COMMENT, input);
        return extractedEntity;
    }

    /** Collects all URLs and returns the text with each URL match replaced by the URL tag. */
    private String findUrls(String input) {
        Matcher matcher = URL_PATTERN.matcher(input);
        while (matcher.find()) {
            // Group 1 starts at the scheme / "www." prefix, i.e. after the non-word character
            // that the pattern may consume in front of the URL. This group always participates.
            urls.add(input.substring(matcher.start(1), matcher.end()));
        }
        return fixSpaces(matcher.replaceAll(URL_TAG));
    }

    /**
     * Takes the first four-digit number greater than 1700 and not after the current calendar year
     * as the year, and replaces its occurrences in the text with the year tag.
     */
    private String findYear(String input) {
        int currentYear = Calendar.getInstance().get(Calendar.YEAR);
        Matcher matcher = YEAR_PATTERN.matcher(input);
        while (matcher.find()) {
            String yearCandidate = matcher.group();
            int intYearCandidate = Integer.parseInt(yearCandidate);
            if (intYearCandidate > 1700 && intYearCandidate <= currentYear) {
                year = yearCandidate;
                return fixSpaces(input.replace(year, YEAR_TAG));
            }
        }
        return input;
    }

    /** Runs both author patterns in sequence over the text. */
    private String findAuthors(String input) {
        String currentInput = findAuthorsByPattern(input, AUTHOR_PATTERN_1);
        return findAuthorsByPattern(currentInput, AUTHOR_PATTERN_2);
    }

    /** Collects every author matched by {@code pattern} and replaces the matches with the author tag. */
    private String findAuthorsByPattern(String input, Pattern pattern) {
        Matcher matcher = pattern.matcher(input);
        while (matcher.find()) {
            authors.add(generateAuthor(matcher.group(LASTNAME_GROUP), matcher.group(INITIALS_GROUP)));
        }
        return fixSpaces(matcher.replaceAll(AUTHOR_TAG));
    }

    /** Formats an author as "Lastname, I. I."; the initials group carries trailing whitespace, so trim it. */
    private String generateAuthor(String lastName, String initials) {
        return lastName + ", " + initials.trim();
    }

    /** Stores the first page (range) match and replaces it in the text with the pages tag. */
    private String findPages(String input) {
        Matcher matcher = PAGES_PATTERN.matcher(input);
        if (!matcher.find()) {
            return fixSpaces(input);
        }
        // Use the whole match: the "p." prefix group is optional, so Matcher.start(1) is -1 when it
        // is absent and must not be used as a substring index.
        pages = matcher.group().trim();
        return fixSpaces(matcher.replaceFirst(PAGES_TAG));
    }

    /**
     * Normalises spacing: adds a space after punctuation and before titlecase letters,
     * then collapses whitespace runs to a single space and trims.
     */
    private String fixSpaces(String input) {
        // NOTE(review): \p{Lt} is the Unicode "titlecase letter" category, not "uppercase" (\p{Lu}) -
        // confirm this is the intended class.
        return input.replaceAll("[,.!?;:]", "$0 ")
                    .replaceAll("\\p{Lt}", " $0")
                    .replaceAll("\\s+", " ").trim();
    }

    /**
     * Derives title and journal/publisher from the text following the last author tag and replaces them
     * in the returned text with their tags. The text is returned unchanged if no author tag is present.
     * More than two leading digit-free parts mark the entry as a book instead of an article.
     */
    private String findParts(String input) {
        int afterAuthorsIndex = input.lastIndexOf(AUTHOR_TAG);
        if (afterAuthorsIndex == -1) {
            return input;
        }
        afterAuthorsIndex += AUTHOR_TAG.length();

        ArrayList<String> lastParts = new ArrayList<>();
        int delimiterIndex = input.lastIndexOf("//");
        // Only honour a "//" delimiter that lies behind the authors; otherwise the substring
        // below would be called with start > end.
        if (delimiterIndex >= afterAuthorsIndex) {
            lastParts.add(input.substring(afterAuthorsIndex, delimiterIndex)
                               .replace(YEAR_TAG, "")
                               .replace(PAGES_TAG, ""));
            lastParts.addAll(Arrays.asList(input.substring(delimiterIndex + 2).split(",|\\.")));
        } else {
            lastParts.addAll(Arrays.asList(input.substring(afterAuthorsIndex).split(",|\\.")));
        }

        // Count the leading parts that contain no digit.
        int nonDigitParts = 0;
        for (String part : lastParts) {
            if (DIGIT_PATTERN.matcher(part).find()) {
                break;
            }
            nonDigitParts++;
        }

        // Locate both parts in the untouched text first, so that inserting one tag
        // cannot shift or pollute the search for the other.
        int titleAt = -1;
        int journalFrom = afterAuthorsIndex;
        if (nonDigitParts > 0) {
            title = lastParts.get(0).trim();
            titleAt = indexOfNonEmpty(input, title, afterAuthorsIndex);
            if (titleAt >= 0) {
                journalFrom = titleAt + title.length();
            }
        }
        int journalAt = -1;
        if (nonDigitParts > 1) {
            journalOrPublisher = lastParts.get(1).trim();
            journalAt = indexOfNonEmpty(input, journalOrPublisher, journalFrom);
        }
        if (nonDigitParts > 2) {
            isArticle = false;
        }

        // Strings are immutable, so the result has to be reassigned. Splice right-to-left
        // (journal lies behind the title) to keep the title index valid.
        String line = input;
        if (journalAt >= 0) {
            line = splice(line, journalAt, journalOrPublisher.length(), JOURNAL_TAG);
        }
        if (titleAt >= 0) {
            line = splice(line, titleAt, title.length(), TITLE_TAG);
        }
        return fixSpaces(line);
    }

    /** Like {@link String#indexOf(String, int)}, but reports -1 for an empty search string. */
    private static int indexOfNonEmpty(String text, String target, int fromIndex) {
        return target.isEmpty() ? -1 : text.indexOf(target, fromIndex);
    }

    /** Replaces {@code length} characters of {@code text} starting at {@code at} with {@code replacement}. */
    private static String splice(String text, int at, int length, String replacement) {
        return text.substring(0, at) + replacement + text.substring(at + length);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
package org.jabref.gui.bibtexextractor;

import org.jabref.gui.JabRefFrame;
import org.jabref.gui.actions.SimpleCommand;

/**
 * Command that opens the {@link ExtractBibtexDialog} and blocks until the dialog is closed.
 */
public class ExtractBibtexAction extends SimpleCommand {

    /** Frame that is handed on to the dialog. */
    private final JabRefFrame jabRefFrame;

    /**
     * @param jabRefFrame the frame passed to the dialog created in {@link #execute()}
     */
    public ExtractBibtexAction(JabRefFrame jabRefFrame) {
        this.jabRefFrame = jabRefFrame;
    }

    /** Creates a fresh dialog for every invocation and shows it modally. */
    @Override
    public void execute() {
        new ExtractBibtexDialog(jabRefFrame).showAndWait();
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
package org.jabref.gui.bibtexextractor;

import javafx.scene.control.*;
import javafx.scene.layout.VBox;
import org.jabref.Globals;
import org.jabref.gui.JabRefFrame;
import org.jabref.gui.util.BaseDialog;
import org.jabref.logic.l10n.Localization;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.BiblatexEntryTypes;
import org.jabref.model.entry.EntryType;

import java.util.HashMap;
import java.util.Map;

/**
* GUI Dialog for the feature "Extract BibTeX from plain text".
*/
public class ExtractBibtexDialog extends BaseDialog<Void> {

private final JabRefFrame frame;
private TextArea textArea;
private Button buttonExtract;

public ExtractBibtexDialog(JabRefFrame frame) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't need the JabRefFrame here.
Best is to use the StateManager and then get the active Database and call insertEntries there

super();
this.setTitle(Localization.lang("Input text to parse"));
this.frame = frame;

initialize();
}

/**
 * Builds the dialog content: a wrapping text area for the input and an "Extract"/"Cancel" button pair.
 * The extract button starts out disabled and is enabled whenever the text area is non-empty.
 */
private void initialize() {
    // Input area; the listener only fires on later text changes, by which time buttonExtract is set.
    textArea = new TextArea();
    textArea.setWrapText(true);
    textArea.textProperty().addListener((property, previousText, currentText) -> {
        boolean nothingToParse = currentText.isEmpty();
        buttonExtract.setDisable(nothingToParse);
    });

    VBox content = new VBox(20);
    content.getChildren().add(textArea);
    content.setPrefWidth(600);
    getDialogPane().setContent(content);

    // Buttons
    ButtonType extractButtonType = new ButtonType(Localization.lang("Extract"), ButtonBar.ButtonData.OK_DONE);
    getDialogPane().getButtonTypes().setAll(extractButtonType, ButtonType.CANCEL);

    buttonExtract = (Button) getDialogPane().lookupButton(extractButtonType);
    Tooltip extractHint = new Tooltip(Localization.lang("Starts the extraction of the BibTeX entry"));
    buttonExtract.setTooltip(extractHint);
    buttonExtract.setDisable(true);
    buttonExtract.setOnAction(event -> startExtraction());
}

/**
 * Runs the extractor on the current text of the text area and inserts the resulting entry
 * into the currently shown library. Does nothing when there is no current library panel.
 */
private void startExtraction() {
    // NOTE(review): assumes getCurrentBasePanel() returns null when no library is open - confirm.
    // The menu action is not disabled in that case, so guard against the NullPointerException.
    if (frame.getCurrentBasePanel() == null) {
        return;
    }
    BibtexExtractor extractor = new BibtexExtractor();
    BibEntry entity = extractor.extract(textArea.getText());
    // NOTE(review): telemetry always reports ARTICLE, although the extractor may also
    // produce a BOOK entry - confirm whether the real type should be tracked instead.
    trackNewEntry(BiblatexEntryTypes.ARTICLE);
    frame.getCurrentBasePanel().insertEntry(entity);
}

/**
 * Reports a "NewEntry" telemetry event carrying the name of the given entry type,
 * provided a telemetry client is available.
 *
 * @param type the entry type whose name is sent as the "EntryType" property
 */
private void trackNewEntry(EntryType type) {
    Map<String, String> eventProperties = new HashMap<>();
    eventProperties.put("EntryType", type.getName());

    Globals.getTelemetryClient().ifPresent(telemetry -> {
        telemetry.trackEvent("NewEntry", eventProperties, new HashMap<>());
    });
}
}
6 changes: 6 additions & 0 deletions src/main/resources/l10n/JabRef_en.properties
Original file line number Diff line number Diff line change
Expand Up @@ -2074,6 +2074,12 @@ Accept\ changes=Accept changes
Dismiss\ changes=Dismiss changes
The\ library\ has\ been\ modified\ by\ another\ program.=The library has been modified by another program.

Extract=Extract
Extract\ BibTeX\ from\ plain\ text=Extract BibTeX from plain text
Input\ text\ to\ parse=Input text to parse
Starts\ the\ extraction\ of\ the\ BibTeX\ entry=Starts the extraction of the BibTeX entry

Browser=Browser
Execute\ command=Execute command
Open\ File\ Browser=Open File Browser
Use\ default\ file\ browser=Use default file browser
Expand Down