Skip to content

Commit

Permalink
Try JCEF
Browse files Browse the repository at this point in the history
  • Loading branch information
koppor committed Sep 7, 2024
1 parent 9e64e0c commit b256ba7
Show file tree
Hide file tree
Showing 4 changed files with 59 additions and 24 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,11 @@ src/main/gen/
src/main/generated/
src-gen/


.lycheecache

jcef-bundle/

javafx/javafx-sdk-*
javafx/javafx-jmods-*
javafx/javafx.html
Expand Down
2 changes: 1 addition & 1 deletion build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,7 @@ dependencies {
implementation 'org.controlsfx:controlsfx:11.2.1'

// region HTTP clients
implementation 'org.htmlunit:htmlunit:4.4.0' // used for web scraping
implementation 'me.friwi:jcefmaven:126.2.0' // used for web scraping
implementation 'org.jsoup:jsoup:1.18.1'
implementation 'com.konghq:unirest-java-core:4.4.4'
implementation 'com.konghq:unirest-modules-gson:4.4.4'
Expand Down
3 changes: 2 additions & 1 deletion src/main/java/module-info.java
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@
requires org.glassfish.hk2.api;

// region: http clients
requires htmlunit;
requires jcefmaven;
requires org.apache.httpcomponents.core5.httpcore5;
requires org.jsoup;
requires unirest.java.core;
Expand Down Expand Up @@ -184,5 +184,6 @@
requires mslinks;
requires org.antlr.antlr4.runtime;
requires org.libreoffice.uno;
requires jcef;
// endregion
}
75 changes: 53 additions & 22 deletions src/main/java/org/jabref/logic/importer/fetcher/ACS.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,21 @@
import org.jabref.model.entry.field.StandardField;
import org.jabref.model.entry.identifier.DOI;

import org.htmlunit.BrowserVersion;
import org.htmlunit.WebClient;
import org.htmlunit.html.HtmlPage;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import me.friwi.jcefmaven.CefAppBuilder;
import me.friwi.jcefmaven.MavenCefAppHandlerAdapter;
import org.cef.CefApp;
import org.cef.CefClient;
import org.cef.CefSettings;
import org.cef.browser.CefBrowser;
import org.cef.browser.CefFrame;
import org.cef.callback.CefStringVisitor;
import org.cef.handler.CefDisplayHandlerAdapter;
import org.cef.handler.CefLoadHandlerAdapter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* FulltextFetcher implementation that attempts to find a PDF URL at ACS.
* FulltextFetcher implementation that attempts to find a PDF URL at <a href="https://pubs.acs.org/">ACS</a>.
*/
public class ACS implements FulltextFetcher {
private static final Logger LOGGER = LoggerFactory.getLogger(ACS.class);
Expand All @@ -42,24 +46,51 @@ public Optional<URL> findFullText(BibEntry entry) throws IOException {

String source = SOURCE.formatted(doi.get().getDOI());

try (final WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
webClient.getOptions().setSSLClientProtocols("TLSv1.3", "TLSv1.2");
// inspired by https://www.innoq.com/en/blog/2016/01/webscraping/
webClient.getCookieManager().setCookiesEnabled(true);
webClient.getOptions().setJavaScriptEnabled(true);
webClient.getOptions().setTimeout(10_000);
webClient.waitForBackgroundJavaScript(5000);
webClient.getOptions().setThrowExceptionOnScriptError(false);
webClient.getOptions().setPrintContentOnFailingStatusCode(true);
CefAppBuilder builder = new CefAppBuilder();
builder.setAppHandler(new MavenCefAppHandlerAdapter(){});
CefApp cefApp;
try {
cefApp = builder.build();
} catch (Exception e) {
LOGGER.error("Could not initialize CEF", e);
throw new IOException(e);
}

CefClient client = cefApp.createClient();
CefBrowser browser = client.createBrowser(source, false, false);

client.addLoadHandler(new CefLoadHandlerAdapter() {
/**
 * Writes the loaded page's markup to the browser console once the main frame has finished loading.
 *
 * <p>The value of a bare JavaScript expression is not handed back to the Java caller, so the
 * script explicitly emits the markup via {@code console.log}; a console-message listener
 * registered on the client can then receive it.
 *
 * @param browser        the browser in which the load finished
 * @param frame          the frame that finished loading; frames other than the main frame are ignored
 * @param httpStatusCode the HTTP status code of the load (not used here)
 */
@Override
public void onLoadEnd(CefBrowser browser, CefFrame frame, int httpStatusCode) {
    // Only the top-level document is of interest, not embedded iframes.
    if (!frame.isMain()) {
        return;
    }
    frame.executeJavaScript(
            "console.log(document.documentElement.outerHTML);",
            frame.getURL(),
            0
    );
}
});

HtmlPage page = webClient.getPage(source);
boolean pdfButtonExists = page.querySelectorAll("a[title=\"PDF\"].article__btn__secondary").isEmpty();
if (pdfButtonExists) {
LOGGER.info("Fulltext PDF found at ACS.");
// We "guess" the URL instead of parsing the HTML for the actual link
return Optional.of(new URL(source.replaceFirst("/abs/", "/pdf/")));
client.addDisplayHandler(new CefDisplayHandlerAdapter() {
/**
 * Prints every browser console message to standard output, prefixed with a page-content label.
 *
 * @param browser the browser that produced the console message
 * @param level   severity of the console message (not used here)
 * @param message the console message text
 * @param source  origin of the message as reported by the browser (not used here)
 * @param line    line number reported with the message (not used here)
 * @return always {@code true}
 */
@Override
public boolean onConsoleMessage(CefBrowser browser, CefSettings.LogSeverity level, String message, String source, int line) {
    // The console is used as the channel for the result of the injected JavaScript.
    final String labelledOutput = "Page HTML content:\n" + message;
    System.out.println(labelledOutput);
    return true;
}
});

browser.loadURL(source);

try {
Thread.sleep(5000);
} catch (
InterruptedException e) {
throw new RuntimeException(e);
}

return Optional.empty();
}

Expand Down

0 comments on commit b256ba7

Please sign in to comment.