Skip to content

Commit

Permalink
This improves how Nutch reports failures, at least in local mode.
Browse files Browse the repository at this point in the history
  • Loading branch information
cube committed Oct 19, 2024
1 parent d6f55b8 commit 53cf454
Showing 1 changed file with 21 additions and 16 deletions.
37 changes: 21 additions & 16 deletions src/java/org/apache/nutch/crawl/Injector.java
Original file line number Diff line number Diff line change
Expand Up @@ -130,23 +130,28 @@ public static class InjectMapper

@Override
public void setup(Context context) {
Configuration conf = context.getConfiguration();
boolean normalize = conf.getBoolean(CrawlDbFilter.URL_NORMALIZING, true);
boolean filter = conf.getBoolean(CrawlDbFilter.URL_FILTERING, true);
filterNormalizeAll = conf.getBoolean(URL_FILTER_NORMALIZE_ALL, false);
if (normalize) {
scope = conf.get(URL_NORMALIZING_SCOPE, URLNormalizers.SCOPE_INJECT);
urlNormalizers = new URLNormalizers(conf, scope);
}
interval = conf.getInt("db.fetch.interval.default", 2592000);
if (filter) {
filters = new URLFilters(conf);
try {
Configuration conf = context.getConfiguration();
boolean normalize = conf.getBoolean(CrawlDbFilter.URL_NORMALIZING, true);
boolean filter = conf.getBoolean(CrawlDbFilter.URL_FILTERING, true);
filterNormalizeAll = conf.getBoolean(URL_FILTER_NORMALIZE_ALL, false);
if (normalize) {
scope = conf.get(URL_NORMALIZING_SCOPE, URLNormalizers.SCOPE_INJECT);
urlNormalizers = new URLNormalizers(conf, scope);
}
interval = conf.getInt("db.fetch.interval.default", 2592000);
if (filter) {
filters = new URLFilters(conf);
}
scfilters = new ScoringFilters(conf);
scoreInjected = conf.getFloat("db.score.injected", 1.0f);
curTime = conf.getLong("injector.current.time",
System.currentTimeMillis());
url404Purging = conf.getBoolean(CrawlDb.CRAWLDB_PURGE_404, false);
} catch (Exception e) {
LOG.error("Could not configure InjectMapper", e);
throw e;
}
scfilters = new ScoringFilters(conf);
scoreInjected = conf.getFloat("db.score.injected", 1.0f);
curTime = conf.getLong("injector.current.time",
System.currentTimeMillis());
url404Purging = conf.getBoolean(CrawlDb.CRAWLDB_PURGE_404, false);
}

/* Filter and normalize the input url */
Expand Down

0 comments on commit 53cf454

Please sign in to comment.