Skip to content

Commit

Permalink
gatherer: Disallow multiple crawls of the same IP address
Browse files Browse the repository at this point in the history
This might slow things down too much, but let's see if it gets us blocked
less.
  • Loading branch information
ato committed Jul 11, 2024
1 parent 05b3554 commit 13d281a
Showing 1 changed file with 14 additions and 2 deletions.
16 changes: 14 additions & 2 deletions gatherer/src/pandas/gatherer/core/GatherManager.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@

import java.io.File;
import java.io.IOException;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.nio.file.Path;
import java.time.Instant;
import java.time.LocalDateTime;
Expand Down Expand Up @@ -105,14 +107,24 @@ Instance nextInstance(String gatherMethod, String threadName) {
Instant startOfThisMinute = LocalDateTime.now().withSecond(0).atZone(ZoneId.systemDefault()).toInstant();
for (Title title : titleRepository.fetchNewGathers(gatherMethod, Instant.now(), startOfThisMinute)) {
String primarySeedHost = title.getPrimarySeedHost();
String ipAddress = null;
if (primarySeedHost == null) {
log.warn("Title.getPrimarySeedHost() returned null for title {}", title.getHumanId());
primarySeedHost = "unknown";
}
} else {
try {
var address = InetAddress.getByName(primarySeedHost);
ipAddress = address.getHostAddress();
} catch (UnknownHostException e) {
// oh well
}
}
if (!currentlyGatheringTitles.containsKey(title.getId()) &&
!currentlyGatheringHosts.containsKey(primarySeedHost)) {
!currentlyGatheringHosts.containsKey(primarySeedHost) &&
(ipAddress == null || currentlyGatheringHosts.containsKey(ipAddress))) {
currentlyGatheringTitles.put(title.getId(), threadName);
currentlyGatheringHosts.put(primarySeedHost, threadName);
if (ipAddress != null) currentlyGatheringHosts.put(ipAddress, threadName);
return instanceService.createInstance(gatherMethod, title);
}
}
Expand Down

0 comments on commit 13d281a

Please sign in to comment.