Skip to content

Commit

Permalink
Before and After methods (#175)
Browse files Browse the repository at this point in the history
* Added before and after functions

* add tests

* formatting
  • Loading branch information
ArztKlein authored Nov 17, 2022
1 parent 0202efd commit 3b3e78d
Show file tree
Hide file tree
Showing 2 changed files with 118 additions and 0 deletions.
36 changes: 36 additions & 0 deletions tests/test_cdx_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,3 +176,39 @@ def test_near() -> None:
filters=["statuscode:200"],
)
cdx.near(unix_timestamp=1286705410)


def test_before() -> None:
    """Check that ``before`` returns the expected google.com capture.

    Queries the live CDX server for status-200 captures of google.com and
    asserts on the snapshot returned for the reference Wayback timestamp
    20160731235949.
    """
    agent = (
        "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
    )
    api = WaybackMachineCDXServerAPI(
        url="http://www.google.com/",
        user_agent=agent,
        filters=["statuscode:200"],
    )

    snapshot = api.before(wayback_machine_timestamp=20160731235949)

    assert "20160731233347" in snapshot.timestamp
    assert "google" in snapshot.urlkey
    assert "google.com" in snapshot.original
    assert "google.com" in snapshot.archive_url


def test_after() -> None:
    """Check that ``after`` returns the expected google.com capture.

    Queries the live CDX server for status-200 captures of google.com and
    asserts on the snapshot returned for the reference Wayback timestamp
    20160731235949.
    """
    agent = (
        "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
    )
    api = WaybackMachineCDXServerAPI(
        url="http://www.google.com/",
        user_agent=agent,
        filters=["statuscode:200"],
    )

    snapshot = api.after(wayback_machine_timestamp=20160731235949)

    # The failure message echoes the timestamp that was actually returned.
    assert "20160801000917" in snapshot.timestamp, snapshot.timestamp
    assert "google" in snapshot.urlkey
    assert "google.com" in snapshot.original
    assert "google.com" in snapshot.archive_url
82 changes: 82 additions & 0 deletions waybackpy/cdx_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,88 @@ def add_payload(self, payload: Dict[str, str]) -> None:

payload["url"] = self.url

def before(
    self,
    year: Optional[int] = None,
    month: Optional[int] = None,
    day: Optional[int] = None,
    hour: Optional[int] = None,
    minute: Optional[int] = None,
    unix_timestamp: Optional[int] = None,
    wayback_machine_timestamp: Optional[Union[int, str]] = None,
) -> CDXSnapshot:
    """
    Gets the nearest archive before the given datetime.

    The reference moment is resolved in this order:

    1. ``unix_timestamp`` if it is truthy (converted with
       ``unix_timestamp_to_wayback_timestamp``),
    2. ``wayback_machine_timestamp`` if it is truthy (converted with
       ``str``),
    3. otherwise ``year``/``month``/``day``/``hour``/``minute``, where any
       component left as ``None`` is taken from the current UTC time.

    Side effects: overwrites ``self.closest``, ``self.sort`` and
    ``self.limit`` before iterating ``self.snapshots()``.

    Returns the first snapshot yielded by ``self.snapshots()`` whose
    ``timestamp`` compares less than the reference timestamp.

    Raises ``NoCDXRecordFound`` if no yielded snapshot satisfies that.
    """
    if unix_timestamp:
        timestamp = unix_timestamp_to_wayback_timestamp(unix_timestamp)
    elif wayback_machine_timestamp:
        timestamp = str(wayback_machine_timestamp)
    else:
        # Fill every missing date component from "now" in UTC.
        now = datetime.utcnow().timetuple()
        timestamp = wayback_timestamp(
            year=now.tm_year if year is None else year,
            month=now.tm_mon if month is None else month,
            day=now.tm_mday if day is None else day,
            hour=now.tm_hour if hour is None else hour,
            minute=now.tm_min if minute is None else minute,
        )

    # Ask for results ordered by proximity to the reference timestamp.
    # NOTE(review): presumably the first match in this ordering is the
    # nearest earlier capture - confirm against the CDX server behaviour.
    self.closest = timestamp
    self.sort = "closest"
    self.limit = 25000

    for snapshot in self.snapshots():
        # NOTE(review): assumes snapshot.timestamp is a str comparable to
        # ``timestamp`` - confirm against CDXSnapshot.
        if snapshot.timestamp < timestamp:
            return snapshot

    # If a snapshot isn't returned, then none were found.
    raise NoCDXRecordFound(
        "No records were found before the given date for the query. "
        "Either there are no archives before the given date, "
        "the URL may not have any archives, or the URL may have been "
        "recently archived and is still not available on the CDX server."
    )

def after(
    self,
    year: Optional[int] = None,
    month: Optional[int] = None,
    day: Optional[int] = None,
    hour: Optional[int] = None,
    minute: Optional[int] = None,
    unix_timestamp: Optional[int] = None,
    wayback_machine_timestamp: Optional[Union[int, str]] = None,
) -> CDXSnapshot:
    """
    Gets the nearest archive after the given datetime.

    The reference moment is resolved in this order:

    1. ``unix_timestamp`` if it is truthy (converted with
       ``unix_timestamp_to_wayback_timestamp``),
    2. ``wayback_machine_timestamp`` if it is truthy (converted with
       ``str``),
    3. otherwise ``year``/``month``/``day``/``hour``/``minute``, where any
       component left as ``None`` is taken from the current UTC time.

    Side effects: overwrites ``self.closest``, ``self.sort`` and
    ``self.limit`` before iterating ``self.snapshots()``.

    Returns the first snapshot yielded by ``self.snapshots()`` whose
    ``timestamp`` compares greater than the reference timestamp.

    Raises ``NoCDXRecordFound`` if no yielded snapshot satisfies that.
    """
    if unix_timestamp:
        timestamp = unix_timestamp_to_wayback_timestamp(unix_timestamp)
    elif wayback_machine_timestamp:
        timestamp = str(wayback_machine_timestamp)
    else:
        # Fill every missing date component from "now" in UTC.
        now = datetime.utcnow().timetuple()
        timestamp = wayback_timestamp(
            year=now.tm_year if year is None else year,
            month=now.tm_mon if month is None else month,
            day=now.tm_mday if day is None else day,
            hour=now.tm_hour if hour is None else hour,
            minute=now.tm_min if minute is None else minute,
        )

    # Ask for results ordered by proximity to the reference timestamp.
    # NOTE(review): presumably the first match in this ordering is the
    # nearest later capture - confirm against the CDX server behaviour.
    self.closest = timestamp
    self.sort = "closest"
    self.limit = 25000

    for snapshot in self.snapshots():
        # NOTE(review): assumes snapshot.timestamp is a str comparable to
        # ``timestamp`` - confirm against CDXSnapshot.
        if snapshot.timestamp > timestamp:
            return snapshot

    # If a snapshot isn't returned, then none were found.
    raise NoCDXRecordFound(
        "No records were found after the given date for the query. "
        "Either there are no archives after the given date, "
        "the URL may not have any archives, or the URL may have been "
        "recently archived and is still not available on the CDX server."
    )

def near(
self,
year: Optional[int] = None,
Expand Down

0 comments on commit 3b3e78d

Please sign in to comment.