Skip to content

Commit ae36a83

Browse files
committed
feat(cli): add hashing group with journal-entry
Hopefully there are enough operator toggles to tune this run, as there is a large number of records in the database. This should allow for balancing load vs locking. Signed-off-by: Mike Fiedler <miketheman@gmail.com>
1 parent a091728 commit ae36a83

File tree

2 files changed

+251
-0
lines changed

2 files changed

+251
-0
lines changed

tests/unit/cli/test_hashing.py

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
# Licensed under the Apache License, Version 2.0 (the "License");
2+
# you may not use this file except in compliance with the License.
3+
# You may obtain a copy of the License at
4+
#
5+
# http://www.apache.org/licenses/LICENSE-2.0
6+
#
7+
# Unless required by applicable law or agreed to in writing, software
8+
# distributed under the License is distributed on an "AS IS" BASIS,
9+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10+
# See the License for the specific language governing permissions and
11+
# limitations under the License.
12+
import hashlib
13+
14+
import pretend
15+
16+
from warehouse import db
17+
from warehouse.cli import hashing
18+
19+
from ...common.db.packaging import JournalEntry, JournalEntryFactory
20+
21+
22+
def remote_addr_salty_hash(remote_addr, salt):
    """Return the SHA-256 hex digest of *remote_addr* concatenated with *salt*.

    Mirrors the hashing scheme applied by the CLI command under test, so
    tests can compute the expected stored value for a given address/salt.
    """
    salted = remote_addr + salt
    return hashlib.sha256(salted.encode()).hexdigest()
24+
25+
26+
class TestHashingJournalEntry:
    """Tests for the `warehouse hashing journal-entry` CLI command."""

    def test_no_records_to_hash(self, cli, db_request, monkeypatch):
        """With an empty journals table the command reports no work and exits 0."""
        engine = pretend.stub()
        config = pretend.stub(registry={"sqlalchemy.engine": engine})
        session_cls = pretend.call_recorder(lambda bind: db_request.db)
        monkeypatch.setattr(db, "Session", session_cls)

        assert db_request.db.query(JournalEntry).count() == 0

        args = ["--salt", "test"]

        result = cli.invoke(hashing.journal_entry, args, obj=config)

        assert result.exit_code == 0
        assert result.output.strip() == "No rows to hash. Done!"

    # NOTE: renamed from `tests_hashes_records` to follow the `test_*`
    # naming convention used by the sibling tests in this class.
    def test_hashes_records(self, cli, db_request, remote_addr, monkeypatch):
        """A single pass hashes at most --batch-size rows, leaving the rest."""
        engine = pretend.stub()
        config = pretend.stub(registry={"sqlalchemy.engine": engine})
        session_cls = pretend.call_recorder(lambda bind: db_request.db)
        monkeypatch.setattr(db, "Session", session_cls)

        # create some JournalEntry records with unhashed ip addresses
        JournalEntryFactory.create_batch(3, submitted_from=remote_addr)
        assert db_request.db.query(JournalEntry).count() == 3

        salt = "NaCl"
        salted_hash = remote_addr_salty_hash(remote_addr, salt)

        args = [
            "--salt",
            salt,
            "--batch-size",
            "2",
        ]

        result = cli.invoke(hashing.journal_entry, args, obj=config)

        assert result.exit_code == 0
        assert result.output.strip() == "Hashing 2 rows...\nHashed 2 rows"
        # check that two of the ip addresses have been hashed
        # (exactly one row is left with the raw address)
        assert (
            db_request.db.query(JournalEntry)
            .filter_by(submitted_from=remote_addr)
            .one()
        )
        assert (
            db_request.db.query(JournalEntry)
            .filter_by(submitted_from=salted_hash)
            .count()
            == 2
        )

    def test_continue_until_done(self, cli, db_request, remote_addr, monkeypatch):
        """With --continue-until-done the command keeps batching until no rows remain."""
        engine = pretend.stub()
        config = pretend.stub(registry={"sqlalchemy.engine": engine})
        session_cls = pretend.call_recorder(lambda bind: db_request.db)
        monkeypatch.setattr(db, "Session", session_cls)

        # create some JournalEntry records with unhashed ip addresses
        JournalEntryFactory.create_batch(3, submitted_from=remote_addr)

        salt = "NaCl"
        salted_hash = remote_addr_salty_hash(remote_addr, salt)

        args = [
            "--salt",
            salt,
            "--batch-size",
            "1",
            "--sleep-time",
            "0",
            "--continue-until-done",
        ]

        result = cli.invoke(hashing.journal_entry, args, obj=config)

        assert result.exit_code == 0
        # check that all the ip addresses have been hashed
        assert (
            db_request.db.query(JournalEntry)
            .filter_by(submitted_from=salted_hash)
            .count()
            == 3
        )
        assert (
            db_request.db.query(JournalEntry)
            .filter_by(submitted_from=remote_addr)
            .count()
            == 0
        )

warehouse/cli/hashing.py

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
# Licensed under the Apache License, Version 2.0 (the "License");
2+
# you may not use this file except in compliance with the License.
3+
# You may obtain a copy of the License at
4+
#
5+
# http://www.apache.org/licenses/LICENSE-2.0
6+
#
7+
# Unless required by applicable law or agreed to in writing, software
8+
# distributed under the License is distributed on an "AS IS" BASIS,
9+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10+
# See the License for the specific language governing permissions and
11+
# limitations under the License.
12+
import hashlib
13+
import time
14+
15+
import click
16+
17+
from warehouse.cli import warehouse
18+
19+
20+
# Top-level `warehouse hashing` command group; subcommands attach to it via
# @hashing.command(). NOTE: the docstring below doubles as the group's
# user-visible --help text, so it is left exactly as-is.
@warehouse.group()
def hashing():
    """
    Run Hashing operations for Warehouse data
    """
25+
26+
27+
@hashing.command()
@click.option(
    "-s",
    "--salt",
    prompt=True,
    hide_input=True,
    help="Pass value instead of prompting for salt",
)
@click.option(
    "-b",
    "--batch-size",
    default=10_000,
    show_default=True,
    help="Number of rows to hash at a time",
)
@click.option(
    "-st",
    "--sleep-time",
    default=1,
    show_default=True,
    help="Number of seconds to sleep between batches",
)
@click.option(
    "--continue-until-done",
    is_flag=True,
    default=False,
    help="Continue hashing until all rows are hashed",
)
@click.pass_obj
def journal_entry(
    config,
    salt: str,
    batch_size: int,
    sleep_time: int,
    continue_until_done: bool,
):
    """
    Hash `journals.submitted_from` column with salt
    """
    # Deferred import: at module scope this file may only import from
    # warehouse.cli, so the database machinery is pulled in lazily here.
    from warehouse.db import Session

    # One session per CLI invocation; the worker reuses it across batches.
    db_session = Session(bind=config.registry["sqlalchemy.engine"])

    _hash_journal_entries_submitted_from(
        db_session,
        salt,
        batch_size,
        sleep_time,
        continue_until_done,
    )
77+
78+
79+
def _hash_journal_entries_submitted_from(
80+
session,
81+
salt: str,
82+
batch_size: int,
83+
sleep_time: int,
84+
continue_until_done: bool,
85+
) -> None:
86+
"""
87+
Perform hashing of the `journals.submitted_from` column
88+
89+
Broken out from the CLI command so that it can be called recursively.
90+
"""
91+
from sqlalchemy import func, select
92+
93+
from warehouse.packaging.models import JournalEntry
94+
95+
# Get rows a batch at a time, only if the row hasn't already been hashed
96+
# (i.e. the value is shorter than 64 characters)
97+
unhashed_rows = session.scalars(
98+
select(JournalEntry)
99+
.where(func.length(JournalEntry.submitted_from) < 63)
100+
.order_by(JournalEntry.submitted_date)
101+
.limit(batch_size)
102+
).all()
103+
104+
# If there are no rows to hash, we're done
105+
if not unhashed_rows:
106+
click.echo("No rows to hash. Done!")
107+
return
108+
109+
how_many = len(unhashed_rows)
110+
111+
# Hash the value rows
112+
click.echo(f"Hashing {how_many} rows...")
113+
for row in unhashed_rows:
114+
row.submitted_from = hashlib.sha256(
115+
(row.submitted_from + salt).encode("utf8")
116+
).hexdigest()
117+
118+
# Update the rows
119+
session.add_all(unhashed_rows)
120+
session.commit()
121+
122+
# If there are more rows to hash, recurse until done
123+
if continue_until_done and how_many == batch_size:
124+
click.echo(f"Hashed {batch_size} rows. Sleeping for {sleep_time} second(s)...")
125+
time.sleep(sleep_time)
126+
_hash_journal_entries_submitted_from(
127+
session,
128+
salt,
129+
batch_size,
130+
sleep_time,
131+
continue_until_done,
132+
)
133+
else:
134+
click.echo(f"Hashed {how_many} rows")
135+
return

0 commit comments

Comments
 (0)