better testing and better parameters
mg98 committed Nov 7, 2023
1 parent f5ef185 commit 0a0350a
Showing 6 changed files with 86 additions and 36 deletions.
6 changes: 3 additions & 3 deletions config.ini
@@ -15,11 +15,11 @@ Optimizer = SGD
EpochScale = 1

# Learning rate for the optimizer
LearningRate = 0.0005
LearningRate = 0.0006

# Number of hidden layers and units per layer
HiddenLayers = 2
HiddenUnits = 512
HiddenUnits = 1024

# Regularization through dropout to prevent overfitting (set 0 to disable)
Dropout = 0.2
@@ -28,4 +28,4 @@ Dropout = 0.2
Quantize = False

# Number of results to return on each query
NumberOfResults = 9
NumberOfResults = 10
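
The tuned values above (LearningRate 0.0005 → 0.0006, HiddenUnits 512 → 1024, NumberOfResults 9 → 10) are plain INI keys; a minimal sketch of reading them with Python's configparser, assuming they sit in the default section (the real file may use a named section, and the project's Config class wraps this):

# Minimal sketch: loading the tuned keys from config.ini with configparser.
# The section ('DEFAULT') is an assumption; the key names match the diff above.
from configparser import ConfigParser

parser = ConfigParser()
parser.read('config.ini')
section = parser['DEFAULT']

learning_rate = section.getfloat('LearningRate')   # 0.0006 after this commit
hidden_units = section.getint('HiddenUnits')       # 1024 after this commit
num_results = section.getint('NumberOfResults')    # 10 after this commit
print(learning_rate, hidden_units, num_results)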
14 changes: 7 additions & 7 deletions main.py
@@ -25,7 +25,6 @@

# Enhance normal dataclasses for IPv8 (see the serialization documentation)
dataclass = overwrite_dataclass(dataclass)

@dataclass(msg_id=1) # The value 1 identifies this message and must be unique per community
class UpdateModel:
id: bytes
@@ -58,6 +57,7 @@ def started(self) -> None:
self.ltr = LTR(cfg)

if args.simulation:
print(cfg)
print(fmt('Enter query for simulation', 'yellow'))
query = input(f"\r{fmt('QUERY', 'purple')}: ")

@@ -79,18 +79,18 @@ def started(self) -> None:
remaining_results.pop(selected_id)

# For result #1, e.g., simulate sim_epochs=100 clicks; for result #2, simulate 90 clicks; etc.
selected_results = []
sim_epochs = int(input(f"\r{fmt('Number of epochs on #1 (e.g., 1000)', 'yellow')}: "))
sim_epoch_diff = int(input(f"\r{fmt('Deduction per rank (e.g., 100)', 'yellow')}: "))
clicks = [] # list of clicked result indices
for i in range(len(ranked_result_ids)):
if sim_epoch_diff <= 0: break
selected_results += [list(results.keys()).index(ranked_result_ids[i])] * (sim_epochs - i*sim_epoch_diff)
random.shuffle(selected_results)
clicks.extend([i] * (sim_epochs - i*sim_epoch_diff))
random.shuffle(clicks)

print(fmt(f'Training model on simulation ({len(selected_results)} epochs)...', 'gray'))
print(fmt(f'Training model on simulation ({len(clicks) * cfg.epoch_scale} epochs)...', 'gray'))
with silence():
for res in selected_results:
self.ltr.on_result_selected(query, ranked_result_ids, res)
for i in clicks:
self.ltr.train(self.ltr.gen_train_data(query, ranked_result_ids, i))

inferred_ranking = list(self.ltr.query(query).keys())
print(fmt(f'nDCG: {round(ndcg(ranked_result_ids, inferred_ranking), 3)}', 'yellow'))
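
The rewritten simulation records clicked result indices and feeds them through gen_train_data/train instead of the old on_result_selected path, and the reported epoch count is scaled by cfg.epoch_scale. A self-contained sketch of the click schedule it builds (here the loop stops once a rank's count would drop to zero; the committed loop guards on sim_epoch_diff instead, and non-positive counts simply add no clicks):

# Standalone sketch of the simulated click schedule: rank i receives
# (sim_epochs - i * sim_epoch_diff) clicks, shuffled before training.
import random

def simulate_clicks(num_results: int, sim_epochs: int, sim_epoch_diff: int) -> list[int]:
    clicks = []
    for i in range(num_results):
        n = sim_epochs - i * sim_epoch_diff
        if n <= 0:
            break
        clicks.extend([i] * n)
    random.shuffle(clicks)
    return clicks

# With the example prompts (1000 epochs on #1, deduction of 100 per rank) and
# 10 results: 1000 + 900 + ... + 100 = 5500 clicked indices.
print(len(simulate_clicks(10, 1000, 100)))  # 5500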
12 changes: 12 additions & 0 deletions p2p_ol2r/config.py
@@ -13,3 +13,15 @@ def __init__(self, parser: ConfigParser):
self.dropout = parser.getfloat('Dropout')
self.quantize = parser.getboolean('Quantize')
self.number_of_results = parser.getint('NumberOfResults')

def __str__(self) -> str:
return (f"Config(SingleOutput={self.single_output}, "
f"LossFunction={self.loss_fn.__name__ if self.loss_fn else 'None'}, "
f"Optimizer={self.optimizer.__name__ if self.optimizer else 'None'}, "
f"EpochScale={self.epoch_scale}, "
f"LearningRate={self.lr}, "
f"HiddenLayers={self.hidden_layers}, "
f"HiddenUnits={self.hidden_units}, "
f"Dropout={self.dropout}, "
f"Quantize={self.quantize}, "
f"NumberOfResults={self.number_of_results})")
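
With the values from config.ini after this commit, the print(cfg) added in main.py yields a one-line summary like Config(..., Optimizer=SGD, EpochScale=1, LearningRate=0.0006, HiddenLayers=2, HiddenUnits=1024, Dropout=0.2, Quantize=False, NumberOfResults=10). The __name__ lookups exist because loss_fn and optimizer presumably hold classes (or functions) rather than instances; a small illustration of that choice:

# Why __str__ prints .__name__: printing the bare class would be noisy.
import torch

optimizer = torch.optim.SGD
print(optimizer)           # <class 'torch.optim.sgd.SGD'>
print(optimizer.__name__)  # SGD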
1 change: 1 addition & 0 deletions p2p_ol2r/model.py
@@ -9,6 +9,7 @@
from .utils import *
from .config import Config


class ModelInput(torch.Tensor):
"""
A tensor representing the input to the model (the query-doc-doc triplet).
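
The model.py hunk only adds a blank line around ModelInput, whose docstring describes the query-doc-doc triplet fed to the model. Purely as an illustration (the actual ModelInput implementation is not shown in this diff), such a triplet could be assembled by concatenating the three 768-dimensional embeddings:

# Hypothetical triplet assembly, NOT the project's ModelInput: concatenate
# the query, candidate doc A and candidate doc B into one 2304-dim tensor.
import numpy as np
import torch

def make_triplet(query_vec: np.ndarray, doc_a: np.ndarray, doc_b: np.ndarray) -> torch.Tensor:
    return torch.from_numpy(np.concatenate([query_vec, doc_a, doc_b])).float()

q = np.random.rand(768)
triplet = make_triplet(q, np.random.rand(768), np.random.rand(768))
print(triplet.shape)  # torch.Size([2304])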
77 changes: 57 additions & 20 deletions tests/test_ltr.py
@@ -1,52 +1,89 @@
import unittest
from unittest.mock import patch
from p2p_ol2r.ltr import *
from p2p_ol2r.utils import *
from tests import cfg

@patch('p2p_ol2r.ltr.LTR.embed', new=lambda _, __: np.array([0.123] * 768)) # embeds query
@patch('p2p_ol2r.ltr.LTR._get_results', new=lambda _, __: [f'id{i}' for i in range(cfg.number_of_results)])
class TestLTR(unittest.TestCase):

ltr = LTR(cfg)
def setUp(self) -> None:
self.ltr = LTR(cfg)
self.ltr.embeddings_map = {
f'id{i}': np.random.RandomState(i).rand(768) for i in range(cfg.number_of_results)
}
self.ltr.metadata = {
f'id{i}': f'title{i}' for i in range(cfg.number_of_results)
}
self.doc_ids = [f'id{i}' for i in range(cfg.number_of_results)]
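
setUp now seeds each fake embedding with np.random.RandomState(i), so test_gen_train_data can rebuild exactly the same vectors when constructing its expected output; a quick illustration of that determinism:

# RandomState(seed) is deterministic: the same seed reproduces the same vector,
# so expected embeddings can be regenerated independently of setUp.
import numpy as np

a = np.random.RandomState(3).rand(768)
b = np.random.RandomState(3).rand(768)
print(np.array_equal(a, b))  # True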

@patch('p2p_ol2r.ltr.LTR.embed', new=lambda _, x: np.array([0.123] * 768))
@patch.dict(LTR.embeddings_map, {
'id0': np.array([.0] * 768),
'id1': np.array([.1] * 768),
'id2': np.array([.2] * 768),
'id3': np.array([.3] * 768),
})
def test_gen_train_data(self):
query = 'molecular tumor'
query_vec = np.array([.123] * 768)
results = [f'id{i}' for i in range(4)]
train_data = self.ltr.gen_train_data(query, results, 1)
selected_res = 1
train_data = self.ltr.gen_train_data(query, self.doc_ids, selected_res)
expected_train_data = [
ModelInput(query_vec, np.array([.1] * 768), np.array([.0] * 768)),
ModelInput(query_vec, np.array([.1] * 768), np.array([.2] * 768)),
ModelInput(query_vec, np.array([.1] * 768), np.array([.3] * 768))
ModelInput(
query_vec,
np.random.RandomState(selected_res).rand(768), np.random.RandomState(i).rand(768)
) for i in range(cfg.number_of_results) if i != selected_res
]
self.assertEqual(len(train_data), len(expected_train_data))
for generated, expected in zip(train_data, expected_train_data):
self.assertTrue(torch.allclose(generated, expected))
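
The expectation encoded above is that gen_train_data pairs the clicked result's embedding against every other result's embedding, clicked document first. A hypothetical reference for that pairing (the real LTR.gen_train_data is not shown in this diff and also embeds the query text itself):

# Hypothetical reference for the pairing the test expects (sketch only).
from p2p_ol2r.model import ModelInput

def expected_pairs(query_vec, embeddings, selected):
    return [
        ModelInput(query_vec, embeddings[selected], embeddings[other])
        for other in range(len(embeddings))
        if other != selected
    ]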

def test_rank_results(self):
q = np.random.rand(768)
k = cfg.number_of_results
doc_ids = [f'id{i}' for i in range(k)]
self.ltr.embeddings_map = {id: np.random.rand(768) for id in doc_ids}

# We mock `infer` to respond positively for each query.
# We expect the results to be ordered as they are in `doc_ids`.
for r in [(True, (1.0, 0.0)), (True, (0.51, 0.49)), (True, 1.0), (True, 0.51)]:
with patch('p2p_ol2r.model.LTRModel.infer', lambda _, __: r):
ordered_docs = self.ltr.rank_results(q, doc_ids)
self.assertListEqual(ordered_docs, doc_ids)
ordered_docs = self.ltr.rank_results(q, self.doc_ids)
self.assertListEqual(ordered_docs, self.doc_ids)

# We mock `infer` to respond negatively for each query.
# We expect the results to be ordered reversed to what they are in `doc_ids`.
for r in [(False, (0.0, 1.0)), (False, (0.49, 0.51)), (False, 0.0), (False, 0.49)]:
with patch('p2p_ol2r.model.LTRModel.infer', lambda _, __: r):
ordered_docs = self.ltr.rank_results(q, doc_ids)
self.assertListEqual(ordered_docs, list(reversed(doc_ids)))
ordered_docs = self.ltr.rank_results(q, self.doc_ids)
self.assertListEqual(ordered_docs, list(reversed(self.doc_ids)))

def test_query(self):
# We mock `infer` to respond positively for each query.
# We expect the results to be ordered as they are in `doc_ids`.
for r in [(True, (1.0, 0.0)), (True, (0.51, 0.49)), (True, 1.0), (True, 0.51)]:
with patch('p2p_ol2r.model.LTRModel.infer', lambda _, __: r):
self.assertDictEqual(self.ltr.query('molecular tumor'), {
f'id{i}': f'title{i}' for i in range(cfg.number_of_results)
})
# We mock `infer` to respond negatively for each query.
# We expect the results to be ordered reversed to what they are in `doc_ids`.
for r in [(False, (0.0, 1.0)), (False, (0.49, 0.51)), (False, 0.0), (False, 0.49)]:
with patch('p2p_ol2r.model.LTRModel.infer', lambda _, __: r):
self.assertDictEqual(self.ltr.query('molecular tumor'), {
f'id{i}': f'title{i}' for i in reversed(range(cfg.number_of_results))
})

def test_train_and_query(self):
k = cfg.number_of_results
q = self.ltr.embed('molecular tumor')
docs = list(self.ltr.embeddings_map.values())
train_data = []

for i in range(k-1):
# docs[i] to be above all others
i_over_all = [ModelInput(q, docs[i], docs[j]) for j in range(k) if i != j]
epochs = max(0, k*10 - i*10)
train_data.extend(i_over_all * epochs)

with silence(): self.ltr.train(train_data)

res = self.ltr.query('molecular tumor')
self.assertDictEqual(res, {
f'id{i}': f'title{i}' for i in range(cfg.number_of_results)
})
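
test_train_and_query (like test_full_ranking in test_model.py) weights the synthetic preferences by rank: the pairs that put docs[i] on top are repeated max(0, k*10 - i*10) times. A quick count of the training data that produces with k = 10:

# Worked count of the rank-weighted training data: docs[i] is preferred over
# the other k-1 docs, and each such pair is repeated (k*10 - i*10) times.
k = 10
per_rank = [(k - 1) * max(0, k * 10 - i * 10) for i in range(k - 1)]
print(per_rank)       # [900, 810, 720, 630, 540, 450, 360, 270, 180]
print(sum(per_rank))  # 4860 ModelInput triplets in total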

if __name__ == "__main__":
unittest.main()
12 changes: 6 additions & 6 deletions tests/test_model.py
@@ -8,7 +8,7 @@
def setUp():
ltr_model = LTRModel(cfg)
q = np.random.rand(768)
docs = [np.random.rand(768) for _ in range(cfg.number_of_results)]
docs = [np.random.RandomState(i).rand(768) for i in range(cfg.number_of_results)]
return ltr_model, cfg.number_of_results, q, docs

class TestModel(unittest.TestCase):
@@ -69,17 +69,17 @@ def test_full_ranking(self):
for i in range(k-1):
# docs[i] to be above all others
i_over_all = [ModelInput(q, docs[i], docs[j]) for j in range(k) if i != j]
epochs = max(0, k*10 - i*10) # with k=9 and epochs+100, this test will fail
epochs = max(0, k*10 - i*10)
train_data.extend(i_over_all * epochs)

with silence(): ltr_model.train(train_data)

for i in range(k-1):
for j in range(i+1, k):
res, _ = ltr_model.infer(ModelInput(q, docs[i], docs[j]))
self.assertTrue(res)
res, _ = ltr_model.infer(ModelInput(q, docs[j], docs[i]))
self.assertFalse(res)
res, v = ltr_model.infer(ModelInput(q, docs[i], docs[j]))
self.assertTrue(res, f'docs[{i}] > docs[{j}] failed with {v}')
res, v = ltr_model.infer(ModelInput(q, docs[j], docs[i]))
self.assertFalse(res, f'docs[{j}] > docs[{i}] failed with {v}')
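
The assertion messages added here name the failing pair; the surrounding loop checks every unordered pair in both directions. The same enumeration via itertools, and the number of infer calls it implies with k = 10:

# Equivalent enumeration of the pairs tested in test_full_ranking.
from itertools import combinations

k = 10
pairs = list(combinations(range(k), 2))
print(len(pairs))      # 45 unordered pairs
print(2 * len(pairs))  # 90 infer calls in total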

if __name__ == "__main__":
unittest.main()
