diff --git a/config.ini b/config.ini
index bfb25f5..362c52e 100644
--- a/config.ini
+++ b/config.ini
@@ -15,11 +15,11 @@ Optimizer = SGD
 EpochScale = 1
 
 # Learning rate for the optimizer
-LearningRate = 0.0005
+LearningRate = 0.0006
 
 # Number of hidden layers and units per layer
 HiddenLayers = 2
-HiddenUnits = 512
+HiddenUnits = 1024
 
 # Regularization through dropout to prevent overfitting (set 0 to disable)
 Dropout = 0.2
@@ -28,4 +28,4 @@ Dropout = 0.2
 Quantize = False
 
 # Number of results to return on each query
-NumberOfResults = 9
\ No newline at end of file
+NumberOfResults = 10
\ No newline at end of file
diff --git a/main.py b/main.py
index 6e15776..eb429f8 100644
--- a/main.py
+++ b/main.py
@@ -25,7 +25,6 @@
 # Enhance normal dataclasses for IPv8 (see the serialization documentation)
 dataclass = overwrite_dataclass(dataclass)
 
-
 @dataclass(msg_id=1) # The value 1 identifies this message and must be unique per community
 class UpdateModel:
     id: bytes
@@ -58,6 +57,7 @@ def started(self) -> None:
         self.ltr = LTR(cfg)
 
         if args.simulation:
+            print(cfg)
             print(fmt('Enter query for simulation', 'yellow'))
             query = input(f"\r{fmt('QUERY', 'purple')}: ")
 
@@ -79,18 +79,18 @@ def started(self) -> None:
                 remaining_results.pop(selected_id)
 
             # For result #1, e.g., simulate sim_epochs=100 clicks; for result #2, simulate 90 clicks; etc.
-            selected_results = []
             sim_epochs = int(input(f"\r{fmt('Number of epochs on #1 (e.g., 1000)', 'yellow')}: "))
             sim_epoch_diff = int(input(f"\r{fmt('Deduction per rank (e.g., 100)', 'yellow')}: "))
+            clicks = [] # list of clicked result indices
             for i in range(len(ranked_result_ids)):
                 if sim_epoch_diff <= 0: break
-                selected_results += [list(results.keys()).index(ranked_result_ids[i])] * (sim_epochs - i*sim_epoch_diff)
-            random.shuffle(selected_results)
+                clicks.extend([i] * (sim_epochs - i*sim_epoch_diff))
+            random.shuffle(clicks)
 
-            print(fmt(f'Training model on simulation ({len(selected_results)} epochs)...', 'gray'))
+            print(fmt(f'Training model on simulation ({len(clicks) * cfg.epoch_scale} epochs)...', 'gray'))
             with silence():
-                for res in selected_results:
-                    self.ltr.on_result_selected(query, ranked_result_ids, res)
+                for i in clicks:
+                    self.ltr.train(self.ltr.gen_train_data(query, ranked_result_ids, i))
 
             inferred_ranking = list(self.ltr.query(query).keys())
             print(fmt(f'nDCG: {round(ndcg(ranked_result_ids, inferred_ranking), 3)}', 'yellow'))
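A side note on the reworked simulation loop in main.py above: a quick sketch of what it generates, assuming ten ranked results and the illustrative inputs sim_epochs = 100 and sim_epoch_diff = 10 (these numbers are not part of the patch):

    sim_epochs, sim_epoch_diff = 100, 10
    clicks = []
    for i in range(10):  # rank i receives fewer simulated clicks than rank i-1
        clicks.extend([i] * (sim_epochs - i * sim_epoch_diff))
    # clicks == [0]*100 + [1]*90 + ... + [9]*10, i.e. 550 entries in total,
    # so with EpochScale = 1 the banner reports 550 training epochs
    # (random.shuffle then randomizes the click order before training).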
diff --git a/p2p_ol2r/config.py b/p2p_ol2r/config.py
index e6a32ea..9008e38 100644
--- a/p2p_ol2r/config.py
+++ b/p2p_ol2r/config.py
@@ -13,3 +13,15 @@ def __init__(self, parser: ConfigParser):
         self.dropout = parser.getfloat('Dropout')
         self.quantize = parser.getboolean('Quantize')
         self.number_of_results = parser.getint('NumberOfResults')
+
+    def __str__(self) -> str:
+        return (f"Config(SingleOutput={self.single_output}, "
+                f"LossFunction={self.loss_fn.__name__ if self.loss_fn else 'None'}, "
+                f"Optimizer={self.optimizer.__name__ if self.optimizer else 'None'}, "
+                f"EpochScale={self.epoch_scale}, "
+                f"LearningRate={self.lr}, "
+                f"HiddenLayers={self.hidden_layers}, "
+                f"HiddenUnits={self.hidden_units}, "
+                f"Dropout={self.dropout}, "
+                f"Quantize={self.quantize}, "
+                f"NumberOfResults={self.number_of_results})")
\ No newline at end of file
diff --git a/p2p_ol2r/model.py b/p2p_ol2r/model.py
index 3ce3de8..5396a1e 100644
--- a/p2p_ol2r/model.py
+++ b/p2p_ol2r/model.py
@@ -9,6 +9,7 @@
 from .utils import *
 from .config import Config
 
+
 class ModelInput(torch.Tensor):
     """
     A tensor representing the input to the model (the query-doc-doc triplet).
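With the __str__ added to p2p_ol2r/config.py above, the print(cfg) call introduced in main.py emits a one-line summary at simulation start. Given the values in config.ini after this patch it would look roughly as follows; SingleOutput and LossFunction fall outside the shown hunks, so those two fields are left as placeholders:

    Config(SingleOutput=..., LossFunction=..., Optimizer=SGD, EpochScale=1, LearningRate=0.0006, HiddenLayers=2, HiddenUnits=1024, Dropout=0.2, Quantize=False, NumberOfResults=10)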
diff --git a/tests/test_ltr.py b/tests/test_ltr.py
index eaabbd1..997df4d 100644
--- a/tests/test_ltr.py
+++ b/tests/test_ltr.py
@@ -1,28 +1,33 @@
 import unittest
 from unittest.mock import patch
 
 from p2p_ol2r.ltr import *
+from p2p_ol2r.utils import *
 from tests import cfg
 
+@patch('p2p_ol2r.ltr.LTR.embed', new=lambda _, __: np.array([0.123] * 768)) # embeds query
+@patch('p2p_ol2r.ltr.LTR._get_results', new=lambda _, __: [f'id{i}' for i in range(cfg.number_of_results)])
 class TestLTR(unittest.TestCase):
-    ltr = LTR(cfg)
+    def setUp(self) -> None:
+        self.ltr = LTR(cfg)
+        self.ltr.embeddings_map = {
+            f'id{i}': np.random.RandomState(i).rand(768) for i in range(cfg.number_of_results)
+        }
+        self.ltr.metadata = {
+            f'id{i}': f'title{i}' for i in range(cfg.number_of_results)
+        }
+        self.doc_ids = [f'id{i}' for i in range(cfg.number_of_results)]
 
-    @patch('p2p_ol2r.ltr.LTR.embed', new=lambda _, x: np.array([0.123] * 768))
-    @patch.dict(LTR.embeddings_map, {
-        'id0': np.array([.0] * 768),
-        'id1': np.array([.1] * 768),
-        'id2': np.array([.2] * 768),
-        'id3': np.array([.3] * 768),
-    })
     def test_gen_train_data(self):
         query = 'molecular tumor'
         query_vec = np.array([.123] * 768)
-        results = [f'id{i}' for i in range(4)]
-        train_data = self.ltr.gen_train_data(query, results, 1)
+        selected_res = 1
+        train_data = self.ltr.gen_train_data(query, self.doc_ids, selected_res)
         expected_train_data = [
-            ModelInput(query_vec, np.array([.1] * 768), np.array([.0] * 768)),
-            ModelInput(query_vec, np.array([.1] * 768), np.array([.2] * 768)),
-            ModelInput(query_vec, np.array([.1] * 768), np.array([.3] * 768))
+            ModelInput(
+                query_vec,
+                np.random.RandomState(selected_res).rand(768), np.random.RandomState(i).rand(768)
+            ) for i in range(cfg.number_of_results) if i != selected_res
         ]
         self.assertEqual(len(train_data), len(expected_train_data))
         for generated, expected in zip(train_data, expected_train_data):
@@ -30,23 +35,55 @@ def test_rank_results(self):
     def test_rank_results(self):
         q = np.random.rand(768)
-        k = cfg.number_of_results
-        doc_ids = [f'id{i}' for i in range(k)]
-        self.ltr.embeddings_map = {id: np.random.rand(768) for id in doc_ids}
 
         # We mock `infer` to respond positively for each query.
         # We expect the results to be ordered as they are in `doc_ids`.
         for r in [(True, (1.0, 0.0)), (True, (0.51, 0.49)), (True, 1.0), (True, 0.51)]:
             with patch('p2p_ol2r.model.LTRModel.infer', lambda _, __: r):
-                ordered_docs = self.ltr.rank_results(q, doc_ids)
-                self.assertListEqual(ordered_docs, doc_ids)
+                ordered_docs = self.ltr.rank_results(q, self.doc_ids)
+                self.assertListEqual(ordered_docs, self.doc_ids)
 
         # We mock `infer` to respond negatively for each query.
         # We expect the results to be ordered reversed to what they are in `doc_ids`.
         for r in [(False, (0.0, 1.0)), (False, (0.49, 0.51)), (False, 0.0), (False, 0.49)]:
             with patch('p2p_ol2r.model.LTRModel.infer', lambda _, __: r):
-                ordered_docs = self.ltr.rank_results(q, doc_ids)
-                self.assertListEqual(ordered_docs, list(reversed(doc_ids)))
+                ordered_docs = self.ltr.rank_results(q, self.doc_ids)
+                self.assertListEqual(ordered_docs, list(reversed(self.doc_ids)))
+
+    def test_query(self):
+        # We mock `infer` to respond positively for each query.
+        # We expect the results to be ordered as they are in `doc_ids`.
+        for r in [(True, (1.0, 0.0)), (True, (0.51, 0.49)), (True, 1.0), (True, 0.51)]:
+            with patch('p2p_ol2r.model.LTRModel.infer', lambda _, __: r):
+                self.assertDictEqual(self.ltr.query('molecular tumor'), {
+                    f'id{i}': f'title{i}' for i in range(cfg.number_of_results)
+                })
+        # We mock `infer` to respond negatively for each query.
+        # We expect the results to be ordered reversed to what they are in `doc_ids`.
+        for r in [(False, (0.0, 1.0)), (False, (0.49, 0.51)), (False, 0.0), (False, 0.49)]:
+            with patch('p2p_ol2r.model.LTRModel.infer', lambda _, __: r):
+                self.assertDictEqual(self.ltr.query('molecular tumor'), {
+                    f'id{i}': f'title{i}' for i in reversed(range(cfg.number_of_results))
+                })
+
+    def test_train_and_query(self):
+        k = cfg.number_of_results
+        q = self.ltr.embed('molecular tumor')
+        docs = list(self.ltr.embeddings_map.values())
+        train_data = []
+
+        for i in range(k-1):
+            # docs[i] to be above all others
+            i_over_all = [ModelInput(q, docs[i], docs[j]) for j in range(k) if i != j]
+            epochs = max(0, k*10 - i*10)
+            train_data.extend(i_over_all * epochs)
+
+        with silence(): self.ltr.train(train_data)
+
+        res = self.ltr.query('molecular tumor')
+        self.assertDictEqual(res, {
+            f'id{i}': f'title{i}' for i in range(cfg.number_of_results)
+        })
 
 
 if __name__ == "__main__":
     unittest.main()
\ No newline at end of file
diff --git a/tests/test_model.py b/tests/test_model.py
index 21e9266..ff73329 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -8,7 +8,7 @@ def setUp():
     ltr_model = LTRModel(cfg)
     q = np.random.rand(768)
-    docs = [np.random.rand(768) for _ in range(cfg.number_of_results)]
+    docs = [np.random.RandomState(i).rand(768) for i in range(cfg.number_of_results)]
     return ltr_model, cfg.number_of_results, q, docs
 
 
 class TestModel(unittest.TestCase):
@@ -69,17 +69,17 @@ def test_full_ranking(self):
         for i in range(k-1):
             # docs[i] to be above all others
             i_over_all = [ModelInput(q, docs[i], docs[j]) for j in range(k) if i != j]
-            epochs = max(0, k*10 - i*10) # with k=9 and epochs+100, this test will fail
+            epochs = max(0, k*10 - i*10)
             train_data.extend(i_over_all * epochs)
 
         with silence(): ltr_model.train(train_data)
 
         for i in range(k-1):
             for j in range(i+1, k):
-                res, _ = ltr_model.infer(ModelInput(q, docs[i], docs[j]))
-                self.assertTrue(res)
-                res, _ = ltr_model.infer(ModelInput(q, docs[j], docs[i]))
-                self.assertFalse(res)
+                res, v = ltr_model.infer(ModelInput(q, docs[i], docs[j]))
+                self.assertTrue(res, f'docs[{i}] > docs[{j}] failed with {v}')
+                res, v = ltr_model.infer(ModelInput(q, docs[j], docs[i]))
+                self.assertFalse(res, f'docs[{j}] > docs[{i}] failed with {v}')
 
 if __name__ == "__main__":
     unittest.main()
\ No newline at end of file
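main.py scores the simulation with ndcg(ranked_result_ids, inferred_ranking) from p2p_ol2r.utils, whose implementation is not part of this diff. For orientation, here is a minimal sketch of a standard nDCG under the assumption that relevance grades are derived from positions in the ideal (click-simulated) ranking; the body below is illustrative, not the project's actual code:

    import numpy as np

    def ndcg(ideal_ranking: list, inferred_ranking: list) -> float:
        # Graded relevance: the top of the ideal ranking gets the highest grade.
        k = len(ideal_ranking)
        rel = {doc: k - pos for pos, doc in enumerate(ideal_ranking)}
        dcg = sum(rel[d] / np.log2(p + 2) for p, d in enumerate(inferred_ranking))
        idcg = sum(rel[d] / np.log2(p + 2) for p, d in enumerate(ideal_ranking))
        return dcg / idcg  # 1.0 exactly when the model reproduces the ideal order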