diff --git a/rust/lance-encoding/compression-algo/fsst/examples/benchmark.rs b/rust/lance-encoding/compression-algo/fsst/examples/benchmark.rs
index 0b3555edd2a..b3e5bc6cfeb 100644
--- a/rust/lance-encoding/compression-algo/fsst/examples/benchmark.rs
+++ b/rust/lance-encoding/compression-algo/fsst/examples/benchmark.rs
@@ -4,7 +4,7 @@
 use fsst::fsst::{compress, decompress, FSST_SYMBOL_TABLE_SIZE};
 use rand::Rng;
 
-const TEST_NUM: usize = 10;
+const TEST_NUM: usize = 20;
 const BUFFER_SIZE: usize = 8 * 1024 * 1024;
 
 use arrow::array::StringArray;
diff --git a/rust/lance-encoding/compression-algo/fsst/src/fsst.rs b/rust/lance-encoding/compression-algo/fsst/src/fsst.rs
index e1666cd29b5..3cee547db9c 100644
--- a/rust/lance-encoding/compression-algo/fsst/src/fsst.rs
+++ b/rust/lance-encoding/compression-algo/fsst/src/fsst.rs
@@ -796,27 +796,183 @@ fn decompress_bulk(
     let symbols = decoder.symbols;
     let lens = decoder.lens;
     let mut decompress = |mut in_curr: usize, in_end: usize, out_curr: &mut usize| {
-        let mut prev_esc = false;
-        while in_curr < in_end {
-            if prev_esc {
-                out[*out_curr] = compressed_strs[in_curr];
-                *out_curr += 1;
-                prev_esc = false;
+        while in_curr + 4 <= in_end {
+            let next_block;
+            let mut code;
+            let mut len;
+            unsafe {
+                next_block =
+                    ptr::read_unaligned(compressed_strs.as_ptr().add(in_curr) as *const u32);
+            }
+            let escape_mask = (next_block & 0x80808080u32)
+                & ((((!next_block) & 0x7F7F7F7Fu32) + 0x7F7F7F7Fu32) ^ 0x80808080u32);
+            if escape_mask == 0 {
+                // 0th byte (no byte of this block is 0xFF, so all four are decoded as codes).
+                // NOTE(review): every store writes 8 bytes but advances by `len`; confirm `out` slack.
+                code = compressed_strs[in_curr] as usize;
+                len = lens[code] as usize;
+                unsafe {
+                    let src = symbols[code];
+                    ptr::write_unaligned(out.as_mut_ptr().add(*out_curr) as *mut u64, src);
+                }
+                in_curr += 1;
+                *out_curr += len;
+
+                // 1st byte
+                code = compressed_strs[in_curr] as usize;
+                len = lens[code] as usize;
+                unsafe {
+                    let src = symbols[code];
+                    ptr::write_unaligned(out.as_mut_ptr().add(*out_curr) as *mut u64, src);
+                }
+                in_curr += 1;
+                *out_curr += len;
+
+                // 2nd byte
+                code = compressed_strs[in_curr] as usize;
+                len = lens[code] as usize;
+                unsafe {
+                    let src = symbols[code];
+                    ptr::write_unaligned(out.as_mut_ptr().add(*out_curr) as *mut u64, src);
+                }
+                in_curr += 1;
+                *out_curr += len;
+
+                // 3rd byte
+                code = compressed_strs[in_curr] as usize;
+                len = lens[code] as usize;
+                unsafe {
+                    let src = symbols[code];
+                    ptr::write_unaligned(out.as_mut_ptr().add(*out_curr) as *mut u64, src);
+                }
+                in_curr += 1;
+                *out_curr += len;
             } else {
-                let code = compressed_strs[in_curr];
-                if code == FSST_ESC {
-                    prev_esc = true;
+                let first_escape_pos = u32::from_le(escape_mask).trailing_zeros() >> 3;
+                if first_escape_pos == 3 {
+                    // 0th byte
+                    code = compressed_strs[in_curr] as usize;
+                    len = lens[code] as usize;
+                    unsafe {
+                        let src = symbols[code];
+                        ptr::write_unaligned(out.as_mut_ptr().add(*out_curr) as *mut u64, src);
+                    }
+                    in_curr += 1;
+                    *out_curr += len;
+
+                    // 1st byte
+                    code = compressed_strs[in_curr] as usize;
+                    len = lens[code] as usize;
+                    unsafe {
+                        let src = symbols[code];
+                        ptr::write_unaligned(out.as_mut_ptr().add(*out_curr) as *mut u64, src);
+                    }
+                    in_curr += 1;
+                    *out_curr += len;
+
+                    // 2nd byte
+                    code = compressed_strs[in_curr] as usize;
+                    len = lens[code] as usize;
+                    unsafe {
+                        let src = symbols[code];
+                        ptr::write_unaligned(out.as_mut_ptr().add(*out_curr) as *mut u64, src);
+                    }
+                    in_curr += 1;
+                    *out_curr += len;
+
+                    // escape byte: skip it and copy the byte that follows it verbatim
+                    in_curr += 2;
+                    out[*out_curr] = compressed_strs[in_curr - 1];
+                    *out_curr += 1;
+                } else if first_escape_pos == 2 {
+                    // 0th byte
+                    code = compressed_strs[in_curr] as usize;
+                    len = lens[code] as usize;
+                    unsafe {
+                        let src = symbols[code];
+                        ptr::write_unaligned(out.as_mut_ptr().add(*out_curr) as *mut u64, src);
+                    }
+                    in_curr += 1;
+                    *out_curr += len;
+
+                    // 1st byte
+                    code = compressed_strs[in_curr] as usize;
+                    len = lens[code] as usize;
+                    unsafe {
+                        let src = symbols[code];
+                        ptr::write_unaligned(out.as_mut_ptr().add(*out_curr) as *mut u64, src);
+                    }
+                    in_curr += 1;
+                    *out_curr += len;
+
+                    // escape byte: skip it and copy the byte that follows it verbatim
+                    in_curr += 2;
+                    out[*out_curr] = compressed_strs[in_curr - 1];
+                    *out_curr += 1;
+                } else if first_escape_pos == 1 {
+                    // 0th byte
+                    code = compressed_strs[in_curr] as usize;
+                    len = lens[code] as usize;
+                    unsafe {
+                        let src = symbols[code];
+                        ptr::write_unaligned(out.as_mut_ptr().add(*out_curr) as *mut u64, src);
+                    }
+                    in_curr += 1;
+                    *out_curr += len;
+
+                    // escape byte: skip it and copy the byte that follows it verbatim
+                    in_curr += 2;
+                    out[*out_curr] = compressed_strs[in_curr - 1];
+                    *out_curr += 1;
+                } else {
+                    // escape byte: skip it and copy the byte that follows it verbatim
+                    in_curr += 2;
+                    out[*out_curr] = compressed_strs[in_curr - 1];
+                    *out_curr += 1;
+                }
+            }
+        }
+
+        // handle the remaining bytes (fewer than 4 are left once the block loop exits)
+        if in_curr + 2 <= in_end {
+            out[*out_curr] = compressed_strs[in_curr + 1];
+            if compressed_strs[in_curr] != FSST_ESC {
+                let code = compressed_strs[in_curr] as usize;
+                unsafe {
+                    let src = symbols[code];
+                    ptr::write_unaligned(out.as_mut_ptr().add(*out_curr) as *mut u64, src);
+                }
+                in_curr += 1;
+                *out_curr += lens[code] as usize;
+                if compressed_strs[in_curr] != FSST_ESC {
+                    let code = compressed_strs[in_curr] as usize;
+                    unsafe {
+                        let src = symbols[code];
+                        ptr::write_unaligned(out.as_mut_ptr().add(*out_curr) as *mut u64, src);
+                    }
+                    in_curr += 1;
+                    *out_curr += lens[code] as usize;
                 } else {
-                    let s = symbols[code as usize];
-                    let len = lens[code as usize];
-                    out[*out_curr..*out_curr + len as usize]
-                        .copy_from_slice(&s.to_ne_bytes()[..len as usize]);
-                    *out_curr += len as usize;
+                    in_curr += 2;
+                    out[*out_curr] = compressed_strs[in_curr - 1];
+                    *out_curr += 1;
                 }
+            } else {
+                in_curr += 2;
+                *out_curr += 1;
             }
-            in_curr += 1;
+        }
+
+        if in_curr < in_end {
+            // last code cannot be an escape code
+            let code = compressed_strs[in_curr] as usize;
+            unsafe {
+                let src = symbols[code];
+                ptr::write_unaligned(out.as_mut_ptr().add(*out_curr) as *mut u64, src);
+            }
+            *out_curr += lens[code] as usize;
         }
     };
+
     let mut out_curr = *out_pos;
     out_offsets[0] = 0;
     for i in 1..offsets.len() {
@@ -1229,6 +1385,72 @@ mod tests {
 Doth make the night joint-labourer with the
day? Who is't that can inform me?"; + const TEST_PARAGRAPH2: &str = "Towards the end of November, during a thaw, at nine o’clock one morning, a train on the Warsaw and Petersburg railway was approaching the latter city at full speed. +The morning was so damp and misty that it was only with great difficulty that the day succeeded in breaking; +and it was impossible to distinguish anything more than a few yards away from the carriage windows. +Some of the passengers by this particular train were returning from abroad; but the third-class carriages were the best filled, chiefly with insignificant persons of various occupations and degrees, +picked up at the different stations nearer town. +All of them seemed weary, and most of them had sleepy eyes and a shivering expression, while their complexions generally appeared to have taken on the colour of the fog outside. +When day dawned, two passengers in one of the third-class carriages found themselves opposite each other. Both were young fellows, both were rather poorly dressed, both had remarkable faces, +and both were evidently anxious to start a conversation. +If they had but known why, at this particular moment, they were both remarkable persons, they would undoubtedly have wondered at the strange chance which had set them down opposite to one another in a third-class carriage of the Warsaw Railway Company. +One of them was a young fellow of about twenty-seven, not tall, with black curling hair, and small, grey, fiery eyes. His nose was broad and flat, and he had high cheek bones; his thin lips were constantly compressed into an impudent, +ironical—it might almost be called a malicious—smile; +but his forehead was high and well formed, and atoned for a good deal of the ugliness of the lower part of his face. 
+A special feature of this physiognomy was its death-like pallor, which gave to the whole man an indescribably emaciated appearance in spite of his hard look, +and at the same time a sort of passionate and suffering expression which did not harmonize with his impudent, +sarcastic smile and keen, self-satisfied bearing. +He wore a large fur—or rather astrachan—overcoat, which had kept him warm all night, while his neighbour had been obliged to bear the full severity of a Russian November night entirely unprepared. +His wide sleeveless mantle with a large cape to it—the sort of cloak one sees upon travellers during the winter months in Switzerland or North Italy—was by no means adapted to the long cold journey through Russia, from Eydkuhnen to St. Petersburg. +The wearer of this cloak was a young fellow, also of about twenty-six or twenty-seven years of age, slightly above the middle height, very fair, with a thin, pointed and very light coloured beard; +his eyes were large and blue, and had an intent look about them, yet that heavy expression which some people affirm to be a peculiarity as well as evidence, of an epileptic subject. +His face was decidedly a pleasant one for all that; refined, but quite colourless, except for the circumstance that at this moment it was blue with cold. +He held a bundle made up of an old faded silk handkerchief that apparently contained all his travelling wardrobe, and wore thick shoes and gaiters, his whole appearance being very un-Russian. +His black-haired neighbour inspected these peculiarities, having nothing better to do, and at length remarked, with that rude enjoyment of the discomforts of others which the common classes so often show: +“Cold?” +“Very,” said his neighbour, readily, “and this is a thaw, too. Fancy if it had been a hard frost! I never thought it would be so cold in the old country. I’ve grown quite out of the way of it.” +“What, been abroad, I suppose?” +“Yes, straight from Switzerland.” +“Wheugh! 
my goodness!” The black-haired young fellow whistled, and then laughed. +The conversation proceeded. The readiness of the fair-haired young man in the cloak to answer all his opposite neighbour’s questions was surprising. +He seemed to have no suspicion of any impertinence or inappropriateness in the fact of such questions being put to him. +Replying to them, he made known to the inquirer that he certainly had been long absent from Russia, more than four years; that he had been sent abroad for his health; +that he had suffered from some strange nervous malady—a kind of epilepsy, with convulsive spasms. His interlocutor burst out laughing several times at his answers; and more than ever, when to the question, “whether he had been cured?” the patient replied: +“No, they did not cure me.” +“Hey! that’s it! You stumped up your money for nothing, and we believe in those fellows, here!” remarked the black-haired individual, sarcastically."; + + const TEST_PARAGRAPH3: &str = "When the widow hurried away to Pavlofsk, she went straight to Daria Alexeyevna’s house, and telling all she knew, threw her into a state of great alarm. +Both ladies decided to communicate at once with Lebedeff, who, as the friend and landlord of the prince, was also much agitated. +Vera Lebedeff told all she knew, and by Lebedeff’s advice it was decided that all three should go to Petersburg as quickly as possible, in order to avert “what might so easily happen.” +This is how it came about that at eleven o’clock next morning Rogojin’s flat was opened by the police in the presence of Lebedeff, the two ladies, and Rogojin’s own brother, who lived in the wing. +The evidence of the porter went further than anything else towards the success of Lebedeff in gaining the assistance of the police. +He declared that he had seen Rogojin return to the house last night, accompanied by a friend, and that both had gone upstairs very secretly and cautiously. 
+After this there was no hesitation about breaking open the door, since it could not be got open in any other way. +Rogojin suffered from brain fever for two months. When he recovered from the attack he was at once brought up on trial for murder. +He gave full, satisfactory, and direct evidence on every point; and the prince’s name was, thanks to this, not brought into the proceedings. +Rogojin was very quiet during the progress of the trial. He did not contradict his clever and eloquent counsel, who argued that the brain fever, +or inflammation of the brain, was the cause of the crime; clearly proving that this malady had existed long before the murder was perpetrated, and had been brought on by the sufferings of the accused. +But Rogojin added no words of his own in confirmation of this view, and as before, he recounted with marvellous exactness the details of his crime. +He was convicted, but with extenuating circumstances, and condemned to hard labour in Siberia for fifteen years. He heard his sentence grimly, silently, and thoughtfully. His colossal fortune, +with the exception of the comparatively small portion wasted in the first wanton period of his inheritance, went to his brother, to the great satisfaction of the latter. +The old lady, Rogojin’s mother, is still alive, and remembers her favourite son Parfen sometimes, but not clearly. God spared her the knowledge of this dreadful calamity which had overtaken her house. +Lebedeff, Keller, Gania, Ptitsin, and many other friends of ours continue to live as before. There is scarcely any change in them, so that there is no need to tell of their subsequent doings. +Hippolyte died in great agitation, and rather sooner than he expected, about a fortnight after Nastasia Philipovna’s death. Colia was much affected by these events, +and drew nearer to his mother in heart and sympathy. Nina Alexandrovna is anxious, because he is “thoughtful beyond his years,” but he will, we think, make a useful and active man. 
+The prince’s further fate was more or less decided by Colia, who selected, out of all the persons he had met during the last six or seven months, Evgenie Pavlovitch, as friend and confidant. +To him he made over all that he knew as to the events above recorded, and as to the present condition of the prince. He was not far wrong in his choice. +Evgenie Pavlovitch took the deepest interest in the fate of the unfortunate “idiot,” and, thanks to his influence, the prince found himself once more with Dr. Schneider, in Switzerland. +Evgenie Pavlovitch, who went abroad at this time, intending to live a long while on the continent, being, as he often said, quite superfluous in Russia, visits his sick friend at Schneider’s every few months. +But Dr. Schneider frowns ever more and more and shakes his head; he hints that the brain is fatally injured; he does not as yet declare that his patient is incurable, but he allows himself to express the gravest fears. +Evgenie takes this much to heart, and he has a heart, as is proved by the fact that he receives and even answers letters from Colia. But besides this, +another trait in his character has become apparent, and as it is a good trait we will make haste to reveal it. +After each visit to Schneider’s establishment, Evgenie Pavlovitch writes another letter, besides that to Colia, giving the most minute particulars concerning the invalid’s condition. +In these letters is to be detected, and in each one more than the last, a growing feeling of friendship and sympathy. +The individual who corresponds thus with Evgenie Pavlovitch, and who engages so much of his attention and respect, is Vera Lebedeff. +We have never been able to discover clearly how such relations sprang up. +Of course the root of them was in the events which we have already recorded, and which so filled Vera with grief on the prince’s account that she fell seriously ill. 
+But exactly how the acquaintance and friendship came about, we cannot say.";
+
     #[test_log::test(tokio::test)]
     async fn test_symbol_new() {
         let st = SymbolTable::new();
@@ -1245,13 +1467,42 @@ mod tests {
 
     #[test_log::test(tokio::test)]
     async fn test_fsst() {
-        let test_paragraph_len = TEST_PARAGRAPH.len();
-        let repeat_num = 8 * 1024 * 1024 / test_paragraph_len;
-        let paragraph = TEST_PARAGRAPH.to_string().repeat(repeat_num);
-        let words = paragraph.lines().collect::<Vec<&str>>();
-        let string_array = StringArray::from(words);
-        let mut compress_output_buf: Vec<u8> = vec![0; 16 * 1024 * 1024];
-        let mut compress_offset_buf: Vec<i32> = vec![0; 16 * 1024 * 1024];
+        let test_input_size = 8 * 1024 * 1024;
+        let repeat_num = test_input_size / TEST_PARAGRAPH.len();
+        let test_input = TEST_PARAGRAPH.repeat(repeat_num);
+        helper(&test_input);
+
+        let test_input_size = 16 * 1024 * 1024;
+        let repeat_num = test_input_size / TEST_PARAGRAPH.len();
+        let test_input = TEST_PARAGRAPH.repeat(repeat_num);
+        helper(&test_input);
+
+        let test_input_size = 8 * 1024 * 1024;
+        let repeat_num = test_input_size / TEST_PARAGRAPH2.len();
+        let test_input = TEST_PARAGRAPH2.repeat(repeat_num);
+        helper(&test_input);
+
+        let test_input_size = 16 * 1024 * 1024;
+        let repeat_num = test_input_size / TEST_PARAGRAPH2.len();
+        let test_input = TEST_PARAGRAPH2.repeat(repeat_num);
+        helper(&test_input);
+
+        let test_input_size = 8 * 1024 * 1024;
+        let repeat_num = test_input_size / TEST_PARAGRAPH3.len();
+        let test_input = TEST_PARAGRAPH3.repeat(repeat_num);
+        helper(&test_input);
+
+        let test_input_size = 16 * 1024 * 1024;
+        let repeat_num = test_input_size / TEST_PARAGRAPH3.len();
+        let test_input = TEST_PARAGRAPH3.repeat(repeat_num);
+        helper(&test_input);
+    }
+
+    fn helper(test_input: &str) {
+        let lines_vec = test_input.lines().collect::<Vec<&str>>();
+        let string_array = StringArray::from(lines_vec);
+        let mut compress_output_buf: Vec<u8> = vec![0; string_array.value_data().len()];
+        let mut compress_offset_buf: Vec<i32> = vec![0; string_array.value_offsets().len()];
         let mut symbol_table = [0; FSST_SYMBOL_TABLE_SIZE];
         compress(
             symbol_table.as_mut(),
@@ -1261,8 +1512,8 @@ mod tests {
             &mut compress_offset_buf,
         )
         .unwrap();
-        let mut decompress_output: Vec<u8> = vec![0; 3 * 16 * 1024 * 1024];
-        let mut decompress_offsets: Vec<i32> = vec![0; 3 * 16 * 1024 * 1024];
+        let mut decompress_output: Vec<u8> = vec![0; compress_output_buf.len() * 8];
+        let mut decompress_offsets: Vec<i32> = vec![0; compress_offset_buf.len()];
         decompress(
             &symbol_table,
             &compress_output_buf,