diff --git a/ghcide/ghcide.cabal b/ghcide/ghcide.cabal index a59c6ac70c..e390813cd4 100644 --- a/ghcide/ghcide.cabal +++ b/ghcide/ghcide.cabal @@ -50,7 +50,6 @@ library dlist, exceptions, extra >= 1.7.4, - fuzzy, filepath, fingertree, focus, @@ -205,6 +204,7 @@ library Development.IDE.Plugin.HLS.GhcIde Development.IDE.Plugin.Test Development.IDE.Plugin.TypeLenses + Text.Fuzzy.Parallel other-modules: Development.IDE.Core.FileExists @@ -216,7 +216,6 @@ library Development.IDE.Plugin.Completions.Logic Development.IDE.Session.VersionCheck Development.IDE.Types.Action - Text.Fuzzy.Parallel ghc-options: -Wall @@ -371,6 +370,7 @@ test-suite ghcide-tests directory, extra, filepath, + fuzzy, -------------------------------------------------------------- -- The MIN_VERSION_ghc macro relies on MIN_VERSION pragmas -- which require depending on ghc. So the tests need to depend @@ -385,11 +385,13 @@ test-suite ghcide-tests lsp, lsp-types, hls-plugin-api, - network-uri, lens, list-t, lsp-test ^>= 0.14, + monoid-subclasses, + network-uri, optparse-applicative, + parallel, process, QuickCheck, quickcheck-instances, @@ -410,6 +412,7 @@ test-suite ghcide-tests tasty-rerun, text, unordered-containers, + vector, if (impl(ghc >= 8.6) && impl(ghc < 9.2)) build-depends: record-dot-preprocessor, @@ -423,6 +426,7 @@ test-suite ghcide-tests Development.IDE.Test.Runfiles Experiments Experiments.Types + FuzzySearch Progress HieDbRetry default-extensions: diff --git a/ghcide/src/Development/IDE/Plugin/Completions/Logic.hs b/ghcide/src/Development/IDE/Plugin/Completions/Logic.hs index b7a538abad..845cb12c93 100644 --- a/ghcide/src/Development/IDE/Plugin/Completions/Logic.hs +++ b/ghcide/src/Development/IDE/Plugin/Completions/Logic.hs @@ -63,7 +63,7 @@ import Ide.Types (CommandId (..), import Language.LSP.Types import Language.LSP.Types.Capabilities import qualified Language.LSP.VFS as VFS -import Text.Fuzzy.Parallel (Scored (score_), +import Text.Fuzzy.Parallel (Scored (score), original) -- Chunk size used for parallelizing fuzzy matching @@ -590,7 +590,7 @@ getCompletions plId ideOpts CC {allModNamesAsNS, anyQualCompls, unqualCompls, qu $ (if T.null enteredQual then id else mapMaybe (T.stripPrefix enteredQual)) allModNamesAsNS - filtCompls = Fuzzy.filter chunkSize maxC prefixText ctxCompls "" "" (label . snd) + filtCompls = Fuzzy.filter chunkSize maxC prefixText ctxCompls (label . snd) where mcc = case maybe_parsed of @@ -668,7 +668,7 @@ getCompletions plId ideOpts CC {allModNamesAsNS, anyQualCompls, unqualCompls, qu return $ (fmap.fmap) snd $ sortBy (compare `on` lexicographicOrdering) $ - mergeListsBy (flip compare `on` score_) + mergeListsBy (flip compare `on` score) [ (fmap.fmap) (notQual,) filtModNameCompls , (fmap.fmap) (notQual,) filtKeywordCompls , (fmap.fmap.fmap) (toggleSnippets caps config) compls @@ -681,11 +681,11 @@ getCompletions plId ideOpts CC {allModNamesAsNS, anyQualCompls, unqualCompls, qu -- 3. In-scope completions rank next -- 4. label alphabetical ordering next -- 4. detail alphabetical ordering (proxy for module) - lexicographicOrdering Fuzzy.Scored{score_, original} = + lexicographicOrdering Fuzzy.Scored{score, original} = case original of (isQual, CompletionItem{_label,_detail}) -> do let isLocal = maybe False (":" `T.isPrefixOf`) _detail - (Down isQual, Down score_, Down isLocal, _label, _detail) + (Down isQual, Down score, Down isLocal, _label, _detail) diff --git a/ghcide/src/Text/Fuzzy/Parallel.hs b/ghcide/src/Text/Fuzzy/Parallel.hs index 4f7ad45c02..1fc6a4e679 100644 --- a/ghcide/src/Text/Fuzzy/Parallel.hs +++ b/ghcide/src/Text/Fuzzy/Parallel.hs @@ -1,96 +1,91 @@ -- | Parallel versions of 'filter' and 'simpleFilter' + module Text.Fuzzy.Parallel ( filter, simpleFilter, - Scored(..), - -- reexports - Fuzzy, + match, + Scored(..) ) where -import Control.Monad.ST (runST) -import Control.Parallel.Strategies (Eval, Strategy, evalTraversable, - parTraversable, rseq, using) -import Data.Monoid.Textual (TextualMonoid) -import Data.Vector (Vector, (!)) -import qualified Data.Vector as V --- need to use a stable sort -import Data.Bifunctor (second) -import Data.Char (toLower) -import Data.Maybe (fromMaybe) -import qualified Data.Monoid.Textual as T +import Control.Parallel.Strategies (rseq, using, parList, evalList) +import Data.Bits ((.|.)) +import Data.Maybe (fromMaybe, mapMaybe) +import qualified Data.Text as T +import qualified Data.Text.Internal as T +import qualified Data.Text.Array as TA import Prelude hiding (filter) -import Text.Fuzzy (Fuzzy (..)) -data Scored a = Scored {score_ :: !Int, original:: !a} - deriving (Functor,Show) +data Scored a = Scored {score :: !Int, original:: !a} + deriving (Functor, Show) -- | Returns the rendered output and the -- matching score for a pattern and a text. -- Two examples are given below: -- --- >>> match "fnt" "infinite" "" "" id True --- Just ("infinite",3) +-- >>> match "fnt" "infinite" +-- Just 3 -- --- >>> match "hsk" ("Haskell",1995) "<" ">" fst False --- Just ("aell",5) +-- >>> match "hsk" "Haskell" +-- Just 5 -- {-# INLINABLE match #-} -match :: (T.TextualMonoid s) - => s -- ^ Pattern in lowercase except for first character - -> t -- ^ The value containing the text to search in. - -> s -- ^ The text to add before each match. - -> s -- ^ The text to add after each match. - -> (t -> s) -- ^ The function to extract the text from the container. - -> Maybe (Fuzzy t s) -- ^ The original value, rendered string and score. -match pattern t pre post extract = - if null pat then Just (Fuzzy t result totalScore) else Nothing +match :: T.Text -- ^ Pattern in lowercase except for first character + -> T.Text -- ^ The text to search in. + -> Maybe Int -- ^ The score +match (T.Text pArr pOff pLen) (T.Text sArr sOff sLen) = go 0 1 pOff sOff where - null :: (T.TextualMonoid s) => s -> Bool - null = not . T.any (const True) - - s = extract t - (totalScore, _currScore, result, pat, _) = - T.foldl' - undefined - (\(tot, cur, res, pat, isFirst) c -> - case T.splitCharacterPrefix pat of - Nothing -> (tot, 0, res <> T.singleton c, pat, isFirst) - Just (x, xs) -> - -- the case of the first character has to match - -- otherwise use lower case since the pattern is assumed lower - let !c' = if isFirst then c else toLower c in - if x == c' then - let cur' = cur * 2 + 1 in - (tot + cur', cur', res <> pre <> T.singleton c <> post, xs, False) - else (tot, 0, res <> T.singleton c, pat, isFirst) - ) ( 0 - , 1 -- matching at the start gives a bonus (cur = 1) - , mempty, pattern, True) s + pTotal = pOff + pLen + sDelta = sOff + sLen - pTotal + + go !totalScore !currScore !currPOff !currSOff + -- If pattern has been matched in full + | currPOff >= pTotal + = Just totalScore + -- If there is not enough left to match the rest of the pattern, equivalent to + -- (sOff + sLen - currSOff) < (pOff + pLen - currPOff) + | currSOff > currPOff + sDelta + = Nothing + -- This is slightly broken for non-ASCII: + -- 1. If code units, consisting a single pattern code point, are found as parts + -- of different code points, it counts as a match. Unless you use a ton of emojis + -- as identifiers, such false positives should not be be a big deal, + -- and anyways HLS does not currently support such use cases, because it uses + -- code point and UTF-16 code unit positions interchangeably. + -- 2. Case conversions is not applied to non-ASCII code points, because one has + -- to call T.toLower (not T.map toLower), reallocating the string in full, which + -- is too much of performance penalty for fuzzy search. Again, anyway HLS does not + -- attempt to do justice to Unicode: proper Unicode text matching requires + -- `unicode-transforms` and friends. + -- Altogether we sacrifice correctness for the sake of performance, which + -- is a right trade-off for fuzzy search. + | pByte <- TA.unsafeIndex pArr currPOff + , sByte <- TA.unsafeIndex sArr currSOff + -- First byte (currPOff == pOff) should match exactly, otherwise - up to case. + , pByte == sByte || (currPOff /= pOff && pByte == toLowerAscii sByte) + = let curr = currScore * 2 + 1 in + go (totalScore + curr) curr (currPOff + 1) (currSOff + 1) + | otherwise + = go totalScore 0 currPOff (currSOff + 1) + + toLowerAscii w = if (w - 65) < 26 then w .|. 0x20 else w -- | The function to filter a list of values by fuzzy search on the text extracted from them. -filter :: (TextualMonoid s) - => Int -- ^ Chunk size. 1000 works well. - -> Int -- ^ Max. number of results wanted - -> s -- ^ Pattern. - -> [t] -- ^ The list of values containing the text to search in. - -> s -- ^ The text to add before each match. - -> s -- ^ The text to add after each match. - -> (t -> s) -- ^ The function to extract the text from the container. - -> [Scored t] -- ^ The list of results, sorted, highest score first. -filter chunkSize maxRes pattern ts pre post extract = runST $ do - let v = V.mapMaybe id - (V.map (\t -> match pattern' t pre post extract) (V.fromList ts) - `using` - parVectorChunk chunkSize (evalTraversable forceScore)) - perfectScore = score $ fromMaybe (error $ T.toString undefined pattern) $ - match pattern' pattern' "" "" id - return $ partialSortByAscScore maxRes perfectScore v +filter :: Int -- ^ Chunk size. 1000 works well. + -> Int -- ^ Max. number of results wanted + -> T.Text -- ^ Pattern. + -> [t] -- ^ The list of values containing the text to search in. + -> (t -> T.Text) -- ^ The function to extract the text from the container. + -> [Scored t] -- ^ The list of results, sorted, highest score first. +filter chunkSize maxRes pattern ts extract = partialSortByAscScore maxRes perfectScore (concat vss) where -- Preserve case for the first character, make all others lowercase - pattern' = case T.splitCharacterPrefix pattern of - Just (c, rest) -> T.singleton c <> T.map toLower rest - _ -> pattern + pattern' = case T.uncons pattern of + Just (c, rest) -> T.cons c (T.toLower rest) + _ -> pattern + vss = map (mapMaybe (\t -> flip Scored t <$> match pattern' (extract t))) (chunkList chunkSize ts) + `using` parList (evalList rseq) + perfectScore = fromMaybe (error $ T.unpack pattern) $ match pattern' pattern' -- | Return all elements of the list that have a fuzzy -- match against the pattern. Runs with default settings where @@ -99,84 +94,44 @@ filter chunkSize maxRes pattern ts pre post extract = runST $ do -- >>> simpleFilter "vm" ["vim", "emacs", "virtual machine"] -- ["vim","virtual machine"] {-# INLINABLE simpleFilter #-} -simpleFilter :: (TextualMonoid s) - => Int -- ^ Chunk size. 1000 works well. - -> Int -- ^ Max. number of results wanted - -> s -- ^ Pattern to look for. - -> [s] -- ^ List of texts to check. - -> [Scored s] -- ^ The ones that match. +simpleFilter :: Int -- ^ Chunk size. 1000 works well. + -> Int -- ^ Max. number of results wanted + -> T.Text -- ^ Pattern to look for. + -> [T.Text] -- ^ List of texts to check. + -> [Scored T.Text] -- ^ The ones that match. simpleFilter chunk maxRes pattern xs = - filter chunk maxRes pattern xs mempty mempty id - --------------------------------------------------------------------------------- - --- | Evaluation that forces the 'score' field -forceScore :: TextualMonoid s => Fuzzy t s -> Eval(Fuzzy t s) -forceScore it@Fuzzy{score} = do - score' <- rseq score - return it{score = score'} + filter chunk maxRes pattern xs id -------------------------------------------------------------------------------- --- | Divides a vector in chunks, applies the strategy in parallel to each chunk. -parVectorChunk :: Int -> Strategy a -> Vector a -> Eval (Vector a) -parVectorChunk chunkSize st v = - V.concat <$> parTraversable (evalTraversable st) (chunkVector chunkSize v) - --- >>> chunkVector 3 (V.fromList [0..10]) --- >>> chunkVector 3 (V.fromList [0..11]) --- >>> chunkVector 3 (V.fromList [0..12]) --- [[0,1,2],[3,4,5],[6,7,8],[9,10]] --- [[0,1,2],[3,4,5],[6,7,8],[9,10,11]] --- [[0,1,2],[3,4,5],[6,7,8],[9,10,11],[12]] -chunkVector :: Int -> Vector a -> [Vector a] -chunkVector chunkSize v = do - let indices = chunkIndices chunkSize (0,V.length v) - [V.slice l (h-l+1) v | (l,h) <- indices] - --- >>> chunkIndices 3 (0,9) --- >>> chunkIndices 3 (0,10) --- >>> chunkIndices 3 (0,11) --- [(0,2),(3,5),(6,8)] --- [(0,2),(3,5),(6,8),(9,9)] --- [(0,2),(3,5),(6,8),(9,10)] -chunkIndices :: Int -> (Int,Int) -> [(Int,Int)] -chunkIndices chunkSize (from,to) = - map (second pred) $ - pairwise $ - [from, from+chunkSize .. to-1] ++ [to] - -pairwise :: [a] -> [(a,a)] -pairwise [] = [] -pairwise [_] = [] -pairwise (x:y:xs) = (x,y) : pairwise (y:xs) +chunkList :: Int -> [a] -> [[a]] +chunkList chunkSize = go + where + go [] = [] + go xs = ys : go zs + where + (ys, zs) = splitAt chunkSize xs -- | A stable partial sort ascending by score. O(N) best case, O(wanted*N) worst case -partialSortByAscScore :: TextualMonoid s - => Int -- ^ Number of items needed +partialSortByAscScore + :: Int -- ^ Number of items needed -> Int -- ^ Value of a perfect score - -> Vector (Fuzzy t s) -> [Scored t] -partialSortByAscScore wantedCount perfectScore v = loop 0 (SortState minBound perfectScore 0) [] where - l = V.length v - loop index st@SortState{..} acc + -> [Scored t] +partialSortByAscScore wantedCount perfectScore orig = loop orig (SortState minBound perfectScore 0) [] where + loop [] st@SortState{..} acc | foundCount == wantedCount = reverse acc - | index == l --- ProgressCancelledException - = if bestScoreSeen < scoreWanted - then loop 0 st{scoreWanted = bestScoreSeen, bestScoreSeen = minBound} acc + | otherwise = if bestScoreSeen < scoreWanted + then loop orig st{scoreWanted = bestScoreSeen, bestScoreSeen = minBound} acc else reverse acc - | otherwise = - case v!index of - x | score x == scoreWanted - -> loop (index+1) st{foundCount = foundCount+1} (toScored x:acc) - | score x < scoreWanted && score x > bestScoreSeen - -> loop (index+1) st{bestScoreSeen = score x} acc - | otherwise - -> loop (index+1) st acc - -toScored :: TextualMonoid s => Fuzzy t s -> Scored t -toScored Fuzzy{..} = Scored score original + loop (x : xs) st@SortState{..} acc + | foundCount == wantedCount = reverse acc + | score x == scoreWanted + = loop xs st{foundCount = foundCount+1} (x:acc) + | score x < scoreWanted && score x > bestScoreSeen + = loop xs st{bestScoreSeen = score x} acc + | otherwise + = loop xs st acc data SortState a = SortState { bestScoreSeen :: !Int diff --git a/ghcide/test/exe/FuzzySearch.hs b/ghcide/test/exe/FuzzySearch.hs new file mode 100644 index 0000000000..f9794270ae --- /dev/null +++ b/ghcide/test/exe/FuzzySearch.hs @@ -0,0 +1,132 @@ +module FuzzySearch (tests) where + +import Control.Monad (guard) +import Data.Char (toLower) +import Data.Maybe (catMaybes) +import qualified Data.Monoid.Textual as T +import Data.Text (Text, inits, pack) +import qualified Data.Text as Text +import System.Directory (doesFileExist) +import System.IO.Unsafe (unsafePerformIO) +import System.Info.Extra (isWindows) +import Test.QuickCheck +import Test.Tasty +import Test.Tasty.ExpectedFailure +import Test.Tasty.HUnit +import Test.Tasty.QuickCheck (testProperty) +import Text.Fuzzy (Fuzzy (..)) +import qualified Text.Fuzzy as Fuzzy +import Text.Fuzzy.Parallel +import Prelude hiding (filter) + +tests :: TestTree +tests = + testGroup + "Fuzzy search" + [ needDictionary $ + testGroup + "match works as expected on the english dictionary" + [ testProperty "for legit words" propLegit, + testProperty "for prefixes" propPrefix, + testProperty "for typos" propTypo + ] + ] + +test :: Text -> Bool +test candidate = do + let previous = + catMaybes + [ (d,) . Fuzzy.score + <$> referenceImplementation candidate d "" "" id + | d <- dictionary + ] + new = catMaybes [(d,) <$> match candidate d | d <- dictionary] + previous == new + +propLegit :: Property +propLegit = forAll (elements dictionary) test + +propPrefix :: Property +propPrefix = forAll (elements dictionary >>= elements . inits) test + +propTypo :: Property +propTypo = forAll typoGen test + +typoGen :: Gen Text +typoGen = do + w <- elements dictionary + l <- elements [0 .. Text.length w -1] + let wl = Text.index w l + c <- elements [ c | c <- ['a' .. 'z'], c /= wl] + return $ replaceAt w l c + +replaceAt :: Text -> Int -> Char -> Text +replaceAt t i c = + let (l, r) = Text.splitAt i t + in l <> Text.singleton c <> r + +dictionaryPath :: FilePath +dictionaryPath = "/usr/share/dict/words" + +{-# NOINLINE dictionary #-} +dictionary :: [Text] +dictionary = unsafePerformIO $ do + existsDictionary <- doesFileExist dictionaryPath + if existsDictionary + then map pack . words <$> readFile dictionaryPath + else pure [] + +referenceImplementation :: + (T.TextualMonoid s) => + -- | Pattern in lowercase except for first character + s -> + -- | The value containing the text to search in. + t -> + -- | The text to add before each match. + s -> + -- | The text to add after each match. + s -> + -- | The function to extract the text from the container. + (t -> s) -> + -- | The original value, rendered string and score. + Maybe (Fuzzy t s) +referenceImplementation pattern t pre post extract = + if null pat then Just (Fuzzy t result totalScore) else Nothing + where + null :: (T.TextualMonoid s) => s -> Bool + null = not . T.any (const True) + + s = extract t + (totalScore, _currScore, result, pat, _) = + T.foldl' + undefined + ( \(tot, cur, res, pat, isFirst) c -> + case T.splitCharacterPrefix pat of + Nothing -> (tot, 0, res <> T.singleton c, pat, isFirst) + Just (x, xs) -> + -- the case of the first character has to match + -- otherwise use lower case since the pattern is assumed lower + let !c' = if isFirst then c else toLower c + in if x == c' + then + let cur' = cur * 2 + 1 + in ( tot + cur', + cur', + res <> pre <> T.singleton c <> post, + xs, + False + ) + else (tot, 0, res <> T.singleton c, pat, isFirst) + ) + ( 0, + 1, -- matching at the start gives a bonus (cur = 1) + mempty, + pattern, + True + ) + s + +needDictionary :: TestTree -> TestTree +needDictionary + | null dictionary = ignoreTestBecause ("not found: " <> dictionaryPath) + | otherwise = id diff --git a/ghcide/test/exe/Main.hs b/ghcide/test/exe/Main.hs index afc57c4bf7..6f4481f9e5 100644 --- a/ghcide/test/exe/Main.hs +++ b/ghcide/test/exe/Main.hs @@ -121,6 +121,7 @@ import Test.Tasty.Ingredients.Rerun import Test.Tasty.QuickCheck import Text.Printf (printf) import Text.Regex.TDFA ((=~)) +import qualified FuzzySearch -- | Wait for the next progress begin step waitForProgressBegin :: Session () @@ -6228,6 +6229,7 @@ unitTests = do let msg = printf "Timestamps do not have millisecond resolution: %dus" resolution_us assertBool msg (resolution_us <= 1000) , Progress.tests + , FuzzySearch.tests ] garbageCollectionTests :: TestTree