Speed up fuzzy search #2639


Merged
8 commits, merged Feb 19, 2022
10 changes: 7 additions & 3 deletions ghcide/ghcide.cabal
@@ -50,7 +50,6 @@ library
dlist,
exceptions,
extra >= 1.7.4,
fuzzy,
filepath,
fingertree,
focus,
@@ -205,6 +204,7 @@ library
Development.IDE.Plugin.HLS.GhcIde
Development.IDE.Plugin.Test
Development.IDE.Plugin.TypeLenses
Text.Fuzzy.Parallel

other-modules:
Development.IDE.Core.FileExists
@@ -216,7 +216,6 @@ library
Development.IDE.Plugin.Completions.Logic
Development.IDE.Session.VersionCheck
Development.IDE.Types.Action
Text.Fuzzy.Parallel

ghc-options:
-Wall
@@ -371,6 +370,7 @@ test-suite ghcide-tests
directory,
extra,
filepath,
fuzzy,
--------------------------------------------------------------
-- The MIN_VERSION_ghc macro relies on MIN_VERSION pragmas
-- which require depending on ghc. So the tests need to depend
@@ -385,11 +385,13 @@ test-suite ghcide-tests
lsp,
lsp-types,
hls-plugin-api,
network-uri,
lens,
list-t,
lsp-test ^>= 0.14,
monoid-subclasses,
network-uri,
optparse-applicative,
parallel,
process,
QuickCheck,
quickcheck-instances,
@@ -410,6 +412,7 @@ test-suite ghcide-tests
tasty-rerun,
text,
unordered-containers,
vector,
if (impl(ghc >= 8.6) && impl(ghc < 9.2))
build-depends:
record-dot-preprocessor,
@@ -423,6 +426,7 @@ test-suite ghcide-tests
Development.IDE.Test.Runfiles
Experiments
Experiments.Types
FuzzySearch
Progress
HieDbRetry
default-extensions:
10 changes: 5 additions & 5 deletions ghcide/src/Development/IDE/Plugin/Completions/Logic.hs
@@ -63,7 +63,7 @@ import Ide.Types (CommandId (..),
import Language.LSP.Types
import Language.LSP.Types.Capabilities
import qualified Language.LSP.VFS as VFS
import Text.Fuzzy.Parallel (Scored (score_),
import Text.Fuzzy.Parallel (Scored (score),
original)

-- Chunk size used for parallelizing fuzzy matching
@@ -590,7 +590,7 @@ getCompletions plId ideOpts CC {allModNamesAsNS, anyQualCompls, unqualCompls, qu
$ (if T.null enteredQual then id else mapMaybe (T.stripPrefix enteredQual))
allModNamesAsNS

filtCompls = Fuzzy.filter chunkSize maxC prefixText ctxCompls "" "" (label . snd)
filtCompls = Fuzzy.filter chunkSize maxC prefixText ctxCompls (label . snd)
where

mcc = case maybe_parsed of
@@ -668,7 +668,7 @@ getCompletions plId ideOpts CC {allModNamesAsNS, anyQualCompls, unqualCompls, qu
return $
(fmap.fmap) snd $
sortBy (compare `on` lexicographicOrdering) $
mergeListsBy (flip compare `on` score_)
mergeListsBy (flip compare `on` score)
[ (fmap.fmap) (notQual,) filtModNameCompls
, (fmap.fmap) (notQual,) filtKeywordCompls
, (fmap.fmap.fmap) (toggleSnippets caps config) compls
@@ -681,11 +681,11 @@ getCompletions plId ideOpts CC {allModNamesAsNS, anyQualCompls, unqualCompls, qu
-- 3. In-scope completions rank next
-- 4. label alphabetical ordering next
-- 4. detail alphabetical ordering (proxy for module)
lexicographicOrdering Fuzzy.Scored{score_, original} =
lexicographicOrdering Fuzzy.Scored{score, original} =
case original of
(isQual, CompletionItem{_label,_detail}) -> do
let isLocal = maybe False (":" `T.isPrefixOf`) _detail
(Down isQual, Down score_, Down isLocal, _label, _detail)
(Down isQual, Down score, Down isLocal, _label, _detail)



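For context on the call-site change above: the new Text.Fuzzy.Parallel.filter drops the pre/post highlighting arguments and only scores candidates, so Logic.hs now passes just the chunk size, the result cap, the typed prefix, the candidate list, and a text extractor. Below is a minimal standalone sketch of that five-argument API, assuming only the signature shown in the Text.Fuzzy.Parallel diff further down; the main wrapper, the limits, and the candidate data are made up for illustration.

{-# LANGUAGE OverloadedStrings #-}
import qualified Data.Text as T
import qualified Text.Fuzzy.Parallel as Fuzzy

main :: IO ()
main = mapM_ (print . Fuzzy.original) results
  where
    -- Chunk size 1000 (chunks are scored in parallel), at most 10 results,
    -- pattern "fnt"; fst extracts the text to match from each candidate.
    results :: [Fuzzy.Scored (T.Text, Int)]
    results = Fuzzy.filter 1000 10 "fnt" candidates fst
    -- Hypothetical (label, payload) pairs standing in for completion items.
    candidates = [("infinite", 1 :: Int), ("function", 2), ("filename", 3)]
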
233 changes: 94 additions & 139 deletions ghcide/src/Text/Fuzzy/Parallel.hs
@@ -1,96 +1,91 @@
-- | Parallel versions of 'filter' and 'simpleFilter'

module Text.Fuzzy.Parallel
( filter,
simpleFilter,
Scored(..),
-- reexports
Fuzzy,
match,
Scored(..)
) where

import Control.Monad.ST (runST)
import Control.Parallel.Strategies (Eval, Strategy, evalTraversable,
parTraversable, rseq, using)
import Data.Monoid.Textual (TextualMonoid)
import Data.Vector (Vector, (!))
import qualified Data.Vector as V
-- need to use a stable sort
import Data.Bifunctor (second)
import Data.Char (toLower)
import Data.Maybe (fromMaybe)
import qualified Data.Monoid.Textual as T
import Control.Parallel.Strategies (rseq, using, parList, evalList)
import Data.Bits ((.|.))
import Data.Maybe (fromMaybe, mapMaybe)
import qualified Data.Text as T
import qualified Data.Text.Internal as T
import qualified Data.Text.Array as TA
import Prelude hiding (filter)
import Text.Fuzzy (Fuzzy (..))

data Scored a = Scored {score_ :: !Int, original:: !a}
deriving (Functor,Show)
data Scored a = Scored {score :: !Int, original:: !a}
deriving (Functor, Show)

-- | Returns the rendered output and the
-- matching score for a pattern and a text.
-- Two examples are given below:
--
-- >>> match "fnt" "infinite" "" "" id True
-- Just ("infinite",3)
-- >>> match "fnt" "infinite"
-- Just 3
--
-- >>> match "hsk" ("Haskell",1995) "<" ">" fst False
-- Just ("<h>a<s><k>ell",5)
-- >>> match "hsk" "Haskell"
-- Just 5
--
{-# INLINABLE match #-}

match :: (T.TextualMonoid s)
=> s -- ^ Pattern in lowercase except for first character
-> t -- ^ The value containing the text to search in.
-> s -- ^ The text to add before each match.
-> s -- ^ The text to add after each match.
-> (t -> s) -- ^ The function to extract the text from the container.
-> Maybe (Fuzzy t s) -- ^ The original value, rendered string and score.
match pattern t pre post extract =
if null pat then Just (Fuzzy t result totalScore) else Nothing
match :: T.Text -- ^ Pattern in lowercase except for first character
-> T.Text -- ^ The text to search in.
-> Maybe Int -- ^ The score
match (T.Text pArr pOff pLen) (T.Text sArr sOff sLen) = go 0 1 pOff sOff
where
null :: (T.TextualMonoid s) => s -> Bool
null = not . T.any (const True)

s = extract t
(totalScore, _currScore, result, pat, _) =
T.foldl'
undefined
(\(tot, cur, res, pat, isFirst) c ->
case T.splitCharacterPrefix pat of
Nothing -> (tot, 0, res <> T.singleton c, pat, isFirst)
Just (x, xs) ->
-- the case of the first character has to match
-- otherwise use lower case since the pattern is assumed lower
let !c' = if isFirst then c else toLower c in
if x == c' then
let cur' = cur * 2 + 1 in
(tot + cur', cur', res <> pre <> T.singleton c <> post, xs, False)
else (tot, 0, res <> T.singleton c, pat, isFirst)
) ( 0
, 1 -- matching at the start gives a bonus (cur = 1)
, mempty, pattern, True) s
pTotal = pOff + pLen
sDelta = sOff + sLen - pTotal

go !totalScore !currScore !currPOff !currSOff
-- If pattern has been matched in full
| currPOff >= pTotal
= Just totalScore
-- If there is not enough left to match the rest of the pattern, equivalent to
-- (sOff + sLen - currSOff) < (pOff + pLen - currPOff)
| currSOff > currPOff + sDelta
= Nothing
-- This is slightly broken for non-ASCII:
-- 1. If code units constituting a single pattern code point are found as parts
-- of different code points, it counts as a match. Unless you use a ton of emojis
-- as identifiers, such false positives should not be a big deal,
-- and anyway HLS does not currently support such use cases, because it uses
-- code point and UTF-16 code unit positions interchangeably.
-- 2. Case conversion is not applied to non-ASCII code points, because one has
-- to call T.toLower (not T.map toLower), reallocating the string in full, which
-- is too much of a performance penalty for fuzzy search. Again, HLS does not
-- attempt to do justice to Unicode: proper Unicode text matching requires
-- `unicode-transforms` and friends.
-- Altogether we sacrifice correctness for the sake of performance, which
-- is the right trade-off for fuzzy search.
| pByte <- TA.unsafeIndex pArr currPOff
, sByte <- TA.unsafeIndex sArr currSOff
-- First byte (currPOff == pOff) should match exactly, otherwise - up to case.
, pByte == sByte || (currPOff /= pOff && pByte == toLowerAscii sByte)
= let curr = currScore * 2 + 1 in
go (totalScore + curr) curr (currPOff + 1) (currSOff + 1)
| otherwise
Comment on lines +41 to +68

Collaborator: How do we know that the new implementation produces the same scores? I would like to see a unit test here, or a property test. It's ok for the test suite to depend on the fuzzy package.

Contributor (author): Well, even the old implementation was not fully matching the fuzzy package because of a different approach to case matching. What's the best place to add unit tests?

Collaborator: The ghcide test suite.

(A property-test sketch along these lines is given after this diff.)

= go totalScore 0 currPOff (currSOff + 1)

toLowerAscii w = if (w - 65) < 26 then w .|. 0x20 else w

-- | The function to filter a list of values by fuzzy search on the text extracted from them.
filter :: (TextualMonoid s)
=> Int -- ^ Chunk size. 1000 works well.
-> Int -- ^ Max. number of results wanted
-> s -- ^ Pattern.
-> [t] -- ^ The list of values containing the text to search in.
-> s -- ^ The text to add before each match.
-> s -- ^ The text to add after each match.
-> (t -> s) -- ^ The function to extract the text from the container.
-> [Scored t] -- ^ The list of results, sorted, highest score first.
filter chunkSize maxRes pattern ts pre post extract = runST $ do
let v = V.mapMaybe id
(V.map (\t -> match pattern' t pre post extract) (V.fromList ts)
`using`
parVectorChunk chunkSize (evalTraversable forceScore))
perfectScore = score $ fromMaybe (error $ T.toString undefined pattern) $
match pattern' pattern' "" "" id
return $ partialSortByAscScore maxRes perfectScore v
filter :: Int -- ^ Chunk size. 1000 works well.
-> Int -- ^ Max. number of results wanted
-> T.Text -- ^ Pattern.
-> [t] -- ^ The list of values containing the text to search in.
-> (t -> T.Text) -- ^ The function to extract the text from the container.
-> [Scored t] -- ^ The list of results, sorted, highest score first.
filter chunkSize maxRes pattern ts extract = partialSortByAscScore maxRes perfectScore (concat vss)
where
-- Preserve case for the first character, make all others lowercase
pattern' = case T.splitCharacterPrefix pattern of
Just (c, rest) -> T.singleton c <> T.map toLower rest
_ -> pattern
pattern' = case T.uncons pattern of
Just (c, rest) -> T.cons c (T.toLower rest)
_ -> pattern
vss = map (mapMaybe (\t -> flip Scored t <$> match pattern' (extract t))) (chunkList chunkSize ts)
`using` parList (evalList rseq)
perfectScore = fromMaybe (error $ T.unpack pattern) $ match pattern' pattern'

-- | Return all elements of the list that have a fuzzy
-- match against the pattern. Runs with default settings where
@@ -99,84 +94,44 @@ filter chunkSize maxRes pattern ts pre post extract = runST $ do
-- >>> simpleFilter "vm" ["vim", "emacs", "virtual machine"]
-- ["vim","virtual machine"]
{-# INLINABLE simpleFilter #-}
simpleFilter :: (TextualMonoid s)
=> Int -- ^ Chunk size. 1000 works well.
-> Int -- ^ Max. number of results wanted
-> s -- ^ Pattern to look for.
-> [s] -- ^ List of texts to check.
-> [Scored s] -- ^ The ones that match.
simpleFilter :: Int -- ^ Chunk size. 1000 works well.
-> Int -- ^ Max. number of results wanted
-> T.Text -- ^ Pattern to look for.
-> [T.Text] -- ^ List of texts to check.
-> [Scored T.Text] -- ^ The ones that match.
simpleFilter chunk maxRes pattern xs =
filter chunk maxRes pattern xs mempty mempty id

--------------------------------------------------------------------------------

-- | Evaluation that forces the 'score' field
forceScore :: TextualMonoid s => Fuzzy t s -> Eval(Fuzzy t s)
forceScore it@Fuzzy{score} = do
score' <- rseq score
return it{score = score'}
filter chunk maxRes pattern xs id

--------------------------------------------------------------------------------

-- | Divides a vector in chunks, applies the strategy in parallel to each chunk.
parVectorChunk :: Int -> Strategy a -> Vector a -> Eval (Vector a)
parVectorChunk chunkSize st v =
V.concat <$> parTraversable (evalTraversable st) (chunkVector chunkSize v)

-- >>> chunkVector 3 (V.fromList [0..10])
-- >>> chunkVector 3 (V.fromList [0..11])
-- >>> chunkVector 3 (V.fromList [0..12])
-- [[0,1,2],[3,4,5],[6,7,8],[9,10]]
-- [[0,1,2],[3,4,5],[6,7,8],[9,10,11]]
-- [[0,1,2],[3,4,5],[6,7,8],[9,10,11],[12]]
chunkVector :: Int -> Vector a -> [Vector a]
chunkVector chunkSize v = do
let indices = chunkIndices chunkSize (0,V.length v)
[V.slice l (h-l+1) v | (l,h) <- indices]

-- >>> chunkIndices 3 (0,9)
-- >>> chunkIndices 3 (0,10)
-- >>> chunkIndices 3 (0,11)
-- [(0,2),(3,5),(6,8)]
-- [(0,2),(3,5),(6,8),(9,9)]
-- [(0,2),(3,5),(6,8),(9,10)]
chunkIndices :: Int -> (Int,Int) -> [(Int,Int)]
chunkIndices chunkSize (from,to) =
map (second pred) $
pairwise $
[from, from+chunkSize .. to-1] ++ [to]

pairwise :: [a] -> [(a,a)]
pairwise [] = []
pairwise [_] = []
pairwise (x:y:xs) = (x,y) : pairwise (y:xs)
chunkList :: Int -> [a] -> [[a]]
chunkList chunkSize = go
where
go [] = []
go xs = ys : go zs
where
(ys, zs) = splitAt chunkSize xs

-- | A stable partial sort ascending by score. O(N) best case, O(wanted*N) worst case
partialSortByAscScore :: TextualMonoid s
=> Int -- ^ Number of items needed
partialSortByAscScore
:: Int -- ^ Number of items needed
-> Int -- ^ Value of a perfect score
-> Vector (Fuzzy t s)
-> [Scored t]
partialSortByAscScore wantedCount perfectScore v = loop 0 (SortState minBound perfectScore 0) [] where
l = V.length v
loop index st@SortState{..} acc
-> [Scored t]
partialSortByAscScore wantedCount perfectScore orig = loop orig (SortState minBound perfectScore 0) [] where
loop [] st@SortState{..} acc
| foundCount == wantedCount = reverse acc
| index == l
-- ProgressCancelledException
= if bestScoreSeen < scoreWanted
then loop 0 st{scoreWanted = bestScoreSeen, bestScoreSeen = minBound} acc
| otherwise = if bestScoreSeen < scoreWanted
then loop orig st{scoreWanted = bestScoreSeen, bestScoreSeen = minBound} acc
else reverse acc
| otherwise =
case v!index of
x | score x == scoreWanted
-> loop (index+1) st{foundCount = foundCount+1} (toScored x:acc)
| score x < scoreWanted && score x > bestScoreSeen
-> loop (index+1) st{bestScoreSeen = score x} acc
| otherwise
-> loop (index+1) st acc

toScored :: TextualMonoid s => Fuzzy t s -> Scored t
toScored Fuzzy{..} = Scored score original
loop (x : xs) st@SortState{..} acc
| foundCount == wantedCount = reverse acc
| score x == scoreWanted
= loop xs st{foundCount = foundCount+1} (x:acc)
| score x < scoreWanted && score x > bestScoreSeen
= loop xs st{bestScoreSeen = score x} acc
| otherwise
= loop xs st acc

data SortState a = SortState
{ bestScoreSeen :: !Int
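Following up on the review thread above: the FuzzySearch test module added to the ghcide test suite is not reproduced on this page, but a property test in the spirit the reviewer requested could look roughly like the sketch below. This is an illustration only; the module name, the properties, and the use of plain QuickCheck are assumptions, not the code merged in this PR. It checks two invariants of the new matcher (a pattern always matches itself, which filter relies on when computing the perfect score, and filter never returns more results than requested) plus the score value from the module's own doctest.

{-# LANGUAGE OverloadedStrings #-}
-- Illustrative sketch only; not the FuzzySearch module added by this PR.
module Main (main) where

import Data.Maybe (isJust)
import qualified Data.Text as T
import Test.QuickCheck
import Test.QuickCheck.Instances ()   -- Arbitrary instance for Text
import qualified Text.Fuzzy.Parallel as Fuzzy

-- A pattern always fuzzily matches itself; 'filter' assumes this when it
-- computes the perfect score for the pattern.
prop_selfMatch :: T.Text -> Bool
prop_selfMatch t = isJust (Fuzzy.match t t)

-- 'filter' never returns more results than asked for.
prop_capped :: Positive Int -> T.Text -> [T.Text] -> Bool
prop_capped (Positive maxRes) pat xs =
  length (Fuzzy.filter 1000 maxRes pat xs id) <= maxRes

-- The value claimed by the doctest in Text.Fuzzy.Parallel.
unit_doctest :: Bool
unit_doctest = Fuzzy.match "fnt" "infinite" == Just 3

main :: IO ()
main = do
  quickCheck prop_selfMatch
  quickCheck prop_capped
  quickCheck unit_doctest

A score-parity property against Text.Fuzzy.match from the fuzzy package would also be possible for ASCII-only inputs, though, as noted in the thread, the two matchers already differ in how they handle case.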