Skip to content

Commit

Permalink
Add package index table and store timestamps
Browse files Browse the repository at this point in the history
Now when mirroring from an index tarball we'll only import packages
newer than this timestamp from the same repository. We store the
repository provenance in the release table, which means the timestamp is
set to only the relevant packages.
  • Loading branch information
RaoulHC committed Jun 26, 2023
1 parent 2708d82 commit dcccd48
Show file tree
Hide file tree
Showing 16 changed files with 339 additions and 56 deletions.
42 changes: 29 additions & 13 deletions app/cli/Main.hs
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@ data Command
= Provision ProvisionTarget
| CreateUser UserCreationOptions
| GenDesignSystemComponents
| ImportPackages FilePath
| ImportIndex FilePath
| ImportPackages FilePath (Maybe Text)
| ImportIndex FilePath (Maybe Text)
deriving stock (Show, Eq)

data ProvisionTarget
Expand Down Expand Up @@ -92,14 +92,30 @@ parseGenDesignSystem :: Parser Command
parseGenDesignSystem = pure GenDesignSystemComponents

parseImportPackages :: Parser Command
parseImportPackages = ImportPackages <$> argument str (metavar "PATH")
parseImportPackages =
ImportPackages
<$> argument str (metavar "PATH")
<*> optional
( strOption $
long "repository"
<> metavar "<repository>"
<> help "Which repository we're importing from"
)

parseImportIndex :: Parser Command
parseImportIndex = ImportIndex <$> argument str (metavar "PATH")
parseImportIndex =
ImportIndex
<$> argument str (metavar "PATH")
<*> optional
( strOption $
long "repository"
<> metavar "<repository>"
<> help "Which repository we're importing from"
)

runOptions :: (Reader PoolConfig :> es, DB :> es, Fail :> es, IOE :> es) => Options -> Eff es ()
runOptions (Options (Provision Categories)) = importCategories
runOptions (Options (Provision TestPackages)) = importFolderOfCabalFiles "./test/fixtures/Cabal/"
runOptions (Options (Provision TestPackages)) = importFolderOfCabalFiles "./test/fixtures/Cabal/" Nothing
runOptions (Options (CreateUser opts)) = do
let username = opts ^. #username
email = opts ^. #email
Expand All @@ -117,18 +133,18 @@ runOptions (Options (CreateUser opts)) = do
let user = if canLogin then templateUser else templateUser & #userFlags % #canLogin .~ False
insertUser user
runOptions (Options GenDesignSystemComponents) = generateComponents
runOptions (Options (ImportPackages path)) = importFolderOfCabalFiles path
runOptions (Options (ImportIndex path)) = importIndex path
runOptions (Options (ImportPackages path repository)) = importFolderOfCabalFiles path repository
runOptions (Options (ImportIndex path repository)) = importIndex path repository

importFolderOfCabalFiles :: (Reader PoolConfig :> es, DB :> es, IOE :> es) => FilePath -> Eff es ()
importFolderOfCabalFiles path = Log.withStdOutLogger $ \appLogger -> do
importFolderOfCabalFiles :: (Reader PoolConfig :> es, DB :> es, IOE :> es) => FilePath -> Maybe Text -> Eff es ()
importFolderOfCabalFiles path repository = Log.withStdOutLogger $ \appLogger -> do
user <- fromJust <$> Query.getUserByUsername "hackage-user"
importAllFilesInRelativeDirectory appLogger (user ^. #userId) path True
importAllFilesInRelativeDirectory appLogger (user ^. #userId) repository path True

importIndex :: (Reader PoolConfig :> es, DB :> es, IOE :> es) => FilePath -> Eff es ()
importIndex path = Log.withStdOutLogger $ \logger -> do
importIndex :: (Reader PoolConfig :> es, DB :> es, IOE :> es) => FilePath -> Maybe Text -> Eff es ()
importIndex path repository = Log.withStdOutLogger $ \logger -> do
user <- fromJust <$> Query.getUserByUsername "hackage-user"
importFromIndex logger (user ^. #userId) path True
importFromIndex logger (user ^. #userId) repository path True

withInfo :: Parser a -> String -> ParserInfo a
withInfo opts desc = info (helper <*> opts) $ progDesc desc
9 changes: 8 additions & 1 deletion flora.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,11 @@ library
import: common-extensions
import: common-ghc-options
extra-libraries: stdc++
cxx-options: -std=c++17 -Wall -D__EMBEDDED_SOUFFLE__

-- -Wall
cxx-options:
-std=c++17 -D__EMBEDDED_SOUFFLE__ -Wno-deprecated -fno-gnu-unique

cxx-sources: cbits/categorise.cpp
hs-source-dirs: ./src/core ./src/orphans

Expand Down Expand Up @@ -105,6 +109,7 @@ library
Flora.Model.Package.Query
Flora.Model.Package.Types
Flora.Model.Package.Update
Flora.Model.PackageIndex
Flora.Model.PersistentSession
Flora.Model.Release
Flora.Model.Release.Query
Expand Down Expand Up @@ -167,6 +172,7 @@ library
, text-display
, time
, time-effectful
, unliftio
, uuid
, vector
, vector-algorithms
Expand Down Expand Up @@ -418,6 +424,7 @@ test-suite flora-test
other-modules:
Flora.CabalSpec
Flora.CategorySpec
Flora.ImportSpec
Flora.OddJobSpec
Flora.PackageSpec
Flora.TemplateSpec
Expand Down
7 changes: 7 additions & 0 deletions migrations/20230611171412_packageindexes.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
create table if not exists package_indexes (
package_index_id uuid primary key,
repository text not null,
timestamp timestamptz
);

create unique index on package_indexes(repository);
6 changes: 6 additions & 0 deletions migrations/20230623168754_add_repo_to_release.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
alter table releases
add repository text default null;
alter table releases
add constraint repository
foreign key(repository)
references package_indexes(repository);
18 changes: 10 additions & 8 deletions src/core/Flora/Import/Package.hs
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,6 @@ import Effectful.PostgreSQL.Transact.Effect (DB, getPool, runDB)
import Effectful.Reader.Static (Reader, ask)
import Effectful.Time (Time, UTCTime)
import Effectful.Time qualified as Time
import GHC.Stack (HasCallStack)
import Log qualified
import OddJobs.Job (createJob)
import Optics.Core
Expand Down Expand Up @@ -169,7 +168,7 @@ importFile
importFile userId path =
withWorkerDbPool $ \wq ->
loadFile path
>>= uncurry (extractPackageDataFromCabal userId)
>>= uncurry (extractPackageDataFromCabal userId Nothing)
>>= persistImportOutput wq

enqueueImportJob :: (DB :> es, IOE :> es) => ImportOutput -> Eff es ()
Expand Down Expand Up @@ -208,11 +207,11 @@ loadFile path = do
descr <- loadContent path content
pure (timestamp, descr)

loadContent :: (IOE :> es, Log :> es) => String -> BS.ByteString -> Eff es GenericPackageDescription
loadContent :: Log :> es => String -> BS.ByteString -> Eff es GenericPackageDescription
loadContent = parseString parseGenericPackageDescription

parseString
:: (HasCallStack, Log :> es)
:: Log :> es
=> (BS.ByteString -> ParseResult a)
-- ^ File contents to final value parser
-> String
Expand All @@ -227,8 +226,10 @@ parseString parser name bs = do
Log.logAttention_ (display $! show err)
throw $! CabalFileCouldNotBeParsed name

loadAndExtractCabalFile :: (DB :> es, IOE :> es, Log :> es, Time :> es) => UserId -> FilePath -> Eff es ImportOutput
loadAndExtractCabalFile userId filePath = loadFile filePath >>= uncurry (extractPackageDataFromCabal userId)
loadAndExtractCabalFile :: (IOE :> es, Log :> es, Time :> es) => UserId -> FilePath -> Eff es ImportOutput
loadAndExtractCabalFile userId filePath =
loadFile filePath
>>= uncurry (extractPackageDataFromCabal userId Nothing)

-- | Persists an 'ImportOutput' to the database. An 'ImportOutput' can be obtained
-- by extracting relevant information from a Cabal file using 'extractPackageDataFromCabal'
Expand Down Expand Up @@ -273,8 +274,8 @@ withWorkerDbPool f = do
-- | Transforms a 'GenericPackageDescription' from Cabal into an 'ImportOutput'
-- that can later be inserted into the database. This function produces stable, deterministic ids,
-- so it should be possible to extract and insert a single package many times in a row.
extractPackageDataFromCabal :: (IOE :> es, Time :> es) => UserId -> UTCTime -> GenericPackageDescription -> Eff es ImportOutput
extractPackageDataFromCabal userId uploadTime genericDesc = do
extractPackageDataFromCabal :: (IOE :> es, Time :> es) => UserId -> Maybe Text -> UTCTime -> GenericPackageDescription -> Eff es ImportOutput
extractPackageDataFromCabal userId repository uploadTime genericDesc = do
let packageDesc = genericDesc.packageDescription
let flags = Vector.fromList genericDesc.genPackageFlags
let packageName = force $! packageDesc ^. #package % #pkgName % to unPackageName % to pack % to PackageName
Expand Down Expand Up @@ -328,6 +329,7 @@ extractPackageDataFromCabal userId uploadTime genericDesc = do
, readmeStatus = NotImported
, changelog = Nothing
, changelogStatus = NotImported
, repository
}

let lib = extractLibrary package release Nothing [] <$> allLibraries packageDesc
Expand Down
102 changes: 75 additions & 27 deletions src/core/Flora/Import/Package/Bulk.hs
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,14 @@
module Flora.Import.Package.Bulk (importAllFilesInDirectory, importAllFilesInRelativeDirectory, importFromIndex) where

import Codec.Archive.Tar qualified as Tar
import Codec.Archive.Tar.Entry qualified as Tar
import Codec.Compression.GZip qualified as GZip
import Control.Monad (when, (>=>))
import Control.Monad (join, when, (>=>))
import Data.ByteString qualified as BS
import Data.ByteString.Lazy qualified as BL
import Data.Function ((&))
import Data.List (isSuffixOf)
import Data.Maybe (fromMaybe, isNothing)
import Data.Text (Text)
import Data.Text qualified as Text
import Data.Time.Clock.POSIX (posixSecondsToUTCTime)
import Effectful
Expand All @@ -23,69 +24,116 @@ import Streamly.Data.Fold qualified as SFold
import Streamly.Prelude qualified as S
import System.Directory qualified as System
import System.FilePath
import UnliftIO.Exception (finally)

import Codec.Archive.Tar.Entry qualified as Tar
import Flora.Environment.Config (PoolConfig (..))
import Flora.Import.Package (enqueueImportJob, extractPackageDataFromCabal, loadContent, persistImportOutput, withWorkerDbPool)
import Flora.Model.Package.Update qualified as Update
import Flora.Model.PackageIndex (getPackageIndexTimestamp, updatePackageIndexTimestamp)
import Flora.Model.Release.Query qualified as Query
import Flora.Model.Release.Update qualified as Update
import Flora.Model.User

-- | Same as 'importAllFilesInDirectory' but accepts a relative path to the current working directory
importAllFilesInRelativeDirectory :: (Reader PoolConfig :> es, DB :> es, IOE :> es) => Logger -> UserId -> FilePath -> Bool -> Eff es ()
importAllFilesInRelativeDirectory appLogger user dir directImport = do
importAllFilesInRelativeDirectory
:: (Reader PoolConfig :> es, DB :> es, IOE :> es)
=> Logger
-> UserId
-> Maybe Text
-> FilePath
-> Bool
-> Eff es ()
importAllFilesInRelativeDirectory appLogger user repository dir directImport = do
workdir <- (</> dir) <$> liftIO System.getCurrentDirectory
importAllFilesInDirectory appLogger user workdir directImport
importAllFilesInDirectory appLogger user repository workdir directImport

importFromIndex :: (Reader PoolConfig :> es, DB :> es, IOE :> es) => Logger -> UserId -> FilePath -> Bool -> Eff es ()
importFromIndex appLogger user index directImport = do
importFromIndex
:: (Reader PoolConfig :> es, DB :> es, IOE :> es)
=> Logger
-> UserId
-> Maybe Text
-> FilePath
-> Bool
-> Eff es ()
importFromIndex appLogger user repository index directImport = do
entries <- Tar.read . GZip.decompress <$> liftIO (BL.readFile index)
case Tar.foldlEntries buildContentStream S.nil entries of
Right stream -> importFromStream appLogger user directImport stream
time <- fromMaybe (posixSecondsToUTCTime 0) . join <$> traverse getPackageIndexTimestamp repository
case Tar.foldlEntries (buildContentStream time) S.nil entries of
Right stream -> importFromStream appLogger user repository directImport stream
Left (err, _) ->
Log.runLog "flora-cli" appLogger defaultLogLevel $
Log.logAttention_ $
"Failed to get files from index: " <> Text.pack (show err)
where
buildContentStream acc entry =
buildContentStream time acc entry =
let entryPath = Tar.entryPath entry
entryTime = posixSecondsToUTCTime . fromIntegral $ Tar.entryTime entry
in Tar.entryContent entry & \case
Tar.NormalFile bs _
| ".cabal" `isSuffixOf` entryPath ->
| ".cabal" `isSuffixOf` entryPath && entryTime > time ->
(entryPath, entryTime, BL.toStrict bs) `S.cons` acc
_ -> acc

-- | Finds all cabal files in the specified directory, and inserts them into the database after extracting the relevant data
importAllFilesInDirectory :: (Reader PoolConfig :> es, DB :> es, IOE :> es) => Logger -> UserId -> FilePath -> Bool -> Eff es ()
importAllFilesInDirectory appLogger user dir directImport = do
importAllFilesInDirectory
:: (Reader PoolConfig :> es, DB :> es, IOE :> es)
=> Logger
-> UserId
-> Maybe Text
-> FilePath
-> Bool
-> Eff es ()
importAllFilesInDirectory appLogger user repository dir directImport = do
liftIO $! System.createDirectoryIfMissing True dir
liftIO . putStrLn $! "🔎 Searching cabal files in " <> dir
importFromStream appLogger user directImport $ findAllCabalFilesInDirectory dir
importFromStream appLogger user repository directImport $ findAllCabalFilesInDirectory dir

importFromStream
:: (Reader PoolConfig :> es, DB :> es, IOE :> es)
=> Logger
-> UserId
-> Maybe Text
-> Bool
-> S.AsyncT IO (String, UTCTime, BS.ByteString)
-> Eff es ()
importFromStream appLogger user directImport stream = do
importFromStream appLogger user repository directImport stream = do
pool <- getPool
poolConfig <- ask @PoolConfig
let displayCount =
flip SFold.foldlM' (return 0) $
\previousCount _ ->
let currentCount = previousCount + 1
in do
when (currentCount `mod` 400 == 0) $
displayStats currentCount
return currentCount
-- create a packageindex if it doesn't exist
maybe (pure ()) createPkgIdx repository
processedPackageCount <-
withWorkerDbPool $ \wq ->
liftIO $! S.fold displayCount $! S.fromAsync $! S.mapM (processFile wq pool poolConfig) $! stream
finally
( withWorkerDbPool $ \wq ->
liftIO $!
S.fold displayCount $!
S.fromAsync $!
S.mapM (processFile wq pool poolConfig) $!
stream
)
-- We want to refresh db and update latest timestamp even if we fell
-- over at some point
( Update.refreshLatestVersions
>> Update.refreshDependents
>> maybe (pure ()) updatePkgIdxTimestamp repository
)
displayStats processedPackageCount
Update.refreshLatestVersions >> Update.refreshDependents
where
updatePkgIdxTimestamp repository =
Query.getLatestReleaseTime (Just repository)
>>= updatePackageIndexTimestamp repository
createPkgIdx repo = do
pkgIndexTz <- getPackageIndexTimestamp repo
when (isNothing pkgIndexTz) $
updatePackageIndexTimestamp repo Nothing
displayCount =
flip SFold.foldlM' (return 0) $
\previousCount _ ->
let currentCount = previousCount + 1
in do
when (currentCount `mod` 400 == 0) $
displayStats currentCount
return currentCount
processFile wq pool poolConfig =
runEff
. runReader poolConfig
Expand All @@ -94,7 +142,7 @@ importFromStream appLogger user directImport stream = do
. Log.runLog "flora-jobs" appLogger defaultLogLevel
. ( \(path, timestamp, content) ->
loadContent path content
>>= ( extractPackageDataFromCabal user timestamp
>>= ( extractPackageDataFromCabal user repository timestamp
>=> \importedPackage ->
if directImport
then persistImportOutput wq importedPackage
Expand Down
2 changes: 1 addition & 1 deletion src/core/Flora/Model/Package/Query.hs
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ packageDependentsWithLatestVersionQuery =
FROM "packages" AS p
INNER JOIN "dependents" AS dep
ON p."package_id" = dep."dependent_id"
INNER JOIN "releases" AS r
INNER JOIN "releases" AS r
ON r."package_id" = p."package_id"
WHERE dep."namespace" = ?
AND dep."name" = ?
Expand Down
Loading

0 comments on commit dcccd48

Please sign in to comment.