Skip to content
This repository has been archived by the owner on Sep 30, 2024. It is now read-only.

codeintel: Consider dependency graph for LSIF data retention #22930

Merged
merged 2 commits into from
Jul 16, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 20 additions & 19 deletions enterprise/internal/codeintel/stores/dbstore/dumps.go
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,8 @@ func (s *Store) FindClosestDumps(ctx context.Context, repositoryID int, commit,

const findClosestDumpsQuery = `
-- source: enterprise/internal/codeintel/stores/dbstore/dumps.go:FindClosestDumps
WITH visible_uploads AS (%s)
WITH
visible_uploads AS (%s)
SELECT
u.id,
u.commit,
Expand Down Expand Up @@ -245,7 +246,8 @@ func (s *Store) FindClosestDumpsFromGraphFragment(ctx context.Context, repositor

const findClosestDumpsFromGraphFragmentCommitGraphQuery = `
-- source: enterprise/internal/codeintel/stores/dbstore/dumps.go:FindClosestDumpsFromGraphFragment
WITH visible_uploads AS (%s)
WITH
visible_uploads AS (%s)
SELECT
vu.upload_id,
encode(vu.commit_bytea, 'hex'),
Expand Down Expand Up @@ -379,26 +381,25 @@ func (s *Store) DeleteOverlappingDumps(ctx context.Context, repositoryID int, co

const deleteOverlappingDumpsQuery = `
-- source: enterprise/internal/codeintel/stores/dbstore/dumps.go:DeleteOverlappingDumps
WITH overlapping_dumps AS (
SELECT id
FROM lsif_uploads
WITH
candidates AS (
SELECT u.id
FROM lsif_uploads u
WHERE
state = 'completed' AND
repository_id = %s AND
commit = %s AND
root = %s AND
indexer = %s

-- Lock these rows in a deterministic order before the update
-- below. If we don't do this then we run into a pretty high
-- deadlock rate during upload processing as multiple workers
-- issue commands for the same set of records, but upload locks
-- records nondeterministically.
ORDER BY id FOR UPDATE
u.state = 'completed' AND
u.repository_id = %s AND
u.commit = %s AND
u.root = %s AND
u.indexer = %s

-- Lock these rows in a deterministic order so that we don't
-- deadlock with other processes updating the lsif_uploads table.
ORDER BY u.id FOR UPDATE
),
updated AS (
UPDATE lsif_uploads SET state = 'deleted'
WHERE id IN (SELECT id FROM overlapping_dumps)
UPDATE lsif_uploads u
SET state = 'deleted'
WHERE id IN (SELECT id FROM candidates)
RETURNING 1
)
SELECT COUNT(*) FROM updated
Expand Down
39 changes: 28 additions & 11 deletions enterprise/internal/codeintel/stores/dbstore/indexes.go
Original file line number Diff line number Diff line change
Expand Up @@ -499,17 +499,23 @@ func (s *Store) DeleteIndexesWithoutRepository(ctx context.Context, now time.Tim

const deleteIndexesWithoutRepositoryQuery = `
-- source: enterprise/internal/codeintel/stores/dbstore/indexes.go:DeleteIndexesWithoutRepository
WITH deleted_repos AS (
SELECT r.id AS id FROM repo r
WHERE
%s - r.deleted_at >= %s * interval '1 second' AND
EXISTS (SELECT 1 from lsif_indexes u WHERE u.repository_id = r.id)
WITH
candidates AS (
SELECT u.id
FROM repo r
JOIN lsif_indexes u ON u.repository_id = r.id
WHERE %s - r.deleted_at >= %s * interval '1 second'

-- Lock these rows in a deterministic order so that we don't
-- deadlock with other processes updating the lsif_indexes table.
ORDER BY u.id FOR UPDATE
),
deleted_uploads AS (
DELETE FROM lsif_indexes u WHERE repository_id IN (SELECT id FROM deleted_repos)
deleted AS (
DELETE FROM lsif_indexes u
WHERE id IN (SELECT id FROM candidates)
RETURNING u.id, u.repository_id
)
SELECT d.repository_id, COUNT(*) FROM deleted_uploads d GROUP BY d.repository_id
SELECT d.repository_id, COUNT(*) FROM deleted d GROUP BY d.repository_id
`

// DeleteOldIndexes deletes indexes older than the given age.
Expand Down Expand Up @@ -543,9 +549,20 @@ func (s *Store) DeleteOldIndexes(ctx context.Context, maxAge time.Duration, now

const deleteOldIndexesQuery = `
-- source: enterprise/internal/codeintel/stores/dbstore/indexes.go:DeleteOldIndexes
WITH deleted_indexes AS (
DELETE FROM lsif_indexes u WHERE %s - u.queued_at > (%s || ' second')::interval
WITH
candidates AS (
SELECT u.id
FROM lsif_indexes u
WHERE %s - u.queued_at > (%s || ' second')::interval

-- Lock these rows in a deterministic order so that we don't
-- deadlock with other processes updating the lsif_indexes table.
ORDER BY u.id FOR UPDATE
),
deleted AS (
DELETE FROM lsif_indexes u
WHERE id IN (SELECT id FROM candidates)
RETURNING u.id, u.repository_id
)
SELECT d.repository_id, COUNT(*) FROM deleted_indexes d GROUP BY d.repository_id
SELECT d.repository_id, COUNT(*) FROM deleted d GROUP BY d.repository_id
`
39 changes: 34 additions & 5 deletions enterprise/internal/codeintel/stores/dbstore/janitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,8 @@ func (s *Store) StaleSourcedCommits(ctx context.Context, minimumTimeSinceLastChe

const staleSourcedCommitsQuery = `
-- source: enterprise/internal/codeintel/stores/dbstore/janitor.go:StaleSourcedCommits
WITH candidates AS (%s UNION %s)
WITH
candidates AS (%s UNION %s)
SELECT r.id, r.name, c.commit
FROM candidates c
JOIN repo r ON r.id = c.repository_id
Expand Down Expand Up @@ -141,8 +142,8 @@ func (s *Store) RefreshCommitResolvability(ctx context.Context, repositoryID int

rows, err := s.Query(ctx, sqlf.Sprintf(
refreshCommitResolvabilityQuery,
assignmentExpression, repositoryID, commit,
assignmentExpression, repositoryID, commit,
repositoryID, commit, assignmentExpression,
repositoryID, commit, assignmentExpression,
))
if err != nil {
return 0, 0, err
Expand All @@ -167,8 +168,36 @@ func (s *Store) RefreshCommitResolvability(ctx context.Context, repositoryID int
const refreshCommitResolvabilityQuery = `
-- source: enterprise/internal/codeintel/stores/dbstore/janitor.go:RefreshCommitResolvability
WITH
update_uploads AS (UPDATE lsif_uploads SET %s WHERE repository_id = %s AND commit = %s RETURNING 1),
update_indexes AS (UPDATE lsif_indexes SET %s WHERE repository_id = %s AND commit = %s RETURNING 1)
candidate_uploads AS (
SELECT u.id
FROM lsif_uploads u
WHERE u.repository_id = %s AND u.commit = %s

-- Lock these rows in a deterministic order so that we don't
-- deadlock with other processes updating the lsif_uploads table.
ORDER BY u.id FOR UPDATE
),
update_uploads AS (
UPDATE lsif_uploads u
SET %s
WHERE id IN (SELECT id FROM candidate_uploads)
RETURNING 1
),
candidate_indexes AS (
SELECT u.id
FROM lsif_indexes u
WHERE u.repository_id = %s AND u.commit = %s

-- Lock these rows in a deterministic order so that we don't
-- deadlock with other processes updating the lsif_indexes table.
ORDER BY u.id FOR UPDATE
),
update_indexes AS (
UPDATE lsif_indexes u
SET %s
WHERE id IN (SELECT id FROM candidate_indexes)
RETURNING 1
)
SELECT
(SELECT COUNT(*) FROM update_uploads) AS num_uploads,
(SELECT COUNT(*) FROM update_indexes) AS num_indexes
Expand Down
98 changes: 74 additions & 24 deletions enterprise/internal/codeintel/stores/dbstore/uploads.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package dbstore
import (
"context"
"database/sql"
"sort"
"strconv"
"strings"
"time"
Expand Down Expand Up @@ -257,11 +258,21 @@ func (s *Store) DeleteUploadsStuckUploading(ctx context.Context, uploadedBefore

const deleteUploadsStuckUploadingQuery = `
-- source: enterprise/internal/codeintel/stores/dbstore/uploads.go:DeleteUploadsStuckUploading
WITH deleted AS (
UPDATE lsif_uploads
WITH
candidates AS (
SELECT u.id
FROM lsif_uploads u
WHERE u.state = 'uploading' AND u.uploaded_at < %s

-- Lock these rows in a deterministic order so that we don't
-- deadlock with other processes updating the lsif_uploads table.
ORDER BY u.id FOR UPDATE
),
deleted AS (
UPDATE lsif_uploads u
SET state = 'deleted'
WHERE state = 'uploading' AND uploaded_at < %s
RETURNING repository_id
WHERE id IN (SELECT id FROM candidates)
RETURNING u.repository_id
)
SELECT count(*) FROM deleted
`
Expand Down Expand Up @@ -604,19 +615,24 @@ func (s *Store) DeleteUploadsWithoutRepository(ctx context.Context, now time.Tim

const deleteUploadsWithoutRepositoryQuery = `
-- source: enterprise/internal/codeintel/stores/dbstore/uploads.go:DeleteUploadsWithoutRepository
WITH deleted_repos AS (
SELECT r.id AS id FROM repo r
WHERE
%s - r.deleted_at >= %s * interval '1 second' AND
EXISTS (SELECT 1 from lsif_uploads u WHERE u.repository_id = r.id)
WITH
candidates AS (
SELECT u.id
FROM repo r
JOIN lsif_uploads u ON u.repository_id = r.id
WHERE %s - r.deleted_at >= %s * interval '1 second'

-- Lock these rows in a deterministic order so that we don't
-- deadlock with other processes updating the lsif_uploads table.
ORDER BY u.id FOR UPDATE
),
deleted_uploads AS (
deleted AS (
UPDATE lsif_uploads u
SET state = 'deleted'
WHERE u.repository_id IN (SELECT id FROM deleted_repos)
WHERE u.id IN (SELECT id FROM candidates)
RETURNING u.id, u.repository_id
)
SELECT d.repository_id, COUNT(*) FROM deleted_uploads d GROUP BY d.repository_id
SELECT d.repository_id, COUNT(*) FROM deleted d GROUP BY d.repository_id
`

// HardDeleteUploadByID deletes the upload record with the given identifier.
Expand All @@ -631,6 +647,11 @@ func (s *Store) HardDeleteUploadByID(ctx context.Context, ids ...int) (err error
return nil
}

// Ensure ids are sorted so that we take row locks during the
// DELETE query in a deterministic order. This should prevent
// deadlocks with other queries that mass update lsif_uploads.
sort.Ints(ids)

var idQueries []*sqlf.Query
for _, id := range ids {
idQueries = append(idQueries, sqlf.Sprintf("%s", id))
Expand Down Expand Up @@ -660,7 +681,7 @@ func (s *Store) SoftDeleteOldUploads(ctx context.Context, maxAge time.Duration,
defer func() { err = tx.Done(err) }()

seconds := strconv.Itoa(int(maxAge / time.Second))
repositories, err := scanCounts(tx.Store.Query(ctx, sqlf.Sprintf(softDeleteOldUploadsQuery, now, seconds, now, seconds)))
repositories, err := scanCounts(tx.Store.Query(ctx, sqlf.Sprintf(softDeleteOldUploadsQuery, now, seconds)))
if err != nil {
return 0, err
}
Expand All @@ -684,19 +705,48 @@ func (s *Store) SoftDeleteOldUploads(ctx context.Context, maxAge time.Duration,

const softDeleteOldUploadsQuery = `
-- source: enterprise/internal/codeintel/stores/dbstore/uploads.go:SoftDeleteOldUploads
WITH u AS (
WITH RECURSIVE
protected_uploads AS (
(
-- Base case: select all upload records that are younger than the configured
-- retention age, as well as all upload records visible from a non-stale
-- branch or tag. These form the roots of our dependency graph traversal.

SELECT u.id FROM lsif_uploads u
WHERE %s - COALESCE(u.finished_at, u.uploaded_at) <= (%s || ' second')::interval
UNION
SELECT upload_id as id FROM lsif_uploads_visible_at_tip
) UNION (
-- Iterative case: expand the working set of protected uploads by traversing
-- the dependency graph: select all upload records that define an LSIF package
-- that is referenced by an upload already in the working set. We skip any
-- self-imports here, which may occur on some older Sourcegraph instances.

SELECT p.dump_id as id
FROM protected_uploads pu
JOIN lsif_references r ON r.dump_id = pu.id
JOIN lsif_packages p ON p.scheme = r.scheme AND p.name = r.name AND p.version = r.version AND p.dump_id != r.dump_id
)
),
candidates AS (
-- Find the inverse of protected_uploads, which contains each upload record
-- that is older than the configured retention age and is not reachable via
-- the dependencies of any upload in protected_uploads.
SELECT u.id
FROM lsif_uploads u
WHERE u.id NOT IN (SELECT id FROM protected_uploads)

-- Lock these rows in a deterministic order so that we don't
-- deadlock with other processes updating the lsif_uploads table.
ORDER BY u.id FOR UPDATE
),
updated AS (
UPDATE lsif_uploads u
SET state = 'deleted'
WHERE
(
%s - u.finished_at > (%s || ' second')::interval OR
(u.finished_at IS NULL AND %s - u.uploaded_at > (%s || ' second')::interval)
) AND
-- Anything visible from a non-stale branch or tag is protected from expiration
u.id NOT IN (SELECT uvt.upload_id FROM lsif_uploads_visible_at_tip uvt WHERE uvt.repository_id = u.repository_id)
RETURNING id, repository_id
SET state = 'deleted'
WHERE u.id IN (SELECT id FROM candidates)
RETURNING u.id, u.repository_id
)
SELECT u.repository_id, count(*) FROM u GROUP BY u.repository_id
SELECT u.repository_id, count(*) FROM updated u GROUP BY u.repository_id
`

// GetOldestCommitDate returns the oldest commit date for all uploads for the given repository. If there are no
Expand Down
Loading