Skip to content

Commit

Permalink
refactor: test and refactor split file
Browse files Browse the repository at this point in the history
  • Loading branch information
phillebaba committed Jul 10, 2024
1 parent 7288cf2 commit f5ed02c
Show file tree
Hide file tree
Showing 4 changed files with 206 additions and 143 deletions.
2 changes: 1 addition & 1 deletion src/pkg/layout/package.go
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,7 @@ func (pp *PackagePaths) ArchivePackage(destinationTarball string, maxPackageSize
return fmt.Errorf("unable to split the package archive into multiple files: must be less than 1,000 files")
}
message.Notef("Package is larger than %dMB, splitting into multiple files", maxPackageSizeMB)
err := utils.SplitFile(destinationTarball, chunkSize)
err := splitFile(destinationTarball, chunkSize)

Check warning on line 246 in src/pkg/layout/package.go

View check run for this annotation

Codecov / codecov/patch

src/pkg/layout/package.go#L246

Added line #L246 was not covered by tests
if err != nil {
return fmt.Errorf("unable to split the package archive into multiple files: %w", err)
}
Expand Down
109 changes: 109 additions & 0 deletions src/pkg/layout/split.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: 2021-Present The Zarf Authors

// Package layout contains functions for interacting with Zarf's package layout on disk.
package layout

import (
"crypto/sha256"
"encoding/json"
"errors"
"fmt"
"io"
"os"

"github.com/defenseunicorns/pkg/helpers/v2"
"github.com/defenseunicorns/zarf/src/pkg/message"
"github.com/defenseunicorns/zarf/src/types"
)

// splitFile will split the file into chunks and remove the original file.
func splitFile(srcPath string, chunkSize int) error {
srcFile, err := os.Open(srcPath)
if err != nil {
return err

Check warning on line 24 in src/pkg/layout/split.go

View check run for this annotation

Codecov / codecov/patch

src/pkg/layout/split.go#L24

Added line #L24 was not covered by tests
}
defer srcFile.Close()
fi, err := srcFile.Stat()
if err != nil {
return err

Check warning on line 29 in src/pkg/layout/split.go

View check run for this annotation

Codecov / codecov/patch

src/pkg/layout/split.go#L29

Added line #L29 was not covered by tests
}

title := fmt.Sprintf("[0/%d] MB bytes written", fi.Size()/1000/1000)
progressBar := message.NewProgressBar(fi.Size(), title)
defer progressBar.Close()

hash := sha256.New()
fileCount := 0
for {
path := fmt.Sprintf("%s.part%03d", srcPath, fileCount+1)
dstFile, err := os.OpenFile(path, os.O_CREATE|os.O_RDWR, helpers.ReadAllWriteUser)
if err != nil {
return err

Check warning on line 42 in src/pkg/layout/split.go

View check run for this annotation

Codecov / codecov/patch

src/pkg/layout/split.go#L42

Added line #L42 was not covered by tests
}
defer dstFile.Close()

written, copyErr := io.CopyN(dstFile, srcFile, int64(chunkSize))
if copyErr != nil && !errors.Is(copyErr, io.EOF) {
return err

Check warning on line 48 in src/pkg/layout/split.go

View check run for this annotation

Codecov / codecov/patch

src/pkg/layout/split.go#L48

Added line #L48 was not covered by tests
}
progressBar.Add(int(written))
title := fmt.Sprintf("[%d/%d] MB bytes written", progressBar.GetCurrent()/1000/1000, fi.Size()/1000/1000)
progressBar.Updatef(title)

_, err = dstFile.Seek(0, io.SeekStart)
if err != nil {
return err

Check warning on line 56 in src/pkg/layout/split.go

View check run for this annotation

Codecov / codecov/patch

src/pkg/layout/split.go#L56

Added line #L56 was not covered by tests
}
_, err = io.Copy(hash, dstFile)
if err != nil {
return err

Check warning on line 60 in src/pkg/layout/split.go

View check run for this annotation

Codecov / codecov/patch

src/pkg/layout/split.go#L60

Added line #L60 was not covered by tests
}
err = dstFile.Close()
if err != nil {
return err

Check warning on line 64 in src/pkg/layout/split.go

View check run for this annotation

Codecov / codecov/patch

src/pkg/layout/split.go#L64

Added line #L64 was not covered by tests
}

// EOF error could be returned on 0 bytes written.
if written == 0 {
err = os.Remove(path)
if err != nil {
return err

Check warning on line 71 in src/pkg/layout/split.go

View check run for this annotation

Codecov / codecov/patch

src/pkg/layout/split.go#L71

Added line #L71 was not covered by tests
}
break
}

fileCount++
if errors.Is(copyErr, io.EOF) {
break
}
}

// Remove original file
err = srcFile.Close()
if err != nil {
return err

Check warning on line 85 in src/pkg/layout/split.go

View check run for this annotation

Codecov / codecov/patch

src/pkg/layout/split.go#L85

Added line #L85 was not covered by tests
}
err = os.Remove(srcPath)
if err != nil {
return err

Check warning on line 89 in src/pkg/layout/split.go

View check run for this annotation

Codecov / codecov/patch

src/pkg/layout/split.go#L89

Added line #L89 was not covered by tests
}

// Write header file
data := types.ZarfSplitPackageData{
Count: fileCount,
Bytes: fi.Size(),
Sha256Sum: fmt.Sprintf("%x", hash.Sum(nil)),
}
b, err := json.Marshal(data)
if err != nil {
return fmt.Errorf("unable to marshal the split package data: %w", err)

Check warning on line 100 in src/pkg/layout/split.go

View check run for this annotation

Codecov / codecov/patch

src/pkg/layout/split.go#L100

Added line #L100 was not covered by tests
}
path := fmt.Sprintf("%s.part000", srcPath)
if err := os.WriteFile(path, b, helpers.ReadAllWriteUser); err != nil {
return fmt.Errorf("unable to write the file %s: %w", path, err)

Check warning on line 104 in src/pkg/layout/split.go

View check run for this annotation

Codecov / codecov/patch

src/pkg/layout/split.go#L104

Added line #L104 was not covered by tests
}
progressBar.Successf("Package split across %d files", fileCount+1)

return nil
}
96 changes: 96 additions & 0 deletions src/pkg/layout/split_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: 2021-Present The Zarf Authors

package layout

import (
"encoding/json"
"fmt"
"os"
"path/filepath"
"testing"

"github.com/defenseunicorns/zarf/src/types"
"github.com/stretchr/testify/require"
)

// TestSplitFile verifies splitFile against a file that divides evenly into
// chunks and one that leaves a remainder: it checks the part-file names and
// sizes, that the original file is removed, and that the part000 header
// records the correct count, byte total, and sha256 sum.
func TestSplitFile(t *testing.T) {
	t.Parallel()

	tests := []struct {
		name                 string
		fileSize             int
		chunkSize            int
		expectedFileSize     int64
		expectedLastFileSize int64
		expectedFileCount    int
		expectedSha256Sum    string
	}{
		{
			name:                 "split evenly",
			fileSize:             2048,
			chunkSize:            16,
			expectedFileSize:     16,
			expectedLastFileSize: 16,
			expectedFileCount:    128,
			expectedSha256Sum:    "93ecad679eff0df493aaf5d7d615211b0f1d7a919016efb15c98f0b8efb1ba43",
		},
		{
			name:                 "split with remainder",
			fileSize:             2048,
			chunkSize:            10,
			expectedFileSize:     10,
			expectedLastFileSize: 8,
			expectedFileCount:    205,
			expectedSha256Sum:    "fe8460f4d53d3578aa37191acf55b3db7bbcb706056f4b6b02a0c70f24b0d95a",
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			t.Parallel()

			dir := t.TempDir()
			name := "random"
			p := filepath.Join(dir, name)
			// Fill the source file with a deterministic byte pattern so the
			// expected sha256 sums above stay stable across runs.
			b := make([]byte, tt.fileSize)
			for i := range b {
				b[i] = byte(tt.chunkSize)
			}
			// A single checked os.WriteFile replaces the original's
			// Create/Write/Close sequence, which re-checked a stale error and
			// ignored the Close error.
			require.NoError(t, os.WriteFile(p, b, 0o644))

			err := splitFile(p, tt.chunkSize)
			require.NoError(t, err)

			// The original file must be gone, replaced by part000..partNNN.
			_, err = os.Stat(p)
			require.ErrorIs(t, err, os.ErrNotExist)
			entries, err := os.ReadDir(dir)
			require.NoError(t, err)
			require.Len(t, entries, tt.expectedFileCount+1)
			for i, entry := range entries[1:] {
				require.Equal(t, fmt.Sprintf("%s.part%03d", name, i+1), entry.Name())

				fi, err := entry.Info()
				require.NoError(t, err)
				if i == len(entries)-2 {
					require.Equal(t, tt.expectedLastFileSize, fi.Size())
				} else {
					require.Equal(t, tt.expectedFileSize, fi.Size())
				}
			}

			// part000 is the JSON header describing the split.
			b, err = os.ReadFile(filepath.Join(dir, fmt.Sprintf("%s.part000", name)))
			require.NoError(t, err)
			var data types.ZarfSplitPackageData
			err = json.Unmarshal(b, &data)
			require.NoError(t, err)
			require.Equal(t, tt.expectedFileCount, data.Count)
			require.Equal(t, int64(tt.fileSize), data.Bytes)
			require.Equal(t, tt.expectedSha256Sum, data.Sha256Sum)
		})
	}
}
142 changes: 0 additions & 142 deletions src/pkg/utils/io.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,13 @@
package utils

import (
"crypto/sha256"
"encoding/json"
"fmt"
"io"
"os"
"path/filepath"

"github.com/defenseunicorns/pkg/helpers/v2"
"github.com/defenseunicorns/zarf/src/config"
"github.com/defenseunicorns/zarf/src/pkg/message"
"github.com/defenseunicorns/zarf/src/types"
)

const (
Expand Down Expand Up @@ -73,141 +69,3 @@ func GetFinalExecutableCommand() (string, error) {

return zarfCommand, err
}

// SplitFile will take a srcFile path and split it into files based on chunkSizeBytes.
// The first file (part000) is a metadata file containing:
// - sha256sum of the original file
// - number of bytes in the original file
// - number of files the srcFile was split into
// SplitFile will delete the original file.
func SplitFile(srcPath string, chunkSizeBytes int) (err error) {
	var fileNames []string
	hash := sha256.New()

	// Set buffer size to some multiple of 4096 KiB for modern file system cluster sizes.
	bufferSize := 16 * 1024 * 1024 // 16 MiB
	// If chunkSizeBytes is less than bufferSize, use chunkSizeBytes as bufferSize
	// for simplicity; this also guarantees a single read never spans more than
	// one chunk boundary.
	if chunkSizeBytes < bufferSize {
		bufferSize = chunkSizeBytes
	}
	buf := make([]byte, bufferSize)

	// Get the original file size for the progress bar and the header metadata.
	fi, err := os.Stat(srcPath)
	if err != nil {
		return err
	}
	fileSize := fi.Size()

	title := fmt.Sprintf("[0/%d] MB bytes written", fileSize/1000/1000)
	progressBar := message.NewProgressBar(fileSize, title)
	defer progressBar.Close()

	srcFile, err := os.Open(srcPath)
	if err != nil {
		return err
	}
	defer srcFile.Close()

	// Create file path starting from part001.
	path := fmt.Sprintf("%s.part001", srcPath)
	chunkFile, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY, helpers.ReadAllWriteUser)
	if err != nil {
		return err
	}
	fileNames = append(fileNames, path)
	// chunkFile is reassigned as new parts are opened; a single deferred
	// closure over the variable closes whichever handle is current. (The
	// original deferred inside the loop, leaking one descriptor per chunk
	// until the function returned.)
	defer func() { chunkFile.Close() }()

	// Counter for how many bytes remain before the current chunk is full.
	chunkBytesRemaining := chunkSizeBytes
	// Loop over the tarball, hashing as we go and breaking it into chunks.
	for {
		bytesRead, readErr := srcFile.Read(buf)
		if readErr != nil {
			if readErr == io.EOF {
				// At end of file, break out of loop.
				break
			}
			return readErr
		}

		// Fold the bytes we just read into the running sha256.
		hash.Write(buf[0:bytesRead])

		if chunkBytesRemaining < bytesRead {
			// This read spans a chunk boundary: finish the current chunk...
			if _, err := chunkFile.Write(buf[0:chunkBytesRemaining]); err != nil {
				return err
			}
			if err := chunkFile.Close(); err != nil {
				return err
			}

			// ...then start the next one.
			path = fmt.Sprintf("%s.part%03d", srcPath, len(fileNames)+1)
			chunkFile, err = os.OpenFile(path, os.O_CREATE|os.O_WRONLY, helpers.ReadAllWriteUser)
			if err != nil {
				return err
			}
			fileNames = append(fileNames, path)

			// Write the remainder of this read into the new chunk.
			if _, err := chunkFile.Write(buf[chunkBytesRemaining:bytesRead]); err != nil {
				return err
			}

			// BUG FIX: subtract the bytes actually carried into the new chunk
			// (bytesRead - chunkBytesRemaining). The original subtracted
			// (bufferSize - chunkBytesRemaining), which corrupts the chunk
			// accounting whenever Read returns fewer than bufferSize bytes.
			chunkBytesRemaining = chunkSizeBytes - (bytesRead - chunkBytesRemaining)
		} else {
			if _, err := chunkFile.Write(buf[0:bytesRead]); err != nil {
				return err
			}
			chunkBytesRemaining -= bytesRead
		}

		// BUG FIX: advance the progress bar by the bytes actually read; the
		// original added bufferSize every iteration, overcounting on short reads.
		progressBar.Add(bytesRead)
		title := fmt.Sprintf("[%d/%d] MB bytes written", progressBar.GetCurrent()/1000/1000, fileSize/1000/1000)
		progressBar.Updatef(title)
	}
	if err := srcFile.Close(); err != nil {
		return err
	}
	// Best-effort removal of the original, matching the prior behavior.
	_ = os.RemoveAll(srcPath)

	// Marshal the metadata into a json header.
	jsonData, err := json.Marshal(types.ZarfSplitPackageData{
		Count:     len(fileNames),
		Bytes:     fileSize,
		Sha256Sum: fmt.Sprintf("%x", hash.Sum(nil)),
	})
	if err != nil {
		return fmt.Errorf("unable to marshal the split package data: %w", err)
	}

	// Write header file.
	path = fmt.Sprintf("%s.part000", srcPath)
	if err := os.WriteFile(path, jsonData, helpers.ReadAllWriteUser); err != nil {
		return fmt.Errorf("unable to write the file %s: %w", path, err)
	}
	fileNames = append(fileNames, path)
	progressBar.Successf("Package split across %d files", len(fileNames))

	return nil
}

0 comments on commit f5ed02c

Please sign in to comment.