Skip to content

Commit

Permalink
Merge pull request #167 from r0mainK/add-uast-dataset
Browse files Browse the repository at this point in the history
Support the UAST dataset in `pga` tool
  • Loading branch information
vmarkovtsev authored Oct 11, 2019
2 parents 662cad0 + 9cb72b3 commit 6aa7e3c
Show file tree
Hide file tree
Showing 2 changed files with 127 additions and 0 deletions.
1 change: 1 addition & 0 deletions PublicGitArchive/pga/pga/pga.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ func ForEachRepository(ctx context.Context, r *csv.Reader, dataset Dataset, filt
// Datasets lists every Dataset implementation that the `get` and `list`
// commands can be applied to.
var Datasets = []Dataset{
	&SivaDataset{},
	&UastDataset{},
}

type badHeaderLengthError struct {
Expand Down
126 changes: 126 additions & 0 deletions PublicGitArchive/pga/pga/uast.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
package pga

// Column positions of the UAST CSV index. Each constant is the zero-based
// index of one column and is used to key both the header names and the
// fields of a row.
const (
	// Global, per-repository columns.
	uastHeaderURL = iota
	uastHeaderFilenames
	uastHeaderFileCount
	uastHeaderSize
	uastHeaderFileExtractionRate
	uastHeaderByteExtractionRate

	// Per-language columns.
	uastHeaderLangs
	uastHeaderLangsFileCount
	uastHeaderLangsByteCount
	uastHeaderLangsFileExtractionRate
	uastHeaderLangsByteExtractionRate
)

// uastCSVHeaders maps each column index (one of the uastHeader* constants)
// to the column name expected in the header row of the UAST CSV index.
var uastCSVHeaders = []string{
	// Global, per-repository columns.
	uastHeaderURL:                "URL",
	uastHeaderFilenames:          "PARQUET_FILENAMES",
	uastHeaderFileCount:          "FILE_COUNT",
	uastHeaderSize:               "SIZE",
	uastHeaderFileExtractionRate: "FILE_EXTRACT_RATE",
	uastHeaderByteExtractionRate: "BYTE_EXTRACT_RATE",

	// Per-language columns.
	uastHeaderLangs:                   "LANGS",
	uastHeaderLangsFileCount:          "LANGS_FILE_COUNT",
	uastHeaderLangsByteCount:          "LANGS_BYTE_COUNT",
	uastHeaderLangsFileExtractionRate: "LANGS_FILE_EXTRACT_RATE",
	uastHeaderLangsByteExtractionRate: "LANGS_BYTE_EXTRACT_RATE",
}

// UastRepository holds the data of a single row of the UAST CSV index.
//
// The field order is significant: it fixes the order of the keys in the JSON
// encoding of the struct.
type UastRepository struct {
	// URL of the repository.
	URL string `json:"url"`
	// ParquetFilenames lists the Parquet filenames.
	ParquetFilenames []string `json:"parquetFilenames"`
	// Size is the sum of the files sizes.
	Size int64 `json:"size"`

	// Stats per language. The slices below are parallel: the entry at a given
	// index describes the language at the same index in Languages.

	// Languages found in the repository.
	Languages []string `json:"langs"`
	// LanguagesFileCount is the number of files per language.
	LanguagesFileCount []int64 `json:"langsFileCount"`
	// LanguagesByteCount is the number of bytes per language.
	LanguagesByteCount []int64 `json:"langsByteCount"`
	// LanguagesFileExtractionRate is the ratio of files extracted and
	// converted to UAST in the repository, per language.
	LanguagesFileExtractionRate []float64 `json:"langsFileExtractRate"`
	// LanguagesByteExtractionRate is the ratio of bytes extracted and
	// converted to UAST in the repository, per language.
	LanguagesByteExtractionRate []float64 `json:"langsByteExtractRate"`

	// Global stats.

	// Files is the number of files in the repository.
	Files int64 `json:"fileCount"`
	// FileExtractionRate is the ratio of files extracted and converted to
	// UAST in the repository.
	FileExtractionRate float64 `json:"fileExtractRate"`
	// ByteExtractionRate is the ratio of bytes extracted and converted to
	// UAST in the repository.
	ByteExtractionRate float64 `json:"byteExtractRate"`
}

// ToCSV renders the repository as one CSV row. The returned slice has one
// entry per index column, each placed at the position given by the matching
// uastHeader* constant.
func (r *UastRepository) ToCSV() []string {
	// uastHeaderLangsByteExtractionRate is the last column, so the row holds
	// that index plus one entries.
	row := make([]string, uastHeaderLangsByteExtractionRate+1)

	// Global columns.
	row[uastHeaderURL] = r.URL
	row[uastHeaderFilenames] = formatStringList(r.ParquetFilenames)
	row[uastHeaderFileCount] = formatInt(r.Files)
	row[uastHeaderSize] = formatInt(r.Size)
	row[uastHeaderFileExtractionRate] = formatFloat(r.FileExtractionRate)
	row[uastHeaderByteExtractionRate] = formatFloat(r.ByteExtractionRate)

	// Per-language columns.
	row[uastHeaderLangs] = formatStringList(r.Languages)
	row[uastHeaderLangsFileCount] = formatIntList(r.LanguagesFileCount)
	row[uastHeaderLangsByteCount] = formatIntList(r.LanguagesByteCount)
	row[uastHeaderLangsFileExtractionRate] = formatFloatList(r.LanguagesFileExtractionRate)
	row[uastHeaderLangsByteExtractionRate] = formatFloatList(r.LanguagesByteExtractionRate)

	return row
}

// GetURL reports the URL of the repository, as stored in the URL field.
func (r *UastRepository) GetURL() string {
	return r.URL
}

// GetLanguages reports the languages found in the repository. The returned
// slice is the Languages field itself, not a copy.
func (r *UastRepository) GetLanguages() []string {
	return r.Languages
}

// GetFilenames reports the Parquet filenames recorded for the repository.
// The returned slice is the ParquetFilenames field itself, not a copy.
func (r *UastRepository) GetFilenames() []string {
	return r.ParquetFilenames
}

// UastDataset is the dataset backed by the UAST CSV index: it validates the
// index header and turns each row into a UastRepository.
type UastDataset struct{}

// Name reports the identifier of this dataset, which is always "uast".
func (UastDataset) Name() string {
	return "uast"
}

// ReadHeader validates the header row of the CSV index against
// uastCSVHeaders.
//
// It returns a *badHeaderLengthError when the number of columns differs from
// the expected one, a *badHeaderColumnError for the first column whose name
// does not match, and nil when the header is exactly the expected one.
func (dataset *UastDataset) ReadHeader(columnNames []string) error {
	want := len(uastCSVHeaders)
	if got := len(columnNames); got != want {
		// The UAST index has a fixed width, so min and max coincide.
		return &badHeaderLengthError{length: got, expectedMin: want, expectedMax: want}
	}

	// Lengths are equal at this point, so indexing uastCSVHeaders is safe.
	for idx := range columnNames {
		if name, wanted := columnNames[idx], uastCSVHeaders[idx]; name != wanted {
			return &badHeaderColumnError{expected: wanted, index: idx, col: name}
		}
	}
	return nil
}

// RepositoryFromTuple returns a UastRepository built from a slice of strings
// corresponding to its CSV representation.
//
// cols holds the fields of one index row, addressed by the uastHeader*
// constants. Every column is read through a single parser value, and the
// parser's err field is returned as the error once all columns have been
// read (presumably it records a parsing failure; confirm against the parser
// implementation). The repository is returned alongside the error, so callers
// must check err before using it.
//
// The repository is assembled in its own statement before p.err is read. The
// Go specification only orders function and method calls relative to each
// other; a plain field read in the same return statement could legally be
// evaluated before the read* calls that set it, silently dropping the error.
func (dataset *UastDataset) RepositoryFromTuple(cols []string) (repo Repository, err error) {
	p := parser{cols: cols, csvHeaders: &uastCSVHeaders}
	repo = &UastRepository{
		URL:                         p.readString(uastHeaderURL),
		ParquetFilenames:            p.readStringList(uastHeaderFilenames),
		Files:                       p.readInt(uastHeaderFileCount),
		Size:                        p.readInt(uastHeaderSize),
		FileExtractionRate:          p.readFloat(uastHeaderFileExtractionRate),
		ByteExtractionRate:          p.readFloat(uastHeaderByteExtractionRate),
		Languages:                   p.readStringList(uastHeaderLangs),
		LanguagesFileCount:          p.readIntList(uastHeaderLangsFileCount),
		LanguagesByteCount:          p.readIntList(uastHeaderLangsByteCount),
		LanguagesFileExtractionRate: p.readFloatList(uastHeaderLangsFileExtractionRate),
		LanguagesByteExtractionRate: p.readFloatList(uastHeaderLangsByteExtractionRate),
	}
	// Read only after every column has been parsed.
	return repo, p.err
}

0 comments on commit 6aa7e3c

Please sign in to comment.