Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve text file parsing #107

Merged
merged 4 commits into from
May 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/libmodle/internal/genome.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,7 @@ std::vector<std::shared_ptr<const Chromosome>> Genome::import_chromosomes(
std::vector<std::shared_ptr<const Chromosome>> buffer{};
for (auto&& record : chrom_sizes::Parser(path_to_chrom_sizes).parse_all()) {
if (auto it = chrom_names.find(record.chrom); it != chrom_names.end()) {
assert(record.size() != 0);
throw std::runtime_error(fmt::format(
FMT_STRING(
"Found duplicate entry for {} at line {} of file {}! First entry was at line {}"),
Expand Down
4 changes: 3 additions & 1 deletion src/libmodle_io/bed.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

#include "modle/bed/bed.hpp"

#include <absl/strings/ascii.h>
#include <absl/strings/match.h> // for StrContains
#include <absl/strings/str_join.h> // for StrJoin
#include <absl/strings/str_split.h> // for StrSplit, Splitter, SplitIterator
Expand Down Expand Up @@ -247,7 +248,8 @@ BED::BED(BED::Dialect d) : _standard(d) {}
// NOLINTNEXTLINE(readability-function-cognitive-complexity)
BED::BED(std::string_view record, usize id_, BED::Dialect bed_standard, bool validate) : _id(id_) {
std::vector<std::string_view> toks;
for (std::string_view tok : absl::StrSplit(record, absl::ByAnyChar("\t "))) {
for (std::string_view tok :
absl::StrSplit(absl::StripTrailingAsciiWhitespace(record), absl::ByAnyChar("\t "))) {
if (!tok.empty()) {
toks.push_back(tok);
}
Expand Down
21 changes: 15 additions & 6 deletions src/libmodle_io/chrom_sizes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,11 @@
#include "modle/chrom_sizes/chrom_sizes.hpp" // for Parser

#include <absl/container/flat_hash_set.h> // for flat_hash_set
#include <absl/strings/str_split.h> // for StrSplit, Splitter
#include <absl/strings/ascii.h>
#include <absl/strings/str_split.h> // for StrSplit, Splitter
#include <fmt/compile.h>
#include <fmt/format.h> // for format, FMT_COMPILE_STRING, FMT_STRING...
#include <fmt/std.h>

#include <cassert> // for assert
#include <filesystem> // for filesystem::path
Expand All @@ -29,33 +31,40 @@ std::vector<bed::BED> Parser::parse_all(char sep) {
std::vector<bed::BED> chrom_sizes;

for (usize i = 1UL, id = 0; this->_reader.getline(buff); ++i) {
buff = absl::StripTrailingAsciiWhitespace(buff);
if (buff.empty()) {
continue;
}

const auto splitter = absl::StrSplit(buff, sep);
const auto num_toks = std::distance(splitter.begin(), splitter.end());
try {
if (num_toks < 2) {
if (num_toks != 2) {
throw std::runtime_error(
fmt::format(FMT_STRING("expected 2 or more tokens, got {}: \"{}\""), num_toks, buff));
fmt::format(FMT_STRING("expected exactly 2 fields, found {}: \"{}\""), num_toks, buff));
}
DISABLE_WARNING_PUSH
DISABLE_WARNING_NULL_DEREF
const auto chrom_name = utils::strip_quote_pairs(*splitter.begin());
const auto chrom_size = *std::next(splitter.begin());
DISABLE_WARNING_POP
if (chrom_names.contains(chrom_name)) {
throw std::runtime_error(
fmt::format(FMT_STRING("found multiple records for chrom \"{}\""), chrom_name));
}
DISABLE_WARNING_POP

if (chrom_size == "0") {
throw std::runtime_error(
fmt::format(FMT_STRING("chrom \"{}\" has a length of 0bp"), chrom_name));
}
chrom_sizes.emplace_back(
fmt::format(FMT_COMPILE("{}\t0\t{}"), chrom_name, *std::next(splitter.begin())), id++,
bed::BED::BED3);
} catch (const std::runtime_error& e) {
throw std::runtime_error(
fmt::format(FMT_STRING("encountered a malformed record at line {} of file \"{}\": {}.\n "
fmt::format(FMT_STRING("encountered a malformed record at line {} of file {}: {}.\n "
"Line that triggered the error:\n\"{}\""),
i, this->_reader.path_string(), e.what(), buff.data()));
i, this->_reader.path(), e.what(), buff.data()));
}
}
return chrom_sizes;
Expand Down
55 changes: 41 additions & 14 deletions test/units/libmodle_io/bed_parser_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,16 @@
#include "modle/bed/bed.hpp" // for BED, Parser, formatter<>::format, BED::BED3
#include "modle/common/common.hpp" // for usize
#include "modle/compressed_io/compressed_io.hpp" // for Reader
#include "modle/test/self_deleting_folder.hpp" // for SelfDeletingFolder

namespace modle::test {
// Folder object shared by the tests in this file; the CRLF test below creates its
// scratch file underneath it. Declared `inline const` at namespace scope, so it is
// constructed during static initialization.
// NOTE(review): presumably the folder is removed on destruction and `true` selects
// that behaviour -- SelfDeletingFolder is declared elsewhere, confirm there.
// The NOLINT silences clang-tidy's cert-err58-cpp check for this declaration.
inline const SelfDeletingFolder testdir{true}; // NOLINT(cert-err58-cpp)
} // namespace modle::test

namespace modle::bed::test {

// Reference alias to modle::test::testdir so tests in this namespace can write
// `testdir()` without qualifying it.
constexpr auto &testdir = modle::test::testdir;

[[maybe_unused]] static const std::filesystem::path &data_dir() {
static const std::filesystem::path data_dir{"test/data/unit_tests"};
return data_dir;
Expand Down Expand Up @@ -62,20 +69,6 @@ static void compare_bed_records_with_file(std::vector<BED> records, const std::s
}
}

// Parses the `genomic_intervals/intervals.bed6.xz` fixture under data_dir() and checks
// the number of records returned, then sorts the records and checks the fields of the
// one that sorts first.
// NOLINTNEXTLINE(readability-function-cognitive-complexity)
TEST_CASE("BED Parser simple", "[parsers][BED][io][short]") {
const auto bed_file = data_dir() / "genomic_intervals" / "intervals.bed6.xz";
auto p = bed::Parser(bed_file);
auto records = p.parse_all();
CHECK(records.size() == 9);
// Sort so that records[0] does not depend on the order of lines in the fixture.
// The ordering is whatever BED's comparison operator defines (not shown here).
std::sort(records.begin(), records.end());
CHECK(records[0].chrom == "chr7");
CHECK(records[0].chrom_start == 127471196);
CHECK(records[0].chrom_end == 127472363);
CHECK(records[0].score == 0);
CHECK(records[0].strand == '+');
}

// NOLINTNEXTLINE(readability-function-cognitive-complexity)
TEST_CASE("BED: strip quotes", "[parsers][BED][io][short]") {
SECTION("valid") {
Expand Down Expand Up @@ -110,6 +103,40 @@ TEST_CASE("BED: strip quotes", "[parsers][BED][io][short]") {
}
}

// Round-trip test for Windows-style line endings: writes a few three-column records
// terminated by "\r\n", parses the file back, and checks that each parsed record
// compares equal to a BED built from the same text without the line terminator.
// NOLINTNEXTLINE(readability-function-cognitive-complexity)
TEST_CASE("BED Parser CRLF", "[parsers][BED][io][short]") {
const auto bed_file = testdir() / "crlf.bed";

const usize num_records = 3;

// The writer lives in its own scope so that it is destroyed before the file is read.
// NOTE(review): presumably the destructor flushes and closes the file -- confirm in
// compressed_io::Writer.
{
compressed_io::Writer w(bed_file, compressed_io::Writer::NONE);
for (usize i = 0; i < num_records; ++i) {
// One record per line: chr<i>, start 0, end 1, terminated by CRLF.
w.write(fmt::format(FMT_STRING("chr{}\t0\t1\r\n"), i));
}
}

const auto records = bed::Parser(bed_file).parse_all();
// REQUIRE (rather than CHECK) stops the test case on a size mismatch, so the
// records[i] accesses below stay in range.
REQUIRE(records.size() == num_records);
for (usize i = 0; i < num_records; ++i) {
// Expected record is built from the same fields with no trailing "\r\n".
CHECK(bed::BED(fmt::format(FMT_STRING("chr{}\t0\t1"), i)) == records[i]);
}
}

// Parses the `genomic_intervals/intervals.bed6.xz` fixture under data_dir() and checks
// the number of records returned, then sorts the records and checks the fields of the
// one that sorts first.
// NOLINTNEXTLINE(readability-function-cognitive-complexity)
TEST_CASE("BED Parser simple", "[parsers][BED][io][short]") {
const auto bed_file = data_dir() / "genomic_intervals" / "intervals.bed6.xz";
auto p = bed::Parser(bed_file);
auto records = p.parse_all();
CHECK(records.size() == 9);
// Sort so that records[0] does not depend on the order of lines in the fixture.
// The ordering is whatever BED's comparison operator defines (not shown here).
std::sort(records.begin(), records.end());
CHECK(records[0].chrom == "chr7");
CHECK(records[0].chrom_start == 127471196);
CHECK(records[0].chrom_end == 127472363);
CHECK(records[0].score == 0);
CHECK(records[0].strand == '+');
}

// NOLINTNEXTLINE(readability-function-cognitive-complexity)
TEST_CASE("BED Parser simple: BED6 -> BED3", "[parsers][BED][io][short]") {
const auto bed_file = data_dir() / "genomic_intervals" / "intervals.bed6.xz";
Expand Down