diff --git a/src/libmodle/internal/genome.cpp b/src/libmodle/internal/genome.cpp index 4dcee6dc..65d722fe 100644 --- a/src/libmodle/internal/genome.cpp +++ b/src/libmodle/internal/genome.cpp @@ -346,6 +346,7 @@ std::vector> Genome::import_chromosomes( std::vector> buffer{}; for (auto&& record : chrom_sizes::Parser(path_to_chrom_sizes).parse_all()) { if (auto it = chrom_names.find(record.chrom); it != chrom_names.end()) { + assert(record.size() != 0); throw std::runtime_error(fmt::format( FMT_STRING( "Found duplicate entry for {} at line {} of file {}! First entry was at line {}"), diff --git a/src/libmodle_io/bed.cpp b/src/libmodle_io/bed.cpp index 83236f1c..933e4a6a 100644 --- a/src/libmodle_io/bed.cpp +++ b/src/libmodle_io/bed.cpp @@ -6,6 +6,7 @@ #include "modle/bed/bed.hpp" +#include #include // for StrContains #include // for StrJoin #include // for StrSplit, Splitter, SplitIterator @@ -247,7 +248,8 @@ BED::BED(BED::Dialect d) : _standard(d) {} // NOLINTNEXTLINE(readability-function-cognitive-complexity) BED::BED(std::string_view record, usize id_, BED::Dialect bed_standard, bool validate) : _id(id_) { std::vector toks; - for (std::string_view tok : absl::StrSplit(record, absl::ByAnyChar("\t "))) { + for (std::string_view tok : + absl::StrSplit(absl::StripTrailingAsciiWhitespace(record), absl::ByAnyChar("\t "))) { if (!tok.empty()) { toks.push_back(tok); } diff --git a/src/libmodle_io/chrom_sizes.cpp b/src/libmodle_io/chrom_sizes.cpp index 420a1ecb..fe98ad46 100644 --- a/src/libmodle_io/chrom_sizes.cpp +++ b/src/libmodle_io/chrom_sizes.cpp @@ -5,9 +5,11 @@ #include "modle/chrom_sizes/chrom_sizes.hpp" // for Parser #include // for flat_hash_set -#include // for StrSplit, Splitter +#include +#include // for StrSplit, Splitter #include #include // for format, FMT_COMPILE_STRING, FMT_STRING... +#include #include // for assert #include // for filesystem::path @@ -29,6 +31,7 @@ std::vector Parser::parse_all(char sep) { std::vector chrom_sizes; for (usize i = 1UL, id = 0; this->_reader.getline(buff); ++i) { + buff = absl::StripTrailingAsciiWhitespace(buff); if (buff.empty()) { continue; } @@ -36,26 +39,32 @@ std::vector Parser::parse_all(char sep) { const auto splitter = absl::StrSplit(buff, sep); const auto num_toks = std::distance(splitter.begin(), splitter.end()); try { - if (num_toks < 2) { + if (num_toks != 2) { throw std::runtime_error( - fmt::format(FMT_STRING("expected 2 or more tokens, got {}: \"{}\""), num_toks, buff)); + fmt::format(FMT_STRING("expected exactly 2 fields, found {}: \"{}\""), num_toks, buff)); } DISABLE_WARNING_PUSH DISABLE_WARNING_NULL_DEREF const auto chrom_name = utils::strip_quote_pairs(*splitter.begin()); + const auto chrom_size = *std::next(splitter.begin()); + DISABLE_WARNING_POP if (chrom_names.contains(chrom_name)) { throw std::runtime_error( fmt::format(FMT_STRING("found multiple records for chrom \"{}\""), chrom_name)); } - DISABLE_WARNING_POP + + if (chrom_size == "0") { + throw std::runtime_error( + fmt::format(FMT_STRING("chrom \"{}\" has a length of 0bp"), chrom_name)); + } chrom_sizes.emplace_back( fmt::format(FMT_COMPILE("{}\t0\t{}"), chrom_name, *std::next(splitter.begin())), id++, bed::BED::BED3); } catch (const std::runtime_error& e) { throw std::runtime_error( - fmt::format(FMT_STRING("encountered a malformed record at line {} of file \"{}\": {}.\n " + fmt::format(FMT_STRING("encountered a malformed record at line {} of file {}: {}.\n " "Line that triggered the error:\n\"{}\""), - i, this->_reader.path_string(), e.what(), buff.data())); + i, this->_reader.path(), e.what(), buff.data())); } } return chrom_sizes; diff --git a/test/units/libmodle_io/bed_parser_test.cpp b/test/units/libmodle_io/bed_parser_test.cpp index b3fdb943..d50273af 100644 --- a/test/units/libmodle_io/bed_parser_test.cpp +++ b/test/units/libmodle_io/bed_parser_test.cpp @@ -16,9 +16,16 @@ #include "modle/bed/bed.hpp" // for BED, Parser, formatter<>::format, BED::BED3 #include "modle/common/common.hpp" // for usize #include "modle/compressed_io/compressed_io.hpp" // for Reader +#include "modle/test/self_deleting_folder.hpp" // for SelfDeletingFolder + +namespace modle::test { +inline const SelfDeletingFolder testdir{true}; // NOLINT(cert-err58-cpp) +} // namespace modle::test namespace modle::bed::test { +constexpr auto &testdir = modle::test::testdir; + [[maybe_unused]] static const std::filesystem::path &data_dir() { static const std::filesystem::path data_dir{"test/data/unit_tests"}; return data_dir; @@ -62,20 +69,6 @@ static void compare_bed_records_with_file(std::vector records, const std::s } } -// NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("BED Parser simple", "[parsers][BED][io][short]") { - const auto bed_file = data_dir() / "genomic_intervals" / "intervals.bed6.xz"; - auto p = bed::Parser(bed_file); - auto records = p.parse_all(); - CHECK(records.size() == 9); - std::sort(records.begin(), records.end()); - CHECK(records[0].chrom == "chr7"); - CHECK(records[0].chrom_start == 127471196); - CHECK(records[0].chrom_end == 127472363); - CHECK(records[0].score == 0); - CHECK(records[0].strand == '+'); -} - // NOLINTNEXTLINE(readability-function-cognitive-complexity) TEST_CASE("BED: strip quotes", "[parsers][BED][io][short]") { SECTION("valid") { @@ -110,6 +103,40 @@ TEST_CASE("BED: strip quotes", "[parsers][BED][io][short]") { } } +// NOLINTNEXTLINE(readability-function-cognitive-complexity) +TEST_CASE("BED Parser CRLF", "[parsers][BED][io][short]") { + const auto bed_file = testdir() / "crlf.bed"; + + const usize num_records = 3; + + { + compressed_io::Writer w(bed_file, compressed_io::Writer::NONE); + for (usize i = 0; i < num_records; ++i) { + w.write(fmt::format(FMT_STRING("chr{}\t0\t1\r\n"), i)); + } + } + + const auto records = bed::Parser(bed_file).parse_all(); + REQUIRE(records.size() == num_records); + for (usize i = 0; i < num_records; ++i) { + CHECK(bed::BED(fmt::format(FMT_STRING("chr{}\t0\t1"), i)) == records[i]); + } +} + +// NOLINTNEXTLINE(readability-function-cognitive-complexity) +TEST_CASE("BED Parser simple", "[parsers][BED][io][short]") { + const auto bed_file = data_dir() / "genomic_intervals" / "intervals.bed6.xz"; + auto p = bed::Parser(bed_file); + auto records = p.parse_all(); + CHECK(records.size() == 9); + std::sort(records.begin(), records.end()); + CHECK(records[0].chrom == "chr7"); + CHECK(records[0].chrom_start == 127471196); + CHECK(records[0].chrom_end == 127472363); + CHECK(records[0].score == 0); + CHECK(records[0].strand == '+'); +} + // NOLINTNEXTLINE(readability-function-cognitive-complexity) TEST_CASE("BED Parser simple: BED6 -> BED3", "[parsers][BED][io][short]") { const auto bed_file = data_dir() / "genomic_intervals" / "intervals.bed6.xz";