paulsengroup · robomics · May 15, 2023 · May 12, 2023 · May 12, 2023 · May 12, 2023
diff --git a/src/libmodle/internal/genome.cpp b/src/libmodle/internal/genome.cpp
@@ -346,6 +346,7 @@ std::vector<std::shared_ptr<const Chromosome>> Genome::import_chromosomes(
   std::vector<std::shared_ptr<const Chromosome>> buffer{};
   for (auto&& record : chrom_sizes::Parser(path_to_chrom_sizes).parse_all()) {
     if (auto it = chrom_names.find(record.chrom); it != chrom_names.end()) {
+      assert(record.size() != 0);
       throw std::runtime_error(fmt::format(
           FMT_STRING(
               "Found duplicate entry for {} at line {} of file {}! First entry was at line {}"),

diff --git a/src/libmodle_io/bed.cpp b/src/libmodle_io/bed.cpp
@@ -6,6 +6,7 @@
 
 #include "modle/bed/bed.hpp"
 
+#include <absl/strings/ascii.h>
 #include <absl/strings/match.h>      // for StrContains
 #include <absl/strings/str_join.h>   // for StrJoin
 #include <absl/strings/str_split.h>  // for StrSplit, Splitter, SplitIterator
@@ -247,7 +248,8 @@ BED::BED(BED::Dialect d) : _standard(d) {}
 // NOLINTNEXTLINE(readability-function-cognitive-complexity)
 BED::BED(std::string_view record, usize id_, BED::Dialect bed_standard, bool validate) : _id(id_) {
   std::vector<std::string_view> toks;
-  for (std::string_view tok : absl::StrSplit(record, absl::ByAnyChar("\t "))) {
+  for (std::string_view tok :
+       absl::StrSplit(absl::StripTrailingAsciiWhitespace(record), absl::ByAnyChar("\t "))) {
     if (!tok.empty()) {
       toks.push_back(tok);
     }

diff --git a/src/libmodle_io/chrom_sizes.cpp b/src/libmodle_io/chrom_sizes.cpp
@@ -5,9 +5,11 @@
 #include "modle/chrom_sizes/chrom_sizes.hpp"  // for Parser
 
 #include <absl/container/flat_hash_set.h>  // for flat_hash_set
-#include <absl/strings/str_split.h>        // for StrSplit, Splitter
+#include <absl/strings/ascii.h>
+#include <absl/strings/str_split.h>  // for StrSplit, Splitter
 #include <fmt/compile.h>
 #include <fmt/format.h>  // for format, FMT_COMPILE_STRING, FMT_STRING...
+#include <fmt/std.h>
 
 #include <cassert>      // for assert
 #include <filesystem>   // for filesystem::path
@@ -29,33 +31,40 @@ std::vector<bed::BED> Parser::parse_all(char sep) {
   std::vector<bed::BED> chrom_sizes;
 
   for (usize i = 1UL, id = 0; this->_reader.getline(buff); ++i) {
+    buff = absl::StripTrailingAsciiWhitespace(buff);
     if (buff.empty()) {
       continue;
     }
 
     const auto splitter = absl::StrSplit(buff, sep);
     const auto num_toks = std::distance(splitter.begin(), splitter.end());
     try {
-      if (num_toks < 2) {
+      if (num_toks != 2) {
         throw std::runtime_error(
-            fmt::format(FMT_STRING("expected 2 or more tokens, got {}: \"{}\""), num_toks, buff));
+            fmt::format(FMT_STRING("expected exactly 2 fields, found {}: \"{}\""), num_toks, buff));
       }
       DISABLE_WARNING_PUSH
       DISABLE_WARNING_NULL_DEREF
       const auto chrom_name = utils::strip_quote_pairs(*splitter.begin());
+      const auto chrom_size = *std::next(splitter.begin());
+      DISABLE_WARNING_POP
       if (chrom_names.contains(chrom_name)) {
         throw std::runtime_error(
             fmt::format(FMT_STRING("found multiple records for chrom \"{}\""), chrom_name));
       }
-      DISABLE_WARNING_POP
+
+      if (chrom_size == "0") {
+        throw std::runtime_error(
+            fmt::format(FMT_STRING("chrom \"{}\" has a length of 0bp"), chrom_name));
+      }
       chrom_sizes.emplace_back(
           fmt::format(FMT_COMPILE("{}\t0\t{}"), chrom_name, *std::next(splitter.begin())), id++,
           bed::BED::BED3);
     } catch (const std::runtime_error& e) {
       throw std::runtime_error(
-          fmt::format(FMT_STRING("encountered a malformed record at line {} of file \"{}\": {}.\n "
+          fmt::format(FMT_STRING("encountered a malformed record at line {} of file {}: {}.\n "
                                  "Line that triggered the error:\n\"{}\""),
-                      i, this->_reader.path_string(), e.what(), buff.data()));
+                      i, this->_reader.path(), e.what(), buff.data()));
     }
   }
   return chrom_sizes;

diff --git a/test/units/libmodle_io/bed_parser_test.cpp b/test/units/libmodle_io/bed_parser_test.cpp
@@ -16,9 +16,16 @@
 #include "modle/bed/bed.hpp"                      // for BED, Parser, formatter<>::format, BED::BED3
 #include "modle/common/common.hpp"                // for usize
 #include "modle/compressed_io/compressed_io.hpp"  // for Reader
+#include "modle/test/self_deleting_folder.hpp"    // for SelfDeletingFolder
+
+namespace modle::test {  // directory object the tests in this file build output paths from
+inline const SelfDeletingFolder testdir{true};  // NOLINT(cert-err58-cpp)
+}  // namespace modle::test
 
 namespace modle::bed::test {
 
+constexpr auto &testdir = modle::test::testdir;  // alias: unqualified lookup from modle::bed::test would not find it
+
 [[maybe_unused]] static const std::filesystem::path &data_dir() {
   static const std::filesystem::path data_dir{"test/data/unit_tests"};
   return data_dir;
@@ -62,20 +69,6 @@ static void compare_bed_records_with_file(std::vector<BED> records, const std::s
   }
 }
 
-// NOLINTNEXTLINE(readability-function-cognitive-complexity)
-TEST_CASE("BED Parser simple", "[parsers][BED][io][short]") {
-  const auto bed_file = data_dir() / "genomic_intervals" / "intervals.bed6.xz";
-  auto p = bed::Parser(bed_file);
-  auto records = p.parse_all();
-  CHECK(records.size() == 9);
-  std::sort(records.begin(), records.end());
-  CHECK(records[0].chrom == "chr7");
-  CHECK(records[0].chrom_start == 127471196);
-  CHECK(records[0].chrom_end == 127472363);
-  CHECK(records[0].score == 0);
-  CHECK(records[0].strand == '+');
-}
-
 // NOLINTNEXTLINE(readability-function-cognitive-complexity)
 TEST_CASE("BED: strip quotes", "[parsers][BED][io][short]") {
   SECTION("valid") {
@@ -110,6 +103,40 @@ TEST_CASE("BED: strip quotes", "[parsers][BED][io][short]") {
   }
 }
 
+// NOLINTNEXTLINE(readability-function-cognitive-complexity)
+TEST_CASE("BED Parser CRLF", "[parsers][BED][io][short]") {  // records ending in "\r\n" must parse cleanly
+  const auto bed_file = testdir() / "crlf.bed";  // file is created by this test, not read from data_dir()
+
+  const usize num_records = 3;
+
+  {  // inner scope: the writer goes out of scope (presumably flushing/closing) before the file is read
+    compressed_io::Writer w(bed_file, compressed_io::Writer::NONE);
+    for (usize i = 0; i < num_records; ++i) {
+      w.write(fmt::format(FMT_STRING("chr{}\t0\t1\r\n"), i));  // explicit CRLF line terminator
+    }
+  }
+
+  const auto records = bed::Parser(bed_file).parse_all();
+  REQUIRE(records.size() == num_records);  // guards the records[i] indexing in the loop below
+  for (usize i = 0; i < num_records; ++i) {
+    CHECK(bed::BED(fmt::format(FMT_STRING("chr{}\t0\t1"), i)) == records[i]);  // expected record has no '\r'
+  }
+}
+
+// NOLINTNEXTLINE(readability-function-cognitive-complexity)
+TEST_CASE("BED Parser simple", "[parsers][BED][io][short]") {  // parse a BED6 fixture, inspect one record
+  const auto bed_file = data_dir() / "genomic_intervals" / "intervals.bed6.xz";
+  auto p = bed::Parser(bed_file);
+  auto records = p.parse_all();
+  CHECK(records.size() == 9);
+  std::sort(records.begin(), records.end());  // sort first so records[0] does not depend on file order
+  CHECK(records[0].chrom == "chr7");
+  CHECK(records[0].chrom_start == 127471196);
+  CHECK(records[0].chrom_end == 127472363);
+  CHECK(records[0].score == 0);
+  CHECK(records[0].strand == '+');
+}
+
 // NOLINTNEXTLINE(readability-function-cognitive-complexity)
 TEST_CASE("BED Parser simple: BED6 -> BED3", "[parsers][BED][io][short]") {
   const auto bed_file = data_dir() / "genomic_intervals" / "intervals.bed6.xz";