Skip to content
This repository has been archived by the owner on Jul 17, 2023. It is now read-only.

Commit

Permalink
Merge pull request #77 from Bioinformatics/feature-MANTA-1043
Browse files Browse the repository at this point in the history
MANTA-1043 Add more read statistics when 'too few read pair observation' error
  • Loading branch information
nnariai authored and GitHub Enterprise committed Aug 25, 2017
2 parents d6d8a5c + 5cc0a94 commit 0a9716a
Show file tree
Hide file tree
Showing 6 changed files with 213 additions and 14 deletions.
2 changes: 2 additions & 0 deletions ChangeLog.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
- MANTA-1043 Add more read statistics when 'too few read pair observations' exception
- MANTA-696 Improve the iterative assembler and set it to the default assembler
- MANTA-1175 Fix a bug of duplicate RG & PG in evidence bams
- MANTA-886 Verify extension of alignment input file
- MANTA-1177 Improve CRAM reference handling
- MANTA-1118 Add QC check of read length for a bam record
- MANTA-865 Fix sortVcf.py for vcf's without a header
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,11 @@ runSAS(const SASOptions& opt)
{
report_os << quantLevel[quantLevelIndex] << '\t' << rgs.fragStats.quantile(quantLevel[quantLevelIndex]) << '\n';
}
report_os << "Total sampled reads:\t" << rgs.readCounter.totalReadCount() << '\n';
report_os << "Total sampled paired reads:\t" << rgs.readCounter.totalPairedReadCount() << '\n';
report_os << "Total sampled unpaired reads:\t" << rgs.readCounter.totalUnpairedReadCount() << '\n';
report_os << "Total sampled paired reads with low MAPQ:\t" << rgs.readCounter.totalPairedLowMapqReadCount() << '\n';
report_os << "Total sampled high-confidence read pairs passing all filters:\t" << rgs.readCounter.totalHighConfidenceReadPairCount() << '\n';
report_os << '\n';

#ifdef OUTPUT_CDF
Expand Down
2 changes: 1 addition & 1 deletion src/c++/lib/htsapi/bam_record_util.hh
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ get_avg_quality(

/// select 'first' read in pair such that you
/// consistently get only one read per-pair
/// (assuming the bam file is properly formated)
/// (assuming the bam file is properly formatted)
inline
bool
isFirstRead(
Expand Down
136 changes: 136 additions & 0 deletions src/c++/lib/manta/ReadCounter.hh
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
//
// Manta - Structural Variant and Indel Caller
// Copyright (c) 2013-2017 Illumina, Inc.
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//
//

/// \file
/// \author Naoki Nariai
///

#pragma once

#include "boost/foreach.hpp"
#include "boost/serialization/access.hpp"
#include "boost/serialization/nvp.hpp"
#include "boost/serialization/split_member.hpp"
#include <iostream>

/// \brief Accumulate read statistics scanned for insert size estimation
struct ReadCounter
{
ReadCounter() :
_totalReadCount(0),
_totalPairedReadCount(0),
_totalUnpairedReadCount(0),
_totalPairedLowMapqReadCount(0),
_totalHighConfidenceReadPairCount(0)
{}

unsigned
totalReadCount() const
{
return _totalReadCount;
}

unsigned
totalPairedReadCount() const
{
return _totalPairedReadCount;
}

unsigned
totalUnpairedReadCount() const
{
return _totalUnpairedReadCount;
}

unsigned
totalPairedLowMapqReadCount() const
{
return _totalPairedLowMapqReadCount;
}

unsigned
totalHighConfidenceReadPairCount() const
{
return _totalHighConfidenceReadPairCount;
}

void
addReadCount()
{
_totalReadCount++;
}

void
addPairedReadCount()
{
_totalPairedReadCount++;
}

void
addUnpairedReadCount()
{
_totalUnpairedReadCount++;
}

void
addPairedLowMapqReadCount()
{
_totalPairedLowMapqReadCount++;
}

void
addHighConfidenceReadPairCount()
{
_totalHighConfidenceReadPairCount += 1;
}

private:
friend class boost::serialization::access;
template<class Archive>
void serialize(
Archive& ar,
const unsigned /*version*/)
{
ar& boost::serialization::make_nvp("totalReadCount", _totalReadCount);
ar& boost::serialization::make_nvp("totalPairedReadCount", _totalPairedReadCount);
ar& boost::serialization::make_nvp("totalUnpairedReadCount", _totalUnpairedReadCount);
ar& boost::serialization::make_nvp("totalPairedLowMapqReadCount", _totalPairedLowMapqReadCount);
ar& boost::serialization::make_nvp("totalHighConfidenceReadPairCount", _totalHighConfidenceReadPairCount);
}

///////////////////////////////////// data:
unsigned _totalReadCount;
unsigned _totalPairedReadCount;
unsigned _totalUnpairedReadCount;
unsigned _totalPairedLowMapqReadCount;
unsigned _totalHighConfidenceReadPairCount;
};

BOOST_CLASS_IMPLEMENTATION(ReadCounter, object_serializable)


inline std::ostream&
operator<<(std::ostream& os, const ReadCounter& rs)
{
os << "\tTotal sampled reads: " + std::to_string(rs.totalReadCount()) + "\n"
<< "\tTotal sampled paired reads: " + std::to_string(rs.totalPairedReadCount()) + "\n"
<< "\tTotal sampled paired reads passing MAPQ filter: " + std::to_string(rs.totalPairedReadCount() - rs.totalPairedLowMapqReadCount()) + "\n"
<< "\tTotal sampled high-confidence read pairs passing all filters: " + std::to_string(rs.totalHighConfidenceReadPairCount()) + "\n";
return os;
}
3 changes: 3 additions & 0 deletions src/c++/lib/manta/ReadGroupStats.hh
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@

#include "blt_util/SizeDistribution.hh"
#include "common/ReadPairOrient.hh"
#include "manta/ReadCounter.hh"


/// Read pair insert stats can be computed for each sample or read group, this
Expand All @@ -41,12 +42,14 @@ private:
{
ar& boost::serialization::make_nvp("fragmentSizeDistribution", fragStats);
ar& boost::serialization::make_nvp("pairOrientation", relOrients);
ar& boost::serialization::make_nvp("readCount", readCounter);
}

///////////////////////////// data:
public:
SizeDistribution fragStats;
ReadPairOrient relOrients;
ReadCounter readCounter;
};

BOOST_CLASS_IMPLEMENTATION(ReadGroupStats, boost::serialization::object_serializable)
79 changes: 66 additions & 13 deletions src/c++/lib/manta/ReadGroupStatsUtil.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -192,9 +192,9 @@ struct ReadGroupOrientTracker
}

const ReadPairOrient&
getConsensusOrient()
getConsensusOrient(const ReadCounter& readCounter)
{
finalize();
finalize(readCounter);
return _finalOrient;
}

Expand All @@ -212,10 +212,9 @@ struct ReadGroupOrientTracker
}

void
finalize()
finalize(const ReadCounter& readCounter)
{
if (_isFinalized) return;

bool isMaxIndex(false);
unsigned maxIndex(0);
for (unsigned i(0); i<_orientCount.size(); ++i)
Expand All @@ -241,7 +240,10 @@ struct ReadGroupOrientTracker
if (_totalOrientCount < minCount)
{
std::ostringstream oss;
oss << "ERROR: Too few observations (" << _totalOrientCount << ") to determine pair orientation for " << _rgLabel << "'\n";
oss << "ERROR: Too few high-confidence read pairs (" << _totalOrientCount << ") to determine pair orientation for " << _rgLabel << "'\n"
<< "\tAt least " << minCount << " high-confidence read pairs are required to determine pair orientation.\n"
<< readCounter << "\n";

BOOST_THROW_EXCEPTION(LogicException(oss.str()));
}

Expand All @@ -251,7 +253,10 @@ struct ReadGroupOrientTracker
const unsigned maxPercent((_orientCount[maxIndex]*100)/_totalOrientCount);
std::ostringstream oss;
oss << "ERROR: Can't determine consensus pair orientation of " << _rgLabel << ".\n"
<< "\tThe most frequent orientation is '" << _finalOrient << "' (" << maxPercent << "% of " << _totalOrientCount << " total observations)\n";
<< "\tThe most frequent orientation is '" << _finalOrient << "' (" << maxPercent << "% of " << _totalOrientCount << " total used read pairs)\n"
<< "\tThe fraction of '" << _finalOrient << "' among total high-confidence read pairs needs to be more than " << minMaxFrac << " to determine consensus pair orientation.\n"
<< readCounter << "\n";

BOOST_THROW_EXCEPTION(LogicException(oss.str()));
}
}
Expand Down Expand Up @@ -438,6 +443,9 @@ struct ReadGroupTracker
const PAIR_ORIENT::index_t ori(srd._orient);
addOrient(ori);

// we define "high-confidence" read pairs as those reads passing all filters
addHighConfidenceReadPairCount();

// filter mapped innies on the same chrom
//
// note we don't rely on the proper pair bit because this already contains an
Expand Down Expand Up @@ -481,7 +489,6 @@ struct ReadGroupTracker
<< "\n";
#endif
}

_buffer.clearBuffer();
}

Expand All @@ -502,17 +509,47 @@ struct ReadGroupTracker
return _stats;
}

void
addReadCount()
{
_stats.readCounter.addReadCount();
}

void
addPairedReadCount()
{
_stats.readCounter.addPairedReadCount();
}

void
addUnpairedReadCount() {
_stats.readCounter.addUnpairedReadCount();
}

void
addPairedLowMapqReadCount() {
_stats.readCounter.addPairedLowMapqReadCount();
}

void
addHighConfidenceReadPairCount() {
_stats.readCounter.addHighConfidenceReadPairCount();
}

void
finalize()
{
if (_isFinalized) return;

// add the remaining data in the buffer
if (_buffer.isBufferNormal()) addBufferedData();
if (_buffer.isBufferNormal())
{
addBufferedData();
}
_buffer.clearBuffer();

// finalize pair orientation:
_stats.relOrients = _orientInfo.getConsensusOrient();
_stats.relOrients = _orientInfo.getConsensusOrient(_stats.readCounter);

if (_stats.relOrients.val() != PAIR_ORIENT::Rp)
{
Expand All @@ -527,13 +564,16 @@ struct ReadGroupTracker
// finalize insert size distro:
if (! isInsertSizeConverged())
{
if (_stats.fragStats.totalObservations() < 100)
static const unsigned minObservations(100);
if (_stats.fragStats.totalObservations() < minObservations)
{
using namespace illumina::common;

std::ostringstream oss;
oss << "ERROR: Can't generate pair statistics for " << _rgLabel << "\n"
<< "\tTotal observed read pairs: " << insertSizeObservations() << "\n";
<< "\tTotal high-confidence read pairs (FR) used for insert size estimation: " << insertSizeObservations() << "\n"
<< "\tAt least " << minObservations << " high-confidence read pairs (FR) are required to estimate insert size.\n"
<< _stats.readCounter << "\n";
BOOST_THROW_EXCEPTION(LogicException(oss.str()));
}
else if (! isInsertSizeChecked())
Expand All @@ -544,7 +584,8 @@ struct ReadGroupTracker
if (! isInsertSizeConverged())
{
log_os << "WARNING: read pair statistics did not converge for " << _rgLabel << "\n"
<< "\tTotal observed read pairs: " << insertSizeObservations() << "\n";
<< "\tTotal high-confidence read pairs (FR) used for insert size estimation: " << insertSizeObservations() << "\n"
<< _stats.readCounter << "\n";
}
}

Expand Down Expand Up @@ -1003,6 +1044,19 @@ extractReadGroupStatsFromAlignmentFile(
chromHighestPos[chromIndex] = bamRead.pos();
isActiveChrom = true;

rgInfo.addReadCount();
if (bamRead.is_paired())
{
rgInfo.addPairedReadCount();
if(bamRead.map_qual()==0)
{
rgInfo.addPairedLowMapqReadCount();
}
}
else{
rgInfo.addUnpairedReadCount();
}

if (coreFilter.isFilterRead(bamRead)) continue;

#ifdef READ_GROUPS
Expand Down Expand Up @@ -1051,7 +1105,6 @@ extractReadGroupStatsFromAlignmentFile(
chromHighestPos[chromIndex] += chromSize[chromIndex]/100;

}

}
}
}
Expand Down

0 comments on commit 0a9716a

Please sign in to comment.