Skip to content

Commit

Permalink
Added function to extract scan title, Fixes #22
Browse files Browse the repository at this point in the history
  • Loading branch information
MatthewThe committed Nov 8, 2022
1 parent 1e42cf1 commit 6027bcc
Show file tree
Hide file tree
Showing 9 changed files with 82 additions and 16 deletions.
19 changes: 14 additions & 5 deletions src/MaRaCluster.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@ namespace maracluster {

MaRaCluster::MaRaCluster() :
mode_(NONE), call_(""), percOutFN_(""), fnPrefix_("MaRaCluster"),
peakCountFN_(""), datFNFile_(""), scanInfoFN_(""), pvaluesFN_(""),
clusterFileFN_(""), pvalVecInFileFN_(""), pvalueVectorsBaseFN_(""),
overlapBatchFileFN_(""), overlapBatchIdx_(0u),
peakCountFN_(""), datFNFile_(""), scanInfoFN_(""), addSpecIds_(false),
pvaluesFN_(""), clusterFileFN_(""), pvalVecInFileFN_(""),
pvalueVectorsBaseFN_(""), overlapBatchFileFN_(""), overlapBatchIdx_(0u),
spectrumBatchFileFN_(""), spectrumInFN_(""), spectrumOutFN_(""),
spectrumLibraryFN_(""), matrixFN_(""), resultTreeFN_(""),
skipFilterAndSort_(false), writeAll_(false), precursorTolerance_(20),
Expand Down Expand Up @@ -106,6 +106,11 @@ bool MaRaCluster::parseOptions(int argc, char **argv) {
"dat-folder",
"Writable folder for converted .dat binary files. Can be used to re-use already converted spectrum files (default: ./maracluster_output/dat_files).",
"path");
cmd.defineOption("I",
"addSpecIds",
"Add column with spectrum id/title to clustering file output.",
"",
TRUE_IF_SET);
cmd.defineOption("a",
"prefix",
"Output files will be prefixed as e.g. <prefix>.clusters_p10.tsv (default: 'MaRaCluster')",
Expand Down Expand Up @@ -221,14 +226,18 @@ bool MaRaCluster::parseOptions(int argc, char **argv) {

// file input for maracluster batch and index (also for some other methods)
if (cmd.optionSet("batch")) spectrumBatchFileFN_ = cmd.options["batch"];
if (cmd.optionSet("output-folder")) outputFolder_ = cmd.options["output-folder"];
if (cmd.optionSet("output-folder")) {
outputFolder_ = cmd.options["output-folder"];
datFolder_ = outputFolder_ + "/dat_files";
}
if (cmd.optionSet("dat-folder")) datFolder_ = cmd.options["dat-folder"];
if (cmd.optionSet("prefix")) fnPrefix_ = cmd.options["prefix"];

// file output for maracluster batch and index (also for some other methods)
if (cmd.optionSet("datFNfile")) datFNFile_ = cmd.options["datFNfile"];
if (cmd.optionSet("peakCountsFN")) peakCountFN_ = cmd.options["peakCountsFN"];
if (cmd.optionSet("scanInfoFN")) scanInfoFN_ = cmd.options["scanInfoFN"];
if (cmd.optionSet("addSpecIds")) addSpecIds_ = true;

// file input options for maracluster pvalue
if (cmd.optionSet("specIn")) spectrumInFN_ = cmd.options["specIn"];
Expand Down Expand Up @@ -314,7 +323,7 @@ int MaRaCluster::createIndex() {
fileList.initFromFile(spectrumBatchFileFN_);

if (!Globals::fileExists(datFNFile_) || !Globals::fileExists(scanInfoFN_)) {
SpectrumFiles spectrumFiles(outputFolder_, datFolder_, chargeUncertainty_);
SpectrumFiles spectrumFiles(outputFolder_, datFolder_, chargeUncertainty_, addSpecIds_);
spectrumFiles.convertToDat(fileList);
spectrumFiles.splitByPrecursorMz(fileList, datFNFile_, peakCountFN_,
scanInfoFN_, precursorTolerance_, precursorToleranceDa_);
Expand Down
1 change: 1 addition & 0 deletions src/MaRaCluster.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ class MaRaCluster {
std::string peakCountFN_;
std::string datFNFile_;
std::string scanInfoFN_;
bool addSpecIds_;
std::string pvaluesFN_;
std::string clusterFileFN_;
std::string pvalVecInFileFN_;
Expand Down
5 changes: 5 additions & 0 deletions src/ScanId.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,4 +33,9 @@ std::size_t hash_value(ScanId const& si) {
return hasher((si.fileIdx+1) * 1000000 + si.scannr);
}

std::ostream& operator<<(std::ostream& stream, const ScanIdExtended& si) {
stream << si.scanId.fileIdx << " " << si.scanId.scannr << " " << si.title;
return stream;
}

} /* namespace maracluster */
7 changes: 7 additions & 0 deletions src/ScanId.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,13 @@ struct ScanId {
std::ostream& operator<<(std::ostream& stream, const ScanId& si);
std::size_t hash_value(ScanId const& si);

// A ScanId bundled with a textual title for the same spectrum. The matching
// operator<< prints it as "<fileIdx> <scannr> <title>".
struct ScanIdExtended {
// Identifier of the scan (its fileIdx and scannr fields are what get printed).
ScanId scanId;
// Title/id string of the spectrum; printed verbatim after the scan id.
std::string title;
};

std::ostream& operator<<(std::ostream& stream, const ScanIdExtended& si);

} /* namespace maracluster */

#endif /* MARACLUSTER_SCANID_H_ */
3 changes: 2 additions & 1 deletion src/Spectra.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@ void Spectra::convertToBatchSpectra(const std::string& spectrumFN,
SpectrumFiles specFiles;
std::vector<Spectrum> localSpectra;
std::vector<ScanInfo> localScanInfos;
specFiles.getBatchSpectra(spectrumFN, fileList, localSpectra, localScanInfos);
std::vector<ScanIdExtended> localScanTitles;
specFiles.getBatchSpectra(spectrumFN, fileList, localSpectra, localScanInfos, localScanTitles);

#pragma omp critical(append_spectra)
{
Expand Down
29 changes: 27 additions & 2 deletions src/SpectrumFiles.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,8 @@ void SpectrumFiles::writeSplittedPrecursorMzFiles(
void SpectrumFiles::getBatchSpectra(
const std::string& spectrumFN, SpectrumFileList& fileList,
std::vector<Spectrum>& localSpectra,
std::vector<ScanInfo>& scanInfos) {
std::vector<ScanInfo>& scanInfos,
std::vector<ScanIdExtended>& scanTitles) {
if ( !boost::filesystem::exists( spectrumFN ) ) {
std::cerr << "Ignoring missing file " << spectrumFN << std::endl;
return;
Expand All @@ -199,6 +200,13 @@ void SpectrumFiles::getBatchSpectra(
ScanInfo scanInfo;
scanInfo.scanId = fileList.getScanId(spectrumFN, scannr);

if (addSpecIds_) {
ScanIdExtended scanTitle;
scanTitle.scanId = scanInfo.scanId;
scanTitle.title = SpectrumHandler::getScanTitle(s);
scanTitles.push_back(scanTitle);
}

std::vector<MassChargeCandidate> mccs;
getMassChargeCandidates(s, mccs, scanInfo.scanId);

Expand Down Expand Up @@ -293,17 +301,22 @@ void SpectrumFiles::convertAndWriteDatFiles(
const std::string& spectrumFN) {
std::string datFile = SpectrumFiles::getOutputFile(spectrumFN, datFolder_, ".dat");
std::string scanInfoFile = SpectrumFiles::getOutputFile(spectrumFN, datFolder_, ".scan_info.dat");
std::string scanTitleFile = SpectrumFiles::getOutputFile(spectrumFN, datFolder_, ".scan_titles.txt");
if (boost::filesystem::exists(datFile) && boost::filesystem::exists(scanInfoFile)) {
return;
}

std::vector<Spectrum> localSpectra;
std::vector<ScanInfo> scanInfos;
getBatchSpectra(spectrumFN, fileList, localSpectra, scanInfos);
std::vector<ScanIdExtended> scanTitles;
getBatchSpectra(spectrumFN, fileList, localSpectra, scanInfos, scanTitles);

bool append = false;
BinaryInterface::write<Spectrum>(localSpectra, datFile, append);
BinaryInterface::write<ScanInfo>(scanInfos, scanInfoFile, append);
if (addSpecIds_) {
writeScanTitlesToFile(scanTitles, scanTitleFile);
}
}

void SpectrumFiles::readDatFNsFromFile(const std::string& datFNFile,
Expand Down Expand Up @@ -331,6 +344,18 @@ void SpectrumFiles::writeDatFNsToFile(std::vector<std::string>& datFNs,
}
}

// Writes every entry of scanTitles to scanTitlesFile, one entry per line,
// using the ScanIdExtended stream operator. If the file cannot be opened,
// a message is printed to stderr and nothing is written.
void SpectrumFiles::writeScanTitlesToFile(std::vector<ScanIdExtended>& scanTitles,
    const std::string& scanTitlesFile) {
  std::ofstream titleStream(scanTitlesFile.c_str(), std::ios_base::out);
  if (!titleStream.is_open()) {
    std::cerr << "Could not write scan titles to file" << std::endl;
    return;
  }

  BOOST_FOREACH (const ScanIdExtended& entry, scanTitles) {
    titleStream << entry << "\n";
  }
}

void SpectrumFiles::writePrecMzs(const std::vector<double>& precMzs) {
BOOST_FOREACH (const double precMz, precMzs) {
std::cout << precMz << std::endl;
Expand Down
20 changes: 13 additions & 7 deletions src/SpectrumFiles.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,18 +65,21 @@ struct ScanInfo {

class SpectrumFiles {
public:
SpectrumFiles() : outputFolder_(""), chargeUncertainty_(0) {}
// Default constructor: empty output and dat folder paths, a charge
// uncertainty of 0, and spectrum id/title collection switched off.
// The initializers follow the member declaration order (outputFolder_,
// datFolder_, chargeUncertainty_, addSpecIds_), which is the order in which
// members are actually initialized; this avoids -Wreorder warnings.
SpectrumFiles() :
    outputFolder_(""),
    datFolder_(""),
    chargeUncertainty_(0),
    addSpecIds_(false) {}
SpectrumFiles(const std::string& precMzFileFolder,
const std::string& datFolder) :
outputFolder_(precMzFileFolder),
datFolder_(datFolder),
chargeUncertainty_(0) {}
chargeUncertainty_(0),
addSpecIds_(false) {}
SpectrumFiles(const std::string& precMzFileFolder,
const std::string& datFolder,
const int chargeUncertainty) :
const int chargeUncertainty,
const bool addSpecIds) :
outputFolder_(precMzFileFolder),
datFolder_(datFolder),
chargeUncertainty_(chargeUncertainty) {}
chargeUncertainty_(chargeUncertainty),
addSpecIds_(addSpecIds) {}

void convertToDat(SpectrumFileList& fileList);
void splitByPrecursorMz(SpectrumFileList& fileList,
Expand All @@ -93,17 +96,19 @@ class SpectrumFiles {

void getBatchSpectra(const std::string& spectrumFN,
SpectrumFileList& fileList, std::vector<Spectrum>& localSpectra,
std::vector<ScanInfo>& localScanInfos);
std::vector<ScanInfo>& localScanInfos,
std::vector<ScanIdExtended>& scanTitles);

void getDatFNs(std::vector<double>& limits,
std::vector<std::string>& datFNs);

static void writeDatFNsToFile(std::vector<std::string>& datFNs,
const std::string& datFNFile);

static void readDatFNsFromFile(const std::string& datFNFile,
std::vector<std::string>& datFNs);

// Writes each ScanIdExtended in scanTitles to the text file scanTitlesFile,
// one entry per line. Prints an error to stderr if the file cannot be opened.
static void writeScanTitlesToFile(std::vector<ScanIdExtended>& scanTitles,
const std::string& scanTitlesFile);

static void readPrecMzLimits(const std::string& scanInfoFN,
std::map<ScanId, std::pair<float, float> >& precMzLimits);

Expand All @@ -113,6 +118,7 @@ class SpectrumFiles {
std::string outputFolder_;
std::string datFolder_;
int chargeUncertainty_;
bool addSpecIds_;

virtual void getMassChargeCandidates(pwiz::msdata::SpectrumPtr s,
std::vector<MassChargeCandidate>& mccs, ScanId scanId);
Expand Down
12 changes: 11 additions & 1 deletion src/SpectrumHandler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ unsigned int SpectrumHandler::getScannr(pwiz::msdata::SpectrumPtr s) {
std::string mgfScansField = s->cvParam(pwiz::cv::MS_peak_list_scans).valueAs<std::string>();
return atoi(mgfScansField.c_str());
}
std::stringstream ss(s->id);
std::string scanTitle = SpectrumHandler::getScanTitle(s);
std::stringstream ss(scanTitle);
while (ss.good()) {
std::string tmp;
ss >> tmp;
Expand Down Expand Up @@ -56,6 +57,15 @@ void SpectrumHandler::setScannr(pwiz::msdata::SpectrumPtr s, const ScanId& scanI
setScannr(s, hash_value(scanId));
}

// Returns the title of spectrum s: the value of its MS_spectrum_title cvParam
// when present (the TITLE field of mgf files), otherwise the spectrum's id.
std::string SpectrumHandler::getScanTitle(pwiz::msdata::SpectrumPtr s) {
  // without a dedicated title entry, fall back to the native spectrum id
  if (!s->hasCVParam(pwiz::msdata::MS_spectrum_title)) {
    return s->id;
  }
  return s->cvParam(pwiz::msdata::MS_spectrum_title).valueAs<std::string>();
}

// Linearly interpolates the intensity at mz on the straight line through the
// peaks p1 and p2. The caller must pass peaks with distinct mz values, since
// the slope divides by (p2.mz - p1.mz).
double SpectrumHandler::interpolateIntensity(MZIntensityPair p1, MZIntensityPair p2, double mz) {
  const double slope = (p2.intensity - p1.intensity) / (p2.mz - p1.mz);
  return slope * (mz - p1.mz) + p1.intensity;
}
Expand Down
2 changes: 2 additions & 0 deletions src/SpectrumHandler.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@ class SpectrumHandler {
static void setScannr(pwiz::msdata::SpectrumPtr s, unsigned int scanNr);
static void setScannr(pwiz::msdata::SpectrumPtr s, const ScanId& scanId);

// Returns the spectrum's MS_spectrum_title cvParam value if it has one
// (the TITLE field of mgf files), otherwise the spectrum's id.
static std::string getScanTitle(pwiz::msdata::SpectrumPtr s);

static double interpolateIntensity(MZIntensityPair p1, MZIntensityPair p2, double mz);
static void scaleIntensities(std::vector<MZIntensityPair>& mziPairs, double scaling);

Expand Down

0 comments on commit 6027bcc

Please sign in to comment.