diff --git a/src/MaRaCluster.cpp b/src/MaRaCluster.cpp index f2457df..8f05bba 100644 --- a/src/MaRaCluster.cpp +++ b/src/MaRaCluster.cpp @@ -20,9 +20,9 @@ namespace maracluster { MaRaCluster::MaRaCluster() : mode_(NONE), call_(""), percOutFN_(""), fnPrefix_("MaRaCluster"), - peakCountFN_(""), datFNFile_(""), scanInfoFN_(""), pvaluesFN_(""), - clusterFileFN_(""), pvalVecInFileFN_(""), pvalueVectorsBaseFN_(""), - overlapBatchFileFN_(""), overlapBatchIdx_(0u), + peakCountFN_(""), datFNFile_(""), scanInfoFN_(""), addSpecIds_(false), + pvaluesFN_(""), clusterFileFN_(""), pvalVecInFileFN_(""), + pvalueVectorsBaseFN_(""), overlapBatchFileFN_(""), overlapBatchIdx_(0u), spectrumBatchFileFN_(""), spectrumInFN_(""), spectrumOutFN_(""), spectrumLibraryFN_(""), matrixFN_(""), resultTreeFN_(""), skipFilterAndSort_(false), writeAll_(false), precursorTolerance_(20), @@ -106,6 +106,11 @@ bool MaRaCluster::parseOptions(int argc, char **argv) { "dat-folder", "Writable folder for converted .dat binary files. Can be used to re-use already converted spectrum files (default: ./maracluster_output/dat_files).", "path"); + cmd.defineOption("I", + "addSpecIds", + "Add column with spectrum id/title to clustering file output.", + "", + TRUE_IF_SET); cmd.defineOption("a", "prefix", "Output files will be prefixed as e.g. .clusters_p10.tsv (default: 'MaRaCluster')", @@ -221,7 +226,10 @@ bool MaRaCluster::parseOptions(int argc, char **argv) { // file input for maracluster batch and index (also for some other methods) if (cmd.optionSet("batch")) spectrumBatchFileFN_ = cmd.options["batch"]; - if (cmd.optionSet("output-folder")) outputFolder_ = cmd.options["output-folder"]; + if (cmd.optionSet("output-folder")) { + outputFolder_ = cmd.options["output-folder"]; + datFolder_ = outputFolder_ + "/dat_files"; + } if (cmd.optionSet("dat-folder")) datFolder_ = cmd.options["dat-folder"]; if (cmd.optionSet("prefix")) fnPrefix_ = cmd.options["prefix"]; @@ -229,6 +237,7 @@ bool MaRaCluster::parseOptions(int argc, char **argv) { if (cmd.optionSet("datFNfile")) datFNFile_ = cmd.options["datFNfile"]; if (cmd.optionSet("peakCountsFN")) peakCountFN_ = cmd.options["peakCountsFN"]; if (cmd.optionSet("scanInfoFN")) scanInfoFN_ = cmd.options["scanInfoFN"]; + if (cmd.optionSet("addSpecIds")) addSpecIds_ = true; // file input options for maracluster pvalue if (cmd.optionSet("specIn")) spectrumInFN_ = cmd.options["specIn"]; @@ -314,7 +323,7 @@ int MaRaCluster::createIndex() { fileList.initFromFile(spectrumBatchFileFN_); if (!Globals::fileExists(datFNFile_) || !Globals::fileExists(scanInfoFN_)) { - SpectrumFiles spectrumFiles(outputFolder_, datFolder_, chargeUncertainty_); + SpectrumFiles spectrumFiles(outputFolder_, datFolder_, chargeUncertainty_, addSpecIds_); spectrumFiles.convertToDat(fileList); spectrumFiles.splitByPrecursorMz(fileList, datFNFile_, peakCountFN_, scanInfoFN_, precursorTolerance_, precursorToleranceDa_); diff --git a/src/MaRaCluster.h b/src/MaRaCluster.h index f9fce77..3e5e5ec 100644 --- a/src/MaRaCluster.h +++ b/src/MaRaCluster.h @@ -104,6 +104,7 @@ class MaRaCluster { std::string peakCountFN_; std::string datFNFile_; std::string scanInfoFN_; + bool addSpecIds_; std::string pvaluesFN_; std::string clusterFileFN_; std::string pvalVecInFileFN_; diff --git a/src/ScanId.cpp b/src/ScanId.cpp index ef22186..c608470 100644 --- a/src/ScanId.cpp +++ b/src/ScanId.cpp @@ -33,4 +33,9 @@ std::size_t hash_value(ScanId const& si) { return hasher((si.fileIdx+1) * 1000000 + si.scannr); } +std::ostream& operator<<(std::ostream& stream, const ScanIdExtended& si) { + stream << si.scanId.fileIdx << " " << si.scanId.scannr << " " << si.title; + return stream; +} + } /* namespace maracluster */ diff --git a/src/ScanId.h b/src/ScanId.h index 9fe0fa7..dde21d1 100644 --- a/src/ScanId.h +++ b/src/ScanId.h @@ -47,6 +47,13 @@ struct ScanId { std::ostream& operator<<(std::ostream& stream, const ScanId& si); std::size_t hash_value(ScanId const& si); +struct ScanIdExtended { + ScanId scanId; + std::string title; +}; + +std::ostream& operator<<(std::ostream& stream, const ScanIdExtended& si); + } /* namespace maracluster */ #endif /* MARACLUSTER_SCANID_H_ */ diff --git a/src/Spectra.cpp b/src/Spectra.cpp index bf09f06..08c81cc 100644 --- a/src/Spectra.cpp +++ b/src/Spectra.cpp @@ -36,7 +36,8 @@ void Spectra::convertToBatchSpectra(const std::string& spectrumFN, SpectrumFiles specFiles; std::vector localSpectra; std::vector localScanInfos; - specFiles.getBatchSpectra(spectrumFN, fileList, localSpectra, localScanInfos); + std::vector localScanTitles; + specFiles.getBatchSpectra(spectrumFN, fileList, localSpectra, localScanInfos, localScanTitles); #pragma omp critical(append_spectra) { diff --git a/src/SpectrumFiles.cpp b/src/SpectrumFiles.cpp index e3dfb2e..1e1da4c 100644 --- a/src/SpectrumFiles.cpp +++ b/src/SpectrumFiles.cpp @@ -175,7 +175,8 @@ void SpectrumFiles::writeSplittedPrecursorMzFiles( void SpectrumFiles::getBatchSpectra( const std::string& spectrumFN, SpectrumFileList& fileList, std::vector& localSpectra, - std::vector& scanInfos) { + std::vector& scanInfos, + std::vector& scanTitles) { if ( !boost::filesystem::exists( spectrumFN ) ) { std::cerr << "Ignoring missing file " << spectrumFN << std::endl; return; @@ -199,6 +200,13 @@ void SpectrumFiles::getBatchSpectra( ScanInfo scanInfo; scanInfo.scanId = fileList.getScanId(spectrumFN, scannr); + if (addSpecIds_) { + ScanIdExtended scanTitle; + scanTitle.scanId = scanInfo.scanId; + scanTitle.title = SpectrumHandler::getScanTitle(s); + scanTitles.push_back(scanTitle); + } + std::vector mccs; getMassChargeCandidates(s, mccs, scanInfo.scanId); @@ -293,17 +301,22 @@ void SpectrumFiles::convertAndWriteDatFiles( const std::string& spectrumFN) { std::string datFile = SpectrumFiles::getOutputFile(spectrumFN, datFolder_, ".dat"); std::string scanInfoFile = SpectrumFiles::getOutputFile(spectrumFN, datFolder_, ".scan_info.dat"); + std::string scanTitleFile = SpectrumFiles::getOutputFile(spectrumFN, datFolder_, ".scan_titles.txt"); if (boost::filesystem::exists(datFile) && boost::filesystem::exists(scanInfoFile)) { return; } std::vector localSpectra; std::vector scanInfos; - getBatchSpectra(spectrumFN, fileList, localSpectra, scanInfos); + std::vector scanTitles; + getBatchSpectra(spectrumFN, fileList, localSpectra, scanInfos, scanTitles); bool append = false; BinaryInterface::write(localSpectra, datFile, append); BinaryInterface::write(scanInfos, scanInfoFile, append); + if (addSpecIds_) { + writeScanTitlesToFile(scanTitles, scanTitleFile); + } } void SpectrumFiles::readDatFNsFromFile(const std::string& datFNFile, @@ -331,6 +344,18 @@ void SpectrumFiles::writeDatFNsToFile(std::vector& datFNs, } } +void SpectrumFiles::writeScanTitlesToFile(std::vector& scanTitles, + const std::string& scanTitlesFile) { + std::ofstream outfile(scanTitlesFile.c_str(), std::ios_base::out); + if (outfile.is_open()) { + BOOST_FOREACH (const ScanIdExtended& scanTitle, scanTitles) { + outfile << scanTitle << "\n"; + } + } else { + std::cerr << "Could not write scan titles to file" << std::endl; + } +} + void SpectrumFiles::writePrecMzs(const std::vector& precMzs) { BOOST_FOREACH (const double precMz, precMzs) { std::cout << precMz << std::endl; diff --git a/src/SpectrumFiles.h b/src/SpectrumFiles.h index 7b6671b..aa41b31 100644 --- a/src/SpectrumFiles.h +++ b/src/SpectrumFiles.h @@ -65,18 +65,21 @@ struct ScanInfo { class SpectrumFiles { public: - SpectrumFiles() : outputFolder_(""), chargeUncertainty_(0) {} + SpectrumFiles() : outputFolder_(""), chargeUncertainty_(0), datFolder_(""), addSpecIds_(false) {} SpectrumFiles(const std::string& precMzFileFolder, const std::string& datFolder) : outputFolder_(precMzFileFolder), datFolder_(datFolder), - chargeUncertainty_(0) {} + chargeUncertainty_(0), + addSpecIds_(false) {} SpectrumFiles(const std::string& precMzFileFolder, const std::string& datFolder, - const int chargeUncertainty) : + const int chargeUncertainty, + const bool addSpecIds) : outputFolder_(precMzFileFolder), datFolder_(datFolder), - chargeUncertainty_(chargeUncertainty) {} + chargeUncertainty_(chargeUncertainty), + addSpecIds_(addSpecIds) {} void convertToDat(SpectrumFileList& fileList); void splitByPrecursorMz(SpectrumFileList& fileList, @@ -93,17 +96,19 @@ class SpectrumFiles { void getBatchSpectra(const std::string& spectrumFN, SpectrumFileList& fileList, std::vector& localSpectra, - std::vector& localScanInfos); + std::vector& localScanInfos, + std::vector& scanTitles); void getDatFNs(std::vector& limits, std::vector& datFNs); - static void writeDatFNsToFile(std::vector& datFNs, const std::string& datFNFile); - static void readDatFNsFromFile(const std::string& datFNFile, std::vector& datFNs); + static void writeScanTitlesToFile(std::vector& scanTitles, + const std::string& scanTitlesFile); + static void readPrecMzLimits(const std::string& scanInfoFN, std::map >& precMzLimits); @@ -113,6 +118,7 @@ class SpectrumFiles { std::string outputFolder_; std::string datFolder_; int chargeUncertainty_; + bool addSpecIds_; virtual void getMassChargeCandidates(pwiz::msdata::SpectrumPtr s, std::vector& mccs, ScanId scanId); diff --git a/src/SpectrumHandler.cpp b/src/SpectrumHandler.cpp index 1990544..f581d52 100644 --- a/src/SpectrumHandler.cpp +++ b/src/SpectrumHandler.cpp @@ -24,7 +24,8 @@ unsigned int SpectrumHandler::getScannr(pwiz::msdata::SpectrumPtr s) { std::string mgfScansField = s->cvParam(pwiz::cv::MS_peak_list_scans).valueAs(); return atoi(mgfScansField.c_str()); } - std::stringstream ss(s->id); + std::string scanTitle = SpectrumHandler::getScanTitle(s); + std::stringstream ss(scanTitle); while (ss.good()) { std::string tmp; ss >> tmp; @@ -56,6 +57,15 @@ void SpectrumHandler::setScannr(pwiz::msdata::SpectrumPtr s, const ScanId& scanI setScannr(s, hash_value(scanId)); } +std::string SpectrumHandler::getScanTitle(pwiz::msdata::SpectrumPtr s) { + // special check for the TITLE field in mgf files + if (s->hasCVParam(pwiz::msdata::MS_spectrum_title)) { + std::string mgfScanTitle = s->cvParam(pwiz::msdata::MS_spectrum_title).valueAs(); + return mgfScanTitle; + } + return s->id; +} + double SpectrumHandler::interpolateIntensity(MZIntensityPair p1, MZIntensityPair p2, double mz) { return ( (p2.intensity - p1.intensity) / (p2.mz - p1.mz) * (mz - p1.mz) + p1.intensity ); } diff --git a/src/SpectrumHandler.h b/src/SpectrumHandler.h index 5532cf9..48ba19a 100644 --- a/src/SpectrumHandler.h +++ b/src/SpectrumHandler.h @@ -70,6 +70,8 @@ class SpectrumHandler { static void setScannr(pwiz::msdata::SpectrumPtr s, unsigned int scanNr); static void setScannr(pwiz::msdata::SpectrumPtr s, const ScanId& scanId); + static std::string getScanTitle(pwiz::msdata::SpectrumPtr s); + static double interpolateIntensity(MZIntensityPair p1, MZIntensityPair p2, double mz); static void scaleIntensities(std::vector& mziPairs, double scaling);