diff --git a/.gitignore b/.gitignore index ecaa20a..6b4a66d 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,7 @@ __pycache__ # SWIG generated files lib/*.so lib/lrst.py +lib/.nfs* src/lrst_wrap.cpp # Build and dist @@ -27,4 +28,4 @@ SampleData *.log # Testing scripts -linktoscripts \ No newline at end of file +linktoscripts diff --git a/conda/meta.yaml b/conda/meta.yaml index fa74056..0039435 100644 --- a/conda/meta.yaml +++ b/conda/meta.yaml @@ -1,13 +1,13 @@ {% set version = "1.3.1" %} -{% set sha256 = "eb4c7677d43d80d19ddbac11900ae3a14efd6f2a3c4cc8bbc08e063a81e0e1df" %} +{% set revision = "725c239b6c1f4d6daf5788871cd47f8781358e3d" %} package: name: longreadsum version: {{ version }} source: - url: https://github.com/WGLab/LongReadSum/archive/refs/tags/v{{ version }}.tar.gz - sha256: '{{ sha256 }}' + git_url: https://github.com/WGLab/LongReadSum.git + git_rev: {{ revision }} build: number: 0 diff --git a/include/kseq.h b/include/kseq.h deleted file mode 100644 index b2238d1..0000000 --- a/include/kseq.h +++ /dev/null @@ -1,235 +0,0 @@ -/* The MIT License - - Copyright (c) 2008, 2009, 2011 Attractive Chaos - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -/* Last Modified: 05MAR2012 */ - -#ifndef AC_KSEQ_H -#define AC_KSEQ_H - -#include -#include -#include - -#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r -#define KS_SEP_TAB 1 // isspace() && !' ' -#define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows) -#define KS_SEP_MAX 2 - -#define __KS_TYPE(type_t) \ - typedef struct __kstream_t { \ - unsigned char *buf; \ - int begin, end, is_eof; \ - type_t f; \ - } kstream_t; - -#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) -#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) - -#define __KS_BASIC(type_t, __bufsize) \ - static inline kstream_t *ks_init(type_t f) \ - { \ - kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ - ks->f = f; \ - ks->buf = (unsigned char*)malloc(__bufsize); \ - return ks; \ - } \ - static inline void ks_destroy(kstream_t *ks) \ - { \ - if (ks) { \ - free(ks->buf); \ - free(ks); \ - } \ - } - -#define __KS_GETC(__read, __bufsize) \ - static inline int ks_getc(kstream_t *ks) \ - { \ - if (ks->is_eof && ks->begin >= ks->end) return -1; \ - if (ks->begin >= ks->end) { \ - ks->begin = 0; \ - ks->end = __read(ks->f, ks->buf, __bufsize); \ - if (ks->end == 0) { ks->is_eof = 1; return -1;} \ - } \ - return (int)ks->buf[ks->begin++]; \ - } - -#ifndef KSTRING_T -#define KSTRING_T kstring_t -typedef struct __kstring_t { - size_t l, m; - char *s; -} kstring_t; -#endif - -#ifndef kroundup32 -#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) -#endif - -#define __KS_GETUNTIL(__read, __bufsize) \ - static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \ - { \ - int gotany = 0; \ - if (dret) *dret = 0; \ - str->l = append? str->l : 0; \ - for (;;) { \ - int i; \ - if (ks->begin >= ks->end) { \ - if (!ks->is_eof) { \ - ks->begin = 0; \ - ks->end = __read(ks->f, ks->buf, __bufsize); \ - if (ks->end == 0) { ks->is_eof = 1; break; } \ - } else break; \ - } \ - if (delimiter == KS_SEP_LINE) { \ - for (i = ks->begin; i < ks->end; ++i) \ - if (ks->buf[i] == '\n') break; \ - } else if (delimiter > KS_SEP_MAX) { \ - for (i = ks->begin; i < ks->end; ++i) \ - if (ks->buf[i] == delimiter) break; \ - } else if (delimiter == KS_SEP_SPACE) { \ - for (i = ks->begin; i < ks->end; ++i) \ - if (isspace(ks->buf[i])) break; \ - } else if (delimiter == KS_SEP_TAB) { \ - for (i = ks->begin; i < ks->end; ++i) \ - if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ - } else i = 0; /* never come to here! */ \ - if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \ - str->m = str->l + (i - ks->begin) + 1; \ - kroundup32(str->m); \ - str->s = (char*)realloc(str->s, str->m); \ - } \ - gotany = 1; \ - memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ - str->l = str->l + (i - ks->begin); \ - ks->begin = i + 1; \ - if (i < ks->end) { \ - if (dret) *dret = ks->buf[i]; \ - break; \ - } \ - } \ - if (!gotany && ks_eof(ks)) return -1; \ - if (str->s == 0) { \ - str->m = 1; \ - str->s = (char*)calloc(1, 1); \ - } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \ - str->s[str->l] = '\0'; \ - return str->l; \ - } \ - static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ - { return ks_getuntil2(ks, delimiter, str, dret, 0); } - -#define KSTREAM_INIT(type_t, __read, __bufsize) \ - __KS_TYPE(type_t) \ - __KS_BASIC(type_t, __bufsize) \ - __KS_GETC(__read, __bufsize) \ - __KS_GETUNTIL(__read, __bufsize) - -#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0) - -#define __KSEQ_BASIC(SCOPE, type_t) \ - SCOPE kseq_t *kseq_init(type_t fd) \ - { \ - kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ - s->f = ks_init(fd); \ - return s; \ - } \ - SCOPE void kseq_destroy(kseq_t *ks) \ - { \ - if (!ks) return; \ - free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \ - ks_destroy(ks->f); \ - free(ks); \ - } - -/* Return value: - >=0 length of the sequence (normal) - -1 end-of-file - -2 truncated quality string - */ -#define __KSEQ_READ(SCOPE) \ - SCOPE int kseq_read(kseq_t *seq) \ - { \ - int c; \ - kstream_t *ks = seq->f; \ - if (seq->last_char == 0) { /* then jump to the next header line */ \ - while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ - if (c == -1) return -1; /* end of file */ \ - seq->last_char = c; \ - } /* else: the first header char has been read in the previous call */ \ - seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \ - if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \ - if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \ - if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \ - seq->seq.m = 256; \ - seq->seq.s = (char*)malloc(seq->seq.m); \ - } \ - while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ - if (c == '\n') continue; /* skip empty lines */ \ - seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \ - ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \ - } \ - if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ - if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \ - seq->seq.m = seq->seq.l + 2; \ - kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \ - seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ - } \ - seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ - if (c != '+') return seq->seq.l; /* FASTA */ \ - if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \ - seq->qual.m = seq->seq.m; \ - seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ - } \ - while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ - if (c == -1) return -2; /* error: no quality string */ \ - while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \ - seq->last_char = 0; /* we have not come to the next header line */ \ - if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \ - return seq->seq.l; \ - } - -#define __KSEQ_TYPE(type_t) \ - typedef struct { \ - kstring_t name, comment, seq, qual; \ - int last_char; \ - kstream_t *f; \ - } kseq_t; - -#define KSEQ_INIT2(SCOPE, type_t, __read) \ - KSTREAM_INIT(type_t, __read, 16384) \ - __KSEQ_TYPE(type_t) \ - __KSEQ_BASIC(SCOPE, type_t) \ - __KSEQ_READ(SCOPE) - -#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read) - -#define KSEQ_DECLARE(type_t) \ - __KS_TYPE(type_t) \ - __KSEQ_TYPE(type_t) \ - extern kseq_t *kseq_init(type_t fd); \ - void kseq_destroy(kseq_t *ks); \ - int kseq_read(kseq_t *seq); - -#endif diff --git a/include/output_data.h b/include/output_data.h index 9d2069f..10c1f39 100644 --- a/include/output_data.h +++ b/include/output_data.h @@ -61,6 +61,7 @@ class Basic_Seq_Statistics void global_sum_no_gc(); void calculate_NXX_scores(); // Calculate N50, N95, N05, etc. void resize(); + void addBases(int count); Basic_Seq_Statistics(); Basic_Seq_Statistics(const Basic_Seq_Statistics &_bss); diff --git a/src/fasta_module.cpp b/src/fasta_module.cpp index 516aa10..d9336f3 100644 --- a/src/fasta_module.cpp +++ b/src/fasta_module.cpp @@ -4,84 +4,187 @@ FASTA_module.cpp: */ #include #include -#include +// #include #include #include #include #include -#include "kseq.h" + +#include + #include "fasta_module.h" -KSEQ_INIT(gzFile, gzread) // this is a macro defined in kseq.h // Helper function for saving formatted read statistics to an output text file static int qc1fasta(const char *input_file, Output_FA &py_output_fa, FILE *read_details_fp) { int exit_code = 0; - gzFile input_fp; - kseq_t *seq; - char *read_seq; - char *read_name; - int64_t read_len; double read_gc_cnt; Basic_Seq_Statistics &long_read_info = py_output_fa.long_read_info; - input_fp = gzopen(input_file, "r"); - if (!input_fp) - { - fprintf(stderr, "\nERROR! Failed to open file for reading: %s\n", input_file); + std::ifstream input_file_stream(input_file); + if (!input_file_stream.is_open()) { + fprintf(stderr, "Failed to open file for reading: %s\n", input_file); exit_code = 3; } else { - seq = kseq_init(input_fp); - while ((read_len = kseq_read(seq)) >= 0) + std::string line, sequence, sequence_data_str; + int gc_count = 0; + while (std::getline(input_file_stream, line)) { - if (read_len == 0) {continue;} - read_name = seq->name.s; - read_seq = seq->seq.s; - if ((int64_t)read_len > long_read_info.longest_read_length) + if (line.empty()) { - long_read_info.longest_read_length = read_len; + continue; } - long_read_info.total_num_reads += 1; - long_read_info.total_num_bases += read_len; - if (read_len < (int64_t) long_read_info.read_length_count.size()) { - long_read_info.read_length_count[read_len] += 1; + + if (line[0] == '>') { // Header line + // Save the last sequence's statistics + if (!sequence.empty()) + { + // Get the base counts + uint64_t base_count = 0; + uint64_t n_count = 0; + for (int i = 0; i < (int)sequence.size(); i++) + { + if (sequence[i] == 'A' || sequence[i] == 'a') + { + long_read_info.total_a_cnt += 1; + base_count += 1; + } + else if (sequence[i] == 'G' || sequence[i] == 'g') + { + long_read_info.total_g_cnt += 1; + gc_count += 1; + base_count += 1; + } + else if (sequence[i] == 'C' || sequence[i] == 'c') + { + long_read_info.total_c_cnt += 1; + gc_count += 1; + base_count += 1; + } + else if (sequence[i] == 'T' || sequence[i] == 't' || sequence[i] == 'U' || sequence[i] == 'u') + { + long_read_info.total_tu_cnt += 1; + base_count += 1; + } else { + n_count += 1; + } + } + + // Save sequence length statistics + if (base_count > (uint64_t)long_read_info.longest_read_length) + { + long_read_info.longest_read_length = base_count; + } + long_read_info.total_num_reads += 1; + + // Get the sequence length distribution + if (base_count < long_read_info.read_length_count.size()) { + long_read_info.read_length_count[(int)base_count] += 1; + } else { + long_read_info.read_length_count.resize(base_count + 1000, 0); + long_read_info.read_length_count[(int)base_count] += 1; + } + + long_read_info.total_num_bases += base_count; + long_read_info.total_n_cnt += n_count; + read_gc_cnt = 100.0 * gc_count / (double)base_count; + long_read_info.read_gc_content_count[(int)(read_gc_cnt + 0.5)] += 1; + + // Remove the newline character from the sequence data + size_t pos = sequence_data_str.find_first_of("\r\n"); + if (pos != std::string::npos) + { + std::cerr << "Warning: newline character found in sequence data: " << sequence_data_str << std::endl; + sequence_data_str = sequence_data_str.substr(0, pos); + } + fprintf(read_details_fp, "%s\t%ld\t%.2f\n", sequence_data_str.c_str(), base_count, read_gc_cnt); + sequence.clear(); + gc_count = 0; + } + + // Update the new sequence's name + sequence_data_str = line.substr(1); + + // Clear the sequence + sequence.clear(); + } else { - long_read_info.read_length_count.resize(read_len + 1000, 0); - long_read_info.read_length_count[read_len] += 1; + sequence += line; // T } - read_gc_cnt = 0; - for (int i = 0; i < read_len; i++) + } + + // Save the last sequence's statistics + if (!sequence.empty()) + { + // Get the base counts + uint64_t base_count = 0; + uint64_t n_count = 0; + for (int i = 0; i < (int)sequence.size(); i++) { - if (read_seq[i] == 'A' || read_seq[i] == 'a') + if (sequence[i] == 'A' || sequence[i] == 'a') { long_read_info.total_a_cnt += 1; + base_count += 1; } - else if (read_seq[i] == 'G' || read_seq[i] == 'g') + else if (sequence[i] == 'G' || sequence[i] == 'g') { long_read_info.total_g_cnt += 1; - read_gc_cnt += 1; + gc_count += 1; + base_count += 1; } - else if (read_seq[i] == 'C' || read_seq[i] == 'c') + else if (sequence[i] == 'C' || sequence[i] == 'c') { long_read_info.total_c_cnt += 1; - read_gc_cnt += 1; + gc_count += 1; + base_count += 1; } - else if (read_seq[i] == 'T' || read_seq[i] == 't' || read_seq[i] == 'U' || read_seq[i] == 'u') + else if (sequence[i] == 'T' || sequence[i] == 't' || sequence[i] == 'U' || sequence[i] == 'u') { long_read_info.total_tu_cnt += 1; + base_count += 1; + } else { + n_count += 1; } } - read_gc_cnt = 100.0 * read_gc_cnt / (double)read_len; + + // Save sequence length statistics + if (base_count > (uint64_t)long_read_info.longest_read_length) + { + long_read_info.longest_read_length = base_count; + } + long_read_info.total_num_reads += 1; + + // Get the sequence length distribution + if (base_count < long_read_info.read_length_count.size()) { + long_read_info.read_length_count[(int)base_count] += 1; + } else { + long_read_info.read_length_count.resize(base_count + 1000, 0); + long_read_info.read_length_count[(int)base_count] += 1; + } + + long_read_info.total_num_bases += base_count; + long_read_info.total_n_cnt += n_count; + read_gc_cnt = 100.0 * gc_count / (double)base_count; long_read_info.read_gc_content_count[(int)(read_gc_cnt + 0.5)] += 1; - fprintf(read_details_fp, "%s\t%ld\t%.2f\n", read_name, read_len, read_gc_cnt); + + // Remove the newline character from the sequence data + size_t pos = sequence_data_str.find_first_of("\r\n"); + if (pos != std::string::npos) + { + std::cerr << "Warning: newline character found in sequence data: " << sequence_data_str << std::endl; + sequence_data_str = sequence_data_str.substr(0, pos); + } + fprintf(read_details_fp, "%s\t%ld\t%.2f\n", sequence_data_str.c_str(), base_count, read_gc_cnt); + sequence.clear(); + gc_count = 0; } - kseq_destroy(seq); + // Close the input file + input_file_stream.close(); } - gzclose(input_fp); return exit_code; } diff --git a/src/fastq_module.cpp b/src/fastq_module.cpp index 2c87e86..c45a79d 100644 --- a/src/fastq_module.cpp +++ b/src/fastq_module.cpp @@ -1,160 +1,101 @@ #include #include -#include #include #include #include #include +#include #include -// "Seqtk is a fast and lightweight tool for processing sequences in the FASTA or FASTQ format": -// https://github.com/lh3/seqtk -#include "kseq.h" -#include "kseq.h" #include "fastq_module.h" -KSEQ_INIT(gzFile, gzread) // this is a macro defined in kseq.h -static int qc1fastq(const char *input_file, char fastq_base_qual_offset, Output_FQ &output_data, FILE *read_details_fp) +int qc1fastq(const char *input_file, char fastq_base_qual_offset, Output_FQ &output_data, FILE *read_details_fp) { int exit_code = 0; - gzFile input_fp; - kseq_t *seq; - char *read_seq; - char *raw_read_qual; - char *read_name; int read_len; double read_gc_cnt; double read_mean_base_qual; - Basic_Seq_Statistics &long_read_info = output_data.long_read_info; Basic_Seq_Quality_Statistics &seq_quality_info = output_data.seq_quality_info; - input_fp = gzopen(input_file, "r"); - if (!input_fp) + long_read_info.total_num_reads = ZeroDefault; // total number of long reads + long_read_info.longest_read_length = ZeroDefault; // the length of longest reads + + std::ifstream input_file_stream(input_file); + if (!input_file_stream.is_open()) { - std::cerr << "Failed to open file for reading: " << input_file << std::endl; + fprintf(stderr, "Failed to open file for reading: %s\n", input_file); exit_code = 3; } else { - seq = kseq_init(input_fp); - while ((read_len = kseq_read(seq)) >= 0) + std::string line, read_seq, read_name, raw_read_qual; + while (std::getline(input_file_stream, line) && !line.empty()) { - if (read_len == 0) {continue;} - read_name = seq->name.s; - read_seq = seq->seq.s; - raw_read_qual = seq->qual.s; - if (read_len > long_read_info.longest_read_length) + if (line[0] == '@') { - long_read_info.longest_read_length = read_len; - } - long_read_info.total_num_reads += 1; - long_read_info.total_num_bases += read_len; - if ((uint64_t)read_len < long_read_info.read_length_count.size()) { - long_read_info.read_length_count[read_len] += 1; - } else { - long_read_info.read_length_count.resize(read_len + 1000, 0); - long_read_info.read_length_count[read_len] += 1; - } - - // Store the read length - long_read_info.read_lengths.push_back(read_len); - - read_gc_cnt = 0; - read_mean_base_qual = 0; - uint64_t base_quality_value; - for (int i = 0; i < read_len; i++) - { - if (read_seq[i] == 'A' || read_seq[i] == 'a') - { - long_read_info.total_a_cnt += 1; - } - else if (read_seq[i] == 'G' || read_seq[i] == 'g') - { - long_read_info.total_g_cnt += 1; - read_gc_cnt += 1; - } - else if (read_seq[i] == 'C' || read_seq[i] == 'c') + read_name = line.substr(1); + read_name = read_name.substr(0, read_name.find_first_of(" \t")); + std::getline(input_file_stream, read_seq); + std::getline(input_file_stream, line); + std::getline(input_file_stream, raw_read_qual); + read_len = read_seq.size(); + if (read_len == 0) {continue;} + if (read_len > long_read_info.longest_read_length) { - long_read_info.total_c_cnt += 1; - read_gc_cnt += 1; + long_read_info.longest_read_length = read_len; } - else if (read_seq[i] == 'T' || read_seq[i] == 't' || read_seq[i] == 'U' || read_seq[i] == 'u') - { - long_read_info.total_tu_cnt += 1; + long_read_info.total_num_reads += 1; + long_read_info.total_num_bases += (uint64_t)read_len; + + // Update the read length counts + if ((uint64_t)read_len < long_read_info.read_length_count.size()) { + long_read_info.read_length_count[read_len] += 1; + } else { + long_read_info.read_length_count.resize(read_len + 1000, 0); + long_read_info.read_length_count[read_len] += 1; } - base_quality_value = (uint64_t)raw_read_qual[i] - (uint64_t)fastq_base_qual_offset; - seq_quality_info.base_quality_distribution[base_quality_value] += 1; - read_mean_base_qual += (double) base_quality_value; - } - read_gc_cnt = 100.0 * read_gc_cnt / (double)read_len; - long_read_info.read_gc_content_count[(int)(read_gc_cnt + 0.5)] += 1; - read_mean_base_qual /= (double) read_len; - seq_quality_info.read_average_base_quality_distribution[(uint)(read_mean_base_qual + 0.5)] += 1; - fprintf(read_details_fp, "%s\t%d\t%.2f\t%.2f\n", read_name, read_len, read_gc_cnt, read_mean_base_qual); - } - kseq_destroy(seq); - } - gzclose(input_fp); - return exit_code; -} -static uint8_t predict_base_quality_offset(Input_Para &_input_data) -{ - const char * input_file; - gzFile input_fp; - kseq_t *seq; - char *raw_read_qual; - int read_len; - char fastq_base_qual_offset = 0; - int max_num_reads; - int num_processed_reads; + // Store the read length + long_read_info.read_lengths.push_back(read_len); - if (_input_data.num_input_files == 0) - { - std::cerr << "No input files provided.\n" << std::endl; - } else { - max_num_reads = 100000; - num_processed_reads = 0; - for (size_t i = 0; i < _input_data.num_input_files; i++) - { - if (fastq_base_qual_offset){break;} - input_file = _input_data.input_files[i].c_str(); - input_fp = gzopen(input_file, "r"); - if (!input_fp) - { - std::cerr << "Failed to open file for reading: %s" << input_file << std::endl; - } else { - seq = kseq_init(input_fp); - while ((read_len = kseq_read(seq)) >= 0) + // Process base and quality information + read_gc_cnt = 0; + read_mean_base_qual = 0; + uint64_t base_quality_value; + for (int i = 0; i < read_len; i++) { - if (fastq_base_qual_offset) {break;} - if (read_len == 0) {continue;} - raw_read_qual = seq->qual.s; - for (int j = 0; j < read_len; j++) + if (read_seq[i] == 'A' || read_seq[i] == 'a') { - if (raw_read_qual[j] < 64) { - fastq_base_qual_offset = 33; - break; - } + long_read_info.total_a_cnt += 1; } - num_processed_reads++; - if (num_processed_reads > max_num_reads){ - fastq_base_qual_offset = 64; - break; + else if (read_seq[i] == 'G' || read_seq[i] == 'g') + { + long_read_info.total_g_cnt += 1; + read_gc_cnt += 1; } + else if (read_seq[i] == 'C' || read_seq[i] == 'c') + { + long_read_info.total_c_cnt += 1; + read_gc_cnt += 1; + } + else if (read_seq[i] == 'T' || read_seq[i] == 't' || read_seq[i] == 'U' || read_seq[i] == 'u') + { + long_read_info.total_tu_cnt += 1; + } + base_quality_value = (uint64_t)raw_read_qual[i] - (uint64_t)fastq_base_qual_offset; + seq_quality_info.base_quality_distribution[base_quality_value] += 1; + read_mean_base_qual += (double) base_quality_value; } - kseq_destroy(seq); + read_gc_cnt = 100.0 * read_gc_cnt / (double)read_len; + long_read_info.read_gc_content_count[(int)(read_gc_cnt + 0.5)] += 1; + read_mean_base_qual /= (double) read_len; + seq_quality_info.read_average_base_quality_distribution[(uint)(read_mean_base_qual + 0.5)] += 1; + fprintf(read_details_fp, "%s\t%d\t%.2f\t%.2f\n", read_name.c_str(), read_len, read_gc_cnt, read_mean_base_qual); } - gzclose(input_fp); } + input_file_stream.close(); } - - if (fastq_base_qual_offset == 0){ - fastq_base_qual_offset = 64; - } -// std::cout << "NOTICE: predicted FASTQ base quality offset is " << fastq_base_qual_offset << std::endl; - - return fastq_base_qual_offset; + return exit_code; } int qc_fastq_files(Input_Para &_input_data, Output_FQ &output_data) @@ -187,12 +128,8 @@ int qc_fastq_files(Input_Para &_input_data, Output_FQ &output_data) output_data.long_read_info.total_n_cnt = ZeroDefault; // N content output_data.long_read_info.gc_cnt = ZeroDefault; // GC ratio - //int64_t *read_length_count; // statistics of read length: each position is the number of reads with the length of the index; - output_data.long_read_info.read_gc_content_count.clear(); output_data.long_read_info.read_length_count.clear(); - //output_data.seq_quality_info.base_quality_distribution.clear(); - //output_data.seq_quality_info.read_average_base_quality_distribution.clear(); output_data.long_read_info.read_length_count.resize(MAX_READ_LENGTH + 1, 0); // read_length_count[x] is the number of reads that length is equal to x. MAX_READ_LENGTH is a initial max value, the vector can expand if thre are reads longer than MAX_READ_LENGTH. @@ -207,15 +144,15 @@ int qc_fastq_files(Input_Para &_input_data, Output_FQ &output_data) // base_quality_distribution[x] means number of bases that quality = x. output_data.seq_quality_info.read_average_base_quality_distribution.resize(256, 0); - // base_quality_distribution[x] means number of reads that average base quality = x. if (_input_data.user_defined_fastq_base_qual_offset > 0) { fastq_base_qual_offset = _input_data.user_defined_fastq_base_qual_offset; } else { - fastq_base_qual_offset = predict_base_quality_offset(_input_data); + fastq_base_qual_offset = 33; } read_details_fp = fopen(read_details_file.c_str(), "w"); + if (NULL == read_details_fp) { std::cerr << "Failed to write output file: " << read_details_file << std::endl; @@ -245,13 +182,20 @@ int qc_fastq_files(Input_Para &_input_data, Output_FQ &output_data) // Calculate GC-content output_data.long_read_info.gc_cnt = g_c / a_tu_g_c; - // Sort the read lengths in descending order + // Get the max read length std::vector read_lengths = output_data.long_read_info.read_lengths; - std::sort(read_lengths.begin(), read_lengths.end(), std::greater()); + if (read_lengths.size() == 0) { + std::cerr << "No reads found in input files." << std::endl; + exit_code = 3; + return exit_code; + } else { + // Sort the read lengths in descending order + std::sort(read_lengths.begin(), read_lengths.end(), std::greater()); - // Get the max read length - int64_t max_read_length = read_lengths.at(0); - output_data.long_read_info.longest_read_length = max_read_length; + // Get the max read length + int64_t max_read_length = read_lengths.at(0); + output_data.long_read_info.longest_read_length = max_read_length; + } // Get the median read length int64_t median_read_length = read_lengths[read_lengths.size() / 2]; @@ -323,4 +267,4 @@ int qc_fastq_files(Input_Para &_input_data, Output_FQ &output_data) } return exit_code; -} \ No newline at end of file +} diff --git a/src/kseq.cpp b/src/kseq.cpp deleted file mode 100644 index b05dfd2..0000000 --- a/src/kseq.cpp +++ /dev/null @@ -1,2 +0,0 @@ -#include "kseq.h" - diff --git a/src/output_data.cpp b/src/output_data.cpp index 48591dd..d0ef854 100644 --- a/src/output_data.cpp +++ b/src/output_data.cpp @@ -30,6 +30,10 @@ void Basic_Seq_Statistics::resize(){ } } +void Basic_Seq_Statistics::addBases(int count){ + this->total_num_bases += count; +} + // Base class for storing base quality data Basic_Seq_Statistics::Basic_Seq_Statistics( const Basic_Seq_Statistics& _bss){ read_gc_content_count = _bss.read_gc_content_count;