Skip to content

Commit

Permalink
0.16.0 release
Browse files Browse the repository at this point in the history
  • Loading branch information
chhylp123 committed Aug 23, 2021
1 parent 37b07e4 commit e230b3a
Show file tree
Hide file tree
Showing 10 changed files with 126 additions and 85 deletions.
37 changes: 36 additions & 1 deletion CommandLines.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ static ko_longopt_t long_options[] = {
{ "fast", ko_no_argument, 329 },
{ "dp-er", ko_required_argument, 330},
{ "max-kocc", ko_required_argument, 331},
{ "hg-size", ko_required_argument, 332},
{ 0, 0, 0 }
};

Expand Down Expand Up @@ -70,6 +71,10 @@ void Print_H(hifiasm_opt_t* asm_opt)
fprintf(stderr, " -N INT consider up to max(-D*coverage,-N) overlaps for each oriented read [%d]\n", asm_opt->max_n_chain);
fprintf(stderr, " -r INT round of correction [%d]\n", asm_opt->number_of_round);
fprintf(stderr, " -z INT length of adapters that should be removed [%d]\n", asm_opt->adapterLen);
fprintf(stderr, " --max-kocc INT\n");
fprintf(stderr, " employ k-mers occurring <INT times to rescue repetitive overlaps [%d]\n", asm_opt->max_kmer_cnt);
fprintf(stderr, " --hg-size INT(k, m or g)\n");
fprintf(stderr, " estimated haploid genome size used for inferring read coverage [auto]\n");
fprintf(stderr, " Assembly:\n");
fprintf(stderr, " -a INT round of assembly cleaning [%d]\n", asm_opt->clean_round);
fprintf(stderr, " -m INT pop bubbles of <INT in size in contig graphs [%lld]\n", asm_opt->large_pop_bubble_size);
Expand Down Expand Up @@ -213,6 +218,7 @@ void init_opt(hifiasm_opt_t* asm_opt)
asm_opt->scffold = 0;
asm_opt->dp_min_len = 2000;
asm_opt->dp_e = 0.0025;
asm_opt->hg_size = -1;
}

void destory_enzyme(enzyme* f)
Expand Down Expand Up @@ -514,6 +520,18 @@ int check_option(hifiasm_opt_t* asm_opt)
return 0;
}

if(asm_opt->max_kmer_cnt < 0)
{
fprintf(stderr, "[ERROR] [--max-kocc] must >= 0\n");
return 0;
}

if(asm_opt->hg_size < -1)
{
fprintf(stderr, "[ERROR] [--hg-size] wrong genome size\n");
return 0;
}

return 1;
}

Expand Down Expand Up @@ -599,6 +617,23 @@ void get_hic_enzymes(char *argv, enzyme** x, int check_name)
(*x)->a[k][(*x)->l[k]] = '\0';
}

int64_t inter_gsize(char *argv)
{
int64_t len = strlen(argv);
double s;
if(len <= 1) return -2;
char t = argv[len-1];
if(t != 'k' && t != 'K' && t != 'm' && t != 'M' && t != 'g' && t != 'G') return -2;
char *ss=(char*)malloc(len);
memcpy(ss, argv, len-1); ss[len-1] = '\0';
s = atof(ss);
free(ss);
if(t == 'k' || t == 'K') return s*1000;
if(t == 'm' || t == 'M') return s*1000000;
if(t == 'g' || t == 'G') return s*1000000000;
return s;
}

int CommandLine_process(int argc, char *argv[], hifiasm_opt_t* asm_opt)
{
ketopt_t opt = KETOPT_INIT;
Expand Down Expand Up @@ -686,6 +721,7 @@ int CommandLine_process(int argc, char *argv[], hifiasm_opt_t* asm_opt)
else if (c == 329) asm_opt->flag |= HA_F_FAST;
else if (c == 330) asm_opt->dp_e = atof(opt.arg);
else if (c == 331) asm_opt->max_kmer_cnt = atol(opt.arg);
else if (c == 332) asm_opt->hg_size = inter_gsize(opt.arg);
else if (c == 'l')
{ ///0: disable purge_dup; 1: purge containment; 2: purge overlap
asm_opt->purge_level_primary = asm_opt->purge_level_trio = atoi(opt.arg);
Expand Down Expand Up @@ -716,6 +752,5 @@ int CommandLine_process(int argc, char *argv[], hifiasm_opt_t* asm_opt)

get_queries(argc, argv, &opt, asm_opt);


return check_option(asm_opt);
}
3 changes: 2 additions & 1 deletion CommandLines.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#include <pthread.h>
#include <stdint.h>

#define HA_VERSION "0.15.5-r366"
#define HA_VERSION "0.16.0-r369"

#define VERBOSE 0

Expand Down Expand Up @@ -113,6 +113,7 @@ typedef struct {
uint64_t scffold;
int32_t dp_min_len;
float dp_e;
int64_t hg_size;
} hifiasm_opt_t;

extern hifiasm_opt_t asm_opt;
Expand Down
20 changes: 16 additions & 4 deletions Purge_Dups.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -122,8 +122,8 @@ void print_peak(long long* cov_buf, long long cov_buf_length, long long max_i)
}


void get_read_peak(long long* cov_buf, long long cov_buf_length, long long* topo_peak_cov,
long long* hom_peak, long long* het_peak, long long* k_mer_only, long long* coverage_only)
void get_read_peak(asg_t *read_g, long long* cov_buf, long long cov_buf_length, long long* topo_peak_cov,
long long* hom_peak, long long* het_peak, long long* k_mer_only, long long* coverage_only, long long g_size)
{
long long i, start, err_i, max_i, max2_i, max3_i, topo_peak_i, max, max2, max3, topo_peak, min;

Expand Down Expand Up @@ -242,6 +242,18 @@ long long* hom_peak, long long* het_peak, long long* k_mer_only, long long* cove
coverage_hom = max_i;
}

if(g_size > 0) {
long long n_bs, m_peak_hom = -1;
int p_ht = -1;
for (i = n_bs = 0; i < read_g->n_seq; i++) n_bs += read_g->seq[i].len;
m_peak_hom = n_bs/g_size;
if(m_peak_hom > 0) {
p_ht = -1;
coverage_hom = adj_m_peak_hom(m_peak_hom, max_i, max2_i, max3_i, &p_ht);
coverage_het = p_ht;
}
}

if(k_mer_het != -1)
{
(*het_peak) = k_mer_het;
Expand Down Expand Up @@ -410,8 +422,8 @@ long long* k_mer_only, long long* coverage_only)
cov_buf_length);
}

get_read_peak(cov_buf, cov_buf_length, alter_peak == -1? NULL: &alter_peak, &hom_peak, &het_peak,
k_mer_only, coverage_only);
get_read_peak(read_g, cov_buf, cov_buf_length, alter_peak == -1? NULL: &alter_peak, &hom_peak, &het_peak,
k_mer_only, coverage_only, asm_opt.hg_size);

free(cov_buf);

Expand Down
4 changes: 2 additions & 2 deletions docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,9 @@
# built documents.
#
# The short X.Y version.
version = '0.15.5-r350'
version = '0.16.0-r369'
# The full version, including alpha/beta/rc tags.
release = '0.15.5'
release = '0.16.0'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
Expand Down
4 changes: 2 additions & 2 deletions docs/source/faq.rst
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ Why one Hi-C integrated assembly is larger than another one?
Another possibility is that hifiasm misidentifies coverage threshold for homozygous reads. For instance, hifiasm prints the following information during assembly:
::

[M::purge_dups] purge duplication coverage threshold: 36
[M::purge_dups] homozygous read coverage threshold: 36

In this example, hifiasm identifies the coverage threshold for homozygous reads as ``36``. If it is significantly smaller than the homozygous coverage peak, hifiasm will generate two unbalanced assemblies. In this case, please set ``--hom-cov`` to homozygous coverage peak. Please note that tuning ``--hom-cov`` may affect ``*p_utg*gfa`` so that ``*hic*.bin`` should be deleted. Since v0.15.5, hifiasm can detect such changes and renew Hi-C bin files automatically.

Expand All @@ -60,7 +60,7 @@ For Hi-C integrated assembly, why the assembly size of both haplotypes are much
If most bases of a diploid sample are homozygous, the coverage threshold is wrongly determined by hifiasm. For instance, hifiasm prints the following information during assembly:
::

[M::purge_dups] purge duplication coverage threshold: 36
[M::purge_dups] homozygous read coverage threshold: 36

In this example, hifiasm identifies the coverage threshold for homozygous reads as ``36``. If it is much smaller than homozygous coverage peak, hifiasm thinks most reads are homozygous and assign them to both assemblies, making both of them much larger than the estimated haploid genome size. In this case, please set ``--hom-cov`` to homozygous coverage peak. Please note that tuning ``--hom-cov`` may affect ``*p_utg*gfa`` so that ``*hic*.bin`` should be deleted. Since v0.15.5, hifiasm can detect such changes and renew Hi-C bin files automatically.

Expand Down
11 changes: 11 additions & 0 deletions docs/source/parameter-reference.rst
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,17 @@ Error correction options
**\-z <INT=0>**
Length of adapters that should be removed. This option remove ``INT`` bases from both ends of each read. Some old HiFi reads may consist of short adapters (e.g. 20bp adapter at one end). For such data, trimming short adapters would significantly improve the assembly quality.

.. _max-kocc-opt:

**\-\-max-kocc <INT=2000>**
Employ k-mers occurring < ``INT`` times to rescue repetitive overlaps. This option may improve the resolution of repeats.


.. _hg-size-opt:

**\-\-hg-size <INT(k/m/g)>**
Estimated haploid genome size used for inferring read coverage. This option is used to get accurate homozygous read coverage during error correction. Common suffices are required, for example, 100m or 3g.


.. _min-hist-cnt-opt:

Expand Down
15 changes: 14 additions & 1 deletion hifiasm.1
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
.TH hifiasm 1 "25 July 2021" "hifiasm-0.15.5 (r350)" "Bioinformatics tools"
.TH hifiasm 1 "22 August 2021" "hifiasm-0.16.0 (r369)" "Bioinformatics tools"

.SH NAME
.PP
Expand Down Expand Up @@ -161,6 +161,19 @@ Some old Hifi reads may consist of
short adapters (e.g., 20bp adapter at one end). For such data, trimming short adapters would
significantly improve the assembly quality.

.TP
.BI --max-kocc \ INT
Employ k-mers occurring <
.IR INT
times to rescue repetitive overlaps [2000].
This option may improve the resolution of repeats.

.TP
.BI --hg-size \ INT (k/m/g)
Estimated haploid genome size used for inferring read coverage [auto].
This option is used to get accurate homozygous read coverage during
error correction. Common suffices are required, for example, 100m or 3g.

.TP
.BI --min-hist-cnt \ INT
When analyzing the k-mer spectrum, ignore counts below
Expand Down
32 changes: 31 additions & 1 deletion hist.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,35 @@ void print_hist_lines(int n_cnt, int start_cnt, const int64_t *cnt)
}
}

int ha_analyze_count(int n_cnt, int start_cnt, const int64_t *cnt, int *peak_het)
int adj_m_peak_hom(int m_peak_hom, int max_i, int max2_i, int max3_i, int *peak_het)
{
int64_t mm[3], d, min_i, min_d, i;
mm[0] = max2_i; mm[1] = max_i; mm[2] = max3_i;
for (i = 0, min_i = -1, min_d = -1; i < 3; i++){
if(mm[i] <= 0) continue;
d = (mm[i] >= m_peak_hom?mm[i]-m_peak_hom:m_peak_hom-mm[i]);
if(min_d == -1 || min_d > d || (min_d == d && i == 1)){
min_d = d; min_i = i;
}
}
if(min_i < 0) return m_peak_hom;
if(mm[min_i] < m_peak_hom){
d = m_peak_hom - mm[min_i];
if(d >= mm[min_i]*0.51) {
*peak_het = mm[min_i];
return m_peak_hom;
}
}

for (i = min_i-1; i >= 0; i--){
if(mm[i] <= 0) continue;
*peak_het = mm[i];
break;
}
return mm[min_i];
}

int ha_analyze_count(int n_cnt, int start_cnt, int m_peak_hom, const int64_t *cnt, int *peak_het)
{
const int hist_max = 100;
int i, start, low_i, max_i, max2_i, max3_i;
Expand Down Expand Up @@ -117,6 +145,8 @@ int ha_analyze_count(int n_cnt, int start_cnt, const int64_t *cnt, int *peak_het
}
if (max3 > 0) fprintf(stderr, "[M::%s] right: count[%d] = %ld\n", __func__, max3_i, (long)cnt[max3_i]);
else fprintf(stderr, "[M::%s] right: none\n", __func__);

if(m_peak_hom > 0) return adj_m_peak_hom(m_peak_hom, max_i, max2_i, max3_i, peak_het);
if (max3_i > 0) {
*peak_het = max_i;
return max3_i;
Expand Down
Loading

0 comments on commit e230b3a

Please sign in to comment.