diff --git a/CHANGELOG.md b/CHANGELOG.md index ba6cd6c..0438822 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,11 +5,37 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/) and this project adheres to [Semantic Versioning](http://semver.org/). +## [1.4.2] - 11.03.2019 +### Added +- Cell barcode correction based on whitelist instead of top cells. This will trigger automatically + when a whitelist is provided. +- UMI correction for tags with more than than 20'000 umis is too slow right now. + Cells containing a TAG with more than 20'000 umis will be discarded. This means that more than 20'000 + antibody tags with different umis have been captured which is odd. Such high numbers could mean + that cells have been aggregating and can't be used for downstream analysis. + The number of cells are reported under `Bad cells` and the dense matrix of those cells is provided in `uncorrected_cells`. + Fixing issues #32 +- The option to not correct for umis with `--no_umi_correction`. +- Now prints how many reads will be processed. +- Checks that tags are indeed made of ATGC bases. +- Added a check for cell barcode correction and collapsing threshold for given whitelist. +- New option to allow for a sliding window when aligning TAGS. `--sliding-window` + Use when you have a variable sequence before your tags. + +### Changed +- Sparse matrix is now based on int32 instead of int16. Fixing issue #40 +- Cell barcode correction without a whitelist is not outputing any error anymore. + This is due to the fact that it uses a hard threshold based on the `-cells` option + if it kind find one. Fixing issues #29, #36 +- Upgraded umi_tools to `1.0.0` +- fixed the error linked to umi_tools version update `getCellWhitelist` #45. + + ## [1.4.0] - 24.01.2019 ### Added - Enabled parallelization using multiprocessing. You can choose how many cores/threads you want to use with the `-T` `--threads` option. Takes max - cpu by default. It will run on only one core if you run only 1'000'000 reads. + cpu by default. It will run on only one core if you run 1'000'000 reads or less. - The output is now given in a gzipped mtx format. This is to ensure smooth usage for new/larger datasets such as data from novaseq sequencers. For those who still want to use the dense format, there is an option `--dense` which will add the dense output diff --git a/README.md b/README.md index 7babda3..718b2ed 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,4 @@ +[![DOI](https://zenodo.org/badge/99617772.svg)](https://zenodo.org/badge/latestdoi/99617772) # CITE-seq-Count A python package that allows to count antibody TAGS from a [CITE-seq](https://www.nature.com/articles/nmeth.4380) and/or [cell hashing](https://www.biorxiv.org/content/early/2017/12/21/237693) experiment. @@ -14,7 +15,7 @@ Installation ------------------------------------------- ``` -pip install CITE-seq-Count==1.4.1.1 +pip install CITE-seq-Count==1.4.2 ``` diff --git a/cite_seq_count/__main__.py b/cite_seq_count/__main__.py index 67c1696..1f88301 100755 --- a/cite_seq_count/__main__.py +++ b/cite_seq_count/__main__.py @@ -7,6 +7,7 @@ import os import datetime import pkg_resources +import logging from argparse import ArgumentParser from argparse import RawTextHelpFormatter @@ -75,6 +76,8 @@ def get_args(): barcodes.add_argument('--umi_collapsing_dist', dest='umi_threshold', required=False, type=int, default=2, help="threshold for umi collapsing.") + barcodes.add_argument('--no_umi_correctio', required=False, action='store_true', default=False, + dest='no_umi_correction', help="Deactivate UMI collapsing") barcodes.add_argument('--bc_collapsing_dist', dest='bc_threshold', required=False, type=int, default=1, help="threshold for cellular barcode collapsing.") @@ -114,6 +117,12 @@ def get_args(): help=("Number of bases to discard from read2.") ) + filters.add_argument( + '--sliding-window', dest='sliding_window', + required=False, default=False, action='store_true', + help=("Allow for a sliding window when aligning.") + ) + # Remaining arguments. parser.add_argument('-T', '--threads', required=False, type=int, @@ -136,12 +145,11 @@ def get_args(): help="Print extra information for debugging.") parser.add_argument('--version', action='version', version='CITE-seq-Count v{}'.format(version), help="Print version number.") - # Finally! Too many options XD return parser -def create_report(n_reads, reads_per_cell, no_match, version, start_time, ordered_tags_map, umis_corrected, bcs_corrected, args): +def create_report(n_reads, reads_per_cell, no_match, version, start_time, ordered_tags_map, umis_corrected, bcs_corrected, bad_cells, args): """ Creates a report with details about the run in a yaml format. @@ -154,8 +162,8 @@ def create_report(n_reads, reads_per_cell, no_match, version, start_time, ordere args (arg_parse): Arguments provided by the user. """ - total_mapped = sum(reads_per_cell.values()) total_unmapped = sum(no_match.values()) + total_mapped = sum(reads_per_cell.values()) - total_unmapped mapped_perc = round((total_mapped/n_reads)*100) unmapped_perc = round((total_unmapped/n_reads)*100) @@ -167,6 +175,7 @@ def create_report(n_reads, reads_per_cell, no_match, version, start_time, ordere Reads processed: {} Percentage mapped: {} Percentage unmapped: {} +Uncorrected cells: {} Correction: \tCell barcodes collapsing threshold: {} \tCell barcodes corrected: {} @@ -191,6 +200,7 @@ def create_report(n_reads, reads_per_cell, no_match, version, start_time, ordere n_reads, mapped_perc, unmapped_perc, + len(bad_cells), args.bc_threshold, bcs_corrected, args.umi_threshold, @@ -206,6 +216,15 @@ def create_report(n_reads, reads_per_cell, no_match, version, start_time, ordere args.start_trim)) def main(): + #Create logger and stream handler + logger = logging.getLogger('cite_seq_count') + logger.setLevel(logging.CRITICAL) + ch = logging.StreamHandler() + ch.setLevel(logging.CRITICAL) + formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + ch.setFormatter(formatter) + logger.addHandler(ch) + start_time = time.time() parser = get_args() if not sys.argv[1:]: @@ -215,10 +234,12 @@ def main(): # Parse arguments. args = parser.parse_args() if args.whitelist: - whitelist = preprocessing.parse_whitelist_csv(args.whitelist, - args.cb_last - args.cb_first + 1) + (whitelist, args.bc_threshold) = preprocessing.parse_whitelist_csv( + filename=args.whitelist, + barcode_length=args.cb_last - args.cb_first + 1, + collapsing_threshold=args.bc_threshold) else: - whitelist = None + whitelist = False # Load TAGs/ABs. ab_map = preprocessing.parse_tags_csv(args.tags) @@ -243,8 +264,8 @@ def main(): n_lines = preprocessing.get_n_lines(args.read1_path) n_reads = int(n_lines/4) n_threads = args.n_threads - print('Started mapping') + print('Processing {:,} reads'.format(n_reads)) #Run with one process if n_threads <= 1 or n_reads < 1000001: print('CITE-seq-Count is running with one core.') @@ -260,13 +281,14 @@ def main(): whitelist=whitelist, debug=args.debug, start_trim=args.start_trim, - maximum_distance=args.max_error) + maximum_distance=args.max_error, + sliding_window=args.sliding_window) print('Mapping done') umis_per_cell = Counter() reads_per_cell = Counter() for cell_barcode,counts in final_results.items(): - umis_per_cell[cell_barcode] = sum([len(counts[UMI]) for UMI in counts if UMI != 'unmapped']) - reads_per_cell[cell_barcode] = sum([sum(counts[UMI].values()) for UMI in counts if UMI != 'unmapped']) + umis_per_cell[cell_barcode] = sum([len(counts[UMI]) for UMI in counts]) + reads_per_cell[cell_barcode] = sum([sum(counts[UMI].values()) for UMI in counts]) else: # Run with multiple processes print('CITE-seq-Count is running with {} cores.'.format(n_threads)) @@ -286,13 +308,15 @@ def main(): whitelist, args.debug, args.start_trim, - args.max_error), + args.max_error, + args.sliding_window), callback=parallel_results.append, error_callback=sys.stderr) p.close() p.join() print('Mapping done') print('Merging results') + ( final_results, umis_per_cell, @@ -301,35 +325,47 @@ def main(): ) = processing.merge_results(parallel_results=parallel_results) del(parallel_results) - # Correct cell barcodes - ( - final_results, - umis_per_cell, - bcs_corrected - ) = processing.correct_cells( - final_results=final_results, - umis_per_cell=umis_per_cell, - expected_cells=args.expected_cells, - collapsing_threshold=args.bc_threshold) - - # Correct umi barcodes - ( - final_results, - umis_corrected - ) = processing.correct_umis( - final_results=final_results, - collapsing_threshold=args.umi_threshold) - ordered_tags_map = OrderedDict() for i,tag in enumerate(ab_map.values()): ordered_tags_map[tag] = i ordered_tags_map['unmapped'] = i + 1 + + # Correct cell barcodes + if(len(umis_per_cell) <= args.expected_cells): + print("Number of expected cells, {}, is higher " \ + "than number of cells found {}.\nNot performing" \ + "cell barcode correction" \ + "".format(args.expected_cells, len(umis_per_cell))) + bcs_corrected = 0 + else: + print('Correcting cell barcodes') + if not whitelist: + ( + final_results, + umis_per_cell, + bcs_corrected + ) = processing.correct_cells( + final_results=final_results, + umis_per_cell=umis_per_cell, + expected_cells=args.expected_cells, + collapsing_threshold=args.bc_threshold) + else: + ( + final_results, + umis_per_cell, + bcs_corrected) = processing.correct_cells_whitelist( + final_results=final_results, + umis_per_cell=umis_per_cell, + whitelist=whitelist, + collapsing_threshold=args.bc_threshold) - # Sort cells by number of mapped umis + # Correct umi barcodes if not whitelist: top_cells_tuple = umis_per_cell.most_common(args.expected_cells) top_cells = set([pair[0] for pair in top_cells_tuple]) + + # Sort cells by number of mapped umis else: top_cells = whitelist # Add potential missing cell barcodes. @@ -339,11 +375,41 @@ def main(): else: final_results[missing_cell] = dict() for TAG in ordered_tags_map: - final_results[missing_cell][TAG] = 0 + final_results[missing_cell][TAG] = Counter() top_cells.add(missing_cell) + #If we want umi correction + if not args.no_umi_correction: + ( + final_results, + umis_corrected, + aberrant_cells + ) = processing.correct_umis( + final_results=final_results, + collapsing_threshold=args.umi_threshold, + top_cells=top_cells, + max_umis=20000) + else: + umis_corrected = 0 + aberrant_cells = set() + for cell_barcode in aberrant_cells: + top_cells.remove(cell_barcode) + #Create sparse aberrant cells matrix + ( + umi_aberrant_matrix, + read_aberrant_matrix + ) = processing.generate_sparse_matrices( + final_results=final_results, + ordered_tags_map=ordered_tags_map, + top_cells=aberrant_cells) + + #Write uncorrected cells to dense output + io.write_dense( + sparse_matrix=umi_aberrant_matrix, + index=list(ordered_tags_map.keys()), + columns=aberrant_cells, + outfolder=os.path.join(args.outfolder,'uncorrected_cells'), + filename='dense_umis.tsv') - - ( umi_results_matrix, read_results_matrix @@ -351,12 +417,14 @@ def main(): final_results=final_results, ordered_tags_map=ordered_tags_map, top_cells=top_cells) + # Write umis to file io.write_to_files( sparse_matrix=umi_results_matrix, top_cells=top_cells, ordered_tags_map=ordered_tags_map, data_type='umi', outfolder=args.outfolder) + # Write reads to file io.write_to_files( sparse_matrix=read_results_matrix, top_cells=top_cells, @@ -365,6 +433,7 @@ def main(): outfolder=args.outfolder) top_unmapped = merged_no_match.most_common(args.unknowns_top) + with open(os.path.join(args.outfolder, args.unmapped_file),'w') as unknown_file: unknown_file.write('tag,count\n') for element in top_unmapped: @@ -378,6 +447,7 @@ def main(): ordered_tags_map=ordered_tags_map, umis_corrected=umis_corrected, bcs_corrected=bcs_corrected, + bad_cells=aberrant_cells, args=args) if args.dense: print('Writing dense format output') @@ -385,7 +455,8 @@ def main(): sparse_matrix=umi_results_matrix, index=list(ordered_tags_map.keys()), columns=top_cells, - file_path=os.path.join(args.outfolder, 'dense_umis.tsv')) + outfolder=args.outfolder, + filename='dense_umis.tsv') if __name__ == '__main__': main() diff --git a/cite_seq_count/io.py b/cite_seq_count/io.py index 344c3ec..5c46ee0 100644 --- a/cite_seq_count/io.py +++ b/cite_seq_count/io.py @@ -31,6 +31,8 @@ def write_to_files(sparse_matrix, top_cells, ordered_tags_map, data_type, outfol shutil.copyfileobj(mtx_in, mtx_gz) os.remove(os.path.join(prefix,'matrix.mtx')) -def write_dense(sparse_matrix, index, columns, file_path): +def write_dense(sparse_matrix, index, columns, outfolder, filename): + prefix = os.path.join(outfolder) + os.makedirs(prefix, exist_ok=True) pandas_dense = pd.DataFrame(sparse_matrix.todense(), columns=columns, index=index) - pandas_dense.to_csv(file_path, sep='\t') + pandas_dense.to_csv(os.path.join(outfolder,filename), sep='\t') diff --git a/cite_seq_count/preprocessing.py b/cite_seq_count/preprocessing.py index 7ccc589..f8638c1 100644 --- a/cite_seq_count/preprocessing.py +++ b/cite_seq_count/preprocessing.py @@ -49,7 +49,7 @@ def chunk_reads(n_reads, n): return(indexes) -def parse_whitelist_csv(filename, barcode_length): +def parse_whitelist_csv(filename, barcode_length, collapsing_threshold): """Reads white-listed barcodes from a CSV file. The function accepts plain barcodes or even 10X style barcodes with the @@ -69,10 +69,38 @@ def parse_whitelist_csv(filename, barcode_length): cell_pattern = regex.compile(r'[ATGC]{{{}}}'.format(barcode_length)) whitelist = [row[0].strip(STRIP_CHARS) for row in csv_reader if (len(row[0].strip(STRIP_CHARS)) == barcode_length)] - for cell_barcode in whitelist: - if not cell_pattern.match(cell_barcode): - sys.exit('This barcode {} is not only composed of ATGC bases.'.format(cell_barcode)) - return set(whitelist) + for cell_barcode in whitelist: + if not cell_pattern.match(cell_barcode): + sys.exit('This barcode {} is not only composed of ATGC bases.'.format(cell_barcode)) + collapsing_threshold=test_cell_distances(whitelist, collapsing_threshold) + return(set(whitelist), collapsing_threshold) + + +def test_cell_distances(whitelist, collapsing_threshold): + """Tests cell barcode distances to validate provided cell barcode collapsing threshold + + Function needs the given whitelist as well as the threshold. + If the value is too high, it will rerun until an acceptable value is found. + + Args: + whitelist (set): Whitelist barcode set + collapsing_threshold (int): Value of threshold + + Returns: + collapsing_threshold (int): Valid threshold + """ + ok = False + while not ok: + print('Testing cell barcode collapsing threshold of {}'.format(collapsing_threshold)) + for comb in combinations(whitelist, 2): + if Levenshtein.hamming(comb[0], comb[1]) <= collapsing_threshold: + collapsing_threshold -= 1 + print('Value is too high, reducing it by 1') + break + else: + ok = True + print('Using {} for cell barcode collapsing threshold'.format(collapsing_threshold)) + return(collapsing_threshold) def parse_tags_csv(filename): @@ -131,7 +159,11 @@ def check_tags(tags, maximum_distance): distance = Levenshtein.distance(a, b) if (distance <= (maximum_distance - 1)): offending_pairs.append([a, b, distance]) - + DNA_pattern = regex.compile('^[ATGC]*$') + for tag in tags: + if not DNA_pattern.match(tag): + print('This tag {} is not only composed of ATGC bases.\nPlease check your tags file'.format(tag)) + sys.exit('Exiting the application.\n') # If offending pairs are found, print them all. if offending_pairs: print( @@ -212,6 +244,7 @@ def check_barcodes_lengths(read1_length, cb_first, cb_last, umi_first, umi_last) return(barcode_slice, umi_slice, barcode_umi_length) + def blocks(files, size=65536): """ A fast way of counting the lines of a large file. diff --git a/cite_seq_count/processing.py b/cite_seq_count/processing.py index d4e0713..0b478bf 100644 --- a/cite_seq_count/processing.py +++ b/cite_seq_count/processing.py @@ -4,15 +4,17 @@ import os import Levenshtein import regex +import pybktree from collections import Counter from collections import defaultdict from itertools import islice -from numpy import int16 +from numpy import int32 from scipy import sparse from umi_tools import network from umi_tools import umi_methods +import umi_tools.whitelist_methods as whitelist_methods from cite_seq_count import secondsToText @@ -37,17 +39,53 @@ def find_best_match(TAG_seq, tags, maximum_distance): best_match = 'unmapped' best_score = maximum_distance for tag, name in tags.items(): - score = Levenshtein.distance(tag, TAG_seq[:len(tag)]) - if score <= best_score: + score = Levenshtein.hamming(tag, TAG_seq[:len(tag)]) + if score == 0: + #Best possible match + return(name) + elif score <= best_score: best_score = score best_match = name return(best_match) return(best_match) +def find_best_match_shift(TAG_seq, tags, maximum_distance): + """ + Find the best match from the list of tags with sliding window. + + Compares the Levenshtein distance between tags and the trimmed sequences. + The tag and the sequence must have the same length. + If no matches found returns 'unmapped'. + We add 1 + Args: + TAG_seq (string): Sequence from R1 already start trimmed + tags (dict): A dictionary with the TAGs as keys and TAG Names as values. + maximum_distance (int): Maximum distance given by the user. + + Returns: + best_match (string): The TAG name that will be used for counting. + """ + best_match = 'unmapped' + best_score = maximum_distance + shifts = range(0,len(TAG_seq) - len(max(tags,key=len))) + + for shift in shifts: + for tag, name in tags.items(): + score = Levenshtein.hamming(tag, TAG_seq[shift:len(tag)+shift]) + if score == 0: + #Best possible match + return(name) + elif score <= best_score: + best_score = score + best_match = name + return(best_match) + return(best_match) + + def map_reads(read1_path, read2_path, tags, barcode_slice, umi_slice, indexes, whitelist, debug, - start_trim, maximum_distance): + start_trim, maximum_distance, sliding_window): """Read through R1/R2 files and generate a islice starting at a specific index. It reads both Read1 and Read2 files, creating a dict based on cell barcode. @@ -106,7 +144,10 @@ def map_reads(read1_path, read2_path, tags, barcode_slice, if cell_barcode not in results: results[cell_barcode] = defaultdict(Counter) - best_match = find_best_match(TAG_seq, tags, maximum_distance) + if(sliding_window): + best_match = find_best_match_shift(TAG_seq, tags, maximum_distance) + else: + best_match = find_best_match(TAG_seq, tags, maximum_distance) results[cell_barcode][best_match][UMI] += 1 @@ -157,14 +198,13 @@ def merge_results(parallel_results): if mapped[cell_barcode][TAG]: for UMI in mapped[cell_barcode][TAG]: merged_results[cell_barcode][TAG][UMI] += mapped[cell_barcode][TAG][UMI] - if TAG != 'unmapped': - umis_per_cell[cell_barcode] += len(mapped[cell_barcode][TAG]) - reads_per_cell[cell_barcode] += mapped[cell_barcode][TAG][UMI] + umis_per_cell[cell_barcode] += len(mapped[cell_barcode][TAG]) + reads_per_cell[cell_barcode] += mapped[cell_barcode][TAG][UMI] merged_no_match.update(unmapped) return(merged_results, umis_per_cell, reads_per_cell, merged_no_match) -def correct_umis(final_results, collapsing_threshold): +def correct_umis(final_results, collapsing_threshold, top_cells, max_umis): """ Corrects umi barcodes within same cell/tag groups. @@ -178,21 +218,72 @@ def correct_umis(final_results, collapsing_threshold): """ print('Correcting umis') corrected_umis = 0 - for cell_barcode in final_results: + aberrant_umi_count_cells = set() + for cell_barcode in top_cells: for TAG in final_results[cell_barcode]: - if len(final_results[cell_barcode][TAG]) > 1: + n_umis = len(final_results[cell_barcode][TAG]) + if n_umis > 1 and n_umis <= max_umis: umi_clusters = network.UMIClusterer() UMIclusters = umi_clusters( final_results[cell_barcode][TAG], collapsing_threshold) - for umi_cluster in UMIclusters: # This is a list with the first element the dominant barcode - if(len(umi_cluster) > 1): # This means we got a correction - major_umi = umi_cluster[0] - for minor_umi in umi_cluster[1:]: - corrected_umis += 1 - temp = final_results[cell_barcode][TAG].pop(minor_umi) - final_results[cell_barcode][TAG][major_umi] += temp - return(final_results, corrected_umis) + (new_res, temp_corrected_umis) = update_umi_counts(UMIclusters, final_results[cell_barcode].pop(TAG)) + final_results[cell_barcode][TAG] = new_res + corrected_umis += temp_corrected_umis + elif n_umis > max_umis: + aberrant_umi_count_cells.add(cell_barcode) + return(final_results, corrected_umis, aberrant_umi_count_cells) + + +def update_umi_counts(UMIclusters, cell_tag_counts): + """ + Update a dict object with umis corrected. + + Args: + UMIclusters (list): List of lists with corrected umis + cell_tag_counts (Counter): Counter of umis + + Returns: + cell_tag_counts (Counter): Updated Counter of umis + temp_corrected_umis (int): Number of corrected umis + """ + temp_corrected_umis = 0 + for umi_cluster in UMIclusters: # This is a list with the first element the dominant barcode + if(len(umi_cluster) > 1): # This means we got a correction + major_umi = umi_cluster[0] + for minor_umi in umi_cluster[1:]: + temp_corrected_umis += 1 + temp = cell_tag_counts.pop(minor_umi) + cell_tag_counts[major_umi] += temp + return(cell_tag_counts, temp_corrected_umis) + + +def collapse_cells(true_to_false, umis_per_cell, final_results): + """ + Collapses cell barcodes based on the mapping true_to_false + + Args: + true_to_false (dict): Mapping between the reference and the "mutated" barcodes. + umis_per_cell (Counter): Counter of number of umis per cell. + final_results (dict): Dict of dict of Counters with mapping results. + + Returns: + umis_per_cell (Counter): Counter of number of umis per cell. + final_results (dict): Same as input but with corrected cell barcodes. + corrected_barcodes (int): How many cell barcodes have been corrected. + """ + print('Collapsing cell barcodes') + corrected_barcodes = 0 + for real_barcode in true_to_false: + for fake_barcode in true_to_false[real_barcode]: + if real_barcode in final_results: + temp = final_results.pop(fake_barcode) + corrected_barcodes += 1 + for TAG in temp.keys(): + final_results[real_barcode][TAG].update(temp[TAG]) + temp_umi_counts = umis_per_cell.pop(fake_barcode) + umis_per_cell[real_barcode] += temp_umi_counts + return(umis_per_cell, final_results, corrected_barcodes) def correct_cells(final_results, umis_per_cell, collapsing_threshold, expected_cells): @@ -207,28 +298,75 @@ def correct_cells(final_results, umis_per_cell, collapsing_threshold, expected_c Returns: final_results (dict): Same as input but with corrected umis. + umis_per_cell (Counter): Counter of umis per cell after cell barcode correction corrected_umis (int): How many umis have been corrected. """ - print('Correcting cell barcodes') - corrected_barcodes = 0 - try: - cell_whitelist, true_to_false_map = umi_methods.getCellWhitelist( - cell_barcode_counts=umis_per_cell, - expect_cells=expected_cells, - cell_number=False, - error_correct_threshold=collapsing_threshold, - plotfile_prefix=False) - if true_to_false_map: - for real_barcode in true_to_false_map: - for fake_barcode in true_to_false_map[real_barcode]: - temp = final_results.pop(fake_barcode) - corrected_barcodes += 1 - for TAG in temp.keys(): - final_results[real_barcode][TAG].update(temp[TAG]) - temp_umi_counts = umis_per_cell.pop(fake_barcode) - umis_per_cell[real_barcode] += temp_umi_counts - except Exception as e: - print('Could not find a good local minima for correction.\nNo cell barcode correction was done.') + print('Finding a whitelist') + cell_whitelist, true_to_false = whitelist_methods.getCellWhitelist( + cell_barcode_counts=umis_per_cell, + expect_cells=expected_cells, + cell_number=expected_cells, + error_correct_threshold=collapsing_threshold, + plotfile_prefix=False) + + ( + umis_per_cell, + final_results, + corrected_barcodes + ) = collapse_cells( + true_to_false=true_to_false, + umis_per_cell=umis_per_cell, + final_results=final_results) + return(final_results, umis_per_cell, corrected_barcodes) + + +def correct_cells_whitelist(final_results, umis_per_cell, whitelist, collapsing_threshold): + """ + Corrects cell barcodes. + + Args: + final_results (dict): Dict of dict of Counters with mapping results. + umis_per_cell (Counter): Counter of number of umis per cell. + whitelist (set): The whitelist reference given by the user. + collapsing_threshold (int): Max distance between umis. + + Returns: + final_results (dict): Same as input but with corrected umis. + umis_per_cell (Counter): Counter of umis per cell after cell barcode correction + corrected_umis (int): How many umis have been corrected. + """ + true_to_false = defaultdict(set) + barcode_tree = pybktree.BKTree(Levenshtein.hamming, whitelist) + print('Generated barcode tree from whitelist') + cell_barcodes = list(final_results.keys()) + print('Finding reference candidates') + for i, cell_barcode in enumerate(cell_barcodes): + if cell_barcode in whitelist: + # if the barcode is already whitelisted, no need to add + continue + # get all members of whitelist that are at distance of collapsing_threshold + candidates = [white_cell for d, white_cell in barcode_tree.find(cell_barcode, collapsing_threshold) if d > 0] + + if len(candidates) == 0: + # the cell doesnt match to any whitelisted barcode, + # hence we have to drop it + # (as it cannot be asscociated with any frequent barcode) + continue + elif len(candidates) == 1: + white_cell_str = candidates[0] + true_to_false[white_cell_str].add(cell_barcode) + else: + # more than on whitelisted candidate: + # we drop it as its not uniquely assignable + continue + ( + umis_per_cell, + final_results, + corrected_barcodes + ) = collapse_cells( + true_to_false=true_to_false, + umis_per_cell=umis_per_cell, + final_results=final_results) return(final_results, umis_per_cell, corrected_barcodes) @@ -245,8 +383,8 @@ def generate_sparse_matrices(final_results, ordered_tags_map, top_cells): read_results_matrix (scipy.sparse.dok_matrix): Read counts """ - umi_results_matrix = sparse.dok_matrix((len(ordered_tags_map) ,len(top_cells)), dtype=int16) - read_results_matrix = sparse.dok_matrix((len(ordered_tags_map) ,len(top_cells)), dtype=int16) + umi_results_matrix = sparse.dok_matrix((len(ordered_tags_map) ,len(top_cells)), dtype=int32) + read_results_matrix = sparse.dok_matrix((len(ordered_tags_map) ,len(top_cells)), dtype=int32) for i,cell_barcode in enumerate(top_cells): for j,TAG in enumerate(final_results[cell_barcode]): if final_results[cell_barcode][TAG]: diff --git a/docs/docs/Installation.md b/docs/docs/Installation.md index dc1ef20..cf1dd6d 100644 --- a/docs/docs/Installation.md +++ b/docs/docs/Installation.md @@ -4,7 +4,7 @@ Installation with pip `CITE-seq-Count` is stored on pypi. You can install it using the following command: ``` -pip install CITE-seq-Count +pip install CITE-seq-Count==1.4.2 ``` diff --git a/docs/docs/Reading-the-output.md b/docs/docs/Reading-the-output.md index e8961b3..cf28e72 100644 --- a/docs/docs/Reading-the-output.md +++ b/docs/docs/Reading-the-output.md @@ -39,6 +39,7 @@ CITE-seq-Count Version: 1.4.0 Reads processed: 50000000 Percentage mapped: 95 Percentage unmapped: 5 +Uncorrected cells: 33 Correction: Cell barcodes collapsing threshold: 1 Cell barcodes corrected: 20000 @@ -65,7 +66,25 @@ Packages to read MTX I recommend using `Seurat` and their `Read10x` function to read the results. -Example: `Read10x('OUTFOLDER/umi_count', gene.column=1)` +With Seurat V3: + +`Read10x('OUTFOLDER/umi_count/', gene.column=1)` + + +With Matrix: + +``` +library(Matrix) +matrix_dir = "/path_to_your_directory/out_cite_seq_count/umi_count/" +barcode.path <- paste0(matrix_dir, "barcodes.tsv.gz") +features.path <- paste0(matrix_dir, "features.tsv.gz") +matrix.path <- paste0(matrix_dir, "matrix.mtx.gz") +mat <- readMM(file = matrix.path) +feature.names = read.delim(features.path, header = FALSE, stringsAsFactors = FALSE) +barcode.names = read.delim(barcode.path, header = FALSE, stringsAsFactors = FALSE) +colnames(mat) = barcode.names$V1 +rownames(mat) = feature.names$V1 +``` **Python** diff --git a/docs/docs/Running-the-script.md b/docs/docs/Running-the-script.md index 6aa64e3..8652154 100644 --- a/docs/docs/Running-the-script.md +++ b/docs/docs/Running-the-script.md @@ -128,7 +128,7 @@ Barcodes from 1 to 16 and UMI from 17 to 26, then this is the input you need: Filtering for structure of the antibody barcode as well as maximum errors. -* [OPTIONAL] Maximum Levenshtein distance allowed. This allows to catch antibody barcodes that might have `--max-error` errors compared to the real barcodes. +* [OPTIONAL] Maximum Levenshtein distance allowed. This allows to catch antibody barcodes that might have `--max-error` errors compared to the real barcodes. (was `-hd` in previous versions) `--max-error MAX_ERROR`, default `3` @@ -148,6 +148,23 @@ There is a sanity check when for the `MAX_ERROR` value chosen to be sure you are `-trim N_BASES, --start-trim N_BASES`, default `0` +* [OPTIONAL] Activate sliding window alignement. Use this when you have a protocol that has a variable sequence before the inserted TAG. + +`--sliding-window`, default `False` + +Example: + +The TAG: `ATGCTAGCT` with a variable prefix: `TTCAATTTCA` +R2 reads: +``` +TTCA ATGCTAGCTAAAAAAAAAAAAAAAAA +TTCAA ATGCTAGCTAAAAAAAAAAAAAAAA +TTCAAT ATGCTAGCTAAAAAAAAAAAAAAA +TTCAATT ATGCTAGCTAAAAAAAAAAAAAA +TTCAATTT ATGCTAGCTAAAAAAAAAAAAA +TTCAATTTC ATGCTAGCTAAAAAAAAAAAA +``` + ### OUTPUT You have to choose either the number of cells you expect or give it a list of cell barcodes to retrieve. @@ -156,7 +173,7 @@ You have to choose either the number of cells you expect or give it a list of ce `-cells CELLS, --expected_cells CELLS` -* [Optional] If you have a whitelist of barcodes produced by the cDNA data, you are using a well-plate based protocol or any known whitelist, you can use this list to only extract TAGS matching those barcodes. Typically using the cell list from the mRNA data of the same experiment. +* [Optional] If you have a whitelist of barcodes produced by the cDNA data, you are using a well-plate based protocol or a platform reference, you can use this list to only extract TAGS matching those barcodes. Typically using the cell list from the mRNA data of the same experiment. This is highly recommended as knowing the true cells helps for cell barcode correction. `-wl WHITELIST, --whitelist WHITELIST` diff --git a/docs/docs/index.md b/docs/docs/index.md index b8bb589..79133a5 100644 --- a/docs/docs/index.md +++ b/docs/docs/index.md @@ -6,4 +6,19 @@ IMPORTANT NEWS For users who have processed data using CITE-seq-Count version `1.3.4` please rerun any data using version 1.4.0. -There was a bug and the counts are not umi counts! \ No newline at end of file +There was a bug and the counts are not umi counts! + + +How to cite CITE-seq-Count +------------------------------- + +If you are using `CITE-seq-count` for you experiments, please consider citing it. + +You can use the DOI provided bellow. + +[![DOI](https://zenodo.org/badge/99617772.svg)](https://zenodo.org/badge/latestdoi/99617772) + + +Bugs or feature requests +----------------------------- +You are welcome to address any issues on the [github page](https://github.com/Hoohm/CITE-seq-Count/issues) \ No newline at end of file diff --git a/setup.py b/setup.py index d73a4ab..79a17c3 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setuptools.setup( name="CITE-seq-Count", - version="1.4.1.1", + version="1.4.2", author="Roelli Patrick", author_email="patrick.roelli@gmail.com", description="A python package that counts CITE seq or hashing data for single cell experiments", @@ -28,7 +28,8 @@ 'umi_tools==1.0.0', 'pytest==4.1.0', 'pytest-dependency==0.4.0', - 'pandas>=0.23.4' + 'pandas>=0.23.4', + 'pybktree==1.1' ], python_requires='>=3.6' ) diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py index 16b4682..f702caa 100644 --- a/tests/test_preprocessing.py +++ b/tests/test_preprocessing.py @@ -44,7 +44,7 @@ def data(): @pytest.mark.dependency() def test_parse_whitelist_csv(data): - assert preprocessing.parse_whitelist_csv(pytest.correct_whitelist_path, 16) == pytest.correct_whitelist + assert preprocessing.parse_whitelist_csv(pytest.correct_whitelist_path, 16, 1) == (pytest.correct_whitelist,1) @pytest.mark.dependency() def test_parse_tags_csv(data): diff --git a/tests/test_processing.py b/tests/test_processing.py index 8801b89..624fa03 100644 --- a/tests/test_processing.py +++ b/tests/test_processing.py @@ -85,13 +85,15 @@ def data(): {'test2-CGTACGTAGCCTAGC': Counter({b'TAGCTTAGTA': 5, b'GCGATGCATA': 1})} } - pytest.umis_per_cell = { + pytest.umis_per_cell = Counter({ 'ACTGTTTTATTGGCCT': 1, 'TTCATAAGGTAGGGAT': 2 - } + }) pytest.expected_cells = 2 pytest.no_match = Counter() pytest.collapsing_threshold = 1 + pytest.sliding_window = False + pytest.max_umis = 20000 pytest.sequence_pool = [] pytest.tags_complete = preprocessing.check_tags(preprocessing.parse_tags_csv('tests/test_data/tags/correct.csv'), 5) @@ -151,12 +153,14 @@ def test_classify_reads_multi_process(data): pytest.correct_whitelist, pytest.debug, pytest.start_trim, - pytest.maximum_distance) + pytest.maximum_distance, + pytest.sliding_window) assert len(results) == 2 + @pytest.mark.dependency(depends=['test_classify_reads_multi_process']) def test_correct_umis(data): - temp = processing.correct_umis(pytest.results, 2) + temp = processing.correct_umis(pytest.results, 2, pytest.corrected_results.keys(), pytest.max_umis) results = temp[0] n_corrected = temp[1] for cell_barcode in results.keys(): @@ -165,10 +169,12 @@ def test_correct_umis(data): assert sum(results[cell_barcode][TAG].values()) == sum(pytest.corrected_results[cell_barcode][TAG].values()) assert n_corrected == 3 + @pytest.mark.dependency(depends=['test_correct_umis']) def test_correct_cells(data): processing.correct_cells(pytest.corrected_results, pytest.umis_per_cell, pytest.expected_cells, pytest.collapsing_threshold) + @pytest.mark.dependency(depends=['test_correct_umis']) def test_generate_sparse_matrices(data): (umi_results_matrix, read_results_matrix) = processing.generate_sparse_matrices(