Deploying to gh-pages from @ 0ee24fb 🚀

regulatory-genomics · Oct 1, 2024 · ab26fe3 · ab26fe3
commit ab26fe3
Show file tree

Hide file tree

Showing 178 changed files with 26,098 additions and 0 deletions.
diff --git a/.buildinfo b/.buildinfo
@@ -0,0 +1,4 @@
+# Sphinx build info version 1
+# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
+config: f2af386d6e99bb22829f120be7f0f3d7
+tags: 645f666f9bcd5a90fca523b33c5a78b7
diff --git a/.doctrees/_autosummary/precellar.SeqSpec.delete_read.doctree b/.doctrees/_autosummary/precellar.SeqSpec.delete_read.doctree
diff --git a/.doctrees/_autosummary/precellar.SeqSpec.doctree b/.doctrees/_autosummary/precellar.SeqSpec.doctree
diff --git a/.doctrees/_autosummary/precellar.SeqSpec.to_yaml.doctree b/.doctrees/_autosummary/precellar.SeqSpec.to_yaml.doctree
diff --git a/.doctrees/_autosummary/precellar.SeqSpec.update_read.doctree b/.doctrees/_autosummary/precellar.SeqSpec.update_read.doctree
diff --git a/.doctrees/_autosummary/precellar.align.doctree b/.doctrees/_autosummary/precellar.align.doctree
diff --git a/.doctrees/_autosummary/precellar.make_fragment.doctree b/.doctrees/_autosummary/precellar.make_fragment.doctree
diff --git a/.doctrees/_autosummary/precellar.make_genome_index.doctree b/.doctrees/_autosummary/precellar.make_genome_index.doctree
diff --git a/.doctrees/_autosummary/precellar.utils.strip_barcode_from_fastq.doctree b/.doctrees/_autosummary/precellar.utils.strip_barcode_from_fastq.doctree
diff --git a/.doctrees/api.doctree b/.doctrees/api.doctree
diff --git a/.doctrees/environment.pickle b/.doctrees/environment.pickle
diff --git a/.doctrees/index.doctree b/.doctrees/index.doctree
diff --git a/.doctrees/nbsphinx/tutorials/generic.ipynb b/.doctrees/nbsphinx/tutorials/generic.ipynb
@@ -0,0 +1,246 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Processing barcoded Fastq files\n",
+ "\n",
+ "You will likely encounter barcoded fastq files when working with single-cell ATAC-seq data.\n",
+ "In the early days of single-cell ATAC-seq, cell barcodes were usually added to the read names of the fastq files.\n",
+ "This notebook demonstrates how to process these barcoded fastq files."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import precellar"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Extracting cell barcodes from read names"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "@CCAGCACAAGCCATCCTATCGT:A00953:155:HVCHLDRXX:1:1101:1036:1031 1:N:0:1\n",
+ "ANCTTGGATCATCAGGTTTGTCTGTAGCTGATTTATTTCTTTAAGTTTCCC\n",
+ "+\n",
+ "F#FFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF\n",
+ "@TAACCACTACGAATGACTGACA:A00953:155:HVCHLDRXX:1:1101:1127:1031 1:N:0:1\n",
+ "TNCCAGGACCAGTGACCGTCACCCGCAGTAAGGATCGGGGCGGCTCCGCCA\n",
+ "+\n",
+ "F#:FFFFFFFFF:FFFFF:FF,F,FFFFFFFF,FFF:FFFF:FFFFFF,FF\n",
+ "@CGATATGTAGGGGACTAATTCC:A00953:155:HVCHLDRXX:1:1101:1145:1031 1:N:0:1\n",
+ "GNCGGATCACAAGGTCAGGAGTTCGAGACCTGGCTGGCCAACACGGTGAAA\n",
+ "\n",
+ "gzip: stdout: Broken pipe\n"
+ ]
+ }
+ ],
+ "source": [
+ "!zcat R1.fq.gz | head"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "precellar.utils.strip_barcode_from_fastq(\n",
+ " 'R1.fq.gz',\n",
+ " 'R1_processed.fq.zst',\n",
+ " out_barcode='I1.fq.zst',\n",
+ " regex=\"^([ACTG]+):\",\n",
+ " right_add=1,\n",
+ ")\n",
+ "\n",
+ "precellar.utils.strip_barcode_from_fastq(\n",
+ " 'R2.fq.gz',\n",
+ " 'R2_processed.fq.zst',\n",
+ " regex=\"^([ACTG]+):\",\n",
+ " right_add=1,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[90m[\u001b[0m2024-10-01T15:18:02Z \u001b[32mINFO \u001b[0m cached_path::cache\u001b[90m]\u001b[0m Starting download of https://raw.githubusercontent.com/regulatory-genomics/precellar/refs/heads/main/seqspec_templates/generic_atac.yaml\n",
+ "\u001b[90m[\u001b[0m2024-10-01T15:18:02Z \u001b[32mINFO \u001b[0m cached_path::cache\u001b[90m]\u001b[0m Downloaded 2643 bytes\n",
+ "\u001b[90m[\u001b[0m2024-10-01T15:18:02Z \u001b[32mINFO \u001b[0m cached_path::cache\u001b[90m]\u001b[0m New version of https://raw.githubusercontent.com/regulatory-genomics/precellar/refs/heads/main/seqspec_templates/generic_atac.yaml cached\n"
+ ]
+ }
+ ],
+ "source": [
+ "assay = precellar.SeqSpec(\"https://raw.githubusercontent.com/regulatory-genomics/precellar/refs/heads/main/seqspec_templates/generic_atac.yaml\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "\n",
+ "└── atac(153-1150)\n",
+ " ├── atac-illumina_p5(29)\n",
+ " ├── atac-read1(34) [↓R1(1-98)]\n",
+ " ├── gDNA(1-1000)\n",
+ " ├── atac-read2(34) [↑R2(1-98), ↓I1(22)]\n",
+ " ├── atac-cell_barcode(22)\n",
+ " └── atac-illumina_p7(24)"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "assay"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "assay.update_read(\"R1\", fastq=\"R1_processed.fq.zst\")\n",
+ "assay.update_read(\"I1\", fastq=\"I1.fq.zst\")\n",
+ "assay.update_read(\"R2\", fastq=\"R2_processed.fq.zst\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "\n",
+ "└── atac(153-1150)\n",
+ " ├── atac-illumina_p5(29)\n",
+ " ├── atac-read1(34) [↓R1(51)]\n",
+ " ├── gDNA(1-1000)\n",
+ " ├── atac-read2(34) [↑R2(51), ↓I1(22)]\n",
+ " ├── atac-cell_barcode(22)\n",
+ " └── atac-illumina_p7(24)"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "assay"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[90m[\u001b[0m2024-10-01T15:18:10Z \u001b[32mINFO \u001b[0m precellar::align\u001b[90m]\u001b[0m Counting barcodes...\n",
+ "\u001b[90m[\u001b[0m2024-10-01T15:18:10Z \u001b[33mWARN \u001b[0m seqspec\u001b[90m]\u001b[0m Reads (R1) may contain additional bases downstream of the variable-length region, e.g., adapter sequences.\n",
+ "\u001b[90m[\u001b[0m2024-10-01T15:18:10Z \u001b[33mWARN \u001b[0m seqspec\u001b[90m]\u001b[0m Reads (R2) may contain additional bases downstream of the variable-length region, e.g., adapter sequences.\n",
+ "\u001b[90m[\u001b[0m2024-10-01T15:18:10Z \u001b[32mINFO \u001b[0m precellar::align\u001b[90m]\u001b[0m Found 2500 barcodes. 100.00% of them have an exact match in whitelist\n",
+ "\u001b[90m[\u001b[0m2024-10-01T15:18:10Z \u001b[32mINFO \u001b[0m precellar::align\u001b[90m]\u001b[0m Aligning reads...\n",
+ "\u001b[90m[\u001b[0m2024-10-01T15:18:10Z \u001b[33mWARN \u001b[0m seqspec\u001b[90m]\u001b[0m Reads (R1) may contain additional bases downstream of the variable-length region, e.g., adapter sequences.\n",
+ "\u001b[90m[\u001b[0m2024-10-01T15:18:10Z \u001b[33mWARN \u001b[0m seqspec\u001b[90m]\u001b[0m Reads (R2) may contain additional bases downstream of the variable-length region, e.g., adapter sequences.\n",
+ "100%|██████████| 2500/2500 [00:00<00:00, 15545.42it/s]"
+ ]
+ }
+ ],
+ "source": [
+ "qc = precellar.align(\n",
+ " assay, \"/data/kzhang/GRCh38/hg38.fa.gz\",\n",
+ " modality=\"atac\",\n",
+ " output_fragment=\"atac_fragments.tsv.zst\",\n",
+ " num_threads=32,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'frac_q30_bases_read1': 0.8179764705882353,\n",
+ " 'frac_valid_barcode': 1.0,\n",
+ " 'sequenced_read_pairs': 2500.0,\n",
+ " 'frac_q30_bases_barcode': 1.0,\n",
+ " 'frac_unmapped': 0.07640000000000002,\n",
+ " 'sequenced_reads': 5000.0,\n",
+ " 'frac_fragment_flanking_single_nucleosome': 0.0029791459781529296,\n",
+ " 'frac_confidently_mapped': 0.8524,\n",
+ " 'frac_fragment_in_nucleosome_free_region': 0.010427010923535254,\n",
+ " 'frac_q30_bases_read2': 0.9442745098039216,\n",
+ " 'frac_nonnuclear': 0.0128,\n",
+ " 'frac_duplicates': 0.004940711462450593}"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "qc"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}