Handle errors better and add new features #72

Merged
merged 23 commits on Feb 5, 2025
67 changes: 51 additions & 16 deletions README.md
@@ -38,30 +38,29 @@ pip install --upgrade setuptools wheel
# see slow5lib for more info
# set this first to ensure pyslow5 installs with zstd:
# export PYSLOW5_ZSTD=1
# newer versions of pyslow5 have zstd libs included by default

# if GUPPY_VERSION=6.3.8
# modify requirements.txt to have:
# ont-pyguppy-client-lib==6.3.8
# if using DORADO_SERVER_VERSION=7.4.12
# ont-pybasecall-client-lib==7.4.12

python setup.py install

# Alternatively, the new way of building things is to do the following command
# pip install .
# Install using pip
pip install .

buttery-eel --help

```

Suppose the virtual environment you created is named venv3 and resides directly in the root of the cloned buttery-eel git repository. In that case, you can use the wrapper script available under `/path/to/repository/scripts/eel` to conveniently execute buttery-eel. This script will automatically source the virtual environment, find a free port, execute buttery-eel with the parameters you specified, and finally deactivate the virtual environment. If you add `/path/to/repository/scripts/` to your PATH environment variable, you can simply use buttery-eel as:
```
eel -g /path/to/ont-guppy/bin/ --config dna_r10.4.1_e8.2_400bps_hac_prom.cfg --device cuda:all -i reads.blow5 -o reads.reads # and any other parameters
eel -g /path/to/ont-guppy/bin/ --config dna_r10.4.1_e8.2_400bps_hac.cfg --device cuda:all -i reads.blow5 -o reads.reads # and any other parameters
```

Alternatively, you can manually execute buttery-eel if you have sourced the virtual environment. You must provide `--port PORT --use_tcp` parameters manually in this case. Example:
```
buttery-eel -g /path/to/ont-guppy/bin/ --config dna_r10.4.1_e8.2_400bps_hac_prom.cfg --device cuda:all -i reads.blow5 -o reads.reads.fastq --port 5000 --use_tcp # and any other parameters
buttery-eel -g /path/to/ont-guppy/bin/ --config dna_r10.4.1_e8.2_400bps_hac.cfg --device cuda:all -i reads.blow5 -o reads.reads.fastq --port 5000 --use_tcp # and any other parameters
```

Furthermore, if you are using the latest version with the dorado-server backend, you can simply set the port argument to `--port auto` and buttery-eel will automatically find a free port for you.
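As a sketch, such an invocation might look like this (the paths and config name are illustrative, not prescriptive):

```shell
# hypothetical example: let buttery-eel pick a free port itself (dorado-server backend)
buttery-eel -g /path/to/ont-dorado-server/bin/ --config dna_r10.4.1_e8.2_400bps_hac.cfg \
    --device cuda:all -i reads.blow5 -o reads.fastq --port auto --use_tcp
```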
@@ -78,13 +77,15 @@ v7.3.10

## Duplex calling

#### Duplex appears to be deprecated - leaving this here for legacy's sake

The duplex calling does work, so long as you provide a duplex model for `--config` and the `--duplex` flag.

However there are some things to note:
- Use a single blow5 file rather than many smaller ones.
- The basecalling server buffers the reads for 10 channels, and only on the 11th does it release the first. Buttery-eel sends 1 channel per client connection, controlled by `--procs`, so to force the basecaller to release the data it sends 10 "fake" reads to the basecaller with channel numbers >9000. This is mostly down to the poor implementation of duplex in the ONT library, so I can't really do much about it.
- You should write duplex data out using `.sam`. This will mean you get the duplex tags, dx:i:N, where N=0 is simplex, N=-1 is a parent of a duplex read, and N=1 is a duplex read.
- There is a bug in the ONT library, where if a read is split, and two reads from that split read are parents of a duplex read, one of those parent reads won't be flagged with dx:i:-1, but dx:i:0 instead. I have told ONT and they said they will fix it (Bug present in `ont-pybasecall-client-lib v7.4.12`)
- There is a bug in the ONT library, where if a read is split, and two reads from that split read are parents of a duplex read, one of those parent reads won't be flagged with dx:i:-1, but dx:i:0 instead. I have told ONT and they said they will fix it (Bug present in `ont-pybasecall-client-lib v7.4.12`, now fixed in `ont-pybasecall-client-lib v7.6.8`)
- When duplex first starts, it sequentially reads the whole blow5 file to create the channel groups. This can take a while, so please be patient.

I wouldn't recommend using duplex just yet because of the issues and poor performance.
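If you do want to try it, a minimal sketch of a duplex invocation might look like this (the duplex-capable config name is an assumption; check your basecaller's model list):

```shell
# hypothetical duplex run: one merged blow5 in, SAM out so the dx:i tags are kept
buttery-eel -g /path/to/ont-dorado-server/bin/ --config dna_r10.4.1_e8.2_400bps_sup.cfg \
    --duplex --device cuda:all -i reads.blow5 -o reads.sam --port auto --use_tcp
```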
@@ -95,9 +96,9 @@
The `--help` shown will be different for different versions of the ont library installed.

```
usage: buttery-eel [-h] -i INPUT -o OUTPUT -g BASECALLER_BIN --config CONFIG [--call_mods] [-q QSCORE] [--slow5_threads SLOW5_THREADS] [--procs PROCS] [--slow5_batchsize SLOW5_BATCHSIZE]
[--quiet] [--max_read_queue_size MAX_READ_QUEUE_SIZE] [--log LOG] [--moves_out] [--trim_adapters] [--seq_sum] [--barcode_kits BARCODE_KITS] [--enable_trim_barcodes]
[--require_barcodes_both_ends] [--duplex] [--single] [--profile] [-v]
usage: buttery-eel [-h] -i INPUT -o OUTPUT [-g BASECALLER_BIN] [--config CONFIG] [--call_mods] [-q QSCORE] [--slow5_threads SLOW5_THREADS] [--procs PROCS] [--slow5_batchsize SLOW5_BATCHSIZE] [--quiet]
[--max_read_queue_size MAX_READ_QUEUE_SIZE] [--log LOG] [--moves_out] [--max_batch_time MAX_BATCH_TIME] [--resume RESUME] [--trim_adapters] [--seq_sum] [--barcode_kits BARCODE_KITS]
[--enable_trim_barcodes] [--require_barcodes_both_ends] [--U2T] [--estimate_poly_a] [--poly_a_config POLY_A_CONFIG] [--duplex] [--single] [--profile] [-v]

buttery-eel - wrapping ONT basecallers (guppy/dorado) for SLOW5 basecalling

@@ -127,6 +128,9 @@ Run Options:
Number of reads to process at a time reading slow5 (default: 20000)
--log LOG basecaller log folder path (default: buttery_basecaller_logs)
--moves_out output move table (sam format only) (default: False)
--max_batch_time MAX_BATCH_TIME
Maximum seconds to wait for batch to be basecalled before killing basecalling. Used to detect locked states/hung servers. Default=1200 (20min) (default: 1200)
--resume RESUME Resume a sequencing run. fastq or sam input. (default: None)

Sequencing summary Options:
--seq_sum Write out sequencing_summary.txt file (default: False)
@@ -142,6 +146,12 @@ Barcode demultiplexing Options:
--require_barcodes_both_ends
Flag indicating that barcodes must be at both ends. (default: False)

RNA options:
--U2T Convert Uracil (U) to Thymine (T) in direct RNA output (default: False)
--estimate_poly_a Perform polyA/T tail length estimation (default: False)
--poly_a_config POLY_A_CONFIG
Filename of a custom polyA/T configuration to use for estimation. (default: None)

Duplex Options:
--duplex Turn on duplex calling - channel based - See README for information (default: False)
--single use only a single proc for testing - DUPLEX TESTING (default: False)
@@ -167,12 +177,28 @@ Some common args are:
- `-x/--device`: Specify CPU or GPU device: 'cpu', 'cuda:all', 'auto' or 'cuda:<device_id>[,<device_id> ...]', the default is 'cpu'. Specifying 'auto' will choose either 'cpu' or 'cuda:all' depending on the presence of a cuda device.
- `--use_tcp`: Make connections on a tcp port instead of a Unix socket file. This flag has no effect on Windows as connections are always via tcp.
- `-p/--port`: Port for hosting the service. Specify "auto" to make the server automatically search for a free port.
- `--dorado_model_path`/`--dorado_modbase_models`: Can be provided instead of using `--config` the same way you can define a model path in dorado. The modbase models can be added like this `--dorado_modbase_models 5mC_5hmC@v1,6mA@v2` where 2 models are given separated by a comma with no spaces.

For more information on model paths and modbase models, please refer to the models table in the dorado documentation [here](https://github.com/nanoporetech/dorado/tree/release-v0.9?tab=readme-ov-file#dna-models)
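As a sketch (the model directory name below is an assumption based on the dorado models table; substitute your own download):

```shell
# hypothetical: point buttery-eel at a downloaded dorado model instead of a .cfg
buttery-eel -g /path/to/ont-dorado-server/bin/ \
    --dorado_model_path /path/to/models/dna_r10.4.1_e8.2_400bps_hac@v4.3.0 \
    --dorado_modbase_models 5mC_5hmC@v1,6mA@v2 \
    --device cuda:all -i reads.blow5 -o reads.sam --port auto --use_tcp
```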

## Resume a run

If a run did not complete, crashed, or was interrupted for some reason, you can resume it with the `--resume <failed_run.fastq/sam>` flag, providing the fastq/sam file of the failed run.

Make sure the new run's `-o/--output` filename is different from the file given to `--resume`, otherwise the previous run's file will be overwritten and you will lose that data.

Once the resumed run is completed, you can merge the output files.
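A sketch of the resume-then-merge flow (filenames are illustrative):

```shell
# hypothetical: resume a failed run into a NEW output file
buttery-eel -g /path/to/ont-dorado-server/bin/ --config dna_r10.4.1_e8.2_400bps_hac.cfg \
    --device cuda:all -i reads.blow5 --resume run1_failed.fastq -o run2_resumed.fastq \
    --port auto --use_tcp

# once the resumed run completes, fastq outputs can simply be concatenated
cat run1_failed.fastq run2_resumed.fastq > merged.fastq
```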


### Estimate polyA/T tails

The arguments `--estimate_poly_a` and `--poly_a_config` work the same way they do in dorado-server. Please see their documentation [here](https://github.com/nanoporetech/dorado/blob/release-v0.9/documentation/PolyTailConfig.md) for more information.
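For instance (assuming a direct RNA run; the config name here is illustrative):

```shell
# hypothetical: direct RNA basecalling with polyA tail length estimation
buttery-eel -g /path/to/ont-dorado-server/bin/ --config rna004_130bps_hac.cfg \
    --estimate_poly_a --device cuda:all -i reads.blow5 -o reads.sam --port auto --use_tcp
```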


## Usage for older versions < 7.3.10
```
usage: buttery-eel [-h] -i INPUT -o OUTPUT -g BASECALLER_BIN --config CONFIG [--call_mods] [-q QSCORE] [--slow5_threads SLOW5_THREADS] [--procs PROCS] [--slow5_batchsize SLOW5_BATCHSIZE]
[--quiet] [--max_read_queue_size MAX_READ_QUEUE_SIZE] [--log LOG] [--moves_out] [--do_read_splitting] [--min_score_read_splitting MIN_SCORE_READ_SPLITTING]
usage: buttery-eel [-h] -i INPUT -o OUTPUT [-g BASECALLER_BIN] --config CONFIG [--call_mods] [-q QSCORE] [--slow5_threads SLOW5_THREADS] [--procs PROCS] [--slow5_batchsize SLOW5_BATCHSIZE] [--quiet]
[--max_read_queue_size MAX_READ_QUEUE_SIZE] [--log LOG] [--moves_out] [--max_batch_time MAX_BATCH_TIME] [--resume RESUME] [--do_read_splitting] [--min_score_read_splitting MIN_SCORE_READ_SPLITTING]
[--detect_adapter] [--min_score_adapter MIN_SCORE_ADAPTER] [--trim_adapters] [--detect_mid_strand_adapter] [--seq_sum] [--barcode_kits BARCODE_KITS] [--enable_trim_barcodes]
[--require_barcodes_both_ends] [--detect_mid_strand_barcodes] [--min_score_barcode_front MIN_SCORE_BARCODE_FRONT] [--min_score_barcode_rear MIN_SCORE_BARCODE_REAR]
[--min_score_barcode_mid MIN_SCORE_BARCODE_MID] [--profile] [-v]
@@ -205,6 +231,9 @@ Run Options:
Number of reads to process at a time reading slow5 (default: 20000)
--log LOG basecaller log folder path (default: buttery_basecaller_logs)
--moves_out output move table (sam format only) (default: False)
--max_batch_time MAX_BATCH_TIME
Maximum seconds to wait for batch to be basecalled before killing basecalling. Used to detect locked states/hung servers. Default=1200 (20min) (default: 1200)
--resume RESUME Resume a sequencing run. fastq or sam input. (default: None)

Sequencing summary Options:
--seq_sum Write out sequencing_summary.txt file (default: False)
@@ -237,14 +266,19 @@ Barcode demultiplexing Options:
Minimum score for a rear barcode to be classified (default: 60.0)
--min_score_barcode_mid MIN_SCORE_BARCODE_MID
Minimum score for mid barcodes to be detected (default: 60.0)

```

## Aligning uSAM output and getting sorted bam using -y in minimap2

samtools fastq -TMM,ML test.mod.sam | minimap2 -ax map-ont -y ref.fa - | samtools view -Sb - | samtools sort - > test.aln.mod.bam
samtools fastq -TMM,ML test.mod.sam | minimap2 -ax map-ont -y -Y ref.fa - | samtools sort - > test.aln.mod.bam

If you also wish to keep the quality scores in the unofficial qs tag, or if you are mapping a regular unmapped sam, the -T argument can be used in conjunction with minimap2 -y, for example `-TMM,ML,qs` or `-Tqs`. You can also keep all sam tags with `-T'*'`, but you need samtools v1.16 or higher.

#### samtools v1.16 and higher

samtools fastq -T '*' test.mod.sam | minimap2 -ax map-ont -y -Y ref.fa - | samtools sort - > test.aln.mod.bam


# Shutting down server

Expand Down Expand Up @@ -275,9 +309,9 @@ kill <PID>

ONT have 4 basecallers.
- `Albacore` (archived)
- `Guppy` (current - production)
- `Guppy` (archived)
- `Bonito` (research)
- `Dorado` (preview - future production)
- `Dorado` (current - production)

`Albacore` (not used anymore) and `Guppy` are closed-source software, and researchers are required to sign a developer agreement to gain access to the source code. Researchers can't share any modified versions they make, and there are a number of restrictions in place that are too boring to talk about.

@@ -302,6 +336,7 @@ There is now a new python library/API for the dorado-server builds called `ont-p
- My partner Hilary for coming up with the name.
- Matt Loose and Alexander Payne for having the basics of this all along in your readfish code and being awesome in general
- ONT and their open source code of bonito and dorado for handling uSAM writing.
- Mark Bicknell at ONT, who has been very helpful and patient with me while figuring out how to use newly released features.
- Lastly, I'd like to say I'm a little surprised this wasn't suggested to us by the devs at ONT when they were rejecting our pull requests on Guppy, Bonito, and Dorado. Oh well.

# Software used
Expand All @@ -310,7 +345,7 @@ There is now a new python library/API for the dorado-server builds called `ont-p
- [ONT dorado-server](https://community.nanoporetech.com/downloads)
- [ONT ont-pyguppy-client-lib](https://pypi.org/project/ont-pyguppy-client-lib/)
- [ONT ont-pyguppy-client-lib](https://pypi.org/project/ont-pybasecall-client-lib/)
- basecaller code and flow mostly follows the methods used in [readfish](https://github.com/LooseLab/readfish/blob/23dd37117bce576b99caf097e7711dc87d30fa0a/ru/basecall.py) by Matt Loose and Alexander Payne
- basecaller code and flow mostly follows the methods used in [readfish](https://github.com/LooseLab/readfish/blob/23dd37117bce576b99caf097e7711dc87d30fa0a/ru/basecall.py) by Matt Loose and Alexander Payne (though not so much anymore from ~v0.7.0)


# Citation
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,4 +1,4 @@
numpy
pyslow5>=1.1.0
#ont-pyguppy-client-lib==6.5.7
ont-pybasecall-client-lib==7.4.12
ont-pybasecall-client-lib==7.6.8
163 changes: 163 additions & 0 deletions scripts/error_test.py
@@ -0,0 +1,163 @@
import pyslow5
import sys
import multiprocessing as mp
import time

from io import StringIO
from contextlib import contextmanager, redirect_stdout

try:
from pybasecall_client_lib.pyclient import PyBasecallClient as pclient
from pybasecall_client_lib import helper_functions

except ImportError:
# maybe I can do a version check or something? hard to do from this side of the software
print("Could not load pybasecall_client_lib, trying the earlier (<=7.2.15) pyguppy client lib")
try:
from pyguppy_client_lib.pyclient import PyGuppyClient as pclient
from pyguppy_client_lib import helper_functions
except ImportError:
print("Can't import pybasecall_client_lib or pyguppy_client_lib, please check environment and try again.")
sys.exit(1)



def reader_proc(fname):
print("before")
s5 = pyslow5.Open(fname, 'r', DEBUG=1)
print("after")
print(s5.get_num_read_groups())


def worker_proc(address, config, fname):
print("worker_proc")
print("worker_proc before")
sys.exit(1)


connected = False
for t in range(3):
print("t:", t)
if connected:
continue
client_sub = pclient(address=address, config=config)
connected = True
with client_sub as client:
for i in range(15):
print("i:", i)
print("status: {}".format(client.get_status()))
print(type(client.get_status()))
if client.get_status() == client.status.connected:
print("client is connected!")
else:
print("client is not connected!")
connected = False
# print("error_message: {}".format(client.get_error_message()))
# print("internal state: {}".format(client.get_internal_state()))
# print("server information: {}".format(client.get_server_information(address, 2000)))
# if i == 3 and t < 2:
# result = client.disconnect()
# print("disconnect result:", result)
# connected = False
if not connected:
break
time.sleep(i)
# s5 = pyslow5.Open(fname, 'r', DEBUG=1)
# print(s5.get_num_read_groups())
print("worker_proc after forloop")
if not connected:
raise ConnectionAbortedError
print("worker_proc with")




# region start basecaller
@contextmanager
def start_guppy_server_and_client(basecaller_bin):
"""
Thanks to Alex Payne for the basis of this code to appropriately handle the server and client connections
https://gist.github.com/alexomics/043bb120c74161e5b93e1b68fb00206c

Starts server and connects client
TODO: allow a connection to existing guppy server
"""
server_args = []

server_args.extend(["--log_path", "./logs",
"--config", "dna_r10.4.1_e8.2_400bps_5khz_fast.cfg",
"--port", "auto",
"--use_tcp",
"--device", "cuda:all",
# "--max_queued_reads", args.max_queued_reads,
# "--chunk_size", args.chunk_size,
])
# the high priority queue uses a different batch size which alters the basecalls when called with dorado
# leaving this on default should set it to medium and give 'correct' results
# funny enough, it will call R9.4.1 data at higher accuracy, and the opposite impact on R10.4.1
# params = {"priority": PyGuppyClient.high_priority}
params = {}

# This function has its own prints that may want to be suppressed
with redirect_stdout(StringIO()) as fh:
server, port = helper_functions.run_server(server_args, bin_path=basecaller_bin)

if port == "ERROR":
raise RuntimeError("Server couldn't be started")

if port.startswith("ipc"):
address = "{}".format(port)
else:
address = "localhost:{}".format(port)
client = pclient(address=address, config="dna_r10.4.1_e8.2_400bps_5khz_fast.cfg")


print("Setting params...")
client.set_params(params)
print("Connecting...")
try:
with client:
yield [client, address, "dna_r10.4.1_e8.2_400bps_5khz_fast.cfg", params]
finally:
server.terminate()


def main():

mp.set_start_method('spawn')

# filename = sys.argv[1]

# reader = mp.Process(target=reader_proc, args=(filename,), name='read_worker')
# reader.start()

# reader.join()
# print("reader exitcode:", reader.exitcode)
# if reader.exitcode < 0:
# sys.exit(1)
# print("done")

# filename = sys.argv[1]
basecaller_bin = sys.argv[1]

with start_guppy_server_and_client(basecaller_bin) as client_one:
client, address, config, params = client_one
print(client)
print("guppy/dorado started...")
print("basecaller version:", "{}".format(".".join([str(i) for i in client.get_software_version()])))
print()

worker = mp.Process(target=worker_proc, args=(address, config, "t.blow5",), name='worker_proc')
worker.start()

worker.join()
print("worker exitcode:", worker.exitcode)
if worker.exitcode != 0:
print("killing program")
sys.exit(1)
print("last print before closing server")
print("Server closed")


if __name__ == '__main__':
main()
10 changes: 5 additions & 5 deletions scripts/split_qscore.py
@@ -136,18 +136,18 @@ def find_score(read, ftype):
if ftype == "fastq":
header = read[0].split(" ")
for i in header:
if i[:4] == "qs:i":
score = int(i[5:])
if i[:4] in ["qs:i", "qs:f"]:
score = float(i[5:])
elif i.split("=")[0] in ["qscore", "mean_qscore"]:
score = int(i.split("=")[1])
score = float(i.split("=")[1])
else:
score = None

elif ftype == "sam":
sread = read.split("\t")
for i in sread:
if i[:4] == "qs:i":
score = int(i[5:])
if i[:4] in ["qs:i", "qs:f"]:
score = float(i[5:])
else:
score = None
return score
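The change above makes the tag parsing accept both integer (`qs:i`) and float (`qs:f`) mean q-score tags and return a float. The same logic as a minimal standalone sketch (a hypothetical helper, not part of the script itself):

```python
def parse_qscore_tag(fields):
    """Return the mean q-score from SAM-style tag fields, or None if absent.

    Accepts both qs:i:<int> (older basecallers) and qs:f:<float> (newer ones).
    """
    for field in fields:
        if field[:4] in ("qs:i", "qs:f"):
            return float(field[5:])  # strip the "qs:i:" / "qs:f:" prefix
    return None

print(parse_qscore_tag(["rd:Z:x", "qs:i:17"]))  # -> 17.0
print(parse_qscore_tag(["qs:f:21.35"]))         # -> 21.35
print(parse_qscore_tag(["NM:i:3"]))             # -> None
```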