Skip to content

Commit

Permalink
Merge pull request #668 from macs3-project/feat/macs3/fragmentfile
Browse files Browse the repository at this point in the history
Feat/macs3/fragmentfile
  • Loading branch information
taoliu authored Oct 23, 2024
2 parents f46aa3a + a6ddf28 commit bf8d0c8
Show file tree
Hide file tree
Showing 45 changed files with 14,273 additions and 10,384 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ other_test/
MACS3/IO/BedGraphIO.c
MACS3/IO/Parser.c
MACS3/IO/PeakIO.c
MACS3/IO/BAM.c
MACS3/Signal/BedGraph.c
MACS3/Signal/CallPeakUnit.c
MACS3/Signal/FixWidthTrack.c
Expand All @@ -28,6 +29,7 @@ MACS3/Signal/PairedEndTrack.c
MACS3/Signal/PeakDetect.c
MACS3/Signal/PeakModel.c
MACS3/Signal/Pileup.c
MACS3/Signal/PileupV2.c
MACS3/Signal/Prob.c
MACS3/Signal/RACollection.c
MACS3/Signal/ReadAlignment.c
Expand All @@ -37,6 +39,8 @@ MACS3/Signal/Signal.c
MACS3/Signal/SignalProcessing.c
MACS3/Signal/UnitigRACollection.c
MACS3/Signal/VariantStat.c
MACS3/Signal/PeakVariants.c
MACS3/Signal/PosReadsInfo.c

# MacOSX temp
.DS_Store
Expand Down
11 changes: 10 additions & 1 deletion ChangeLog
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,16 @@

* Features added

1) We extensively rewrote the `pyx` codes into `py` codes. In
1) We implemented the IO module for reading the fragment files
usually used in single-cell ATAC-seq experiment
`Parser.FragParser`. And we implemented a new
`PairedEndTrack.PETrackII` to store the data in fragment file,
including the barcodes and counts information. In the `PETrackII`
class, we are able to extract a subset using a list of barcodes,
which enables us to call peaks only on a pool (pseudo-bulk) of
cells.

2) We extensively rewrote the `pyx` codes into `py` codes. In
another words, we now apply the 'pure python style' with PEP-484
type annotations to our previous Cython style codes. So that, the
source codes can be more compatible to Python programming tools
Expand Down
8 changes: 4 additions & 4 deletions MACS3/Commands/callvar_cmd.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Time-stamp: <2024-10-02 16:34:23 Tao Liu>
# Time-stamp: <2024-10-11 10:28:07 Tao Liu>

"""Description: Call variants directly
Expand Down Expand Up @@ -137,11 +137,11 @@ def run(args):

peakio = open(peakbedfile)
peaks = PeakIO()
i = 0
#i = 0
for t_peak in peakio:
fs = t_peak.rstrip().split()
i += 1
peaks.add(fs[0].encode(), int(fs[1]), int(fs[2]), name=b"%d" % i)
# i += 1
peaks.add(fs[0].encode(), int(fs[1]), int(fs[2])) # , name=b"%d" % i)
peaks.sort()

# chrs = peaks.get_chr_names()
Expand Down
2 changes: 1 addition & 1 deletion MACS3/Commands/refinepeak_cmd.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Time-stamp: <2024-10-02 17:01:42 Tao Liu>
# Time-stamp: <2024-10-11 11:11:00 Tao Liu>

"""Description: refine peak summits
Expand Down
194 changes: 174 additions & 20 deletions MACS3/IO/Parser.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# cython: language_level=3
# cython: profile=True
# cython: linetrace=True
# Time-stamp: <2024-10-07 16:08:43 Tao Liu>
# Time-stamp: <2024-10-22 10:25:23 Tao Liu>

"""Module for all MACS Parser classes for input. Please note that the
parsers are for reading the alignment files ONLY.
Expand All @@ -28,7 +28,7 @@

from MACS3.Utilities.Constants import READ_BUFFER_SIZE
from MACS3.Signal.FixWidthTrack import FWTrack
from MACS3.Signal.PairedEndTrack import PETrackI
from MACS3.Signal.PairedEndTrack import PETrackI, PETrackII
from MACS3.Utilities.Logger import logging

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -199,7 +199,8 @@ def __init__(self, string, strand):
self.string = string

def __str__(self):
return repr("Strand information can not be recognized in this line: \"%s\",\"%s\"" % (self.string, self.strand))
return repr("Strand information can not be recognized in this line: \"%s\",\"%s\"" %
(self.string, self.strand))


@cython.cclass
Expand Down Expand Up @@ -544,7 +545,8 @@ def pe_parse_line(self, thisline: bytes):
atoi(thisfields[1]),
atoi(thisfields[2]))
except IndexError:
raise Exception("Less than 3 columns found at this line: %s\n" % thisline)
raise Exception("Less than 3 columns found at this line: %s\n" %
thisline)

@cython.ccall
def build_petrack(self):
Expand Down Expand Up @@ -950,7 +952,9 @@ def tlen_parse_line(self, thisline: bytes) -> cython.int:
thisfields = thisline.split(b'\t')
bwflag = atoi(thisfields[1])
if bwflag & 4 or bwflag & 512 or bwflag & 256 or bwflag & 2048:
return 0 #unmapped sequence or bad sequence or 2nd or sup alignment
# unmapped sequence or bad sequence or 2nd or sup alignment
return 0

if bwflag & 1:
# paired read. We should only keep sequence if the mate is mapped
# and if this is the left mate, all is within the flag!
Expand Down Expand Up @@ -1068,9 +1072,11 @@ def __init__(self, filename: str,
f.close()
if self.gzipped:
# open with gzip.open, then wrap it with BufferedReader!
self.fhd = io.BufferedReader(gzip.open(filename, mode='rb'), buffer_size=READ_BUFFER_SIZE) # buffersize set to 1M
self.fhd = io.BufferedReader(gzip.open(filename, mode='rb'),
buffer_size=READ_BUFFER_SIZE)
else:
self.fhd = io.open(filename, mode='rb') # binary mode! I don't expect unicode here!
# binary mode! I don't expect unicode here!
self.fhd = io.open(filename, mode='rb')

@cython.ccall
def sniff(self):
Expand All @@ -1089,7 +1095,8 @@ def sniff(self):
return True
else:
self.fhd.seek(0)
raise Exception("File is not of a valid BAM format! %d" % tsize)
raise Exception("File is not of a valid BAM format! %d" %
tsize)
else:
self.fhd.seek(0)
return False
Expand Down Expand Up @@ -1189,7 +1196,8 @@ def build_fwtrack(self):
rlengths: dict

fwtrack = FWTrack(buffer_size=self.buffer_size)
references, rlengths = self.get_references() # after this, ptr at list of alignments
# after this, ptr at list of alignments
references, rlengths = self.get_references()
# fseek = self.fhd.seek
fread = self.fhd.read
# ftell = self.fhd.tell
Expand Down Expand Up @@ -1248,7 +1256,9 @@ def append_fwtrack(self, fwtrack):
info("%d reads have been read." % i)
self.fhd.close()
# fwtrack.finalize()
# this is the problematic part. If fwtrack is finalized, then it's impossible to increase the length of it in a step of buffer_size for multiple input files.
# this is the problematic part. If fwtrack is finalized, then
# it's impossible to increase the length of it in a step of
# buffer_size for multiple input files.
fwtrack.set_rlengths(rlengths)
return fwtrack

Expand Down Expand Up @@ -1323,14 +1333,9 @@ def build_petrack(self):
if i % 1000000 == 0:
info(" %d fragments parsed" % i)

# print(f"{references[chrid]:},{fpos:},{tlen:}")
info("%d fragments have been read." % i)
# debug(f" {e1} Can't identify the length of entry, it may be the end of file, stop looping...")
# debug(f" {e2} Chromosome name can't be found which means this entry is skipped ...")
# assert i > 0, "Something went wrong, no fragment has been read! Check input file!"
self.d = m / i
self.n = i
# assert self.d >= 0, "Something went wrong (mean fragment size was negative: %d = %d / %d)" % (self.d, m, i)
self.fhd.close()
petrack.set_rlengths(rlengths)
return petrack
Expand All @@ -1349,9 +1354,7 @@ def append_petrack(self, petrack):
rlengths: dict

references, rlengths = self.get_references()
# fseek = self.fhd.seek
fread = self.fhd.read
# ftell = self.fhd.tell

# for convenience, only count valid pairs
add_loc = petrack.add_loc
Expand All @@ -1374,10 +1377,7 @@ def append_petrack(self, petrack):
info("%d fragments have been read." % i)
self.d = (self.d * self.n + m) / (self.n + i)
self.n += i
# assert self.d >= 0, "Something went wrong (mean fragment size was negative: %d = %d / %d)" % (self.d, m, i)
self.fhd.close()
# this is the problematic part. If fwtrack is finalized, then it's impossible to increase the length of it in a step of buffer_size for multiple input files.
# petrack.finalize()
petrack.set_rlengths(rlengths)
return petrack

Expand Down Expand Up @@ -1472,3 +1472,157 @@ def fw_parse_line(self, thisline: bytes) -> tuple:
1)
else:
raise StrandFormatError(thisline, thisfields[1])


@cython.cclass
class FragParser(GenericParser):
"""Parser for Fragment file containing scATAC-seq information.
Format:
chromosome frag_leftend frag_rightend barcode count
Note: Only the first five columns are used!
"""
n = cython.declare(cython.int, visibility='public')
d = cython.declare(cython.float, visibility='public')

@cython.cfunc
def skip_first_commentlines(self):
"""BEDPEParser needs to skip the first several comment lines.
"""
l_line: cython.int
thisline: bytes

for thisline in self.fhd:
l_line = len(thisline)
if thisline and (thisline[:5] != b"track") \
and (thisline[:7] != b"browser") \
and (thisline[0] != 35): # 35 is b"#"
break

# rewind from SEEK_CUR
self.fhd.seek(-l_line, 1)
return

@cython.cfunc
def pe_parse_line(self, thisline: bytes):
"""Parse each line, and return chromosome, left and right
positions, barcode and count.
"""
thisfields: list

thisline = thisline.rstrip()

# still only support tabular as delimiter.
thisfields = thisline.split(b'\t')
try:
return (thisfields[0],
atoi(thisfields[1]),
atoi(thisfields[2]),
thisfields[3],
atoi(thisfields[4]))
except IndexError:
raise Exception("Less than 5 columns found at this line: %s\n" %
thisline)

@cython.ccall
def build_petrack2(self):
"""Build PETrackII from all lines.
"""
chromosome: bytes
left_pos: cython.int
right_pos: cython.int
barcode: bytes
count: cython.uchar
i: cython.long = 0 # number of fragments
m: cython.long = 0 # sum of fragment lengths
tmp: bytes = b""

petrack = PETrackII(buffer_size=self.buffer_size)
add_loc = petrack.add_loc

while True:
# for each block of input
tmp += self.fhd.read(READ_BUFFER_SIZE)
if not tmp:
break
lines = tmp.split(b"\n")
tmp = lines[-1]
for thisline in lines[:-1]:
(chromosome, left_pos, right_pos, barcode, count) = self.pe_parse_line(thisline)
if left_pos < 0 or not chromosome:
continue
assert right_pos > left_pos, "Right position must be larger than left position, check your BED file at line: %s" % thisline
m += right_pos - left_pos
i += 1
if i % 1000000 == 0:
info(" %d fragments parsed" % i)
add_loc(chromosome, left_pos, right_pos, barcode, count)
# last one
if tmp:
(chromosome, left_pos, right_pos, barcode, count) = self.pe_parse_line(thisline)
if left_pos >= 0 and chromosome:
assert right_pos > left_pos, "Right position must be larger than left position, check your BED file at line: %s" % thisline
i += 1
m += right_pos - left_pos
add_loc(chromosome, left_pos, right_pos, barcode, count)

self.d = cython.cast(cython.float, m) / i
self.n = i
assert self.d >= 0, "Something went wrong (mean fragment size was negative)"

self.close()
petrack.set_rlengths({"DUMMYCHROM": 0})
return petrack

@cython.ccall
def append_petrack(self, petrack):
"""Build PETrackI from all lines, return a PETrackI object.
"""
chromosome: bytes
left_pos: cython.int
right_pos: cython.int
barcode: bytes
count: cython.uchar
i: cython.long = 0 # number of fragments
m: cython.long = 0 # sum of fragment lengths
tmp: bytes = b""

add_loc = petrack.add_loc
while True:
# for each block of input
tmp += self.fhd.read(READ_BUFFER_SIZE)
if not tmp:
break
lines = tmp.split(b"\n")
tmp = lines[-1]
for thisline in lines[:-1]:
(chromosome, left_pos, right_pos, barcode, count) = self.pe_parse_line(thisline)
if left_pos < 0 or not chromosome:
continue
assert right_pos > left_pos, "Right position must be larger than left position, check your BED file at line: %s" % thisline
m += right_pos - left_pos
i += 1
if i % 1000000 == 0:
info(" %d fragments parsed" % i)
add_loc(chromosome, left_pos, right_pos, barcode, count)
# last one
if tmp:
(chromosome, left_pos, right_pos, barcode, count) = self.pe_parse_line(thisline)
if left_pos >= 0 and chromosome:
assert right_pos > left_pos, "Right position must be larger than left position, check your BED file at line: %s" % thisline
i += 1
m += right_pos - left_pos
add_loc(chromosome, left_pos, right_pos, barcode, count)

self.d = (self.d * self.n + m) / (self.n + i)
self.n += i

assert self.d >= 0, "Something went wrong (mean fragment size was negative)"
self.close()
petrack.set_rlengths({"DUMMYCHROM": 0})
return petrack
Loading

0 comments on commit bf8d0c8

Please sign in to comment.