-
Notifications
You must be signed in to change notification settings - Fork 1
/
get_new_bin_abundance_23.py
52 lines (45 loc) · 2.1 KB
/
get_new_bin_abundance_23.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# -*- coding: utf-8 -*-
"""
Created on Wed Nov 2 09:59:51 2022
@author: acer
"""
import json
import subprocess
samples=['W19-1','W19-2','W19-3','W19-4','W19-5','W19-6',
'W22-1','W22-2','W22-3','W22-4','W22-5','W22-6',
'W23-1','W23-2','W23-3','W23-4','W23-5','W23-6'
]
comples=['W23-1','W23-2','W23-3','W23-4','W23-5','W23-6'
]
dRep_clusters=json.load(open('/home/tina/Documents/RAT/bio_example/dRep_clusters.json'))
dRep_genomes={}
# print(len(dRep_clusters))
for d in dRep_clusters:
for e in dRep_clusters[d]:
dRep_genomes[e]=d
# print(len(dRep_genomes))
for comp in comples:
contig2bin=json.load(open('/net/phage/linuxhome/mgx/people/tina/wageningen/'
'2018/{}.contig2bin.json'.format(comp)))
for smp in samples:
bam_file=('/net/phage/linuxhome/mgx/people/tina/wageningen/'
'2018/bam/{}.fasta.{}_R1.fq.gz.bwamem.sorted'.format(comp,smp))
smp_vs_comp={}
print('{} vs {}'.format(smp,comp))
# open alignment file with samtools, so that we don't need pysam dependency
cmd=['samtools', 'view', bam_file]
proc=subprocess.Popen(cmd, stdout=subprocess.PIPE)
contig=''
# check whether it is a paired bam file by checking the flags for "read paired"
# flag and setting paired to true if it finds it
while contig!='*':
for read in proc.stdout:
read=read.decode("utf-8").rstrip().split('\t')
read_id, flag, contig, score=read[0], int(read[1]), read[2], int(read[4])
if contig in contig2bin and score>=2 and len(bin(flag))<14:
if contig in contig2bin and contig2bin[contig] in dRep_genomes:
if contig2bin[contig] not in smp_vs_comp:
smp_vs_comp[contig2bin[contig]]=0
smp_vs_comp[contig2bin[contig]]+=1
with open('/home/tina/tmp/test{}vs{}.json'.format(smp,comp), 'w') as outf:
outf.write(json.dumps(smp_vs_comp, indent=4))