-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcommonlib.py
76 lines (59 loc) · 2.39 KB
/
commonlib.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import numpy as np
import ipdb
# convert the genome sequence into integer for efficient storage
# A->1, C->2, G->3, T->4
def convert_genome_str_to_int(genome_str):
conversion_table = { "A": 1, "C": 2, "G": 3, "T":4 }
return map(lambda x: conversion_table[x], genome_str)
# convert from int back to char
# 1->A, 2->C, 3->G, 4->T
def get_base_from_int(num):
conversion_table = ['A', 'C', 'G', 'T']
return conversion_table[num-1]
# given a list of base like 1,2,3,4 return a string combining them like 1234
def get_string_from_mer(mer):
return "".join(map(lambda x: "%d" % x, mer))
# given a string of int 1,2,3,4 return a numpy array of those number
def get_mer_from_int_str(int_str):
return np.array(map(lambda x: int(x), int_str), dtype=np.uint8)
# read the reference genome into a np array
def read_reference_genome(ref_file):
with open(ref_file, "r") as f:
lines = f.readlines()
# calculate how many bases we have
# assumming we have same number of base from line 1 -> n -1
total_base = len(lines[1].rstrip()) * (len(lines)-2) + len(lines[-1].rstrip())
# allocate memory as np array
return_array = np.zeros(total_base, dtype=np.uint8)
offset = 0
for line in lines[1:]:
line = line.rstrip()
return_array[offset:offset+len(line)] = convert_genome_str_to_int(line)
offset += len(line)
return return_array
# read the reference genome into a very long string
def read_reference_genome_bare(ref_file):
with open(ref_file, "r") as f:
lines = f.readlines()
del lines[0]
lines = map(lambda x: x.rstrip(), lines)
return "".join(lines)
# read the reads.txt input file and create 2d array,
# each nucleotid is also represented as integer
def read_all_reads(reads_file):
with open(reads_file, "r") as f:
lines = f.readlines()
# calculate how many read pairs we have
num_read_pairs = len(lines) - 1
# calculate how long is a read
read_length = len(lines[1].split(',')[0])
# allocate memory as np array
return_array = np.zeros((num_read_pairs, 2, read_length), dtype=np.uint8)
pair_idx = 0
for line in lines[1:]:
line = line.rstrip()
raw_read_pairs = line.split(',')
int_read_pairs = map(lambda x: convert_genome_str_to_int(x), raw_read_pairs)
return_array[pair_idx] = int_read_pairs
pair_idx += 1
return return_array