-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgene_multifasta_preparation.py
29 lines (24 loc) · 1.02 KB
/
gene_multifasta_preparation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import pandas as pd
# Build a multi-FASTA file from an Excel sheet of genes and their sequences.
#
# Every spreadsheet row becomes one FASTA record:
#   >gene name | gene description | species
#   sequence, wrapped at a fixed number of characters per line

# Excel file with genes and their sequences -- change this path.
excel_path = "/change/path/realdoc.xlsx"
# Multi-FASTA file to create -- change this path.
fasta_path = "/cnew/path/genes_multifasta.fasta"

# Column names in the Excel file.
gene_name_col = "gene name"
description_col = "gene description"
species_col = "species"
sequence_col = "nt"

# Number of sequence characters written per FASTA line.
line_width = 60


def header_field(value):
    """Return *value* as single-line header text.

    Empty cells (None / NaN / NA) become "" instead of the text "nan", and
    any run of whitespace -- including line breaks typed inside an Excel
    cell -- collapses to one space so the FASTA header stays on one line.
    """
    if pd.isna(value):
        return ""
    return " ".join(str(value).split())


def clean_sequence(raw_sequence):
    """Return the sequence as a string with every whitespace character removed.

    Empty cells (None / NaN / NA) yield "" rather than the text "nan".
    """
    if pd.isna(raw_sequence):
        return ""
    # str.split() with no argument splits on any whitespace (spaces, tabs,
    # "\n", "\r", ...), so re-joining the pieces drops all of it in one pass.
    return "".join(str(raw_sequence).split())


def format_fasta_record(gene_name, description, species, sequence,
                        width=line_width):
    """Return one FASTA record: a header line plus the wrapped sequence.

    The header is ">gene_name | description | species"; the sequence is
    cleaned with clean_sequence() and split into lines of *width* characters.
    The returned text always ends with a newline.
    """
    header = " | ".join(
        header_field(value) for value in (gene_name, description, species)
    )
    cleaned = clean_sequence(sequence)
    sequence_lines = [
        cleaned[start:start + width] for start in range(0, len(cleaned), width)
    ]
    return f">{header}\n" + "\n".join(sequence_lines) + "\n"


def write_multifasta(df, fasta_file):
    """Write one FASTA record per row of *df* to the open text file.

    Rows whose sequence cell is empty are skipped, because a header with no
    sequence is not a usable FASTA record.

    Returns a (written_count, skipped_gene_names) tuple.
    Raises KeyError if one of the configured columns is missing from *df*.
    """
    written = 0
    skipped = []
    rows = zip(
        df[gene_name_col], df[description_col], df[species_col], df[sequence_col]
    )
    for gene_name, description, species, raw_sequence in rows:
        if not clean_sequence(raw_sequence):
            skipped.append(header_field(gene_name))
            continue
        fasta_file.write(
            format_fasta_record(gene_name, description, species, raw_sequence)
        )
        written += 1
    return written, skipped


def main():
    """Read the Excel file at excel_path and write the multi-FASTA file."""
    df = pd.read_excel(excel_path)
    # Explicit encoding so non-ASCII descriptions are written the same way on
    # every platform.
    with open(fasta_path, "w", encoding="utf-8") as fasta_file:
        _, skipped = write_multifasta(df, fasta_file)
    if skipped:
        print(
            f"Skipped {len(skipped)} row(s) with no sequence: "
            + ", ".join(skipped)
        )


if __name__ == "__main__":
    main()