-
Notifications
You must be signed in to change notification settings - Fork 23
AB TrimAl
This reimplements portions of the excellent software trimAl, allowing the removal of poorly aligned regions to increase overall alignment quality.
Note: If working from annotated GenBank files, any feature spanning columns that are removed will be truncated as appropriate, and will be deleted completely if all of its columns are removed.
The trimal function contains four distinct modes: 'clean', 'all' (or 'no_gaps'), 'gappyout', and a threshold mode controlled by supplying a number.
clean: Removes any columns that are 100% gaps (example 1).
all: Removes all columns with any number of gaps (example 2).
gappyout: A more involved algorithm that takes into account the distribution of alignment gaps to automatically select the most appropriate threshold (example 3).
Threshold ( int ): Specifically set the maximum number of gaps a column may contain without being removed (example 4).
Threshold ( float ): Use a fractional proportion (0.05, 0.65, 0.99, etc.) less than 1 to set the maximum proportion of gaps a column may contain without being removed (example 5).
#NEXUS
begin data;
dimensions ntax=7 nchar=216;
format datatype=protein missing=? gap=-;
matrix
Mle_Panxα9 --ML-ILSKF--KGVTPFKGR-QYTGSVIS--GFKKFG--STFAEDY-VRLLHN--PVES-DQELFSIGD--FKTYGSEW----IKKLKL-EDNLATEA-FLRDDSA---IKHMYFQRKL-RGNGKAL----------------------------------------------------------------------------------------
Mle_Panxα7A -MGV-ILFPI--RATAP-KSRQQLGGAYIA--GFSRDEEYERFAEEW-VAMLQN--PVED-GFEKFKFGS--FAEYGLLW----DTRNP--LNNVQSLS-SLDRSPAVGVISKLYLKDESPRAEVLKRRSKKVKVPSPRKPKLLFHEEIKKKLIKRTERKDDNLTKYAQEDVLDSEYVVVEQSVPETMTEQESVEESVPEISKAEQEGGSSDHIDV
Mle_Panxα8 -MVL-V-ALF--PRLAPFKVR-SYTGSVIE--GFLKVP--VEFAKDY-ISMLHN--PVET-AEELFHIGS--FFSYGVDW----AAEEPEPGEDYKTLA-FLRDEPA---YSEIWRVGGE-----------------------------VRCLVERC-------------FSLNRPHI--------------------PHLVK-------------
Mle_Panxα5 -MIYWV-AVF--KRMAPFKVG-IIA-STIK--GFEDSFRSEAFVDEY-INLLHN--PLTRDEEELFKIGN--FVTYGFEW----VGSQVP-NGSTYTLT-FWND-WN---LRHLYWNEYV-QYNGHLKTS--------------------------------------------------------------------------------------
Mle_Panxα4 -MVIEL-AGY--KGLSPFKDR-QYS-TLIA--GFTKFH--PQFAEDY-IKLLCL--ASGSPEQQMFEYGSNTWYRYGADWYGTRFSSYHE-TNNSITLG-MLPDKPS---YRYVFMTTSK-NADVIENLS-----------------------------------------KLDSSVI----------------------------ELGSKDSI--
Mle_Panxα3 MLLLGS-GTI--KNLS-FKDS-QYT-KNIS--GFTKFG--EDFSQDY-LKIMSDYYHCTT-ETPSFQVGD--FKTYGIEW----LRQFPN-PENYSTSGHFSHKHPG---WKFMYYKHLR-IGHVPGE-------------------------------------------YLTDPA---------------------------------------
Mle_Panxα6 -MLLEI-ANF--KGAT-FKER-QYT-GIIA--GLTKFS--AAFAEDYILAMLHN--PVDG-DDVTFESGR--YLTYGSEW----FASLDK-QSNYTSFC-FLKENSK---LKFIYFEEKN-KQHLKGSKD-----------------------------------------ILV------------------------------------------
;
end;
$: alb Mnemiopsis.nex -trm clean
#NEXUS
begin data;
dimensions ntax=7 nchar=212;
format datatype=protein missing=? gap=-;
matrix
Mle_Panxα9 --ML-ILSKFKGVTPFKGR-QYTGSVISGFKKFG--STFAEDY-VRLLHN--PVES-DQELFSIGD--FKTYGSEW----IKKLKL-EDNLATEA-FLRDDSA---IKHMYFQRKL-RGNGKAL----------------------------------------------------------------------------------------
Mle_Panxα7A -MGV-ILFPIRATAP-KSRQQLGGAYIAGFSRDEEYERFAEEW-VAMLQN--PVED-GFEKFKFGS--FAEYGLLW----DTRNP--LNNVQSLS-SLDRSPAVGVISKLYLKDESPRAEVLKRRSKKVKVPSPRKPKLLFHEEIKKKLIKRTERKDDNLTKYAQEDVLDSEYVVVEQSVPETMTEQESVEESVPEISKAEQEGGSSDHIDV
Mle_Panxα8 -MVL-V-ALFPRLAPFKVR-SYTGSVIEGFLKVP--VEFAKDY-ISMLHN--PVET-AEELFHIGS--FFSYGVDW----AAEEPEPGEDYKTLA-FLRDEPA---YSEIWRVGGE-----------------------------VRCLVERC-------------FSLNRPHI--------------------PHLVK-------------
Mle_Panxα5 -MIYWV-AVFKRMAPFKVG-IIA-STIKGFEDSFRSEAFVDEY-INLLHN--PLTRDEEELFKIGN--FVTYGFEW----VGSQVP-NGSTYTLT-FWND-WN---LRHLYWNEYV-QYNGHLKTS--------------------------------------------------------------------------------------
Mle_Panxα4 -MVIEL-AGYKGLSPFKDR-QYS-TLIAGFTKFH--PQFAEDY-IKLLCL--ASGSPEQQMFEYGSNTWYRYGADWYGTRFSSYHE-TNNSITLG-MLPDKPS---YRYVFMTTSK-NADVIENLS-----------------------------------------KLDSSVI----------------------------ELGSKDSI--
Mle_Panxα3 MLLLGS-GTIKNLS-FKDS-QYT-KNISGFTKFG--EDFSQDY-LKIMSDYYHCTT-ETPSFQVGD--FKTYGIEW----LRQFPN-PENYSTSGHFSHKHPG---WKFMYYKHLR-IGHVPGE-------------------------------------------YLTDPA---------------------------------------
Mle_Panxα6 -MLLEI-ANFKGAT-FKER-QYT-GIIAGLTKFS--AAFAEDYILAMLHN--PVDG-DDVTFESGR--YLTYGSEW----FASLDK-QSNYTSFC-FLKENSK---LKFIYFEEKN-KQHLKGSKD-----------------------------------------ILV------------------------------------------
;
end;
$: alb Mnemiopsis.nex -trm all
#NEXUS
begin data;
dimensions ntax=7 nchar=89;
format datatype=protein missing=? gap=-;
matrix
Mle_Panxα9 MLISKFKGVTKGRQYTSVISGFKKFGSTFAEDYVRLLHNPVESDQELFSIGDFKTYGSEWIKKLKEDNLATEAFLRDSAIKHMYFQRKL
Mle_Panxα7A GVIFPIRATAKSRQLGAYIAGFSRDEERFAEEWVAMLQNPVEDGFEKFKFGSFAEYGLLWDTRNPLNNVQSLSSLDRPAISKLYLKDES
Mle_Panxα8 VLVALFPRLAKVRSYTSVIEGFLKVPVEFAKDYISMLHNPVETAEELFHIGSFFSYGVDWAAEEPGEDYKTLAFLRDPAYSEIWRVGGE
Mle_Panxα5 IYVAVFKRMAKVGIIASTIKGFEDSFEAFVDEYINLLHNPLTREEELFKIGNFVTYGFEWVGSQVNGSTYTLTFWNDWNLRHLYWNEYV
Mle_Panxα4 VILAGYKGLSKDRQYSTLIAGFTKFHPQFAEDYIKLLCLASGSEQQMFEYGSWYRYGADWFSSYHTNNSITLGMLPDPSYRYVFMTTSK
Mle_Panxα3 LLSGTIKNLSKDSQYTKNISGFTKFGEDFSQDYLKIMSDHCTTETPSFQVGDFKTYGIEWLRQFPPENYSTSGFSHKPGWKFMYYKHLR
Mle_Panxα6 LLIANFKGATKERQYTGIIAGLTKFSAAFAEDYLAMLHNPVDGDDVTFESGRYLTYGSEWFASLDQSNYTSFCFLKESKLKFIYFEEKN
;
end;
$: alb Mnemiopsis.nex -trm gappyout
#NEXUS
begin data;
dimensions ntax=7 nchar=104;
format datatype=protein missing=? gap=-;
matrix
Mle_Panxα9 -MLISKFKGVTPFKGRQYTSVISGFKKFGSTFAEDYVRLLHNPVESDQELFSIGDFKTYGSEWIKKLKLEDNLATEAFLRDDSAIKHMYFQRKLRGNGKAL---
Mle_Panxα7A MGVIFPIRATAP-KSRQLGAYIAGFSRDEERFAEEWVAMLQNPVEDGFEKFKFGSFAEYGLLWDTRNP-LNNVQSLSSLDRSPAISKLYLKDESRAEVLKRVLD
Mle_Panxα8 MVLVALFPRLAPFKVRSYTSVIEGFLKVPVEFAKDYISMLHNPVETAEELFHIGSFFSYGVDWAAEEPEGEDYKTLAFLRDEPAYSEIWRVGGE-------SLN
Mle_Panxα5 MIYVAVFKRMAPFKVGIIASTIKGFEDSFEAFVDEYINLLHNPLTREEELFKIGNFVTYGFEWVGSQVPNGSTYTLTFWND-WNLRHLYWNEYVQYNGHLK---
Mle_Panxα4 MVILAGYKGLSPFKDRQYSTLIAGFTKFHPQFAEDYIKLLCLASGSEQQMFEYGSWYRYGADWFSSYHETNNSITLGMLPDKPSYRYVFMTTSKNADVIENKLD
Mle_Panxα3 LLLSGTIKNLS-FKDSQYTKNISGFTKFGEDFSQDYLKIMSDHCTTETPSFQVGDFKTYGIEWLRQFPNPENYSTSGFSHKHPGWKFMYYKHLRIGHVPGEYLT
Mle_Panxα6 MLLIANFKGAT-FKERQYTGIIAGLTKFSAAFAEDYLAMLHNPVDGDDVTFESGRYLTYGSEWFASLDKQSNYTSFCFLKENSKLKFIYFEEKNKQHLKGSILV
;
end;
$: alb Mnemiopsis.nex -trm 3
#NEXUS
begin data;
dimensions ntax=7 nchar=110;
format datatype=protein missing=? gap=-;
matrix
Mle_Panxα9 -ML-ISKFKGVTPFKGRQYTSVISGFKKFGSTFAEDYVRLLHNPVESDQELFSIGDFKTYGSEWIKKLKLEDNLATEAFLRDDSAIKHMYFQRKLRGNGKAL--------
Mle_Panxα7A MGV-IFPIRATAP-KSRQLGAYIAGFSRDEERFAEEWVAMLQNPVEDGFEKFKFGSFAEYGLLWDTRNP-LNNVQSLSSLDRSPAISKLYLKDESRAEVLKRRSVLDSEY
Mle_Panxα8 MVL-VALFPRLAPFKVRSYTSVIEGFLKVPVEFAKDYISMLHNPVETAEELFHIGSFFSYGVDWAAEEPEGEDYKTLAFLRDEPAYSEIWRVGGE---------SLNRPH
Mle_Panxα5 MIYWVAVFKRMAPFKVGIIASTIKGFEDSFEAFVDEYINLLHNPLTREEELFKIGNFVTYGFEWVGSQVPNGSTYTLTFWND-WNLRHLYWNEYVQYNGHLKTS------
Mle_Panxα4 MVIELAGYKGLSPFKDRQYSTLIAGFTKFHPQFAEDYIKLLCLASGSEQQMFEYGSWYRYGADWFSSYHETNNSITLGMLPDKPSYRYVFMTTSKNADVIENLSKLDSSV
Mle_Panxα3 LLLGSGTIKNLS-FKDSQYTKNISGFTKFGEDFSQDYLKIMSDHCTTETPSFQVGDFKTYGIEWLRQFPNPENYSTSGFSHKHPGWKFMYYKHLRIGHVPGE--YLTDPA
Mle_Panxα6 MLLEIANFKGAT-FKERQYTGIIAGLTKFSAAFAEDYLAMLHNPVDGDDVTFESGRYLTYGSEWFASLDKQSNYTSFCFLKENSKLKFIYFEEKNKQHLKGSKDILV---
;
end;
$: alb Mnemiopsis.nex -trm 0.75
#NEXUS
begin data;
dimensions ntax=7 nchar=138;
format datatype=protein missing=? gap=-;
matrix
Mle_Panxα9 -ML-ILSKFKGVTPFKGRQYTGSVISGFKKFG--STFAEDYVRLLHNPVES-DQELFSIGDFKTYGSEWIKKLKLEDNLATEAFLRDDSAIKHMYFQRKLRGNGKAL-------------------------------
Mle_Panxα7A MGV-ILFPIRATAP-KSRQLGGAYIAGFSRDEEYERFAEEWVAMLQNPVED-GFEKFKFGSFAEYGLLWDTRNP-LNNVQSLSSLDRSPAISKLYLKDESRAEVLKRRSKKKLIKRTDVLDSEYVPEISKEGGSSDHI
Mle_Panxα8 MVL-V-ALFPRLAPFKVRSYTGSVIEGFLKVP--VEFAKDYISMLHNPVET-AEELFHIGSFFSYGVDWAAEEPEGEDYKTLAFLRDEPAYSEIWRVGGE---------VRCLVERCFSLNRPHIPHLVK--------
Mle_Panxα5 MIYWV-AVFKRMAPFKVGIIA-STIKGFEDSFRSEAFVDEYINLLHNPLTRDEEELFKIGNFVTYGFEWVGSQVPNGSTYTLTFWND-WNLRHLYWNEYVQYNGHLKTS-----------------------------
Mle_Panxα4 MVIEL-AGYKGLSPFKDRQYS-TLIAGFTKFH--PQFAEDYIKLLCLASGSPEQQMFEYGSWYRYGADWFSSYHETNNSITLGMLPDKPSYRYVFMTTSKNADVIENLS---------KLDSSVI-----ELGSKDSI
Mle_Panxα3 LLLGS-GTIKNLS-FKDSQYT-KNISGFTKFG--EDFSQDYLKIMSDHCTT-ETPSFQVGDFKTYGIEWLRQFPNPENYSTSGFSHKHPGWKFMYYKHLRIGHVPGE-----------YLTDPA--------------
Mle_Panxα6 MLLEI-ANFKGAT-FKERQYT-GIIAGLTKFS--AAFAEDYLAMLHNPVDG-DDVTFESGRYLTYGSEWFASLDKQSNYTSFCFLKENSKLKFIYFEEKNKQHLKGSKD---------ILV-----------------
;
end;