Skip to content

Commit

Permalink
Merge pull request #18 from nsoranzo/tighten_phylip_datatype
Browse files Browse the repository at this point in the history
Sniff relaxed and strict interleaved PHYLIP files
  • Loading branch information
mvdbeek authored Mar 15, 2021
2 parents de7b74b + c5554ae commit 9b6d4c1
Show file tree
Hide file tree
Showing 5 changed files with 135 additions and 25 deletions.
20 changes: 17 additions & 3 deletions .github/workflows/maintenance_bot.yaml
Original file line number Diff line number Diff line change
@@ -1,17 +1,31 @@
name: "Maintenance Bot"
name: Maintenance Bot
on:
pull_request_target:
types: [opened, reopened]
types: [opened, reopened, edited, ready_for_review, unlabeled]

jobs:
labeler:
name: Assign labels and milestone
if: github.repository_owner == 'galaxyproject'
runs-on: ubuntu-latest
env:
MILESTONE_NUMBER: 19
steps:
- name: Get latest pull request labels
id: get_pr_labels
uses: actions/github-script@v3
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
const response = await github.issues.listLabelsOnIssue({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
});
console.log(response);
return response.data;
- name: Add area labels
if: ${{ ! contains(join(github.event.pull_request.labels.*.name, ', '), 'area/') }}
if: ${{ ! contains(join(fromJSON(steps.get_pr_labels.outputs.result).*.name, ', '), 'area/') }}
uses: actions/labeler@main
with:
repo-token: "${{ secrets.GITHUB_TOKEN }}"
Expand Down
105 changes: 84 additions & 21 deletions lib/galaxy/datatypes/phylip.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,36 +46,99 @@ def set_peek(self, dataset, is_multi_byte=False):
dataset.peek = 'file does not exist'
dataset.blurb = 'file purged from disk'

def sniff_strict_interleaved(self, nb_seq, seq_length, alignment_prefix):
    """
    Check whether the first block of an alignment looks like strict
    interleaved PHYLIP.

    In the strict format the first 10 characters of every line of the first
    block are reserved for the sequence ID; the sequence characters
    (optionally grouped with spaces) start at the 11th character.

    nb_seq is the number of sequences and seq_length the total sequence
    length announced in the header line. alignment_prefix is a text stream
    positioned on the first line after the header; nb_seq lines are read
    from it.

    Returns True if those lines form a plausible first block, False
    otherwise. Invalid input is reported by returning False, not by raising.
    """
    found_seq_length = None
    for _ in range(nb_seq):
        line = alignment_prefix.readline()
        if not line:
            # Not enough lines, either the prefix is too short or this is not PHYLIP
            return False
        # Strip the whole line terminator so that the "\r" of a CRLF file
        # is not counted as a sequence character.
        line = line.rstrip("\r\n")
        if len(line) < 11:
            # Sequence characters immediately follow the sequence ID.
            # They must start at the 11th character in the line, as the
            # first 10 characters are reserved for the sequence ID
            return False
        seq = line[10:].replace(" ", "")
        this_seq_length = len(seq)
        if this_seq_length == 0 or this_seq_length > seq_length:
            # Every line of the first block must carry sequence data, and
            # never more than the announced total length
            return False
        if found_seq_length is None:
            found_seq_length = this_seq_length
        elif this_seq_length != found_seq_length:
            # All sequence parts should have the same length
            return False
        try:
            seq.encode("ascii")
        except UnicodeEncodeError:
            # Fail if sequence is not ascii
            return False
        if any(c.isdigit() for c in seq):
            # Could tighten up further by requiring IUPAC strings chars
            return False
    # There may be more lines with the remaining parts of the sequences
    return True

# Placeholder for sniffing the strict sequential PHYLIP layout.
# It takes the same arguments as the interleaved sniffers (number of
# sequences, sequence length and the alignment text stream) but has no
# implementation yet.
def sniff_strict_sequential(self, nb_seq, seq_length, alignment_prefix):
# Always raises. sniff_prefix wraps this call in `except Exception: pass`,
# so the sequential layout is simply skipped while sniffing.
raise NotImplementedError

def sniff_relaxed_interleaved(self, nb_seq, seq_length, alignment_prefix):
    """
    Check whether the first block of an alignment looks like relaxed
    interleaved PHYLIP.

    In the relaxed format the sequence ID can have any length and is
    separated from the sequence characters by whitespace.

    nb_seq is the number of sequences and seq_length the total sequence
    length announced in the header line. alignment_prefix is a text stream
    positioned on the first line after the header; nb_seq lines are read
    from it.

    Returns True if those lines form a plausible first block, False
    otherwise. Invalid input is reported by returning False, not by raising.
    """
    found_seq_length = None
    for _ in range(nb_seq):
        line = alignment_prefix.readline()
        if not line:
            # Not enough lines, either the prefix is too short or this is not PHYLIP
            return False
        # Split the ID from the rest of the line on the first whitespace run.
        fields = line.split(None, 1)
        if len(fields) < 2:
            # Blank line or an ID without any sequence characters
            return False
        # Drop every whitespace character (group separators, tabs and the
        # line terminator, including the "\r" of CRLF files) so that only
        # sequence characters are counted.
        seq = "".join(fields[1].split())
        this_seq_length = len(seq)
        if this_seq_length > seq_length:
            return False
        if found_seq_length is None:
            found_seq_length = this_seq_length
        elif this_seq_length != found_seq_length:
            # All sequence parts should have the same length
            return False
        try:
            seq.encode("ascii")
        except UnicodeEncodeError:
            # Fail if sequence is not ascii
            return False
        if any(c.isdigit() for c in seq):
            # Could tighten up further by requiring IUPAC strings chars
            return False
    # There may be more lines with the remaining parts of the sequences
    return True

def sniff_prefix(self, file_prefix):
    """
    All Phylip files start with the number of sequences and the sequence
    length, so we can use these to validate the first 'stack' of sequences.

    The header line is parsed first; the strict interleaved, strict
    sequential and relaxed interleaved sniffers are then tried in turn,
    each one starting from the line that follows the header.

    file_prefix must provide a string_io() method returning a seekable
    text stream. Returns True as soon as one sniffer accepts the content,
    False otherwise (including when the header is malformed).

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname('test_strict_interleaved.phylip')
    >>> Phylip().sniff(fname)
    True
    >>> fname = get_test_fname('test_relaxed_interleaved.phylip')
    >>> Phylip().sniff(fname)
    True
    """
    f = file_prefix.string_io()
    # Get number of sequences and sequence length from first line
    try:
        nb_seq, seq_length = (int(n) for n in f.readline().split())
    except ValueError:
        # The header is not exactly two integers, so this is not PHYLIP
        return False
    if nb_seq <= 0 or seq_length <= 0:
        return False
    # Remember where the alignment starts so that every sniffer can be
    # given the stream rewound to the same position.
    file_pos = f.tell()
    sniffers = (
        self.sniff_strict_interleaved,
        self.sniff_strict_sequential,
        self.sniff_relaxed_interleaved,
    )
    for sniffer in sniffers:
        f.seek(file_pos)
        try:
            if sniffer(nb_seq, seq_length, f):
                return True
        except Exception:
            # Deliberately broad: a sniffer that fails on this content
            # (or is not implemented) just means "not this layout".
            continue
    return False
5 changes: 4 additions & 1 deletion lib/galaxy/datatypes/sniff.py
Original file line number Diff line number Diff line change
Expand Up @@ -422,7 +422,10 @@ def guess_ext(fname, sniff_order, is_binary=False):
>>> fname = get_test_fname('test.blib')
>>> guess_ext(fname, sniff_order)
'blib'
>>> fname = get_test_fname('test.phylip')
>>> fname = get_test_fname('test_strict_interleaved.phylip')
>>> guess_ext(fname, sniff_order)
'phylip'
>>> fname = get_test_fname('test_relaxed_interleaved.phylip')
>>> guess_ext(fname, sniff_order)
'phylip'
>>> fname = get_test_fname('1.smat')
Expand Down
30 changes: 30 additions & 0 deletions lib/galaxy/datatypes/test/test_relaxed_interleaved.phylip
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
9 144
OM-RGC.v1.012162940 ---------- ---------- RVFERFGLAP MDKPFDPITT FDYD-SRQVF IQ-----TDA
FELIC KMKIL-I--- -CYGFIIRRL IVNERFGRTS MDKPNDTIES FSME-KEDVF PQLFPKVLDA
OM-RGC.v1.010553481 MKKIFLL--- -NDGITIREI SIIEVFAKRK LNKPFDTIEP FSE--TQKVR PAVRKNGNSS
OM-RGC.v1.001504500 YLKLFLLNHP TFSGIQYPF- NDRRTSQTRQ FNSPFDKYAD STFEMTDRFW TFTRKTLEEF
OM-RGC.v1.012274392 ---------- ----IQYPF- NNRRTSQTRQ FNSPFDKYAD STFEMTDRFW TFTKKTLDEF
OM-RGC.v1.003588026 ------D-QF SWTLIQNESG DSEQEGFVYD FNSPFDKIYD DFYRLENRLF SRSMRDLDYF
OM-RGC.v1.002677074 -----YD-QF SWTDIEYEFG DSSQEGFISD FNSPFDKIYD EFYKLDSRLF SRTMRDLNYF
OM-RGC.v1.012568023 ---------- ---------- ----KGFTSK YNAPFDNIID LSRSLDWFME NWTQWLLEDY
OM-RGC.v1.002130110 --LLR-N--- ----VND--- --LQNVFLKK SDKPFDTAEF QDSDLSLNLY ---QPELTIH

SFRAEW---- -NQISGTTK- -DTYPVVSLI AIASFLESSV SDQASFTVLT IVAFNLVADK
S-YEEWDKKD YNHLSANKK- -ESFSRAIWF GVVSFV--DF ANQNSFTLMT MVTFNFTSSL
VDI--WKFKV E--------- -DR------- ----ILAPGS KN-------S LMT-------
MDDSGWKY-- ERDTVDVPGQ YDA---LHFI ILSCVVDPEL EARTTIVCLS LIAYNVIDAE
MDDSGWKY-- ERDTVDVPGQ YDA---LHFI ILSCVVDPEL EARTTIVCLS LIAYNVIDAE
VKKVGWNA-- EQDPNRATA- -SA---IHFI ILMCVVDPEL ESRTTIVCLS LIAYNVIDSE
LKKNGWDA-- KQDPNRSSE- -SA---IHFI ILMCVVDPEL ESRTTIVCLS LIAYNVIDSE
AGEIEWDA-- YKDPVYLSQ- -IG---IFFI ILLCFIHPEL ESKTTIICLS LIAYNVIDED
KEIYEWD--- HTEIYGQGS- -QT---IFLI IVWSFIRAEL ESKASMGSLT LVAYNVFGDD

PAFALMFLSV ASSSI-DTRF PLAF
TPFIISSL-L TYSV--NFQF PLLI
---------- ---------- ----
KEVWVIYVPF FRK-L-EIKG LSSN
KEVWVIYVPF FRK-L-EIKG LSS-
KEVWVIYVPF FRK-I-EIKG LSSN
KE-------- ---------- ----
KSIYILFVPF YQM-V-DKRG PILN
KNIAILFLSV YFLTINDKRV PGLL
File renamed without changes.

0 comments on commit 9b6d4c1

Please sign in to comment.