From d2afd631469f3d7f55105e6dbbb4023e74a980ad Mon Sep 17 00:00:00 2001 From: Nicola Soranzo Date: Thu, 11 Mar 2021 17:40:08 +0000 Subject: [PATCH 1/3] Sniff relaxed and strict interleaved PHYLIP files --- lib/galaxy/datatypes/phylip.py | 105 ++++++++++++++---- lib/galaxy/datatypes/sniff.py | 5 +- .../test/test_relaxed_interleaved.phylip | 30 +++++ ....phylip => test_strict_interleaved.phylip} | 0 4 files changed, 118 insertions(+), 22 deletions(-) create mode 100644 lib/galaxy/datatypes/test/test_relaxed_interleaved.phylip rename lib/galaxy/datatypes/test/{test.phylip => test_strict_interleaved.phylip} (100%) diff --git a/lib/galaxy/datatypes/phylip.py b/lib/galaxy/datatypes/phylip.py index a5fa62d8b4f4..4c2dea5dd9fe 100644 --- a/lib/galaxy/datatypes/phylip.py +++ b/lib/galaxy/datatypes/phylip.py @@ -46,36 +46,99 @@ def set_peek(self, dataset, is_multi_byte=False): dataset.peek = 'file does not exist' dataset.blurb = 'file purged from disk' + def sniff_strict_interleaved(self, nb_seq, seq_length, alignment_prefix): + found_seq_length = None + for _ in range(nb_seq): + line = alignment_prefix.readline() + if not line: + # Not enough lines, either the prefix is too short or this is not PHYLIP + return False + line = line.rstrip("\n") + if len(line) < 11: + # Sequence characters immediately follow the sequence ID. + # They must start at the 11th character in the line, as the first 10 characters are reserved for the sequence ID + return False + seq = line[10:].replace(" ", "") + this_seq_length = len(seq) + if this_seq_length > seq_length: + return False + if found_seq_length is None: + found_seq_length = this_seq_length + elif this_seq_length != found_seq_length: + # All sequence parts should have the same length + return False + # Fail if sequence is not ascii + seq.encode('ascii') + if any(str.isdigit(c) for c in seq): + # Could tighten up further by requiring IUPAC strings chars + return False + # There may be more lines with the remaining parts of the sequences + return True + + def sniff_strict_sequential(self, nb_seq, seq_length, alignment_prefix): + raise NotImplementedError + + def sniff_relaxed_interleaved(self, nb_seq, seq_length, alignment_prefix): + found_seq_length = None + for _ in range(nb_seq): + line = alignment_prefix.readline() + if not line: + # Not enough lines, either the prefix is too short or this is not PHYLIP + return False + line = line.rstrip("\n") + # In the relaxed format the sequence id can have any length. + # The id and sequence are separated by some whitespaces. + seq = line.split(None, 1)[1].replace(" ", "") + this_seq_length = len(seq) + if this_seq_length > seq_length: + return False + if found_seq_length is None: + found_seq_length = this_seq_length + elif this_seq_length != found_seq_length: + # All sequence parts should have the same length + return False + # Fail if sequence is not ascii + seq.encode('ascii') + if any(str.isdigit(c) for c in seq): + # Could tighten up further by requiring IUPAC strings chars + return False + # There may be more lines with the remaining parts of the sequences + return True + def sniff_prefix(self, file_prefix): """ All Phylip files starts with the number of sequences so we can use this to count the following number of sequences in the first 'stack' >>> from galaxy.datatypes.sniff import get_test_fname - >>> fname = get_test_fname('test.phylip') + >>> fname = get_test_fname('test_strict_interleaved.phylip') + >>> Phylip().sniff(fname) + True + >>> fname = get_test_fname('test_relaxed_interleaved.phylip') >>> Phylip().sniff(fname) True """ f = file_prefix.string_io() # Get number of sequences and sequence length from first line - nb_seq, _seq_length = (int(n) for n in f.readline().split()) - if nb_seq <= 0: + nb_seq, seq_length = (int(n) for n in f.readline().split()) + if nb_seq <= 0 or seq_length <= 0: return False - # counts number of sequence from first stack - count = 0 - for line in f: - if not line.split(): - break - if len(line) < 11: - # Sequence characters immediately follow the sequence ID. - # They must start at the 11th character in the line, as the first 10 characters are reserved for the sequence ID - return False - # Fail if sequence is not ascii - line[10:].encode('ascii') - if any(str.isdigit(c) for c in line[10:]): - # Could tighten up further by requiring IUPAC strings chars - return False - count += 1 - if count > nb_seq: - return False - return count == nb_seq + file_pos = f.tell() + try: + if self.sniff_strict_interleaved(nb_seq, seq_length, f): + return True + except Exception: + pass + f.seek(file_pos) + try: + if self.sniff_strict_sequential(nb_seq, seq_length, f): + return True + except Exception: + pass + f.seek(file_pos) + try: + if self.sniff_relaxed_interleaved(nb_seq, seq_length, f): + return True + except Exception: + pass + return False diff --git a/lib/galaxy/datatypes/sniff.py b/lib/galaxy/datatypes/sniff.py index 6a3657b05c51..c185a861cfd9 100644 --- a/lib/galaxy/datatypes/sniff.py +++ b/lib/galaxy/datatypes/sniff.py @@ -422,7 +422,10 @@ def guess_ext(fname, sniff_order, is_binary=False): >>> fname = get_test_fname('test.blib') >>> guess_ext(fname, sniff_order) 'blib' - >>> fname = get_test_fname('test.phylip') + >>> fname = get_test_fname('test_strict_interleaved.phylip') + >>> guess_ext(fname, sniff_order) + 'phylip' + >>> fname = get_test_fname('test_relaxed_interleaved.phylip') >>> guess_ext(fname, sniff_order) 'phylip' >>> fname = get_test_fname('1.smat') diff --git a/lib/galaxy/datatypes/test/test_relaxed_interleaved.phylip b/lib/galaxy/datatypes/test/test_relaxed_interleaved.phylip new file mode 100644 index 000000000000..b3f413e5ad35 --- /dev/null +++ b/lib/galaxy/datatypes/test/test_relaxed_interleaved.phylip @@ -0,0 +1,30 @@ + 9 144 +OM-RGC.v1.012162940 ---------- ---------- RVFERFGLAP MDKPFDPITT FDYD-SRQVF IQ-----TDA +FELIC KMKIL-I--- -CYGFIIRRL IVNERFGRTS MDKPNDTIES FSME-KEDVF PQLFPKVLDA +OM-RGC.v1.010553481 MKKIFLL--- -NDGITIREI SIIEVFAKRK LNKPFDTIEP FSE--TQKVR PAVRKNGNSS +OM-RGC.v1.001504500 YLKLFLLNHP TFSGIQYPF- NDRRTSQTRQ FNSPFDKYAD STFEMTDRFW TFTRKTLEEF +OM-RGC.v1.012274392 ---------- ----IQYPF- NNRRTSQTRQ FNSPFDKYAD STFEMTDRFW TFTKKTLDEF +OM-RGC.v1.003588026 ------D-QF SWTLIQNESG DSEQEGFVYD FNSPFDKIYD DFYRLENRLF SRSMRDLDYF +OM-RGC.v1.002677074 -----YD-QF SWTDIEYEFG DSSQEGFISD FNSPFDKIYD EFYKLDSRLF SRTMRDLNYF +OM-RGC.v1.012568023 ---------- ---------- ----KGFTSK YNAPFDNIID LSRSLDWFME NWTQWLLEDY +OM-RGC.v1.002130110 --LLR-N--- ----VND--- --LQNVFLKK SDKPFDTAEF QDSDLSLNLY ---QPELTIH + + SFRAEW---- -NQISGTTK- -DTYPVVSLI AIASFLESSV SDQASFTVLT IVAFNLVADK + S-YEEWDKKD YNHLSANKK- -ESFSRAIWF GVVSFV--DF ANQNSFTLMT MVTFNFTSSL + VDI--WKFKV E--------- -DR------- ----ILAPGS KN-------S LMT------- + MDDSGWKY-- ERDTVDVPGQ YDA---LHFI ILSCVVDPEL EARTTIVCLS LIAYNVIDAE + MDDSGWKY-- ERDTVDVPGQ YDA---LHFI ILSCVVDPEL EARTTIVCLS LIAYNVIDAE + VKKVGWNA-- EQDPNRATA- -SA---IHFI ILMCVVDPEL ESRTTIVCLS LIAYNVIDSE + LKKNGWDA-- KQDPNRSSE- -SA---IHFI ILMCVVDPEL ESRTTIVCLS LIAYNVIDSE + AGEIEWDA-- YKDPVYLSQ- -IG---IFFI ILLCFIHPEL ESKTTIICLS LIAYNVIDED + KEIYEWD--- HTEIYGQGS- -QT---IFLI IVWSFIRAEL ESKASMGSLT LVAYNVFGDD + + PAFALMFLSV ASSSI-DTRF PLAF + TPFIISSL-L TYSV--NFQF PLLI + ---------- ---------- ---- + KEVWVIYVPF FRK-L-EIKG LSSN + KEVWVIYVPF FRK-L-EIKG LSS- + KEVWVIYVPF FRK-I-EIKG LSSN + KE-------- ---------- ---- + KSIYILFVPF YQM-V-DKRG PILN + KNIAILFLSV YFLTINDKRV PGLL diff --git a/lib/galaxy/datatypes/test/test.phylip b/lib/galaxy/datatypes/test/test_strict_interleaved.phylip similarity index 100% rename from lib/galaxy/datatypes/test/test.phylip rename to lib/galaxy/datatypes/test/test_strict_interleaved.phylip From 1ee618243d9d874410b7da77ab1bde52f43e87dd Mon Sep 17 00:00:00 2001 From: Nicola Soranzo Date: Thu, 11 Mar 2021 18:03:06 +0000 Subject: [PATCH 2/3] Backport maintenance_bot from dev --- .github/workflows/maintenance_bot.yaml | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/.github/workflows/maintenance_bot.yaml b/.github/workflows/maintenance_bot.yaml index 0083d243545a..4cea9235ed14 100644 --- a/.github/workflows/maintenance_bot.yaml +++ b/.github/workflows/maintenance_bot.yaml @@ -1,7 +1,7 @@ -name: "Maintenance Bot" +name: Maintenance Bot on: pull_request_target: - types: [opened, reopened] + types: [opened, reopened, edited, ready_for_review, unlabeled] jobs: labeler: @@ -10,8 +10,21 @@ jobs: env: MILESTONE_NUMBER: 19 steps: + - name: Get latest pull request labels + id: get_pr_labels + uses: actions/github-script@v3 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + script: | + const response = await github.issues.listLabelsOnIssue({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + }); + console.log(response); + return response.data; - name: Add area labels - if: ${{ ! contains(join(github.event.pull_request.labels.*.name, ', '), 'area/') }} + if: ${{ ! contains(join(fromJSON(steps.get_pr_labels.outputs.result).*.name, ', '), 'area/') }} uses: actions/labeler@main with: repo-token: "${{ secrets.GITHUB_TOKEN }}" From c5554ae3bd2abddd5558583c81643b97b7950904 Mon Sep 17 00:00:00 2001 From: Nicola Soranzo Date: Thu, 11 Mar 2021 18:04:57 +0000 Subject: [PATCH 3/3] Don't run maintenance_bot CI jobs on forks --- .github/workflows/maintenance_bot.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/maintenance_bot.yaml b/.github/workflows/maintenance_bot.yaml index 4cea9235ed14..0a058af6fdcd 100644 --- a/.github/workflows/maintenance_bot.yaml +++ b/.github/workflows/maintenance_bot.yaml @@ -6,6 +6,7 @@ on: jobs: labeler: name: Assign labels and milestone + if: github.repository_owner == 'galaxyproject' runs-on: ubuntu-latest env: MILESTONE_NUMBER: 19