From 245d63543206dfa92e73e61c56b924f0306b4d29 Mon Sep 17 00:00:00 2001 From: "Vincent A. Cicirello" Date: Fri, 22 Apr 2022 10:20:02 -0400 Subject: [PATCH 1/6] adjust tests so runnable locally in windows --- tests/tests.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/tests/tests.py b/tests/tests.py index e3236d2e..4ce5def0 100644 --- a/tests/tests.py +++ b/tests/tests.py @@ -1,6 +1,6 @@ # generate-sitemap: Github action for automating sitemap generation # -# Copyright (c) 2020-2021 Vincent A Cicirello +# Copyright (c) 2020-2022 Vincent A Cicirello # https://www.cicirello.org/ # # MIT License @@ -328,6 +328,8 @@ def test_gatherfiles_html(self) : "./unblocked1.html", "./unblocked2.html", "./unblocked3.html", "./unblocked4.html", "./subdir/a.html", "./subdir/subdir/b.html"} + if os.name == "nt" : + expected = { s.replace("/", "\\") for s in expected } self.assertEqual(asSet, expected) def test_gatherfiles_html_pdf(self) : @@ -342,6 +344,8 @@ def test_gatherfiles_html_pdf(self) : "./subdir/a.html", "./subdir/subdir/b.html", "./x.pdf", "./subdir/y.pdf", "./subdir/subdir/z.pdf"} + if os.name == "nt" : + expected = { s.replace("/", "\\") for s in expected } self.assertEqual(asSet, expected) def test_gatherfiles_pdf(self) : @@ -351,15 +355,21 @@ def test_gatherfiles_pdf(self) : asSet = set(allfiles) expected = { "./x.pdf", "./subdir/y.pdf", "./subdir/subdir/z.pdf"} + if os.name == "nt" : + expected = { s.replace("/", "\\") for s in expected } self.assertEqual(asSet, expected) def test_lastmod(self) : - os.chdir("tests") - dateStr = gs.lastmod("./unblocked1.html") - self.assertTrue(validateDate(dateStr), msg=dateStr) - dateStr = gs.lastmod("./subdir/a.html") - self.assertTrue(validateDate(dateStr), msg=dateStr) - os.chdir("..") + # assumes that if on windows must be running tests locally + # rather than in GitHub Actions, and may or may not be in a + # git repo, so simply skips this test. + if os.name != "nt" : + os.chdir("tests") + dateStr = gs.lastmod("./unblocked1.html") + self.assertTrue(validateDate(dateStr), msg=dateStr) + dateStr = gs.lastmod("./subdir/a.html") + self.assertTrue(validateDate(dateStr), msg=dateStr) + os.chdir("..") def test_urlstring(self) : filenames = [ "./a.html", @@ -471,7 +481,7 @@ def test_robotsTxtParser(self) : os.chdir("tests") for i, e in enumerate(expected) : filename = "robots" + str(i) + ".txt" - self.assertEqual(set(gs.parseRobotsTxt(filename)), set(e)) + self.assertEqual(set(gs.parseRobotsTxt(filename)), set(e), msg=filename) os.chdir("..") def test_robotsBlockedWithRobotsParser(self) : From 8be8a80d210dbadfa282c5be84675b83c642f704 Mon Sep 17 00:00:00 2001 From: "Vincent A. Cicirello" Date: Fri, 22 Apr 2022 10:43:03 -0400 Subject: [PATCH 2/6] tests to reveal bug --- tests/badCharsDoIndex.html | 14 +++++++ tests/badCharsNoindex1.html | 18 +++++++++ tests/badCharsNoindex2.html | 18 +++++++++ tests/gentestdata.py | 76 +++++++++++++++++++++++++++++++++++++ tests/tests.py | 24 +++++++++--- 5 files changed, 144 insertions(+), 6 deletions(-) create mode 100644 tests/badCharsDoIndex.html create mode 100644 tests/badCharsNoindex1.html create mode 100644 tests/badCharsNoindex2.html create mode 100644 tests/gentestdata.py diff --git a/tests/badCharsDoIndex.html b/tests/badCharsDoIndex.html new file mode 100644 index 00000000..082c4e77 --- /dev/null +++ b/tests/badCharsDoIndex.html @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + diff --git a/tests/badCharsNoindex1.html b/tests/badCharsNoindex1.html new file mode 100644 index 00000000..da99e90e --- /dev/null +++ b/tests/badCharsNoindex1.html @@ -0,0 +1,18 @@ + + + + + + + + + + + + + + + + + + diff --git a/tests/badCharsNoindex2.html b/tests/badCharsNoindex2.html new file mode 100644 index 00000000..4fd2812a --- /dev/null +++ b/tests/badCharsNoindex2.html @@ -0,0 +1,18 @@ + + + + + + + + + + + + + + + + + + diff --git a/tests/gentestdata.py b/tests/gentestdata.py new file mode 100644 index 00000000..4be343ba --- /dev/null +++ b/tests/gentestdata.py @@ -0,0 +1,76 @@ +# generate-sitemap: Github action for automating sitemap generation +# +# Copyright (c) 2020-2022 Vincent A Cicirello +# https://www.cicirello.org/ +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# + +if __name__ == "__main__" : + + beginning = """ + + + + + +""" + + ending = """ + + + + + + + +""" + + noindex = """ + + + +""" + + nonCharData = [ x for x in range(128, 256) ] + + with open("badCharsNoindex1.html", "w") as f : + f.write(beginning) + f.write(noindex) + with open("badCharsNoindex1.html", "ab") as f : + f.write(bytes(nonCharData)) + with open("badCharsNoindex1.html", "a") as f : + f.write(ending) + + with open("badCharsNoindex2.html", "w") as f : + f.write(beginning) + with open("badCharsNoindex2.html", "ab") as f : + f.write(bytes(nonCharData)) + with open("badCharsNoindex2.html", "a") as f : + f.write(noindex) + f.write(ending) + + with open("badCharsDoIndex.html", "w") as f : + f.write(beginning) + with open("badCharsDoIndex.html", "ab") as f : + f.write(bytes(nonCharData)) + with open("badCharsDoIndex.html", "a") as f : + f.write(ending) diff --git a/tests/tests.py b/tests/tests.py index 4ce5def0..48d8c56d 100644 --- a/tests/tests.py +++ b/tests/tests.py @@ -294,11 +294,14 @@ def test_robotsBlocked(self) : "tests/unblocked1.html", "tests/unblocked2.html", "tests/unblocked3.html", - "tests/unblocked4.html" ] + "tests/unblocked4.html", + "tests/badCharsDoIndex.html"] blocked = [ "tests/blocked1.html", "tests/blocked2.html", "tests/blocked3.html", - "tests/blocked4.html" ] + "tests/blocked4.html", + "tests/badCharsNoindex1.html", + "tests/badCharsNoindex2.html"] for f in unblocked : self.assertFalse(gs.robotsBlocked(f)) for f in blocked : @@ -308,11 +311,14 @@ def test_hasMetaRobotsNoindex(self) : unblocked = [ "tests/unblocked1.html", "tests/unblocked2.html", "tests/unblocked3.html", - "tests/unblocked4.html" ] + "tests/unblocked4.html", + "tests/badCharsDoIndex.html" ] blocked = [ "tests/blocked1.html", "tests/blocked2.html", "tests/blocked3.html", - "tests/blocked4.html" ] + "tests/blocked4.html", + "tests/badCharsNoindex1.html", + "tests/badCharsNoindex2.html" ] for f in unblocked : self.assertFalse(gs.hasMetaRobotsNoindex(f)) for f in blocked : @@ -327,7 +333,10 @@ def test_gatherfiles_html(self) : "./blocked3.html", "./blocked4.html", "./unblocked1.html", "./unblocked2.html", "./unblocked3.html", "./unblocked4.html", - "./subdir/a.html", "./subdir/subdir/b.html"} + "./subdir/a.html", "./subdir/subdir/b.html", + "./badCharsNoindex1.html", + "./badCharsNoindex2.html", + "./badCharsDoIndex.html"} if os.name == "nt" : expected = { s.replace("/", "\\") for s in expected } self.assertEqual(asSet, expected) @@ -343,7 +352,10 @@ def test_gatherfiles_html_pdf(self) : "./unblocked3.html", "./unblocked4.html", "./subdir/a.html", "./subdir/subdir/b.html", "./x.pdf", "./subdir/y.pdf", - "./subdir/subdir/z.pdf"} + "./subdir/subdir/z.pdf", + "./badCharsNoindex1.html", + "./badCharsNoindex2.html", + "./badCharsDoIndex.html"} if os.name == "nt" : expected = { s.replace("/", "\\") for s in expected } self.assertEqual(asSet, expected) From 36f946e341b36c711ce9ec0c217939bcec525962 Mon Sep 17 00:00:00 2001 From: "Vincent A. Cicirello" Date: Fri, 22 Apr 2022 11:00:26 -0400 Subject: [PATCH 3/6] test to reveal bug --- tests/integration.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/integration.py b/tests/integration.py index e221dcf3..25868fed 100644 --- a/tests/integration.py +++ b/tests/integration.py @@ -71,7 +71,8 @@ def testIntegration(self) : "https://TESTING.FAKE.WEB.ADDRESS.TESTING/subdir/a.html", "https://TESTING.FAKE.WEB.ADDRESS.TESTING/x.pdf", "https://TESTING.FAKE.WEB.ADDRESS.TESTING/subdir/subdir/z.pdf", - "https://TESTING.FAKE.WEB.ADDRESS.TESTING/uncommitted.html" + "https://TESTING.FAKE.WEB.ADDRESS.TESTING/uncommitted.html", + "https://TESTING.FAKE.WEB.ADDRESS.TESTING/badCharsDoIndex.html" } self.assertEqual(expected, urlset) @@ -91,7 +92,8 @@ def testIntegrationWithAdditionalTypes(self) : "https://TESTING.FAKE.WEB.ADDRESS.TESTING/subdir/subdir/z.pdf", "https://TESTING.FAKE.WEB.ADDRESS.TESTING/include.docx", "https://TESTING.FAKE.WEB.ADDRESS.TESTING/include.pptx", - "https://TESTING.FAKE.WEB.ADDRESS.TESTING/uncommitted.html" + "https://TESTING.FAKE.WEB.ADDRESS.TESTING/uncommitted.html", + "https://TESTING.FAKE.WEB.ADDRESS.TESTING/badCharsDoIndex.html" } self.assertEqual(expected, urlset) From 0406472749e66ec60f0d206987a5695028a9da44 Mon Sep 17 00:00:00 2001 From: "Vincent A. Cicirello" Date: Fri, 22 Apr 2022 11:04:29 -0400 Subject: [PATCH 4/6] fix handling of non utf8 characters --- generatesitemap.py | 74 +++++++++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 33 deletions(-) diff --git a/generatesitemap.py b/generatesitemap.py index 0841aa44..180aae1b 100755 --- a/generatesitemap.py +++ b/generatesitemap.py @@ -88,15 +88,19 @@ def hasMetaRobotsNoindex(f) : Keyword arguments: f - Filename including path """ - with open(f,"r") as file : - for line in file : - # Check line for , etc - if re.search(" directives required to be in head - if "" in line or "" in line : - return False + try: + with open(f, "r", errors="surrogateescape") as file : + for line in file : + # Check line for , etc + if re.search(" directives required to be in head + if "" in line or "" in line : + return False + except OSError: + print("WARNING: OS error while checking for noindex directive in:", f) + print("Assuming", f, "doesn't have noindex directive.") return False def getFileExtension(f) : @@ -170,30 +174,34 @@ def parseRobotsTxt(robotsFile="robots.txt") : must be robots.txt (the default). The parameter is to enable unit testing with different robots.txt files.""" blockedPaths = [] - if os.path.isfile(robotsFile) : - with open(robotsFile,"r") as robots : - foundBlock = False - rulesStart = False - for line in robots : - commentStart = line.find("#") - if commentStart > 0 : - line = line[:commentStart] - line = line.strip() - lineLow = line.lower() - if foundBlock : - if rulesStart and lineLow.startswith("user-agent:") : - foundBlock = False - elif not rulesStart and lineLow.startswith("allow:") : - rulesStart = True - elif lineLow.startswith("disallow:") : - rulesStart = True - if len(line) > 9 : - path = line[9:].strip() - if len(path) > 0 and " " not in path and "\t" not in path: - blockedPaths.append(path) - elif lineLow.startswith("user-agent:") and len(line)>11 and line[11:].strip() == "*" : - foundBlock = True - rulesStart = False + try: + if os.path.isfile(robotsFile) : + with open(robotsFile, "r", errors="surrogateescape") as robots : + foundBlock = False + rulesStart = False + for line in robots : + commentStart = line.find("#") + if commentStart > 0 : + line = line[:commentStart] + line = line.strip() + lineLow = line.lower() + if foundBlock : + if rulesStart and lineLow.startswith("user-agent:") : + foundBlock = False + elif not rulesStart and lineLow.startswith("allow:") : + rulesStart = True + elif lineLow.startswith("disallow:") : + rulesStart = True + if len(line) > 9 : + path = line[9:].strip() + if len(path) > 0 and " " not in path and "\t" not in path: + blockedPaths.append(path) + elif lineLow.startswith("user-agent:") and len(line)>11 and line[11:].strip() == "*" : + foundBlock = True + rulesStart = False + except OSError: + print("WARNING: OS error while parsing robots.txt") + print("Assuming nothing disallowed.") return blockedPaths def lastmod(f) : From acca2dc4fa8c7ebb0a4ff3e3ec54e224f3824038 Mon Sep 17 00:00:00 2001 From: "Vincent A. Cicirello" Date: Fri, 22 Apr 2022 11:27:03 -0400 Subject: [PATCH 5/6] refactored robots.txt parsing logic --- generatesitemap.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/generatesitemap.py b/generatesitemap.py index 180aae1b..492fdbd4 100755 --- a/generatesitemap.py +++ b/generatesitemap.py @@ -185,10 +185,15 @@ def parseRobotsTxt(robotsFile="robots.txt") : line = line[:commentStart] line = line.strip() lineLow = line.lower() - if foundBlock : - if rulesStart and lineLow.startswith("user-agent:") : + if lineLow.startswith("user-agent:") : + if len(line)>11 and line[11:].strip() == "*" : + foundBlock = True + rulesStart = False + elif rulesStart : foundBlock = False - elif not rulesStart and lineLow.startswith("allow:") : + rulesStart = False + elif foundBlock : + if lineLow.startswith("allow:") : rulesStart = True elif lineLow.startswith("disallow:") : rulesStart = True @@ -196,9 +201,6 @@ def parseRobotsTxt(robotsFile="robots.txt") : path = line[9:].strip() if len(path) > 0 and " " not in path and "\t" not in path: blockedPaths.append(path) - elif lineLow.startswith("user-agent:") and len(line)>11 and line[11:].strip() == "*" : - foundBlock = True - rulesStart = False except OSError: print("WARNING: OS error while parsing robots.txt") print("Assuming nothing disallowed.") From c14c89f406155ae1b11087c39976281cac7e17b1 Mon Sep 17 00:00:00 2001 From: "Vincent A. Cicirello" Date: Fri, 22 Apr 2022 11:42:22 -0400 Subject: [PATCH 6/6] prep for release --- CHANGELOG.md | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6b06ffc9..a5f1888a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,24 +4,35 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [Unreleased] - 2022-03-31 +## [Unreleased] - 2022-04-22 ### Added ### Changed -* Bumped base Docker image cicirello/pyaction from 4.2.0 to 4.3.1. ### Deprecated ### Removed ### Fixed + +### CI/CD + +### Dependencies + + +## [1.8.3] - 2022-04-22 + +### Fixed +* Corrected check for robots noindex directive in case when non-utf8 characters + present in an html file. * Disabled pycache to protect against potential future bug. Currently no imports so no pycache created, but if future versions import local py modules, a pycache would be created during run in repo. Disabled creation of pycache now to avoid. -### CI/CD +### Dependencies +* Bumped base Docker image cicirello/pyaction from 4.2.0 to 4.3.1. ## [1.8.2] - 2022-03-04