diff --git a/CHANGELOG.md b/CHANGELOG.md index 6b06ffc9..a5f1888a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,24 +4,35 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [Unreleased] - 2022-03-31 +## [Unreleased] - 2022-04-22 ### Added ### Changed -* Bumped base Docker image cicirello/pyaction from 4.2.0 to 4.3.1. ### Deprecated ### Removed ### Fixed + +### CI/CD + +### Dependencies + + +## [1.8.3] - 2022-04-22 + +### Fixed +* Corrected check for robots noindex directive in case when non-utf8 characters + present in an html file. * Disabled pycache to protect against potential future bug. Currently no imports so no pycache created, but if future versions import local py modules, a pycache would be created during run in repo. Disabled creation of pycache now to avoid. -### CI/CD +### Dependencies +* Bumped base Docker image cicirello/pyaction from 4.2.0 to 4.3.1. ## [1.8.2] - 2022-03-04 diff --git a/generatesitemap.py b/generatesitemap.py index 0841aa44..492fdbd4 100755 --- a/generatesitemap.py +++ b/generatesitemap.py @@ -88,15 +88,19 @@ def hasMetaRobotsNoindex(f) : Keyword arguments: f - Filename including path """ - with open(f,"r") as file : - for line in file : - # Check line for , etc - if re.search(" directives required to be in head - if "" in line or "" in line : - return False + try: + with open(f, "r", errors="surrogateescape") as file : + for line in file : + # Check line for , etc + if re.search(" directives required to be in head + if "" in line or "" in line : + return False + except OSError: + print("WARNING: OS error while checking for noindex directive in:", f) + print("Assuming", f, "doesn't have noindex directive.") return False def getFileExtension(f) : @@ -170,30 +174,36 @@ def parseRobotsTxt(robotsFile="robots.txt") : must be robots.txt (the default). The parameter is to enable unit testing with different robots.txt files.""" blockedPaths = [] - if os.path.isfile(robotsFile) : - with open(robotsFile,"r") as robots : - foundBlock = False - rulesStart = False - for line in robots : - commentStart = line.find("#") - if commentStart > 0 : - line = line[:commentStart] - line = line.strip() - lineLow = line.lower() - if foundBlock : - if rulesStart and lineLow.startswith("user-agent:") : - foundBlock = False - elif not rulesStart and lineLow.startswith("allow:") : - rulesStart = True - elif lineLow.startswith("disallow:") : - rulesStart = True - if len(line) > 9 : - path = line[9:].strip() - if len(path) > 0 and " " not in path and "\t" not in path: - blockedPaths.append(path) - elif lineLow.startswith("user-agent:") and len(line)>11 and line[11:].strip() == "*" : - foundBlock = True - rulesStart = False + try: + if os.path.isfile(robotsFile) : + with open(robotsFile, "r", errors="surrogateescape") as robots : + foundBlock = False + rulesStart = False + for line in robots : + commentStart = line.find("#") + if commentStart > 0 : + line = line[:commentStart] + line = line.strip() + lineLow = line.lower() + if lineLow.startswith("user-agent:") : + if len(line)>11 and line[11:].strip() == "*" : + foundBlock = True + rulesStart = False + elif rulesStart : + foundBlock = False + rulesStart = False + elif foundBlock : + if lineLow.startswith("allow:") : + rulesStart = True + elif lineLow.startswith("disallow:") : + rulesStart = True + if len(line) > 9 : + path = line[9:].strip() + if len(path) > 0 and " " not in path and "\t" not in path: + blockedPaths.append(path) + except OSError: + print("WARNING: OS error while parsing robots.txt") + print("Assuming nothing disallowed.") return blockedPaths def lastmod(f) : diff --git a/tests/badCharsDoIndex.html b/tests/badCharsDoIndex.html new file mode 100644 index 00000000..082c4e77 --- /dev/null +++ b/tests/badCharsDoIndex.html @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + diff --git a/tests/badCharsNoindex1.html b/tests/badCharsNoindex1.html new file mode 100644 index 00000000..da99e90e --- /dev/null +++ b/tests/badCharsNoindex1.html @@ -0,0 +1,18 @@ + + + + + + + + + + + + + + + + + + diff --git a/tests/badCharsNoindex2.html b/tests/badCharsNoindex2.html new file mode 100644 index 00000000..4fd2812a --- /dev/null +++ b/tests/badCharsNoindex2.html @@ -0,0 +1,18 @@ + + + + + + + + + + + + + + + + + + diff --git a/tests/gentestdata.py b/tests/gentestdata.py new file mode 100644 index 00000000..4be343ba --- /dev/null +++ b/tests/gentestdata.py @@ -0,0 +1,76 @@ +# generate-sitemap: Github action for automating sitemap generation +# +# Copyright (c) 2020-2022 Vincent A Cicirello +# https://www.cicirello.org/ +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# + +if __name__ == "__main__" : + + beginning = """ + + + + + +""" + + ending = """ + + + + + + + +""" + + noindex = """ + + + +""" + + nonCharData = [ x for x in range(128, 256) ] + + with open("badCharsNoindex1.html", "w") as f : + f.write(beginning) + f.write(noindex) + with open("badCharsNoindex1.html", "ab") as f : + f.write(bytes(nonCharData)) + with open("badCharsNoindex1.html", "a") as f : + f.write(ending) + + with open("badCharsNoindex2.html", "w") as f : + f.write(beginning) + with open("badCharsNoindex2.html", "ab") as f : + f.write(bytes(nonCharData)) + with open("badCharsNoindex2.html", "a") as f : + f.write(noindex) + f.write(ending) + + with open("badCharsDoIndex.html", "w") as f : + f.write(beginning) + with open("badCharsDoIndex.html", "ab") as f : + f.write(bytes(nonCharData)) + with open("badCharsDoIndex.html", "a") as f : + f.write(ending) diff --git a/tests/integration.py b/tests/integration.py index e221dcf3..25868fed 100644 --- a/tests/integration.py +++ b/tests/integration.py @@ -71,7 +71,8 @@ def testIntegration(self) : "https://TESTING.FAKE.WEB.ADDRESS.TESTING/subdir/a.html", "https://TESTING.FAKE.WEB.ADDRESS.TESTING/x.pdf", "https://TESTING.FAKE.WEB.ADDRESS.TESTING/subdir/subdir/z.pdf", - "https://TESTING.FAKE.WEB.ADDRESS.TESTING/uncommitted.html" + "https://TESTING.FAKE.WEB.ADDRESS.TESTING/uncommitted.html", + "https://TESTING.FAKE.WEB.ADDRESS.TESTING/badCharsDoIndex.html" } self.assertEqual(expected, urlset) @@ -91,7 +92,8 @@ def testIntegrationWithAdditionalTypes(self) : "https://TESTING.FAKE.WEB.ADDRESS.TESTING/subdir/subdir/z.pdf", "https://TESTING.FAKE.WEB.ADDRESS.TESTING/include.docx", "https://TESTING.FAKE.WEB.ADDRESS.TESTING/include.pptx", - "https://TESTING.FAKE.WEB.ADDRESS.TESTING/uncommitted.html" + "https://TESTING.FAKE.WEB.ADDRESS.TESTING/uncommitted.html", + "https://TESTING.FAKE.WEB.ADDRESS.TESTING/badCharsDoIndex.html" } self.assertEqual(expected, urlset) diff --git a/tests/tests.py b/tests/tests.py index e3236d2e..48d8c56d 100644 --- a/tests/tests.py +++ b/tests/tests.py @@ -1,6 +1,6 @@ # generate-sitemap: Github action for automating sitemap generation # -# Copyright (c) 2020-2021 Vincent A Cicirello +# Copyright (c) 2020-2022 Vincent A Cicirello # https://www.cicirello.org/ # # MIT License @@ -294,11 +294,14 @@ def test_robotsBlocked(self) : "tests/unblocked1.html", "tests/unblocked2.html", "tests/unblocked3.html", - "tests/unblocked4.html" ] + "tests/unblocked4.html", + "tests/badCharsDoIndex.html"] blocked = [ "tests/blocked1.html", "tests/blocked2.html", "tests/blocked3.html", - "tests/blocked4.html" ] + "tests/blocked4.html", + "tests/badCharsNoindex1.html", + "tests/badCharsNoindex2.html"] for f in unblocked : self.assertFalse(gs.robotsBlocked(f)) for f in blocked : @@ -308,11 +311,14 @@ def test_hasMetaRobotsNoindex(self) : unblocked = [ "tests/unblocked1.html", "tests/unblocked2.html", "tests/unblocked3.html", - "tests/unblocked4.html" ] + "tests/unblocked4.html", + "tests/badCharsDoIndex.html" ] blocked = [ "tests/blocked1.html", "tests/blocked2.html", "tests/blocked3.html", - "tests/blocked4.html" ] + "tests/blocked4.html", + "tests/badCharsNoindex1.html", + "tests/badCharsNoindex2.html" ] for f in unblocked : self.assertFalse(gs.hasMetaRobotsNoindex(f)) for f in blocked : @@ -327,7 +333,12 @@ def test_gatherfiles_html(self) : "./blocked3.html", "./blocked4.html", "./unblocked1.html", "./unblocked2.html", "./unblocked3.html", "./unblocked4.html", - "./subdir/a.html", "./subdir/subdir/b.html"} + "./subdir/a.html", "./subdir/subdir/b.html", + "./badCharsNoindex1.html", + "./badCharsNoindex2.html", + "./badCharsDoIndex.html"} + if os.name == "nt" : + expected = { s.replace("/", "\\") for s in expected } self.assertEqual(asSet, expected) def test_gatherfiles_html_pdf(self) : @@ -341,7 +352,12 @@ def test_gatherfiles_html_pdf(self) : "./unblocked3.html", "./unblocked4.html", "./subdir/a.html", "./subdir/subdir/b.html", "./x.pdf", "./subdir/y.pdf", - "./subdir/subdir/z.pdf"} + "./subdir/subdir/z.pdf", + "./badCharsNoindex1.html", + "./badCharsNoindex2.html", + "./badCharsDoIndex.html"} + if os.name == "nt" : + expected = { s.replace("/", "\\") for s in expected } self.assertEqual(asSet, expected) def test_gatherfiles_pdf(self) : @@ -351,15 +367,21 @@ def test_gatherfiles_pdf(self) : asSet = set(allfiles) expected = { "./x.pdf", "./subdir/y.pdf", "./subdir/subdir/z.pdf"} + if os.name == "nt" : + expected = { s.replace("/", "\\") for s in expected } self.assertEqual(asSet, expected) def test_lastmod(self) : - os.chdir("tests") - dateStr = gs.lastmod("./unblocked1.html") - self.assertTrue(validateDate(dateStr), msg=dateStr) - dateStr = gs.lastmod("./subdir/a.html") - self.assertTrue(validateDate(dateStr), msg=dateStr) - os.chdir("..") + # assumes that if on windows must be running tests locally + # rather than in GitHub Actions, and may or may not be in a + # git repo, so simply skips this test. + if os.name != "nt" : + os.chdir("tests") + dateStr = gs.lastmod("./unblocked1.html") + self.assertTrue(validateDate(dateStr), msg=dateStr) + dateStr = gs.lastmod("./subdir/a.html") + self.assertTrue(validateDate(dateStr), msg=dateStr) + os.chdir("..") def test_urlstring(self) : filenames = [ "./a.html", @@ -471,7 +493,7 @@ def test_robotsTxtParser(self) : os.chdir("tests") for i, e in enumerate(expected) : filename = "robots" + str(i) + ".txt" - self.assertEqual(set(gs.parseRobotsTxt(filename)), set(e)) + self.assertEqual(set(gs.parseRobotsTxt(filename)), set(e), msg=filename) os.chdir("..") def test_robotsBlockedWithRobotsParser(self) :