Merge pull request #43 from cicirello/fix-non-utf8-chars
Fix robots noindex directive check when non-utf8 chars present
cicirello authored Apr 22, 2022
2 parents 3d5ae4e + c14c89f commit 41eec24
Showing 8 changed files with 223 additions and 52 deletions.
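For context, a minimal sketch of the failure mode this commit fixes; the file name and byte values are illustrative, and a UTF-8 locale is assumed. Before this change, the scanners opened files with Python's default strict decoding, so a single undecodable byte sequence raised UnicodeDecodeError; opening with errors="surrogateescape" maps undecodable bytes to lone surrogate code points, so the line-by-line scan can proceed.

# Sketch of the failure this commit fixes (file name and bytes illustrative).
head = b'<head>\n<meta name="robots" content="noindex">\n'
junk = bytes(range(128, 256))  # not decodable as UTF-8
with open("bad.html", "wb") as f:
    f.write(head + junk + b"\n</head>\n")

try:
    # Default strict decoding (the old behavior) chokes on the bad bytes.
    with open("bad.html", "r") as f:
        for line in f:
            pass
except UnicodeDecodeError as e:
    print("strict decode fails:", e)

# surrogateescape maps undecodable bytes to lone surrogates, so the
# line-by-line scan for the noindex directive still works.
with open("bad.html", "r", errors="surrogateescape") as f:
    print(any("noindex" in line for line in f))  # True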
17 changes: 14 additions & 3 deletions CHANGELOG.md
@@ -4,24 +4,35 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased] - 2022-03-31
## [Unreleased] - 2022-04-22

### Added

### Changed
* Bumped base Docker image cicirello/pyaction from 4.2.0 to 4.3.1.

### Deprecated

### Removed

### Fixed

### CI/CD

### Dependencies


## [1.8.3] - 2022-04-22

### Fixed
* Corrected the check for the robots noindex directive in the case where
  non-UTF-8 characters are present in an HTML file.
* Disabled pycache creation to guard against a potential future bug. The
  action currently imports no local Python modules, so no pycache is created;
  but if a future version imported local modules, a pycache would be created
  in the repository during the run. Disabling pycache now avoids that (see
  the hypothetical sketch after this file's diff).

### CI/CD
### Dependencies
* Bumped base Docker image cicirello/pyaction from 4.2.0 to 4.3.1.


## [1.8.2] - 2022-03-04
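The pycache changelog entry refers to a change that is not visible in this diff. For reference, a hypothetical sketch of the standard Python mechanisms for suppressing bytecode caching; this is not the repository's actual code, and the script name in the shell lines is illustrative.

# Hypothetical sketch: standard ways to prevent __pycache__ creation.
import sys

# Programmatically, before any local modules are imported:
sys.dont_write_bytecode = True

# Or from the shell, either of:
#   PYTHONDONTWRITEBYTECODE=1 python generatesitemap.py
#   python -B generatesitemap.py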
76 changes: 43 additions & 33 deletions generatesitemap.py
@@ -88,15 +88,19 @@ def hasMetaRobotsNoindex(f) :
    Keyword arguments:
    f - Filename including path
    """
    with open(f,"r") as file :
        for line in file :
            # Check line for <meta name="robots" content="noindex">, etc
            if re.search("<meta\s+name.+robots.+content.+noindex", line) != None :
                return True
            # We can stop searching once no longer in head of file.
            # <meta name="robots"> directives required to be in head
            if "<body>" in line or "</head>" in line :
                return False
    try:
        with open(f, "r", errors="surrogateescape") as file :
            for line in file :
                # Check line for <meta name="robots" content="noindex">, etc
                if re.search("<meta\s+name.+robots.+content.+noindex", line) != None :
                    return True
                # We can stop searching once no longer in head of file.
                # <meta name="robots"> directives required to be in head
                if "<body>" in line or "</head>" in line :
                    return False
    except OSError:
        print("WARNING: OS error while checking for noindex directive in:", f)
        print("Assuming", f, "doesn't have noindex directive.")
    return False

def getFileExtension(f) :
@@ -170,30 +174,36 @@ def parseRobotsTxt(robotsFile="robots.txt") :
    must be robots.txt (the default). The parameter is to enable
    unit testing with different robots.txt files."""
    blockedPaths = []
    if os.path.isfile(robotsFile) :
        with open(robotsFile,"r") as robots :
            foundBlock = False
            rulesStart = False
            for line in robots :
                commentStart = line.find("#")
                if commentStart > 0 :
                    line = line[:commentStart]
                line = line.strip()
                lineLow = line.lower()
                if foundBlock :
                    if rulesStart and lineLow.startswith("user-agent:") :
                        foundBlock = False
                    elif not rulesStart and lineLow.startswith("allow:") :
                        rulesStart = True
                    elif lineLow.startswith("disallow:") :
                        rulesStart = True
                        if len(line) > 9 :
                            path = line[9:].strip()
                            if len(path) > 0 and " " not in path and "\t" not in path:
                                blockedPaths.append(path)
                elif lineLow.startswith("user-agent:") and len(line)>11 and line[11:].strip() == "*" :
                    foundBlock = True
                    rulesStart = False
    try:
        if os.path.isfile(robotsFile) :
            with open(robotsFile, "r", errors="surrogateescape") as robots :
                foundBlock = False
                rulesStart = False
                for line in robots :
                    commentStart = line.find("#")
                    if commentStart > 0 :
                        line = line[:commentStart]
                    line = line.strip()
                    lineLow = line.lower()
                    if lineLow.startswith("user-agent:") :
                        if len(line)>11 and line[11:].strip() == "*" :
                            foundBlock = True
                            rulesStart = False
                        elif rulesStart :
                            foundBlock = False
                            rulesStart = False
                    elif foundBlock :
                        if lineLow.startswith("allow:") :
                            rulesStart = True
                        elif lineLow.startswith("disallow:") :
                            rulesStart = True
                            if len(line) > 9 :
                                path = line[9:].strip()
                                if len(path) > 0 and " " not in path and "\t" not in path:
                                    blockedPaths.append(path)
    except OSError:
        print("WARNING: OS error while parsing robots.txt")
        print("Assuming nothing disallowed.")
    return blockedPaths

def lastmod(f) :
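A usage sketch of the rewritten parseRobotsTxt; the robots.txt content below is illustrative, and it assumes the snippet runs alongside generatesitemap.py. Only Disallow paths inside the `User-agent: *` block are collected, trailing comments are stripped, Allow lines merely mark that rules have started, and a subsequent User-agent line after rules have started closes the block.

# Usage sketch for parseRobotsTxt (shown in the diff above).
import generatesitemap as gs

demo = """User-agent: googlebot
Disallow: /only-for-google/

User-agent: *
Allow: /public/
Disallow: /private/
Disallow: /tmp/  # trailing comments are stripped

User-agent: bingbot
Disallow: /only-for-bing/
"""
with open("robots-demo.txt", "w") as f:
    f.write(demo)

# Only Disallow paths in the `User-agent: *` block are collected; the
# bingbot line ends that block, so its rule is ignored.
print(gs.parseRobotsTxt("robots-demo.txt"))  # ['/private/', '/tmp/']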
14 changes: 14 additions & 0 deletions tests/badCharsDoIndex.html
@@ -0,0 +1,14 @@
<!DOCTYPE html>
<html lang=en>
<head>
<meta charset=utf-8>
<link rel="canonical" href="https://SOME.WEBSITE.WOULD.GO.HERE....">

�亗儎厗噲墛媽崕彁憭摂晼棙櫄洔潪煚、¥ウЖ┆辈炒刀犯购患骄坷谅媚牌侨墒颂臀闲岩釉罩棕仝圮蒉哙徕沅彐玷殛腱眍镳耱篝貊鼬�

<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="title" content="Title Goes HERE">
</head>
<body>
</body>
</html>
18 changes: 18 additions & 0 deletions tests/badCharsNoindex1.html
@@ -0,0 +1,18 @@
<!DOCTYPE html>
<html lang=en>
<head>
<meta charset=utf-8>
<link rel="canonical" href="https://SOME.WEBSITE.WOULD.GO.HERE....">



<meta name="robots" content="noindex">

�亗儎厗噲墛媽崕彁憭摂晼棙櫄洔潪煚、¥ウЖ┆辈炒刀犯购患骄坷谅媚牌侨墒颂臀闲岩釉罩棕仝圮蒉哙徕沅彐玷殛腱眍镳耱篝貊鼬�

<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="title" content="Title Goes HERE">
</head>
<body>
</body>
</html>
18 changes: 18 additions & 0 deletions tests/badCharsNoindex2.html
@@ -0,0 +1,18 @@
<!DOCTYPE html>
<html lang=en>
<head>
<meta charset=utf-8>
<link rel="canonical" href="https://SOME.WEBSITE.WOULD.GO.HERE....">

�亗儎厗噲墛媽崕彁憭摂晼棙櫄洔潪煚、¥ウЖ┆辈炒刀犯购患骄坷谅媚牌侨墒颂臀闲岩釉罩棕仝圮蒉哙徕沅彐玷殛腱眍镳耱篝貊鼬�

<meta name="robots" content="noindex">



<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="title" content="Title Goes HERE">
</head>
<body>
</body>
</html>
76 changes: 76 additions & 0 deletions tests/gentestdata.py
@@ -0,0 +1,76 @@
# generate-sitemap: Github action for automating sitemap generation
#
# Copyright (c) 2020-2022 Vincent A Cicirello
# https://www.cicirello.org/
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#

if __name__ == "__main__" :

    beginning = """<!DOCTYPE html>
<html lang=en>
<head>
<meta charset=utf-8>
<link rel="canonical" href="https://SOME.WEBSITE.WOULD.GO.HERE....">
"""

    ending = """
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="title" content="Title Goes HERE">
</head>
<body>
</body>
</html>
"""

    noindex = """
<meta name="robots" content="noindex">
"""

    nonCharData = [ x for x in range(128, 256) ]

    with open("badCharsNoindex1.html", "w") as f :
        f.write(beginning)
        f.write(noindex)
    with open("badCharsNoindex1.html", "ab") as f :
        f.write(bytes(nonCharData))
    with open("badCharsNoindex1.html", "a") as f :
        f.write(ending)

    with open("badCharsNoindex2.html", "w") as f :
        f.write(beginning)
    with open("badCharsNoindex2.html", "ab") as f :
        f.write(bytes(nonCharData))
    with open("badCharsNoindex2.html", "a") as f :
        f.write(noindex)
        f.write(ending)

    with open("badCharsDoIndex.html", "w") as f :
        f.write(beginning)
    with open("badCharsDoIndex.html", "ab") as f :
        f.write(bytes(nonCharData))
    with open("badCharsDoIndex.html", "a") as f :
        f.write(ending)
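The script interleaves text-mode and binary-mode appends so that the raw bytes 128-255 land in the files unencoded. A quick sketch to confirm a generated file really is not valid UTF-8, assuming gentestdata.py was just run in the same directory:

# Sketch: confirm a generated test file is undecodable as UTF-8.
try:
    with open("badCharsNoindex1.html", encoding="utf-8") as f:
        f.read()
    print("decoded cleanly (unexpected)")
except UnicodeDecodeError as e:
    print("not valid UTF-8, as intended:", e)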
6 changes: 4 additions & 2 deletions tests/integration.py
@@ -71,7 +71,8 @@ def testIntegration(self) :
"https://TESTING.FAKE.WEB.ADDRESS.TESTING/subdir/a.html",
"https://TESTING.FAKE.WEB.ADDRESS.TESTING/x.pdf",
"https://TESTING.FAKE.WEB.ADDRESS.TESTING/subdir/subdir/z.pdf",
"https://TESTING.FAKE.WEB.ADDRESS.TESTING/uncommitted.html"
"https://TESTING.FAKE.WEB.ADDRESS.TESTING/uncommitted.html",
"https://TESTING.FAKE.WEB.ADDRESS.TESTING/badCharsDoIndex.html"
}
self.assertEqual(expected, urlset)

@@ -91,7 +92,8 @@ def testIntegrationWithAdditionalTypes(self) :
"https://TESTING.FAKE.WEB.ADDRESS.TESTING/subdir/subdir/z.pdf",
"https://TESTING.FAKE.WEB.ADDRESS.TESTING/include.docx",
"https://TESTING.FAKE.WEB.ADDRESS.TESTING/include.pptx",
"https://TESTING.FAKE.WEB.ADDRESS.TESTING/uncommitted.html"
"https://TESTING.FAKE.WEB.ADDRESS.TESTING/uncommitted.html",
"https://TESTING.FAKE.WEB.ADDRESS.TESTING/badCharsDoIndex.html"
}
self.assertEqual(expected, urlset)

50 changes: 36 additions & 14 deletions tests/tests.py
@@ -1,6 +1,6 @@
# generate-sitemap: Github action for automating sitemap generation
#
# Copyright (c) 2020-2021 Vincent A Cicirello
# Copyright (c) 2020-2022 Vincent A Cicirello
# https://www.cicirello.org/
#
# MIT License
@@ -294,11 +294,14 @@ def test_robotsBlocked(self) :
"tests/unblocked1.html",
"tests/unblocked2.html",
"tests/unblocked3.html",
"tests/unblocked4.html" ]
"tests/unblocked4.html",
"tests/badCharsDoIndex.html"]
blocked = [ "tests/blocked1.html",
"tests/blocked2.html",
"tests/blocked3.html",
"tests/blocked4.html" ]
"tests/blocked4.html",
"tests/badCharsNoindex1.html",
"tests/badCharsNoindex2.html"]
for f in unblocked :
self.assertFalse(gs.robotsBlocked(f))
for f in blocked :
@@ -308,11 +311,14 @@ def test_hasMetaRobotsNoindex(self) :
        unblocked = [ "tests/unblocked1.html",
                      "tests/unblocked2.html",
                      "tests/unblocked3.html",
                      "tests/unblocked4.html" ]
                      "tests/unblocked4.html",
                      "tests/badCharsDoIndex.html" ]
        blocked = [ "tests/blocked1.html",
                    "tests/blocked2.html",
                    "tests/blocked3.html",
                    "tests/blocked4.html" ]
                    "tests/blocked4.html",
                    "tests/badCharsNoindex1.html",
                    "tests/badCharsNoindex2.html" ]
        for f in unblocked :
            self.assertFalse(gs.hasMetaRobotsNoindex(f))
        for f in blocked :
@@ -327,7 +333,12 @@ def test_gatherfiles_html(self) :
"./blocked3.html", "./blocked4.html",
"./unblocked1.html", "./unblocked2.html",
"./unblocked3.html", "./unblocked4.html",
"./subdir/a.html", "./subdir/subdir/b.html"}
"./subdir/a.html", "./subdir/subdir/b.html",
"./badCharsNoindex1.html",
"./badCharsNoindex2.html",
"./badCharsDoIndex.html"}
if os.name == "nt" :
expected = { s.replace("/", "\\") for s in expected }
self.assertEqual(asSet, expected)

def test_gatherfiles_html_pdf(self) :
@@ -341,7 +352,12 @@ def test_gatherfiles_html_pdf(self) :
"./unblocked3.html", "./unblocked4.html",
"./subdir/a.html", "./subdir/subdir/b.html",
"./x.pdf", "./subdir/y.pdf",
"./subdir/subdir/z.pdf"}
"./subdir/subdir/z.pdf",
"./badCharsNoindex1.html",
"./badCharsNoindex2.html",
"./badCharsDoIndex.html"}
if os.name == "nt" :
expected = { s.replace("/", "\\") for s in expected }
self.assertEqual(asSet, expected)

def test_gatherfiles_pdf(self) :
@@ -351,15 +367,21 @@ def test_gatherfiles_pdf(self) :
        asSet = set(allfiles)
        expected = { "./x.pdf", "./subdir/y.pdf",
                     "./subdir/subdir/z.pdf"}
        if os.name == "nt" :
            expected = { s.replace("/", "\\") for s in expected }
        self.assertEqual(asSet, expected)

    def test_lastmod(self) :
        os.chdir("tests")
        dateStr = gs.lastmod("./unblocked1.html")
        self.assertTrue(validateDate(dateStr), msg=dateStr)
        dateStr = gs.lastmod("./subdir/a.html")
        self.assertTrue(validateDate(dateStr), msg=dateStr)
        os.chdir("..")
        # Assumes that on Windows the tests must be running locally rather
        # than in GitHub Actions, and may or may not be in a git repo, so
        # this test is simply skipped.
        if os.name != "nt" :
            os.chdir("tests")
            dateStr = gs.lastmod("./unblocked1.html")
            self.assertTrue(validateDate(dateStr), msg=dateStr)
            dateStr = gs.lastmod("./subdir/a.html")
            self.assertTrue(validateDate(dateStr), msg=dateStr)
            os.chdir("..")

    def test_urlstring(self) :
        filenames = [ "./a.html",
@@ -471,7 +493,7 @@ def test_robotsTxtParser(self) :
os.chdir("tests")
for i, e in enumerate(expected) :
filename = "robots" + str(i) + ".txt"
self.assertEqual(set(gs.parseRobotsTxt(filename)), set(e))
self.assertEqual(set(gs.parseRobotsTxt(filename)), set(e), msg=filename)
os.chdir("..")

    def test_robotsBlockedWithRobotsParser(self) :
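An interactive sketch of what the new test cases assert, assuming the snippet runs from the repository root after tests/gentestdata.py has produced the three files:

# Sketch: the noindex directive is found even with raw non-UTF-8 bytes
# in the file, whether the directive precedes or follows those bytes.
import generatesitemap as gs

print(gs.hasMetaRobotsNoindex("tests/badCharsNoindex1.html"))  # True
print(gs.hasMetaRobotsNoindex("tests/badCharsNoindex2.html"))  # True
print(gs.hasMetaRobotsNoindex("tests/badCharsDoIndex.html"))   # False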
