Merge pull request #43 from cicirello/fix-non-utf8-chars
Fix robots noindex directive check when non-utf8 chars present
cicirello authored Apr 22, 2022
2 parents 3d5ae4e + c14c89f commit 41eec24
Showing 8 changed files with 223 additions and 52 deletions.
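For context, a minimal sketch of the failure mode this commit fixes; the file name and byte values are illustrative, and a UTF-8 locale is assumed. Before this change, the scanners opened files with Python's default strict decoding, so a single undecodable byte sequence raised UnicodeDecodeError; opening with errors="surrogateescape" maps undecodable bytes to lone surrogate code points, so the line-by-line scan can proceed.

# Sketch of the failure this commit fixes (file name and bytes illustrative).
head = b'<head>\n<meta name="robots" content="noindex">\n'
junk = bytes(range(128, 256))  # not decodable as UTF-8
with open("bad.html", "wb") as f:
    f.write(head + junk + b"\n</head>\n")

try:
    # Default strict decoding (the old behavior) chokes on the bad bytes.
    with open("bad.html", "r") as f:
        for line in f:
            pass
except UnicodeDecodeError as e:
    print("strict decode fails:", e)

# surrogateescape maps undecodable bytes to lone surrogates, so the
# line-by-line scan for the noindex directive still works.
with open("bad.html", "r", errors="surrogateescape") as f:
    print(any("noindex" in line for line in f))  # True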
17 changes: 14 additions & 3 deletions CHANGELOG.md
@@ -4,24 +4,35 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased] - 2022-03-31
## [Unreleased] - 2022-04-22

### Added

### Changed
* Bumped base Docker image cicirello/pyaction from 4.2.0 to 4.3.1.

### Deprecated

### Removed

### Fixed

### CI/CD

### Dependencies


## [1.8.3] - 2022-04-22

### Fixed
* Corrected the check for the robots noindex directive in the case where
  non-UTF-8 characters are present in an HTML file.
* Disabled pycache creation to guard against a potential future bug. The
  action currently imports no local Python modules, so no pycache is created;
  but if a future version imported local modules, a pycache would be created
  in the repository during the run. Disabling pycache now avoids that (see
  the hypothetical sketch after this file's diff).

### CI/CD
### Dependencies
* Bumped base Docker image cicirello/pyaction from 4.2.0 to 4.3.1.


## [1.8.2] - 2022-03-04
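The pycache changelog entry refers to a change that is not visible in this diff. For reference, a hypothetical sketch of the standard Python mechanisms for suppressing bytecode caching; this is not the repository's actual code, and the script name in the shell lines is illustrative.

# Hypothetical sketch: standard ways to prevent __pycache__ creation.
import sys

# Programmatically, before any local modules are imported:
sys.dont_write_bytecode = True

# Or from the shell, either of:
#   PYTHONDONTWRITEBYTECODE=1 python generatesitemap.py
#   python -B generatesitemap.py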
76 changes: 43 additions & 33 deletions generatesitemap.py
@@ -88,15 +88,19 @@ def hasMetaRobotsNoindex(f) :
    Keyword arguments:
    f - Filename including path
    """
    with open(f,"r") as file :
        for line in file :
            # Check line for <meta name="robots" content="noindex">, etc
            if re.search("<meta\s+name.+robots.+content.+noindex", line) != None :
                return True
            # We can stop searching once no longer in head of file.
            # <meta name="robots"> directives required to be in head
            if "<body>" in line or "</head>" in line :
                return False
    try:
        with open(f, "r", errors="surrogateescape") as file :
            for line in file :
                # Check line for <meta name="robots" content="noindex">, etc
                if re.search("<meta\s+name.+robots.+content.+noindex", line) != None :
                    return True
                # We can stop searching once no longer in head of file.
                # <meta name="robots"> directives required to be in head
                if "<body>" in line or "</head>" in line :
                    return False
    except OSError:
        print("WARNING: OS error while checking for noindex directive in:", f)
        print("Assuming", f, "doesn't have noindex directive.")
    return False

def getFileExtension(f) :
@@ -170,30 +174,36 @@ def parseRobotsTxt(robotsFile="robots.txt") :
    must be robots.txt (the default). The parameter is to enable
    unit testing with different robots.txt files."""
    blockedPaths = []
    if os.path.isfile(robotsFile) :
        with open(robotsFile,"r") as robots :
            foundBlock = False
            rulesStart = False
            for line in robots :
                commentStart = line.find("#")
                if commentStart > 0 :
                    line = line[:commentStart]
                line = line.strip()
                lineLow = line.lower()
                if foundBlock :
                    if rulesStart and lineLow.startswith("user-agent:") :
                        foundBlock = False
                    elif not rulesStart and lineLow.startswith("allow:") :
                        rulesStart = True
                    elif lineLow.startswith("disallow:") :
                        rulesStart = True
                        if len(line) > 9 :
                            path = line[9:].strip()
                            if len(path) > 0 and " " not in path and "\t" not in path:
                                blockedPaths.append(path)
                elif lineLow.startswith("user-agent:") and len(line)>11 and line[11:].strip() == "*" :
                    foundBlock = True
                    rulesStart = False
    try:
        if os.path.isfile(robotsFile) :
            with open(robotsFile, "r", errors="surrogateescape") as robots :
                foundBlock = False
                rulesStart = False
                for line in robots :
                    commentStart = line.find("#")
                    if commentStart > 0 :
                        line = line[:commentStart]
                    line = line.strip()
                    lineLow = line.lower()
                    if lineLow.startswith("user-agent:") :
                        if len(line)>11 and line[11:].strip() == "*" :
                            foundBlock = True
                            rulesStart = False
                        elif rulesStart :
                            foundBlock = False
                            rulesStart = False
                    elif foundBlock :
                        if lineLow.startswith("allow:") :
                            rulesStart = True
                        elif lineLow.startswith("disallow:") :
                            rulesStart = True
                            if len(line) > 9 :
                                path = line[9:].strip()
                                if len(path) > 0 and " " not in path and "\t" not in path:
                                    blockedPaths.append(path)
    except OSError:
        print("WARNING: OS error while parsing robots.txt")
        print("Assuming nothing disallowed.")
    return blockedPaths

def lastmod(f) :
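A usage sketch of the rewritten parseRobotsTxt; the robots.txt content below is illustrative, and it assumes the snippet runs alongside generatesitemap.py. Only Disallow paths inside the `User-agent: *` block are collected, trailing comments are stripped, Allow lines merely mark that rules have started, and a subsequent User-agent line after rules have started closes the block.

# Usage sketch for parseRobotsTxt (shown in the diff above).
import generatesitemap as gs

demo = """User-agent: googlebot
Disallow: /only-for-google/

User-agent: *
Allow: /public/
Disallow: /private/
Disallow: /tmp/  # trailing comments are stripped

User-agent: bingbot
Disallow: /only-for-bing/
"""
with open("robots-demo.txt", "w") as f:
    f.write(demo)

# Only Disallow paths in the `User-agent: *` block are collected; the
# bingbot line ends that block, so its rule is ignored.
print(gs.parseRobotsTxt("robots-demo.txt"))  # ['/private/', '/tmp/']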
14 changes: 14 additions & 0 deletions tests/badCharsDoIndex.html
@@ -0,0 +1,14 @@
<!DOCTYPE html>
<html lang=en>
<head>
<meta charset=utf-8>
<link rel="canonical" href="https://SOME.WEBSITE.WOULD.GO.HERE....">

�亗儎厗噲墛媽崕彁憭摂晼棙櫄洔潪煚、¥ウЖ┆辈炒刀犯购患骄坷谅媚牌侨墒颂臀闲岩釉罩棕仝圮蒉哙徕沅彐玷殛腱眍镳耱篝貊鼬�

<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="title" content="Title Goes HERE">
</head>
<body>
</body>
</html>
18 changes: 18 additions & 0 deletions tests/badCharsNoindex1.html
@@ -0,0 +1,18 @@
<!DOCTYPE html>
<html lang=en>
<head>
<meta charset=utf-8>
<link rel="canonical" href="https://SOME.WEBSITE.WOULD.GO.HERE....">



<meta name="robots" content="noindex">

�亗儎厗噲墛媽崕彁憭摂晼棙櫄洔潪煚、¥ウЖ┆辈炒刀犯购患骄坷谅媚牌侨墒颂臀闲岩釉罩棕仝圮蒉哙徕沅彐玷殛腱眍镳耱篝貊鼬�

<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="title" content="Title Goes HERE">
</head>
<body>
</body>
</html>
18 changes: 18 additions & 0 deletions tests/badCharsNoindex2.html
@@ -0,0 +1,18 @@
<!DOCTYPE html>
<html lang=en>
<head>
<meta charset=utf-8>
<link rel="canonical" href="https://SOME.WEBSITE.WOULD.GO.HERE....">

�亗儎厗噲墛媽崕彁憭摂晼棙櫄洔潪煚、¥ウЖ┆辈炒刀犯购患骄坷谅媚牌侨墒颂臀闲岩釉罩棕仝圮蒉哙徕沅彐玷殛腱眍镳耱篝貊鼬�

<meta name="robots" content="noindex">



<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="title" content="Title Goes HERE">
</head>
<body>
</body>
</html>
76 changes: 76 additions & 0 deletions tests/gentestdata.py
@@ -0,0 +1,76 @@
# generate-sitemap: Github action for automating sitemap generation
#
# Copyright (c) 2020-2022 Vincent A Cicirello
# https://www.cicirello.org/
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#

if __name__ == "__main__" :

    beginning = """<!DOCTYPE html>
<html lang=en>
<head>
<meta charset=utf-8>
<link rel="canonical" href="https://SOME.WEBSITE.WOULD.GO.HERE....">
"""

    ending = """
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="title" content="Title Goes HERE">
</head>
<body>
</body>
</html>
"""

    noindex = """
<meta name="robots" content="noindex">
"""

    nonCharData = [ x for x in range(128, 256) ]

    with open("badCharsNoindex1.html", "w") as f :
        f.write(beginning)
        f.write(noindex)
    with open("badCharsNoindex1.html", "ab") as f :
        f.write(bytes(nonCharData))
    with open("badCharsNoindex1.html", "a") as f :
        f.write(ending)

    with open("badCharsNoindex2.html", "w") as f :
        f.write(beginning)
    with open("badCharsNoindex2.html", "ab") as f :
        f.write(bytes(nonCharData))
    with open("badCharsNoindex2.html", "a") as f :
        f.write(noindex)
        f.write(ending)

    with open("badCharsDoIndex.html", "w") as f :
        f.write(beginning)
    with open("badCharsDoIndex.html", "ab") as f :
        f.write(bytes(nonCharData))
    with open("badCharsDoIndex.html", "a") as f :
        f.write(ending)
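The script interleaves text-mode and binary-mode appends so that the raw bytes 128-255 land in the files unencoded. A quick sketch to confirm a generated file really is not valid UTF-8, assuming gentestdata.py was just run in the same directory:

# Sketch: confirm a generated test file is undecodable as UTF-8.
try:
    with open("badCharsNoindex1.html", encoding="utf-8") as f:
        f.read()
    print("decoded cleanly (unexpected)")
except UnicodeDecodeError as e:
    print("not valid UTF-8, as intended:", e)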
6 changes: 4 additions & 2 deletions tests/integration.py
@@ -71,7 +71,8 @@ def testIntegration(self) :
"https://TESTING.FAKE.WEB.ADDRESS.TESTING/subdir/a.html",
"https://TESTING.FAKE.WEB.ADDRESS.TESTING/x.pdf",
"https://TESTING.FAKE.WEB.ADDRESS.TESTING/subdir/subdir/z.pdf",
"https://TESTING.FAKE.WEB.ADDRESS.TESTING/uncommitted.html"
"https://TESTING.FAKE.WEB.ADDRESS.TESTING/uncommitted.html",
"https://TESTING.FAKE.WEB.ADDRESS.TESTING/badCharsDoIndex.html"
}
self.assertEqual(expected, urlset)

@@ -91,7 +92,8 @@ def testIntegrationWithAdditionalTypes(self) :
"https://TESTING.FAKE.WEB.ADDRESS.TESTING/subdir/subdir/z.pdf",
"https://TESTING.FAKE.WEB.ADDRESS.TESTING/include.docx",
"https://TESTING.FAKE.WEB.ADDRESS.TESTING/include.pptx",
"https://TESTING.FAKE.WEB.ADDRESS.TESTING/uncommitted.html"
"https://TESTING.FAKE.WEB.ADDRESS.TESTING/uncommitted.html",
"https://TESTING.FAKE.WEB.ADDRESS.TESTING/badCharsDoIndex.html"
}
self.assertEqual(expected, urlset)

50 changes: 36 additions & 14 deletions tests/tests.py
@@ -1,6 +1,6 @@
# generate-sitemap: Github action for automating sitemap generation
#
# Copyright (c) 2020-2021 Vincent A Cicirello
# Copyright (c) 2020-2022 Vincent A Cicirello
# https://www.cicirello.org/
#
# MIT License
@@ -294,11 +294,14 @@ def test_robotsBlocked(self) :
"tests/unblocked1.html",
"tests/unblocked2.html",
"tests/unblocked3.html",
"tests/unblocked4.html" ]
"tests/unblocked4.html",
"tests/badCharsDoIndex.html"]
blocked = [ "tests/blocked1.html",
"tests/blocked2.html",
"tests/blocked3.html",
"tests/blocked4.html" ]
"tests/blocked4.html",
"tests/badCharsNoindex1.html",
"tests/badCharsNoindex2.html"]
for f in unblocked :
self.assertFalse(gs.robotsBlocked(f))
for f in blocked :
@@ -308,11 +311,14 @@ def test_hasMetaRobotsNoindex(self) :
        unblocked = [ "tests/unblocked1.html",
                      "tests/unblocked2.html",
                      "tests/unblocked3.html",
                      "tests/unblocked4.html" ]
                      "tests/unblocked4.html",
                      "tests/badCharsDoIndex.html" ]
        blocked = [ "tests/blocked1.html",
                    "tests/blocked2.html",
                    "tests/blocked3.html",
                    "tests/blocked4.html" ]
                    "tests/blocked4.html",
                    "tests/badCharsNoindex1.html",
                    "tests/badCharsNoindex2.html" ]
        for f in unblocked :
            self.assertFalse(gs.hasMetaRobotsNoindex(f))
        for f in blocked :
@@ -327,7 +333,12 @@ def test_gatherfiles_html(self) :
"./blocked3.html", "./blocked4.html",
"./unblocked1.html", "./unblocked2.html",
"./unblocked3.html", "./unblocked4.html",
"./subdir/a.html", "./subdir/subdir/b.html"}
"./subdir/a.html", "./subdir/subdir/b.html",
"./badCharsNoindex1.html",
"./badCharsNoindex2.html",
"./badCharsDoIndex.html"}
if os.name == "nt" :
expected = { s.replace("/", "\\") for s in expected }
self.assertEqual(asSet, expected)

def test_gatherfiles_html_pdf(self) :
@@ -341,7 +352,12 @@ def test_gatherfiles_html_pdf(self) :
"./unblocked3.html", "./unblocked4.html",
"./subdir/a.html", "./subdir/subdir/b.html",
"./x.pdf", "./subdir/y.pdf",
"./subdir/subdir/z.pdf"}
"./subdir/subdir/z.pdf",
"./badCharsNoindex1.html",
"./badCharsNoindex2.html",
"./badCharsDoIndex.html"}
if os.name == "nt" :
expected = { s.replace("/", "\\") for s in expected }
self.assertEqual(asSet, expected)

def test_gatherfiles_pdf(self) :
@@ -351,15 +367,21 @@ def test_gatherfiles_pdf(self) :
        asSet = set(allfiles)
        expected = { "./x.pdf", "./subdir/y.pdf",
                     "./subdir/subdir/z.pdf"}
        if os.name == "nt" :
            expected = { s.replace("/", "\\") for s in expected }
        self.assertEqual(asSet, expected)

    def test_lastmod(self) :
        os.chdir("tests")
        dateStr = gs.lastmod("./unblocked1.html")
        self.assertTrue(validateDate(dateStr), msg=dateStr)
        dateStr = gs.lastmod("./subdir/a.html")
        self.assertTrue(validateDate(dateStr), msg=dateStr)
        os.chdir("..")
        # Assumes that on Windows the tests must be running locally rather
        # than in GitHub Actions, and may or may not be in a git repo, so
        # this test is simply skipped.
        if os.name != "nt" :
            os.chdir("tests")
            dateStr = gs.lastmod("./unblocked1.html")
            self.assertTrue(validateDate(dateStr), msg=dateStr)
            dateStr = gs.lastmod("./subdir/a.html")
            self.assertTrue(validateDate(dateStr), msg=dateStr)
            os.chdir("..")

    def test_urlstring(self) :
        filenames = [ "./a.html",
@@ -471,7 +493,7 @@ def test_robotsTxtParser(self) :
os.chdir("tests")
for i, e in enumerate(expected) :
filename = "robots" + str(i) + ".txt"
self.assertEqual(set(gs.parseRobotsTxt(filename)), set(e))
self.assertEqual(set(gs.parseRobotsTxt(filename)), set(e), msg=filename)
os.chdir("..")

    def test_robotsBlockedWithRobotsParser(self) :
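An interactive sketch of what the new test cases assert, assuming the snippet runs from the repository root after tests/gentestdata.py has produced the three files:

# Sketch: the noindex directive is found even with raw non-UTF-8 bytes
# in the file, whether the directive precedes or follows those bytes.
import generatesitemap as gs

print(gs.hasMetaRobotsNoindex("tests/badCharsNoindex1.html"))  # True
print(gs.hasMetaRobotsNoindex("tests/badCharsNoindex2.html"))  # True
print(gs.hasMetaRobotsNoindex("tests/badCharsDoIndex.html"))   # False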
