Fix robots noindex directive check when non-utf8 chars present #43

Merged · 6 commits · Apr 22, 2022
17 changes: 14 additions & 3 deletions CHANGELOG.md
@@ -4,24 +4,35 @@
All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased] - 2022-03-31
## [Unreleased] - 2022-04-22

### Added

### Changed
* Bumped base Docker image cicirello/pyaction from 4.2.0 to 4.3.1.

### Deprecated

### Removed

### Fixed

### CI/CD

### Dependencies


## [1.8.3] - 2022-04-22

### Fixed
* Corrected the check for the robots noindex directive when non-UTF-8
  characters are present in an HTML file.
* Disabled pycache creation to protect against a potential future bug. The
  script currently has no local imports, so no pycache is created, but if a
  future version imported local Python modules, a pycache would be created
  in the repository during a run. Creation is disabled now to avoid that
  (see the sketch after this diff).

### CI/CD
### Dependencies
* Bumped base Docker image cicirello/pyaction from 4.2.0 to 4.3.1.


## [1.8.2] - 2022-03-04
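
The pycache entry above refers to a change whose mechanism is not visible in the hunks shown on this page. A minimal sketch, assuming the standard Python controls rather than whatever this action actually uses:

# Either approach prevents Python from writing .pyc files when local
# modules are imported during a run.
import sys
sys.dont_write_bytecode = True  # set before importing any local modules

The same effect is available from outside the process by setting the PYTHONDONTWRITEBYTECODE environment variable or by running the interpreter with python -B.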
76 changes: 43 additions & 33 deletions generatesitemap.py
@@ -88,15 +88,19 @@ def hasMetaRobotsNoindex(f) :
Keyword arguments:
f - Filename including path
"""
with open(f,"r") as file :
for line in file :
# Check line for <meta name="robots" content="noindex">, etc
if re.search("<meta\s+name.+robots.+content.+noindex", line) != None :
return True
# We can stop searching once no longer in head of file.
# <meta name="robots"> directives required to be in head
if "<body>" in line or "</head>" in line :
return False
try:
with open(f, "r", errors="surrogateescape") as file :
for line in file :
# Check line for <meta name="robots" content="noindex">, etc
if re.search("<meta\s+name.+robots.+content.+noindex", line) != None :
return True
# We can stop searching once no longer in head of file.
# <meta name="robots"> directives required to be in head
if "<body>" in line or "</head>" in line :
return False
except OSError:
print("WARNING: OS error while checking for noindex directive in:", f)
print("Assuming", f, "doesn't have noindex directive.")
return False
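
A minimal sketch (hypothetical file name, not part of this PR) of why errors="surrogateescape" matters here: a strict UTF-8 text read raises UnicodeDecodeError at the first invalid byte, while surrogateescape maps each undecodable byte to a lone surrogate so reading can continue past it.

# Build a file mixing a valid noindex directive with raw non-UTF-8 bytes,
# similar to the tests/badChars*.html fixtures added by this PR.
with open("mixed.html", "wb") as f:
    f.write(b'<meta name="robots" content="noindex">\n')
    f.write(bytes(range(128, 256)))  # bytes that are invalid as UTF-8

# A strict text-mode read fails at the first bad byte.
try:
    with open("mixed.html", "r", encoding="utf-8") as f:
        f.read()
except UnicodeDecodeError as e:
    print("strict read fails:", e)

# With surrogateescape, the whole file reads and the directive is found.
with open("mixed.html", "r", encoding="utf-8", errors="surrogateescape") as f:
    print("noindex" in f.read())  # True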

def getFileExtension(f) :
@@ -170,30 +174,36 @@ def parseRobotsTxt(robotsFile="robots.txt") :
must be robots.txt (the default). The parameter is to enable
unit testing with different robots.txt files."""
blockedPaths = []
if os.path.isfile(robotsFile) :
with open(robotsFile,"r") as robots :
foundBlock = False
rulesStart = False
for line in robots :
commentStart = line.find("#")
if commentStart > 0 :
line = line[:commentStart]
line = line.strip()
lineLow = line.lower()
if foundBlock :
if rulesStart and lineLow.startswith("user-agent:") :
foundBlock = False
elif not rulesStart and lineLow.startswith("allow:") :
rulesStart = True
elif lineLow.startswith("disallow:") :
rulesStart = True
if len(line) > 9 :
path = line[9:].strip()
if len(path) > 0 and " " not in path and "\t" not in path:
blockedPaths.append(path)
elif lineLow.startswith("user-agent:") and len(line)>11 and line[11:].strip() == "*" :
foundBlock = True
rulesStart = False
try:
if os.path.isfile(robotsFile) :
with open(robotsFile, "r", errors="surrogateescape") as robots :
foundBlock = False
rulesStart = False
for line in robots :
commentStart = line.find("#")
if commentStart > 0 :
line = line[:commentStart]
line = line.strip()
lineLow = line.lower()
if lineLow.startswith("user-agent:") :
if len(line)>11 and line[11:].strip() == "*" :
foundBlock = True
rulesStart = False
elif rulesStart :
foundBlock = False
rulesStart = False
elif foundBlock :
if lineLow.startswith("allow:") :
rulesStart = True
elif lineLow.startswith("disallow:") :
rulesStart = True
if len(line) > 9 :
path = line[9:].strip()
if len(path) > 0 and " " not in path and "\t" not in path:
blockedPaths.append(path)
except OSError:
print("WARNING: OS error while parsing robots.txt")
print("Assuming nothing disallowed.")
return blockedPaths
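
A usage sketch of the parser above, with hypothetical robots.txt contents: only Disallow: paths inside the User-agent: * block are collected, trailing comments are stripped, and a subsequent block for a named agent ends the wildcard block.

import generatesitemap as gs  # the module under test, as in tests/tests.py

# Hypothetical robots.txt exercising the wildcard-block logic.
sample = (
    "User-agent: googlebot\n"
    "Disallow: /googlebot-only/\n"  # ignored: applies to a named agent
    "\n"
    "User-agent: *\n"
    "Disallow: /private/ # trailing comment\n"
    "Disallow: /tmp/\n"
)
with open("robots_example.txt", "w") as f:
    f.write(sample)

print(gs.parseRobotsTxt("robots_example.txt"))  # ['/private/', '/tmp/']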

def lastmod(f) :
14 changes: 14 additions & 0 deletions tests/badCharsDoIndex.html
@@ -0,0 +1,14 @@
<!DOCTYPE html>
<html lang=en>
<head>
<meta charset=utf-8>
<link rel="canonical" href="https://SOME.WEBSITE.WOULD.GO.HERE....">

[128 raw bytes 0x80–0xFF: intentionally invalid UTF-8, rendered here as replacement characters]

<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="title" content="Title Goes HERE">
</head>
<body>
</body>
</html>
18 changes: 18 additions & 0 deletions tests/badCharsNoindex1.html
@@ -0,0 +1,18 @@
<!DOCTYPE html>
<html lang=en>
<head>
<meta charset=utf-8>
<link rel="canonical" href="https://SOME.WEBSITE.WOULD.GO.HERE....">



<meta name="robots" content="noindex">

[128 raw bytes 0x80–0xFF: intentionally invalid UTF-8, rendered here as replacement characters]

<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="title" content="Title Goes HERE">
</head>
<body>
</body>
</html>
18 changes: 18 additions & 0 deletions tests/badCharsNoindex2.html
@@ -0,0 +1,18 @@
<!DOCTYPE html>
<html lang=en>
<head>
<meta charset=utf-8>
<link rel="canonical" href="https://SOME.WEBSITE.WOULD.GO.HERE....">

[128 raw bytes 0x80–0xFF: intentionally invalid UTF-8, rendered here as replacement characters]

<meta name="robots" content="noindex">



<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="title" content="Title Goes HERE">
</head>
<body>
</body>
</html>
76 changes: 76 additions & 0 deletions tests/gentestdata.py
@@ -0,0 +1,76 @@
# generate-sitemap: Github action for automating sitemap generation
#
# Copyright (c) 2020-2022 Vincent A Cicirello
# https://www.cicirello.org/
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#

if __name__ == "__main__" :

beginning = """<!DOCTYPE html>
<html lang=en>
<head>
<meta charset=utf-8>
<link rel="canonical" href="https://SOME.WEBSITE.WOULD.GO.HERE....">

"""

ending = """

<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="title" content="Title Goes HERE">
</head>
<body>
</body>
</html>
"""

noindex = """

<meta name="robots" content="noindex">

"""

nonCharData = [ x for x in range(128, 256) ]

with open("badCharsNoindex1.html", "w") as f :
f.write(beginning)
f.write(noindex)
with open("badCharsNoindex1.html", "ab") as f :
f.write(bytes(nonCharData))
with open("badCharsNoindex1.html", "a") as f :
f.write(ending)

with open("badCharsNoindex2.html", "w") as f :
f.write(beginning)
with open("badCharsNoindex2.html", "ab") as f :
f.write(bytes(nonCharData))
with open("badCharsNoindex2.html", "a") as f :
f.write(noindex)
f.write(ending)

with open("badCharsDoIndex.html", "w") as f :
f.write(beginning)
with open("badCharsDoIndex.html", "ab") as f :
f.write(bytes(nonCharData))
with open("badCharsDoIndex.html", "a") as f :
f.write(ending)
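
The generator above interleaves text-mode writes with binary-mode appends to splice raw bytes into the middle of each fixture. A sketch of an equivalent single-handle alternative (not the repo's code), using the same string definitions:

# One binary handle per file: encode the text parts explicitly and splice
# the raw 0x80-0xFF bytes between them.
def writeFixture(name, *parts):
    with open(name, "wb") as f:
        for part in parts:
            f.write(part if isinstance(part, bytes) else part.encode("utf-8"))

badBytes = bytes(range(128, 256))
writeFixture("badCharsNoindex1.html", beginning, noindex, badBytes, ending)
writeFixture("badCharsNoindex2.html", beginning, badBytes, noindex, ending)
writeFixture("badCharsDoIndex.html", beginning, badBytes, ending)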
6 changes: 4 additions & 2 deletions tests/integration.py
@@ -71,7 +71,8 @@ def testIntegration(self) :
"https://TESTING.FAKE.WEB.ADDRESS.TESTING/subdir/a.html",
"https://TESTING.FAKE.WEB.ADDRESS.TESTING/x.pdf",
"https://TESTING.FAKE.WEB.ADDRESS.TESTING/subdir/subdir/z.pdf",
"https://TESTING.FAKE.WEB.ADDRESS.TESTING/uncommitted.html"
"https://TESTING.FAKE.WEB.ADDRESS.TESTING/uncommitted.html",
"https://TESTING.FAKE.WEB.ADDRESS.TESTING/badCharsDoIndex.html"
}
self.assertEqual(expected, urlset)

@@ -91,7 +92,8 @@ def testIntegrationWithAdditionalTypes(self) :
"https://TESTING.FAKE.WEB.ADDRESS.TESTING/subdir/subdir/z.pdf",
"https://TESTING.FAKE.WEB.ADDRESS.TESTING/include.docx",
"https://TESTING.FAKE.WEB.ADDRESS.TESTING/include.pptx",
"https://TESTING.FAKE.WEB.ADDRESS.TESTING/uncommitted.html"
"https://TESTING.FAKE.WEB.ADDRESS.TESTING/uncommitted.html",
"https://TESTING.FAKE.WEB.ADDRESS.TESTING/badCharsDoIndex.html"
}
self.assertEqual(expected, urlset)

50 changes: 36 additions & 14 deletions tests/tests.py
@@ -1,6 +1,6 @@
# generate-sitemap: Github action for automating sitemap generation
#
# Copyright (c) 2020-2021 Vincent A Cicirello
# Copyright (c) 2020-2022 Vincent A Cicirello
# https://www.cicirello.org/
#
# MIT License
@@ -294,11 +294,14 @@ def test_robotsBlocked(self) :
"tests/unblocked1.html",
"tests/unblocked2.html",
"tests/unblocked3.html",
"tests/unblocked4.html" ]
"tests/unblocked4.html",
"tests/badCharsDoIndex.html"]
blocked = [ "tests/blocked1.html",
"tests/blocked2.html",
"tests/blocked3.html",
"tests/blocked4.html" ]
"tests/blocked4.html",
"tests/badCharsNoindex1.html",
"tests/badCharsNoindex2.html"]
for f in unblocked :
self.assertFalse(gs.robotsBlocked(f))
for f in blocked :
@@ -308,11 +311,14 @@ def test_hasMetaRobotsNoindex(self) :
unblocked = [ "tests/unblocked1.html",
"tests/unblocked2.html",
"tests/unblocked3.html",
"tests/unblocked4.html" ]
"tests/unblocked4.html",
"tests/badCharsDoIndex.html" ]
blocked = [ "tests/blocked1.html",
"tests/blocked2.html",
"tests/blocked3.html",
"tests/blocked4.html" ]
"tests/blocked4.html",
"tests/badCharsNoindex1.html",
"tests/badCharsNoindex2.html" ]
for f in unblocked :
self.assertFalse(gs.hasMetaRobotsNoindex(f))
for f in blocked :
@@ -327,7 +333,12 @@ def test_gatherfiles_html(self) :
"./blocked3.html", "./blocked4.html",
"./unblocked1.html", "./unblocked2.html",
"./unblocked3.html", "./unblocked4.html",
"./subdir/a.html", "./subdir/subdir/b.html"}
"./subdir/a.html", "./subdir/subdir/b.html",
"./badCharsNoindex1.html",
"./badCharsNoindex2.html",
"./badCharsDoIndex.html"}
if os.name == "nt" :
expected = { s.replace("/", "\\") for s in expected }
self.assertEqual(asSet, expected)

def test_gatherfiles_html_pdf(self) :
@@ -341,7 +352,12 @@ def test_gatherfiles_html_pdf(self) :
"./unblocked3.html", "./unblocked4.html",
"./subdir/a.html", "./subdir/subdir/b.html",
"./x.pdf", "./subdir/y.pdf",
"./subdir/subdir/z.pdf"}
"./subdir/subdir/z.pdf",
"./badCharsNoindex1.html",
"./badCharsNoindex2.html",
"./badCharsDoIndex.html"}
if os.name == "nt" :
expected = { s.replace("/", "\\") for s in expected }
self.assertEqual(asSet, expected)

def test_gatherfiles_pdf(self) :
@@ -351,15 +367,21 @@ def test_gatherfiles_pdf(self) :
asSet = set(allfiles)
expected = { "./x.pdf", "./subdir/y.pdf",
"./subdir/subdir/z.pdf"}
if os.name == "nt" :
expected = { s.replace("/", "\\") for s in expected }
self.assertEqual(asSet, expected)

def test_lastmod(self) :
os.chdir("tests")
dateStr = gs.lastmod("./unblocked1.html")
self.assertTrue(validateDate(dateStr), msg=dateStr)
dateStr = gs.lastmod("./subdir/a.html")
self.assertTrue(validateDate(dateStr), msg=dateStr)
os.chdir("..")
# Assumes that on Windows the tests must be running locally
# rather than in GitHub Actions, and may or may not be in a
# git repo, so this test is simply skipped.
if os.name != "nt" :
os.chdir("tests")
dateStr = gs.lastmod("./unblocked1.html")
self.assertTrue(validateDate(dateStr), msg=dateStr)
dateStr = gs.lastmod("./subdir/a.html")
self.assertTrue(validateDate(dateStr), msg=dateStr)
os.chdir("..")

def test_urlstring(self) :
filenames = [ "./a.html",
@@ -471,7 +493,7 @@ def test_robotsTxtParser(self) :
os.chdir("tests")
for i, e in enumerate(expected) :
filename = "robots" + str(i) + ".txt"
self.assertEqual(set(gs.parseRobotsTxt(filename)), set(e))
self.assertEqual(set(gs.parseRobotsTxt(filename)), set(e), msg=filename)
os.chdir("..")

def test_robotsBlockedWithRobotsParser(self) :