Merge pull request #114 from cicirello/feat-exclude-paths

Feat: list of paths to exclude from sitemap
cicirello · Nov 15, 2023 · 4cdb0c2 · 4cdb0c2
2 parents 11347d7 + fb64d79
commit 4cdb0c2
Show file tree

Hide file tree

Showing 15 changed files with 145 additions and 17 deletions.
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -92,6 +92,20 @@ jobs:
         echo "url-count = ${{ steps.integration4.outputs.url-count }}"
         echo "excluded-count = ${{ steps.integration4.outputs.excluded-count }}"
 
+    - name: Integration test 5
+      id: integration5
+      uses: ./
+      with:
+        path-to-root: tests/exclude
+        base-url-path: https://TESTING.FAKE.WEB.ADDRESS.TESTING/
+        exclude-paths: /excludeSubDir /exc1.html /subdir/exc4.html
+
+    - name: Output stats test 5
+      run: |
+        echo "sitemap-path = ${{ steps.integration5.outputs.sitemap-path }}"
+        echo "url-count = ${{ steps.integration5.outputs.url-count }}"
+        echo "excluded-count = ${{ steps.integration5.outputs.excluded-count }}"
+
     - name: Verify integration test results
-      run: python3 -u -m unittest tests/integration.py
+      run: python3 -u -B -m unittest tests/integration.py
 
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,9 +4,10 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## [Unreleased] - 2023-11-06
+## [Unreleased] - 2023-11-11
 
 ### Added
+* Ability to specify list of paths to exclude from sitemap, via new input `exclude-paths`.
 
 ### Changed
 

diff --git a/README.md b/README.md
@@ -27,6 +27,8 @@ Pages, and has the following features:
   directives, excluding any that do from the sitemap. 
 * Parses a robots.txt, if present at the root of the website, excluding 
   any URLs from the sitemap that match `Disallow:` rules for `User-agent: *`.
+* Enables specifying a list of directories and/or specific files to exclude from
+  the sitemap.
 * Sorts the sitemap entries in a consistent order, such that the URLs are 
   first sorted by depth in the directory structure (i.e., pages at the website 
   root appear first, etc), and then pages at the same depth are sorted alphabetically. 
@@ -142,6 +144,35 @@ is an example:
         additional-extensions: doc docx ppt pptx
 ```
 
+### `exclude-paths`
+
+The action automatically excludes any files or directories
+disallowed by a robots.txt file, if present. If you have additional
+directories or individual files that you wish to exclude from the
+sitemap that are not otherwise blocked, you can use the `exclude-paths`
+input to specify a list of them, separated by any whitespace characters. 
+For example, if you wish to exclude the directory `/exclude-these` as 
+well as the individual file `/nositemap.html`, you can use the following:
+
+```yml
+    - name: Generate the sitemap
+      uses: cicirello/generate-sitemap@v1
+      with:
+        exclude-paths: /exclude-these /nositemap.html
+```
+
+If you have many such cases to exclude, your workflow may be easier to
+read if you use a YAML multi-line string, with the following:
+
+```yml
+    - name: Generate the sitemap
+      uses: cicirello/generate-sitemap@v1
+      with:
+        exclude-paths: >
+          /exclude-these 
+          /nositemap.html
+```
+
 ### `sitemap-format`
 
 Use this to specify the sitemap format. Default: `xml`.
@@ -211,7 +242,7 @@ you can also use a specific version such as with:
 
 ```yml
     - name: Generate the sitemap
-      uses: cicirello/generate-sitemap@v1.9.1
+      uses: cicirello/generate-sitemap@v1.10.0
       with:
         base-url-path: https://THE.URL.TO.YOUR.PAGE/
 ```

diff --git a/action.yml b/action.yml
@@ -1,6 +1,6 @@
 # generate-sitemap: Github action for automating sitemap generation
 # 
-# Copyright (c) 2020-2021 Vincent A Cicirello
+# Copyright (c) 2020-2023 Vincent A Cicirello
 # https://www.cicirello.org/
 #
 # MIT License
@@ -61,6 +61,10 @@ inputs:
     description: 'Pass true to include only the date without the time in XML sitemaps; and false to include full date and time.'
     required: false
     default: false
+  exclude-paths:
+    description: 'Space separated list of paths to exclude from the sitemap.'
+    required: false
+    default: ''
 outputs:
   sitemap-path: 
     description: 'The path to the generated sitemap file.'
@@ -80,3 +84,4 @@ runs:
     - ${{ inputs.additional-extensions }}
     - ${{ inputs.drop-html-extension }}
     - ${{ inputs.date-only }}
+    - ${{ inputs.exclude-paths }}
diff --git a/generatesitemap.py b/generatesitemap.py
@@ -334,6 +334,19 @@ def sanitize_path(websiteRoot) :
     else :
         print("ERROR: Specified website root directory appears to be outside of current working directory. Exiting....")
         exit(1)
+
+def adjust_path(path):
+    """Returns path normalized to start with "/" and use "/" separators.
+
+    Strips only a leading "./" (or a lone "."), so ".hidden" gives "/.hidden".
+    Keyword arguments:
+    path - the path to check and adjust
+    """
+    path = path.replace("\\", "/")
+    if path == "." or path.startswith("./"):
+        path = path[1:]  # drop only the current-directory dot, keeping the "/"
+    # An empty result (from "" or ".") falls through to the root, "/".
+    return path if path.startswith("/") else "/" + path
 
 def main(
         websiteRoot,
@@ -343,7 +356,8 @@ def main(
         sitemapFormat,
         additionalExt,
         dropExtension,
-        dateOnly
+        dateOnly,
+        excludePaths
     ) :
     """The main function of the generate-sitemap GitHub Action.
 
@@ -361,6 +375,12 @@ def main(
     dropExtension - A boolean that controls whether to drop .html from
             URLs that are to html files (e.g., GitHub Pages will serve
             an html file if URL doesn't include the .html extension).
+    dateOnly - If true, includes only the date but not the time in XML
+            sitemaps, otherwise includes full date and time in lastmods
+            within XML sitemaps.
+    excludePaths - A set of paths to exclude from the sitemap, which can
+            include directories (relative from the root) or even full
+            paths to individual files.
     """
     repo_root = os.getcwd()
     os.chdir(sanitize_path(websiteRoot))
@@ -369,8 +389,10 @@ def main(
     # how the actions working directory is mounted
     # inside container actions.
     subprocess.run(['git', 'config', '--global', '--add', 'safe.directory', repo_root])
-
-    blockedPaths = parseRobotsTxt()
+
+    if len(excludePaths) > 0:
+        excludePaths = { adjust_path(path) for path in excludePaths}
+    blockedPaths = set(parseRobotsTxt()) | excludePaths
 
     allFiles = gatherfiles(createExtensionSet(includeHTML, includePDF, additionalExt))
     files = [ f for f in allFiles if not robotsBlocked(f, blockedPaths) ]
@@ -401,7 +423,8 @@ def main(
         sitemapFormat = sys.argv[5],
         additionalExt = set(sys.argv[6].lower().replace(",", " ").replace(".", " ").split()),
         dropExtension = sys.argv[7].lower() == "true",
-        dateOnly = sys.argv[8].lower() == "true"
+        dateOnly = sys.argv[8].lower() == "true",
+        excludePaths = set(sys.argv[9].replace(",", " ").split())
     )
 
 
diff --git a/tests/exclude/exc1.html b/tests/exclude/exc1.html
diff --git a/tests/exclude/excludeSubDir/exc3.html b/tests/exclude/excludeSubDir/exc3.html
diff --git a/tests/exclude/inc1.html b/tests/exclude/inc1.html
diff --git a/tests/exclude/robots.txt b/tests/exclude/robots.txt
@@ -0,0 +1,2 @@
+User-agent: *
+Disallow: /subdir/exc2.html
diff --git a/tests/exclude/subdir/exc2.html b/tests/exclude/subdir/exc2.html
diff --git a/tests/exclude/subdir/exc4.html b/tests/exclude/subdir/exc4.html
diff --git a/tests/exclude/subdir/inc2.html b/tests/exclude/subdir/inc2.html
diff --git a/tests/integration.py b/tests/integration.py
@@ -43,6 +43,33 @@ def validateDate(s) :
 
 class IntegrationTest(unittest.TestCase) :
 
+    def testIntegrationExcludePaths(self):  # presumably checks the sitemap written by the workflow's "Integration test 5" step
+        urlset = set()  # every <loc> URL found in the sitemap
+        with open("tests/exclude/sitemap.xml","r") as f :
+            for line in f :
+                i = line.find("<loc>")
+                if i >= 0 :
+                    i += 5  # len("<loc>"): move past the opening tag
+                    j = line.find("</loc>", i)
+                    if j >= 0 :
+                        urlset.add(line[i:j].strip())
+                    else :
+                        self.fail("No closing </loc>")  # opening and closing tags must share a line
+                i = line.find("<lastmod>")
+                if i >= 0 :
+                    i += 9  # len("<lastmod>"): move past the opening tag
+                    j = line.find("</lastmod>", i)
+                    if j >= 0 :
+                        self.assertTrue(validateDate(line[i:j].strip()))  # validateDate is a helper defined earlier in this module
+                    else :
+                        self.fail("No closing </lastmod>")
+
+        expected = { "https://TESTING.FAKE.WEB.ADDRESS.TESTING/inc1.html",  # only the inc* pages; the exc* pages must be absent
+                     "https://TESTING.FAKE.WEB.ADDRESS.TESTING/subdir/inc2.html"
+                     }
+        self.assertEqual(expected, urlset)  # exact match: no extra and no missing URLs
+
+
     def testIntegration(self) :
         urlset = set()
         with open("tests/sitemap.xml","r") as f :

diff --git a/tests/robots.txt b/tests/robots.txt
@@ -10,3 +10,4 @@ Disallow: /
 
 User-agent: *
 Disallow: /subdir/y.pdf
+Disallow: /exclude
diff --git a/tests/tests.py b/tests/tests.py
@@ -383,7 +383,11 @@ def test_gatherfiles_html(self) :
                      "./badCharsNoindex2.html",
                      "./badCharsDoIndex.html",
                      "./blocked5.html",
-                     "./blocked6.html"}
+                     "./blocked6.html",
+                     "./exclude/inc1.html", "./exclude/exc1.html",
+                     "./exclude/subdir/inc2.html", "./exclude/subdir/exc2.html",
+                     "./exclude/excludeSubDir/exc3.html",
+                     "./exclude/subdir/exc4.html"}
         if os.name == "nt" :
             expected = { s.replace("/", "\\") for s in expected }
         self.assertEqual(asSet, expected)
@@ -404,7 +408,11 @@ def test_gatherfiles_html_pdf(self) :
                      "./badCharsNoindex2.html",
                      "./badCharsDoIndex.html",
                      "./blocked5.html",
-                     "./blocked6.html"}
+                     "./blocked6.html",
+                     "./exclude/inc1.html", "./exclude/exc1.html",
+                     "./exclude/subdir/inc2.html", "./exclude/subdir/exc2.html",
+                     "./exclude/excludeSubDir/exc3.html",
+                     "./exclude/subdir/exc4.html"}
         if os.name == "nt" :
             expected = { s.replace("/", "\\") for s in expected }
         self.assertEqual(asSet, expected)
@@ -635,35 +643,51 @@ def test_robotsBlockedWithRobotsParser(self) :
                      "./x.pdf", "./subdir/y.pdf",
                      "./subdir/subdir/z.pdf"]
         for f in allFiles :
-            self.assertTrue(gs.robotsBlocked(f, ["/"]))
+            self.assertTrue(gs.robotsBlocked(f, {"/"}))
         blocked = {  "./blocked1.html", "./blocked2.html",
                      "./blocked3.html", "./blocked4.html",
                      "./subdir/a.html", "./subdir/subdir/b.html",
                      "./subdir/y.pdf",
                      "./subdir/subdir/z.pdf"}
         for f in allFiles :
             if f in blocked :
-                self.assertTrue(gs.robotsBlocked(f, ["/subdir/"]))
+                self.assertTrue(gs.robotsBlocked(f, {"/subdir/"}))
             else :
-                self.assertFalse(gs.robotsBlocked(f, ["/subdir/"]))
+                self.assertFalse(gs.robotsBlocked(f, {"/subdir/"}))
         blocked = {  "./blocked1.html", "./blocked2.html",
                      "./blocked3.html", "./blocked4.html",
                      "./subdir/subdir/b.html",
                      "./subdir/subdir/z.pdf"}
         for f in allFiles :
             if f in blocked :
-                self.assertTrue(gs.robotsBlocked(f, ["/subdir/subdir/"]))
+                self.assertTrue(gs.robotsBlocked(f, {"/subdir/subdir/"}))
             else :
-                self.assertFalse(gs.robotsBlocked(f, ["/subdir/subdir"]))
+                self.assertFalse(gs.robotsBlocked(f, {"/subdir/subdir"}))
         blocked = { "./blocked1.html", "./blocked2.html",
                     "./blocked3.html", "./blocked4.html",
                     "./subdir/subdir/b.html", "./subdir/y.pdf",
                     "./unblocked1.html" }
-        blockThese = [ "/subdir/subdir/b", "/unblocked1.html", "/subdir/y.pdf"]
+        blockThese = { "/subdir/subdir/b", "/unblocked1.html", "/subdir/y.pdf"}
         for f in allFiles :
             if f in blocked :
                 self.assertTrue(gs.robotsBlocked(f, blockThese))
             else :
                 self.assertFalse(gs.robotsBlocked(f, blockThese))
         os.chdir("..")
-
+
+    def test_adjust_path(self):  # adjust_path must return "/"-separated paths that start with "/"
+        self.assertEqual("/", gs.adjust_path("."))  # a lone current-directory dot maps to the root
+        self.assertEqual("/", gs.adjust_path("\\"))  # backslash-separated inputs start here
+        self.assertEqual("/", gs.adjust_path(".\\"))
+        self.assertEqual("/hello", gs.adjust_path("\\hello"))
+        self.assertEqual("/hello", gs.adjust_path(".\\hello"))
+        self.assertEqual("/hello/bye", gs.adjust_path("\\hello\\bye"))
+        self.assertEqual("/hello/bye", gs.adjust_path(".\\hello\\bye"))
+        self.assertEqual("/", gs.adjust_path("/"))  # forward-slash inputs start here
+        self.assertEqual("/", gs.adjust_path("./"))
+        self.assertEqual("/hello", gs.adjust_path("/hello"))  # already normalized: returned unchanged
+        self.assertEqual("/hello", gs.adjust_path("./hello"))
+        self.assertEqual("/hello/bye", gs.adjust_path("/hello/bye"))
+        self.assertEqual("/hello/bye", gs.adjust_path("./hello/bye"))
+        self.assertEqual("/hello", gs.adjust_path("hello"))  # no leading slash or dot: a "/" is prepended
+        self.assertEqual("/hello/bye", gs.adjust_path("hello/bye"))
Original file line number	Diff line number	Diff line change
Expand Up		@@ -10,3 +10,4 @@ Disallow: /

		User-agent: *
		Disallow: /subdir/y.pdf
		Disallow: /exclude