Skip to content

Commit

Permalink
Switched to using jsoup to validate fragments
Browse files Browse the repository at this point in the history
Also, modified link validation so links to the same page with different
fragments only load/download that page once.
  • Loading branch information
jroper committed Sep 11, 2019
1 parent 6da4e66 commit 6e1c352
Show file tree
Hide file tree
Showing 7 changed files with 127 additions and 75 deletions.
3 changes: 2 additions & 1 deletion build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,8 @@ lazy val core = project
name := "paradox",
libraryDependencies ++= Library.pegdown,
libraryDependencies ++= Seq(
Library.st4
Library.st4,
Library.jsoup
),
parallelExecution in Test := false
)
Expand Down
90 changes: 55 additions & 35 deletions core/src/main/scala/com/lightbend/paradox/ParadoxProcessor.scala
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,14 @@ import com.lightbend.paradox.template.PageTemplate
import com.lightbend.paradox.markdown._
import com.lightbend.paradox.tree.Tree.{ Forest, Location }
import java.io.{ File, FileOutputStream, OutputStreamWriter }
import java.net.{ HttpURLConnection, URI }
import java.nio.charset.StandardCharsets

import org.jsoup.Jsoup
import org.jsoup.nodes.Document
import org.pegdown.ast._

import scala.annotation.tailrec
import scala.collection.JavaConverters._
import scala.io.Source
import scala.util.control.NonFatal
import scala.util.matching.Regex

Expand Down Expand Up @@ -137,49 +137,65 @@ class ParadoxProcessor(reader: Reader = new Reader, writer: Writer = new Writer)
validate(Some(root.location))
}

def ignore(path: String) = ignorePaths.exists(_.pattern.matcher(path).matches())

linkCapturer.allLinks.foreach {
case CapturedLinkWithSources(CapturedAbsoluteLink(uri), sources) if validateAbsolute && !ignore(uri.toString) =>
validateExternalLink(uri, reportErrorOnSources(errorCollector, sources), logger)
case CapturedLinkWithSources(CapturedRelativeLink(path, fragment), sources) if !ignore(path) =>
fullSite.get(path) match {
case Some(file) =>
fragment.foreach { f =>
validateFragment(path, Source.fromFile(file, "UTF-8").mkString, f, reportErrorOnSources(errorCollector, sources))
}
case None =>
reportErrorOnSources(errorCollector, sources)(s"Could not find path [$path] in site")
}
case _ =>
// Ignore
}
linkCapturer.allLinks
.filterNot(l => ignorePaths.exists(_.pattern.matcher(l.link.toString).matches()))
.foreach {
case c @ CapturedLink(uri, fragments) if c.isInternal =>
fullSite.get(uri.getPath) match {
case Some(file) =>
if (c.hasFragments) {
validateFragments(uri.getPath, Jsoup.parse(file, "UTF-8"), fragments, errorCollector)
}
case None =>
reportErrorOnSources(errorCollector, c.allSources)(s"Could not find path [${uri.getPath}] in site")
}
case absolute if validateAbsolute =>
validateExternalLink(absolute, errorCollector, logger)
case _ =>
// Ignore
}

errorCollector.logErrors(logger)
errorCollector.errorCount
}

private def validateExternalLink(uri: URI, reportError: String => Unit, logger: ParadoxLogger) = {
logger.info(s"Validating external link: $uri")
val conn = uri.toURL.openConnection().asInstanceOf[HttpURLConnection]
conn.addRequestProperty("User-Agent", "Paradox Link Validator <https://github.com/lightbend/paradox>")
private def validateExternalLink(capturedLink: CapturedLink, errorContext: ErrorContext, logger: ParadoxLogger) = {
logger.info(s"Validating external link: ${capturedLink.link}")

def reportError = reportErrorOnSources(errorContext, capturedLink.allSources)(_)
val url = capturedLink.link.toString

try {
if (conn.getResponseCode / 100 == 3) {
reportError(s"Received a ${conn.getResponseCode} redirect on external link, location redirected to is [${conn.getHeaderField("Location")}]")
} else if (conn.getResponseCode != 200) {
reportError(s"Error validating external link, status code was ${conn.getResponseCode}")
val response = Jsoup.connect(url)
.userAgent("Paradox Link Validator <https://github.com/lightbend/paradox>")
.followRedirects(false)
.ignoreHttpErrors(true)
.ignoreContentType(true)
.execute()

// jsoup doesn't offer any simple way to clean up, the only way to close is to get the body stream and close it,
// but if you've already read the response body, that will throw an exception, and there's no way to check if
// you've already tried to read the response body, so we can't do that in a finally block, we have to do it
// explicitly every time we don't want to consume the stream.
def close() = response.bodyStream().close()

if (response.statusCode() / 100 == 3) {
close()
reportError(s"Received a ${response.statusCode()} ${response.statusMessage()} on external link, location redirected to is [${response.header("Location")}]")
} else if (response.statusCode() != 200) {
close()
reportError(s"Error validating external link, status was ${response.statusCode()} ${response.statusMessage()}")
} else {
if (uri.getFragment != null) {
val content = Source.fromInputStream(conn.getInputStream).mkString
validateFragment(uri.toString, content, uri.getFragment, reportError)
if (capturedLink.hasFragments) {
validateFragments(url, response.parse(), capturedLink.fragments, errorContext)
} else {
close()
}
}
} catch {
case NonFatal(e) =>
reportError(s"Exception occurred when validating external link: $e")
logger.debug(e)
} finally {
conn.disconnect()
}
}

Expand All @@ -189,9 +205,13 @@ class ParadoxProcessor(reader: Reader = new Reader, writer: Writer = new Writer)
}
}

private def validateFragment(path: String, content: String, fragment: String, reportError: String => Unit) = {
if (!(content.contains("id=\"" + fragment + "\"") || content.contains("name=\"" + fragment + "\""))) {
reportError(s"Could not find anchor id [$fragment] in page [$path]")
private def validateFragments(path: String, content: Document, fragments: List[CapturedLinkFragment], errorContext: ErrorContext): Unit = {
fragments.foreach {
case CapturedLinkFragment(Some(fragment), sources) =>
if (content.getElementById(fragment) == null && content.select(s"a[name=$fragment]").isEmpty) {
reportErrorOnSources(errorContext, sources)(s"Could not find anchor [$fragment] in page [$path]")
}
case _ =>
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,10 +89,10 @@ class LinkCapturer {
}
}

def allLinks: List[CapturedLinkWithSources] = {
def allLinks: List[CapturedLink] = {
// First, resolve the links, discarding links that we can't resolve.
links.collect {

case (page, node, uri) if isPageInSite(page, uri) =>
case Link(page, node, uri, fragment) if isPageInSite(page, uri) =>
val path = node match {
// Javadoc links may use the frames style, and may reference index.html, if so, need to drop it.
case d: DirectiveNode if d.name == "javadoc" && uri.getQuery != null =>
Expand All @@ -107,19 +107,22 @@ class LinkCapturer {
val pathWithIndex = if (path.endsWith("/")) uri.getPath + "index.html"
else path

val relativePath = URI.create(page.path).resolve(pathWithIndex).getPath

(CapturedRelativeLink(relativePath, Option(uri.getFragment)), page, node)

case (page, node, uri) if uri.getAuthority != null =>
(CapturedAbsoluteLink(uri), page, node)

}.groupBy(_._1).map {
case (link, links) =>
CapturedLinkWithSources(link, links.map {
case (_, page, node) => (page.file, node)
})
}.toList
val resolvedUri = URI.create(page.path).resolve(pathWithIndex)
Link(page, node, resolvedUri, fragment)

case link @ Link(_, _, uri, _) if uri.getAuthority != null => link
}.groupBy(_.uri)
.toList
.map {
case (uri, links) =>
val fragments = links.groupBy(_.fragment)
.toList
.map {
case (fragment, links) =>
CapturedLinkFragment(fragment, links.map(l => (l.page.file, l.node)))
}
CapturedLink(uri, fragments)
}
}

private def isPageInSite(page: Page, uri: URI): Boolean = {
Expand All @@ -136,18 +139,29 @@ class LinkCapturer {
}

private var nodeOverride: Option[Node] = None
private var links: List[(Page, Node, URI)] = Nil

private case class Link(page: Page, node: Node, uri: URI, fragment: Option[String])

private var links: List[Link] = Nil

def capture(page: Page, node: Node, rendering: LinkRenderer.Rendering): LinkRenderer.Rendering = {
links = (page, nodeOverride.getOrElse(node), URI.create(rendering.href)) :: links
val fullUri = URI.create(rendering.href)
val (uri, fragment) = if (fullUri.getFragment == null) (fullUri, None)
else (new URI(fullUri.getScheme, fullUri.getAuthority, fullUri.getPath, fullUri.getQuery, null), Some(fullUri.getFragment))
links = Link(page, nodeOverride.getOrElse(node), uri, fragment) :: links
rendering
}
}

case class CapturedLinkWithSources(link: CapturedLink, sources: List[(File, Node)])
sealed trait CapturedLink
case class CapturedAbsoluteLink(link: URI) extends CapturedLink
case class CapturedRelativeLink(path: String, fragment: Option[String]) extends CapturedLink
case class CapturedLink(link: URI, fragments: List[CapturedLinkFragment]) {
def allSources = fragments.flatMap(_.sources)

def isInternal = link.getAuthority == null && link.getPath != null

def hasFragments = fragments.size > 1 || fragments.headOption.flatMap(_.fragment).nonEmpty
}

case class CapturedLinkFragment(fragment: Option[String], sources: List[(File, Node)])

private class LinkCapturerRenderer(capturer: LinkCapturer, renderer: LinkRenderer, page: Page) extends LinkRenderer {
private def capture(node: Node, rendering: LinkRenderer.Rendering) = capturer.capture(page, node, rendering)
Expand Down
8 changes: 8 additions & 0 deletions plugin/src/sbt-test/paradox/validation/build.sbt
Original file line number Diff line number Diff line change
@@ -1,2 +1,10 @@
lazy val root = (project in file("."))
.enablePlugins(ParadoxPlugin)

lazy val additionalInternalMappings = settingKey[List[(File, String)]]("Additional mappings to add to paradox mappings")
lazy val additionalMappings = settingKey[List[(File, String)]]("Additional mappings to add to paradox mappings")

additionalInternalMappings := Nil
additionalMappings := Nil
(Compile / paradox / mappings) ++= additionalInternalMappings.value
(Compile / paradoxValidateInternalLinks / mappings) ++= additionalMappings.value
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@

[outside docs path link](../outside.html#frag)

https://www.lightbend.com/thispagedoesnotexist
This URL should 404 (or otherwise fail)
https://github.com/lightbend/thisrepodoesnotexist
This URL should 301 (or otherwise fail)
https://lightbend.com

Including this in build means we depend on github.com being up. Probably ok.
https://github.com/
36 changes: 20 additions & 16 deletions plugin/src/sbt-test/paradox/validation/test
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,11 @@
-> paradoxValidateInternalLinks

# Add the path, but without the fragment, so validation still fails
> set (Compile / paradox / mappings) += (file("withoutfragment.html") -> ("test.html"))
> set additionalInternalMappings := List(file("withoutfragment.html") -> ("test.html"))
-> paradoxValidateInternalLinks

# Now add the path with the fragment, validation should succeed
> reload
> set (Compile / paradox / mappings) += (file("withfragment.html") -> ("test.html"))
> set additionalInternalMappings := List(file("withfragment.html") -> ("test.html"))
> paradoxValidateInternalLinks

# Now set a base path of /, validation should fail due to /../outside.html not existing
Expand All @@ -22,27 +21,32 @@
-> paradoxValidateInternalLinks

# Now add an /outside.html, but without a fragment, validation should fail
> set (Compile / paradoxValidateInternalLinks / mappings) += (file("withoutfragment.html") -> ("/outside.html"))
> set additionalMappings := List(file("withoutfragment.html") -> ("/outside.html"))
-> paradoxValidateInternalLinks

# Now add an /outside.html, with a fragment, validation should succeed
> reload
> set paradoxValidationSiteBasePath := Some("/docs")
> set (Compile / paradox / mappings) += (file("withfragment.html") -> ("test.html"))
> set (Compile / paradoxValidateInternalLinks / mappings) += (file("withfragment.html") -> ("/outside.html"))
> set additionalMappings := List(file("withfragment.html") -> ("/outside.html"))
> paradoxValidateInternalLinks

# Also try setting the internal link with an absolute path
> reload
> set paradoxValidationSiteBasePath := Some("/docs")
> set (Compile / paradoxValidateInternalLinks / mappings) += (file("withfragment.html") -> ("/docs/test.html"))
> set (Compile / paradoxValidateInternalLinks / mappings) += (file("withfragment.html") -> ("/outside.html"))
> set additionalInternalMappings := Nil
> set additionalMappings += (file("withfragment.html") -> ("/docs/test.html"))
> paradoxValidateInternalLinks

# Finally, try validating external links, first check that it fails
-> paradoxValidateLinks
# Now ignore the link that fails
> set paradoxValidationIgnorePaths += ("https://www.lightbend.com.*").r
# The following test will fail if GitHub is down. That's fairly unlikely if we're running a CI build, since CI
# depends on GitHub being up to run in the first place, and also being up to report the status of the build, etc.

# Now make sure it fails when just the 404ing link is ignored
> set paradoxValidationIgnorePaths := List(".*thisrepodoesnotexist.*".r)
-> paradoxValidateLinks

# Now make sure it fails when just the 401ing link is ignored
> set paradoxValidationIgnorePaths := List("https://lightbend.com".r)
-> paradoxValidateLinks

# Now ignore both failing links, and it should pass. Note though that passing depends on GitHub being up,
# since there is a valid link that points to GitHub. If we're running in CI, GitHub being up is an alright
# assumption to make, given that GitHub triggered the build, and will have to be up to receive the status
# report
> set paradoxValidationIgnorePaths := List("https://lightbend.com".r, (".*thisrepodoesnotexist.*").r)
> paradoxValidateLinks
2 changes: 2 additions & 0 deletions project/Dependencies.scala
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ object Version {
val sbtWeb = "1.4.4"
val scalatest = "3.0.8"
val st4 = "4.1"
val jsoup = "1.12.1"
}

object Library {
Expand All @@ -38,4 +39,5 @@ object Library {
val sbtWeb = "com.typesafe.sbt" % "sbt-web" % Version.sbtWeb
val scalatest = "org.scalatest" %% "scalatest" % Version.scalatest
val st4 = "org.antlr" % "ST4" % Version.st4
val jsoup = "org.jsoup" % "jsoup" % Version.jsoup
}

0 comments on commit 6e1c352

Please sign in to comment.