Skip to content

Commit

Permalink
Check whether string is an URI with custom function
Browse files Browse the repository at this point in the history
API change: the function `isURI`, testing if a string is a valid URI
with a known scheme, is exported from the Shared module, as is the set
of known `schemes`.

The new function replaces the function of the same name
from *Network.URI*, as the latter did not check whether a scheme is
well-known.  E.g. MediaWiki wikis frequently feature pages with names
like `User:John`. These links were interpreted as URIs, thus turning
internal links into global links. This is prevented by also checking
whether the scheme of a URI is frequently used (i.e. is IANA registered
or an otherwise well-known scheme).

Fixes: jgm#2713
  • Loading branch information
tarleb committed May 19, 2017
1 parent 4d1e9b8 commit db3cae7
Show file tree
Hide file tree
Showing 9 changed files with 47 additions and 34 deletions.
27 changes: 1 addition & 26 deletions src/Text/Pandoc/Parsing.hs
Original file line number Diff line number Diff line change
Expand Up @@ -465,33 +465,8 @@ emailAddress = try $ toResult <$> mailbox <*> (char '@' *> domain)
sepby1 p sep = (:) <$> p <*> (many (try $ sep >> p))


-- Schemes from http://www.iana.org/assignments/uri-schemes.html plus
-- the unofficial schemes coap, doi, javascript, isbn, pmid
schemes :: [String]
schemes = ["coap","doi","javascript","aaa","aaas","about","acap","cap","cid",
"crid","data","dav","dict","dns","file","ftp","geo","go","gopher",
"h323","http","https","iax","icap","im","imap","info","ipp","iris",
"iris.beep","iris.xpc","iris.xpcs","iris.lwz","ldap","mailto","mid",
"msrp","msrps","mtqp","mupdate","news","nfs","ni","nih","nntp",
"opaquelocktoken","pop","pres","rtsp","service","session","shttp","sieve",
"sip","sips","sms","snmp","soap.beep","soap.beeps","tag","tel","telnet",
"tftp","thismessage","tn3270","tip","tv","urn","vemmi","ws","wss","xcon",
"xcon-userid","xmlrpc.beep","xmlrpc.beeps","xmpp","z39.50r","z39.50s",
"adiumxtra","afp","afs","aim","apt","attachment","aw","beshare","bitcoin",
"bolo","callto","chrome","chrome-extension","com-eventbrite-attendee",
"content", "cvs","dlna-playsingle","dlna-playcontainer","dtn","dvb",
"ed2k","facetime","feed","finger","fish","gg","git","gizmoproject",
"gtalk","hcp","icon","ipn","irc","irc6","ircs","itms","jar","jms",
"keyparc","lastfm","ldaps","magnet","maps","market","message","mms",
"ms-help","msnim","mumble","mvn","notes","oid","palm","paparazzi",
"platform","proxy","psyc","query","res","resource","rmi","rsync",
"rtmp","secondlife","sftp","sgn","skype","smb","soldat","spotify",
"ssh","steam","svn","teamspeak","things","udp","unreal","ut2004",
"ventrilo","view-source","webcal","wtai","wyciwyg","xfire","xri",
"ymsgr", "isbn", "pmid"]

uriScheme :: Stream s m Char => ParserT s st m String
uriScheme = oneOfStringsCI schemes
uriScheme = oneOfStringsCI (Set.toList schemes)

-- | Parses a URI. Returns pair of original and URI-escaped version.
uri :: Stream [Char] m Char => ParserT [Char] st m (String, String)
Expand Down
44 changes: 43 additions & 1 deletion src/Text/Pandoc/Shared.hs
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,9 @@ module Text.Pandoc.Shared (
openURL,
collapseFilePath,
filteredFilesFromArchive,
-- * URI handling
schemes,
isURI,
-- * Error handling
mapLeft,
-- * for squashing blocks
Expand All @@ -104,7 +107,7 @@ import Data.List ( find, stripPrefix, intercalate )
import Data.Maybe (mapMaybe)
import Data.Version ( showVersion )
import qualified Data.Map as M
import Network.URI ( escapeURIString, unEscapeString )
import Network.URI ( URI(uriScheme), escapeURIString, unEscapeString, parseURI )
import qualified Data.Set as Set
import System.Directory
import System.FilePath (splitDirectories, isPathSeparator)
Expand Down Expand Up @@ -774,6 +777,45 @@ filteredFilesFromArchive zf f =
fileAndBinary :: Archive -> FilePath -> Maybe (FilePath, BL.ByteString)
fileAndBinary a fp = findEntryByPath fp a >>= \e -> Just (fp, fromEntry e)


--
-- IANA URIs
--

-- | Schemes from http://www.iana.org/assignments/uri-schemes.html plus
-- the unofficial schemes coap, doi, javascript, isbn, pmid
schemes :: Set.Set String
schemes = Set.fromList [
"aaa", "aaas", "about", "acap", "adiumxtra", "afp", "afs", "aim", "apt",
"attachment", "aw", "beshare", "bitcoin", "bolo", "callto", "cap", "chrome",
"chrome-extension", "cid", "coap", "com-eventbrite-attendee", "content",
"crid", "cvs", "data", "dav", "dict", "dlna-playcontainer", "dlna-playsingle",
"dns", "doi", "dtn", "dvb", "ed2k", "facetime", "feed", "file", "finger",
"fish", "ftp", "geo", "gg", "git", "gizmoproject", "go", "gopher", "gtalk",
"h323", "hcp", "http", "https", "iax", "icap", "icon", "im", "imap", "info",
"ipn", "ipp", "irc", "irc6", "ircs", "iris", "iris.beep", "iris.lwz",
"iris.xpc", "iris.xpcs", "isbn", "itms", "jar", "javascript", "jms", "keyparc",
"lastfm", "ldap", "ldaps", "magnet", "mailto", "maps", "market", "message",
"mid", "mms", "ms-help", "msnim", "msrp", "msrps", "mtqp", "mumble", "mupdate",
"mvn", "news", "nfs", "ni", "nih", "nntp", "notes", "oid", "opaquelocktoken",
"palm", "paparazzi", "platform", "pmid", "pop", "pres", "proxy", "psyc",
"query", "res", "resource", "rmi", "rsync", "rtmp", "rtsp", "secondlife",
"service", "session", "sftp", "sgn", "shttp", "sieve", "sip", "sips", "skype",
"smb", "sms", "snmp", "soap.beep", "soap.beeps", "soldat", "spotify", "ssh",
"steam", "svn", "tag", "teamspeak", "tel", "telnet", "tftp", "things",
"thismessage", "tip", "tn3270", "tv", "udp", "unreal", "urn", "ut2004",
"vemmi", "ventrilo", "view-source", "webcal", "ws", "wss", "wtai", "wyciwyg",
"xcon", "xcon-userid", "xfire", "xmlrpc.beep", "xmlrpc.beeps", "xmpp", "xri",
"ymsgr", "z39.50r", "z39.50s"
]

-- | Check if the string is a valid URL with a IANA or frequently used but
-- unofficial scheme (see @schemes@).
isURI :: String -> Bool
isURI = maybe False hasKnownScheme . parseURI
where
hasKnownScheme = (`Set.member` schemes) . filter (/= ':') . uriScheme

---
--- Squash blocks into inlines
---
Expand Down
2 changes: 1 addition & 1 deletion src/Text/Pandoc/Writers/ConTeXt.hs
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ import Control.Monad.State
import Data.Char (ord)
import Data.List (intercalate, intersperse)
import Data.Maybe (catMaybes)
import Network.URI (isURI, unEscapeString)
import Network.URI (unEscapeString)
import Text.Pandoc.Class (PandocMonad, report)
import Text.Pandoc.Logging
import Text.Pandoc.Definition
Expand Down
1 change: 0 additions & 1 deletion src/Text/Pandoc/Writers/Haddock.hs
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ module Text.Pandoc.Writers.Haddock (writeHaddock) where
import Control.Monad.State
import Data.Default
import Data.List (intersperse, transpose)
import Network.URI (isURI)
import Text.Pandoc.Class (PandocMonad, report)
import Text.Pandoc.Definition
import Text.Pandoc.Logging
Expand Down
2 changes: 1 addition & 1 deletion src/Text/Pandoc/Writers/LaTeX.hs
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ import Data.List (foldl', intercalate, intersperse, isInfixOf, nub, nubBy,
stripPrefix, (\\))
import Data.Maybe (catMaybes, fromMaybe, isJust)
import qualified Data.Text as T
import Network.URI (isURI, unEscapeString)
import Network.URI (unEscapeString)
import Text.Pandoc.Class (PandocMonad, report)
import Text.Pandoc.Definition
import Text.Pandoc.Highlighting (formatLaTeXBlock, formatLaTeXInline, highlight,
Expand Down
1 change: 0 additions & 1 deletion src/Text/Pandoc/Writers/Markdown.hs
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@ import qualified Data.Text as T
import qualified Data.Vector as V
import Data.Yaml (Value (Array, Bool, Number, Object, String))
import Network.HTTP (urlEncode)
import Network.URI (isURI)
import Text.HTML.TagSoup (Tag (..), isTagText, parseTags)
import Text.Pandoc.Class (PandocMonad, report)
import Text.Pandoc.Definition
Expand Down
1 change: 0 additions & 1 deletion src/Text/Pandoc/Writers/MediaWiki.hs
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@ import Control.Monad.Reader
import Control.Monad.State
import Data.List (intercalate)
import qualified Data.Set as Set
import Network.URI (isURI)
import Text.Pandoc.Class (PandocMonad, report)
import Text.Pandoc.Logging
import Text.Pandoc.Definition
Expand Down
1 change: 0 additions & 1 deletion src/Text/Pandoc/Writers/RST.hs
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@ import Control.Monad.State
import Data.Char (isSpace, toLower)
import Data.List (isPrefixOf, stripPrefix)
import Data.Maybe (fromMaybe)
import Network.URI (isURI)
import qualified Text.Pandoc.Builder as B
import Text.Pandoc.Class (PandocMonad, report)
import Text.Pandoc.Logging
Expand Down
2 changes: 1 addition & 1 deletion src/Text/Pandoc/Writers/Texinfo.hs
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ import Data.Char (chr, ord)
import Data.List (maximumBy, transpose)
import Data.Ord (comparing)
import qualified Data.Set as Set
import Network.URI (isURI, unEscapeString)
import Network.URI (unEscapeString)
import System.FilePath
import Text.Pandoc.Class (PandocMonad, report)
import Text.Pandoc.Definition
Expand Down

0 comments on commit db3cae7

Please sign in to comment.