Skip to content

Commit

Permalink
Add unexported module Text.Pandoc.Char.
Browse files Browse the repository at this point in the history
This exports `isCJK`.  Use this instead of locally defined
`isCJK` in T.P.Readers.MediaWiki.
  • Loading branch information
jgm committed May 29, 2024
1 parent 29fa97a commit 2f1f066
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 17 deletions.
1 change: 1 addition & 0 deletions pandoc.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -739,6 +739,7 @@ library
Text.Pandoc.Writers.Blaze,
Text.Pandoc.Writers.Powerpoint.Presentation,
Text.Pandoc.Writers.Powerpoint.Output,
Text.Pandoc.Char,
Text.Pandoc.TeX,
Text.Pandoc.URI,
Text.Pandoc.CSS,
Expand Down
59 changes: 59 additions & 0 deletions src/Text/Pandoc/Char.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
{- |
Module : Text.Pandoc.Char
Copyright : Copyright (C) 2024 John MacFarlane
License : GNU GPL, version 2 or above
Maintainer : John MacFarlane <jgm@berkeley.edu>
Stability : alpha
Portability : portable
Character functions not defined in Data.Char.
-}
module Text.Pandoc.Char ( isCJK )
where

-- | Returns True if the character lies in one of the Unicode blocks
-- treated here as CJK. This covers:
--
-- * the CJK Unified Ideographs blocks (basic block and Extensions A-H),
-- * the CJK Compatibility Ideographs blocks,
-- * radicals, strokes and ideographic description characters,
-- * Hiragana, Katakana and the kana supplement blocks,
-- * Bopomofo, Hangul Compatibility Jamo, Kanbun and Yi,
-- * CJK symbols, punctuation, enclosed and compatibility forms,
--   and halfwidth/fullwidth forms.
--
-- The exact code point ranges are listed inline in the definition.
isCJK :: Char -> Bool
isCJK c =
  -- Fast path: U+2E80 is the lowest code point of any range below, so
  -- every smaller character is rejected with a single comparison.
  -- This bound must never exceed the smallest range start, otherwise
  -- the lower ranges become unreachable.
  c >= '\x2e80' &&
  (    (c >= '\x2e80' && c <= '\x2eff') -- CJK Radicals Supplement
    || (c >= '\x2f00' && c <= '\x2fdf') -- Kangxi Radicals
    || (c >= '\x2ff0' && c <= '\x2fff') -- Ideographic Description Characters
    || (c >= '\x3000' && c <= '\x303f') -- CJK Symbols and Punctuation
    || (c >= '\x3040' && c <= '\x309f') -- Hiragana
    || (c >= '\x30a0' && c <= '\x30ff') -- Katakana
    || (c >= '\x3100' && c <= '\x312f') -- Bopomofo
    || (c >= '\x3130' && c <= '\x318f') -- Hangul Compatibility Jamo
    || (c >= '\x3190' && c <= '\x319f') -- Kanbun
    || (c >= '\x31c0' && c <= '\x31ef') -- CJK Strokes
    || (c >= '\x31f0' && c <= '\x31ff') -- Katakana Phonetic Extensions
    || (c >= '\x3200' && c <= '\x32ff') -- Enclosed CJK Letters & Months
    || (c >= '\x3300' && c <= '\x33ff') -- CJK Compatibility
    || (c >= '\x3400' && c <= '\x4dbf') -- CJK Unified Ideographs Extension A
    || (c >= '\x4e00' && c <= '\x9fff') -- CJK Unified Ideographs
    || (c >= '\xa000' && c <= '\xa48f') -- Yi Syllables
    || (c >= '\xa490' && c <= '\xa4cf') -- Yi Radicals
    || (c >= '\xf900' && c <= '\xfaff') -- CJK Compatibility Ideographs
    || (c >= '\xfe10' && c <= '\xfe1f') -- Vertical Forms
    || (c >= '\xfe30' && c <= '\xfe4f') -- CJK Compatibility Forms
    || (c >= '\xFE50' && c <= '\xFE6F') -- Small Form Variants
    || (c >= '\xFF00' && c <= '\xFFEE') -- Halfwidth and Fullwidth Forms
    || (c >= '\x1B000' && c <= '\x1B0FF') -- Kana Supplement
    || (c >= '\x1B100' && c <= '\x1B12F') -- Kana Extended-A
    || (c >= '\x1B130' && c <= '\x1B16F') -- Small Kana Extension
    || (c >= '\x20000' && c <= '\x2A6DF') -- CJK Unified Ideographs Extension B
    || (c >= '\x2A700' && c <= '\x2B73F') -- CJK Unified Ideographs Extension C
    || (c >= '\x2B740' && c <= '\x2B81F') -- CJK Unified Ideographs Extension D
    || (c >= '\x2B820' && c <= '\x2CEAF') -- CJK Unified Ideographs Extension E
    || (c >= '\x2CEB0' && c <= '\x2EBEF') -- CJK Unified Ideographs Extension F
    || (c >= '\x2F800' && c <= '\x2FA1F') -- CJK Compatibility Ideographs Supp
    || (c >= '\x30000' && c <= '\x3134F') -- CJK Unified Ideographs Extension G
       -- Extension H was matched by the MediaWiki reader's former local
       -- isCJK; keep it so switching to this function loses nothing.
    || (c >= '\x31350' && c <= '\x323AF') -- CJK Unified Ideographs Extension H
  )
18 changes: 1 addition & 17 deletions src/Text/Pandoc/Readers/MediaWiki.hs
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ import Text.Pandoc.Parsing hiding (tableCaption)
import Text.Pandoc.Readers.HTML (htmlTag, isBlockTag, isCommentTag, toAttr)
import Text.Pandoc.Shared (safeRead, stringify, stripTrailingNewlines,
trim, splitTextBy, tshow, formatCode)
import Text.Pandoc.Char (isCJK)
import Text.Pandoc.XML (fromEntities)

-- | Read mediawiki from an input string and return a Pandoc document.
Expand Down Expand Up @@ -677,23 +678,6 @@ internalLink = try $ do
return mempty
else return link

-- | True when the character falls inside one of the code point ranges
-- in 'cjkRanges' (bounds are inclusive).
isCJK :: Char -> Bool
isCJK c = any contains cjkRanges
  where
    -- Inclusive membership test for a single (low, high) range.
    contains (lo, hi) = lo <= c && c <= hi
    -- Ranges are kept in the same order as the original disjunction.
    cjkRanges =
      [ ('\x3400', '\x4DBF')
      , ('\x4E00', '\x9FFF')
      , ('\x20000', '\x2A6DF')
      , ('\x2A700', '\x2B73F')
      , ('\x2B740', '\x2B81F')
      , ('\x2B820', '\x2CEAF')
      , ('\x2CEB0', '\x2EBEF')
      , ('\x30000', '\x3134F')
      , ('\x31350', '\x323AF')
      , ('\xF900', '\xFAFF')
      , ('\x2F800', '\x2FA1F')
      , ('\x2F00', '\x2FDF')
      , ('\x2E80', '\x2EFF')
      , ('\x3000', '\x303F')
      ]

externalLink :: PandocMonad m => MWParser m Inlines
externalLink = try $ do
char '['
Expand Down

2 comments on commit 2f1f066

@jiucenglou
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you advise whether it is possible to use Text.Pandoc.Char.isCJK in Lua filters?

@jgm
Copy link
Owner Author

@jgm jgm commented on 2f1f066 May 30, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Currently it's not. This would have to be added to the Lua API.
I'd suggest simply implementing this function directly in Lua.

Please sign in to comment.