Add default tokenizer type protocol (#3)
* Add DefaultTokenizerType.

* Update tests.

* Update documentation.
mathewsanders authored Jan 4, 2017
1 parent 4e77fad commit 6aff827
Showing 15 changed files with 98 additions and 73 deletions.
9 changes: 2 additions & 7 deletions Documentation/Literal tokenizer.md
@@ -16,12 +16,7 @@ class LiteralTokenizer: TokenizerType {
private let target: String
private var position: String.UnicodeScalarIndex

// required by the TokenizerType protocol, but non-sensical to use
required convenience init() {
self.init(target: "")
}

// instead, we should initialize instance with the target String we're looking for
// initialize a tokenizer with the target String we're looking for
init(target: String) {
self.target = target
self.position = target.unicodeScalars.startIndex
@@ -101,4 +96,4 @@ for token in tokens {

````

See [FuzzyMatchTokenTests.swift](/Mustard/MustardTests/FuzzyMatchTokenTests.swift) for a unit test that includes matching a literal String, but allowing some flexibility in the literal match by ignoring certain characters.
2 changes: 1 addition & 1 deletion Documentation/Template tokenizer.md
@@ -17,7 +17,7 @@ func ~= (option: CharacterSet, input: UnicodeScalar) -> Bool {
return option.contains(input)
}

class DateTokenizer: TokenizerType {
class DateTokenizer: TokenizerType, DefaultTokenizerType {

// private properties
private let _template = "00/00/00"
39 changes: 20 additions & 19 deletions Documentation/TokenizerType protocol.md
@@ -8,6 +8,7 @@ Here's a slimmed down view of the protocol (see [`TokenizerType.swift`](/Mustard

````Swift

/// Defines the implementation needed to create a tokenizer for use with Mustard.
public protocol TokenizerType {

/* required methods */
@@ -22,9 +23,6 @@ public protocol TokenizerType {
/// - Returns: `true` if the token can take this scalar; otherwise, false.
func tokenCanTake(_ scalar: UnicodeScalar) -> Bool

// structs get this for free if any properties have default values
init()

/* default implementations provided */

// default implementation returns self if `tokenCanStart(with:)` returns true, otherwise nil
@@ -49,22 +47,37 @@ public protocol TokenizerType {

````

As an example, here's the extension that Mustard uses to allow any `CharacterSet` to act as a tokenizer.
A brief additional protocol, `DefaultTokenizerType`, can be adopted by tokenizers that have a default initializer;
this provides some useful convenience methods (see [type safety using a single tokenizer](Type safety using a single tokenizer.md) for more information).

````Swift
/// Defines the implementation needed for a TokenizerType to have some convenience methods
/// enabled when the tokenizer has a default initializer.
public protocol DefaultTokenizerType: TokenizerType {

/// Initialize an empty instance of the tokenizer.
init()
}
````

Implementations of tokenizers can range from trivial to complex.

As an example, here's the extension Mustard provides to allow any `CharacterSet` to act as a tokenizer:

extension CharacterSet: TokenizerType {
````Swift

extension CharacterSet: TokenizerType, DefaultTokenizerType {
public func tokenCanTake(_ scalar: UnicodeScalar) -> Bool {
return self.contains(scalar)
}
}

````

Here's an example showing how to match individuals words identified by [camel case](https://en.wikipedia.org/wiki/Camel_case):
Here's a *slightly* more complex example showing a tokenizer that matches words identified by [camel case](https://en.wikipedia.org/wiki/Camel_case):

````Swift
struct CamelCaseTokenizer: TokenizerType {
struct CamelCaseTokenizer: TokenizerType, DefaultTokenizerType {

// start of token is identified by an uppercase letter
func tokenCanStart(with scalar: UnicodeScalar) -> Bool
@@ -78,16 +91,4 @@ struct CamelCaseTokenizer: TokenizerType {
}
````

Mustard uses instances of TokenizerType to perform tokenization. If your `TokenizerType` uses the default
initializer, you have the option of using the static property `defaultTokenizer` as a semantic alias.

````Swift
let words = "HelloWorld".tokens(matchedWith: CamelCaseTokenizer.defaultTokenizer)
// `CamelCaseTokenizer.defaultTokenizer` is equivalent to `CamelCaseTokenizer()`

// words.count -> 2
// words[0].text -> "Hello"
// words[1].text -> "World"
````

For more complex examples of implementing TokenizerType, see examples for [EmojiTokenizer](Matching emoji.md), [LiteralTokenizer](Literal tokenizer.md), [DateTokenizer](Template tokenizer.md), and [unit tests](/Mustard/MustardTests).
29 changes: 24 additions & 5 deletions Documentation/Type safety using a single tokenizer.md
@@ -1,6 +1,6 @@
# Type safety using a single tokenizer

When matching with multiple tokenizers, there is no choice but to return an array of `Token` where the tokenizer element is the of the type `TokenizerType`.
When matching with multiple types of tokenizer, Swift has no option but to return an array of `Token` where the tokenizer element has the protocol type `TokenizerType`.

To make use of the `tokenizer` element, you need to either use type casting (using `as?`) or type checking (using `is`) to figure out what type of tokenizer matched the substring.

@@ -11,6 +11,7 @@ import Mustard

let tokens = "123Hello world&^45.67".tokens(matchedWith: .decimalDigits, .letters)
// tokens.count -> 5
// tokens[0].tokenizer -> type is `TokenizerType`

let numberTokens = tokens.filter({ $0.tokenizer is NumberTokenizer })
// numberTokens.count -> 0
@@ -20,7 +21,27 @@ While it's obvious to us why numberTokens is empty (the string was tokenized usi

This may seem like an obvious error, but it's the type of unexpected bug that can slip in when we're using loosely typed results.

Thankfully, Mustard can return a strongly typed set of matches if a single `TokenizerType` is used:
Thankfully, Mustard can return a strongly typed set of matches if a single `TokenizerType` is used.

Each `TokenizerType` includes a `Token` typealias for a tuple where the tokenizer element is the specific type of tokenizer, rather than the general protocol type.

For example, the signature for `CharacterSet.Token` is `(tokenizer: CharacterSet, text: String, range: Range<String.Index>)`.

Setting `CharacterSet.Token` as the result type allows Mustard to cast the results to the correct type, and lets the compiler warn you if you attempt something that doesn't make sense:

````Swift
import Mustard

let tokens: [CharacterSet.Token] = "123Hello world&^45.67".tokens(matchedWith: .decimalDigits, .letters)
// tokens.count -> 5
// tokens[0].tokenizer -> type is `CharacterSet`

let numberTokens = tokens.filter({ $0.tokenizer is NumberTokenizer })
// compiler warning: Cast from 'CharacterSet' to unrelated type 'NumberTokenizer' always fails
// numberTokens.count -> 0
````

Additionally, if the tokenizer adopts `DefaultTokenizerType` by providing a default initializer `init()`, then you get a convenient shorthand: the `tokens()` method infers the tokenizer to use from the result type:

````Swift
import Mustard
@@ -34,8 +55,6 @@ let numberTokens: [NumberTokenizer.Token] = "123Hello world&^45.67".tokens()

````

Using the `NumberTokenizer.Token` (which Mustard creates for every `TokenizerType`) which has a more specific type signature allows you to also use the shorter `tokens()` method which infers the type of tokenizer to match substrings.

## Bundling multiple types safely

Achieving type-safety by limiting to a single `TokenizerType` may seem like a strong constraint for practical use, but
@@ -44,7 +63,7 @@ with a little overhead it's possible to create a tokenizer that acts as a lightw
Here's an example `MixedTokenizer` that acts as a wrapper to existing word, number, and emoji tokenizers:

````Swift
enum MixedTokenizer: TokenizerType {
enum MixedTokenizer: TokenizerType, DefaultTokenizerType {

case word
case number
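The remainder of the enum is collapsed in this view. As a sketch only (not the committed code), the wrapper pattern described above might be completed along these lines, delegating to the `WordTokenizer`, `NumberTokenizer`, and `EmojiTokenizer` types defined in the test files below:

````Swift
import Mustard

enum MixedTokenizer: TokenizerType, DefaultTokenizerType {

    case word
    case number
    case emoji
    case none // the `none` case exists only so the enum can provide init()

    init() { self = .none }

    // delegate the decision to the tokenizer each case wraps
    func tokenCanTake(_ scalar: UnicodeScalar) -> Bool {
        switch self {
        case .word:   return WordTokenizer().tokenCanTake(scalar)
        case .number: return NumberTokenizer().tokenCanTake(scalar)
        case .emoji:  return EmojiTokenizer().tokenCanTake(scalar)
        case .none:   return false
        }
    }

    // return the wrapped case that can start a token with this scalar
    func token(startingWith scalar: UnicodeScalar) -> TokenizerType? {
        if WordTokenizer().tokenCanStart(with: scalar) { return MixedTokenizer.word }
        if NumberTokenizer().tokenCanStart(with: scalar) { return MixedTokenizer.number }
        if EmojiTokenizer().tokenCanStart(with: scalar) { return MixedTokenizer.emoji }
        return nil
    }
}
````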
2 changes: 1 addition & 1 deletion Mustard/Mustard/CharacterSet+Mustard.swift
@@ -22,7 +22,7 @@

import Foundation

extension CharacterSet: TokenizerType {
extension CharacterSet: TokenizerType, DefaultTokenizerType {
public func tokenCanTake(_ scalar: UnicodeScalar) -> Bool {
return self.contains(scalar)
}
13 changes: 11 additions & 2 deletions Mustard/Mustard/Mustard.swift
@@ -49,9 +49,18 @@ public extension String {
/// instead.
///
/// Returns: An array of type `TokenizerType.Token`.
func tokens<T: TokenizerType>() -> [(tokenizer: T, text: String, range: Range<String.Index>)] {
func tokens<T: DefaultTokenizerType>() -> [(tokenizer: T, text: String, range: Range<String.Index>)] {

return self.tokens(matchedWith: T()).flatMap({
return self.tokens(matchedWith: T.defaultTokenzier).flatMap({
if let tokenizer = $0.tokenizer as? T {
return (tokenizer: tokenizer, text: $0.text, range: $0.range)
}
else { return nil }
})
}

func tokens<T: TokenizerType>(matchedWith tokenizers: T...) -> [(tokenizer: T, text: String, range: Range<String.Index>)] {
return self.tokens(from: tokenizers).flatMap({
if let tokenizer = $0.tokenizer as? T {
return (tokenizer: tokenizer, text: $0.text, range: $0.range)
}
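For context, here's a brief usage sketch of the two entry points above; the result types come from the documentation examples in this commit, and the expected matches follow from the tokenizers' own rules:

````Swift
import Mustard

// the generic `tokens()` overload is now constrained to DefaultTokenizerType,
// so the tokenizer can be instantiated from the inferred result type alone
let numberTokens: [NumberTokenizer.Token] = "123Hello world&^45.67".tokens()
// numberTokens.count -> 2 ("123" and "45.67")

// the variadic overload still takes explicit tokenizer instances
let letterTokens: [CharacterSet.Token] = "123Hello world&^45.67".tokens(matchedWith: .letters)
// letterTokens.count -> 2 ("Hello" and "world")
````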
22 changes: 15 additions & 7 deletions Mustard/Mustard/TokenizerType.swift
@@ -29,6 +29,7 @@ import Foundation
/// - range: The range of the matched text in the original string.
public typealias Token = (tokenizer: TokenizerType, text: String, range: Range<String.Index>)

/// Defines the implementation needed to create a tokenizer for use with Mustard.
public protocol TokenizerType {

/// Returns an instance of a tokenizer that starts with the given scalar,
@@ -87,9 +88,6 @@ public protocol TokenizerType {
/// `tokenCanTake(_:)`
func prepareForReuse()

/// Initialize an empty instance of the tokenizer.
init()

/// Returns an instance of the tokenizer that will be used as the `tokenizer` element in the `Token` tuple.
///
/// If the tokenizer implements `NSCopying` protocol, the default implementation returns the result of
@@ -99,6 +97,20 @@ public protocol TokenizerType {
var tokenizerForMatch: TokenizerType { get }
}

/// Defines the implementation needed for a TokenizerType to have some convenience methods
/// enabled when the tokenizer has a default initializer.
public protocol DefaultTokenizerType: TokenizerType {

/// Initialize an empty instance of the tokenizer.
init()
}

extension DefaultTokenizerType {
/// The default tokenizer for this type.
/// This is equivalent to using the default initializer `init()`.
public static var defaultTokenzier: DefaultTokenizerType { return Self() }
}

public extension TokenizerType {

/// Token is a typealias for a tuple with the following named elements:
@@ -108,10 +120,6 @@ public extension TokenizerType {
/// - range: The range of the matched text in the original string.
typealias Token = (tokenizer: Self, text: String, range: Range<String.Index>)

/// The default tokenzier for this type.
/// This is equivilent to using the default initalizer `init()`.
static var defaultTokenzier: TokenizerType { return Self() }

func tokenCanStart(with scalar: UnicodeScalar) -> Bool {
return tokenCanTake(scalar)
}
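As a hypothetical illustration (not part of this commit), here's how a small stateful tokenizer might put the pieces above together: adopting `DefaultTokenizerType` supplies the `init()` used by the static `defaultTokenzier` property and the inferred `tokens()` method, while `prepareForReuse()` resets internal state between match attempts. The `TripleDigitTokenizer` name and behavior are invented for this sketch:

````Swift
import Mustard

// hypothetical example: matches runs of at most three decimal digits
final class TripleDigitTokenizer: TokenizerType, DefaultTokenizerType {

    private var count = 0

    init() {}

    func tokenCanTake(_ scalar: UnicodeScalar) -> Bool {
        guard count < 3, CharacterSet.decimalDigits.contains(scalar) else {
            return false
        }
        count += 1
        return true
    }

    // reset state so the same instance can attempt the next match
    func prepareForReuse() {
        count = 0
    }
}

// the result type selects and instantiates the tokenizer
let groups: [TripleDigitTokenizer.Token] = "1234 56".tokens()
// expected matches: "123", "4", "56"
````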
2 changes: 1 addition & 1 deletion Mustard/MustardTests/CharacterSetTokenTests.swift
@@ -35,7 +35,7 @@ class CharacterSetTokenTests: XCTestCase {

func testCharacterSetTokenizer() {

let tokens = "123Hello world&^45.67".tokens(matchedWith: .decimalDigits, .letters)
let tokens: [CharacterSet.Token] = "123Hello world&^45.67".tokens(matchedWith: .decimalDigits, .letters)

XCTAssert(tokens.count == 5, "Unexpected number of tokens [\(tokens.count)]")

4 changes: 2 additions & 2 deletions Mustard/MustardTests/CustomTokenTests.swift
@@ -23,7 +23,7 @@
import XCTest
import Mustard

struct NumberTokenizer: TokenizerType {
struct NumberTokenizer: TokenizerType, DefaultTokenizerType {

static private let numberCharacters = CharacterSet.decimalDigits.union(CharacterSet(charactersIn: "."))

@@ -38,7 +38,7 @@ struct NumberTokenizer: TokenizerType {
}
}

struct WordTokenizer: TokenizerType {
struct WordTokenizer: TokenizerType, DefaultTokenizerType {

// word token can include any character in a...z + A...Z
func tokenCanTake(_ scalar: UnicodeScalar) -> Bool {
14 changes: 7 additions & 7 deletions Mustard/MustardTests/DateTokenizerTests.swift
@@ -9,7 +9,7 @@
import XCTest
import Mustard

class DateTokenizer: TokenizerType {
class DateTokenizer: TokenizerType, DefaultTokenizerType {

// private properties
private let _template = "00/00/00"
@@ -22,19 +22,19 @@ class DateTokenizer: TokenizerType {
return _date!
}

// called when we access `DateToken.defaultTokenizer`
required init() {
_position = _template.unicodeScalars.startIndex
_dateText = ""
}

// formatters are expensive, so only instantiate once for all DateTokens
static let dateFormatter: DateFormatter = {
let dateFormatter = DateFormatter()
dateFormatter.dateFormat = "MM/dd/yy"
return dateFormatter
}()

// called when we access `DateToken.tokenizer`
required init() {
_position = _template.unicodeScalars.startIndex
_dateText = ""
}

func tokenCanTake(_ scalar: UnicodeScalar) -> Bool {

guard _position < _template.unicodeScalars.endIndex else {
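The rest of the matching logic is collapsed above. As a sketch of how the template comparison can continue (not necessarily the committed body), the `switch` below leans on the `~=` overload from the Template tokenizer documentation so a `CharacterSet` can appear as a pattern:

````Swift
// continuing tokenCanTake(_:) after the guard shown above:
switch (_template.unicodeScalars[_position], scalar) {
// '0' in the template accepts any decimal digit;
// any other template character (here '/') must match exactly
case ("0", CharacterSet.decimalDigits), ("/", "/"):
    _position = _template.unicodeScalars.index(after: _position)
    _dateText.unicodeScalars.append(scalar)
    return true
default:
    return false
}
````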
2 changes: 1 addition & 1 deletion Mustard/MustardTests/EmojiTokenTests.swift
@@ -23,7 +23,7 @@
import XCTest
import Mustard

struct EmojiTokenizer: TokenizerType {
struct EmojiTokenizer: TokenizerType, DefaultTokenizerType {

// (e.g. can't start with a ZWJ)
func tokenCanStart(with scalar: UnicodeScalar) -> Bool {
Expand Down
8 changes: 1 addition & 7 deletions Mustard/MustardTests/FuzzyMatchTokenTests.swift
@@ -34,10 +34,6 @@ class FuzzyLiteralMatch: TokenizerType {
private let exclusions: CharacterSet
private var position: String.UnicodeScalarIndex

required convenience init() {
self.init(target: "", ignoring: CharacterSet.whitespaces)
}

init(target: String, ignoring exclusions: CharacterSet) {
self.target = target
self.position = target.unicodeScalars.startIndex
@@ -101,11 +97,9 @@ class FuzzyMatchTokenTests: XCTestCase {
ignoring: CharacterSet.whitespaces.union(.punctuationCharacters))

let messyInput = "Serial: #YF 1942-b 12/01/27 (Scanned) 12/02/27 (Arrived) ref: 99/99/99"
let tokens = messyInput.tokens(matchedWith: fuzzyTokenzier)
let tokens: [FuzzyLiteralMatch.Token] = messyInput.tokens(matchedWith: fuzzyTokenzier)

XCTAssert(tokens.count == 1, "Unexpected number of tokens [\(tokens.count)]")

XCTAssert(tokens[0].tokenizer is FuzzyLiteralMatch)
XCTAssert(tokens[0].text == "#YF 1942-b")
}
}
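The fuzzy-matching body is collapsed above. As a simplified sketch of the approach the test exercises (the details are assumptions, not the committed code): scalars in the `exclusions` set are consumed without advancing the match position, while every other scalar must match the target, ignoring case:

````Swift
// inside FuzzyLiteralMatch
func tokenCanTake(_ scalar: UnicodeScalar) -> Bool {
    guard position < target.unicodeScalars.endIndex else { return false }

    if exclusions.contains(scalar) {
        // ignored characters join the token without advancing the match
        return true
    }

    let expected = target.unicodeScalars[position]
    if String(expected).lowercased() == String(scalar).lowercased() {
        position = target.unicodeScalars.index(after: position)
        return true
    }
    return false
}
````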
12 changes: 1 addition & 11 deletions Mustard/MustardTests/LiteralTokenTests.swift
@@ -29,11 +29,6 @@ class LiteralTokenizer: TokenizerType {
private let target: String
private var position: String.UnicodeScalarIndex

// required by the TokenType protocol, but non-sensical to use
required convenience init() {
self.init(target: "")
}

// instead, we should initialize the instance with the target String we're looking for
init(target: String) {
self.target = target
@@ -91,21 +86,16 @@ extension String {
}
}


class LiteralTokenTests: XCTestCase {

func testGetCatAndDuck() {

let input = "the cat and the catastrophe duck"
let tokens = input.tokens(matchedWith: "cat".literalTokenizer, "duck".literalTokenizer)
let tokens: [LiteralTokenizer.Token] = input.tokens(matchedWith: "cat".literalTokenizer, "duck".literalTokenizer)

XCTAssert(tokens.count == 2, "Unexpected number of tokens [\(tokens.count)]")

XCTAssert(tokens[0].tokenizer is LiteralTokenizer)
XCTAssert(tokens[0].text == "cat")

XCTAssert(tokens[1].tokenizer is LiteralTokenizer)
XCTAssert(tokens[1].text == "duck")

}
}
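The `String` extension that provides `literalTokenizer` is collapsed above; presumably it's a thin wrapper around the tokenizer's initializer (a sketch, not the committed code):

````Swift
extension String {
    // hands back a tokenizer that matches this string literally
    var literalTokenizer: LiteralTokenizer {
        return LiteralTokenizer(target: self)
    }
}
````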
2 changes: 1 addition & 1 deletion Mustard/MustardTests/MixedTokenTests.swift
@@ -23,7 +23,7 @@
import XCTest
import Mustard

enum MixedTokenizer: TokenizerType {
enum MixedTokenizer: TokenizerType, DefaultTokenizerType {

case word
case number