From fd48a06bd9b1e643cb620cdf3abecdd3e06436f9 Mon Sep 17 00:00:00 2001 From: Mehmet Cetin Date: Thu, 8 Oct 2015 10:40:14 +0300 Subject: [PATCH 1/4] readme typo fix --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 64054b2..21e395f 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,7 @@ Textoken('Oh, no! Alfa 2000 is at home.', only_regexp: '^[0-9]*$').tokens You can combine all options. 'Only' and 'Exclude' Options support multiple option values like **only: 'punctuations, dates, numerics'** -Public interface of Textoken presents two methods, tokens & word; +Public interface of Textoken presents two methods, tokens & words; ```ruby Textoken('Alfa.').tokens From e26d98c468c39309c82c65e05f1d1a0607eeb813 Mon Sep 17 00:00:00 2001 From: Mehmet Cetin Date: Fri, 9 Oct 2015 00:21:04 +0300 Subject: [PATCH 2/4] Refactorings and preperation for pattern search. --- lib/textoken.rb | 1 + lib/textoken/options/exclude.rb | 5 +- lib/textoken/options/less_than.rb | 7 ++- .../options/modules/conditional_option.rb | 4 ++ .../options/modules/numeric_option.rb | 6 ++- .../options/modules/tokenizable_option.rb | 18 +++++++ lib/textoken/options/more_than.rb | 7 ++- lib/textoken/options/only.rb | 5 +- .../modules/tokenizable_option_spec.rb | 50 +++++++++++++++++++ 9 files changed, 90 insertions(+), 13 deletions(-) create mode 100644 lib/textoken/options/modules/tokenizable_option.rb create mode 100644 spec/lib/textoken/options/modules/tokenizable_option_spec.rb diff --git a/lib/textoken.rb b/lib/textoken.rb index b36e042..bfbcc7a 100644 --- a/lib/textoken.rb +++ b/lib/textoken.rb @@ -9,6 +9,7 @@ require 'textoken/tokenizer' require 'textoken/scanner' +require 'textoken/options/modules/tokenizable_option' require 'textoken/options/modules/numeric_option' require 'textoken/options/modules/conditional_option' require 'textoken/options/modules/regexp_option' diff --git a/lib/textoken/options/exclude.rb b/lib/textoken/options/exclude.rb index 3c408cd..0bb6c17 100644 --- a/lib/textoken/options/exclude.rb +++ b/lib/textoken/options/exclude.rb @@ -4,11 +4,12 @@ module Textoken class Exclude include ConditionalOption + private + # base.text is raw tokens splitted with ' ' # values are Regexps array to search # base.findings, Findings object for pushing matching tokens - def tokenize(base) - @base = base + def tokenize_condition tokenize_if { |word, regexp| !word.match(regexp) } end end diff --git a/lib/textoken/options/less_than.rb b/lib/textoken/options/less_than.rb index ad02f9c..283a5ff 100644 --- a/lib/textoken/options/less_than.rb +++ b/lib/textoken/options/less_than.rb @@ -4,13 +4,12 @@ module Textoken class LessThan include NumericOption - def tokenize(base) - @base = base + private + + def tokenize_condition tokenize_if { |word| word.length < number } end - private - def validate_option_value(value) validate { value.class == Fixnum && value > 1 } end diff --git a/lib/textoken/options/modules/conditional_option.rb b/lib/textoken/options/modules/conditional_option.rb index 757f644..17bf1e6 100644 --- a/lib/textoken/options/modules/conditional_option.rb +++ b/lib/textoken/options/modules/conditional_option.rb @@ -1,6 +1,8 @@ module Textoken # This module will be shared in options like, only and exclude module ConditionalOption + include TokenizableOption + attr_reader :regexps, :findings, :base def priority @@ -12,6 +14,8 @@ def initialize(values) @findings = Findings.new end + private + def tokenize_if(&block) regexps.each do |r| base.text.each_with_index do |w, i| diff --git a/lib/textoken/options/modules/numeric_option.rb b/lib/textoken/options/modules/numeric_option.rb index 8dcd38a..e5f3319 100644 --- a/lib/textoken/options/modules/numeric_option.rb +++ b/lib/textoken/options/modules/numeric_option.rb @@ -1,7 +1,9 @@ module Textoken # This module will be shared in options like, more_than and less_than module NumericOption - attr_reader :number, :findings, :base + include TokenizableOption + + attr_reader :number, :findings def priority 2 @@ -13,6 +15,8 @@ def initialize(value) @findings = Findings.new end + private + def tokenize_if(&code) base.text.each_with_index do |w, i| findings.push(i, w) if code.call(w) diff --git a/lib/textoken/options/modules/tokenizable_option.rb b/lib/textoken/options/modules/tokenizable_option.rb new file mode 100644 index 0000000..5662f34 --- /dev/null +++ b/lib/textoken/options/modules/tokenizable_option.rb @@ -0,0 +1,18 @@ +module Textoken + # This module will be shared in options like, only_regexp and exclude_regexp + module TokenizableOption + attr_reader :base + + def tokenize(base) + @base = base + tokenize_condition + end + + private + + def tokenize_condition + Textoken.type_err('tokenize_condition method has to be implemented + for Options.') + end + end +end diff --git a/lib/textoken/options/more_than.rb b/lib/textoken/options/more_than.rb index d5b967d..4935d6b 100644 --- a/lib/textoken/options/more_than.rb +++ b/lib/textoken/options/more_than.rb @@ -4,13 +4,12 @@ module Textoken class MoreThan include NumericOption - def tokenize(base) - @base = base + private + + def tokenize_condition tokenize_if { |word| word.length > number } end - private - def validate_option_value(value) validate { value.class == Fixnum && value >= 0 } end diff --git a/lib/textoken/options/only.rb b/lib/textoken/options/only.rb index 59c0cab..e4c109f 100644 --- a/lib/textoken/options/only.rb +++ b/lib/textoken/options/only.rb @@ -4,11 +4,12 @@ module Textoken class Only include ConditionalOption + private + # base.text is raw tokens splitted with ' ' # values are Regexps array to search # base.findings, Findings object for pushing matching tokens - def tokenize(base) - @base = base + def tokenize_condition tokenize_if { |word, regexp| word.match(regexp) } end end diff --git a/spec/lib/textoken/options/modules/tokenizable_option_spec.rb b/spec/lib/textoken/options/modules/tokenizable_option_spec.rb new file mode 100644 index 0000000..e2bb188 --- /dev/null +++ b/spec/lib/textoken/options/modules/tokenizable_option_spec.rb @@ -0,0 +1,50 @@ +require 'spec_helper' + +module Textoken + # A test dummy + class TheDumy + include TokenizableOption + + private + + def tokenize_condition + end + end +end + +module Textoken + # Another test dummy + class TheErrorDumy + include TokenizableOption + end +end + +describe Textoken::TokenizableOption do + describe '#tokenize' do + context 'sets the base' do + it 'as expected' do + t = Textoken::TheDumy.new + object = Object.new + t.tokenize(object) + expect(t.base).to eq(object) + end + end + + context 'sends tokenize_condition' do + it 'as expected' do + t = Textoken::TheDumy.new + expect(t).to receive(:tokenize_condition) + t.tokenize(Object.new) + end + end + + context 'raises error when not implemented' do + it 'as expected' do + t = Textoken::TheErrorDumy.new + expect do + t.tokenize(Object.new) + end.to raise_error + end + end + end +end From 01716e93ba06d47241c8b78022a354604e967465 Mon Sep 17 00:00:00 2001 From: Mehmet Cetin Date: Fri, 9 Oct 2015 00:46:00 +0300 Subject: [PATCH 3/4] readme changes and some polishing. --- README.md | 28 +++++++++---------- lib/textoken/options/less_than.rb | 4 +-- .../options/modules/numeric_option.rb | 4 +-- lib/textoken/options/more_than.rb | 4 +-- .../options/modules/numeric_option_spec.rb | 4 +-- .../modules/tokenizable_option_spec.rb | 2 +- textoken.gemspec | 5 +--- 7 files changed, 24 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index 21e395f..e327013 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ [![Dependency Status](https://gemnasium.com/manorie/textoken.svg)](https://gemnasium.com/manorie/textoken) [![Gem Version](https://badge.fury.io/rb/textoken.svg)](http://badge.fury.io/rb/textoken) -Textoken is a Ruby library for text tokenization. This gem extracts words from text with many customizations. It can be used in many fields like crawling and Natural Language Processing. +Textoken is a Ruby library for text tokenization. This gem extracts words from text with many customizations. It can be used in many fields like Web Crawling and Natural Language Processing. ## Basic Usage @@ -43,7 +43,7 @@ Textoken('Oh, no! Alfa 2000 is at home.', only_regexp: '^[0-9]*$').tokens You can combine all options. 'Only' and 'Exclude' Options support multiple option values like **only: 'punctuations, dates, numerics'** -Public interface of Textoken presents two methods, tokens & words; +Public interface of Textoken presents two methods, **tokens** & **words** ```ruby Textoken('Alfa.').tokens @@ -57,31 +57,31 @@ Textoken('Alfa.').words ## Current Options -**only:** accepts any regexp defined in [option_values.yml](//github.com/manorie/textoken/blob/development/lib/textoken/regexps/option_values.yml) +- **only:** Accepts any regexp defined in [option_values.yml](//github.com/manorie/textoken/blob/development/lib/textoken/regexps/option_values.yml) -**exclude:** accepts any regexp defined in [option_values.yml](https://github.com/manorie/textoken/blob/development/lib/textoken/regexps/option_values.yml) +- **only_regexp:** Accepts any regexp but only one regexp can be given. -**less_than:** accepts any integer bigger than 1 +- **exclude:** Accepts any regexp defined in [option_values.yml](https://github.com/manorie/textoken/blob/development/lib/textoken/regexps/option_values.yml) -**more_than:** accepts any positive integer +- **exclude_regexp** Accepts any regexp but only one regexp can be given. -**only_regexp:** accepts any regexp but only one regexp can be given +- **less_than:** Accepts any integer bigger than 1. -**exclude_regexp** accepts any regexp but only one regexp can be given +- **more_than:** Accepts any positive integer. ## Option Meanings -**only:** If a word in text consist of a regexp or regexps, only option includes it in result. +- **only:** If a word in text consist of a regexp or regexps, only option includes it in result. -**only_regexp:** If a word in text consist of user given regexp, only_regexp option includes it in result. +- **only_regexp:** If a word in text consist of user given regexp, only_regexp option includes it in result. -**exclude:** If a word in text does not have a regexp at some part, exclude option excludes it from result. Opposite of only. +- **exclude:** If a word in text does not have a regexp at some part, exclude option excludes it from result. Opposite of only. -**exclude_regexp:** If a word in text does not have user given regexp at some part, exclude option excludes it from result. Opposite of only_regexp. +- **exclude_regexp:** If a word in text does not have user given regexp at some part, exclude option excludes it from result. Opposite of only_regexp. -**less_than:** Filters result by the word length less than the option value given. +- **less_than:** Filters result by the word length less than the option value given. -**more_than:** Filters result by the word length bigger than the option value given. +- **more_than:** Filters result by the word length bigger than the option value given. ## Installation diff --git a/lib/textoken/options/less_than.rb b/lib/textoken/options/less_than.rb index 283a5ff..92a13d5 100644 --- a/lib/textoken/options/less_than.rb +++ b/lib/textoken/options/less_than.rb @@ -10,8 +10,8 @@ def tokenize_condition tokenize_if { |word| word.length < number } end - def validate_option_value(value) - validate { value.class == Fixnum && value > 1 } + def validate_option_value + validate { |value| value > 1 } end end end diff --git a/lib/textoken/options/modules/numeric_option.rb b/lib/textoken/options/modules/numeric_option.rb index e5f3319..a3f101f 100644 --- a/lib/textoken/options/modules/numeric_option.rb +++ b/lib/textoken/options/modules/numeric_option.rb @@ -10,9 +10,9 @@ def priority end def initialize(value) - validate_option_value(value) @number = value @findings = Findings.new + validate_option_value end private @@ -25,7 +25,7 @@ def tokenize_if(&code) end def validate(&code) - return if code.call + return if number.class == Fixnum && code.call(number) Textoken.expression_err "value #{number} is not permitted for #{self.class.name} option." end diff --git a/lib/textoken/options/more_than.rb b/lib/textoken/options/more_than.rb index 4935d6b..304ec7f 100644 --- a/lib/textoken/options/more_than.rb +++ b/lib/textoken/options/more_than.rb @@ -10,8 +10,8 @@ def tokenize_condition tokenize_if { |word| word.length > number } end - def validate_option_value(value) - validate { value.class == Fixnum && value >= 0 } + def validate_option_value + validate { |value| value >= 0 } end end end diff --git a/spec/lib/textoken/options/modules/numeric_option_spec.rb b/spec/lib/textoken/options/modules/numeric_option_spec.rb index f462a33..2b6118b 100644 --- a/spec/lib/textoken/options/modules/numeric_option_spec.rb +++ b/spec/lib/textoken/options/modules/numeric_option_spec.rb @@ -17,8 +17,8 @@ def tokenize_false(base) private - def validate_option_value(value) - validate { value.class == Fixnum && value > 1 } + def validate_option_value + validate { |value| value > 1 } end end end diff --git a/spec/lib/textoken/options/modules/tokenizable_option_spec.rb b/spec/lib/textoken/options/modules/tokenizable_option_spec.rb index e2bb188..cd1e3b4 100644 --- a/spec/lib/textoken/options/modules/tokenizable_option_spec.rb +++ b/spec/lib/textoken/options/modules/tokenizable_option_spec.rb @@ -43,7 +43,7 @@ class TheErrorDumy t = Textoken::TheErrorDumy.new expect do t.tokenize(Object.new) - end.to raise_error + end.to raise_error(Textoken::TypeError) end end end diff --git a/textoken.gemspec b/textoken.gemspec index a96b310..05c61ce 100644 --- a/textoken.gemspec +++ b/textoken.gemspec @@ -11,14 +11,11 @@ Gem::Specification.new do |s| s.email = ["mcetin.cm@gmail.com"] s.homepage = "https://github.com/manorie/textoken" s.summary = "Simple and customizable text tokenization gem." - s.description = "Textoken is a Ruby library for text tokenization. - This gem extracts words from text with many customizations. - It can be used in many fields like crawling and Natural Language Processing." + s.description = "Textoken is a Ruby library for text tokenization. This gem extracts words from text with many customizations. It can be used in many fields like Web Crawling and Natural Language Processing." s.license = "MIT" s.files = Dir["{app,config,db,lib}/**/*", "MIT-LICENSE", "Rakefile", "README.rdoc"] s.add_development_dependency 'rspec', '~> 3.3.0', '>= 3.3.0' s.add_development_dependency 'rake', '~> 10.0' - s.add_development_dependency 'pry', '~> 0' end From cefe8e776aa5bc4011d6dd6726cfdaa7563512d3 Mon Sep 17 00:00:00 2001 From: manorie Date: Thu, 22 Oct 2015 15:34:25 +0300 Subject: [PATCH 4/4] version update, refactorings --- lib/textoken/version.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/textoken/version.rb b/lib/textoken/version.rb index d3cf46d..c006fee 100644 --- a/lib/textoken/version.rb +++ b/lib/textoken/version.rb @@ -1,3 +1,3 @@ module Textoken - VERSION = "1.1.0" + VERSION = "1.1.1" end