From d07825f3c0df291bb1207e98b680aaa21e7975ad Mon Sep 17 00:00:00 2001
From: sdondley <s@dondley.com>
Date: Thu, 2 Aug 2018 08:55:21 -0400
Subject: [PATCH 1/2] improve unicode handling; allow multiple lines to be
 processed

---
 lib/Text/NLP/Stanford/EntityExtract.pm | 46 +++++++++++++++-----------
 1 file changed, 26 insertions(+), 20 deletions(-)

diff --git a/lib/Text/NLP/Stanford/EntityExtract.pm b/lib/Text/NLP/Stanford/EntityExtract.pm
index b2708d5..c902e5b 100644
--- a/lib/Text/NLP/Stanford/EntityExtract.pm
+++ b/lib/Text/NLP/Stanford/EntityExtract.pm
@@ -7,6 +7,7 @@ use Mouse;
 use utf8;
 use Text::Unidecode;
 use IO::Socket;
+use Data::Dumper qw(Dumper);
 
 =head1 NAME
 
@@ -98,9 +99,11 @@ sub get_entities {
      foreach my $t (@txt) {
          warn "LENGTH: " . length($t) .  "\n" if $self->debug > 0;
          warn "TEXT: " .  $t . "\n" if $self->debug > 1;
-         $t = unidecode($t);
+
+         #$t = unidecode($t);
+         utf8::decode($t);
          $t =~ s/\n/ /mg;
-         $t =~ s/[^[:ascii:]]//mg;
+         #$t =~ s/[^[:ascii:]]//mg;
          push @result, $self->_process_line($t);
      }
     return @result;
@@ -116,9 +119,10 @@ processes a single line of text to tagged text
 sub _process_line {
     my ($self, $line) = @_;
     my $server = $self->server;
+    utf8::encode($line);
     print $server $line,"\n";
-    my $tagged_txt =  <$server>;
-    return $tagged_txt;
+    my @tagged_txt =  <$server>;
+    return \@tagged_txt;
 }
 
 =head2 entities_list($tagged_line)
@@ -133,23 +137,25 @@ TODO:  This needs some utility subs around it to make it more useful.
 =cut
 
 sub entities_list {
-    my ($self, $line) = @_;
-    my @tagged_words = split /\s+/, $line;
-    my $last_tag = '';
+    my ($self, $lines) = @_;
     my $taglist = {};
-    my $pos = 1;
-    foreach my $w (@tagged_words) {
-        my ($word, $tag) = $w =~ m{(.*)/(.*)$};
-        if (! $taglist->{$tag}) {
-            $taglist->{$tag} = [ ];
-        }
-        if ($tag ne $last_tag) {
-            push @{$taglist->{$tag}}, [$word, $pos++];
-        }
-        else {
-            push @{ $taglist->{$tag}->[ $#{ $taglist->{$tag}} ] }, [$word, $pos++];
-        }
-        $last_tag = $tag;
+    foreach my $line (@$lines) { 
+      my @tagged_words = split /\s+/, $line;
+      my $last_tag = '';
+      my $pos = 1;
+      foreach my $w (@tagged_words) {
+          my ($word, $tag) = $w =~ m{(.*)/(.*)$};
+          if (! $taglist->{$tag}) {
+              $taglist->{$tag} = [ ];
+          }
+          if ($tag ne $last_tag) {
+              push @{$taglist->{$tag}}, [$word, $pos++];
+          }
+          else {
+              push @{ $taglist->{$tag}->[ $#{ $taglist->{$tag}} ] }, [$word, $pos++];
+          }
+          $last_tag = $tag;
+      }
     }
     return $taglist;
 }

From 6065c330eaeb2431b39a104cfd92f4819631901c Mon Sep 17 00:00:00 2001
From: sdondley <s@dondley.com>
Date: Mon, 6 Aug 2018 22:18:27 -0400
Subject: [PATCH 2/2] fixed documentation and tests

---
 lib/Text/NLP/Stanford/EntityExtract.pm | 11 +++++++----
 t/00-load.t                            | 10 ++++++----
 2 files changed, 13 insertions(+), 8 deletions(-)
 mode change 100644 => 100755 t/00-load.t

diff --git a/lib/Text/NLP/Stanford/EntityExtract.pm b/lib/Text/NLP/Stanford/EntityExtract.pm
index c902e5b..f91e055 100644
--- a/lib/Text/NLP/Stanford/EntityExtract.pm
+++ b/lib/Text/NLP/Stanford/EntityExtract.pm
@@ -31,7 +31,9 @@ Grab the Stanford Named Entity recogniser from http://nlp.stanford.edu/ner/index
 
 Run the server, something like as follows:
 
- java -server -mx400m -cp stanford-ner.jar edu.stanford.nlp.ie.NERServer -loadClassifier classifiers/ner-eng-ie.crf-4-conll-distsim.ser.gz 1234
+ java -server -mx400m -cp stanford-ner.jar edu.stanford.nlp.ie.NERServer -loadClassifier classifiers/english.all.3class.distsim.crf.ser.gz -port 1234
+
+If no port is specified, the server runs on port 4465.
 
 =item *
 
@@ -44,7 +46,7 @@ Wrte a script to extract the named entities from the text, like the following:
  my $server = $ner->server;
  my @txt = ("Some text\n\n", "Treated as \\n\\n delimited paragraphs");
  my @tagged_text = $ner->get_entities(@txt);
- my $entities = $ner->entities_list($txt[0]); # rather complicated
+ my $entities = $ner->entities_list($tagged_text[0]); # rather complicated
                                               # @AOA based data
                                               # structure for further
                                               # processing
@@ -60,12 +62,13 @@ our $VERSION = '0.06';
 =head2 new ( host => '127.0.0.1', port => '1234' debug => 0|1|2);
 
 The debug flag warns the length of the text sent to the server if set
-to 1 and shows the actual text as well as the length if set to > 1.
+to 1 and shows the actual text as well as the length if set to > 1. The
+port number defaults to C<4465>.
 
 =cut
 
 has 'host'  => (is => 'ro', isa => 'Str', default => '127.0.0.1');
-has 'port'  => (is => 'ro', isa => 'Int', default => '1234');
+has 'port'  => (is => 'ro', isa => 'Int', default => '4465');
 has 'debug' => (is => 'rw', isa => 'Int', default => 0);
 
 =head2 server
diff --git a/t/00-load.t b/t/00-load.t
old mode 100644
new mode 100755
index bf2aff1..18494a5
--- a/t/00-load.t
+++ b/t/00-load.t
@@ -7,8 +7,9 @@ BEGIN {
 	use_ok( 'Text::NLP::Stanford::EntityExtract' );
 }
 
+
 diag( "Testing Text::NLP::Stanford::EntityExtract $Text::NLP::Stanford::EntityExtract::VERSION, Perl $], $^X" );
-diag "set env var NLR_SERVER to run live tests against the stanford.nlr server running on 127.0.0.1 port 1234" if ! $ENV{NLR_SERVER};
+diag "set env var NLR_SERVER to run live tests against the stanford.nlr server running on 127.0.0.1 port 4465" if ! $ENV{NLR_SERVER};
 my $ner = Text::NLP::Stanford::EntityExtract->new;
 
 SKIP: {
@@ -24,7 +25,8 @@ my @txt;
  @txt = <DATA>;
 }
 
-my $tagged_text = "blah/O blah/O Gwyneth/PERSON Paltrow/PERSON to/O the/O controversial/O Jewish-based/MISC faith/O that/O she/O follows/O ./O Now/O she/O is/O attempting/O ,/O for/O a/O second/O time/O ,/O to/O persuade/O Britney/LOCATION to/O follow/O suit/O ,/O reports/O said/O ./O Bruce/PERSON Lee/PERSON said/O from/O his/O home/O in/O Outer/LOCATION Mongolia/LOCATION ./O There/O is/O a/O question/O that/O Lord/PERSON Lucan/PERSON may/O have/O returned/O from/O the/O Chinese/LOCATION Mainland/LOCATION ./O Test/O a/O three/O word/O entity/O Location/LOCATION Location/LOCATION Location/LOCATION ./O";
+
+my @tagged_text = [ "blah/O blah/O Gwyneth/PERSON Paltrow/PERSON to/O the/O controversial/O Jewish-based/MISC faith/O that/O she/O follows/O ./O Now/O she/O is/O attempting/O ,/O for/O a/O second/O time/O ,/O to/O persuade/O Britney/LOCATION to/O follow/O suit/O ,/O reports/O said/O ./O Bruce/PERSON Lee/PERSON said/O from/O his/O home/O in/O Outer/LOCATION Mongolia/LOCATION ./O There/O is/O a/O question/O that/O Lord/PERSON Lucan/PERSON may/O have/O returned/O from/O the/O Chinese/LOCATION Mainland/LOCATION ./O Test/O a/O three/O word/O entity/O Location/LOCATION Location/LOCATION Location/LOCATION ./O" ];
 
 $data = {
           'LOCATION' =>
@@ -113,12 +115,12 @@ my $doclist = {
         };
 
 
-my @res = $ner->get_entities(@txt);
+my @res = $ner->get_entities(@tagged_text);
 SKIP: {
     skip "another test that requires a server", 1 unless   $ENV{NLR_SERVER};
     ok(scalar(@res), 'Defined result.  A silly test, because the NLP ER recogniser is probably non-deterministic');
 }
-is_deeply($ner->entities_list($tagged_text), $data, "got expected taglist");
+is_deeply($ner->entities_list(@tagged_text), $data, "got expected taglist");
 is_deeply($ner->list_entities($data), $list_data, "got expected list of entities");
 
 SKIP: {