From d07825f3c0df291bb1207e98b680aaa21e7975ad Mon Sep 17 00:00:00 2001 From: sdondley Date: Thu, 2 Aug 2018 08:55:21 -0400 Subject: [PATCH 1/2] improve unicode handling; allow multiple lines to be processed --- lib/Text/NLP/Stanford/EntityExtract.pm | 46 +++++++++++++++----------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/lib/Text/NLP/Stanford/EntityExtract.pm b/lib/Text/NLP/Stanford/EntityExtract.pm index b2708d5..c902e5b 100644 --- a/lib/Text/NLP/Stanford/EntityExtract.pm +++ b/lib/Text/NLP/Stanford/EntityExtract.pm @@ -7,6 +7,7 @@ use Mouse; use utf8; use Text::Unidecode; use IO::Socket; +use Data::Dumper qw(Dumper); =head1 NAME @@ -98,9 +99,11 @@ sub get_entities { foreach my $t (@txt) { warn "LENGTH: " . length($t) . "\n" if $self->debug > 0; warn "TEXT: " . $t . "\n" if $self->debug > 1; - $t = unidecode($t); + + #$t = unidecode($t); + utf8::decode($t); $t =~ s/\n/ /mg; - $t =~ s/[^[:ascii:]]//mg; + #$t =~ s/[^[:ascii:]]//mg; push @result, $self->_process_line($t); } return @result; @@ -116,9 +119,10 @@ processes a single line of text to tagged text sub _process_line { my ($self, $line) = @_; my $server = $self->server; + utf8::encode($line); print $server $line,"\n"; - my $tagged_txt = <$server>; - return $tagged_txt; + my @tagged_txt = <$server>; + return \@tagged_txt; } =head2 entities_list($tagged_line) @@ -133,23 +137,25 @@ TODO: This needs some utility subs around it to make it more useful. =cut sub entities_list { - my ($self, $line) = @_; - my @tagged_words = split /\s+/, $line; - my $last_tag = ''; + my ($self, $lines) = @_; my $taglist = {}; - my $pos = 1; - foreach my $w (@tagged_words) { - my ($word, $tag) = $w =~ m{(.*)/(.*)$}; - if (! 
$taglist->{$tag}) { - $taglist->{$tag} = [ ]; - } - if ($tag ne $last_tag) { - push @{$taglist->{$tag}}, [$word, $pos++]; - } - else { - push @{ $taglist->{$tag}->[ $#{ $taglist->{$tag}} ] }, [$word, $pos++]; - } - $last_tag = $tag; + foreach my $line (@$lines) { + my @tagged_words = split /\s+/, $line; + my $last_tag = ''; + my $pos = 1; + foreach my $w (@tagged_words) { + my ($word, $tag) = $w =~ m{(.*)/(.*)$}; + if (! $taglist->{$tag}) { + $taglist->{$tag} = [ ]; + } + if ($tag ne $last_tag) { + push @{$taglist->{$tag}}, [$word, $pos++]; + } + else { + push @{ $taglist->{$tag}->[ $#{ $taglist->{$tag}} ] }, [$word, $pos++]; + } + $last_tag = $tag; + } } return $taglist; } From 6065c330eaeb2431b39a104cfd92f4819631901c Mon Sep 17 00:00:00 2001 From: sdondley Date: Mon, 6 Aug 2018 22:18:27 -0400 Subject: [PATCH 2/2] fixed documentation and tests --- lib/Text/NLP/Stanford/EntityExtract.pm | 11 +++++++---- t/00-load.t | 10 ++++++---- 2 files changed, 13 insertions(+), 8 deletions(-) mode change 100644 => 100755 t/00-load.t diff --git a/lib/Text/NLP/Stanford/EntityExtract.pm b/lib/Text/NLP/Stanford/EntityExtract.pm index c902e5b..f91e055 100644 --- a/lib/Text/NLP/Stanford/EntityExtract.pm +++ b/lib/Text/NLP/Stanford/EntityExtract.pm @@ -31,7 +31,9 @@ Grab the Stanford Named Entity recogniser from http://nlp.stanford.edu/ner/index Run the server, something like as follows: - java -server -mx400m -cp stanford-ner.jar edu.stanford.nlp.ie.NERServer -loadClassifier classifiers/ner-eng-ie.crf-4-conll-distsim.ser.gz 1234 + java -server -mx400m -cp stanford-ner.jar edu.stanford.nlp.ie.NERServer -loadClassifier classifiers/english.all.3class.distsim.crf.ser.gz -port 1234 + +By default, the server runs on port 4465.
=item * @@ -44,7 +46,7 @@ Wrte a script to extract the named entities from the text, like the following: my $server = $ner->server; my @txt = ("Some text\n\n", "Treated as \\n\\n delimited paragraphs"); my @tagged_text = $ner->get_entities(@txt); - my $entities = $ner->entities_list($txt[0]); # rather complicated + my $entities = $ner->entities_list($tagged_text[0]); # rather complicated # @AOA based data # structure for further # processing @@ -60,12 +62,13 @@ our $VERSION = '0.06'; =head2 new ( host => '127.0.0.1', port => '1234' debug => 0|1|2); The debug flag warns the length of the text sent to the server if set -to 1 and shows the actual text as well as the length if set to > 1. +to 1 and shows the actual text as well as the length if set to > 1. The +port number defaults to C<4465>. =cut has 'host' => (is => 'ro', isa => 'Str', default => '127.0.0.1'); -has 'port' => (is => 'ro', isa => 'Int', default => '1234'); +has 'port' => (is => 'ro', isa => 'Int', default => '4465'); has 'debug' => (is => 'rw', isa => 'Int', default => 0); =head2 server diff --git a/t/00-load.t b/t/00-load.t old mode 100644 new mode 100755 index bf2aff1..18494a5 --- a/t/00-load.t +++ b/t/00-load.t @@ -7,8 +7,9 @@ BEGIN { use_ok( 'Text::NLP::Stanford::EntityExtract' ); } + diag( "Testing Text::NLP::Stanford::EntityExtract $Text::NLP::Stanford::EntityExtract::VERSION, Perl $], $^X" ); -diag "set env var NLR_SERVER to run live tests against the stanford.nlr server running on 127.0.0.1 port 1234" if ! 
$ENV{NLR_SERVER}; my $ner = Text::NLP::Stanford::EntityExtract->new; SKIP: { @@ -24,7 +25,8 @@ my @txt; @txt = ; } -my $tagged_text = "blah/O blah/O Gwyneth/PERSON Paltrow/PERSON to/O the/O controversial/O Jewish-based/MISC faith/O that/O she/O follows/O ./O Now/O she/O is/O attempting/O ,/O for/O a/O second/O time/O ,/O to/O persuade/O Britney/LOCATION to/O follow/O suit/O ,/O reports/O said/O ./O Bruce/PERSON Lee/PERSON said/O from/O his/O home/O in/O Outer/LOCATION Mongolia/LOCATION ./O There/O is/O a/O question/O that/O Lord/PERSON Lucan/PERSON may/O have/O returned/O from/O the/O Chinese/LOCATION Mainland/LOCATION ./O Test/O a/O three/O word/O entity/O Location/LOCATION Location/LOCATION Location/LOCATION ./O"; + +my @tagged_text = [ "blah/O blah/O Gwyneth/PERSON Paltrow/PERSON to/O the/O controversial/O Jewish-based/MISC faith/O that/O she/O follows/O ./O Now/O she/O is/O attempting/O ,/O for/O a/O second/O time/O ,/O to/O persuade/O Britney/LOCATION to/O follow/O suit/O ,/O reports/O said/O ./O Bruce/PERSON Lee/PERSON said/O from/O his/O home/O in/O Outer/LOCATION Mongolia/LOCATION ./O There/O is/O a/O question/O that/O Lord/PERSON Lucan/PERSON may/O have/O returned/O from/O the/O Chinese/LOCATION Mainland/LOCATION ./O Test/O a/O three/O word/O entity/O Location/LOCATION Location/LOCATION Location/LOCATION ./O" ]; $data = { 'LOCATION' => @@ -113,12 +115,12 @@ my $doclist = { }; -my @res = $ner->get_entities(@txt); +my @res = $ner->get_entities(@tagged_text); SKIP: { skip "another test that requires a server", 1 unless $ENV{NLR_SERVER}; ok(scalar(@res), 'Defined result. A silly test, because the NLP ER recogniser is probably non-deterministic'); } -is_deeply($ner->entities_list($tagged_text), $data, "got expected taglist"); +is_deeply($ner->entities_list(@tagged_text), $data, "got expected taglist"); is_deeply($ner->list_entities($data), $list_data, "got expected list of entities"); SKIP: {