Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 33 additions & 24 deletions lib/Text/NLP/Stanford/EntityExtract.pm
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ use Mouse;
use utf8;
use Text::Unidecode;
use IO::Socket;
use Data::Dumper qw(Dumper);

=head1 NAME

Expand All @@ -30,7 +31,9 @@ Grab the Stanford Named Entity recogniser from http://nlp.stanford.edu/ner/index

Run the server, something like as follows:

java -server -mx400m -cp stanford-ner.jar edu.stanford.nlp.ie.NERServer -loadClassifier classifiers/ner-eng-ie.crf-4-conll-distsim.ser.gz 1234
java -server -mx400m -cp stanford-ner.jar edu.stanford.nlp.ie.NERServer -loadClassifier classifiers/english.all.3class.distsim.crf.ser.gz -port 1234

By default, the server runs on port 4465.

=item *

Expand All @@ -43,7 +46,7 @@ Write a script to extract the named entities from the text, like the following:
my $server = $ner->server;
my @txt = ("Some text\n\n", "Treated as \\n\\n delimited paragraphs");
my @tagged_text = $ner->get_entities(@txt);
my $entities = $ner->entities_list($txt[0]); # rather complicated
my $entities = $ner->entities_list(@txt); # rather complicated
# @AOA based data
# structure for further
# processing
Expand All @@ -59,12 +62,13 @@ our $VERSION = '0.06';
=head2 new ( host => '127.0.0.1', port => '1234', debug => 0|1|2);

The debug flag warns the length of the text sent to the server if set
to 1 and shows the actual text as well as the length if set to > 1.
to 1 and shows the actual text as well as the length if set to > 1. The
port number defaults to C<4465>.

=cut

has 'host' => (is => 'ro', isa => 'Str', default => '127.0.0.1');
has 'port' => (is => 'ro', isa => 'Int', default => '1234');
has 'port' => (is => 'ro', isa => 'Int', default => '4465');
has 'debug' => (is => 'rw', isa => 'Int', default => 0);

=head2 server
Expand Down Expand Up @@ -98,9 +102,11 @@ sub get_entities {
foreach my $t (@txt) {
warn "LENGTH: " . length($t) . "\n" if $self->debug > 0;
warn "TEXT: " . $t . "\n" if $self->debug > 1;
$t = unidecode($t);

#$t = unidecode($t);
utf8::decode($t);
$t =~ s/\n/ /mg;
$t =~ s/[^[:ascii:]]//mg;
#$t =~ s/[^[:ascii:]]//mg;
push @result, $self->_process_line($t);
}
return @result;
Expand All @@ -116,9 +122,10 @@ processes a single line of text to tagged text
# Sends one line of text to the NER server and returns the server's reply.
# $line is the text to tag; the socket handle comes from $self->server.
sub _process_line {
my ($self, $line) = @_;
my $server = $self->server;
# Convert the string to UTF-8 octets in place before writing it to the socket.
utf8::encode($line);
print $server $line,"\n";
# NOTE(review): two read/return pairs follow back to back. This looks like the
# removed and added lines of a diff rendered without +/- markers; read
# literally, everything after the first return is unreachable. Confirm which
# pair is intended before relying on the return type.
# Scalar form: reads a single line of the reply and returns it as a string.
my $tagged_txt = <$server>;
return $tagged_txt;
# List form: reads all remaining lines of the reply and returns an array ref.
my @tagged_txt = <$server>;
return \@tagged_txt;
}

=head2 entities_list($tagged_line)
Expand All @@ -133,23 +140,25 @@ TODO: This needs some utility subs around it to make it more useful.
=cut

sub entities_list {
my ($self, $line) = @_;
my @tagged_words = split /\s+/, $line;
my $last_tag = '';
my ($self, $lines) = @_;
my $taglist = {};
my $pos = 1;
foreach my $w (@tagged_words) {
my ($word, $tag) = $w =~ m{(.*)/(.*)$};
if (! $taglist->{$tag}) {
$taglist->{$tag} = [ ];
}
if ($tag ne $last_tag) {
push @{$taglist->{$tag}}, [$word, $pos++];
}
else {
push @{ $taglist->{$tag}->[ $#{ $taglist->{$tag}} ] }, [$word, $pos++];
}
$last_tag = $tag;
foreach my $line (@$lines) {
my @tagged_words = split /\s+/, $line;
my $last_tag = '';
my $pos = 1;
foreach my $w (@tagged_words) {
my ($word, $tag) = $w =~ m{(.*)/(.*)$};
if (! $taglist->{$tag}) {
$taglist->{$tag} = [ ];
}
if ($tag ne $last_tag) {
push @{$taglist->{$tag}}, [$word, $pos++];
}
else {
push @{ $taglist->{$tag}->[ $#{ $taglist->{$tag}} ] }, [$word, $pos++];
}
$last_tag = $tag;
}
}
return $taglist;
}
Expand Down
10 changes: 6 additions & 4 deletions t/00-load.t
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@ BEGIN {
use_ok( 'Text::NLP::Stanford::EntityExtract' );
}


diag( "Testing Text::NLP::Stanford::EntityExtract $Text::NLP::Stanford::EntityExtract::VERSION, Perl $], $^X" );
diag "set env var NLR_SERVER to run live tests against the stanford.nlr server running on 127.0.0.1 port 1234" if ! $ENV{NLR_SERVER};
diag "set env var NLR_SERVER to run live tests against the stanford.nlr server running on 127.0.0.1 port 4465" if ! $ENV{NLR_SERVER};
my $ner = Text::NLP::Stanford::EntityExtract->new;

SKIP: {
Expand All @@ -24,7 +25,8 @@ my @txt;
@txt = <DATA>;
}

my $tagged_text = "blah/O blah/O Gwyneth/PERSON Paltrow/PERSON to/O the/O controversial/O Jewish-based/MISC faith/O that/O she/O follows/O ./O Now/O she/O is/O attempting/O ,/O for/O a/O second/O time/O ,/O to/O persuade/O Britney/LOCATION to/O follow/O suit/O ,/O reports/O said/O ./O Bruce/PERSON Lee/PERSON said/O from/O his/O home/O in/O Outer/LOCATION Mongolia/LOCATION ./O There/O is/O a/O question/O that/O Lord/PERSON Lucan/PERSON may/O have/O returned/O from/O the/O Chinese/LOCATION Mainland/LOCATION ./O Test/O a/O three/O word/O entity/O Location/LOCATION Location/LOCATION Location/LOCATION ./O";

my @tagged_text = [ "blah/O blah/O Gwyneth/PERSON Paltrow/PERSON to/O the/O controversial/O Jewish-based/MISC faith/O that/O she/O follows/O ./O Now/O she/O is/O attempting/O ,/O for/O a/O second/O time/O ,/O to/O persuade/O Britney/LOCATION to/O follow/O suit/O ,/O reports/O said/O ./O Bruce/PERSON Lee/PERSON said/O from/O his/O home/O in/O Outer/LOCATION Mongolia/LOCATION ./O There/O is/O a/O question/O that/O Lord/PERSON Lucan/PERSON may/O have/O returned/O from/O the/O Chinese/LOCATION Mainland/LOCATION ./O Test/O a/O three/O word/O entity/O Location/LOCATION Location/LOCATION Location/LOCATION ./O" ];

$data = {
'LOCATION' =>
Expand Down Expand Up @@ -113,12 +115,12 @@ my $doclist = {
};


my @res = $ner->get_entities(@txt);
my @res = $ner->get_entities(@tagged_text);
SKIP: {
skip "another test that requires a server", 1 unless $ENV{NLR_SERVER};
ok(scalar(@res), 'Defined result. A silly test, because the NLP ER recogniser is probably non-deterministic');
}
is_deeply($ner->entities_list($tagged_text), $data, "got expected taglist");
is_deeply($ner->entities_list(@tagged_text), $data, "got expected taglist");
is_deeply($ner->list_entities($data), $list_data, "got expected list of entities");

SKIP: {
Expand Down