A simple, cross-platform, NLTK-inspired averaged perceptron tagger written in Rust
Contents
use postagger::PerceptronTagger;
/// Builds a tagger from the three model files, tags one sample sentence and
/// prints the `word`, `tag` and `conf` fields of every result on its own line.
fn main() {
    // Model files, passed in the order: weights, classes, tags.
    let weights_path = "tagger/weights.json";
    let classes_path = "tagger/classes.txt";
    let tags_path = "tagger/tags.json";
    let tagger = PerceptronTagger::new(weights_path, classes_path, tags_path);

    let sentence = "the quick brown fox jumps over the lazy dog";
    let tagged_words = tagger.tag(sentence);
    for entry in &tagged_words {
        println!("{} {} {}", entry.word, entry.tag, entry.conf);
    }
}
$> cbindgen --lang C --output examples/c/postagger.h
$> cargo build --target=x86_64-unknown-linux-gnu --release
#include "postagger.h"
#include <stdio.h>
#include <stdlib.h>
/*
 * Creates a tagger from the three model files, annotates one sample
 * sentence and prints the word, tag and conf value of every result.
 *
 * NOTE(review): `results` is not released anywhere in this example;
 * confirm whether the API expects the caller to free it separately
 * or whether tagger_release() covers it.
 */
int main( int argc , char** argv ) {
    PerceptronTagger* tagger = tagger_create(
        "tagger/weights.json" ,
        "tagger/classes.txt" ,
        "tagger/tags.json"
    ) ;
    const TagResults* results = tagger_annotate(
        tagger ,
        "the quick brown fox jumps over the lazy dog"
    ) ;

    /* Walk the result array, one line of output per tagged word. */
    int i = 0 ;
    while( i < results -> num_tags ) {
        printf(
            "word=%s , tag=%s , conf=%f \n" ,
            results -> tags[i].word ,
            results -> tags[i].tag ,
            results -> tags[i].conf
        ) ;
        i++ ;
    }

    tagger_release( tagger ) ;
    return 0 ;
}
import java.util.List;
import pos.tagger.POSTagger.POSTag;
/**
 * Minimal usage example: builds a {@code POSTagger} from the three model
 * files, tags one sample sentence and prints each word followed by its tag.
 *
 * NOTE(review): {@code POSTagger} is referenced here, but this snippet only
 * imports {@code pos.tagger.POSTagger.POSTag}; confirm whether
 * {@code import pos.tagger.POSTagger;} is also required.
 */
public class Main {

    public static void main( String[] args ) {
        /*
         * Replace with the absolute paths of
         * weights, tags and classes
         * See `tagger` directory at the root of the repository for these files
         */
        final String weightsPath = "weights.json" ;
        final String tagsPath = "tags.json" ;
        final String classesPath = "classes.txt" ;
        POSTagger tagger = new POSTagger( weightsPath , tagsPath , classesPath ) ;

        List<POSTag> tags = tagger.tag( "the quick brown fox jumps over the lazy dog" ) ;
        // One output line per tagged word: "<word> <tag>".
        tags.forEach( tag -> System.out.println( tag.getWord() + " " + tag.getTag() ) ) ;
    }
}
The corpus stored in the `tagger` directory at the root of the repository is extracted from `averaged_perceptron_tagger.zip` in the `nltk_data` repository. The model consists of:
- `weights.json`: contains the weights of each feature, as generated by a function similar to `PerceptronTagger::get_features` in `src/perceptron_tagger.rs`
- `classes.txt`: contains the list of all POS tags that the model predicts
- `tags.json`: maps words to their pre-determined POS tags (no inference will be made if the incoming word is present in this mapping)
- `perceptron.py` on NLTK
- A Good Part-of-Speech Tagger in about 200 Lines of Python
- Reddit Discussion on `postagger.rs`: r/rust and r/LanguageTechnology