#!/bin/bash
#
# Text-normalization pipeline.
#
# Usage: <this script> INPUT_DIR OUTPUT_DIR
#
# Every "*.txt" file found under INPUT_DIR is pushed through the following
# chain; each stage writes into a sub-directory nested inside the previous
# stage's output directory:
#
#   tokenizer        -> OUTPUT_DIR/tok
#   speller          -> OUTPUT_DIR/tok/checked
#   acronyms         -> OUTPUT_DIR/tok/checked/siglas
#   internet slang   -> OUTPUT_DIR/tok/checked/siglas/internetes
#   proper names     -> OUTPUT_DIR/tok/checked/siglas/internetes/nomes
#   upper_periods.py is then invoked on every file of the last directory.
#
# WARNING: OUTPUT_DIR is deleted and recreated on every run.
#
# The helper programs are located through $PWD and "./", so the script has to
# be started from the directory that contains tokenizer/, speller/,
# resources/ and the *_map.pl / upper_periods.py scripts.
#
# Exit status: 255 on wrong argument count, 1 when the directories cannot be
# used safely. Failures of individual tools are reported on stderr but do not
# abort the pipeline.

if [ "$#" -ne 2 ]
then
    echo 'Passar o diretorio de entrada com txt para normalizar e o diretorio de saida'
    exit 255
fi

# Configuration
TOKENIZER=$PWD/tokenizer/webtok
SPELLER_DIR=$PWD/speller
SPELLER_ARGS=    # kept from the original configuration; not referenced below
INPUT_DIR=$1
OUTPUT_DIR=$2

export PERL5LIB=$SPELLER_DIR
export PYTHONPATH=$SPELLER_DIR

# Print a message on stderr.
warn() {
    printf '%s\n' "$*" >&2
}

# Print a message on stderr and abort with status 1.
die() {
    warn "$@"
    exit 1
}

# Print the separator that introduces a pipeline stage, followed by its title.
banner() {
    printf '\n###\n\n%s\n' "$1"
}

# Remove directory $1 (if present) and create it again, empty.
fresh_dir() {
    rm -rf -- "$1"
    mkdir -- "$1" || die "Erro: nao foi possivel criar o diretorio: $1"
}

# Store in the global array FILES every path printed by `find "$@"`.
# The list is captured completely before the caller starts producing output,
# so files written below the searched directory during a stage are never
# picked up by that same stage. NUL delimiters keep any file name intact.
collect_files() {
    local path
    FILES=()
    while IFS= read -r -d '' path
    do
        FILES+=("$path")
    done < <(find "$@" -print0)
}

# run_map_stage SRC_DIR DEST_DIR COMMAND [ARG...]
# Recreate DEST_DIR, then run "COMMAND ARG... FILE > DEST_DIR/<name of FILE>"
# for every regular file found under SRC_DIR.
run_map_stage() {
    local src=$1 dest=$2 f
    shift 2
    fresh_dir "$dest"
    collect_files "$src" -type f
    for f in "${FILES[@]}"
    do
        "$@" "$f" > "$dest/${f##*/}" || warn "Aviso: falha ao processar $f"
    done
}

# Get the absolute paths of the input and output directories. The arguments
# are quoted so that names containing blanks or glob characters survive.
INPUT_DIR=$(readlink -f -- "$INPUT_DIR")
OUTPUT_DIR=$(readlink -f -- "$OUTPUT_DIR")

# Safety checks, done before anything is deleted.
[ -n "$INPUT_DIR" ] && [ -d "$INPUT_DIR" ] \
    || die "Erro: diretorio de entrada invalido: $1"
# An empty OUTPUT_DIR (readlink failure) would turn "$OUTPUT_DIR/tok" into "/tok".
[ -n "$OUTPUT_DIR" ] && [ "$OUTPUT_DIR" != "/" ] \
    || die "Erro: diretorio de saida invalido: $2"
# Wiping OUTPUT_DIR must never wipe the input as well.
case "$INPUT_DIR/" in
    "$OUTPUT_DIR"/*)
        die "Erro: o diretorio de saida nao pode ser (nem conter) o diretorio de entrada"
        ;;
esac

TOK_DIR=$OUTPUT_DIR/tok
CHECKED_DIR=$TOK_DIR/checked
SIGLAS_DIR=$CHECKED_DIR/siglas
INTERNETES_DIR=$SIGLAS_DIR/internetes
NOMES_DIR=$INTERNETES_DIR/nomes

# Processing
fresh_dir "$OUTPUT_DIR"

# tokenizer
##################################################
banner "Aplicando tokenizador em $INPUT_DIR/"
fresh_dir "$TOK_DIR"
collect_files "$INPUT_DIR/" -name '*.txt'
for f in "${FILES[@]}"
do
    "$TOKENIZER" < "$f" > "$TOK_DIR/${f##*/}" || warn "Aviso: falha ao processar $f"
done

# speller
# (spell.pl is handed the tok directory; it presumably writes its results to
#  tok/checked, which is created here and read by the next stage.)
##################################################
banner "Aplicando speller em $TOK_DIR"
fresh_dir "$CHECKED_DIR"
perl "$SPELLER_DIR/spell.pl" -stat "$SPELLER_DIR/lexicos/regra+cb_freq.txt" -d "$TOK_DIR" \
    || warn "Aviso: falha ao executar o speller em $TOK_DIR"

# acronym normalizer
##################################################
banner "Normalizando siglas em $CHECKED_DIR/"
run_map_stage "$CHECKED_DIR" "$SIGLAS_DIR" \
    perl ./siglas_map.pl ./resources/lexico_siglas.txt

# internet-slang normalizer
##################################################
banner "Normalizando internetes em $SIGLAS_DIR"
run_map_stage "$SIGLAS_DIR" "$INTERNETES_DIR" \
    perl ./internetes_map.pl ./resources/lexico_internetes.txt ./resources/lexico_internetes_sigl_abrv.txt

# proper-name normalizer
##################################################
banner "Normalizando nomes proprios em $INTERNETES_DIR"
run_map_stage "$INTERNETES_DIR" "$NOMES_DIR" \
    perl ./np_map.pl ./resources/lexico_nome_proprio.txt

# upper case for words preceded by a period
# (upper_periods.py receives only the file path; no output is redirected.)
##################################################
collect_files "$NOMES_DIR" -type f
for f in "${FILES[@]}"
do
    python ./upper_periods.py "$f" || warn "Aviso: falha ao processar $f"
done