Skip to content

Commit

Permalink
Add maintenance scripts.
Browse files Browse the repository at this point in the history
literature_update.sh and reset_missing_documents.sh. Check the comments in the scripts for details.
  • Loading branch information
khituras committed Sep 6, 2024
1 parent a06beff commit 7af800a
Show file tree
Hide file tree
Showing 3 changed files with 149 additions and 1 deletion.
1 change: 0 additions & 1 deletion gepi/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,3 @@ gepi-indexing-pipeline/lib
gepi-indexing-pipeline/resources
gepi-indexing-pipeline/config/jcore-pipeline-config.jar
missing_configuration.txt
literature_update.sh
87 changes: 87 additions & 0 deletions gepi/literature_update.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
#!/bin/bash
# Script that runs in an infinite loop and updates the GePI interaction index.
# CAUTION: This script is written for the specific situation in the JulieLab. You should adapt it for your environment.
# Takes one parameter:
# 1: Path to a file that defines the following environment variables:
#    - GEPI_REPO_DIR: Directory to the GePI Maven project root (i.e. the gepi/ directory within the gepi repository)
#    - GEPI_PREPROCESSING_PM: Path to the JCoRe Pipeline performing the NLP preprocessing of PubMed for GePI (e.g. the one at gepi-preprocessing/pubmed/preprocessing of this repository)
#    - GEPI_PREPROCESSING_PMC: Path to the JCoRe Pipeline performing the NLP preprocessing of PMC for GePI (e.g. the one at gepi-preprocessing/pmc/preprocessing of this repository)
#    - GEPI_INDEXING_PM: Path to the JCoRe Pipeline performing the ElasticSearch indexing of PubMed for GePI (e.g. the one at gepi-indexing/gepi-indexing-pubmed)
#    - GEPI_INDEXING_PMC: Path to the JCoRe Pipeline performing the ElasticSearch indexing of PMC for GePI (e.g. the one at gepi-indexing/gepi-indexing-pmc)
set -e
# Stop via CTRL-Z followed by "kill %%"
# Fail early with a usage hint instead of sourcing a missing/empty argument.
if [ -z "${1:-}" ] || [ ! -f "$1" ]; then
    echo "Usage: $0 <environment-file>" >&2
    exit 1
fi
source "$1"
TIME_CMD="/usr/bin/time -v"

# Minimum number of seconds between the start of two update runs (one day).
SECONDS_BETWEEN_UPDATES=86400

# For security reasons, our GePI ElasticSearch does not accept remote
# connections. Thus, we need to tunnel to it for indexing.
# (Re-)creates two SSH tunnels to gepi-vm: local 9201 -> remote 9200 (HTTP)
# and local 9301 -> remote 9300 (transport). Any tunnel process that is
# already running for the same command line is killed first so that a
# collapsed tunnel is always replaced by a fresh one.
tunnel_to_es() {
    local tunnel pid
    # Both tunnels differ only in the forwarded port pair; iterate instead of
    # duplicating the kill-then-start logic.
    local tunnels=(
        'ssh -i ~/.ssh/id_rsa -4 -f -N -L 9201:localhost:9200 gepi-vm'
        'ssh -i ~/.ssh/id_rsa -4 -f -N -L 9301:localhost:9300 gepi-vm'
    )
    for tunnel in "${tunnels[@]}"; do
        pid=$(pgrep -f "${tunnel}" || true)
        if [ -n "$pid" ]; then
            # Intentionally unquoted: pgrep may return several PIDs, one per line.
            kill $pid
        fi
        # Intentionally unquoted: the command string must word-split into
        # the ssh executable and its arguments.
        ${tunnel}
    done
}

# Do the update again and again. There will be at least $SECONDS_BETWEEN_UPDATES between two runs.
while true
do
    START_TIME=$(date +%s)

    # Import newly released XML documents into the JeDIS Postgres database.
    # CoStoSys (costosys.jar) is expected in $HOME/bin.
    cd "$HOME/bin"
    echo "[LitUpdate] Importing new PubMed XML documents into the database $(date)"
    $TIME_CMD java -jar costosys.jar -dbc "$GEPI_PREPROCESSING_PM/config/costosys.xml" -im "$GEPI_PREPROCESSING_PM/../pubmedImport.xml"
    echo "[LitUpdate] Finished importing new PubMed XML documents into the database $(date)"
    echo "[LitUpdate] Importing new PMC XML documents into the database $(date)"
    # NOTE(review): PubMed uses -im while PMC uses -ip — confirm this asymmetry is intentional.
    $TIME_CMD java -jar costosys.jar -dbc "$GEPI_PREPROCESSING_PMC/config/costosys.xml" -ip "$GEPI_PREPROCESSING_PMC/../pmcImport.xml"
    echo "[LitUpdate] Finished importing new PMC XML documents into the database $(date)"

    # Run the NLP processing
    echo "[LitUpdate] Running PubMed preprocessing $(date)"
    cd "$GEPI_PREPROCESSING_PM"
    $TIME_CMD ./run.sh
    echo "[LitUpdate] Finished running PubMed preprocessing $(date)"
    echo "[LitUpdate] Running PMC preprocessing $(date)"
    cd "$GEPI_PREPROCESSING_PMC"
    $TIME_CMD ./run.sh
    echo "[LitUpdate] Finished running PMC preprocessing $(date)"

    # Run the indexing
    # Reset documents that have been stuck in "in_process" for some reason (e.g. broken ES tunnel in last processing)
    java -jar "$HOME/bin/costosys.jar" -dbc "$GEPI_PREPROCESSING_PM/config/costosys.xml" -re gepi._documents_mirror -np
    # Open tunnel to ES
    tunnel_to_es
    echo "[LitUpdate] Running PubMed indexing $(date)"
    cd "$GEPI_INDEXING_PM"
    $TIME_CMD ./run.sh
    echo "[LitUpdate] Finished running PubMed indexing $(date)"
    echo "[LitUpdate] Running PMC indexing $(date)"
    # Reset documents that have been stuck in "in_process" for some reason (e.g. broken ES tunnel in last processing)
    java -jar "$HOME/bin/costosys.jar" -dbc "$GEPI_PREPROCESSING_PMC/config/costosys.xml" -re gepi._documents_mirror -np
    # Reset the tunnel or re-create it if it collapsed before
    tunnel_to_es
    cd "$GEPI_INDEXING_PMC"
    $TIME_CMD ./run.sh
    echo "[LitUpdate] Finished running PMC indexing $(date)"

    # Sleep only for the remainder of the update interval; if the update
    # itself took longer than the interval, start the next run immediately.
    END_TIME=$(date +%s)
    ELAPSED_TIME=$((END_TIME - START_TIME))
    echo "[LitUpdate] Updated PubMed and PMC literature from XML to index in $ELAPSED_TIME seconds. $(date)"
    if [ "$ELAPSED_TIME" -lt "$SECONDS_BETWEEN_UPDATES" ]; then
        SLEEP_TIME=$((SECONDS_BETWEEN_UPDATES - ELAPSED_TIME))
        echo "[LitUpdate] Sleeping for $SLEEP_TIME seconds before starting next update. $(date)"
        sleep "$SLEEP_TIME"
    else
        echo "[LitUpdate] Update took longer than the time between update runs. Starting with a new update. $(date)"
    fi
done
62 changes: 62 additions & 0 deletions gepi/reset_missing_documents.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#!/bin/bash
# This script checks if there are documents in the JeDIS Postgres database which are missing from the
# GePI ElasticSearch interaction index. Such documents should be reset for new processing in the JeDIS database.
# Takes one parameter:
# 1. Path to a file that defines the following environment variables (defaults to ~/.gepi-validation):
#    - DBNAME_PUBMED: Name of the Postgres database where the PubMed JeDIS data for GePI is located.
#    - USER_PUBMED: Username for the PubMed JeDIS (Postgres) database.
#    - PASSWORD_PUBMED: Password for the PubMed JeDIS (Postgres) database.
#    - HOST_PUBMED: Host of the PubMed JeDIS (Postgres) database.
#    - PORT_PUBMED: Port of the PubMed JeDIS (Postgres) database.
#    - DBNAME_PMC: Name of the Postgres database where the PMC JeDIS data for GePI is located.
#    - USER_PMC: Username for the PMC JeDIS (Postgres) database.
#    - PASSWORD_PMC: Password for the PMC JeDIS (Postgres) database.
#    - HOST_PMC: Host of the PMC JeDIS (Postgres) database.
#    - PORT_PMC: Port of the PMC JeDIS (Postgres) database.
#    - ES_INDEX: Name of the GePI ElasticSearch interaction index
#    - (Optional) ES_URL: The URL to ElasticSearch. Defaults to http://localhost:9201
# The header above documents a parameter but the original hardcoded
# ~/.gepi-validation; honor the parameter and keep the old path as default.
source "${1:-$HOME/.gepi-validation}"
export PGPASSWORD=$PASSWORD_PUBMED
echo "Writing PubMed IDs with EventMentions in the JeDIS database to pmid_pg.txt"
psql -qtA -h "$HOST_PUBMED" -p "$PORT_PUBMED" -U "$USER_PUBMED" "$DBNAME_PUBMED" -c "SELECT pmid FROM _data_xmi.documents WHERE gnormplusbiosem\$de_julielab_jcore_types_eventmention IS NOT NULL" > pmid_pg.txt
export PGPASSWORD=$PASSWORD_PMC
echo "Writing PMC IDs with EventMentions in the JeDIS database to pmcid_pg.txt"
psql -qtA -h "$HOST_PMC" -p "$PORT_PMC" -U "$USER_PMC" "$DBNAME_PMC" -c "SELECT pmcid FROM _data_xmi.documents WHERE gnormplusbiosem\$de_julielab_jcore_types_eventmention IS NOT NULL" > pmcid_pg.txt

# This script pulls the document IDs from ElasticSearch and Postgres in an effort to make sure that every
# document in the JeDIS database (Postgres) arrived in ElasticSearch.
# Default the ES endpoint when the configuration file did not set one.
ES_URL="${ES_URL:-http://localhost:9201}"
# No spaces in the header value, so the deliberately unquoted use below
# word-splits into exactly the two intended curl arguments.
HEADER="-H Content-Type:application/json"
# Terms aggregations over pmid and pmcid; the huge bucket size is an upper
# bound intended to cover the whole collection in a single response.
curl -XPOST "$ES_URL/$ES_INDEX/_search" $HEADER -d '{
  "query": {
    "match_all": {}
  },
  "aggs": {
    "pmids": {
      "terms": {
        "field": "pmid",
        "size": 10000000
      }
    },
    "pmcids": {
      "terms": {
        "field": "pmcid",
        "size": 10000000
      }
    }
  }
}' > es_docid_aggregation.json
# NOTE(review): grep-scraping JSON is fragile; jq would be more robust if it is
# available on the host. PMC keys start with "PMC", so the first pattern (digit
# directly after the quote) cannot match them.
grep -oE 'key":"[0-9]+' es_docid_aggregation.json | grep -oE '[0-9]+' > pmid_es.txt
grep -oE 'key":"PMC[0-9]+' es_docid_aggregation.json | grep -oE 'PMC[0-9]+' > pmcid_es.txt

echo "PubMed: Got $(wc -l < pmid_pg.txt) IDs from Postgres and $(wc -l < pmid_es.txt) from ElasticSearch"
echo "PMC: Got $(wc -l < pmcid_pg.txt) IDs from Postgres and $(wc -l < pmcid_es.txt) from ElasticSearch"

# A document is "missing" when its ID is in Postgres but not in ElasticSearch.
# The original 'cat a b | sort | uniq' computed the UNION of both ID sets, which
# is not the set of missing documents. 'comm -13' on sorted inputs prints only
# the lines unique to the second (Postgres) file, i.e. IDs absent from ES.
comm -13 <(sort pmid_es.txt) <(sort pmid_pg.txt) > pmid_missing.txt
comm -13 <(sort pmcid_es.txt) <(sort pmcid_pg.txt) > pmcid_missing.txt

echo "Missing PubMed: $(wc -l < pmid_missing.txt) doc IDs are in Postgres but missing from ElasticSearch"
echo "Missing PMC: $(wc -l < pmcid_missing.txt) doc IDs are in Postgres but missing from ElasticSearch"

0 comments on commit 7af800a

Please sign in to comment.