Skip to content

Commit

Permalink
Merge pull request #364 from stweil/build_rules
Browse files Browse the repository at this point in the history
Remove build rules for Tesseract and Leptonica
  • Loading branch information
zdenop authored Dec 13, 2023
2 parents 637fb6c + 61b9bba commit 681a2cb
Show file tree
Hide file tree
Showing 3 changed files with 7 additions and 78 deletions.
2 changes: 0 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@ data/unicharset
dta19-reduced
dta19-reduced.tar.gz
*.built
tesseract-*
leptonica-*
*.BAK
/usr
data/checkpoints
Expand Down
53 changes: 1 addition & 52 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -47,15 +47,6 @@ LAST_CHECKPOINT = $(OUTPUT_DIR)/checkpoints/$(MODEL_NAME)_checkpoint
# Name of the proto model. Default: '$(PROTO_MODEL)'
PROTO_MODEL = $(OUTPUT_DIR)/$(MODEL_NAME).traineddata

# No of cores to use for compiling leptonica/tesseract. Default: $(CORES)
CORES = 4

# Leptonica version. Default: $(LEPTONICA_VERSION)
LEPTONICA_VERSION := 1.83.0

# Tesseract commit. Default: $(TESSERACT_VERSION)
TESSERACT_VERSION := 5.3.0

# Tesseract model repo to use. Default: $(TESSDATA_REPO)
TESSDATA_REPO = _best

Expand Down Expand Up @@ -131,9 +122,6 @@ help: default
@echo " training Start training"
@echo " traineddata Create best and fast .traineddata files from each .checkpoint file"
@echo " proto-model Build the proto model"
@echo " leptonica Build leptonica"
@echo " tesseract Build tesseract"
@echo " tesseract-langs Download minimal stock models"
@echo " tesseract-langdata Download stock unicharsets"
@echo " clean-box Clean generated .box files"
@echo " clean-lstmf Clean generated .lstmf files"
Expand All @@ -154,9 +142,6 @@ help: default
@echo " PUNC_FILE Optional Punc file for Punctuation dawg. Default: $(PUNC_FILE)"
@echo " START_MODEL Name of the model to continue from. Default: '$(START_MODEL)'"
@echo " PROTO_MODEL Name of the proto model. Default: '$(PROTO_MODEL)'"
@echo " CORES No of cores to use for compiling leptonica/tesseract. Default: $(CORES)"
@echo " LEPTONICA_VERSION Leptonica version. Default: $(LEPTONICA_VERSION)"
@echo " TESSERACT_VERSION Tesseract commit. Default: $(TESSERACT_VERSION)"
@echo " TESSDATA_REPO Tesseract model repo to use (_fast or _best). Default: $(TESSDATA_REPO)"
@echo " MAX_ITERATIONS Max iterations. Default: $(MAX_ITERATIONS)"
@echo " EPOCHS Set max iterations based on the number of lines for the training. Default: none"
Expand All @@ -181,7 +166,7 @@ endif

.PRECIOUS: $(LAST_CHECKPOINT)

.PHONY: default clean help leptonica lists proto-model tesseract tesseract-langs tesseract-langdata training unicharset charfreq
.PHONY: default clean help lists proto-model tesseract-langdata training unicharset charfreq

ALL_FILES = $(and $(wildcard $(GROUND_TRUTH_DIR)),$(shell find -L $(GROUND_TRUTH_DIR) -name '*.gt.txt'))
unexport ALL_FILES # prevent adding this to envp in recipes (which can cause E2BIG if too long; cf. make #44853)
Expand Down Expand Up @@ -375,42 +360,6 @@ $(TESSERACT_LANGDATA):
@mkdir -p $(@D)
wget -O $@ 'https://github.com/tesseract-ocr/langdata_lstm/raw/main/$(@F)'

# Build leptonica
leptonica: leptonica.built

leptonica.built: leptonica-$(LEPTONICA_VERSION)
cd $< ; \
./configure --prefix=$(LOCAL) && \
make -j$(CORES) install SUBDIRS=src && \
date > "$@"

leptonica-$(LEPTONICA_VERSION): leptonica-$(LEPTONICA_VERSION).tar.gz
tar xf "$<"

leptonica-$(LEPTONICA_VERSION).tar.gz:
wget 'http://www.leptonica.org/source/$@'

# Build tesseract
tesseract: tesseract.built tesseract-langs

tesseract.built: tesseract-$(TESSERACT_VERSION)
cd $< && \
sh autogen.sh && \
PKG_CONFIG_PATH="$(LOCAL)/lib/pkgconfig" \
./configure --prefix=$(LOCAL) && \
LDFLAGS="-L$(LOCAL)/lib"\
make -j$(CORES) install && \
LDFLAGS="-L$(LOCAL)/lib"\
make -j$(CORES) training-install && \
date > "$@"

tesseract-$(TESSERACT_VERSION):
wget https://github.com/tesseract-ocr/tesseract/archive/$(TESSERACT_VERSION).zip
unzip $(TESSERACT_VERSION).zip

# Download tesseract-langs
tesseract-langs: $(TESSDATA)/eng.traineddata

$(TESSDATA)/%.traineddata:
wget -O $@ 'https://github.com/tesseract-ocr/tessdata$(TESSDATA_REPO)/raw/main/$(@F)'

Expand Down
30 changes: 6 additions & 24 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# tesstrain

> Training workflow for Tesseract 4 as a Makefile for dependency tracking and building the required software from source.
> Training workflow for Tesseract 5 as a Makefile for dependency tracking.
## Install

Expand All @@ -10,27 +10,15 @@ You will need at least GNU `make` (minimal version 4.2), `wget`, `find`, `bash`,

### Leptonica, Tesseract

You will need a recent version (>= 4.0.0beta1) of tesseract built with the
You will need a recent version (>= 5.3) of tesseract built with the
training tools and matching leptonica bindings.
[Build](https://github.com/tesseract-ocr/tesseract/wiki/Compiling)
[instructions](https://github.com/tesseract-ocr/tesseract/wiki/Compiling-%E2%80%93-GitInstallation)
and more can be found in the [Tesseract project
wiki](https://github.com/tesseract-ocr/tesseract/wiki/).

Alternatively, you can build leptonica and tesseract within this project and install it to a subdirectory `./usr` in the repo:

```sh
make leptonica tesseract
```

Tesseract will be built from the git repository, which requires CMake,
autotools (including autotools-archive) and some additional libraries for the
training tools. See the [installation notes in the tesseract
repository](https://github.com/tesseract-ocr/tesseract/blob/main/INSTALL.GIT.md).
[Build](https://tesseract-ocr.github.io/tessdoc/Compiling)
[instructions](https://tesseract-ocr.github.io/tessdoc/Compiling-%E2%80%93-GitInstallation)
and more can be found in the [Tesseract User Manual](https://tesseract-ocr.github.io/tessdoc/).

#### Windows

1. Install the latest tesseract (e.g. from https://digi.bib.uni-mannheim.de/tesseract/) make sure that tesseract is add to your PATH.
1. Install the latest tesseract (e.g. from https://digi.bib.uni-mannheim.de/tesseract/), make sure that tesseract is added to your PATH.
2. Install [Python 3](https://www.python.org/downloads/)
3. Install [Git SCM to Windows](https://gitforwindows.org/) - it provides a lot of Linux utilities on Windows (e.g. `find`, `unzip`, `rm`) - and put `C:\Program Files\Git\usr\bin` at the beginning of your PATH variable (temporarily you can do it in `cmd` with `set PATH=C:\Program Files\Git\usr\bin;%PATH%`). Unfortunately, there are several Windows tools with the same name as on Linux (`find`, `sort`) but with different behaviour/functionality, and they need to be avoided during training.
4. Install winget/[Windows Package Manager](https://github.com/microsoft/winget-cli/releases/) and then run `winget install GnuWin32.Make` and `winget install wget` to install missing tools.
Expand Down Expand Up @@ -110,9 +98,6 @@ Run `make help` to see all the possible targets and variables:
training Start training
traineddata Create best and fast .traineddata files from each .checkpoint file
proto-model Build the proto model
leptonica Build leptonica
tesseract Build tesseract
tesseract-langs Download minimal stock models
tesseract-langdata Download stock unicharsets
clean Clean all generated files
Expand All @@ -127,9 +112,6 @@ Run `make help` to see all the possible targets and variables:
DATA_DIR Data directory for output files, proto model, start model, etc. Default: data
OUTPUT_DIR Output directory for generated files. Default: DATA_DIR/MODEL_NAME
GROUND_TRUTH_DIR Ground truth directory. Default: OUTPUT_DIR-ground-truth
CORES No of cores to use for compiling leptonica/tesseract. Default: 4
LEPTONICA_VERSION Leptonica version. Default: 1.78.0
TESSERACT_VERSION Tesseract commit. Default: 4.1.1
TESSDATA_REPO Tesseract model repo to use (_fast or _best). Default: _best
TESSDATA Path to the .traineddata directory to start finetuning from. Default: ./usr/share/tessdata
MAX_ITERATIONS Max iterations. Default: 10000
Expand Down

0 comments on commit 681a2cb

Please sign in to comment.