From d9a572eeeba1bee5e344ed757573a6db53e17a09 Mon Sep 17 00:00:00 2001
From: Thomas Proisl
Date: Thu, 9 Nov 2023 13:09:43 +0100
Subject: [PATCH] Documentation

---
 README.md            | 34 +++++++++++++++++-----------------
 src/somajo/somajo.py |  6 +++---
 2 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/README.md b/README.md
index 8bbd797..8fbcf2d 100644
--- a/README.md
+++ b/README.md
@@ -196,7 +196,7 @@ Here are some common use cases:
 <summary>Show example</summary>
 
 ```
-echo "der beste Betreuer? - >ProfSmith! : )" | somajo-tokenizer -c -
+echo 'der beste Betreuer? - >ProfSmith! : )' | somajo-tokenizer -c -
 der
 beste
 Betreuer
@@ -218,7 +218,7 @@ Here are some common use cases:
 <summary>Show example</summary>
 
 ```
-echo "der beste Betreuer? - >ProfSmith! : )" | somajo-tokenizer -
+echo 'der beste Betreuer? - >ProfSmith! : )' | somajo-tokenizer -
 der
 beste
 Betreuer
@@ -246,7 +246,7 @@ Here are some common use cases:
 <summary>Show example</summary>
 
 ```
-echo "Palim, Palim! Ich hätte gerne eine Flasche Pommes Frites." | somajo-tokenizer --split-sentences -
+echo 'Palim, Palim! Ich hätte gerne eine Flasche Pommes Frites.' | somajo-tokenizer --split-sentences -
 Palim
 ,
 Palim
@@ -273,7 +273,7 @@ Here are some common use cases:
 <summary>Show example</summary>
 
 ```
-echo "Dont you wanna come?" | somajo-tokenizer -l en_PTB -
+echo 'Dont you wanna come?' | somajo-tokenizer -l en_PTB -
 Do
 nt
 you
@@ -329,7 +329,7 @@ Here are some common use cases:
 <summary>Show example</summary>
 
 ```
-echo "der beste Betreuer? - >ProfSmith! : )" | somajo-tokenizer -c -e -t -
+echo 'der beste Betreuer? - >ProfSmith! : )' | somajo-tokenizer -c -e -t -
 der	regular
 beste	regular
 Betreuer	regular	SpaceAfter=No
@@ -351,19 +351,18 @@ Here are some common use cases:
 
 ### Using the module
 
-You can easily incorporate SoMaJo into your own Python projects. All
-you need to do is importing `somajo.SoMaJo`, creating a `SoMaJo`
-object and calling one of its tokenizer functions: `tokenize_text`,
-`tokenize_text_file`, `tokenize_xml` or `tokenize_xml_file`. These
-functions return a generator that yields tokenized chunks of text. By
-default, these chunks of text are sentences. If you set
-`split_sentences=False`, then the chunks of text are either paragraphs
-or chunks of XML. Every tokenized chunk of text is a list of `Token`
-objects.
-
-For more details, take a look at the [API
+Take a look at the [API
 documentation](https://github.com/tsproisl/SoMaJo/blob/master/doc/build/markdown/somajo.md).
+You can incorporate SoMaJo into your own Python projects. All you need
+to do is import `somajo`, create a `SoMaJo` object and call one of its
+tokenizer functions: `tokenize_text`, `tokenize_text_file`,
+`tokenize_xml` or `tokenize_xml_file`. These functions return a
+generator that yields tokenized chunks of text. By default, these
+chunks of text are sentences. If you set `split_sentences=False`, then
+the chunks of text are either paragraphs or chunks of XML. Every
+tokenized chunk of text is a list of `Token` objects.
+
 Here is an example for tokenizing and sentence splitting two
 paragraphs:
 
@@ -379,7 +378,7 @@ paragraphs = ["der beste Betreuer?\n-- ProfSmith! : )",
 sentences = tokenizer.tokenize_text(paragraphs)
 for sentence in sentences:
     for token in sentence:
-        print("{}\t{}\t{}".format(token.text, token.token_class, token.extra_info))
+        print(f"{token.text}\t{token.token_class}\t{token.extra_info}")
     print()
 ```
 
@@ -414,6 +413,7 @@ for sentence in sentences:
     print()
 ```
 
+
 ## Evaluation
 
 SoMaJo was the system with the highest average F₁ score in the
diff --git a/src/somajo/somajo.py b/src/somajo/somajo.py
index 51c66b7..b8c0cb8 100644
--- a/src/somajo/somajo.py
+++ b/src/somajo/somajo.py
@@ -33,7 +33,7 @@ class SoMaJo:
         guarantee well-formed output (tags might need to be closed and
         re-opened at sentence boundaries).
     character_offsets : bool, (default=False)
-        Compute for each token the character offsets in the input.
+        Compute the character offsets in the input for each token.
         This allows for stand-off tokenization.
 
@@ -159,7 +159,7 @@ def tokenize_text_file(self, text_file, paragraph_separator, *, parallel=1):
         >>> sentences = tokenizer.tokenize_text_file("example_empty_lines.txt", paragraph_separator="single_newlines")
         >>> for sentence in sentences:
         ...     for token in sentence:
-        ...         print("{}\t{}\t{}".format(token.text, token.token_class, token.extra_info))
+        ...         print(f"{token.text}\t{token.token_class}\t{token.extra_info}")
         ...     print()
         ...
         Heyi	regular	SpaceAfter=No
@@ -383,7 +383,7 @@ def tokenize_text(self, paragraphs, *, parallel=1):
         >>> sentences = tokenizer.tokenize_text(paragraphs)
         >>> for sentence in sentences:
         ...     for token in sentence:
-        ...         print("{}\t{}\t{}".format(token.text, token.token_class, token.extra_info))
+        ...         print(f"{token.text}\t{token.token_class}\t{token.extra_info}")
         ...     print()
         ...
         Heyi	regular	SpaceAfter=No
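
The rewritten "Using the module" section above explains that the tokenizer functions return a generator of sentences by default and of whole paragraphs (or XML chunks) when `split_sentences=False` is passed. Below is a minimal sketch of that usage, not part of the patch itself; the constructor call and the `token.text` attribute follow the README and docstring examples above, and the printed output is illustrative only:

```python
from somajo import SoMaJo

# With split_sentences=False the generator yields one list of Token
# objects per input paragraph instead of one per sentence.
tokenizer = SoMaJo("de_CMC", split_sentences=False)

paragraphs = ["der beste Betreuer?\n-- ProfSmith! : )",
              "Palim, Palim! Ich hätte gerne eine Flasche Pommes Frites."]

for paragraph in tokenizer.tokenize_text(paragraphs):
    # Join the surface forms of all tokens in the chunk.
    print(" ".join(token.text for token in paragraph))
```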
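
Similarly, the `character_offsets` parameter documented in the `somajo.py` hunk enables stand-off tokenization, i.e. locating every token in the unmodified input string rather than copying its text around. A brief sketch of what that can look like; the `character_offset` attribute name is an assumption here and should be checked against the linked API documentation:

```python
from somajo import SoMaJo

# character_offsets=True records where each token starts and ends in
# the raw input, so the input text itself never has to be altered.
tokenizer = SoMaJo("de_CMC", character_offsets=True)

text = "der beste Betreuer? - >ProfSmith! : )"
for sentence in tokenizer.tokenize_text([text]):
    for token in sentence:
        start, end = token.character_offset  # assumed attribute name
        print(start, end, repr(text[start:end]))
```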