Merge pull request #6 from AndrewADev/feat/csv-output

AndrewADev · Jun 4, 2023 · bda4b39
2 parents 9f17b54 + 4320b28
commit bda4b39
Show file tree

Hide file tree

Showing 4 changed files with 41 additions and 12 deletions.
diff --git a/README.md b/README.md
@@ -51,6 +51,11 @@ If you would like to use a different input path for the data file, you can do so
 python gen_list --input='/path/to/your/text/file'
 ```
 
+If you would like to specify an alternate output type (currently, the only additional option is CSV), you can do so by calling:
+```shell
+python gen_list --output-type=csv
+```
+
 Note that you can see all available options by calling:
 ```shell
 python gen_list --help

diff --git a/gen_list/__main__.py b/gen_list/__main__.py
@@ -1,5 +1,5 @@
-from extract_list import output_list
+from extract_list import generate_list
 
 
 if __name__ == '__main__':
- output_list()
+ generate_list()
diff --git a/gen_list/extract_list.py b/gen_list/extract_list.py
@@ -3,16 +3,33 @@
 
 file_path = './data/article.txt'
 
+nlp = spacy.load("de_core_news_lg")
 
 @click.command()
 @click.option('--input', default=file_path, help='Path to (plaintext) file to read from')
-def output_list(input):
- nlp = spacy.load("de_core_news_lg")
+@click.option('--output-type', type=click.Choice(['token-list', 'csv']), help='Type of output to be generated', default='token-list')
+def generate_list(input, output_type):
  with open(input, 'r', encoding='utf-8') as file:
- doc = nlp(file.read())
- for token in doc:
- if (token.pos_ == "NOUN" or token.pos_ == "VERB"):
- # TODO: Add option for verbose/debug
- # print(token.text, token.pos_)
- print(token.text, end=', ')
+ tokens = tokenize_file(file)
+ if output_type == 'token-list':
+ output_list(tokens)
+ elif output_type == 'csv':
+ output_csv(tokens)
 
+
+def output_list(tokens):
+ for token in tokens:
+ # TODO: Add option for verbose/debug
+ # print(token.text, token.pos_)
+ print(token.text, end=', ')
+
+def output_csv(tokens):
+ print('Token', end='\r\n')
+ for token in tokens:
+ print(token.text, end='\r\n')
+
+
+def tokenize_file(file):
+ doc = nlp(file.read())
+ tokens = [ token for token in doc if token.pos_ == "NOUN" or token.pos_ == "VERB" ]
+ return tokens
diff --git a/tests/test_extract_list.py b/tests/test_extract_list.py
@@ -1,14 +1,21 @@
 from click.testing import CliRunner
-from gen_list.extract_list import output_list
+from gen_list.extract_list import generate_list
 
 # Needs to be relative to project root!
 test_data_path = './tests/data/karl-das-krokodil.txt'
 
 def test_has_expected_tokens():
  runner = CliRunner()
- result = runner.invoke(output_list, ['--input', test_data_path])
+ result = runner.invoke(generate_list, ['--input', test_data_path])
  assert result.exit_code == 0
  assert "beschloss" in result.output
  assert "Haus" in result.output
  assert "Anpassungsfähigkeit" in result.output
 
+def test_exports_to_csv():
+ runner = CliRunner()
+ result = runner.invoke(generate_list, args=['--input', test_data_path, '--output-type', 'csv'])
+ assert result.exit_code == 0
+ assert "Token" in result.output # header
+ assert "Haus" in result.output
+ assert "Anpassungsfähigkeit" in result.output