Skip to content

Commit

Permalink
Merge pull request #6 from AndrewADev/feat/csv-output
Browse files Browse the repository at this point in the history
  • Loading branch information
AndrewADev authored Jun 4, 2023
2 parents 9f17b54 + 4320b28 commit bda4b39
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 12 deletions.
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,11 @@ If you would like to use a different input path for the data file, you can do so
python gen_list --input='/path/to/your/text/file'
```

If you would like to specify an alternate output type, pass the `--output-type` option (the default is `token-list`; currently, the only additional option is `csv`). You can do so by calling:
```shell
python gen_list --output-type=csv
```

Note that you can see all available options by calling:
```shell
python gen_list --help
Expand Down
4 changes: 2 additions & 2 deletions gen_list/__main__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from extract_list import output_list
from extract_list import generate_list


if __name__ == '__main__':
output_list()
generate_list()
33 changes: 25 additions & 8 deletions gen_list/extract_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,33 @@

file_path = './data/article.txt'

nlp = spacy.load("de_core_news_lg")

@click.command()
@click.option('--input', default=file_path, help='Path to (plaintext) file to read from')
def output_list(input):
nlp = spacy.load("de_core_news_lg")
@click.option('--output-type', type=click.Choice(['token-list', 'csv']), help='Type of output to be generated', default='token-list')
def generate_list(input, output_type):
with open(input, 'r', encoding='utf-8') as file:
doc = nlp(file.read())
for token in doc:
if (token.pos_ == "NOUN" or token.pos_ == "VERB"):
# TODO: Add option for verbose/debug
# print(token.text, token.pos_)
print(token.text, end=', ')
tokens = tokenize_file(file)
if output_type == 'token-list':
output_list(tokens)
elif output_type == 'csv':
output_csv(tokens)


def output_list(tokens):
    """Print the text of every token on one line, each followed by ', '.

    No trailing newline is written, and the separator also follows the
    last token.

    Args:
        tokens: Iterable of objects exposing a ``text`` attribute.
    """
    # TODO: Add option for verbose/debug (e.g. also show token.pos_)
    rendered = ''.join(f'{entry.text}, ' for entry in tokens)
    print(rendered, end='')

def output_csv(tokens):
    """Write the tokens to standard output as a single-column CSV.

    The first row is the header ``Token``; every following row holds the
    text of one token. Rows are terminated with CRLF.

    Args:
        tokens: Iterable of objects exposing a ``text`` attribute.
    """
    # Local imports keep this function self-contained.
    import csv
    import sys

    # csv.writer quotes values that contain the delimiter, a double quote or
    # a line break; plain print() calls would emit such values unescaped and
    # produce a malformed CSV row.
    writer = csv.writer(sys.stdout, lineterminator='\r\n')
    writer.writerow(['Token'])
    writer.writerows([token.text] for token in tokens)


def tokenize_file(file):
    """Run the module-level ``nlp`` pipeline over a file and keep nouns/verbs.

    Args:
        file: Open text file object; its entire content is read at once.

    Returns:
        A list of the tokens whose ``pos_`` is ``"NOUN"`` or ``"VERB"``,
        in document order.
    """
    wanted_pos = ("NOUN", "VERB")
    parsed = nlp(file.read())
    return [word for word in parsed if word.pos_ in wanted_pos]
11 changes: 9 additions & 2 deletions tests/test_extract_list.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,21 @@
from click.testing import CliRunner
from gen_list.extract_list import output_list
from gen_list.extract_list import generate_list

# Needs to be relative to project root!
test_data_path = './tests/data/karl-das-krokodil.txt'

def test_has_expected_tokens():
runner = CliRunner()
result = runner.invoke(output_list, ['--input', test_data_path])
result = runner.invoke(generate_list, ['--input', test_data_path])
assert result.exit_code == 0
assert "beschloss" in result.output
assert "Haus" in result.output
assert "Anpassungsfähigkeit" in result.output

def test_exports_to_csv():
    """Invoke the CLI with ``--output-type csv`` and check the output."""
    cli_args = ['--input', test_data_path, '--output-type', 'csv']
    outcome = CliRunner().invoke(generate_list, args=cli_args)
    assert outcome.exit_code == 0
    # "Token" is the CSV header; the other two are words expected from the
    # test data file.
    for expected in ("Token", "Haus", "Anpassungsfähigkeit"):
        assert expected in outcome.output

0 comments on commit bda4b39

Please sign in to comment.