-
Notifications
You must be signed in to change notification settings - Fork 1
two inputs possible #59
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Conversation
Please be a little more elaborate with your pull request description and commit messages. |
collect_stata/__main__.py
Outdated
if input_path is None: | ||
for file_de in input_german_path.glob("*.dta"): | ||
file = None | ||
process = Process(target=_run, args=(file, file_de, output_path, study_name)) | ||
processes.append(process) | ||
process.start() | ||
if input_german_path is None: | ||
for file in input_path.glob("*.dta"): | ||
file_de = None | ||
process = Process(target=_run, args=(file, file_de, output_path, study_name)) | ||
processes.append(process) | ||
process.start() | ||
if input_path is not None and input_german_path is not None: | ||
for file in input_path.glob("*.dta"): | ||
file_de = pathlib.Path(str(input_german_path) + "/" + os.path.basename(str(file))) | ||
process = Process(target=_run, args=(file, file_de, output_path, study_name)) | ||
processes.append(process) | ||
process.start() | ||
|
||
# complete the processes | ||
for process in processes: | ||
process.join() | ||
else: | ||
for file in input_path.glob("*.dta"): | ||
_run(file=file, output_path=output_path, study_name=study_name) | ||
if input_path is None: | ||
for file_de in input_german_path.glob("*.dta"): | ||
file = None | ||
_run(file=file, file_de=file_de, output_path=output_path, study_name=study_name) | ||
if input_german_path is None: | ||
for file in input_path.glob("*.dta"): | ||
file_de = None | ||
_run(file=file, file_de=file_de, output_path=output_path, study_name=study_name) | ||
if input_path is not None and input_german_path is not None: | ||
for file in input_path.glob("*.dta"): | ||
file_de = pathlib.Path(str(input_german_path) + "/" + os.path.basename(str(file))) | ||
_run(file=file, file_de=file_de, output_path=output_path, study_name=study_name) | ||
|
||
duration = time.time() - start_time | ||
logging.info("Duration {:.5f} seconds".format(duration)) | ||
|
||
|
||
def _run(file: pathlib.Path, output_path: pathlib.Path, study_name: str) -> None: | ||
def _run(file: pathlib.Path, file_de: pathlib.Path, output_path: pathlib.Path, study_name: str) -> None: | ||
"""Encapsulate data processing run with multiprocessing.""" | ||
file_path = output_path.joinpath(file.stem).with_suffix(".json") | ||
if file is None: | ||
file_path = output_path.joinpath(file_de.stem).with_suffix(".json") | ||
stata_data_de = StataDataExtractor(file_de) | ||
stata_data_de.parse_file() | ||
|
||
write_json(stata_data_de.data, None, stata_data_de.metadata, file_path, study=study_name) | ||
|
||
elif file_de is None: | ||
file_path = output_path.joinpath(file.stem).with_suffix(".json") | ||
stata_data = StataDataExtractor(file) | ||
stata_data.parse_file() | ||
|
||
write_json(stata_data.data, stata_data.metadata, None, file_path, study=study_name) | ||
|
||
elif file is not None and file_de is not None: | ||
file_path = output_path.joinpath(file.stem).with_suffix(".json") | ||
stata_data = StataDataExtractor(file) | ||
stata_data.parse_file() | ||
stata_data_de = StataDataExtractor(file_de) | ||
stata_data_de.get_variable_metadata() |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This looks like you could remove a lot of redundancy and maybe reduce complexity
collect_stata/write_json.py
Outdated
metadata_en: List[Dict[str, Union[str, Dict[str, List[Union[int, str, bool]]]]]], | ||
metadata_de: List[Dict[str, Union[str, Dict[str, List[Union[int, str, bool]]]]]], |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why do you pass this to write_json and not directly into update_meatadata?
collect_stata/write_json.py
Outdated
metadata: List[Dict[str, Union[str, Dict[str, List[Union[int, str, bool]]]]]], | ||
metadata_de: List[Dict[str, Union[str, Dict[str, List[Union[int, str, bool]]]]]], |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We updated to python 3.8 on the develop branch. You can use TypedDicts to make this more readable and more precise. https://mypy.readthedocs.io/en/latest/more_types.html#typeddict
collect_stata/write_json.py
Outdated
Input: | ||
metadata: Metadata of the english imported data. | ||
metadata_de: Metadata of the german imported data. | ||
|
||
Output: | ||
metadata: Metadata variable with german and english labels if given. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please use Args: and Returns. Please indent blocks after :
https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html
@@ -202,10 +202,44 @@ def generate_statistics( | |||
|
|||
return metadata | |||
|
|||
def update_metadata( |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This function would better fit into one of the other modules.
https://en.wikipedia.org/wiki/Cohesion_(computer_science)
Please integrate the changes from develop. |
collect_stata/__main__.py
Outdated
@@ -4,7 +4,7 @@ | |||
import argparse | |||
import logging | |||
import os | |||
import pathlib | |||
import pathlib import Path |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
syntax error.
Please use your local linter and or the pre-commit hooks provided, to prevent this.
This pull request introduces 2 alerts when merging 0dc5f9f into 8f9220b - view on LGTM.com new alerts:
|
Sometimes elem["categories"]["missings"] was not initiated before calling elem["categories"]["missings"].append()
variable_meta["categories"]["values"] sometimes contained numbers of numpy integer types. Those numbers caused json.dumps() to fail since it could not be serialize these integer objects.
Wrong indentation in multithreading block caused only one process to run.
This pull request introduces 2 alerts when merging 0df40e0 into 8f9220b - view on LGTM.com new alerts:
|
Process gathering and starting was only done inside the block for processing two languages. * Remove lint * Fix type hints * Refactor bloated code
* Homogenize naming * Fix formatting
* Value labels are not switched when language of the stata file is switched. + Variable labels are though, which lead to the false expectations and faulty implementation. * Value labels are bound to a key which in turn is associated with a language and a variable. This key seems to be arbitrarily assignable. * It seems that value labels can only be associated with their variables through the position of the key inside the lbllist stata reader attribute.
* Cast statistics values from numpy internal type to python numeric values.
This pull request introduces 1 alert when merging 029afbd into 8f9220b - view on LGTM.com new alerts:
|
labels_de list is too short in some variables. Here is a snippet with labels and labels_de: "labels": [
"[-6] Version of questionnaire with modified filtering",
"[-5] Not included in this version of the questionnaire",
"[-4] Inadmissible multiple response",
"[-3] Answer improbable",
"[-2] Does not apply",
"[-1] No Answer",
"[0] 0 Satisfied: On Scale 0-Low to 10-High",
"[10] 10 Satisfied: On Scale 0-Low to 10-High",
"8",
"7",
"5",
"9",
"6",
"4",
"3",
"2",
"1"
],
"labels_de": [
"[-6] Fragebogenversion mit geaenderter Filterfuehrung",
"[-5] In Fragebogenversion nicht enthalten",
"[-4] Unzulaessige Mehrfachantwort",
"[-3] nicht valide",
"[-2] trifft nicht zu",
"[-1] keine Angabe",
"[0] Ganz unzufrieden",
"[10] Ganz zufrieden"
], Here is the full variable: {
"name": "qp14301",
"dataset": "qp",
"label": "Satisfaction With Life At Today",
"categories": {
"values": [
-6,
-5,
-4,
-3,
-2,
-1,
0,
10,
8,
7,
5,
9,
6,
4,
3,
2,
1
],
"labels": [
"[-6] Version of questionnaire with modified filtering",
"[-5] Not included in this version of the questionnaire",
"[-4] Inadmissible multiple response",
"[-3] Answer improbable",
"[-2] Does not apply",
"[-1] No Answer",
"[0] 0 Satisfied: On Scale 0-Low to 10-High",
"[10] 10 Satisfied: On Scale 0-Low to 10-High",
"8",
"7",
"5",
"9",
"6",
"4",
"3",
"2",
"1"
],
"labels_de": [
"[-6] Fragebogenversion mit geaenderter Filterfuehrung",
"[-5] In Fragebogenversion nicht enthalten",
"[-4] Unzulaessige Mehrfachantwort",
"[-3] nicht valide",
"[-2] trifft nicht zu",
"[-1] keine Angabe",
"[0] Ganz unzufrieden",
"[10] Ganz zufrieden"
],
"missings": [
true,
true,
true,
true,
true,
true,
false,
false,
false,
false,
false,
false,
false,
false,
false,
false,
false
],
"frequencies": [
0,
0,
0,
0,
0,
66,
111,
1554,
7484,
5242,
2993,
2828,
2674,
753,
512,
269,
90
]
},
"scale": "cat",
"label_de": "Lebenszufriedenh. gegenwaertig",
"study": "soep-core",
"statistics": {
"valid": 24510,
"invalid": 66
}
},
{
"name": "qp14302",
"dataset": "qp",
"label": "Satisfaction With Life In Five Years",
"categories": {
"values": [
-6,
-5,
-4,
-3,
-2,
-1,
0,
10,
8,
7,
9,
5,
6,
4,
3,
2,
1
],
"labels": [
"[-6] Version of questionnaire with modified filtering",
"[-5] Not included in this version of the questionnaire",
"[-4] Inadmissible multiple response",
"[-3] Answer improbable",
"[-2] Does not apply",
"[-1] No Answer",
"[0] 0 Satisfied: On Scale 0-Low to 10-High",
"[10] 10 Satisfied: On Scale 0-Low to 10-High",
"8",
"7",
"9",
"5",
"6",
"4",
"3",
"2",
"1"
],
"labels_de": [
"[-6] Fragebogenversion mit geaenderter Filterfuehrung",
"[-5] In Fragebogenversion nicht enthalten",
"[-4] Unzulaessige Mehrfachantwort",
"[-3] nicht valide",
"[-2] trifft nicht zu",
"[-1] keine Angabe",
"[0] Ganz unzufrieden",
"[10] Ganz zufrieden"
],
"missings": [
true,
true,
true,
true,
true,
true,
false,
false,
false,
false,
false,
false,
false,
false,
false,
false,
false
],
"frequencies": [
0,
0,
0,
0,
0,
560,
141,
1836,
6847,
4347,
3732,
2799,
2359,
924,
580,
328,
123
]
},
"scale": "cat",
"label_de": "Lebenszufriedenh. in 5 Jahren",
"study": "soep-core",
"statistics": {
"valid": 24016,
"invalid": 560
}
} |
This pull request introduces 2 alerts when merging ae65259 into 8f9220b - view on LGTM.com new alerts:
Heads-up: LGTM.com's PR analysis will be disabled on the 5th of December, and LGTM.com will be shut down ⏻ completely on the 16th of December 2022. Please enable GitHub code scanning, which uses the same CodeQL engine ⚙️ that powers LGTM.com. For more information, please check out our post on the GitHub blog. |
No description provided.