From 9efb7c51104c987940d981988d58016fb02bda67 Mon Sep 17 00:00:00 2001 From: antonylebechec Date: Fri, 20 Sep 2024 19:45:45 +0200 Subject: [PATCH] Add options to control transcripts view struct #256 --- howard/functions/commons.py | 28 + howard/objects/variants.py | 270 +++++++-- ...n_transcripts_profiles_fields_renamed.json | 54 ++ tests/test_commons.py | 28 + tests/test_variants_transcripts.py | 557 +++++++++++++++++- 5 files changed, 852 insertions(+), 85 deletions(-) create mode 100644 tests/data/prioritization_transcripts_profiles_fields_renamed.json diff --git a/howard/functions/commons.py b/howard/functions/commons.py index 3d8a13a..94f9d8c 100644 --- a/howard/functions/commons.py +++ b/howard/functions/commons.py @@ -3987,3 +3987,31 @@ def determine_column_number(values_list: list) -> str: return "." return "1" + + +def clean_annotation_field(name: str = "", char_allowed: list = None) -> str: + """ + The `clean_annotation_field` function removes characters from a string that are not alphanumeric or + in a specified list. + + :param name: The `name` parameter is a string that represents the input text that you want to clean. + It typically contains annotations or other text that you want to process + :type name: str + :param char_allowed: The `char_allowed` parameter is a list that contains characters that are + allowed to remain in the `name` string after cleaning. Any character in the `name` string that is + not alphanumeric and not in the `char_allowed` list will be removed during the cleaning process + :type char_allowed: list + :return: The function `clean_annotation_field` returns a cleaned version of the `name` string, where + only alphanumeric characters and characters from the `char_allowed` list are kept. 
+ """ + + # Init + if char_allowed is None: + char_allowed = [] + + # Convert char_allowed to a set for faster membership testing + char_allowed_set = set(char_allowed) + + return "".join( + char for char in name if (char.isalnum() or char in char_allowed_set) + ) diff --git a/howard/objects/variants.py b/howard/objects/variants.py index 6d2ca6a..cc85ccf 100644 --- a/howard/objects/variants.py +++ b/howard/objects/variants.py @@ -9725,7 +9725,6 @@ def transcripts_prioritization( pz_param.get("pzprefix", "PTZ") + pz_field ) else: - # pz_param_pzfields.append(pz_field) pz_field_new = pz_param.get("pzprefix", "PTZ") + pz_field pz_param_pzfields[pz_field] = pz_field_new @@ -9798,9 +9797,17 @@ def transcripts_prioritization( if "transcript" in fields_to_explode: fields_to_explode.remove("transcript") + # Fields intranscripts table + query_transcripts_table = f""" + DESCRIBE SELECT * FROM {transcripts_table} + """ + query_transcripts_table = self.get_query_to_df(query=query_transcripts_table) + # Check fields to explode for field_to_explode in fields_to_explode: - if field_to_explode not in self.get_header_infos_list(): + if field_to_explode not in self.get_header_infos_list() + list( + query_transcripts_table.column_name + ): msg_err = f"INFO/{field_to_explode} NOT IN header" log.error(msg_err) raise ValueError(msg_err) @@ -9890,11 +9897,6 @@ def transcripts_prioritization( FROM {transcripts_table} """ - # DEBUG - # log.debug(f""" query_update_ranking={query_update_ranking} """) - # df_devel = self.get_query_to_df(query=query_update_ranking) - # log.debug(df_devel) - # Export Transcripts prioritization infos to variants table query_update = f""" WITH RankedTranscripts AS ( @@ -9932,37 +9934,59 @@ def create_transcript_view_from_columns_map( added_columns: list = [], temporary_tables: list = None, annotation_fields: list = None, + column_rename: dict = {}, + column_clean: bool = False, + column_case: str = None, ) -> tuple[list, list, list]: """ The 
`create_transcript_view_from_columns_map` function generates a temporary table view based on specified columns mapping for transcripts data. - :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of - the table where the transcripts data is stored or will be stored in the database. This table - typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores, - predictions, etc. It defaults to "transcripts, defaults to transcripts + :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name + of the table where the transcripts data is stored or will be stored in the database. This table + typically contains information about transcripts such as Ensembl transcript IDs, gene names, + scores, predictions, etc. It defaults to "transcripts, defaults to transcripts :type transcripts_table: str (optional) - :param columns_maps: The `columns_maps` parameter is a dictionary that contains information about - how to map columns from a transcripts table to create a view. Each entry in the `columns_maps` list - represents a mapping configuration for a specific set of columns. It typically includes details such - as the main transcript column and additional information columns + :param columns_maps: The `columns_maps` parameter is a dictionary that contains information + about how to map columns from a transcripts table to create a view. Each entry in the + `columns_maps` list represents a mapping configuration for a specific set of columns. It + typically includes details such as the main transcript column and additional information columns :type columns_maps: dict - :param added_columns: The `added_columns` parameter in the `create_transcript_view_from_columns_map` - function is a list that stores the additional columns that will be added to the view being created - based on the columns map provided. 
These columns are generated by exploding the transcript - information columns along with the main transcript column + :param added_columns: The `added_columns` parameter in the + `create_transcript_view_from_columns_map` function is a list that stores the additional columns + that will be added to the view being created based on the columns map provided. These columns + are generated by exploding the transcript information columns along with the main transcript + column :type added_columns: list :param temporary_tables: The `temporary_tables` parameter in the `create_transcript_view_from_columns_map` function is a list that stores the names of temporary - tables created during the process of creating a transcript view from a columns map. These temporary - tables are used to store intermediate results or transformations before the final view is generated + tables created during the process of creating a transcript view from a columns map. These + temporary tables are used to store intermediate results or transformations before the final view + is generated :type temporary_tables: list :param annotation_fields: The `annotation_fields` parameter in the - `create_transcript_view_from_columns_map` function is a list that stores the fields that are used - for annotation in the query view creation process. These fields are extracted from the + `create_transcript_view_from_columns_map` function is a list that stores the fields that are + used for annotation in the query view creation process. These fields are extracted from the `transcripts_column` and `transcripts_infos_columns` specified in the `columns :type annotation_fields: list - :return: The function `create_transcript_view_from_columns_map` returns a tuple containing three + :param column_rename: The `column_rename` parameter in the + `create_transcript_view_from_columns_map` function is a dictionary that allows you to specify + custom renaming for columns during the creation of the temporary table view. 
This parameter + provides a mapping of original column names to the desired renamed column names. By using this + parameter, + :type column_rename: dict + :param column_clean: The `column_clean` parameter in the + `create_transcript_view_from_columns_map` function is a boolean flag that determines whether the + column values should be cleaned or not. If set to `True`, the column values will be cleaned by + removing any non-alphanumeric characters from them. This cleaning process ensures, defaults to + False + :type column_clean: bool (optional) + :param column_case: The `column_case` parameter in the `create_transcript_view_from_columns_map` + function is used to specify the case transformation to be applied to the columns during the view + creation process. It allows you to control whether the column values should be converted to + lowercase, uppercase, or remain unchanged + :type column_case: str + :return: The `create_transcript_view_from_columns_map` function returns a tuple containing three lists: `added_columns`, `temporary_tables`, and `annotation_fields`. 
""" @@ -10005,6 +10029,15 @@ def create_transcript_view_from_columns_map( # Transcripts infos columns transcripts_infos_columns = columns_map.get("transcripts_infos_columns", []) + # Transcripts infos columns rename + column_rename = columns_map.get("column_rename", column_rename) + + # Transcripts infos columns clean + column_clean = columns_map.get("column_clean", column_clean) + + # Transcripts infos columns case + column_case = columns_map.get("column_case", column_case) + if transcripts_column is not None: # Explode @@ -10013,24 +10046,53 @@ def create_transcript_view_from_columns_map( ) # View clauses - clause_select = [] + clause_select_variants = [] + clause_select_tanscripts = [] for field in [transcripts_column] + transcripts_infos_columns: - clause_select.append( + + # AS field + as_field = field + + # Rename + if column_rename: + as_field = column_rename.get(as_field, as_field) + + # Clean + if column_clean: + as_field = clean_annotation_field(as_field) + + # Case + if column_case: + if column_case.lower() in ["lower"]: + as_field = as_field.lower() + elif column_case.lower() in ["upper"]: + as_field = as_field.upper() + + # Clause select Variants + clause_select_variants.append( f""" regexp_split_to_table("{field}", ',') AS '{field}' """ ) - if field not in [transcripts_column]: - annotation_fields.append(field) + + if field in [transcripts_column]: + clause_select_tanscripts.append( + f""" regexp_split_to_table("{field}", ',') AS '{field}' """ + ) + else: + clause_select_tanscripts.append( + f""" regexp_split_to_table("{field}", ',') AS '{as_field}' """ + ) + annotation_fields.append(as_field) # Querey View query = f""" SELECT "#CHROM", POS, REF, ALT, INFO, "{transcripts_column}" AS 'transcript', - {", ".join(clause_select)} + {", ".join(clause_select_tanscripts)} FROM ( SELECT "#CHROM", POS, REF, ALT, INFO, - {", ".join(clause_select)} + {", ".join(clause_select_variants)} FROM {table_variants} ) WHERE "{transcripts_column}" IS NOT NULL @@ -10057,33 
+10119,55 @@ def create_transcript_view_from_column_format( column_formats: dict = {}, temporary_tables: list = None, annotation_fields: list = None, + column_rename: dict = {}, + column_clean: bool = False, + column_case: str = None, ) -> tuple[list, list, list]: """ The `create_transcript_view_from_column_format` function generates a transcript view based on specified column formats, adds additional columns and annotation fields, and returns the list of temporary tables and annotation fields. - :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of - the table containing the transcripts data. This table will be used as the base table for creating - the transcript view. The default value for this parameter is "transcripts", but you can provide a - different table name if needed, defaults to transcripts + :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name + of the table containing the transcripts data. This table will be used as the base table for + creating the transcript view. The default value for this parameter is "transcripts", but you can + provide a different table name if needed, defaults to transcripts :type transcripts_table: str (optional) :param column_formats: The `column_formats` parameter is a dictionary that contains information about the columns to be used for creating the transcript view. Each entry in the dictionary - specifies the mapping between a transcripts column and a transcripts infos column. For example, in - the provided code snippet: + specifies the mapping between a transcripts column and a transcripts infos column. 
This + parameter allows you to define how the columns from the transcripts table should be transformed + or mapped :type column_formats: dict :param temporary_tables: The `temporary_tables` parameter in the - `create_transcript_view_from_column_format` function is a list that stores the names of temporary - views created during the process of creating a transcript view from a column format. These temporary - views are used to manipulate and extract data before generating the final transcript view. It + `create_transcript_view_from_column_format` function is a list that stores the names of + temporary views created during the process of creating a transcript view from a column format. + These temporary views are used to manipulate and extract data before generating the final + transcript view :type temporary_tables: list :param annotation_fields: The `annotation_fields` parameter in the `create_transcript_view_from_column_format` function is a list that stores the annotation fields - that are extracted from the temporary views created during the process. These annotation fields are - obtained by querying the temporary views and extracting the column names excluding specific columns - like `#CH + that are extracted from the temporary views created during the process. These annotation fields + are obtained by querying the temporary views and extracting the column names excluding specific + columns like `#CH :type annotation_fields: list + :param column_rename: The `column_rename` parameter in the + `create_transcript_view_from_column_format` function is a dictionary that allows you to specify + custom renaming of columns in the transcripts infos table. 
By providing a mapping of original + column names to new column names in this dictionary, you can rename specific columns during the + process + :type column_rename: dict + :param column_clean: The `column_clean` parameter in the + `create_transcript_view_from_column_format` function is a boolean flag that determines whether + the transcripts infos columns should undergo a cleaning process. If set to `True`, the columns + will be cleaned during the creation of the transcript view based on the specified column format, + defaults to False + :type column_clean: bool (optional) + :param column_case: The `column_case` parameter in the + `create_transcript_view_from_column_format` function is used to specify the case transformation + to be applied to the columns in the transcript view. It can be set to either "upper" or "lower" + to convert the column names to uppercase or lowercase, respectively + :type column_case: str :return: The `create_transcript_view_from_column_format` function returns two lists: `temporary_tables` and `annotation_fields`. 
""" @@ -10111,6 +10195,15 @@ def create_transcript_view_from_column_format( "transcripts_infos_column", "Feature_ID" ) + # Transcripts infos columns rename + column_rename = column_format.get("column_rename", column_rename) + + # Transcripts infos columns clean + column_clean = column_format.get("column_clean", column_clean) + + # Transcripts infos columns case + column_case = column_format.get("column_case", column_case) + # Temporary View name temporary_view_name = transcripts_table + "".join( random.choices(string.ascii_uppercase + string.digits, k=10) @@ -10122,6 +10215,9 @@ def create_transcript_view_from_column_format( annotation_field=annotation_field, view_name=temporary_view_name, annotation_id=transcript_annotation, + column_rename=column_rename, + column_clean=column_clean, + column_case=column_case, ) # Annotation fields @@ -10230,9 +10326,15 @@ def create_transcript_view( temporary_tables += temporary_tables_tmp annotation_fields += annotation_fields_tmp + # Remove some specific fields/column + annotation_fields = list(set(annotation_fields)) + for field in ["#CHROM", "POS", "REF", "ALT", "INFO", "transcript"]: + if field in annotation_fields: + annotation_fields.remove(field) + # Merge temporary tables query query_merge = "" - for temporary_table in temporary_tables: + for temporary_table in list(set(temporary_tables)): # First temporary table if not query_merge: @@ -10289,38 +10391,63 @@ def annotation_format_to_table( annotation_field: str = "ANN", annotation_id: str = "Feature_ID", view_name: str = "transcripts", + column_rename: dict = {}, + column_clean: bool = False, + column_case: str = None, ) -> str: """ - The function `annotation_format_to_table` converts annotation data from a VCF file into a structured - table format. + The `annotation_format_to_table` function converts annotation data from a VCF file into a + structured table format, ensuring unique values and creating a temporary table for further + processing or analysis. 
- :param uniquify: The `uniquify` parameter is a boolean flag that determines whether to ensure unique - values in the output or not. If set to `True`, the function will make sure that the output values - are unique, defaults to True + :param uniquify: The `uniquify` parameter is a boolean flag that determines whether to ensure + unique values in the output or not. If set to `True`, the function will make sure that the + output values are unique, defaults to True :type uniquify: bool (optional) - :param annotation_field: The `annotation_field` parameter refers to the field in the VCF file that - contains the annotation information for each variant. This field is used to extract the annotation - details for further processing in the function, defaults to ANN + :param annotation_field: The `annotation_field` parameter refers to the field in the VCF file + that contains the annotation information for each variant. This field is used to extract the + annotation details for further processing in the function. By default, it is set to "ANN", + defaults to ANN :type annotation_field: str (optional) - :param annotation_id: The `annotation_id` parameter in the `annotation_format_to_table` method is - used to specify the identifier for the annotation feature. This identifier will be used as a column - name in the resulting table or view that is created based on the annotation data. It helps in - uniquely identifying each annotation entry in the, defaults to Feature_ID + :param annotation_id: The `annotation_id` parameter in the `annotation_format_to_table` method + is used to specify the identifier for the annotation feature. This identifier will be used as a + column name in the resulting table or view that is created based on the annotation data. 
It + helps in uniquely identifying each annotation entry in the, defaults to Feature_ID :type annotation_id: str (optional) - :param view_name: The `view_name` parameter in the `annotation_format_to_table` method is used to - specify the name of the temporary table that will be created to store the transformed annotation - data. This table will hold the extracted information from the annotation field in a structured - format for further processing or analysis, defaults to transcripts + :param view_name: The `view_name` parameter in the `annotation_format_to_table` method is used + to specify the name of the temporary table that will be created to store the transformed + annotation data. This table will hold the extracted information from the annotation field in a + structured format for further processing or analysis. By default,, defaults to transcripts :type view_name: str (optional) - :return: The function `annotation_format_to_table` is returning the name of the view created, which - is stored in the variable `view_name`. + :param column_rename: The `column_rename` parameter in the `annotation_format_to_table` method + is a dictionary that allows you to specify custom renaming for columns. By providing key-value + pairs in this dictionary, you can rename specific columns in the resulting table or view that is + created based on the annotation data. This feature enables + :type column_rename: dict + :param column_clean: The `column_clean` parameter in the `annotation_format_to_table` method is + a boolean flag that determines whether the annotation field should undergo a cleaning process. + If set to `True`, the function will clean the annotation field before further processing. 
This + cleaning step may involve removing any unwanted characters, formatting inconsistencies, defaults + to False + :type column_clean: bool (optional) + :param column_case: The `column_case` parameter in the `annotation_format_to_table` method is + used to specify the case transformation to be applied to the column names extracted from the + annotation data. It allows you to set the case of the column names to either lowercase or + uppercase for consistency or other specific requirements during the conversion + :type column_case: str + :return: The function `annotation_format_to_table` is returning the name of the view created, + which is stored in the variable `view_name`. """ # Annotation field annotation_format = "annotation_explode" # Transcript annotation - annotation_id = "".join(char for char in annotation_id if char.isalnum()) + if column_rename: + annotation_id = column_rename.get(annotation_id, annotation_id) + + if column_clean: + annotation_id = clean_annotation_field(annotation_id) # Prefix prefix = self.get_explode_infos_prefix() @@ -10396,9 +10523,22 @@ def annotation_format_to_table( # Key key = row.iloc[0] - - # key_clean - key_clean = "".join(char for char in key if char.isalnum()) + key_clean = key + + # key rename + if column_rename: + key_clean = column_rename.get(key_clean, key_clean) + + # key clean + if column_clean: + key_clean = clean_annotation_field(key_clean) + + # Key case + if column_case: + if column_case.lower() in ["lower"]: + key_clean = key_clean.lower() + elif column_case.lower() in ["upper"]: + key_clean = key_clean.upper() # Type query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');""" diff --git a/tests/data/prioritization_transcripts_profiles_fields_renamed.json b/tests/data/prioritization_transcripts_profiles_fields_renamed.json new file mode 100644 index 0000000..e33bf59 --- /dev/null +++ 
b/tests/data/prioritization_transcripts_profiles_fields_renamed.json @@ -0,0 +1,54 @@ +{ + "transcripts": { + "LISTScore": [ + { + "type": "gt", + "value": "0.75", + "score": 10, + "flag": "PASS", + "comment": ["Very Good LIST Score"] + }, + { + "type": "gt", + "value": "0.50", + "score": 10, + "flag": "PASS", + "comment": ["Good LIST Score"] + } + ], + "CLNSIG": [ + { + "type": "eq", + "value": "pathogenic", + "score": 100, + "flag": "PASS", + "comment": ["Pathogenic"] + } + ], + "AnnotationImpact": [ + { + "type": "eq", + "value": "MODIFIER", + "score": 100, + "flag": "PASS", + "comment": ["MODIFIER"] + } + ], + "transcript": [ + { + "type": "eq", + "value": "NM_005228.5", + "score": 100, + "flag": "PASS", + "comment": ["NM_005228.5"] + }, + { + "type": "eq", + "value": "NM_001346941.2", + "score": 100, + "flag": "PASS", + "comment": ["NM_001346941.2"] + } + ] + } +} diff --git a/tests/test_commons.py b/tests/test_commons.py index d149fa0..ac415d1 100644 --- a/tests/test_commons.py +++ b/tests/test_commons.py @@ -1531,3 +1531,31 @@ def test_get_duckdb_extension_file(): conn = duckdb.connect() assert get_duckdb_extension_file("sqlite_scanner", conn=conn) + + +def test_clean_annotation_field_basic_alphanumeric(): + assert clean_annotation_field("HelloWorld") == "HelloWorld" + + +def test_clean_annotation_field_with_special_characters(): + assert clean_annotation_field("Hello, World!") == "HelloWorld" + + +def test_clean_annotation_field_with_allowed_characters(): + assert clean_annotation_field("Hello-World", char_allowed=["-"]) == "Hello-World" + + +def test_clean_annotation_field_empty_string(): + assert clean_annotation_field("") == "" + + +def test_clean_annotation_field_no_allowed_characters(): + assert clean_annotation_field("Hello@World#2023") == "HelloWorld2023" + + +def test_clean_annotation_field_all_characters_removed(): + assert clean_annotation_field("!!!") == "" + + +def test_clean_annotation_field_non_alphanumeric_with_allowed_chars(): + assert 
clean_annotation_field("Test123!@#", char_allowed=["!"]) == "Test123!" diff --git a/tests/test_variants_transcripts.py b/tests/test_variants_transcripts.py index 3882b2e..bf4cf64 100644 --- a/tests/test_variants_transcripts.py +++ b/tests/test_variants_transcripts.py @@ -23,10 +23,10 @@ @pytest.mark.parametrize( "input_vcf", [ - "tests/data/example.ann.transcripts.vcf.gz", - "tests/data/example.ann.vcf.gz", - "tests/data/example.dbnsfp.transcripts.vcf.gz", - "tests/data/example.dbnsfp.no_transcripts.vcf.gz", + f"{tests_data_folder}/example.ann.transcripts.vcf.gz", + f"{tests_data_folder}/example.ann.vcf.gz", + f"{tests_data_folder}/example.dbnsfp.transcripts.vcf.gz", + f"{tests_data_folder}/example.dbnsfp.no_transcripts.vcf.gz", ], ) def test_create_transcript_view(input_vcf): @@ -53,6 +53,9 @@ def test_create_transcript_view(input_vcf): { "transcripts_column": "ANN", "transcripts_infos_column": "Feature_ID", + "column_rename": None, + "column_clean": True, + "column_case": None, } ], "from_columns_map": [ # format List, e.g. 
dbNSFP columns @@ -64,6 +67,9 @@ def test_create_transcript_view(input_vcf): "LIST_S2_score", "LIST_S2_pred", ], + "column_rename": None, + "column_clean": False, + "column_case": None, }, { "transcripts_column": "Ensembl_transcriptid", @@ -72,6 +78,9 @@ def test_create_transcript_view(input_vcf): "VARITY_R_score", "Aloft_pred", ], + "column_rename": None, + "column_clean": False, + "column_case": None, }, ], }, @@ -101,10 +110,10 @@ def test_create_transcript_view(input_vcf): @pytest.mark.parametrize( "input_vcf", [ - "tests/data/example.ann.transcripts.vcf.gz", - "tests/data/example.ann.vcf.gz", - "tests/data/example.dbnsfp.transcripts.vcf.gz", - "tests/data/example.dbnsfp.no_transcripts.vcf.gz", + f"{tests_data_folder}/example.ann.transcripts.vcf.gz", + f"{tests_data_folder}/example.ann.vcf.gz", + f"{tests_data_folder}/example.dbnsfp.transcripts.vcf.gz", + f"{tests_data_folder}/example.dbnsfp.no_transcripts.vcf.gz", ], ) def test_create_transcript_view_to_variants(input_vcf): @@ -126,6 +135,9 @@ def test_create_transcript_view_to_variants(input_vcf): { "transcripts_column": "ANN", "transcripts_infos_column": "Feature_ID", + "column_rename": None, + "column_clean": True, + "column_case": None, } ], "from_columns_map": [ # format List, e.g. 
dbNSFP columns @@ -137,6 +149,9 @@ def test_create_transcript_view_to_variants(input_vcf): "LIST_S2_score", "LIST_S2_pred", ], + "column_rename": None, + "column_clean": False, + "column_case": None, }, { "transcripts_column": "Ensembl_transcriptid", @@ -145,6 +160,9 @@ def test_create_transcript_view_to_variants(input_vcf): "VARITY_R_score", "Aloft_pred", ], + "column_rename": None, + "column_clean": False, + "column_case": None, }, ], }, @@ -283,10 +301,10 @@ def test_create_transcript_view_to_variants(input_vcf): @pytest.mark.parametrize( "input_vcf", [ - "tests/data/example.ann.transcripts.vcf.gz", - "tests/data/example.ann.vcf.gz", - "tests/data/example.dbnsfp.transcripts.vcf.gz", - "tests/data/example.dbnsfp.no_transcripts.vcf.gz", + f"{tests_data_folder}/example.ann.transcripts.vcf.gz", + f"{tests_data_folder}/example.ann.vcf.gz", + f"{tests_data_folder}/example.dbnsfp.transcripts.vcf.gz", + f"{tests_data_folder}/example.dbnsfp.no_transcripts.vcf.gz", ], ) def test_transcripts_prioritization(input_vcf): @@ -309,10 +327,16 @@ def test_transcripts_prioritization(input_vcf): { "transcripts_column": "ANN", "transcripts_infos_column": "Feature_ID", + "column_rename": None, + "column_clean": True, + "column_case": None, }, { "transcripts_column": "ANN", "transcripts_infos_column": "Feature_ID", + "column_rename": None, + "column_clean": True, + "column_case": None, }, ], "from_columns_map": [ @@ -324,6 +348,9 @@ def test_transcripts_prioritization(input_vcf): "LIST_S2_score", "LIST_S2_pred", ], + "column_rename": None, + "column_clean": False, + "column_case": None, }, { "transcripts_column": "Ensembl_transcriptid", @@ -332,13 +359,16 @@ def test_transcripts_prioritization(input_vcf): "VARITY_R_score", "Aloft_pred", ], + "column_rename": None, + "column_clean": False, + "column_case": None, }, ], }, } param_prioritization = { "profiles": ["transcripts"], - "prioritization_config": "config/prioritization_transcripts_profiles.json", + "prioritization_config": 
f"{tests_data_folder}/prioritization_transcripts_profiles.json", "pzprefix": "PZT", "prioritization_score_mode": "HOWARD", } @@ -484,7 +514,7 @@ def test_transcripts_prioritization(input_vcf): f"{tests_data_folder}/example.ann.transcripts.vcf.gz", { "profiles": ["transcripts"], - "prioritization_config": "config/prioritization_transcripts_profiles.json", + "prioritization_config": f"{tests_data_folder}/prioritization_transcripts_profiles.json", "pzprefix": "PZT", "prioritization_score_mode": "HOWARD", }, @@ -500,7 +530,7 @@ def test_transcripts_prioritization(input_vcf): f"{tests_data_folder}/example.ann.transcripts.vcf.gz", { "profiles": ["transcripts"], - "prioritization_config": "config/prioritization_transcripts_profiles.json", + "prioritization_config": f"{tests_data_folder}/prioritization_transcripts_profiles.json", "pzprefix": "PZT", "pzfields": ["Score", "Flag"], "prioritization_score_mode": "HOWARD", @@ -518,7 +548,7 @@ def test_transcripts_prioritization(input_vcf): f"{tests_data_folder}/example.ann.transcripts.vcf.gz", { "profiles": ["transcripts"], - "prioritization_config": "config/prioritization_transcripts_profiles.json", + "prioritization_config": f"{tests_data_folder}/prioritization_transcripts_profiles.json", "pzprefix": "PZT", "pzfields": ["Score", "Flag", "LIST_S2_score", "LIST_S2_pred"], "prioritization_score_mode": "HOWARD", @@ -538,7 +568,7 @@ def test_transcripts_prioritization(input_vcf): f"{tests_data_folder}/example.ann.transcripts.vcf.gz", { "profiles": ["transcripts"], - "prioritization_config": "config/prioritization_transcripts_profiles.json", + "prioritization_config": f"{tests_data_folder}/prioritization_transcripts_profiles.json", "pzprefix": "PZT", "pzfields": [ "Score", @@ -555,7 +585,7 @@ def test_transcripts_prioritization(input_vcf): f"{tests_data_folder}/example.ann.transcripts.vcf.gz", { "profiles": ["transcripts"], - "prioritization_config": "config/prioritization_transcripts_profiles.json", + "prioritization_config": 
f"{tests_data_folder}/prioritization_transcripts_profiles.json", "pzprefix": "PZT", "prioritization_transcripts_order": { "LIST_S2_score": "ASC", @@ -574,7 +604,7 @@ def test_transcripts_prioritization(input_vcf): f"{tests_data_folder}/example.ann.transcripts.vcf.gz", { "profiles": ["transcripts"], - "prioritization_config": "config/prioritization_transcripts_profiles.json", + "prioritization_config": f"{tests_data_folder}/prioritization_transcripts_profiles.json", "pzprefix": "PZT", "prioritization_transcripts_order": { "CADD_raw": "ASC", @@ -593,7 +623,7 @@ def test_transcripts_prioritization(input_vcf): f"{tests_data_folder}/example.ann.transcripts.vcf.gz", { "profiles": ["transcripts"], - "prioritization_config": "config/prioritization_transcripts_profiles.json", + "prioritization_config": f"{tests_data_folder}/prioritization_transcripts_profiles.json", "pzprefix": "PZT", "prioritization_transcripts_order": { "field_not_present_in_header": "ASC", @@ -647,12 +677,491 @@ def test_transcripts_prioritization_multiple_param( { "transcripts_column": "ANN", "transcripts_infos_column": "Feature_ID", + "column_rename": None, + "column_clean": True, + "column_case": None, + } + ], + "from_columns_map": [ + { + "transcripts_column": "Ensembl_transcriptid", + "transcripts_infos_columns": [ + "genename", + "Ensembl_geneid", + "LIST_S2_score", + "LIST_S2_pred", + ], + "column_rename": None, + "column_clean": False, + "column_case": None, }, + { + "transcripts_column": "Ensembl_transcriptid", + "transcripts_infos_columns": [ + "genename", + "VARITY_R_score", + "Aloft_pred", + ], + "column_rename": None, + "column_clean": False, + "column_case": None, + }, + ], + }, + } + + # Param without prioritization + param_without_prioritization = {"transcripts": dict(param_struct)} + + # Param with prioritization + param_with_prioritization = {"transcripts": dict(param_struct)} + param_with_prioritization["transcripts"]["prioritization"] = dict( + param_prioritization + ) + + # Create 
object + variants = Variants( + conn=None, input=input_vcf, output=output_vcf, param=param, load=True + ) + + # Create transcript view + transcripts_table = variants.create_transcript_view( + param=param_without_prioritization + ) + + # Check table exists + assert transcripts_table is not None + + # If Raise with Value Error + if raise_value_error: + + # Catch ValueError + with pytest.raises(ValueError) as excinfo: + + # Prioritization + variants.transcripts_prioritization(param=param_with_prioritization) + + assert str(excinfo.value) == raise_value_error + + # If expected results + if where_clause: + + # Prioritization + assert variants.transcripts_prioritization(param=param_with_prioritization) + + # Check transcript prioritization result + # Check table content + query_check = f""" + SELECT * FROM variants + WHERE {where_clause} + """ + check = variants.get_query_to_df(query=query_check) + assert len(check) > 0 + + # Export + ######## + + # Check if VCF is in correct format with pyVCF + remove_if_exists([output_vcf]) + variants.export_output(output_file=output_vcf) + try: + vcf.Reader(filename=output_vcf) + except: + assert False + + +@pytest.mark.parametrize( + "struct, fields_list", + [ + ( # By default, no rename, no clean, no case (except clean for snpEff because mandatory) + { + "from_column_format": [ # format List, e.g. snpEff + { + "transcripts_column": "ANN", + "transcripts_infos_column": "Feature_ID", + "column_clean": True, + } + ], + "from_columns_map": [ # format List, e.g. 
dbNSFP columns + { + "transcripts_column": "Ensembl_transcriptid", + "transcripts_infos_columns": [ + "genename", + "Ensembl_geneid", + "LIST_S2_score", + "LIST_S2_pred", + ], + }, + { + "transcripts_column": "Ensembl_transcriptid", + "transcripts_infos_columns": [ + "genename", + "VARITY_R_score", + "Aloft_pred", + ], + }, + ], + }, + [ + "FeatureID", + "Ensembl_geneid", + "LIST_S2_score", + "LIST_S2_pred", + "VARITY_R_score", + "Aloft_pred", + ], + ), + ( # No rename, no clean, nor case (except clean for snpEff because mandatory) + { + "from_column_format": [ # format List, e.g. snpEff + { + "transcripts_column": "ANN", + "transcripts_infos_column": "Feature_ID", + "column_rename": None, + "column_clean": True, + "column_case": None, + } + ], + "from_columns_map": [ # format List, e.g. dbNSFP columns + { + "transcripts_column": "Ensembl_transcriptid", + "transcripts_infos_columns": [ + "genename", + "Ensembl_geneid", + "LIST_S2_score", + "LIST_S2_pred", + ], + "column_rename": None, + "column_clean": False, + "column_case": None, + }, + { + "transcripts_column": "Ensembl_transcriptid", + "transcripts_infos_columns": [ + "genename", + "VARITY_R_score", + "Aloft_pred", + ], + "column_rename": None, + "column_clean": False, + "column_case": None, + }, + ], + }, + [ + "FeatureID", + "Ensembl_geneid", + "LIST_S2_score", + "LIST_S2_pred", + "VARITY_R_score", + "Aloft_pred", + ], + ), + ( # No rename, clean all and nocase (except clean for snpEff because mandatory) + { + "from_column_format": [ # format List, e.g. snpEff { "transcripts_column": "ANN", "transcripts_infos_column": "Feature_ID", + "column_rename": None, + "column_clean": True, + "column_case": None, + } + ], + "from_columns_map": [ # format List, e.g. 
dbNSFP columns + { + "transcripts_column": "Ensembl_transcriptid", + "transcripts_infos_columns": [ + "genename", + "Ensembl_geneid", + "LIST_S2_score", + "LIST_S2_pred", + ], + "column_rename": None, + "column_clean": True, + "column_case": None, + }, + { + "transcripts_column": "Ensembl_transcriptid", + "transcripts_infos_columns": [ + "genename", + "VARITY_R_score", + "Aloft_pred", + ], + "column_rename": None, + "column_clean": True, + "column_case": None, }, ], + }, + [ + "FeatureID", + "Ensemblgeneid", + "LISTS2score", + "LISTS2pred", + "VARITYRscore", + "Aloftpred", + ], + ), + ( # No rename, no clean, and case all (except clean for snpEff because mandatory) + { + "from_column_format": [ # format List, e.g. snpEff + { + "transcripts_column": "ANN", + "transcripts_infos_column": "Feature_ID", + "column_rename": None, + "column_clean": True, + "column_case": "lower", + } + ], + "from_columns_map": [ # format List, e.g. dbNSFP columns + { + "transcripts_column": "Ensembl_transcriptid", + "transcripts_infos_columns": [ + "genename", + "Ensembl_geneid", + "LIST_S2_score", + "LIST_S2_pred", + ], + "column_rename": None, + "column_clean": False, + "column_case": "lower", + }, + { + "transcripts_column": "Ensembl_transcriptid", + "transcripts_infos_columns": [ + "genename", + "VARITY_R_score", + "Aloft_pred", + ], + "column_rename": None, + "column_clean": False, + "column_case": "lower", + }, + ], + }, + [ + "featureid", + "ensembl_geneid", + "list_s2_score", + "list_s2_pred", + "varity_r_score", + "aloft_pred", + ], + ), + ( # No rename, clean all and case all (except clean for snpEff because mandatory) + { + "from_column_format": [ # format List, e.g. snpEff + { + "transcripts_column": "ANN", + "transcripts_infos_column": "Feature_ID", + "column_rename": None, + "column_clean": True, + "column_case": "lower", + } + ], + "from_columns_map": [ # format List, e.g. 
dbNSFP columns + { + "transcripts_column": "Ensembl_transcriptid", + "transcripts_infos_columns": [ + "genename", + "Ensembl_geneid", + "LIST_S2_score", + "LIST_S2_pred", + ], + "column_rename": None, + "column_clean": True, + "column_case": "lower", + }, + { + "transcripts_column": "Ensembl_transcriptid", + "transcripts_infos_columns": [ + "genename", + "VARITY_R_score", + "Aloft_pred", + ], + "column_rename": None, + "column_clean": True, + "column_case": "lower", + }, + ], + }, + [ + "featureid", + "ensemblgeneid", + "lists2score", + "lists2pred", + "varityrscore", + "aloftpred", + ], + ), + ( # Rename "genename" columns to merge, transcript ANN id, extra columns on struct_map + { + "from_column_format": [ # format List, e.g. snpEff + { + "transcripts_column": "ANN", + "transcripts_infos_column": "Feature_ID", + "column_rename": { + "Gene_Name": "genename", + "Feature_ID": "THETRANSCRIPTOFSNPEFF", + }, + "column_clean": True, + "column_case": None, + } + ], + "from_columns_map": [ # format List, e.g. dbNSFP columns + { + "transcripts_column": "Ensembl_transcriptid", + "transcripts_infos_columns": [ + "genename", + "Ensembl_geneid", + "LIST_S2_score", + "LIST_S2_pred", + ], + "column_clean": False, + "column_case": None, + "column_rename": { + "LIST_S2_score": "LISTScore", + "LIST_S2_pred": "LISTPred", + }, + }, + { + "transcripts_column": "Ensembl_transcriptid", + "transcripts_infos_columns": [ + "genename", + "VARITY_R_score", + "Aloft_pred", + ], + "column_clean": False, + "column_case": None, + }, + ], + }, + [ + "genename", + "THETRANSCRIPTOFSNPEFF", + "LISTScore", + "LISTPred", + "VARITY_R_score", + "Aloft_pred", + ], + ), + ], +) +def test_create_transcript_view_rename_clean_case(struct, fields_list): + """ + The function `test_devel_create_transcript_view` creates a transcript view from a VCF file using + specified parameters and checks the resulting table for data. 
+ + :param input_vcf: It seems like the `input_vcf` parameter is missing in the provided code snippet. + Could you please provide the value or path that should be assigned to the `input_vcf` variable in + the `test_devel_create_transcript_view` function? + """ + + with TemporaryDirectory(dir=tests_folder) as tmp_dir: + + # Init files + input_vcf = f"{tests_data_folder}/example.ann.transcripts.vcf.gz" + output_vcf = f"{tmp_dir}/output.vcf" + + # Construct param dict + param = {"transcripts": {"table": "transcripts", "struct": struct}} + + # Create object + variants = Variants( + conn=None, input=input_vcf, output=output_vcf, param=param, load=True + ) + + # Create transcript view + transcripts_table = variants.create_transcript_view() + + # Check table exists + assert transcripts_table is not None + + # Check table content + query_check = f""" + SELECT column_name + FROM ( + DESCRIBE SELECT * FROM {transcripts_table} + ) + WHERE column_name in ('{"', '".join(fields_list)}') + """ + check = variants.get_query_to_df(query=query_check) + + assert len(check) == len(list(set(fields_list))) + + +@pytest.mark.parametrize( + "input_vcf, param_prioritization, where_clause, raise_value_error", + [ + ( # Add PZfields plus + f"{tests_data_folder}/example.ann.transcripts.vcf.gz", + { + "profiles": ["transcripts"], + "prioritization_config": f"{tests_data_folder}/prioritization_transcripts_profiles_fields_renamed.json", + "pzprefix": "PZT", + "pzfields": ["Score", "Flag", "LISTScore", "LISTPred"], + "prioritization_score_mode": "HOWARD", + }, + """ + "#CHROM" = 'chr1' + AND POS = 69101 + AND contains(INFO, 'PZTTranscript=ENST00000641515') + AND contains(INFO, 'PZTScore') + AND contains(INFO, 'PZTFlag') + AND contains(INFO, 'PZTLISTScore') + AND contains(INFO, 'PZTLISTPred') + """, + None, + ), + ], +) +def test_transcripts_prioritization_multiple_param_fields_renamed( + input_vcf, param_prioritization, where_clause, raise_value_error +): + """ + The 
`test_transcripts_prioritization_multiple_param_fields_renamed` function tests transcript prioritization + functionality in a genetic variant analysis pipeline with configurable parameters. + + :param input_vcf: The `input_vcf` parameter is the path to the VCF + (Variant Call Format) file that contains genetic variant data. This file is used as input for + the genetic variant analysis pipeline where the transcript prioritization functionality is being + tested + :param param_prioritization: The `param_prioritization` parameter is a dictionary that contains + information about the prioritization configuration for transcripts in a genetic variant analysis + pipeline. It includes details such as profiles, prioritization configuration file path, prefix, and + score mode. This parameter is used to customize how transcripts are prioritized during the test + :param where_clause: The `where_clause` parameter of this test function + is a SQL WHERE clause that is used to filter the results of a query. It specifies a condition that + must be met for a row to be included in the result set + :param raise_value_error: The `raise_value_error` parameter of this test + function determines whether the test should expect a `ValueError` and check if + the raised error message matches a specific value (the case parametrized for this test passes `None`). 
If `raise_value_error` is `True`, the test will + raise + """ + + with TemporaryDirectory(dir=tests_folder) as tmp_dir: + + # Init files + output_vcf = f"{tmp_dir}/output.vcf" + + # Construct param dict + param = {} + param_struct = { + "table": "transcripts", + "column_id": "transcript", + "transcripts_info_json": "transcripts_json", + "transcripts_info_field": "transcripts_json", + "struct": { + "from_column_format": [ + { + "transcripts_column": "ANN", + "transcripts_infos_column": "Feature_ID", + "column_rename": None, + "column_clean": True, + "column_case": None, + } + ], "from_columns_map": [ { "transcripts_column": "Ensembl_transcriptid", @@ -662,6 +1171,12 @@ def test_transcripts_prioritization_multiple_param( "LIST_S2_score", "LIST_S2_pred", ], + "column_rename": { + "LIST_S2_score": "LISTScore", + "LIST_S2_pred": "LISTPred", + }, + "column_clean": False, + "column_case": None, }, { "transcripts_column": "Ensembl_transcriptid", @@ -670,6 +1185,8 @@ def test_transcripts_prioritization_multiple_param( "VARITY_R_score", "Aloft_pred", ], + "column_clean": False, + "column_case": None, }, ], },