Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle common prefixes in variant alleles #39

Merged
merged 1 commit into from
Mar 26, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
*.DS_Store

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down
44 changes: 28 additions & 16 deletions standardize_mutation_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,9 +288,6 @@ def resolve_tumor_seq_alleles(data, ref_allele):
# resolve tumor seq allele 1 from the tumor genotype column if it still has not been resolved
if TUMOR_GENOTYPE_COLUMN in data.keys() and tum_seq_allele1 == "":
tum_seq_allele1 = re.split("[\/|]", data[TUMOR_GENOTYPE_COLUMN])[0]
# if tumor seq allele 1 is still empty then set it to the reference allele by default
if tum_seq_allele1 == "":
tum_seq_allele1 = ref_allele

# the importer determines which tumor seq allele to use as the alt allele
# so simply return the resolved values as they are
Expand Down Expand Up @@ -501,28 +498,43 @@ def resolve_variant_allele_data(data, maf_data):
# this will be used to resolve the variant classification and variant type
# if every tumor allele matches the ref allele then use an empty string
# if this happens then there might be something wrong with the data itself
try:
tumor_seq_allele = [allele for allele in [tumor_seq_allele1, tumor_seq_allele2] if allele != ref_allele][0]
except:
tumor_seq_allele = ""

variant_type = resolve_variant_type(data, ref_allele, tumor_seq_allele)
tumor_seq_allele = ""
for allele in [tumor_seq_allele1, tumor_seq_allele2]:
if allele != "" and allele != ref_allele:
tumor_seq_allele = allele
break

# fix ref allele and tum seq allele for INS or DEL variant types
if variant_type == "INS":
ref_allele = "-"
elif variant_type == "DEL":
tumor_seq_allele = "-"
# resolve start and end positions
start_pos = resolve_start_position(data)

# if the alleles share a common prefix then remove and adjust the start position accordingly
if not is_missing_data_value(ref_allele) and not is_missing_data_value(tumor_seq_allele) and not is_missing_data_value(start_pos):
common_prefix = os.path.commonprefix([ref_allele, tumor_seq_allele])
if common_prefix:
start_pos = str(int(start_pos) + len(common_prefix))
ref_allele = ref_allele[len(common_prefix):]
tumor_seq_allele = tumor_seq_allele[len(common_prefix):]
if not is_missing_data_value(tumor_seq_allele1):
tumor_seq_allele1 = tumor_seq_allele1[len(common_prefix):]
if not is_missing_data_value(tumor_seq_allele2):
tumor_seq_allele2 = tumor_seq_allele2[len(common_prefix):]

# ref and tumor seq allele might have been updated to remove common prefixes
# attempt to resolve the variant type based on the potentially updated allele strings
variant_type = resolve_variant_type(data, ref_allele, tumor_seq_allele)
variant_class = resolve_variant_classification(data, variant_type, ref_allele, tumor_seq_allele)
# fix variant type just in case it was missed before
if variant_class.endswith("INS") and variant_type != "INS":
variant_type = "INS"
elif variant_class.endswith("DEL") and variant_type != "DEL":
variant_type = "DEL"

# resolve start and end positions
start_pos = resolve_start_position(data)
# fix ref allele and tum seq allele for INS or DEL variant types
if variant_type == "INS" and len(ref_allele) == 0:
ref_allele = "-"
elif variant_type == "DEL" and len(tumor_seq_allele) == 0:
tumor_seq_allele = "-"

end_pos = resolve_end_position(data, start_pos, variant_type, ref_allele)

maf_data["Variant_Classification"] = variant_class
Expand Down