-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path1000genomes_match_to_tsv.py
executable file
·118 lines (91 loc) · 5.62 KB
/
1000genomes_match_to_tsv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import os
import pandas as pd
import sys
def match_csv_tsv(directory_gwas_study_tsv, directory_1000_genomes, track_selection_list):
    """Merge 1000 Genomes prediction CSVs with a GWAS study TSV on (chr, pos).

    Reads every ``*.csv`` in ``directory_1000_genomes`` (except ``targets.csv``)
    in chunks, computes the row-wise mean of the SAD columns named in
    ``track_selection_list``, and left-joins each chunk against the GWAS TSV
    on chromosome / base-pair position.  TSV rows whose (chr, pos) never
    appeared in any CSV are appended afterwards with the CSV-derived columns
    left empty.  Finally the combined table is sorted by ``|average_SAD|``
    (descending, NaN last) and written out with a 1-based ``ranking`` column.

    Parameters
    ----------
    directory_gwas_study_tsv : str
        Path to the GWAS summary-statistics TSV; must contain ``chromosome``,
        ``base_pair_location``, ``beta``, ``standard_error``,
        ``effect_allele_frequency``, ``p_value``, ``other_allele`` and
        ``effect_allele`` columns.
    directory_1000_genomes : str
        Directory holding the 1000 Genomes CSV files.
    track_selection_list : list[str]
        Names of the SAD track columns to average.

    Returns
    -------
    tuple[str, str]
        Paths of (unsorted combined CSV, sorted + ranked CSV), both under
        ``./1000_genomes_csv_tsv_match``.
    """
    chunksize = 100000
    output_path = './1000_genomes_csv_tsv_match'
    os.makedirs(output_path, exist_ok=True)
    original_result_file_path = os.path.join(output_path, 'combined_result_original.csv')
    sorted_result_file_path = os.path.join(output_path, 'combined_result_sorted.csv')

    # Load the (comparatively small) GWAS TSV fully into memory; coerce the
    # join keys to nullable integers so they align with the CSV key dtypes.
    tsv_data = pd.read_csv(directory_gwas_study_tsv, sep='\t')
    tsv_data['chromosome'] = pd.to_numeric(tsv_data['chromosome'], errors='coerce').astype('Int64')
    tsv_data['base_pair_location'] = pd.to_numeric(tsv_data['base_pair_location'], errors='coerce').astype('Int64')

    csv_columns_no_sad = ['alt', 'chr', 'pos', 'ref', 'snp']
    tsv_columns = ['beta', 'standard_error', 'effect_allele_frequency', 'p_value']
    # Columns to merge the files on.
    csv_match_columns = ['chr', 'pos']
    tsv_match_columns = ['chromosome', 'base_pair_location']

    # The header row is emitted only for the very first chunk written.
    headers_written = False
    with open(original_result_file_path, 'w') as f_out:
        csv_files = [f for f in os.listdir(directory_1000_genomes)
                     if f.endswith('.csv') and f != 'targets.csv']
        total_files = len(csv_files)
        for i, csv_file in enumerate(csv_files):
            csv_file_path = os.path.join(directory_1000_genomes, csv_file)
            print(f"Processing file {i+1}/{total_files}: {csv_file}")
            for chunk_index, chunk in enumerate(pd.read_csv(csv_file_path, chunksize=chunksize)):
                print(f"Processing chunk {chunk_index+1} of {csv_file}")
                # Keep only the SAD tracks + identifying columns, then average SAD.
                chunk_filtered = chunk[track_selection_list + csv_columns_no_sad].copy()
                chunk_filtered['average_SAD'] = chunk_filtered[track_selection_list].mean(axis=1)
                merged_chunk = chunk_filtered.merge(
                    tsv_data,
                    left_on=csv_match_columns,
                    right_on=tsv_match_columns,
                    how='left'
                )
                # Drop TSV columns that duplicate information already in the CSV.
                merged_chunk.drop(columns=['other_allele', 'chromosome', 'base_pair_location', 'effect_allele'], inplace=True)
                # Select and reorder the columns for output.
                output_columns = track_selection_list + csv_columns_no_sad + tsv_columns + ['average_SAD']
                merged_chunk = merged_chunk[output_columns]
                # FIX: 'mode' is meaningless when writing to an already-open
                # handle; a simple header toggle replaces the if/else pair.
                merged_chunk.to_csv(f_out, index=False, header=not headers_written)
                headers_written = True

    # Append the TSV rows whose (chr, pos) combination never matched a CSV row,
    # with null values for the CSV-derived columns.  Column mapping:
    # chromosome -> chr, base_pair_location -> pos,
    # other_allele -> alt, effect_allele -> ref.
    combined_data = pd.read_csv(original_result_file_path)
    missing_tsv_rows = tsv_data[~tsv_data.set_index(tsv_match_columns).index.isin(
        combined_data.set_index(csv_match_columns).index)]
    print("length of missing tsv rows", len(missing_tsv_rows))
    missing_tsv_rows = missing_tsv_rows.rename(columns={
        'chromosome': 'chr',
        'base_pair_location': 'pos',
        'other_allele': 'alt',
        'effect_allele': 'ref'
    })
    # Re-shape the missing rows into the combined file's exact column layout;
    # columns with no TSV counterpart stay NaN.
    missing_tsv_rows_formatted = pd.DataFrame(columns=combined_data.columns)
    for col in missing_tsv_rows.columns:
        if col in missing_tsv_rows_formatted.columns:
            missing_tsv_rows_formatted[col] = missing_tsv_rows[col]
    with open(original_result_file_path, 'a') as f_out:
        missing_tsv_rows_formatted.to_csv(f_out, index=False, header=False)
    # BUGFIX: the side-car file was written to a hard-coded relative path that
    # duplicated output_path; derive it from output_path instead so all
    # artifacts land in the same directory.
    missing_tsv_rows_formatted.to_csv(os.path.join(output_path, 'missing_tsv_rows.csv'), index=False)
    print(f"Original results written to {original_result_file_path}")

    # Sort by absolute average_SAD (unmatched rows with NaN go last) and
    # attach a 1-based ranking column.
    combined_data = pd.read_csv(original_result_file_path)
    combined_data['abs_average_SAD'] = combined_data['average_SAD'].abs()
    sorted_combined_data = combined_data.sort_values(by='abs_average_SAD', ascending=False, na_position='last')
    sorted_combined_data['ranking'] = range(1, len(sorted_combined_data) + 1)
    sorted_combined_data.to_csv(sorted_result_file_path, index=False)
    print(f"Sorted results with ranking written to {sorted_result_file_path}")
    return original_result_file_path, sorted_result_file_path
def main(directory_gwas_study_tsv, directory_1000_genomes, track_selection_list=None):
    """Run the CSV/TSV matching pipeline.

    Parameters
    ----------
    directory_gwas_study_tsv : str
        Path to the GWAS summary-statistics TSV.
    directory_1000_genomes : str
        Directory holding the 1000 Genomes CSV files.
    track_selection_list : list[str], optional
        SAD track columns to average.  Defaults to ``["SAD1", "SAD9"]``,
        preserving the previous hard-coded behavior.

    Returns
    -------
    tuple[str, str]
        Paths of the unsorted and sorted result CSVs, as produced by
        :func:`match_csv_tsv` (previously discarded into unused locals).
    """
    if track_selection_list is None:
        track_selection_list = ["SAD1", "SAD9"]
    return match_csv_tsv(directory_gwas_study_tsv, directory_1000_genomes, track_selection_list)
if __name__ == "__main__":
    # Expect exactly two CLI arguments: the GWAS TSV path followed by the
    # directory of 1000 Genomes CSVs; otherwise print usage and fall through.
    if len(sys.argv) == 3:
        gwas_tsv_path, genomes_dir = sys.argv[1], sys.argv[2]
        main(gwas_tsv_path, genomes_dir)
    else:
        print("Usage: python script.py <path for GWAS tsv> <directory for 1000 genomes csvs>")
    sys.exit()