Skip to content

Commit

Permalink
Merge pull request #875 from pavanvidem/hic-new-features
Browse files Browse the repository at this point in the history
add new little requested features
  • Loading branch information
bgruening authored Apr 24, 2024
2 parents 879650f + c416e9a commit 67a4f37
Show file tree
Hide file tree
Showing 13 changed files with 242 additions and 50 deletions.
60 changes: 41 additions & 19 deletions hicexplorer/chicAggregateStatistic.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,20 +63,31 @@ def parse_arguments(args=None):
return parser


def filter_scores_target_list(pScoresDictionary, pTargetList=None, pTargetIntervalTree=None, pTargetFile=None):
def filter_scores_target_list(pScoresDictionary, pTargetFType, pTargetPosDict, pTargetList=None, pTargetIntervalTree=None, pTargetFile=None):

accepted_scores = {}
same_target_dict = {}
target_regions_intervaltree = None
if pTargetList is not None:

# newly added
if pTargetFType == 'hdf5':
# read hdf content for this specific combination
targetFileHDF5Object = h5py.File(pTargetFile, 'r')
target_object = targetFileHDF5Object['/'.join(pTargetList)]
chromosome = target_object.get('chromosome')[()].decode("utf-8")
start_list = list(target_object['start_list'][:])
end_list = list(target_object['end_list'][:])
targetFileHDF5Object.close()
elif pTargetFType == 'bed4':
chromosome = pTargetPosDict[pTargetList[-1]]['chromosome']
start_list = pTargetPosDict[pTargetList[-1]]['start_list']
end_list = pTargetPosDict[pTargetList[-1]]['end_list']
elif pTargetFType == 'bed3':
target_regions_intervaltree = pTargetIntervalTree
else:
log.error('No target list given.')
raise Exception('No target list given.')

if pTargetList is not None:
chromosome = [chromosome] * len(start_list)

target_regions = list(zip(chromosome, start_list, end_list))
Expand All @@ -85,12 +96,6 @@ def filter_scores_target_list(pScoresDictionary, pTargetList=None, pTargetInterv

hicmatrix = hm.hiCMatrix()
target_regions_intervaltree = hicmatrix.intervalListToIntervalTree(target_regions)[0]
elif pTargetIntervalTree is not None:
target_regions_intervaltree = pTargetIntervalTree

else:
log.error('No target list given.')
raise Exception('No target list given.')

for key in pScoresDictionary:
chromosome = pScoresDictionary[key][0]
Expand Down Expand Up @@ -193,12 +198,12 @@ def writeAggregateHDF(pOutFileName, pOutfileNamesList, pAcceptedScoresList, pArg
aggregateFileH5Object.close()


def run_target_list_compilation(pInteractionFilesList, pTargetList, pArgs, pViewpointObj, pQueue=None, pOneTarget=False):
def run_target_list_compilation(pInteractionFilesList, pTargetList, pTargetFType, pTargetPosDict, pArgs, pViewpointObj, pQueue=None, pOneTarget=False):
outfile_names_list = []
accepted_scores_list = []
target_regions_intervaltree = None
try:
if pOneTarget == True:
if pTargetFType == 'bed3':
try:
target_regions = utilities.readBed(pTargetList)
except Exception as exp:
Expand All @@ -211,14 +216,13 @@ def run_target_list_compilation(pInteractionFilesList, pTargetList, pArgs, pView
outfile_names_list_intern = []
accepted_scores_list_intern = []
for sample in interactionFile:

interaction_data, interaction_file_data, _ = pViewpointObj.readInteractionFile(pArgs.interactionFile, sample)
if pOneTarget == True:
target_file = None
else:
target_file = pTargetList[i]

accepted_scores = filter_scores_target_list(interaction_file_data, pTargetList=target_file, pTargetIntervalTree=target_regions_intervaltree, pTargetFile=pArgs.targetFile)
accepted_scores = filter_scores_target_list(interaction_file_data, pTargetFType, pTargetPosDict, pTargetList=target_file, pTargetIntervalTree=target_regions_intervaltree, pTargetFile=pArgs.targetFile)

outfile_names_list_intern.append(sample)
accepted_scores_list_intern.append(accepted_scores)
Expand All @@ -238,7 +242,7 @@ def run_target_list_compilation(pInteractionFilesList, pTargetList, pArgs, pView
return


def call_multi_core(pInteractionFilesList, pTargetFileList, pFunctionName, pArgs, pViewpointObj):
def call_multi_core(pInteractionFilesList, pTargetFileList, pTargetFType, pTargetPosDict, pFunctionName, pArgs, pViewpointObj):
if len(pInteractionFilesList) < pArgs.threads:
pArgs.threads = len(pInteractionFilesList)
outfile_names_list = [None] * pArgs.threads
Expand Down Expand Up @@ -272,6 +276,8 @@ def call_multi_core(pInteractionFilesList, pTargetFileList, pFunctionName, pArgs
process[i] = Process(target=pFunctionName, kwargs=dict(
pInteractionFilesList=interactionFileListThread,
pTargetList=targetFileListThread,
pTargetFType=pTargetFType,
pTargetPosDict=pTargetPosDict,
pArgs=pArgs,
pViewpointObj=pViewpointObj,
pQueue=queue[i],
Expand Down Expand Up @@ -318,16 +324,32 @@ def main(args=None):

targetList = []
present_genes = {}
target_ftype = ''
targetPosDict = None
# read hdf file
interactionFileHDF5Object = h5py.File(args.interactionFile, 'r')
keys_interactionFile = list(interactionFileHDF5Object.keys())

if h5py.is_hdf5(args.targetFile):

targetDict, present_genes = viewpointObj.readTargetHDFFile(args.targetFile)
target_ftype = 'hdf5'

else:
targetList = [args.targetFile]
with open(args.targetFile) as file:
for line in file.readlines():
if line.startswith('#'):
continue
_line = line.strip().split('\t')
break
if len(_line) == 4:
log.info('Targets BED contains 4 columns, aggregating on column 4')
target_ftype = 'bed4'
present_genes, targetDict, targetPosDict = utilities.readTargetBed(args.targetFile)
elif len(_line) == 3:
targetList = [args.targetFile]
target_ftype = 'bed3'
else:
log.error('BED of targets list must have 3 or 4 columns')

if len(keys_interactionFile) > 1:

Expand All @@ -346,7 +368,7 @@ def main(args=None):
geneList2 = sorted(list(matrix_obj2[chromosome2].keys()))

for gene1, gene2 in zip(geneList1, geneList2):
if h5py.is_hdf5(args.targetFile):
if target_ftype != 'bed3':
if gene1 in present_genes[sample][sample2]:
interactionDict[gene1] = [[sample, chromosome1, gene1], [sample2, chromosome2, gene2]]
else:
Expand All @@ -356,7 +378,7 @@ def main(args=None):

interactionFileHDF5Object.close()

if h5py.is_hdf5(args.targetFile):
if target_ftype != 'bed3':
key_outer_matrix = present_genes.keys()
for keys_outer in key_outer_matrix:
keys_inner_matrix = present_genes[keys_outer].keys()
Expand All @@ -365,5 +387,5 @@ def main(args=None):
interactionList.append(interactionDict[gene])
targetList.append(targetDict[gene])

outfile_names_list, accepted_scores_list = call_multi_core(interactionList, targetList, run_target_list_compilation, args, viewpointObj)
outfile_names_list, accepted_scores_list = call_multi_core(interactionList, targetList, target_ftype, targetPosDict, run_target_list_compilation, args, viewpointObj)
writeAggregateHDF(args.outFileName, outfile_names_list, accepted_scores_list, args)
12 changes: 11 additions & 1 deletion hicexplorer/hicCorrectMatrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,9 @@ def correct_subparser():
'of chromosomes and/or translocations.',
action='store_true')

parserOpt.add_argument('--filteredBed',
help='Print bins filtered our by --filterThreshold to this file')

parserOpt.add_argument('--verbose',
help='Print processing status.',
action='store_true')
Expand Down Expand Up @@ -548,6 +551,7 @@ def filter_by_zscore(hic_ma, lower_threshold, upper_threshold, perchr=False):
to avoid introducing bias due to different chromosome numbers
"""
log.info("filtering by z-score")
to_remove = []
if perchr:
for chrname in list(hic_ma.interval_trees):
Expand Down Expand Up @@ -575,6 +579,7 @@ def filter_by_zscore(hic_ma, lower_threshold, upper_threshold, perchr=False):
"\n".format(chrname, lower_threshold, upper_threshold))

to_remove.extend(problematic)

else:
row_sum = np.asarray(hic_ma.matrix.sum(axis=1)).flatten()
# subtract from row sum, the diagonal
Expand All @@ -584,7 +589,6 @@ def filter_by_zscore(hic_ma, lower_threshold, upper_threshold, perchr=False):
mad = MAD(row_sum)
to_remove = np.flatnonzero(mad.is_outlier(
lower_threshold, upper_threshold))

return sorted(to_remove)


Expand Down Expand Up @@ -658,6 +662,12 @@ def main(args=None):
restore_masked_bins=False)

assert matrix_shape == ma.matrix.shape

if args.filteredBed:
with open(args.filteredBed, 'w') as f:
for outlier_region in set(outlier_regions):
interval = ma.cut_intervals[outlier_region]
f.write(f"{interval[0]}\t{interval[1]}\t{interval[2]}\t.\t{interval[3]}\t.\n")
# mask filtered regions
ma.maskBins(outlier_regions)
total_filtered_out = set(outlier_regions)
Expand Down
76 changes: 58 additions & 18 deletions hicexplorer/test/general/test_chicAggregateStatistic.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,41 +113,81 @@ def test_regular_mode_threads():
aggregateFileH5Object.close()


def test_target_list():
def test_target_list_bed3():
outfile_aggregate = NamedTemporaryFile(suffix='.hdf5', delete=False)
outfile_aggregate.close()
args = "--interactionFile {} --targetFile {} --outFileName {} \
-t {}".format(ROOT + 'chicViewpoint/two_matrices.hdf5',
ROOT + 'chicAggregateStatistic/target_list.bed',
-t {}".format(ROOT + 'chicViewpoint/two_matrices_custom_keys.hdf5',
ROOT + 'chicAggregateStatistic/target_list_3col.bed',
outfile_aggregate.name, 10).split()
chicAggregateStatistic.main(args)

aggregateFileH5Object = h5py.File(outfile_aggregate.name, 'r')
assert 'FL-E13-5_chr1_MB-E10-5_chr1' in aggregateFileH5Object
assert 'FL-E13-5_chr1' in aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']
assert 'MB-E10-5_chr1' in aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']
assert 'c_adj_norm_t_adj_norm' in aggregateFileH5Object
assert 'c_adj_norm' in aggregateFileH5Object['c_adj_norm_t_adj_norm']
assert 't_adj_norm' in aggregateFileH5Object['c_adj_norm_t_adj_norm']

assert 'genes' in aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']['FL-E13-5_chr1']
assert 'genes' in aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']['MB-E10-5_chr1']
assert 'genes' in aggregateFileH5Object['c_adj_norm_t_adj_norm']['c_adj_norm']
assert 'genes' in aggregateFileH5Object['c_adj_norm_t_adj_norm']['t_adj_norm']
assert len(aggregateFileH5Object) == 1
assert aggregateFileH5Object.attrs['type'] == 'aggregate'

for chromosome in aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']['FL-E13-5_chr1']:
for chromosome in aggregateFileH5Object['c_adj_norm_t_adj_norm']['c_adj_norm']:

assert len(aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']['FL-E13-5_chr1'][chromosome]) == 3
assert len(aggregateFileH5Object['c_adj_norm_t_adj_norm']['c_adj_norm'][chromosome]) == 3

for gene in aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']['FL-E13-5_chr1'][chromosome]:
assert len(aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']['FL-E13-5_chr1'][chromosome][gene]) == 7
for data in aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']['FL-E13-5_chr1'][chromosome][gene]:
for gene in aggregateFileH5Object['c_adj_norm_t_adj_norm']['c_adj_norm'][chromosome]:
assert len(aggregateFileH5Object['c_adj_norm_t_adj_norm']['c_adj_norm'][chromosome][gene]) == 7
for data in aggregateFileH5Object['c_adj_norm_t_adj_norm']['c_adj_norm'][chromosome][gene]:
assert data in ['chromosome', 'end_list', 'gene_name', 'raw_target_list', 'relative_distance_list', 'start_list', 'sum_of_interactions']

for chromosome in aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']['MB-E10-5_chr1']:
for chromosome in aggregateFileH5Object['c_adj_norm_t_adj_norm']['t_adj_norm']:

assert len(aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']['MB-E10-5_chr1'][chromosome]) == 3
assert len(aggregateFileH5Object['c_adj_norm_t_adj_norm']['t_adj_norm'][chromosome]) == 3

for gene in aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']['MB-E10-5_chr1'][chromosome]:
assert len(aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']['MB-E10-5_chr1'][chromosome][gene]) == 7
for data in aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']['MB-E10-5_chr1'][chromosome][gene]:
for gene in aggregateFileH5Object['c_adj_norm_t_adj_norm']['t_adj_norm'][chromosome]:
assert len(aggregateFileH5Object['c_adj_norm_t_adj_norm']['t_adj_norm'][chromosome][gene]) == 7
for data in aggregateFileH5Object['c_adj_norm_t_adj_norm']['t_adj_norm'][chromosome][gene]:
assert data in ['chromosome', 'end_list', 'gene_name', 'raw_target_list', 'relative_distance_list', 'start_list', 'sum_of_interactions']

aggregateFileH5Object.close()


def test_target_list_bed4():
outfile_aggregate = NamedTemporaryFile(suffix='.hdf5', delete=False)
outfile_aggregate.close()
args = "--interactionFile {} --targetFile {} --outFileName {} \
-t {}".format(ROOT + 'chicViewpoint/two_matrices_custom_keys.hdf5',
ROOT + 'chicAggregateStatistic/target_list_4col.bed',
outfile_aggregate.name, 10).split()
chicAggregateStatistic.main(args)

aggregateFileH5Object = h5py.File(outfile_aggregate.name, 'r')
assert 'c_adj_norm_t_adj_norm' in aggregateFileH5Object
assert 'c_adj_norm' in aggregateFileH5Object['c_adj_norm_t_adj_norm']
assert 't_adj_norm' in aggregateFileH5Object['c_adj_norm_t_adj_norm']

assert 'genes' in aggregateFileH5Object['c_adj_norm_t_adj_norm']['c_adj_norm']
assert 'genes' in aggregateFileH5Object['c_adj_norm_t_adj_norm']['t_adj_norm']
assert len(aggregateFileH5Object) == 1
assert aggregateFileH5Object.attrs['type'] == 'aggregate'

for chromosome in aggregateFileH5Object['c_adj_norm_t_adj_norm']['c_adj_norm']:

assert len(aggregateFileH5Object['c_adj_norm_t_adj_norm']['c_adj_norm'][chromosome]) == 3

for gene in aggregateFileH5Object['c_adj_norm_t_adj_norm']['c_adj_norm'][chromosome]:
assert len(aggregateFileH5Object['c_adj_norm_t_adj_norm']['c_adj_norm'][chromosome][gene]) == 7
for data in aggregateFileH5Object['c_adj_norm_t_adj_norm']['c_adj_norm'][chromosome][gene]:
assert data in ['chromosome', 'end_list', 'gene_name', 'raw_target_list', 'relative_distance_list', 'start_list', 'sum_of_interactions']

for chromosome in aggregateFileH5Object['c_adj_norm_t_adj_norm']['t_adj_norm']:

assert len(aggregateFileH5Object['c_adj_norm_t_adj_norm']['t_adj_norm'][chromosome]) == 3

for gene in aggregateFileH5Object['c_adj_norm_t_adj_norm']['t_adj_norm'][chromosome]:
assert len(aggregateFileH5Object['c_adj_norm_t_adj_norm']['t_adj_norm'][chromosome][gene]) == 7
for data in aggregateFileH5Object['c_adj_norm_t_adj_norm']['t_adj_norm'][chromosome][gene]:
assert data in ['chromosome', 'end_list', 'gene_name', 'raw_target_list', 'relative_distance_list', 'start_list', 'sum_of_interactions']

aggregateFileH5Object.close()
18 changes: 16 additions & 2 deletions hicexplorer/test/general/test_hicCorrectMatrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,21 +16,35 @@
os.path.dirname(os.path.abspath(__file__))), "test_data/")


def are_files_equal(file1, file2, pDifference=1):
with open(file1) as textfile1, open(file2) as textfile2:
for x, y in zip(textfile1, textfile2):
if x != y:
count = sum(1 for a, b in zip(x, y) if a != b)
if count > pDifference:
return False
return True


def test_correct_matrix_ICE():
outfile = NamedTemporaryFile(suffix='.ICE.h5', delete=False)
outfile.close()

outfile_filtered = NamedTemporaryFile(suffix='.bed', delete=True)

args = "correct --matrix {} --correctionMethod ICE --chromosomes "\
"chrUextra chr3LHet --iterNum 500 --outFileName {} "\
"chrUextra chr3LHet --iterNum 500 --outFileName {} --filteredBed {} "\
"--filterThreshold -1.5 5.0".format(ROOT + "small_test_matrix.h5",
outfile.name).split()
outfile.name,
outfile_filtered.name).split()
# hicCorrectMatrix.main(args)
compute(hicCorrectMatrix.main, args, 5)
test = hm.hiCMatrix(
ROOT + "hicCorrectMatrix/small_test_matrix_ICEcorrected_chrUextra_chr3LHet.h5")
new = hm.hiCMatrix(outfile.name)
nt.assert_equal(test.matrix.data, new.matrix.data)
nt.assert_equal(test.cut_intervals, new.cut_intervals)
assert are_files_equal(outfile_filtered.name, ROOT + 'hicCorrectMatrix/filtered.bed')

os.unlink(outfile.name)

Expand Down
Binary file not shown.

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
chr1 4480000 4549000
chr1 4555000 4688000
chr1 14274000 14279000
chr1 14290000 14439000
chr1 14444000 14467000
chr1 14476000 14501000
chr1 19077000 19118000
chr1 19120000 19274000
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
chr1 4487335 4487535 Sox17
chr1 14300180 14300380 Eya1
chr1 19093003 19093203 Tfap2d
Binary file not shown.
Loading

0 comments on commit 67a4f37

Please sign in to comment.