Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
benjeffery committed Dec 18, 2023
1 parent 070e476 commit aaa51bb
Showing 1 changed file with 23 additions and 1 deletion.
24 changes: 23 additions & 1 deletion sgkit/io/vcf/vcf_reader.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import psutil
import os
import functools
import itertools
import re
Expand Down Expand Up @@ -84,6 +86,13 @@
]


def print_allocation(name, length, dtype):
print(f"{name}: {(np.dtype(dtype).itemsize * length) / (1024 ** 2):.2f} MB")

def get_memory_usage():
process = psutil.Process(os.getpid())
return process.memory_info().rss / (1024 * 1024) # Return memory usage in MB

class FloatFormatFieldWarning(UserWarning):
"""Warning for VCF FORMAT float fields, which can use a lot of storage."""

Expand Down Expand Up @@ -240,6 +249,7 @@ def for_field(
dims.append(dimension)
chunksize += (size,)

print_allocation(variable_name, np.prod(chunksize), dtype)
array = np.full(chunksize, fill_value, dtype=dtype)

return InfoAndFormatFieldHandler(
Expand Down Expand Up @@ -362,11 +372,13 @@ def __init__(
self.truncate_calls = truncate_calls
self.max_alt_alleles = max_alt_alleles
self.fill = -2 if self.mixed_ploidy else -1
print_allocation("call_genotype", chunk_length * n_sample * ploidy, smallest_numpy_int_dtype(max_alt_alleles))
self.call_genotype = np.full(
(chunk_length, n_sample, ploidy),
self.fill,
dtype=smallest_numpy_int_dtype(max_alt_alleles),
)
print_allocation("call_genotype_phased", chunk_length * n_sample, "bool")
self.call_genotype_phased = np.full((chunk_length, n_sample), 0, dtype=bool)

def add_variant(self, i: int, variant: Any) -> None:
Expand Down Expand Up @@ -432,8 +444,13 @@ def vcf_to_zarr_sequential(
) -> None:
if read_chunk_length is None:
read_chunk_length = chunk_length

memory_before = get_memory_usage()
with open_vcf(input) as vcf:
print(f"Opening VCF: {get_memory_usage() - memory_before:.2f} MB")
print_allocation("samples", len(vcf.samples), "O")
sample_id = np.array(vcf.samples, dtype="O")

n_allele = max_alt_alleles + 1

variant_contig_names = vcf.seqnames
Expand All @@ -459,7 +476,9 @@ def vcf_to_zarr_sequential(
variants = vcf(region)

variant_contig_dtype = smallest_numpy_int_dtype(len(variant_contig_names))
print_allocation("variant_contig", read_chunk_length, variant_contig_dtype)
variant_contig = np.empty(read_chunk_length, dtype=variant_contig_dtype)
print_allocation("variant_position", read_chunk_length, "i4")
variant_position = np.empty(read_chunk_length, dtype="i4")

fields = fields or ["FORMAT/GT"] # default to GT as the only extra field
Expand Down Expand Up @@ -488,7 +507,9 @@ def vcf_to_zarr_sequential(
):
variant_ids = []
variant_alleles = []
print_allocation("variant_quality", read_chunk_length, "f4")
variant_quality = np.empty(read_chunk_length, dtype="f4")
print_allocation("variant_filter", read_chunk_length*len(filters), "bool")
variant_filter = np.full(
(read_chunk_length, len(filters)), False, dtype="bool"
)
Expand Down Expand Up @@ -533,12 +554,13 @@ def vcf_to_zarr_sequential(

for field_handler in field_handlers:
field_handler.truncate_array(i + 1)

print_allocation("variant_id", len(variant_ids), "O")
variant_id = np.array(variant_ids, dtype="O")
variant_id_mask = variant_id == "."
if len(variant_alleles) == 0:
variant_allele = np.empty((0, n_allele), dtype="O")
else:
print_allocation("variant_allele", len(variant_alleles), "O")
variant_allele = np.array(variant_alleles, dtype="O")

ds: xr.Dataset = create_genotype_call_dataset(
Expand Down

0 comments on commit aaa51bb

Please sign in to comment.