Skip to content

Commit

Permalink
Adding parquet utils
Browse files Browse the repository at this point in the history
  • Loading branch information
jloveric committed May 17, 2024
1 parent bcb1fcc commit 66f112c
Show file tree
Hide file tree
Showing 5 changed files with 39 additions and 10 deletions.
18 changes: 18 additions & 0 deletions parquet_shrinker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import pandas as pd
import click


@click.command()
@click.option("--file", help="Parquet file to read.", required=True)
@click.option("--newfile", help="File to write out.", required=True)
@click.option(
    "--rows",
    # A negative count makes DataFrame.head() return "all but the last
    # |rows|" rows, which contradicts the help text below, so reject it
    # during argument parsing instead of silently writing the wrong slice.
    type=click.IntRange(min=0),
    help="Number of rows to write out.",
    required=True,
)
def run(file: str, newfile: str, rows: int) -> None:
    """Write the first ROWS rows of a parquet file to a new parquet file.

    Reads FILE into a pandas DataFrame, keeps the leading ROWS rows, and
    writes them to NEWFILE. If FILE holds fewer than ROWS rows, every row
    is written.
    """
    # The whole input is loaded before it is truncated; only the head is
    # written back out.
    df = pd.read_parquet(file)
    small_df = df.head(rows)
    small_df.to_parquet(newfile)
    print(f"finished writing {newfile}")


if __name__ == "__main__":
    # click parses sys.argv and supplies the keyword arguments.
    run()
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@ black = "^22.3.0"
pytest = "^7.1.2"
tensorboard = "^2.9.1"

[tool.poetry.group.dev.dependencies]
click = "^8.1.7"

[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
Binary file added test_data/test.parquet
Binary file not shown.
25 changes: 17 additions & 8 deletions tests/test_single_image_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,13 @@
image_to_dataset,
ImageNeighborhoodReader,
image_neighborhood_dataset,
Text2ImageDataset
Text2ImageDataset,
Text2ImageRenderDataset,
)
import torch
from torch.utils.data import Dataset, DataLoader



def test_neighborhood_dataset():
factor = 11
size = factor * 9
Expand All @@ -36,12 +36,21 @@ def test_image_neighborhood_reader():
assert ind.lasty == 27
assert ind.image.shape == torch.Size([3, 32, 32])


def test_parquet_dataset():
    """Smoke-test that Text2ImageDataset yields a batch through a DataLoader.

    Uses the small parquet fixture at ``test_data/test.parquet`` rather than
    a full training shard.
    """
    dataset = Text2ImageDataset(filenames=["test_data/test.parquet"])
    dataloader = DataLoader(dataset, batch_size=2, shuffle=False)

    # Only the first batch is needed. next() raises StopIteration when the
    # dataset yields nothing, so an empty fixture fails the test instead of
    # passing silently as a for/break loop would.
    # NOTE(review): earlier commented-out code suggests a batch unpacks as
    # (caption, position, rgb) - confirm before asserting on its contents.
    batch = next(iter(dataloader))
    print("batch", batch)


def test_text_to_image_sampler_dataloader():
    """Check that Text2ImageRenderDataset can be built from the parquet fixture."""
    fixture_files = ["test_data/test.parquet"]
    # Construction only: nothing is iterated or asserted on yet.
    render_dataset = Text2ImageRenderDataset(filenames=fixture_files)
3 changes: 1 addition & 2 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
from omegaconf import DictConfig
from high_order_implicit_representation.networks import Net


def test_generate_sample():
sample_points = 10
input_features = 70
Expand Down Expand Up @@ -64,4 +63,4 @@ def test_generate_sample():
)
assert len(results) == 3
ans = [result.shape == torch.Size([3, 64, 64]) for result in results]
assert all(ans) is True
assert all(ans) is True

0 comments on commit 66f112c

Please sign in to comment.