Skip to content

Commit

Permalink
fix: securely create temp directory
Browse files Browse the repository at this point in the history
  • Loading branch information
imathews committed Nov 15, 2024
1 parent d42fd6f commit 730f999
Show file tree
Hide file tree
Showing 5 changed files with 66 additions and 39 deletions.
12 changes: 3 additions & 9 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,10 @@

For local development, clone this repository and then run

```py
source. / venv / bin / activate & & python
setup.py
develop - -user
```sh
source ./venv/bin/activate && python setup.py develop --user
# OR
source. / venv / bin / activate & & python3 - m
pip
install. & & REDIVIS_API_ENDPOINT = https: // localhost: 8443 / api / v1
python3 - W
ignore
source ./venv/bin/activate && python3 -m pip install . && REDIVIS_API_ENDPOINT=https://localhost:8443/api/v1 python3 -W ignore
```

You can then run the tests, e.g.:
Expand Down
2 changes: 1 addition & 1 deletion src/redivis/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.15.17"
__version__ = "0.15.18"
3 changes: 2 additions & 1 deletion src/redivis/classes/Notebook.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import os

from ..common.retryable_upload import perform_resumable_upload, perform_standard_upload
from ..common.util import get_tempdir


class Notebook(Base):
Expand All @@ -20,7 +21,7 @@ def __init__(
def create_output_table(
self, data=None, *, name=None, append=False, geography_variables=None
):
temp_file_path = f"/tmp/redivis/out/{uuid.uuid4()}"
temp_file_path = f"{get_tempdir()}/out/{uuid.uuid4()}"
try:
pathlib.Path(temp_file_path).parent.mkdir(exist_ok=True, parents=True)

Expand Down
15 changes: 4 additions & 11 deletions src/redivis/common/list_rows.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
import concurrent.futures
import tempfile
import uuid
import os
import pathlib
from contextlib import closing
from ..classes.Row import Row
from tqdm.auto import tqdm
import shutil
from ..common.api_request import make_request
from .util import get_tempdir
from .api_request import make_request
from threading import Event

MAX_PARALLELIZATION = 8
Expand Down Expand Up @@ -145,7 +145,7 @@ def list_rows(
coerce_schema=coerce_schema,
)

folder = pathlib.Path("/").joinpath(
folder = pathlib.Path().joinpath(
get_tempdir(),
"tables",
f"{uuid.uuid4()}",
Expand Down Expand Up @@ -228,7 +228,7 @@ def list_rows(
# Make sure we no longer remove the folder in the finally clause after making this change
# Create the Parquet base directory
parquet_base_dir = str(
pathlib.Path("/")
pathlib.Path()
.joinpath(
get_tempdir(),
"tables",
Expand Down Expand Up @@ -410,13 +410,6 @@ def process_stream(
os.remove(os_file)


def get_tempdir():
    """Return the per-user path used for temporary files.

    The base directory is ``REDIVIS_TMPDIR`` when that variable is set to a
    non-empty value, otherwise ``tempfile.gettempdir()``. The result is
    ``<base>/redivis_<user>``, where ``<user>`` is the first non-empty value
    of ``USER``, ``USERNAME``, or the numeric uid.

    This function only builds the path string; it does not create the
    directory.

    Returns:
        str: The temporary directory path.
    """
    # Chain with `or` so that an empty USER still falls back to USERNAME;
    # `environ.get("USER", default)` would only use the default when USER
    # is missing altogether.
    user_suffix = os.environ.get("USER") or os.environ.get("USERNAME") or os.getuid()
    base_dir = os.getenv("REDIVIS_TMPDIR") or tempfile.gettempdir()
    return f"{base_dir}/redivis_{user_suffix}"


def format_tuple_type(val, type):
if val is None:
return val
Expand Down
73 changes: 56 additions & 17 deletions src/redivis/common/util.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
import os
import tempfile
import atexit
import shutil
import pathlib


def get_geography_variable(variables, geography_variable_name):
if geography_variable_name == "":
for variable in variables:
Expand All @@ -16,36 +23,68 @@ def get_geography_variable(variables, geography_variable_name):


def get_warning(kind):
    """Return the user-facing warning text for a warning identifier.

    Args:
        kind: Warning identifier. ``"dataframe_deprecation"`` and
            ``"geodataframe_deprecation"`` have dedicated messages.

    Returns:
        str: The message for ``kind``, or the generic ``"WARNING"`` for any
        other value.
    """
    if kind == "dataframe_deprecation":
        return 'The to_dataframe() method is deprecated, and has been superceded by to_pandas_dataframe().\nBy default, this new method uses the "pyarrow" dtype_backend, which is more performant and will generally work with existing code.\nTo replicate historic behavior, use to_pandas_dataframe(dtype_backend="numpy").'
    if kind == "geodataframe_deprecation":
        return "Please use the to_geopandas_dataframe() method to ensure future compatability."
    return "WARNING"


created_temp_dir = None


def arrow_table_to_pandas(arrow_table, dtype_backend, date_as_object, max_parallelization):
def rm_tempdir():
    """Delete the directory recorded in ``created_temp_dir``, if there is one.

    ``get_tempdir()`` registers this function with ``atexit``. Removal is
    best-effort: errors raised while deleting are ignored.
    """
    global created_temp_dir
    if created_temp_dir is None:
        # Nothing was created, so there is nothing to pass to rmtree.
        return
    shutil.rmtree(created_temp_dir, ignore_errors=True)
    # Forget the deleted path so that it is not handed out again.
    created_temp_dir = None


def get_tempdir():
    """Return the base directory under which temporary files are written.

    Two modes:

    * ``REDIVIS_TMPDIR`` set to a non-empty value: returns
      ``<REDIVIS_TMPDIR>/redivis_<user>`` (joined onto ``/``), where
      ``<user>`` is the first non-empty value of ``USER``, ``USERNAME``, or
      the numeric uid. The directory is not created here.
    * Otherwise: on the first call a fresh directory is created with
      ``tempfile.mkdtemp()``, remembered in the module-level
      ``created_temp_dir``, and ``rm_tempdir`` is registered with ``atexit``.
      Later calls return the remembered path.

    Returns:
        str: The temporary directory path.
    """
    global created_temp_dir

    configured_root = os.getenv("REDIVIS_TMPDIR")
    if configured_root:
        # Chain with `or` so that an empty USER still falls back to
        # USERNAME; `environ.get("USER", default)` would only use the
        # default when USER is missing altogether.
        user_suffix = (
            os.environ.get("USER") or os.environ.get("USERNAME") or os.getuid()
        )
        return str(
            pathlib.Path("/").joinpath(
                configured_root,
                f"redivis_{user_suffix}",
            )
        )

    if created_temp_dir is None:
        created_temp_dir = tempfile.mkdtemp()
        atexit.register(rm_tempdir)

    return created_temp_dir


def arrow_table_to_pandas(
    arrow_table, dtype_backend, date_as_object, max_parallelization
):
    """Convert a pyarrow table to a pandas DataFrame with the chosen dtype backend.

    Args:
        arrow_table: The table to convert. It is converted with
            ``self_destruct=True``, so treat it as consumed afterwards.
        dtype_backend: One of ``"numpy"``, ``"numpy_nullable"`` or
            ``"pyarrow"``.
        date_as_object: Forwarded to ``to_pandas()`` for the ``"numpy"`` and
            ``"numpy_nullable"`` backends; not used for ``"pyarrow"``.
        max_parallelization: Value passed to ``pyarrow.set_cpu_count`` and
            ``pyarrow.set_io_thread_count`` before converting.

    Returns:
        The resulting pandas DataFrame.

    Raises:
        ValueError: If ``dtype_backend`` is not one of the supported values.
    """
    # Validate first so that a bad argument fails before the pyarrow thread
    # pool settings are changed.
    if dtype_backend not in ("numpy", "numpy_nullable", "pyarrow"):
        raise ValueError(
            "Unknown dtype_backend. Must be one of 'pyarrow'|'numpy_nullable'|'numpy'. Default is 'pyarrow'"
        )

    import pandas as pd
    import pyarrow as pa

    pa.set_cpu_count(max_parallelization)
    pa.set_io_thread_count(max_parallelization)

    if dtype_backend == "numpy_nullable":
        # Map the common arrow types onto pandas' nullable extension dtypes;
        # arrow types missing from the mapping make `.get` return None.
        nullable_dtypes = {
            pa.int64(): pd.Int64Dtype(),
            pa.bool_(): pd.BooleanDtype(),
            pa.float64(): pd.Float64Dtype(),
            pa.string(): pd.StringDtype(),
        }
        df = arrow_table.to_pandas(
            self_destruct=True,
            date_as_object=date_as_object,
            types_mapper=nullable_dtypes.get,
        )
    elif dtype_backend == "pyarrow":
        df = arrow_table.to_pandas(self_destruct=True, types_mapper=pd.ArrowDtype)
    else:
        df = arrow_table.to_pandas(self_destruct=True, date_as_object=date_as_object)

    return df

0 comments on commit 730f999

Please sign in to comment.