diff --git a/Dockerfile b/Dockerfile index 135aada01..347794544 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,7 +14,7 @@ RUN apt-get update \ # install 3rd-party system dependencies RUN apt-get update \ - && apt-get install ffmpeg libsm6 libxext6 software-properties-common build-essential cmake -y + && apt-get install ffmpeg libsm6 libxext6 software-properties-common build-essential cmake gfortran libopenblas-dev liblapack-dev -y # prepare the java env WORKDIR /opt @@ -33,11 +33,7 @@ WORKDIR /data-juicer RUN pip install --upgrade setuptools==69.5.1 setuptools_scm \ && pip install git+https://github.com/xinyu1205/recognize-anything.git --default-timeout 1000 -# install requirements first to better reuse installed library cache -COPY environments/ environments/ -RUN cat environments/* | grep -v '^#' | xargs pip install --default-timeout 1000 - # install data-juicer then COPY . . -RUN pip install -v -e .[all] -RUN pip install -v -e .[sandbox] +RUN pip install -v -e .[all] --default-timeout 1000 +RUN pip install -v -e .[sandbox] --default-timeout 1000 diff --git a/README.md b/README.md index 32b5b0d1f..eb34e17ba 100644 --- a/README.md +++ b/README.md @@ -163,7 +163,7 @@ Table of Contents ## Prerequisites -- Recommend Python>=3.8,<=3.10 +- Recommend Python>=3.9,<=3.10 - gcc >= 5 (at least C++14 support) ## Installation @@ -386,6 +386,10 @@ python tools/sandbox_starter.py --config configs/demo/sandbox/sandbox.yaml ```shell # run the data processing directly docker run --rm \ # remove container after the processing + --privileged \ + --shm-size 256g \ + --network host \ + --gpus all \ --name dj \ # name of the container -v : \ # mount data or config directory into the container -v ~/.cache/:/root/.cache/ \ # mount the cache directory into the container to reuse caches and models (recommended) @@ -398,6 +402,10 @@ docker run --rm \ # remove container after the processing ```shell # start the container docker run -dit \ # run the container in the background + --privileged \ + --shm-size 256g \ + --network host \ + --gpus all \ --rm \ --name dj \ -v : \ diff --git a/README_ZH.md b/README_ZH.md index 03e349547..905a4e1a2 100644 --- a/README_ZH.md +++ b/README_ZH.md @@ -144,7 +144,7 @@ Data-Juicer正在积极更新和维护中,我们将定期强化和新增更多 ## 前置条件 -* 推荐 Python>=3.8,<=3.10 +* 推荐 Python>=3.9,<=3.10 * gcc >= 5 (at least C++14 support) ## 安装 @@ -363,6 +363,10 @@ python tools/sandbox_starter.py --config configs/demo/sandbox/sandbox.yaml ```shell # 直接运行数据处理 docker run --rm \ # 在处理结束后将容器移除 + --privileged \ + --shm-size 256g \ + --network host \ + --gpus all \ --name dj \ # 容器名称 -v : \ # 将本地的数据或者配置目录挂载到容器中 -v ~/.cache/:/root/.cache/ \ # 将 cache 目录挂载到容器以复用 cache 和模型资源(推荐) @@ -375,6 +379,10 @@ docker run --rm \ # 在处理结束后将容器移除 ```shell # 启动容器 docker run -dit \ # 在后台启动容器 + --privileged \ + --shm-size 256g \ + --network host \ + --gpus all \ --rm \ --name dj \ -v : \ diff --git a/environments/minimal_requires.txt b/environments/minimal_requires.txt index 2db669cde..7d37959fe 100644 --- a/environments/minimal_requires.txt +++ b/environments/minimal_requires.txt @@ -1,7 +1,7 @@ -fsspec==2023.5.0 -pyarrow<=12.0.0 -pandas==2.0.3 datasets>=2.19.0 +fsspec==2023.5.0 +pandas +numpy av soundfile librosa>=0.10 @@ -27,6 +27,5 @@ dill==0.3.4 psutil pydantic>=2.0 Pillow -numpy<2 fastapi[standard]>=0.100 httpx