Pin the XGBoost build to a fixed commit; install LightGBM and CatBoost from PyPI.

Dockerfile
  * XGB_HASH selects the xgboost commit that is checked out and built. Its
    default is declared before FROM and the name is re-declared (without a
    value) right before the xgboost RUN step. An ARG declared before FROM is
    outside every build stage, so without the re-declaration $XGB_HASH would
    expand to nothing inside RUN and `git checkout` would leave the clone on
    its default branch. The variable is quoted so an empty value fails the
    build instead of being silently ignored.
  * The from-source LightGBM (OpenCL) and CatBoost builds are replaced with
    `pip install lightgbm` / `pip install catboost`.
  * `git log` of the checked-out tree is written to /opt/xgboost/build/xgb_log.txt.

datasets.py
  * prepare_bosch reads the CSV without index_col, then sets 'Id' as the index
    and casts the index to int64.
  * NOTE(review): `nrows` is no longer passed to read_csv, so the argument
    looks unused in the lines shown here; confirm this is intended.

NOTE(review): this patch was rebuilt from a whitespace-collapsed copy. The
indentation of context lines is a reconstruction; if a hunk does not match,
apply with `git apply --ignore-whitespace`. The Dockerfile `index` line
predates the added `ARG XGB_HASH` re-declaration.

diff --git a/Dockerfile b/Dockerfile
index 3db40af..5c551fb 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,5 @@
 ARG CUDA_VERSION
+ARG XGB_HASH=6d293020fbfa2c67b532d550fe5d55689662caac
 FROM nvidia/cuda:$CUDA_VERSION-devel-ubuntu16.04
 SHELL ["/bin/bash", "-c"]
 # Install conda (and use python 3.7)
@@ -26,6 +27,7 @@ RUN curl -o /opt/miniconda.sh \
     /opt/conda/bin/conda update -n base conda && \
     rm /opt/miniconda.sh
 ENV PATH /opt/conda/bin:$PATH
+
 RUN conda install -c conda-forge \
     bokeh \
     h5py \
@@ -64,69 +66,18 @@ RUN wget --no-check-certificate \
     rm -rf cmake-${CMAKE_LONG_VERSION}.tar.gz cmake-${CMAKE_LONG_VERSION}
 
 # lightgbm
-ENV LANG=C.UTF-8 LC_ALL=C.UTF-8
-RUN apt-get update && \
-    apt-get install -y --no-install-recommends \
-        build-essential \
-        bzip2 \
-        ca-certificates \
-        curl \
-        git \
-        libblas-dev \
-        libboost-dev \
-        libboost-filesystem-dev \
-        libboost-system-dev \
-        libbz2-dev \
-        libc6 \
-        libglib2.0-0 \
-        liblapack-dev \
-        libsm6 \
-        libxext6 \
-        libxrender1 \
-        make \
-        tar \
-        unzip \
-        wget && \
-    rm -rf /var/lib/apt/*
-RUN mkdir -p /etc/OpenCL/vendors && \
-    echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
-ENV OPENCL_LIBRARIES /usr/local/cuda/lib64
-ENV OPENCL_INCLUDE_DIR /usr/local/cuda/include
-RUN git config --global http.sslVerify false && \
-    git clone --recursive https://github.com/Microsoft/LightGBM /opt/LightGBM && \
-    cd /opt/LightGBM && \
-    mkdir build && \
-    cd build && \
-    cmake .. \
-        -DUSE_GPU=1 \
-        -DOpenCL_LIBRARY=$OPENCL_LIBRARIES/libOpenCL.so \
-        -DOpenCL_INCLUDE_DIR=$OPENCL_INCLUDE_DIR && \
-    make OPENCL_HEADERS="/usr/local/cuda/targets/x86_64-linux/include" \
-        LIBOPENCL="/usr/local/cuda/targets/x86_64-linux/lib" -j4 && \
-    cd ../python-package && \
-    python setup.py install --precompile
+RUN pip install lightgbm
 
 # catboost
-RUN if ["$CUDA_VERSION" < "11.0"]; then git config --global http.sslVerify false && \
-    git clone --recursive "https://github.com/catboost/catboost" /opt/catboost && \
-    cd /opt/catboost && \
-    cd catboost/python-package/catboost && \
-    ../../../ya make \
-        -r \
-        -o ../../.. \
-        -DUSE_ARCADIA_PYTHON=no \
-        -DUSE_SYSTEM_PYTHON=3.7\
-        -DPYTHON_CONFIG=python3-config \
-        -DCUDA_ROOT=$(dirname $(dirname $(which nvcc))); \
-    fi
-ENV if ["$CUDA_VERSION" < "11.0"]; then PYTHONPATH=$PYTHONPATH:/opt/catboost/catboost/python-package; fi\
-
-
+RUN pip install catboost
 
 # xgboost
+ARG XGB_HASH
 RUN git config --global http.sslVerify false && \
     git clone --recursive https://github.com/dmlc/xgboost /opt/xgboost && \
     cd /opt/xgboost && \
+    git checkout "$XGB_HASH" && \
+    git submodule update --init --recursive && \
     mkdir build && \
     cd build && \
     RMM_ROOT=/opt/conda cmake .. \
@@ -134,6 +85,7 @@ RUN git config --global http.sslVerify false && \
         -DUSE_NCCL=ON \
         -DPLUGIN_RMM=ON && \
     make -j4 && \
+    git log > xgb_log.txt && \
     cd ../python-package && \
     pip uninstall -y xgboost && \
     python setup.py install
diff --git a/datasets.py b/datasets.py
index f31ccec..4421f1b 100644
--- a/datasets.py
+++ b/datasets.py
@@ -137,8 +137,9 @@ def prepare_bosch(dataset_folder, nrows):
         os.system("kaggle competitions download -c bosch-production-line-performance -f " +
                   filename + " -p " + dataset_folder)
 
-    X = pd.read_csv(local_url, index_col=0, compression='zip', dtype=np.float32,
-                    nrows=nrows)
+    X = pd.read_csv(local_url,compression='zip', dtype=np.float32)
+    X = X.set_index('Id')
+    X.index = X.index.astype('int64')
     y = X.iloc[:, -1].to_numpy(dtype=np.float32)
     X.drop(X.columns[-1], axis=1, inplace=True)
     X = X.to_numpy(dtype=np.float32)