diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 6110454be7da8..e43dbf85db9dc 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -1537,6 +1537,7 @@ Performance Improvements function used the ``.name`` attribute of the group DataFrame (:issue:`15062`). - Improved performance of ``iloc`` indexing with a list or array (:issue:`15504`). - Improved performance of ``Series.sort_index()`` with a monotonic index (:issue:`15694`) +- Improved performance in ``pd.read_csv()`` on some platforms with buffered reads (:issue:`16039`) .. _whatsnew_0200.bug_fixes: diff --git a/pandas/_libs/src/parser/io.c b/pandas/_libs/src/parser/io.c index 4381ef19e991b..dee7d9d9281c4 100644 --- a/pandas/_libs/src/parser/io.c +++ b/pandas/_libs/src/parser/io.c @@ -9,33 +9,40 @@ The full license is in the LICENSE file, distributed with this software. #include "io.h" +#include +#include +#include + /* On-disk FILE, uncompressed */ void *new_file_source(char *fname, size_t buffer_size) { file_source *fs = (file_source *)malloc(sizeof(file_source)); - fs->fp = fopen(fname, "rb"); - - if (fs->fp == NULL) { - free(fs); + if (fs == NULL) { return NULL; } - setbuf(fs->fp, NULL); - fs->initial_file_pos = ftell(fs->fp); + fs->fd = open(fname, O_RDONLY); + if (fs->fd == -1) { + goto err_free; + } // Only allocate this heap memory if we are not memory-mapping the file fs->buffer = (char *)malloc((buffer_size + 1) * sizeof(char)); if (fs->buffer == NULL) { - return NULL; + goto err_free; } - memset(fs->buffer, 0, buffer_size + 1); - fs->buffer[buffer_size] = '\0'; + memset(fs->buffer, '\0', buffer_size + 1); + fs->size = buffer_size; return (void *)fs; + +err_free: + free(fs); + return NULL; } void *new_rd_source(PyObject *obj) { @@ -56,12 +63,12 @@ void *new_rd_source(PyObject *obj) { */ -int del_file_source(void *fs) { +int del_file_source(void *ptr) { + file_source *fs = ptr; if (fs == NULL) return 0; - /* allocated on the heap */ - free(FS(fs)->buffer); - fclose(FS(fs)->fp); + free(fs->buffer); + close(fs->fd); free(fs); return 0; @@ -83,17 +90,31 @@ int del_rd_source(void *rds) { void *buffer_file_bytes(void *source, size_t nbytes, size_t *bytes_read, int *status) { - file_source *src = FS(source); + file_source *fs = FS(source); + ssize_t rv; - *bytes_read = fread((void *)src->buffer, sizeof(char), nbytes, src->fp); + if (nbytes > fs->size) { + nbytes = fs->size; + } - if (*bytes_read == 0) { + rv = read(fs->fd, fs->buffer, nbytes); + switch (rv) { + case -1: + *status = CALLING_READ_FAILED; + *bytes_read = 0; + return NULL; + case 0: *status = REACHED_EOF; - } else { + *bytes_read = 0; + return NULL; + default: *status = 0; + *bytes_read = rv; + fs->buffer[rv] = '\0'; + break; } - return (void *)src->buffer; + return (void *)fs->buffer; } void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read, @@ -152,52 +173,58 @@ void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read, #ifdef HAVE_MMAP #include -#include void *new_mmap(char *fname) { - struct stat buf; - int fd; memory_map *mm; - off_t filesize; + struct stat stat; + size_t filesize; mm = (memory_map *)malloc(sizeof(memory_map)); - mm->fp = fopen(fname, "rb"); - - fd = fileno(mm->fp); - if (fstat(fd, &buf) == -1) { - fprintf(stderr, "new_file_buffer: fstat() failed. errno =%d\n", errno); - return NULL; - } - filesize = buf.st_size; /* XXX This might be 32 bits. */ - if (mm == NULL) { - /* XXX Eventually remove this print statement. */ fprintf(stderr, "new_file_buffer: malloc() failed.\n"); - return NULL; + return (NULL); + } + mm->fd = open(fname, O_RDONLY); + if (mm->fd == -1) { + fprintf(stderr, "new_file_buffer: open(%s) failed. errno =%d\n", + fname, errno); + goto err_free; } - mm->size = (off_t)filesize; - mm->line_number = 0; - mm->fileno = fd; - mm->position = ftell(mm->fp); - mm->last_pos = (off_t)filesize; + if (fstat(mm->fd, &stat) == -1) { + fprintf(stderr, "new_file_buffer: fstat() failed. errno =%d\n", + errno); + goto err_close; + } + filesize = stat.st_size; /* XXX This might be 32 bits. */ - mm->memmap = mmap(NULL, filesize, PROT_READ, MAP_SHARED, fd, 0); - if (mm->memmap == NULL) { + mm->memmap = mmap(NULL, filesize, PROT_READ, MAP_SHARED, mm->fd, 0); + if (mm->memmap == MAP_FAILED) { /* XXX Eventually remove this print statement. */ fprintf(stderr, "new_file_buffer: mmap() failed.\n"); - free(mm); - mm = NULL; + goto err_close; } - return (void *)mm; + mm->size = (off_t)filesize; + mm->position = 0; + + return mm; + +err_close: + close(mm->fd); +err_free: + free(mm); + return NULL; } -int del_mmap(void *src) { - munmap(MM(src)->memmap, MM(src)->size); +int del_mmap(void *ptr) { + memory_map *mm = ptr; + + if (mm == NULL) return 0; - fclose(MM(src)->fp); - free(src); + munmap(mm->memmap, mm->size); + close(mm->fd); + free(mm); return 0; } @@ -205,27 +232,26 @@ int del_mmap(void *src) { void *buffer_mmap_bytes(void *source, size_t nbytes, size_t *bytes_read, int *status) { void *retval; - memory_map *src = MM(source); + memory_map *src = source; + size_t remaining = src->size - src->position; - if (src->position == src->last_pos) { + if (remaining == 0) { *bytes_read = 0; *status = REACHED_EOF; return NULL; } - retval = src->memmap + src->position; - - if (src->position + (off_t)nbytes > src->last_pos) { - // fewer than nbytes remaining - *bytes_read = src->last_pos - src->position; - } else { - *bytes_read = nbytes; + if (nbytes > remaining) { + nbytes = remaining; } - *status = 0; + retval = src->memmap + src->position; /* advance position in mmap data structure */ - src->position += *bytes_read; + src->position += nbytes; + + *bytes_read = nbytes; + *status = 0; return retval; } diff --git a/pandas/_libs/src/parser/io.h b/pandas/_libs/src/parser/io.h index 77121e9a169c1..d22e8ddaea88d 100644 --- a/pandas/_libs/src/parser/io.h +++ b/pandas/_libs/src/parser/io.h @@ -15,19 +15,10 @@ The full license is in the LICENSE file, distributed with this software. typedef struct _file_source { /* The file being read. */ - FILE *fp; + int fd; char *buffer; - - /* file position when the file_buffer was created. */ - off_t initial_file_pos; - - /* Offset in the file of the data currently in the buffer. */ - off_t buffer_file_pos; - - /* Actual number of bytes in the current buffer. (Can be less than - * buffer_size.) */ - off_t last_pos; + size_t size; } file_source; #define FS(source) ((file_source *)source) @@ -37,20 +28,13 @@ typedef struct _file_source { #endif typedef struct _memory_map { - FILE *fp; + int fd; /* Size of the file, in bytes. */ - off_t size; - - /* file position when the file_buffer was created. */ - off_t initial_file_pos; - - int line_number; - - int fileno; - off_t position; - off_t last_pos; char *memmap; + size_t size; + + size_t position; } memory_map; #define MM(src) ((memory_map *)src)