From 69a4ad001b579af5fbdd5eb7775deabfd0c452f1 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Sat, 10 Jun 2017 21:24:52 -0500 Subject: [PATCH] BUG: Revert gh-16039 gh-16039 created a bug in which files containing byte-like data could break, as EOF characters mid-field (despite being quoted) would cause premature line breaks. Given that this PR was a performance patch, this commit can be safely reverted. Closes gh-16559. --- doc/source/whatsnew/v0.20.3.txt | 1 + pandas/_libs/src/parser/io.c | 142 +++++++++++++------------------ pandas/_libs/src/parser/io.h | 28 ++++-- pandas/io/parsers.py | 3 +- pandas/tests/io/parser/common.py | 15 ++++ 5 files changed, 98 insertions(+), 91 deletions(-) diff --git a/doc/source/whatsnew/v0.20.3.txt b/doc/source/whatsnew/v0.20.3.txt index 52f7701724f18..f21230693686e 100644 --- a/doc/source/whatsnew/v0.20.3.txt +++ b/doc/source/whatsnew/v0.20.3.txt @@ -54,6 +54,7 @@ Indexing I/O ^^^ +- Bug in ``pd.read_csv()`` in which files containing EOF characters mid-field could fail with the C engine on Windows (:issue:`16039`, :issue:`16559`) Plotting diff --git a/pandas/_libs/src/parser/io.c b/pandas/_libs/src/parser/io.c index dee7d9d9281c4..4381ef19e991b 100644 --- a/pandas/_libs/src/parser/io.c +++ b/pandas/_libs/src/parser/io.c @@ -9,40 +9,33 @@ The full license is in the LICENSE file, distributed with this software. #include "io.h" -#include -#include -#include - /* On-disk FILE, uncompressed */ void *new_file_source(char *fname, size_t buffer_size) { file_source *fs = (file_source *)malloc(sizeof(file_source)); - if (fs == NULL) { + fs->fp = fopen(fname, "rb"); + + if (fs->fp == NULL) { + free(fs); return NULL; } + setbuf(fs->fp, NULL); - fs->fd = open(fname, O_RDONLY); - if (fs->fd == -1) { - goto err_free; - } + fs->initial_file_pos = ftell(fs->fp); // Only allocate this heap memory if we are not memory-mapping the file fs->buffer = (char *)malloc((buffer_size + 1) * sizeof(char)); if (fs->buffer == NULL) { - goto err_free; + return NULL; } - memset(fs->buffer, '\0', buffer_size + 1); - fs->size = buffer_size; + memset(fs->buffer, 0, buffer_size + 1); + fs->buffer[buffer_size] = '\0'; return (void *)fs; - -err_free: - free(fs); - return NULL; } void *new_rd_source(PyObject *obj) { @@ -63,12 +56,12 @@ void *new_rd_source(PyObject *obj) { */ -int del_file_source(void *ptr) { - file_source *fs = ptr; +int del_file_source(void *fs) { if (fs == NULL) return 0; - free(fs->buffer); - close(fs->fd); + /* allocated on the heap */ + free(FS(fs)->buffer); + fclose(FS(fs)->fp); free(fs); return 0; @@ -90,31 +83,17 @@ int del_rd_source(void *rds) { void *buffer_file_bytes(void *source, size_t nbytes, size_t *bytes_read, int *status) { - file_source *fs = FS(source); - ssize_t rv; + file_source *src = FS(source); - if (nbytes > fs->size) { - nbytes = fs->size; - } + *bytes_read = fread((void *)src->buffer, sizeof(char), nbytes, src->fp); - rv = read(fs->fd, fs->buffer, nbytes); - switch (rv) { - case -1: - *status = CALLING_READ_FAILED; - *bytes_read = 0; - return NULL; - case 0: + if (*bytes_read == 0) { *status = REACHED_EOF; - *bytes_read = 0; - return NULL; - default: + } else { *status = 0; - *bytes_read = rv; - fs->buffer[rv] = '\0'; - break; } - return (void *)fs->buffer; + return (void *)src->buffer; } void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read, @@ -173,58 +152,52 @@ void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read, #ifdef HAVE_MMAP #include +#include void *new_mmap(char *fname) { + struct stat buf; + int fd; memory_map *mm; - struct stat stat; - size_t filesize; + off_t filesize; mm = (memory_map *)malloc(sizeof(memory_map)); + mm->fp = fopen(fname, "rb"); + + fd = fileno(mm->fp); + if (fstat(fd, &buf) == -1) { + fprintf(stderr, "new_file_buffer: fstat() failed. errno =%d\n", errno); + return NULL; + } + filesize = buf.st_size; /* XXX This might be 32 bits. */ + if (mm == NULL) { + /* XXX Eventually remove this print statement. */ fprintf(stderr, "new_file_buffer: malloc() failed.\n"); - return (NULL); - } - mm->fd = open(fname, O_RDONLY); - if (mm->fd == -1) { - fprintf(stderr, "new_file_buffer: open(%s) failed. errno =%d\n", - fname, errno); - goto err_free; + return NULL; } + mm->size = (off_t)filesize; + mm->line_number = 0; - if (fstat(mm->fd, &stat) == -1) { - fprintf(stderr, "new_file_buffer: fstat() failed. errno =%d\n", - errno); - goto err_close; - } - filesize = stat.st_size; /* XXX This might be 32 bits. */ + mm->fileno = fd; + mm->position = ftell(mm->fp); + mm->last_pos = (off_t)filesize; - mm->memmap = mmap(NULL, filesize, PROT_READ, MAP_SHARED, mm->fd, 0); - if (mm->memmap == MAP_FAILED) { + mm->memmap = mmap(NULL, filesize, PROT_READ, MAP_SHARED, fd, 0); + if (mm->memmap == NULL) { /* XXX Eventually remove this print statement. */ fprintf(stderr, "new_file_buffer: mmap() failed.\n"); - goto err_close; + free(mm); + mm = NULL; } - mm->size = (off_t)filesize; - mm->position = 0; - - return mm; - -err_close: - close(mm->fd); -err_free: - free(mm); - return NULL; + return (void *)mm; } -int del_mmap(void *ptr) { - memory_map *mm = ptr; - - if (mm == NULL) return 0; +int del_mmap(void *src) { + munmap(MM(src)->memmap, MM(src)->size); - munmap(mm->memmap, mm->size); - close(mm->fd); - free(mm); + fclose(MM(src)->fp); + free(src); return 0; } @@ -232,27 +205,28 @@ int del_mmap(void *ptr) { void *buffer_mmap_bytes(void *source, size_t nbytes, size_t *bytes_read, int *status) { void *retval; - memory_map *src = source; - size_t remaining = src->size - src->position; + memory_map *src = MM(source); - if (remaining == 0) { + if (src->position == src->last_pos) { *bytes_read = 0; *status = REACHED_EOF; return NULL; } - if (nbytes > remaining) { - nbytes = remaining; - } - retval = src->memmap + src->position; - /* advance position in mmap data structure */ - src->position += nbytes; + if (src->position + (off_t)nbytes > src->last_pos) { + // fewer than nbytes remaining + *bytes_read = src->last_pos - src->position; + } else { + *bytes_read = nbytes; + } - *bytes_read = nbytes; *status = 0; + /* advance position in mmap data structure */ + src->position += *bytes_read; + return retval; } diff --git a/pandas/_libs/src/parser/io.h b/pandas/_libs/src/parser/io.h index d22e8ddaea88d..77121e9a169c1 100644 --- a/pandas/_libs/src/parser/io.h +++ b/pandas/_libs/src/parser/io.h @@ -15,10 +15,19 @@ The full license is in the LICENSE file, distributed with this software. typedef struct _file_source { /* The file being read. */ - int fd; + FILE *fp; char *buffer; - size_t size; + + /* file position when the file_buffer was created. */ + off_t initial_file_pos; + + /* Offset in the file of the data currently in the buffer. */ + off_t buffer_file_pos; + + /* Actual number of bytes in the current buffer. (Can be less than + * buffer_size.) */ + off_t last_pos; } file_source; #define FS(source) ((file_source *)source) @@ -28,13 +37,20 @@ typedef struct _file_source { #endif typedef struct _memory_map { - int fd; + FILE *fp; /* Size of the file, in bytes. */ - char *memmap; - size_t size; + off_t size; - size_t position; + /* file position when the file_buffer was created. */ + off_t initial_file_pos; + + int line_number; + + int fileno; + off_t position; + off_t last_pos; + char *memmap; } memory_map; #define MM(src) ((memory_map *)src) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index c2d5a629b03a3..9ec3f79e1ae70 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1985,7 +1985,8 @@ def __init__(self, f, **kwds): self.comment = kwds['comment'] self._comment_lines = [] - f, handles = _get_handle(f, 'r', encoding=self.encoding, + mode = 'r' if PY3 else 'rb' + f, handles = _get_handle(f, mode, encoding=self.encoding, compression=self.compression, memory_map=self.memory_map) self.handles.extend(handles) diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py index 31d815a4bca97..4b4f44b44c163 100644 --- a/pandas/tests/io/parser/common.py +++ b/pandas/tests/io/parser/common.py @@ -1662,6 +1662,21 @@ def test_internal_eof_byte(self): result = self.read_csv(StringIO(data)) tm.assert_frame_equal(result, expected) + def test_internal_eof_byte_to_file(self): + # see gh-16559 + data = b'c1,c2\r\n"test \x1a test", test\r\n' + expected = pd.DataFrame([["test \x1a test", " test"]], + columns=["c1", "c2"]) + + path = '__%s__.csv' % tm.rands(10) + + with tm.ensure_clean(path) as path: + with open(path, "wb") as f: + f.write(data) + + result = self.read_csv(path) + tm.assert_frame_equal(result, expected) + def test_file_handles(self): # GH 14418 - don't close user provided file handles