Skip to content

Commit d298414

Browse files
dgwynne authored and jreback committed
revert #16663, which was a revert of #16039 (#16675)
* Revert "BUG: Revert gh-16039 (#16663)" This reverts commit c550372. * Always treat files as binary to cope with Windows and EOF. On Windows, EOF can appear "in band" if the file is considered text. When moving from fread() to read(), I lost the "b" part of the mode. At the time I believed this was a no-op, since Unix doesn't treat files differently based on that flag. This adds O_BINARY to the flags passed to open() to restore the behaviour lost when taking "b" away from fopen(). If a platform doesn't provide O_BINARY, this defines it to 0 so it can still be used without effect later on in the code. * Don't leak the fd in new_file_source() if buffer allocation fails. * Reapply the test for EOF in the middle of a stream. Part of c550372. * Pass rb to _get_handle on Python 3, otherwise stick to r. Part of c550372. * Replace goto with inline unwinding of state. Requested by @jreback in #16675 feedback. * Describe the fixes to the read_csv() backend and issue numbers. Requested by @jreback in feedback on #16675.
1 parent 11d274f commit d298414

File tree

3 files changed

+89
-78
lines changed

3 files changed

+89
-78
lines changed

doc/source/whatsnew/v0.20.3.txt

+1-2
Original file line numberDiff line numberDiff line change
@@ -55,8 +55,7 @@ Indexing
5555
I/O
5656
^^^
5757

58-
- Bug in ``pd.read_csv()`` in which files containing EOF characters mid-field could fail with the C engine on Windows (:issue:`16039`, :issue:`16559`)
59-
58+
- Bug in ``pd.read_csv()`` in which files weren't opened as binary files by the C engine on Windows, causing EOF characters mid-field, which would fail (:issue:`16039`, :issue:`16559`, :issue:`16675`)
6059

6160
Plotting
6261
^^^^^^^^

pandas/_libs/src/parser/io.c

+82-54
Original file line numberDiff line numberDiff line change
@@ -9,31 +9,41 @@ The full license is in the LICENSE file, distributed with this software.
99

1010
#include "io.h"
1111

12+
#include <sys/types.h>
13+
#include <sys/stat.h>
14+
#include <fcntl.h>
15+
16+
#ifndef O_BINARY
17+
#define O_BINARY 0
18+
#endif /* O_BINARY */
19+
1220
/*
1321
On-disk FILE, uncompressed
1422
*/
1523

1624
void *new_file_source(char *fname, size_t buffer_size) {
1725
file_source *fs = (file_source *)malloc(sizeof(file_source));
18-
fs->fp = fopen(fname, "rb");
26+
if (fs == NULL) {
27+
return NULL;
28+
}
1929

20-
if (fs->fp == NULL) {
30+
fs->fd = open(fname, O_RDONLY | O_BINARY);
31+
if (fs->fd == -1) {
2132
free(fs);
2233
return NULL;
2334
}
24-
setbuf(fs->fp, NULL);
25-
26-
fs->initial_file_pos = ftell(fs->fp);
2735

2836
// Only allocate this heap memory if we are not memory-mapping the file
2937
fs->buffer = (char *)malloc((buffer_size + 1) * sizeof(char));
3038

3139
if (fs->buffer == NULL) {
40+
close(fs->fd);
41+
free(fs);
3242
return NULL;
3343
}
3444

35-
memset(fs->buffer, 0, buffer_size + 1);
36-
fs->buffer[buffer_size] = '\0';
45+
memset(fs->buffer, '\0', buffer_size + 1);
46+
fs->size = buffer_size;
3747

3848
return (void *)fs;
3949
}
@@ -56,12 +66,12 @@ void *new_rd_source(PyObject *obj) {
5666
5767
*/
5868

59-
int del_file_source(void *fs) {
69+
int del_file_source(void *ptr) {
70+
file_source *fs = ptr;
6071
if (fs == NULL) return 0;
6172

62-
/* allocated on the heap */
63-
free(FS(fs)->buffer);
64-
fclose(FS(fs)->fp);
73+
free(fs->buffer);
74+
close(fs->fd);
6575
free(fs);
6676

6777
return 0;
@@ -83,17 +93,31 @@ int del_rd_source(void *rds) {
8393

8494
void *buffer_file_bytes(void *source, size_t nbytes, size_t *bytes_read,
8595
int *status) {
86-
file_source *src = FS(source);
96+
file_source *fs = FS(source);
97+
ssize_t rv;
8798

88-
*bytes_read = fread((void *)src->buffer, sizeof(char), nbytes, src->fp);
99+
if (nbytes > fs->size) {
100+
nbytes = fs->size;
101+
}
89102

90-
if (*bytes_read == 0) {
103+
rv = read(fs->fd, fs->buffer, nbytes);
104+
switch (rv) {
105+
case -1:
106+
*status = CALLING_READ_FAILED;
107+
*bytes_read = 0;
108+
return NULL;
109+
case 0:
91110
*status = REACHED_EOF;
92-
} else {
111+
*bytes_read = 0;
112+
return NULL;
113+
default:
93114
*status = 0;
115+
*bytes_read = rv;
116+
fs->buffer[rv] = '\0';
117+
break;
94118
}
95119

96-
return (void *)src->buffer;
120+
return (void *)fs->buffer;
97121
}
98122

99123
void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read,
@@ -152,80 +176,84 @@ void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read,
152176
#ifdef HAVE_MMAP
153177

154178
#include <sys/mman.h>
155-
#include <sys/stat.h>
156179

157180
void *new_mmap(char *fname) {
158-
struct stat buf;
159-
int fd;
160181
memory_map *mm;
161-
off_t filesize;
182+
struct stat stat;
183+
size_t filesize;
162184

163185
mm = (memory_map *)malloc(sizeof(memory_map));
164-
mm->fp = fopen(fname, "rb");
165-
166-
fd = fileno(mm->fp);
167-
if (fstat(fd, &buf) == -1) {
168-
fprintf(stderr, "new_file_buffer: fstat() failed. errno =%d\n", errno);
169-
return NULL;
170-
}
171-
filesize = buf.st_size; /* XXX This might be 32 bits. */
172-
173186
if (mm == NULL) {
174-
/* XXX Eventually remove this print statement. */
175187
fprintf(stderr, "new_file_buffer: malloc() failed.\n");
188+
return (NULL);
189+
}
190+
mm->fd = open(fname, O_RDONLY | O_BINARY);
191+
if (mm->fd == -1) {
192+
fprintf(stderr, "new_file_buffer: open(%s) failed. errno =%d\n",
193+
fname, errno);
194+
free(mm);
176195
return NULL;
177196
}
178-
mm->size = (off_t)filesize;
179-
mm->line_number = 0;
180197

181-
mm->fileno = fd;
182-
mm->position = ftell(mm->fp);
183-
mm->last_pos = (off_t)filesize;
198+
if (fstat(mm->fd, &stat) == -1) {
199+
fprintf(stderr, "new_file_buffer: fstat() failed. errno =%d\n",
200+
errno);
201+
close(mm->fd);
202+
free(mm);
203+
return NULL;
204+
}
205+
filesize = stat.st_size; /* XXX This might be 32 bits. */
184206

185-
mm->memmap = mmap(NULL, filesize, PROT_READ, MAP_SHARED, fd, 0);
186-
if (mm->memmap == NULL) {
207+
mm->memmap = mmap(NULL, filesize, PROT_READ, MAP_SHARED, mm->fd, 0);
208+
if (mm->memmap == MAP_FAILED) {
187209
/* XXX Eventually remove this print statement. */
188210
fprintf(stderr, "new_file_buffer: mmap() failed.\n");
211+
close(mm->fd);
189212
free(mm);
190-
mm = NULL;
213+
return NULL;
191214
}
192215

193-
return (void *)mm;
216+
mm->size = (off_t)filesize;
217+
mm->position = 0;
218+
219+
return mm;
194220
}
195221

196-
int del_mmap(void *src) {
197-
munmap(MM(src)->memmap, MM(src)->size);
222+
int del_mmap(void *ptr) {
223+
memory_map *mm = ptr;
224+
225+
if (mm == NULL) return 0;
198226

199-
fclose(MM(src)->fp);
200-
free(src);
227+
munmap(mm->memmap, mm->size);
228+
close(mm->fd);
229+
free(mm);
201230

202231
return 0;
203232
}
204233

205234
void *buffer_mmap_bytes(void *source, size_t nbytes, size_t *bytes_read,
206235
int *status) {
207236
void *retval;
208-
memory_map *src = MM(source);
237+
memory_map *src = source;
238+
size_t remaining = src->size - src->position;
209239

210-
if (src->position == src->last_pos) {
240+
if (remaining == 0) {
211241
*bytes_read = 0;
212242
*status = REACHED_EOF;
213243
return NULL;
214244
}
215245

216-
retval = src->memmap + src->position;
217-
218-
if (src->position + (off_t)nbytes > src->last_pos) {
219-
// fewer than nbytes remaining
220-
*bytes_read = src->last_pos - src->position;
221-
} else {
222-
*bytes_read = nbytes;
246+
if (nbytes > remaining) {
247+
nbytes = remaining;
223248
}
224249

225-
*status = 0;
250+
retval = src->memmap + src->position;
226251

227252
/* advance position in mmap data structure */
228-
src->position += *bytes_read;
253+
src->position += nbytes;
254+
255+
*bytes_read = nbytes;
256+
*status = 0;
229257

230258
return retval;
231259
}

pandas/_libs/src/parser/io.h

+6-22
Original file line numberDiff line numberDiff line change
@@ -15,19 +15,10 @@ The full license is in the LICENSE file, distributed with this software.
1515

1616
typedef struct _file_source {
1717
/* The file being read. */
18-
FILE *fp;
18+
int fd;
1919

2020
char *buffer;
21-
22-
/* file position when the file_buffer was created. */
23-
off_t initial_file_pos;
24-
25-
/* Offset in the file of the data currently in the buffer. */
26-
off_t buffer_file_pos;
27-
28-
/* Actual number of bytes in the current buffer. (Can be less than
29-
* buffer_size.) */
30-
off_t last_pos;
21+
size_t size;
3122
} file_source;
3223

3324
#define FS(source) ((file_source *)source)
@@ -37,20 +28,13 @@ typedef struct _file_source {
3728
#endif
3829

3930
typedef struct _memory_map {
40-
FILE *fp;
31+
int fd;
4132

4233
/* Size of the file, in bytes. */
43-
off_t size;
44-
45-
/* file position when the file_buffer was created. */
46-
off_t initial_file_pos;
47-
48-
int line_number;
49-
50-
int fileno;
51-
off_t position;
52-
off_t last_pos;
5334
char *memmap;
35+
size_t size;
36+
37+
size_t position;
5438
} memory_map;
5539

5640
#define MM(src) ((memory_map *)src)

0 commit comments

Comments
 (0)