Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH/BUG: ignore line comments in CSV files GH2685 #4505

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 32 additions & 10 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -991,7 +991,6 @@ def __init__(self, src, **kwds):
self._name_processed = True
(index_names, self.names,
self.index_col) = _clean_index_names(self.names, self.index_col)

if self.index_names is None:
self.index_names = index_names

Expand Down Expand Up @@ -1100,7 +1099,6 @@ def _get_index_names(self):
if self._reader.leading_cols == 0 and self.index_col is not None:
(idx_names, names,
self.index_col) = _clean_index_names(names, self.index_col)

return names, idx_names

def _maybe_parse_dates(self, values, index, try_parse_dates=True):
Expand Down Expand Up @@ -1282,21 +1280,30 @@ class MyDialect(csv.Dialect):

sniff_sep = True

if sep is not None:
if (sep is not None) and (dia.quotechar is not None):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no need for parens here; `is not` binds tighter than `and`

sniff_sep = False
dia.delimiter = sep
# attempt to sniff the delimiter
if sniff_sep:
line = f.readline()
while self.pos in self.skiprows:
self.pos += 1
line = f.readline()

line = self._check_comments([line])[0]
line = self._check_comments([[line]])

while not line:
self.pos += 1
line = f.readline()
line = self._check_comments([[line]])

line = line[0][0]

self.pos += 1
sniffed = csv.Sniffer().sniff(line)
dia.delimiter = sniffed.delimiter
if not dia.delimiter:
dia.delimiter = sniffed.delimiter
if not dia.quotechar:
dia.quotechar = sniffed.quotechar
if self.encoding is not None:
self.buf.extend(list(
com.UnicodeReader(StringIO(line),
Expand Down Expand Up @@ -1466,14 +1473,26 @@ def _next_line(self):
line = self.data[self.pos]
except IndexError:
raise StopIteration

line = self._check_comments([line])

while not line:
self.pos += 1
try:
line = self.data[self.pos]
except IndexError:
raise StopIteration
line = self._check_comments([line])

line = line[0]
else:
while self.pos in self.skiprows:
next(self.data)
self.pos += 1

line = next(self.data)
line = self._check_comments([line])[0]

line = self._check_comments([line])[0]
line = self._check_thousands([line])[0]

self.pos += 1
Expand All @@ -1496,7 +1515,10 @@ def _check_comments(self, lines):
if len(x) > 0:
rl.append(x)
break
ret.append(rl)
if rl:
ret.append(rl)
if not ret:
ret = [[]];
return ret

def _check_thousands(self, lines):
Expand Down Expand Up @@ -1524,7 +1546,7 @@ def _clear_buffer(self):
def _get_index_name(self, columns):
orig_names = list(columns)
columns = list(columns)

try:
line = self._next_line()
except StopIteration:
Expand All @@ -1539,7 +1561,7 @@ def _get_index_name(self, columns):

# implicitly index_col=0 b/c 1 fewer column names
implicit_first_cols = 0
if line is not None:
if line and (line is not None):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same here

# leave it 0, #2442
if self.index_col is not False:
implicit_first_cols = len(line) - len(columns)
Expand Down
32 changes: 22 additions & 10 deletions pandas/io/tests/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1527,17 +1527,29 @@ def test_multiple_date_col_multiple_index(self):

def test_comment(self):
data = """A,B,C
1,2.,4.#hello world
5.,NaN,10.0
#first line comment
1,2.,4. # first end line comment
# second line comment
3,5.,7.#second end line comment
6.,NaN,10.0
"""
expected = [[1., 2., 4.],
[5., np.nan, 10.]]
df = self.read_csv(StringIO(data), comment='#')
assert_almost_equal(df.values, expected)

df = self.read_table(StringIO(data), sep=',', comment='#',
na_values=['NaN'])
assert_almost_equal(df.values, expected)
expected = {
'c': [[np.nan, np.nan, np.nan],
[1., 2., 4.],
[np.nan, np.nan, np.nan],
[3., 5., 7.],
[6., np.nan, 10.]],
'python': [[1., 2., 4.],
[3., 5., 7.],
[6., np.nan, 10.]]
}
for engine in ('c', 'python'):
df = self.read_csv(StringIO(data), comment='#', engine=engine)
assert_almost_equal(df.values, expected[engine])

df = self.read_table(StringIO(data), sep=',', comment='#',
na_values=['NaN'], engine=engine)
assert_almost_equal(df.values, expected[engine])

def test_bool_na_values(self):
data = """A,B,C
Expand Down
5 changes: 2 additions & 3 deletions pandas/src/parser/tokenizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -823,7 +823,6 @@ int tokenize_delimited(parser_t *self, size_t line_limit)
}
else if (c == self->delimiter) {
// End of field. End of line not reached yet

END_FIELD();
self->state = START_FIELD;
}
Expand Down Expand Up @@ -866,7 +865,7 @@ int tokenize_delimited(parser_t *self, size_t line_limit)
} else {
/* \r line terminator */

/* UGH. we don't actually want to consume the token. fix this later */
/*FIXME UGH. we don't actually want to consume the token. */
self->stream_len = slen;
if (end_line(self) < 0) {
goto parsingerror;
Expand All @@ -875,7 +874,7 @@ int tokenize_delimited(parser_t *self, size_t line_limit)
slen = self->stream_len;
self->state = START_RECORD;

/* HACK, let's try this one again */
/*FIXME let's try this one again */
--i; buf--;
if (line_limit > 0 && self->lines == start_lines + line_limit) {
goto linelimit;
Expand Down