diff --git a/doc/source/release.rst b/doc/source/release.rst index 4f4681b112664..78236bbf821dd 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -167,6 +167,8 @@ Improvements to existing features - Improve support for converting R datasets to pandas objects (more informative index for timeseries and numeric, support for factors, dist, and high-dimensional arrays). + - :func:`~pandas.read_html` now supports the ``parse_dates``, + ``tupleize_cols`` and ``thousands`` parameters (:issue:`4770`). API Changes ~~~~~~~~~~~ @@ -373,6 +375,8 @@ See :ref:`Internal Refactoring` ``core/generic.py`` (:issue:`4435`). - Refactor cum objects to core/generic.py (:issue:`4435`), note that these have a more numpy-like function signature. + - :func:`~pandas.read_html` now uses ``TextParser`` to parse HTML data from + bs4/lxml (:issue:`4770`). .. _release.bug_fixes-0.13.0: @@ -538,6 +542,15 @@ Bug Fixes - Make sure series-series boolean comparisons are label based (:issue:`4947`) - Bug in multi-level indexing with a Timestamp partial indexer (:issue:`4294`) - Tests/fix for multi-index construction of an all-nan frame (:issue:`4078`) + - Fixed a bug where :func:`~pandas.read_html` wasn't correctly inferring + values of tables with commas (:issue:`5029`) + - Fixed a bug where :func:`~pandas.read_html` wasn't providing a stable + ordering of returned tables (:issue:`4770`, :issue:`5029`). + - Fixed a bug where :func:`~pandas.read_html` was incorrectly parsing when + passed ``index_col=0`` (:issue:`5066`). + - Fixed a bug where :func:`~pandas.read_html` was incorrectly inferring the + type of headers (:issue:`5048`). + pandas 0.12.0 ------------- diff --git a/pandas/io/html.py b/pandas/io/html.py index df94e0ffa2e79..96bedbf390af6 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -7,15 +7,18 @@ import re import numbers import collections +import warnings from distutils.version import LooseVersion import numpy as np -from pandas import DataFrame, MultiIndex, isnull from pandas.io.common import _is_url, urlopen, parse_url -from pandas.compat import range, lrange, lmap, u, map -from pandas import compat +from pandas.io.parsers import TextParser +from pandas.compat import (lrange, lmap, u, string_types, iteritems, text_type, + raise_with_traceback) +from pandas.core import common as com +from pandas import Series try: @@ -45,7 +48,7 @@ ############# # READ HTML # ############# -_RE_WHITESPACE = re.compile(r'([\r\n]+|\s{2,})') +_RE_WHITESPACE = re.compile(r'[\r\n]+|\s{2,}') def _remove_whitespace(s, regex=_RE_WHITESPACE): @@ -67,7 +70,7 @@ def _remove_whitespace(s, regex=_RE_WHITESPACE): return regex.sub(' ', s.strip()) -def _get_skiprows_iter(skiprows): +def _get_skiprows(skiprows): """Get an iterator given an integer, slice or container.
Parameters @@ -80,11 +83,6 @@ def _get_skiprows_iter(skiprows): TypeError * If `skiprows` is not a slice, integer, or Container - Raises - ------ - TypeError - * If `skiprows` is not a slice, integer, or Container - Returns ------- it : iterable @@ -92,13 +90,12 @@ def _get_skiprows_iter(skiprows): """ if isinstance(skiprows, slice): return lrange(skiprows.start or 0, skiprows.stop, skiprows.step or 1) - elif isinstance(skiprows, numbers.Integral): - return lrange(skiprows) - elif isinstance(skiprows, collections.Container): + elif isinstance(skiprows, numbers.Integral) or com.is_list_like(skiprows): return skiprows - else: - raise TypeError('{0} is not a valid type for skipping' - ' rows'.format(type(skiprows))) + elif skiprows is None: + return 0 + raise TypeError('%r is not a valid type for skipping rows' % + type(skiprows).__name__) def _read(io): @@ -120,11 +117,10 @@ def _read(io): elif os.path.isfile(io): with open(io) as f: raw_text = f.read() - elif isinstance(io, compat.string_types): + elif isinstance(io, string_types): raw_text = io else: - raise TypeError("Cannot read object of type " - "'{0.__class__.__name__!r}'".format(io)) + raise TypeError("Cannot read object of type %r" % type(io).__name__) return raw_text @@ -194,12 +190,6 @@ def _parse_raw_data(self, rows): A callable that takes a row node as input and returns a list of the column node in that row. This must be defined by subclasses. - Raises - ------ - AssertionError - * If `text_getter` is not callable - * If `column_finder` is not callable - Returns ------- data : list of list of strings @@ -254,7 +244,7 @@ def _parse_tables(self, doc, match, attrs): Raises ------ - AssertionError + ValueError * If `match` does not match any text in the document. Returns @@ -406,25 +396,28 @@ def _parse_tfoot(self, table): def _parse_tables(self, doc, match, attrs): element_name = self._strainer.name tables = doc.find_all(element_name, attrs=attrs) + if not tables: - # known sporadically working release - raise AssertionError('No tables found') + raise ValueError('No tables found') - mts = [table.find(text=match) for table in tables] - matched_tables = [mt for mt in mts if mt is not None] - tables = list(set(mt.find_parent(element_name) - for mt in matched_tables)) + result = [] + unique_tables = set() - if not tables: - raise AssertionError("No tables found matching " - "'{0}'".format(match.pattern)) - return tables + for table in tables: + if (table not in unique_tables and + table.find(text=match) is not None): + result.append(table) + unique_tables.add(table) + + if not result: + raise ValueError("No tables found matching pattern %r" % + match.pattern) + return result def _setup_build_doc(self): raw_text = _read(self.io) if not raw_text: - raise AssertionError('No text parsed from document: ' - '{0}'.format(self.io)) + raise ValueError('No text parsed from document: %s' % self.io) return raw_text def _build_doc(self): @@ -432,7 +425,7 @@ def _build_doc(self): return BeautifulSoup(self._setup_build_doc(), features='html5lib') -def _build_node_xpath_expr(attrs): +def _build_xpath_expr(attrs): """Build an xpath expression to simulate bs4's ability to pass in kwargs to search for attributes when using the lxml parser. 
@@ -450,8 +443,8 @@ def _build_node_xpath_expr(attrs): if 'class_' in attrs: attrs['class'] = attrs.pop('class_') - s = (u("@{k}='{v}'").format(k=k, v=v) for k, v in compat.iteritems(attrs)) - return u('[{0}]').format(' and '.join(s)) + s = [u("@%s=%r") % (k, v) for k, v in iteritems(attrs)] + return u('[%s]') % ' and '.join(s) _re_namespace = {'re': 'http://exslt.org/regular-expressions'} @@ -491,23 +484,20 @@ def _parse_tr(self, table): def _parse_tables(self, doc, match, kwargs): pattern = match.pattern - # check all descendants for the given pattern - check_all_expr = u('//*') - if pattern: - check_all_expr += u("[re:test(text(), '{0}')]").format(pattern) - - # go up the tree until we find a table - check_table_expr = '/ancestor::table' - xpath_expr = check_all_expr + check_table_expr + # 1. check all descendants for the given pattern and only search tables + # 2. go up the tree until we find a table + query = '//table//*[re:test(text(), %r)]/ancestor::table' + xpath_expr = u(query) % pattern # if any table attributes were given build an xpath expression to # search for them if kwargs: - xpath_expr += _build_node_xpath_expr(kwargs) + xpath_expr += _build_xpath_expr(kwargs) + tables = doc.xpath(xpath_expr, namespaces=_re_namespace) + if not tables: - raise AssertionError("No tables found matching regex " - "'{0}'".format(pattern)) + raise ValueError("No tables found matching regex %r" % pattern) return tables def _build_doc(self): @@ -528,6 +518,7 @@ def _build_doc(self): """ from lxml.html import parse, fromstring, HTMLParser from lxml.etree import XMLSyntaxError + parser = HTMLParser(recover=False) try: @@ -552,8 +543,8 @@ def _build_doc(self): scheme = parse_url(self.io).scheme if scheme not in _valid_schemes: # lxml can't parse it - msg = ('{0} is not a valid url scheme, valid schemes are ' - '{1}').format(scheme, _valid_schemes) + msg = ('%r is not a valid url scheme, valid schemes are ' + '%s') % (scheme, _valid_schemes) raise ValueError(msg) else: # something else happened: maybe a faulty connection @@ -583,101 +574,38 @@ def _parse_raw_tfoot(self, table): table.xpath(expr)] -def _data_to_frame(data, header, index_col, infer_types, skiprows): - """Parse a BeautifulSoup table into a DataFrame. +def _expand_elements(body): + lens = Series(lmap(len, body)) + lens_max = lens.max() + not_max = lens[lens != lens_max] - Parameters - ---------- - data : tuple of lists - The raw data to be placed into a DataFrame. This is a list of lists of - strings or unicode. If it helps, it can be thought of as a matrix of - strings instead. - - header : int or None - An integer indicating the row to use for the column header or None - indicating no header will be used. + for ind, length in iteritems(not_max): + body[ind] += [np.nan] * (lens_max - length) - index_col : int or None - An integer indicating the column to use for the index or None - indicating no column will be used. - infer_types : bool - Whether to convert numbers and dates. +def _data_to_frame(data, header, index_col, skiprows, infer_types, + parse_dates, tupleize_cols, thousands): + head, body, _ = data # _ is footer which is rarely used: ignore for now - skiprows : collections.Container or int or slice - Iterable used to skip rows. + if head: + body = [head] + body - Returns - ------- - df : DataFrame - A DataFrame containing the data from `data` - - Raises - ------ - ValueError - * If `skiprows` is not found in the rows of the parsed DataFrame. 
+ if header is None: # special case when a table has <th> elements header = 0 - Raises - ------ - ValueError - * If `skiprows` is not found in the rows of the parsed DataFrame. - - See Also - -------- - read_html - - Notes - ----- - The `data` parameter is guaranteed not to be a list of empty lists. - """ - thead, tbody, tfoot = data - columns = thead or None - df = DataFrame(tbody, columns=columns) + # fill out elements of body that are "ragged" + _expand_elements(body) - if skiprows is not None: - it = _get_skiprows_iter(skiprows) + tp = TextParser(body, header=header, index_col=index_col, + skiprows=_get_skiprows(skiprows), + parse_dates=parse_dates, tupleize_cols=tupleize_cols, + thousands=thousands) + df = tp.read() - try: - df = df.drop(it) - except ValueError: - raise ValueError('Labels {0} not found when trying to skip' - ' rows'.format(it)) - - # convert to numbers/dates where possible - # must be sequential since dates trump numbers if both args are given - if infer_types: - df = df.convert_objects(convert_numeric=True) + if infer_types: # TODO: rm this code so infer_types has no effect in 0.14 df = df.convert_objects(convert_dates='coerce') - - if header is not None: - header_rows = df.iloc[header] - - if header_rows.ndim == 2: - names = header_rows.index - df.columns = MultiIndex.from_arrays(header_rows.values, - names=names) - else: - df.columns = header_rows - - df = df.drop(df.index[header]) - - if index_col is not None: - cols = df.columns[index_col] - - try: - cols = cols.tolist() - except AttributeError: - pass - - # drop by default - df.set_index(cols, inplace=True) - if df.index.nlevels == 1: - if isnull(df.index.name) or not df.index.name: - df.index.name = None - else: - names = [name or None for name in df.index.names] - df.index = MultiIndex.from_tuples(df.index.values, names=names) - + else: + df = df.applymap(text_type) return df @@ -701,15 +629,15 @@ def _parser_dispatch(flavor): Raises ------ - AssertionError + ValueError * If `flavor` is not a valid backend. ImportError * If you do not have the requested `flavor` """ valid_parsers = list(_valid_parsers.keys()) if flavor not in valid_parsers: - raise AssertionError('"{0!r}" is not a valid flavor, valid flavors are' - ' {1}'.format(flavor, valid_parsers)) + raise ValueError('%r is not a valid flavor, valid flavors are %s' % + (flavor, valid_parsers)) if flavor in ('bs4', 'html5lib'): if not _HAS_HTML5LIB: @@ -717,46 +645,54 @@ if not _HAS_BS4: raise ImportError("bs4 not found please install it") if bs4.__version__ == LooseVersion('4.2.0'): - raise AssertionError("You're using a version" - " of BeautifulSoup4 (4.2.0) that has been" - " known to cause problems on certain" - " operating systems such as Debian. " - "Please install a version of" - " BeautifulSoup4 != 4.2.0, both earlier" - " and later releases will work.") + raise ValueError("You're using a version" + " of BeautifulSoup4 (4.2.0) that has been" + " known to cause problems on certain" + " operating systems such as Debian.
" + "Please install a version of" + " BeautifulSoup4 != 4.2.0, both earlier" + " and later releases will work.") else: if not _HAS_LXML: raise ImportError("lxml not found please install it") return _valid_parsers[flavor] -def _validate_parser_flavor(flavor): +def _print_as_set(s): + return '{%s}' % ', '.join([com.pprint_thing(el) for el in s]) + + +def _validate_flavor(flavor): if flavor is None: - flavor = ['lxml', 'bs4'] - elif isinstance(flavor, compat.string_types): - flavor = [flavor] + flavor = 'lxml', 'bs4' + elif isinstance(flavor, string_types): + flavor = flavor, elif isinstance(flavor, collections.Iterable): - if not all(isinstance(flav, compat.string_types) for flav in flavor): - raise TypeError('{0} is not an iterable of strings'.format(flavor)) + if not all(isinstance(flav, string_types) for flav in flavor): + raise TypeError('Object of type %r is not an iterable of strings' % + type(flavor).__name__) else: - raise TypeError('{0} is not a valid "flavor"'.format(flavor)) - - flavor = list(flavor) - valid_flavors = list(_valid_parsers.keys()) - - if not set(flavor) & set(valid_flavors): - raise ValueError('{0} is not a valid set of flavors, valid flavors are' - ' {1}'.format(flavor, valid_flavors)) + fmt = '{0!r}' if isinstance(flavor, string_types) else '{0}' + fmt += ' is not a valid flavor' + raise ValueError(fmt.format(flavor)) + + flavor = tuple(flavor) + valid_flavors = set(_valid_parsers) + flavor_set = set(flavor) + + if not flavor_set & valid_flavors: + raise ValueError('%s is not a valid set of flavors, valid flavors are ' + '%s' % (_print_as_set(flavor_set), + _print_as_set(valid_flavors))) return flavor -def _parse(flavor, io, match, header, index_col, skiprows, infer_types, attrs): - # bonus: re.compile is idempotent under function iteration so you can pass - # a compiled regex to it and it will return itself - flavor = _validate_parser_flavor(flavor) - compiled_match = re.compile(match) +def _parse(flavor, io, match, header, index_col, skiprows, infer_types, + parse_dates, tupleize_cols, thousands, attrs): + flavor = _validate_flavor(flavor) + compiled_match = re.compile(match) # you can pass a compiled regex here - # ugly hack because python 3 DELETES the exception variable! + # hack around python 3 deleting the exception variable retained = None for flav in flavor: parser = _parser_dispatch(flav) @@ -769,25 +705,26 @@ def _parse(flavor, io, match, header, index_col, skiprows, infer_types, attrs): else: break else: - raise retained + raise_with_traceback(retained) - return [_data_to_frame(table, header, index_col, infer_types, skiprows) + return [_data_to_frame(table, header, index_col, skiprows, infer_types, + parse_dates, tupleize_cols, thousands) for table in tables] def read_html(io, match='.+', flavor=None, header=None, index_col=None, - skiprows=None, infer_types=True, attrs=None): - r"""Read an HTML table into a DataFrame. + skiprows=None, infer_types=None, attrs=None, parse_dates=False, + tupleize_cols=False, thousands=','): + r"""Read HTML tables into a ``list`` of ``DataFrame`` objects. Parameters ---------- io : str or file-like - A string or file like object that can be either a url, a file-like - object, or a raw string containing HTML. Note that lxml only accepts - the http, ftp and file url protocols. If you have a URI that starts - with ``'https'`` you might removing the ``'s'``. + A URL, a file-like object, or a raw string containing HTML. Note that + lxml only accepts the http, ftp and file url protocols. 
If you have a + URL that starts with ``'https'`` you might try removing the ``'s'``. - match : str or regex, optional, default '.+' + match : str or compiled regular expression, optional The set of tables containing text matching this regex or string will be returned. Unless the HTML is extremely simple you will probably need to pass a non-empty string here. Defaults to '.+' (match any non-empty @@ -795,44 +732,30 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, This value is converted to a regular expression so that there is consistent behavior between Beautiful Soup and lxml. - flavor : str, container of strings, default ``None`` - The parsing engine to use under the hood. 'bs4' and 'html5lib' are - synonymous with each other, they are both there for backwards - compatibility. The default of ``None`` tries to use ``lxml`` to parse - and if that fails it falls back on ``bs4`` + ``html5lib``. + flavor : str or None, container of strings + The parsing engine to use. 'bs4' and 'html5lib' are synonymous with + each other, they are both there for backwards compatibility. The + default of ``None`` tries to use ``lxml`` to parse and if that fails it + falls back on ``bs4`` + ``html5lib``. - header : int or array-like or None, optional, default ``None`` - The row (or rows for a MultiIndex) to use to make the columns headers. - Note that this row will be removed from the data. + header : int or list-like or None, optional + The row (or list of rows for a :class:`~pandas.MultiIndex`) to use to + make the columns headers. - index_col : int or array-like or None, optional, default ``None`` - The column to use to make the index. Note that this column will be - removed from the data. + index_col : int or list-like or None, optional + The column (or list of columns) to use to create the index. - skiprows : int or collections.Container or slice or None, optional, default ``None`` - If an integer is given then skip this many rows after parsing the - column header. If a sequence of integers is given skip those specific - rows (0-based). Note that + skiprows : int or list-like or slice or None, optional + 0-based. Number of rows to skip after parsing the column header. If a + sequence of integers or a slice is given, will skip the rows indexed by + that sequence. Note that a single element sequence means 'skip the nth + row' whereas an integer means 'skip n rows'. - .. code-block:: python - - skiprows == 0 - - yields the same result as - - .. code-block:: python + infer_types : bool, optional + This option is deprecated in 0.13, and will have no effect in 0.14. It + defaults to ``True``. - skiprows is None - - If `skiprows` is a positive integer, say :math:`n`, then - it is treated as "skip :math:`n` rows", *not* as "skip the - :math:`n^\textrm{th}` row". - - infer_types : bool, optional, default ``True`` - Whether to convert numeric types and date-appearing strings to numbers - and dates, respectively. - - attrs : dict or None, optional, default ``None`` + attrs : dict or None, optional This is a dictionary of attributes that you can pass to use to identify the table in the HTML. These are not checked for validity before being passed to lxml or Beautiful Soup. However, these attributes must be @@ -858,33 +781,38 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, <http://www.w3.org/TR/html-markup/table.html>`__. It contains the latest information on table attributes for the modern web. + parse_dates : bool, optional + See :func:`~pandas.read_csv` for details.
+ + tupleize_cols : bool, optional + If ``False`` try to parse multiple header rows into a + :class:`~pandas.MultiIndex`, otherwise return raw tuples. Defaults to + ``False``. + + thousands : str, optional + Separator to use to parse thousands. Defaults to ``','``. + Returns ------- dfs : list of DataFrames - A list of DataFrames, each of which is the parsed data from each of the - tables on the page. Notes ----- - Before using this function you should probably read the :ref:`gotchas about - the parser libraries that this function uses <html-gotchas>`. + Before using this function you should read the :ref:`gotchas about the + HTML parsing libraries <html-gotchas>`. - There's as little cleaning of the data as possible due to the heterogeneity - and general disorder of HTML on the web. + Expect to do some cleanup after you call this function. For example, you + might need to manually assign column names if the column names are + converted to NaN when you pass the `header=0` argument. We try to assume as + little as possible about the structure of the table and push the + idiosyncrasies of the HTML contained in the table to the user. - Expect some cleanup after you call this function. For example, - you might need to pass `infer_types=False` and perform manual conversion if - the column names are converted to NaN when you pass the `header=0` - argument. We try to assume as little as possible about the structure of the - table and push the idiosyncrasies of the HTML contained in the table to - you, the user. + This function searches for ``<table>`` elements and only for ``<tr>``
+ and ``<th>`` rows and ``<td>`` elements within each ``<tr>`` or ``<th>``
+ element in the table. ``<td>`` stands for "table data". - This function only searches for <table> elements and only for <tr> and
- <th> rows and <td> elements within those rows. This could be extended by - subclassing one of the parser classes contained in :mod:`pandas.io.html`. - - Similar to :func:`read_csv` the `header` argument is applied **after** - `skiprows` is applied. + Similar to :func:`~pandas.read_csv` the `header` argument is applied + **after** `skiprows` is applied. This function will *always* return a list of :class:`DataFrame` *or* it will fail, e.g., it will *not* return an empty list. @@ -892,12 +820,21 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, Examples -------- See the :ref:`read_html documentation in the IO section of the docs - <io.read_html>` for many examples of reading HTML. + <io.read_html>` for some examples of reading in HTML tables. + + See Also + -------- + pandas.read_csv """ + if infer_types is not None: + warnings.warn("infer_types will have no effect in 0.14", FutureWarning) + else: + infer_types = True # TODO: remove in 0.14 + # Type check here. We don't want to parse only to fail because of an # invalid value of an integer skiprows. if isinstance(skiprows, numbers.Integral) and skiprows < 0: - raise AssertionError('cannot skip rows starting from the end of the ' - 'data (you passed a negative value)') + raise ValueError('cannot skip rows starting from the end of the ' + 'data (you passed a negative value)') return _parse(flavor, io, match, header, index_col, skiprows, infer_types, - attrs) + parse_dates, tupleize_cols, thousands, attrs) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 3ef3cbf856fef..8a2f249f6af06 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -606,16 +606,10 @@ def _failover_to_python(self): raise NotImplementedError def read(self, nrows=None): - suppressed_warnings = False if nrows is not None: if self.options.get('skip_footer'): raise ValueError('skip_footer not supported for iteration') - # # XXX hack - # if isinstance(self._engine, CParserWrapper): - # suppressed_warnings = True - # self._engine.set_error_bad_lines(False) - ret = self._engine.read(nrows) if self.options.get('as_recarray'): @@ -710,7 +704,6 @@ def _should_parse_dates(self, i): else: return (j in self.parse_dates) or (name in self.parse_dates) - def _extract_multi_indexer_columns(self, header, index_names, col_names, passed_names=False): """ extract and return the names, index_names, col_names header is a list-of-lists returned from the parsers """ @@ -728,12 +721,10 @@ def _extract_multi_indexer_columns(self, header, index_names, col_names, passed_ ic = [ ic ] sic = set(ic) - orig_header = list(header) - # clean the index_names index_names = header.pop(-1) - (index_names, names, - index_col) = _clean_index_names(index_names, self.index_col) + index_names, names, index_col = _clean_index_names(index_names, + self.index_col) # extract the columns field_count = len(header[0]) @@ -766,7 +757,7 @@ def _maybe_make_multi_index_columns(self, columns, col_names=None): return columns def _make_index(self, data, alldata, columns, indexnamerow=False): - if not _is_index_col(self.index_col) or len(self.index_col) == 0: + if not _is_index_col(self.index_col) or not self.index_col: index = None elif not self._has_complex_date_col: @@ -1430,7 +1421,7 @@ def read(self, rows=None): self._first_chunk = False columns = list(self.orig_names) - if len(content) == 0: # pragma: no cover + if not len(content): # pragma: no cover # DataFrame with the right metadata, even though it's length 0 return _get_empty_meta(self.orig_names, self.index_col, @@ -1468,8 +1459,8 @@ def _convert_data(self,
data): col = self.orig_names[col] clean_conv[col] = f - return self._convert_to_ndarrays(data, self.na_values, self.na_fvalues, self.verbose, - clean_conv) + return self._convert_to_ndarrays(data, self.na_values, self.na_fvalues, + self.verbose, clean_conv) def _infer_columns(self): names = self.names @@ -1478,16 +1469,15 @@ def _infer_columns(self): header = self.header # we have a mi columns, so read and extra line - if isinstance(header,(list,tuple,np.ndarray)): + if isinstance(header, (list, tuple, np.ndarray)): have_mi_columns = True - header = list(header) + [header[-1]+1] + header = list(header) + [header[-1] + 1] else: have_mi_columns = False - header = [ header ] + header = [header] columns = [] for level, hr in enumerate(header): - if len(self.buf) > 0: line = self.buf[0] else: @@ -1521,10 +1511,11 @@ def _infer_columns(self): if names is not None: if len(names) != len(columns[0]): - raise Exception('Number of passed names did not match ' - 'number of header fields in the file') + raise ValueError('Number of passed names did not match ' + 'number of header fields in the file') if len(columns) > 1: - raise Exception('Cannot pass names with multi-index columns') + raise TypeError('Cannot pass names with multi-index ' + 'columns') columns = [ names ] else: diff --git a/pandas/io/tests/data/macau.html b/pandas/io/tests/data/macau.html new file mode 100644 index 0000000000000..be62b3221518d --- /dev/null +++ b/pandas/io/tests/data/macau.html @@ -0,0 +1,3691 @@ + + + + + + + + + + + + + + + +Traffic Statistics - Passengers + + + + +
+
+ + +
+ +
+ + + + + + + + + + + + + + +
+
+ + +
+ +
+
+

Traffic Statistics - Passengers

+ +
+
+
+ + +
+ +
+
+
+
+ + + Traffic Statistics + + + + + +


+ Passengers Figure(2008-2013)

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
  201320122011201020092008
January + + 374,917 + + + 362,379 + + + 301,503 + + + 358,902 + + + 342,323 + + + 420,574 +
February + + 393,152 + + + 312,405 + + + 301,259 + + + 351,654 + + + 297,755 + + + 442,809 +
March + + 408,755 + + + 334,000 + + + 318,908 + + + 360,365 + + + 387,879 + + + 468,540 +
April + + 408,860 + + + 358,198 + + + 339,060 + + + 352,976 + + + 400,553 + + + 492,930 +
May + + 374,397 + + + 329,218 + + + 321,060 + + + 330,407 + + + 335,967 + + + 465,045 +
June + + 401,995 + + + 356,679 + + + 343,006 + + + 326,724 + + + 296,748 + + + 426,764 +
July + + + + + 423,081 + + + 378,993 + + + 356,580 + + + 351,110 + + + 439,425 +
August + + + + + 453,391 + + + 395,883 + + + 364,011 + + + 404,076 + + + 425,814 +
September + + + + + 384,887 + + + 325,124 + + + 308,940 + + + 317,226 + + + 379,898 +
October + + + + + 383,889 + + + 333,102 + + + 317,040 + + + 355,935 + + + 415,339 +
November + + + + + 379,065 + + + 327,803 + + + 303,186 + + + 372,104 + + + 366,411 +
December + + + + + 413,873 + + + 359,313 + + + 348,051 + + + 388,573 + + + 354,253 +
Total + + 2,362,076 + + + 4,491,065 + + + 4,045,014 + + + 4,078,836 + + + 4,250,249 + + + 5,097,802 +
+ +


+ Passengers Figure(2002-2007)

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
  200720062005200420032002
January + + 381,887 + + + 323,282 + + + 289,701 + + + 288,507 + + + 290,140 + + + 268,783 +
February + + 426,014 + + + 360,820 + + + 348,723 + + + 207,710 + + + 323,264 + + + 323,654 +
March + + 443,805 + + + 389,125 + + + 321,953 + + + 273,910 + + + 295,052 + + + 360,668 +
April + + 500,917 + + + 431,550 + + + 367,976 + + + 324,931 + + + 144,082 + + + 380,648 +
May + + 468,637 + + + 399,743 + + + 359,298 + + + 250,601 + + + 47,333 + + + 359,547 +
June + + 463,676 + + + 393,713 + + + 360,147 + + + 296,000 + + + 94,294 + + + 326,508 +
July + + 490,404 + + + 465,497 + + + 413,131 + + + 365,454 + + + 272,784 + + + 388,061 +
August + + 490,830 + + + 478,474 + + + 409,281 + + + 372,802 + + + 333,840 + + + 384,719 +
September + + 446,594 + + + 412,444 + + + 354,751 + + + 321,456 + + + 295,447 + + + 334,029 +
October + + 465,757 + + + 461,215 + + + 390,435 + + + 358,362 + + + 291,193 + + + 372,706 +
November + + 455,132 + + + 425,116 + + + 323,347 + + + 327,593 + + + 268,282 + + + 350,324 +
December + + 465,225 + + + 435,114 + + + 308,999 + + + 326,933 + + + 249,855 + + + 322,056 +
Total + + 5,498,878 + + + 4,976,093 + + + 4,247,742 + + + 3,714,259 + + + 2,905,566 + + + 4,171,703 +
+ +


+ Passengers Figure(1996-2001)

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
  200120001999199819971996
January + + 265,603 + + + 184,381 + + + 161,264 + + + 161,432 + + + 117,984 + + + +
February + + 249,259 + + + 264,066 + + + 209,569 + + + 168,777 + + + 150,772 + + + +
March + + 312,319 + + + 226,483 + + + 186,965 + + + 172,060 + + + 149,795 + + + +
April + + 351,793 + + + 296,541 + + + 237,449 + + + 180,241 + + + 179,049 + + + +
May + + 338,692 + + + 288,949 + + + 230,691 + + + 172,391 + + + 189,925 + + + +
June + + 332,630 + + + 271,181 + + + 231,328 + + + 157,519 + + + 175,402 + + + +
July + + 344,658 + + + 304,276 + + + 243,534 + + + 205,595 + + + 173,103 + + + +
August + + 360,899 + + + 300,418 + + + 257,616 + + + 241,140 + + + 178,118 + + + +
September + + 291,817 + + + 280,803 + + + 210,885 + + + 183,954 + + + 163,385 + + + +
October + + 327,232 + + + 298,873 + + + 231,251 + + + 205,726 + + + 176,879 + + + +
November + + 315,538 + + + 265,528 + + + 228,637 + + + 181,677 + + + 146,804 + + + +
December + + 314,866 + + + 257,929 + + + 210,922 + + + 183,975 + + + 151,362 + + + +
Total + + 3,805,306 + + + 3,239,428 + + + 2,640,111 + + + 2,214,487 + + + 1,952,578 + + + 0 +
+ +


+ Passengers Figure(1995-1995)

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
  1995
January + + +
February + + +
March + + +
April + + +
May + + +
June + + +
July + + +
August + + +
September + + +
October + + +
November + + 6,601 +
December + + 37,041 +
Total + + 43,642 +
+ + +


+
passenger statistic picture
+


+ + + + +


+ Movement Statistics(2008-2013)

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
  201320122011201020092008
January + + 3,925 + + + 3,463 + + + 3,289 + + + 3,184 + + + 3,488 + + + 4,568 +
February + + 3,632 + + + 2,983 + + + 2,902 + + + 3,053 + + + 3,347 + + + 4,527 +
March + + 3,909 + + + 3,166 + + + 3,217 + + + 3,175 + + + 3,636 + + + 4,594 +
April + + 3,903 + + + 3,258 + + + 3,146 + + + 3,023 + + + 3,709 + + + 4,574 +
May + + 4,075 + + + 3,234 + + + 3,266 + + + 3,033 + + + 3,603 + + + 4,511 +
June + + 4,038 + + + 3,272 + + + 3,316 + + + 2,909 + + + 3,057 + + + 4,081 +
July + + + + + 3,661 + + + 3,359 + + + 3,062 + + + 3,354 + + + 4,215 +
August + + + + + 3,942 + + + 3,417 + + + 3,077 + + + 3,395 + + + 4,139 +
September + + + + + 3,703 + + + 3,169 + + + 3,095 + + + 3,100 + + + 3,752 +
October + + + + + 3,727 + + + 3,469 + + + 3,179 + + + 3,375 + + + 3,874 +
November + + + + + 3,722 + + + 3,145 + + + 3,159 + + + 3,213 + + + 3,567 +
December + + + + + 3,866 + + + 3,251 + + + 3,199 + + + 3,324 + + + 3,362 +
Total + + 23,482 + + + 41,997 + + + 38,946 + + + 37,148 + + + 40,601 + + + 49,764 +
+ +


+ Movement Statistics(2002-2007)

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
  200720062005200420032002
January + + 4,384 + + + 3,933 + + + 3,528 + + + 3,051 + + + 3,257 + + + 2,711 +
February + + 4,131 + + + 3,667 + + + 3,331 + + + 2,372 + + + 3,003 + + + 2,747 +
March + + 4,349 + + + 4,345 + + + 3,549 + + + 3,049 + + + 3,109 + + + 2,985 +
April + + 4,460 + + + 4,490 + + + 3,832 + + + 3,359 + + + 2,033 + + + 2,928 +
May + + 4,629 + + + 4,245 + + + 3,663 + + + 3,251 + + + 1,229 + + + 3,109 +
June + + 4,365 + + + 4,124 + + + 3,752 + + + 3,414 + + + 1,217 + + + 3,049 +
July + + 4,612 + + + 4,386 + + + 3,876 + + + 3,664 + + + 2,423 + + + 3,078 +
August + + 4,446 + + + 4,373 + + + 3,987 + + + 3,631 + + + 3,040 + + + 3,166 +
September + + 4,414 + + + 4,311 + + + 3,782 + + + 3,514 + + + 2,809 + + + 3,239 +
October + + 4,445 + + + 4,455 + + + 3,898 + + + 3,744 + + + 3,052 + + + 3,562 +
November + + 4,563 + + + 4,285 + + + 3,951 + + + 3,694 + + + 3,125 + + + 3,546 +
December + + 4,588 + + + 4,435 + + + 3,855 + + + 3,763 + + + 2,996 + + + 3,444 +
Total + + 53,386 + + + 51,049 + + + 45,004 + + + 40,506 + + + 31,293 + + + 37,564 +
+ +


+ Movement Statistics(1996-2001)

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
  200120001999199819971996
January + + 2,694 + + + 2,201 + + + 1,835 + + + 2,177 + + + 1,353 + + + 744 +
February + + 2,364 + + + 2,357 + + + 1,826 + + + 1,740 + + + 1,339 + + + 692 +
March + + 2,543 + + + 2,206 + + + 1,895 + + + 1,911 + + + 1,533 + + + 872 +
April + + 2,531 + + + 2,311 + + + 2,076 + + + 1,886 + + + 1,587 + + + 1,026 +
May + + 2,579 + + + 2,383 + + + 1,914 + + + 2,102 + + + 1,720 + + + 1,115 +
June + + 2,681 + + + 2,370 + + + 1,890 + + + 2,038 + + + 1,716 + + + 1,037 +
July + + 2,903 + + + 2,609 + + + 1,916 + + + 2,078 + + + 1,693 + + + 1,209 +
August + + 3,037 + + + 2,487 + + + 1,968 + + + 2,061 + + + 1,676 + + + 1,241 +
September + + 2,767 + + + 2,329 + + + 1,955 + + + 1,970 + + + 1,681 + + + 1,263 +
October + + 2,922 + + + 2,417 + + + 2,267 + + + 1,969 + + + 1,809 + + + 1,368 +
November + + 2,670 + + + 2,273 + + + 2,132 + + + 2,102 + + + 1,786 + + + 1,433 +
December + + 2,815 + + + 2,749 + + + 2,187 + + + 1,981 + + + 1,944 + + + 1,386 +
Total + + 32,506 + + + 28,692 + + + 23,861 + + + 24,015 + + + 19,837 + + + 13,386 +
+ +


+ Movement Statistics(1995-1995)

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
  1995
January + + +
February + + +
March + + +
April + + +
May + + +
June + + +
July + + +
August + + +
September + + +
October + + +
November + + 126 +
December + + 536 +
Total + + 662 +
+ + +


+
passenger statistic picture
+ + +
+ +
+
+
+ + + +
+
+ +
+ +
+ + + +
+ + + +
+
+ + \ No newline at end of file diff --git a/pandas/io/tests/data/nyse_wsj.html b/pandas/io/tests/data/nyse_wsj.html new file mode 100644 index 0000000000000..aa3d470a5fbc6 --- /dev/null +++ b/pandas/io/tests/data/nyse_wsj.html @@ -0,0 +1,1207 @@ + + + + + + +
+
+
+
+
+ SEARCH +
+
+
+ + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
 Issue(Roll over for charts and headlines) + VolumePriceChg% Chg
1 + J.C. Penney (JCP) + + 250,697,455$9.05-1.37-13.15
2 + Bank of America (BAC) + + 77,162,10313.90-0.18-1.28
3 + Rite Aid (RAD) + + 52,140,3824.70-0.08-1.67
4 + Ford Motor (F) + + 33,745,28717.05-0.22-1.27
5 + Pfizer (PFE) + + 27,801,85328.880.361.26
6 + Hertz Global Hldgs (HTZ) + + 25,821,26422.320.693.19
7 + General Electric (GE) + + 25,142,06424.05-0.20-0.82
8 + Elan ADS (ELN) + + 24,725,20915.590.080.52
9 + JPMorgan Chase (JPM) + + 22,402,75652.240.350.67
10 + Regions Financial (RF) + + 20,790,5329.300.121.31
11 + Violin Memory (VMEM) + + 20,669,8467.02-1.98-22.00
12 + Citigroup (C) + + 19,979,93248.89-0.04-0.08
13 + Nokia ADS (NOK) + + 19,585,0756.660.020.30
14 + Wells Fargo (WFC) + + 19,478,59041.59-0.02-0.05
15 + Vale ADS (VALE) + + 18,781,98715.60-0.52-3.23
16 + Delta Air Lines (DAL) + + 16,013,95623.57-0.44-1.83
17 + EMC (EMC) + + 15,771,25226.07-0.11-0.42
18 + Nike Cl B (NKE) + + 15,514,71773.643.304.69
19 + Alcoa (AA) + + 14,061,0738.20-0.07-0.85
20 + General Motors (GM) + + 13,984,00436.37-0.58-1.57
21 + Oracle (ORCL) + + 13,856,67133.78-0.03-0.09
22 + AT&T (T) + + 13,736,94833.98-0.25-0.73
23 + Trina Solar ADS (TSL) + + 13,284,20214.831.9915.50
24 + Yingli Green Energy Holding ADS (YGE) + + 12,978,3786.730.6310.33
25 + Petroleo Brasileiro ADS (PBR) + + 12,833,66015.40-0.21-1.35
26 + United Continental Holdings (UAL) + + 12,603,22530.91-3.16-9.28
27 + Coca-Cola (KO) + + 12,343,45238.40-0.34-0.88
28 + Arch Coal (ACI) + + 12,261,1384.25-0.28-6.18
29 + Morgan Stanley (MS) + + 11,956,34527.08-0.07-0.26
30 + Pandora Media (P) + + 11,829,96325.520.130.51
31 + Barrick Gold (ABX) + + 11,775,58518.530.000.00
32 + Abbott Laboratories (ABT) + + 11,755,71833.14-0.52-1.54
33 + Banco Santander Brasil ADS (BSBR) + + 11,587,3107.010.467.02
34 + Advanced Micro Devices (AMD) + + 11,337,6093.86-0.03-0.77
35 + Annaly Capital Management (NLY) + + 11,004,44011.63-0.07-0.60
36 + Alpha Natural Resources (ANR) + + 10,941,0746.08-0.19-3.03
37 + Exxon Mobil (XOM) + + 10,668,11586.90-0.17-0.20
38 + Itau Unibanco Holding ADS (ITUB) + + 10,638,80314.300.231.63
39 + Merck&Co (MRK) + + 10,388,15247.790.110.23
40 + Alcatel-Lucent ADS (ALU) + + 10,181,8333.650.010.27
41 + Verizon Communications (VZ) + + 10,139,32147.00-0.67-1.41
42 + Magnum Hunter Resources (MHR) + + 10,004,3036.330.467.84
43 + Hewlett-Packard (HPQ) + + 9,948,93521.17-0.13-0.61
44 + PulteGroup (PHM) + + 9,899,14116.57-0.41-2.41
45 + ReneSola ADS (SOL) + + 9,667,4384.840.398.76
46 + Corning (GLW) + + 9,547,26514.73-0.21-1.41
47 + Cole Real Estate Investments (COLE) + + 9,544,02112.210.010.08
48 + Dow Chemical (DOW) + + 9,150,47939.02-0.97-2.43
49 + International Game Technology (IGT) + + 9,129,12319.23-1.44-6.97
50 + Accenture Cl A (ACN) + + 8,773,26074.09-1.78-2.35
51 + KeyCorp (KEY) + + 8,599,33311.360.020.18
52 + Bristol-Myers Squibb (BMY) + + 8,440,70946.20-0.73-1.56
53 + Companhia Siderurgica Nacional ADS (SID) + + 8,437,6364.36-0.05-1.13
54 + H&R Block (HRB) + + 8,240,98426.360.311.19
55 + MGIC Investment (MTG) + + 8,135,0377.26-0.10-1.36
56 + RingCentral Cl A (RNG) + + 8,117,46918.205.2040.00
57 + United States Steel (X) + + 8,107,89920.44-0.66-3.13
58 + Cliffs Natural Resources (CLF) + + 8,041,57221.00-0.83-3.80
59 + Newmont Mining (NEM) + + 8,014,25027.98-0.19-0.67
60 + Altria Group (MO) + + 7,786,04834.71-0.29-0.83
61 + SandRidge Energy (SD) + + 7,782,7455.93-0.06-1.00
62 + Molycorp (MCP) + + 7,735,8316.73-0.45-6.27
63 + Halliburton (HAL) + + 7,728,73548.39-0.32-0.66
64 + Taiwan Semiconductor Manufacturing ADS (TSM) + + 7,661,39717.07-0.25-1.44
65 + Freeport-McMoRan Copper&Gold (FCX) + + 7,622,80333.42-0.45-1.33
66 + Kodiak Oil&Gas (KOG) + + 7,543,80611.940.161.36
67 + Xerox (XRX) + + 7,440,68910.37-0.01-0.10
68 + Sprint (S) + + 7,291,3516.16-0.14-2.22
69 + Two Harbors Investment (TWO) + + 7,153,8039.790.050.51
70 + Walter Energy (WLT) + + 7,152,19214.19-0.36-2.47
71 + International Paper (IP) + + 7,123,72245.44-1.85-3.91
72 + PPL (PPL) + + 7,026,29230.34-0.13-0.43
73 + Goldcorp (GG) + + 6,857,44725.760.080.31
74 + Time Warner (TWX) + + 6,807,23766.201.332.05
75 + Synovus Financial (SNV) + + 6,764,8053.290.020.61
76 + AK Steel Holding (AKS) + + 6,662,5993.83-0.11-2.79
77 + Boston Scientific (BSX) + + 6,629,08411.52-0.15-1.29
78 + Eldorado Gold (EGO) + + 6,596,9026.65-0.03-0.45
79 + Newpark Resources (NR) + + 6,552,45312.560.090.72
80 + AbbVie (ABBV) + + 6,525,52444.33-0.67-1.49
81 + MBIA (MBI) + + 6,416,58710.38-0.43-3.98
82 + SAIC (SAI) + + 6,404,58716.030.130.82
83 + Procter&Gamble (PG) + + 6,389,14377.21-0.84-1.08
84 + IAMGOLD (IAG) + + 6,293,0014.77-0.06-1.24
85 + Safeway (SWY) + + 6,268,18432.25-0.29-0.89
86 + Kinross Gold (KGC) + + 6,112,6584.99-0.03-0.60
87 + MGM Resorts International (MGM) + + 5,986,14320.22-0.05-0.25
88 + Cemex ADS (CX) + + 5,907,04011.27-0.06-0.53
89 + American International Group (AIG) + + 5,900,13349.15-0.30-0.61
90 + Chesapeake Energy (CHK) + + 5,848,01626.21-0.20-0.76
91 + RadioShack (RSH) + + 5,837,8333.44-0.43-11.11
92 + U.S. Bancorp (USB) + + 5,814,37336.50-0.04-0.11
93 + Eli Lilly (LLY) + + 5,776,99150.50-0.54-1.06
94 + MetLife (MET) + + 5,774,99647.21-0.37-0.78
95 + Yamana Gold (AUY) + + 5,742,42610.370.030.29
96 + CBS Cl B (CBS) + + 5,718,85855.50-0.06-0.11
97 + CSX (CSX) + + 5,710,06625.85-0.13-0.50
98 + Carnival (CCL) + + 5,661,32532.88-0.05-0.15
99 + Mosaic (MOS) + + 5,595,59243.43-0.76-1.72
100 + Walgreen (WAG) + + 5,568,31054.51-0.22-0.40
+ + +
+ + + + + + + + + + + + + + +
An Advertising Feature    PARTNER CENTER
+ + + + + + + + + + + + + + +
+ + +
diff --git a/pandas/io/tests/data/valid_markup.html b/pandas/io/tests/data/valid_markup.html index 5db90da3baec4..0130e9ed9d5f3 100644 --- a/pandas/io/tests/data/valid_markup.html +++ b/pandas/io/tests/data/valid_markup.html @@ -35,35 +35,26 @@
7 0
443
554
645
714
+ + + + + + + + - + - - - + + +
ab
80 6 7
985140
diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py index 44e4b5cfda7b6..9b0fb1cacfb65 100644 --- a/pandas/io/tests/test_html.py +++ b/pandas/io/tests/test_html.py @@ -1,33 +1,31 @@ from __future__ import print_function + import os import re -from unittest import TestCase import warnings +import unittest + +try: + from importlib import import_module +except ImportError: + import_module = __import__ + from distutils.version import LooseVersion -from pandas.io.common import URLError import nose -from nose.tools import assert_raises import numpy as np from numpy.random import rand from numpy.testing.decorators import slow -from pandas.compat import map, zip, StringIO -import pandas.compat as compat - -try: - from importlib import import_module -except ImportError: - import_module = __import__ +from pandas import (DataFrame, MultiIndex, read_csv, Timestamp, Index, + date_range, Series) +from pandas.compat import map, zip, StringIO, string_types +from pandas.io.common import URLError, urlopen from pandas.io.html import read_html -from pandas.io.common import urlopen - -from pandas import DataFrame, MultiIndex, read_csv, Timestamp -from pandas.util.testing import (assert_frame_equal, network, - get_data_path) -from pandas.util.testing import makeCustomDataframe as mkdf +import pandas.util.testing as tm +from pandas.util.testing import makeCustomDataframe as mkdf, network def _have_module(module_name): @@ -40,11 +38,11 @@ def _have_module(module_name): def _skip_if_no(module_name): if not _have_module(module_name): - raise nose.SkipTest("{0} not found".format(module_name)) + raise nose.SkipTest("{0!r} not found".format(module_name)) def _skip_if_none_of(module_names): - if isinstance(module_names, compat.string_types): + if isinstance(module_names, string_types): _skip_if_no(module_names) if module_names == 'bs4': import bs4 @@ -54,17 +52,14 @@ def _skip_if_none_of(module_names): not_found = [module_name for module_name in module_names if not _have_module(module_name)] if set(not_found) & set(module_names): - raise nose.SkipTest("{0} not found".format(not_found)) + raise nose.SkipTest("{0!r} not found".format(not_found)) if 'bs4' in module_names: import bs4 if bs4.__version__ == LooseVersion('4.2.0'): raise nose.SkipTest("Bad version of bs4: 4.2.0") -DATA_PATH = get_data_path() - -def isframe(x): - return isinstance(x, DataFrame) +DATA_PATH = tm.get_data_path() def assert_framelist_equal(list1, list2, *args, **kwargs): @@ -72,10 +67,12 @@ def assert_framelist_equal(list1, list2, *args, **kwargs): 'len(list1) == {0}, ' 'len(list2) == {1}'.format(len(list1), len(list2))) - assert all(map(lambda x, y: isframe(x) and isframe(y), list1, list2)), \ - 'not all list elements are DataFrames' + msg = 'not all list elements are DataFrames' + both_frames = all(map(lambda x, y: isinstance(x, DataFrame) and + isinstance(y, DataFrame), list1, list2)) + assert both_frames, msg for frame_i, frame_j in zip(list1, list2): - assert_frame_equal(frame_i, frame_j, *args, **kwargs) + tm.assert_frame_equal(frame_i, frame_j, *args, **kwargs) assert not frame_i.empty, 'frames are both empty' @@ -83,13 +80,13 @@ def test_bs4_version_fails(): _skip_if_none_of(('bs4', 'html5lib')) import bs4 if bs4.__version__ == LooseVersion('4.2.0'): - assert_raises(AssertionError, read_html, os.path.join(DATA_PATH, - "spam.html"), - flavor='bs4') + tm.assert_raises(AssertionError, read_html, os.path.join(DATA_PATH, + "spam.html"), + flavor='bs4') -class TestReadHtmlBase(TestCase): - def run_read_html(self, *args, 
**kwargs): +class TestReadHtml(unittest.TestCase): + def read_html(self, *args, **kwargs): kwargs['flavor'] = kwargs.get('flavor', self.flavor) return read_html(*args, **kwargs) @@ -112,18 +109,16 @@ def test_to_html_compat(self): df = mkdf(4, 3, data_gen_f=lambda *args: rand(), c_idx_names=False, r_idx_names=False).applymap('{0:.3f}'.format).astype(float) out = df.to_html() - res = self.run_read_html(out, attrs={'class': 'dataframe'}, + res = self.read_html(out, attrs={'class': 'dataframe'}, index_col=0)[0] - print(df.dtypes) - print(res.dtypes) - assert_frame_equal(res, df) + tm.assert_frame_equal(res, df) @network def test_banklist_url(self): url = 'http://www.fdic.gov/bank/individual/failed/banklist.html' - df1 = self.run_read_html(url, 'First Federal Bank of Florida', + df1 = self.read_html(url, 'First Federal Bank of Florida', attrs={"id": 'table'}) - df2 = self.run_read_html(url, 'Metcalf Bank', attrs={'id': 'table'}) + df2 = self.read_html(url, 'Metcalf Bank', attrs={'id': 'table'}) assert_framelist_equal(df1, df2) @@ -131,133 +126,148 @@ def test_banklist_url(self): def test_spam_url(self): url = ('http://ndb.nal.usda.gov/ndb/foods/show/1732?fg=&man=&' 'lfacet=&format=&count=&max=25&offset=&sort=&qlookup=spam') - df1 = self.run_read_html(url, '.*Water.*') - df2 = self.run_read_html(url, 'Unit') + df1 = self.read_html(url, '.*Water.*') + df2 = self.read_html(url, 'Unit') assert_framelist_equal(df1, df2) @slow def test_banklist(self): - df1 = self.run_read_html(self.banklist_data, '.*Florida.*', + df1 = self.read_html(self.banklist_data, '.*Florida.*', attrs={'id': 'table'}) - df2 = self.run_read_html(self.banklist_data, 'Metcalf Bank', + df2 = self.read_html(self.banklist_data, 'Metcalf Bank', attrs={'id': 'table'}) assert_framelist_equal(df1, df2) - def test_spam(self): - df1 = self.run_read_html(self.spam_data, '.*Water.*', - infer_types=False) - df2 = self.run_read_html(self.spam_data, 'Unit', infer_types=False) + def test_spam_no_types(self): + with tm.assert_produces_warning(FutureWarning): + df1 = self.read_html(self.spam_data, '.*Water.*', + infer_types=False) + with tm.assert_produces_warning(FutureWarning): + df2 = self.read_html(self.spam_data, 'Unit', infer_types=False) assert_framelist_equal(df1, df2) - print(df1[0]) + + self.assertEqual(df1[0].ix[0, 0], 'Proximates') + self.assertEqual(df1[0].columns[0], 'Nutrient') + + def test_spam_with_types(self): + df1 = self.read_html(self.spam_data, '.*Water.*') + df2 = self.read_html(self.spam_data, 'Unit') + assert_framelist_equal(df1, df2) self.assertEqual(df1[0].ix[0, 0], 'Proximates') self.assertEqual(df1[0].columns[0], 'Nutrient') def test_spam_no_match(self): - dfs = self.run_read_html(self.spam_data) + dfs = self.read_html(self.spam_data) for df in dfs: - self.assert_(isinstance(df, DataFrame)) + tm.assert_isinstance(df, DataFrame) def test_banklist_no_match(self): - dfs = self.run_read_html(self.banklist_data, attrs={'id': 'table'}) + dfs = self.read_html(self.banklist_data, attrs={'id': 'table'}) for df in dfs: - self.assert_(isinstance(df, DataFrame)) + tm.assert_isinstance(df, DataFrame) def test_spam_header(self): - df = self.run_read_html(self.spam_data, '.*Water.*', header=0) - df = self.run_read_html(self.spam_data, '.*Water.*', header=1)[0] - self.assertEqual(df.columns[0], 'Water') + df = self.read_html(self.spam_data, '.*Water.*', header=1)[0] + self.assertEqual(df.columns[0], 'Proximates') self.assertFalse(df.empty) def test_skiprows_int(self): - df1 = self.run_read_html(self.spam_data, '.*Water.*', 
-                                 skiprows=1)
-        df2 = self.run_read_html(self.spam_data, 'Unit', skiprows=1)
+        df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=1)
+        df2 = self.read_html(self.spam_data, 'Unit', skiprows=1)
         assert_framelist_equal(df1, df2)
 
     def test_skiprows_xrange(self):
-        df1 = [self.run_read_html(self.spam_data, '.*Water.*').pop()[2:]]
-        df2 = self.run_read_html(self.spam_data, 'Unit', skiprows=range(2))
-
-        assert_framelist_equal(df1, df2)
+        df1 = self.read_html(self.spam_data, '.*Water.*',
+                             skiprows=range(2))[0]
+        df2 = self.read_html(self.spam_data, 'Unit', skiprows=range(2))[0]
+        tm.assert_frame_equal(df1, df2)
 
     def test_skiprows_list(self):
-        df1 = self.run_read_html(self.spam_data, '.*Water.*', skiprows=[1, 2])
-        df2 = self.run_read_html(self.spam_data, 'Unit', skiprows=[2, 1])
+        df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=[1, 2])
+        df2 = self.read_html(self.spam_data, 'Unit', skiprows=[2, 1])
 
         assert_framelist_equal(df1, df2)
 
     def test_skiprows_set(self):
-        df1 = self.run_read_html(self.spam_data, '.*Water.*',
+        df1 = self.read_html(self.spam_data, '.*Water.*',
                                  skiprows=set([1, 2]))
-        df2 = self.run_read_html(self.spam_data, 'Unit', skiprows=set([2, 1]))
+        df2 = self.read_html(self.spam_data, 'Unit', skiprows=set([2, 1]))
 
         assert_framelist_equal(df1, df2)
 
     def test_skiprows_slice(self):
-        df1 = self.run_read_html(self.spam_data, '.*Water.*', skiprows=1)
-        df2 = self.run_read_html(self.spam_data, 'Unit', skiprows=1)
+        df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=1)
+        df2 = self.read_html(self.spam_data, 'Unit', skiprows=1)
 
         assert_framelist_equal(df1, df2)
 
     def test_skiprows_slice_short(self):
-        df1 = self.run_read_html(self.spam_data, '.*Water.*',
+        df1 = self.read_html(self.spam_data, '.*Water.*',
                                  skiprows=slice(2))
-        df2 = self.run_read_html(self.spam_data, 'Unit', skiprows=slice(2))
+        df2 = self.read_html(self.spam_data, 'Unit', skiprows=slice(2))
 
         assert_framelist_equal(df1, df2)
 
     def test_skiprows_slice_long(self):
-        df1 = self.run_read_html(self.spam_data, '.*Water.*',
+        df1 = self.read_html(self.spam_data, '.*Water.*',
                                  skiprows=slice(2, 5))
-        df2 = self.run_read_html(self.spam_data, 'Unit',
+        df2 = self.read_html(self.spam_data, 'Unit',
                                  skiprows=slice(4, 1, -1))
 
         assert_framelist_equal(df1, df2)
 
     def test_skiprows_ndarray(self):
-        df1 = self.run_read_html(self.spam_data, '.*Water.*',
+        df1 = self.read_html(self.spam_data, '.*Water.*',
                                  skiprows=np.arange(2))
-        df2 = self.run_read_html(self.spam_data, 'Unit', skiprows=np.arange(2))
+        df2 = self.read_html(self.spam_data, 'Unit', skiprows=np.arange(2))
 
         assert_framelist_equal(df1, df2)
 
     def test_skiprows_invalid(self):
-        self.assertRaises(ValueError, self.run_read_html, self.spam_data,
-                          '.*Water.*', skiprows='asdf')
+        with tm.assertRaisesRegexp(TypeError,
+                                   'is not a valid type for skipping rows'):
+            self.read_html(self.spam_data, '.*Water.*', skiprows='asdf')
 
     def test_index(self):
-        df1 = self.run_read_html(self.spam_data, '.*Water.*', index_col=0)
-        df2 = self.run_read_html(self.spam_data, 'Unit', index_col=0)
+        df1 = self.read_html(self.spam_data, '.*Water.*', index_col=0)
+        df2 = self.read_html(self.spam_data, 'Unit', index_col=0)
         assert_framelist_equal(df1, df2)
 
     def test_header_and_index_no_types(self):
-        df1 = self.run_read_html(self.spam_data, '.*Water.*', header=1,
-                                 index_col=0, infer_types=False)
-        df2 = self.run_read_html(self.spam_data, 'Unit', header=1, index_col=0,
-                                 infer_types=False)
+        with tm.assert_produces_warning(FutureWarning):
+            df1 = self.read_html(self.spam_data, '.*Water.*', header=1,
+                                 index_col=0, infer_types=False)
+        with tm.assert_produces_warning(FutureWarning):
+            df2 = self.read_html(self.spam_data, 'Unit', header=1,
+                                 index_col=0, infer_types=False)
         assert_framelist_equal(df1, df2)
 
     def test_header_and_index_with_types(self):
-        df1 = self.run_read_html(self.spam_data, '.*Water.*', header=1,
+        df1 = self.read_html(self.spam_data, '.*Water.*', header=1,
                                  index_col=0)
-        df2 = self.run_read_html(self.spam_data, 'Unit', header=1, index_col=0)
+        df2 = self.read_html(self.spam_data, 'Unit', header=1, index_col=0)
         assert_framelist_equal(df1, df2)
 
     def test_infer_types(self):
-        df1 = self.run_read_html(self.spam_data, '.*Water.*', index_col=0,
-                                 infer_types=False)
-        df2 = self.run_read_html(self.spam_data, 'Unit', index_col=0,
-                                 infer_types=False)
+        with tm.assert_produces_warning(FutureWarning):
+            df1 = self.read_html(self.spam_data, '.*Water.*', index_col=0,
+                                 infer_types=False)
+        with tm.assert_produces_warning(FutureWarning):
+            df2 = self.read_html(self.spam_data, 'Unit', index_col=0,
+                                 infer_types=False)
         assert_framelist_equal(df1, df2)
 
-        df2 = self.run_read_html(self.spam_data, 'Unit', index_col=0,
-                                 infer_types=True)
+        with tm.assert_produces_warning(FutureWarning):
+            df2 = self.read_html(self.spam_data, 'Unit', index_col=0,
+                                 infer_types=True)
 
-        self.assertRaises(AssertionError, assert_framelist_equal, df1, df2)
+        with tm.assertRaises(AssertionError):
+            assert_framelist_equal(df1, df2)
 
     def test_string_io(self):
         with open(self.spam_data) as f:
@@ -266,129 +276,197 @@ def test_string_io(self):
         with open(self.spam_data) as f:
             data2 = StringIO(f.read())
 
-        df1 = self.run_read_html(data1, '.*Water.*', infer_types=False)
-        df2 = self.run_read_html(data2, 'Unit', infer_types=False)
+        df1 = self.read_html(data1, '.*Water.*')
+        df2 = self.read_html(data2, 'Unit')
         assert_framelist_equal(df1, df2)
 
     def test_string(self):
         with open(self.spam_data) as f:
             data = f.read()
 
-        df1 = self.run_read_html(data, '.*Water.*', infer_types=False)
-        df2 = self.run_read_html(data, 'Unit', infer_types=False)
+        df1 = self.read_html(data, '.*Water.*')
+        df2 = self.read_html(data, 'Unit')
 
         assert_framelist_equal(df1, df2)
 
     def test_file_like(self):
         with open(self.spam_data) as f:
-            df1 = self.run_read_html(f, '.*Water.*', infer_types=False)
+            df1 = self.read_html(f, '.*Water.*')
 
         with open(self.spam_data) as f:
-            df2 = self.run_read_html(f, 'Unit', infer_types=False)
+            df2 = self.read_html(f, 'Unit')
 
         assert_framelist_equal(df1, df2)
 
     @network
     def test_bad_url_protocol(self):
-        self.assertRaises(URLError, self.run_read_html,
-                          'git://github.com', '.*Water.*')
+        with tm.assertRaises(URLError):
+            self.read_html('git://github.com', match='.*Water.*')
 
     @network
     def test_invalid_url(self):
-        self.assertRaises(URLError, self.run_read_html,
-                          'http://www.a23950sdfa908sd.com')
+        with tm.assertRaises(URLError):
+            self.read_html('http://www.a23950sdfa908sd.com', match='.*Water.*')
 
     @slow
     def test_file_url(self):
         url = self.banklist_data
-        dfs = self.run_read_html('file://' + url, 'First',
-                                 attrs={'id': 'table'})
-        self.assert_(isinstance(dfs, list))
+        dfs = self.read_html('file://' + url, 'First', attrs={'id': 'table'})
+        tm.assert_isinstance(dfs, list)
         for df in dfs:
-            self.assert_(isinstance(df, DataFrame))
+            tm.assert_isinstance(df, DataFrame)
 
     @slow
     def test_invalid_table_attrs(self):
         url = self.banklist_data
-        self.assertRaises(AssertionError, self.run_read_html, url,
-                          'First Federal Bank of Florida',
-                          attrs={'id': 'tasdfable'})
+        with tm.assertRaisesRegexp(ValueError, 'No tables found'):
+            self.read_html(url, 'First Federal Bank of Florida',
+                           attrs={'id': 'tasdfable'})
 
     def _bank_data(self, *args, **kwargs):
-        return self.run_read_html(self.banklist_data, 'Metcalf',
-                                  attrs={'id': 'table'}, *args, **kwargs)
+        return self.read_html(self.banklist_data, 'Metcalf',
+                              attrs={'id': 'table'}, *args, **kwargs)
 
     @slow
     def test_multiindex_header(self):
         df = self._bank_data(header=[0, 1])[0]
-        self.assert_(isinstance(df.columns, MultiIndex))
+        tm.assert_isinstance(df.columns, MultiIndex)
 
     @slow
     def test_multiindex_index(self):
         df = self._bank_data(index_col=[0, 1])[0]
-        self.assert_(isinstance(df.index, MultiIndex))
+        tm.assert_isinstance(df.index, MultiIndex)
 
     @slow
     def test_multiindex_header_index(self):
         df = self._bank_data(header=[0, 1], index_col=[0, 1])[0]
-        self.assert_(isinstance(df.columns, MultiIndex))
-        self.assert_(isinstance(df.index, MultiIndex))
+        tm.assert_isinstance(df.columns, MultiIndex)
+        tm.assert_isinstance(df.index, MultiIndex)
+
+    @slow
+    def test_multiindex_header_skiprows_tuples(self):
+        df = self._bank_data(header=[0, 1], skiprows=1, tupleize_cols=True)[0]
+        tm.assert_isinstance(df.columns, Index)
 
     @slow
     def test_multiindex_header_skiprows(self):
         df = self._bank_data(header=[0, 1], skiprows=1)[0]
-        self.assert_(isinstance(df.columns, MultiIndex))
+        tm.assert_isinstance(df.columns, MultiIndex)
 
     @slow
     def test_multiindex_header_index_skiprows(self):
         df = self._bank_data(header=[0, 1], index_col=[0, 1], skiprows=1)[0]
-        self.assert_(isinstance(df.index, MultiIndex))
+        tm.assert_isinstance(df.index, MultiIndex)
+        tm.assert_isinstance(df.columns, MultiIndex)
 
     @slow
     def test_regex_idempotency(self):
         url = self.banklist_data
-        dfs = self.run_read_html('file://' + url,
+        dfs = self.read_html('file://' + url,
                              match=re.compile(re.compile('Florida')),
                              attrs={'id': 'table'})
-        self.assert_(isinstance(dfs, list))
+        tm.assert_isinstance(dfs, list)
         for df in dfs:
-            self.assert_(isinstance(df, DataFrame))
-
-    def test_negative_skiprows_spam(self):
-        url = self.spam_data
-        self.assertRaises(AssertionError, self.run_read_html, url, 'Water',
-                          skiprows=-1)
+            tm.assert_isinstance(df, DataFrame)
 
-    def test_negative_skiprows_banklist(self):
-        url = self.banklist_data
-        self.assertRaises(AssertionError, self.run_read_html, url, 'Florida',
-                          skiprows=-1)
+    def test_negative_skiprows(self):
+        with tm.assertRaisesRegexp(ValueError,
+                                   '\(you passed a negative value\)'):
+            self.read_html(self.spam_data, 'Water', skiprows=-1)
 
     @network
     def test_multiple_matches(self):
         url = 'http://code.google.com/p/pythonxy/wiki/StandardPlugins'
-        dfs = self.run_read_html(url, match='Python',
+        dfs = self.read_html(url, match='Python',
                              attrs={'class': 'wikitable'})
         self.assert_(len(dfs) > 1)
 
     @network
     def test_pythonxy_plugins_table(self):
         url = 'http://code.google.com/p/pythonxy/wiki/StandardPlugins'
-        dfs = self.run_read_html(url, match='Python',
+        dfs = self.read_html(url, match='Python',
                              attrs={'class': 'wikitable'})
         zz = [df.iloc[0, 0] for df in dfs]
         self.assertEqual(sorted(zz), sorted(['Python', 'SciTE']))
 
+    @slow
+    def test_thousands_macau_stats(self):
+        all_non_nan_table_index = -2
+        macau_data = os.path.join(DATA_PATH, 'macau.html')
+        dfs = self.read_html(macau_data, index_col=0,
+                             attrs={'class': 'style1'})
+        df = dfs[all_non_nan_table_index]
+
+        self.assertFalse(any(s.isnull().any() for _, s in df.iteritems()))
+
+    @slow
+    def test_thousands_macau_index_col(self):
+        all_non_nan_table_index = -2
+        macau_data = os.path.join(DATA_PATH, 'macau.html')
+        dfs = self.read_html(macau_data, index_col=0, header=0)
+        df = dfs[all_non_nan_table_index]
+
+        self.assertFalse(any(s.isnull().any() for _, s in df.iteritems()))
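+
+    # The two Macau tests rely on the new thousands-separator handling
+    # (thousands=',' is read_html's default): the comma-grouped figures in
+    # macau.html should come back as parsed numbers, so the selected table
+    # contains no NaN cells.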
+
+    def test_countries_municipalities(self):
+        # GH5048
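+        # data1 wraps its header row in <thead>; data2 has no <thead>, so it
+        # needs an explicit header=0. Both documents describe the same table
+        # and should parse to identical frames.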
+        data1 = StringIO('''<table>
+            <thead>
+                <tr>
+                    <th>Country</th>
+                    <th>Municipality</th>
+                    <th>Year</th>
+                </tr>
+            </thead>
+            <tbody>
+                <tr>
+                    <td>Ukraine</td>
+                    <th>Odessa</th>
+                    <td>1944</td>
+                </tr>
+            </tbody>
+        </table>''')
+        data2 = StringIO('''
+        <table>
+            <tbody>
+                <tr>
+                    <th>Country</th>
+                    <th>Municipality</th>
+                    <th>Year</th>
+                </tr>
+                <tr>
+                    <td>Ukraine</td>
+                    <th>Odessa</th>
+                    <td>1944</td>
+                </tr>
+            </tbody>
+        </table>''')
+        res1 = self.read_html(data1)
+        res2 = self.read_html(data2, header=0)
+        assert_framelist_equal(res1, res2)
+
+    def test_nyse_wsj_commas_table(self):
+        data = os.path.join(DATA_PATH, 'nyse_wsj.html')
+        df = self.read_html(data, index_col=0, header=0,
+                            attrs={'class': 'mdcTable'})[0]
+
+        columns = Index(['Issue(Roll over for charts and headlines)',
+                         'Volume', 'Price', 'Chg', '% Chg'])
+        nrows = 100
+        self.assertEqual(df.shape[0], nrows)
+        self.assertTrue(df.columns.equals(columns))
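+        # (the Volume figures in nyse_wsj.html are comma-grouped; GH5029 is
+        # about inferring those cells as numbers rather than strings, so all
+        # 100 rows should survive type inference)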
+
     @slow
     def test_banklist_header(self):
         from pandas.io.html import _remove_whitespace
+
         def try_remove_ws(x):
             try:
                 return _remove_whitespace(x)
             except AttributeError:
                 return x
 
-        df = self.run_read_html(self.banklist_data, 'Metcalf',
+        df = self.read_html(self.banklist_data, 'Metcalf',
                             attrs={'id': 'table'})[0]
         ground_truth = read_csv(os.path.join(DATA_PATH, 'banklist.csv'),
                                 converters={'Updated Date': Timestamp,
@@ -412,8 +490,8 @@ def try_remove_ws(x):
         dfnew = df.applymap(try_remove_ws).replace(old, new)
         gtnew = ground_truth.applymap(try_remove_ws)
         converted = dfnew.convert_objects(convert_numeric=True)
-        assert_frame_equal(converted.convert_objects(convert_dates='coerce'),
-                           gtnew)
+        tm.assert_frame_equal(converted.convert_objects(convert_dates='coerce'),
+                              gtnew)
 
     @slow
     def test_gold_canyon(self):
@@ -422,13 +500,93 @@ def test_gold_canyon(self):
             raw_text = f.read()
 
         self.assert_(gc in raw_text)
-        df = self.run_read_html(self.banklist_data, 'Gold Canyon',
-                                attrs={'id': 'table'}, infer_types=False)[0]
+        df = self.read_html(self.banklist_data, 'Gold Canyon',
+                            attrs={'id': 'table'})[0]
         self.assert_(gc in df.to_string())
+
+    def test_different_number_of_rows(self):
+        expected = """<table border="1" class="dataframe">
+                        <thead>
+                            <tr style="text-align: right;">
+                            <th></th>
+                            <th>C_l0_g0</th>
+                            <th>C_l0_g1</th>
+                            <th>C_l0_g2</th>
+                            <th>C_l0_g3</th>
+                            <th>C_l0_g4</th>
+                            </tr>
+                        </thead>
+                        <tbody>
+                            <tr>
+                            <th>R_l0_g0</th>
+                            <td> 0.763</td>
+                            <td> 0.233</td>
+                            <td> nan</td>
+                            <td> nan</td>
+                            <td> nan</td>
+                            </tr>
+                            <tr>
+                            <th>R_l0_g1</th>
+                            <td> 0.244</td>
+                            <td> 0.285</td>
+                            <td> 0.392</td>
+                            <td> 0.137</td>
+                            <td> 0.222</td>
+                            </tr>
+                        </tbody>
+                    </table>"""
+        out = """<table border="1" class="dataframe">
+                    <thead>
+                        <tr style="text-align: right;">
+                        <th></th>
+                        <th>C_l0_g0</th>
+                        <th>C_l0_g1</th>
+                        <th>C_l0_g2</th>
+                        <th>C_l0_g3</th>
+                        <th>C_l0_g4</th>
+                        </tr>
+                    </thead>
+                    <tbody>
+                        <tr>
+                        <th>R_l0_g0</th>
+                        <td> 0.763</td>
+                        <td> 0.233</td>
+                        </tr>
+                        <tr>
+                        <th>R_l0_g1</th>
+                        <td> 0.244</td>
+                        <td> 0.285</td>
+                        <td> 0.392</td>
+                        <td> 0.137</td>
+                        <td> 0.222</td>
+                        </tr>
+                    </tbody>
+                </table>"""
+        expected = self.read_html(expected, index_col=0)[0]
+        res = self.read_html(out, index_col=0)[0]
+        tm.assert_frame_equal(expected, res)
+
+    def test_parse_dates_list(self):
+        df = DataFrame({'date': date_range('1/1/2001', periods=10)})
+        expected = df.to_html()
+        res = read_html(expected, parse_dates=[0], index_col=0)
+        tm.assert_frame_equal(df, res[0])
+
+    def test_parse_dates_combine(self):
+        raw_dates = Series(date_range('1/1/2001', periods=10))
+        df = DataFrame({'date': raw_dates.map(lambda x: str(x.date())),
+                        'time': raw_dates.map(lambda x: str(x.time()))})
+        res = read_html(df.to_html(), parse_dates={'datetime': [1, 2]},
+                        index_col=1)
+        newdf = DataFrame({'datetime': raw_dates})
+        tm.assert_frame_equal(newdf, res[0])
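+        # the dict form of parse_dates mirrors read_csv semantics: columns
+        # 1 and 2 are combined into a single column named 'datetime'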
""" + expected = self.read_html(expected, index_col=0)[0] + res = self.read_html(out, index_col=0)[0] + tm.assert_frame_equal(expected, res) + + def test_parse_dates_list(self): + df = DataFrame({'date': date_range('1/1/2001', periods=10)}) + expected = df.to_html() + res = read_html(expected, parse_dates=[0], index_col=0) + tm.assert_frame_equal(df, res[0]) + + def test_parse_dates_combine(self): + raw_dates = Series(date_range('1/1/2001', periods=10)) + df = DataFrame({'date': raw_dates.map(lambda x: str(x.date())), + 'time': raw_dates.map(lambda x: str(x.time()))}) + res = read_html(df.to_html(), parse_dates={'datetime': [1, 2]}, + index_col=1) + newdf = DataFrame({'datetime': raw_dates}) + tm.assert_frame_equal(newdf, res[0]) + + +class TestReadHtmlLxml(unittest.TestCase): + def setUp(self): + self.try_skip() -class TestReadHtmlLxml(TestCase): - def run_read_html(self, *args, **kwargs): + def read_html(self, *args, **kwargs): self.flavor = ['lxml'] self.try_skip() kwargs['flavor'] = kwargs.get('flavor', self.flavor) @@ -437,31 +595,28 @@ def run_read_html(self, *args, **kwargs): def try_skip(self): _skip_if_no('lxml') - def test_spam_data_fail(self): + def test_data_fail(self): from lxml.etree import XMLSyntaxError spam_data = os.path.join(DATA_PATH, 'spam.html') - self.assertRaises(XMLSyntaxError, self.run_read_html, spam_data, - flavor=['lxml']) - - def test_banklist_data_fail(self): - from lxml.etree import XMLSyntaxError banklist_data = os.path.join(DATA_PATH, 'banklist.html') - self.assertRaises(XMLSyntaxError, self.run_read_html, banklist_data, flavor=['lxml']) + + with tm.assertRaises(XMLSyntaxError): + self.read_html(spam_data, flavor=['lxml']) + + with tm.assertRaises(XMLSyntaxError): + self.read_html(banklist_data, flavor=['lxml']) def test_works_on_valid_markup(self): filename = os.path.join(DATA_PATH, 'valid_markup.html') - dfs = self.run_read_html(filename, index_col=0, flavor=['lxml']) - self.assert_(isinstance(dfs, list)) - self.assert_(isinstance(dfs[0], DataFrame)) - - def setUp(self): - self.try_skip() + dfs = self.read_html(filename, index_col=0, flavor=['lxml']) + tm.assert_isinstance(dfs, list) + tm.assert_isinstance(dfs[0], DataFrame) @slow def test_fallback_success(self): _skip_if_none_of(('bs4', 'html5lib')) banklist_data = os.path.join(DATA_PATH, 'banklist.html') - self.run_read_html(banklist_data, '.*Water.*', flavor=['lxml', + self.read_html(banklist_data, '.*Water.*', flavor=['lxml', 'html5lib']) @@ -505,3 +660,11 @@ def test_lxml_finds_tables(): def test_lxml_finds_tbody(): filepath = os.path.join(DATA_PATH, "spam.html") assert get_lxml_elements(filepath, 'tbody') + + +def test_same_ordering(): + _skip_if_none_of(['bs4', 'lxml', 'html5lib']) + filename = os.path.join(DATA_PATH, 'valid_markup.html') + dfs_lxml = read_html(filename, index_col=0, flavor=['lxml']) + dfs_bs4 = read_html(filename, index_col=0, flavor=['bs4']) + assert_framelist_equal(dfs_lxml, dfs_bs4)