from __future__ import print_function import os import re import threading from functools import partial import pytest import numpy as np from numpy.random import rand from pandas import (DataFrame, MultiIndex, read_csv, Timestamp, Index, date_range, Series) from pandas.compat import (map, zip, StringIO, BytesIO, is_platform_windows, PY3, reload) from pandas.io.common import URLError, file_path_to_url import pandas.io.html from pandas.io.html import read_html from pandas._libs.parsers import ParserError import pandas.util.testing as tm import pandas.util._test_decorators as td from pandas.util.testing import makeCustomDataframe as mkdf, network HERE = os.path.dirname(__file__) @pytest.fixture(params=[ 'chinese_utf-16.html', 'chinese_utf-32.html', 'chinese_utf-8.html', 'letz_latin1.html', ]) def html_encoding_file(request, datapath): """Parametrized fixture for HTML encoding test filenames.""" return datapath('io', 'data', 'html_encoding', request.param) def assert_framelist_equal(list1, list2, *args, **kwargs): assert len(list1) == len(list2), ('lists are not of equal size ' 'len(list1) == {0}, ' 'len(list2) == {1}'.format(len(list1), len(list2))) msg = 'not all list elements are DataFrames' both_frames = all(map(lambda x, y: isinstance(x, DataFrame) and isinstance(y, DataFrame), list1, list2)) assert both_frames, msg for frame_i, frame_j in zip(list1, list2): tm.assert_frame_equal(frame_i, frame_j, *args, **kwargs) assert not frame_i.empty, 'frames are both empty' @td.skip_if_no('bs4') def test_bs4_version_fails(monkeypatch, datapath): import bs4 monkeypatch.setattr(bs4, '__version__', '4.2') with tm.assert_raises_regex(ValueError, "minimum version"): read_html(datapath("io", "data", "spam.html"), flavor='bs4') def test_invalid_flavor(): url = 'google.com' with pytest.raises(ValueError): read_html(url, 'google', flavor='not a* valid**++ flaver') @td.skip_if_no('bs4') @td.skip_if_no('lxml') def test_same_ordering(datapath): filename = datapath('io', 'data', 'valid_markup.html') dfs_lxml = read_html(filename, index_col=0, flavor=['lxml']) dfs_bs4 = read_html(filename, index_col=0, flavor=['bs4']) assert_framelist_equal(dfs_lxml, dfs_bs4) @pytest.mark.parametrize("flavor", [ pytest.param('bs4', marks=pytest.mark.skipif( not td.safe_import('lxml'), reason='No bs4')), pytest.param('lxml', marks=pytest.mark.skipif( not td.safe_import('lxml'), reason='No lxml'))], scope="class") class TestReadHtml(object): @pytest.fixture(autouse=True) def set_files(self, datapath): self.spam_data = datapath('io', 'data', 'spam.html') self.spam_data_kwargs = {} if PY3: self.spam_data_kwargs['encoding'] = 'UTF-8' self.banklist_data = datapath("io", "data", "banklist.html") @pytest.fixture(autouse=True, scope="function") def set_defaults(self, flavor, request): self.read_html = partial(read_html, flavor=flavor) yield def test_to_html_compat(self): df = mkdf(4, 3, data_gen_f=lambda *args: rand(), c_idx_names=False, r_idx_names=False).applymap('{0:.3f}'.format).astype(float) out = df.to_html() res = self.read_html(out, attrs={'class': 'dataframe'}, index_col=0)[0] tm.assert_frame_equal(res, df) @network def test_banklist_url(self): url = 'http://www.fdic.gov/bank/individual/failed/banklist.html' df1 = self.read_html(url, 'First Federal Bank of Florida', attrs={"id": 'table'}) df2 = self.read_html(url, 'Metcalf Bank', attrs={'id': 'table'}) assert_framelist_equal(df1, df2) @network def test_spam_url(self): url = ('http://ndb.nal.usda.gov/ndb/foods/show/300772?fg=&man=&' 'lfacet=&format=&count=&max=25&offset=&sort=&qlookup=spam') df1 = self.read_html(url, '.*Water.*') df2 = self.read_html(url, 'Unit') assert_framelist_equal(df1, df2) @pytest.mark.slow def test_banklist(self): df1 = self.read_html(self.banklist_data, '.*Florida.*', attrs={'id': 'table'}) df2 = self.read_html(self.banklist_data, 'Metcalf Bank', attrs={'id': 'table'}) assert_framelist_equal(df1, df2) def test_spam_no_types(self): # infer_types removed in #10892 df1 = self.read_html(self.spam_data, '.*Water.*') df2 = self.read_html(self.spam_data, 'Unit') assert_framelist_equal(df1, df2) assert df1[0].iloc[0, 0] == 'Proximates' assert df1[0].columns[0] == 'Nutrient' def test_spam_with_types(self): df1 = self.read_html(self.spam_data, '.*Water.*') df2 = self.read_html(self.spam_data, 'Unit') assert_framelist_equal(df1, df2) assert df1[0].iloc[0, 0] == 'Proximates' assert df1[0].columns[0] == 'Nutrient' def test_spam_no_match(self): dfs = self.read_html(self.spam_data) for df in dfs: assert isinstance(df, DataFrame) def test_banklist_no_match(self): dfs = self.read_html(self.banklist_data, attrs={'id': 'table'}) for df in dfs: assert isinstance(df, DataFrame) def test_spam_header(self): df = self.read_html(self.spam_data, '.*Water.*', header=1)[0] assert df.columns[0] == 'Proximates' assert not df.empty def test_skiprows_int(self): df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=1) df2 = self.read_html(self.spam_data, 'Unit', skiprows=1) assert_framelist_equal(df1, df2) def test_skiprows_xrange(self): df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=range(2))[0] df2 = self.read_html(self.spam_data, 'Unit', skiprows=range(2))[0] tm.assert_frame_equal(df1, df2) def test_skiprows_list(self): df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=[1, 2]) df2 = self.read_html(self.spam_data, 'Unit', skiprows=[2, 1]) assert_framelist_equal(df1, df2) def test_skiprows_set(self): df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=set([1, 2])) df2 = self.read_html(self.spam_data, 'Unit', skiprows=set([2, 1])) assert_framelist_equal(df1, df2) def test_skiprows_slice(self): df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=1) df2 = self.read_html(self.spam_data, 'Unit', skiprows=1) assert_framelist_equal(df1, df2) def test_skiprows_slice_short(self): df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=slice(2)) df2 = self.read_html(self.spam_data, 'Unit', skiprows=slice(2)) assert_framelist_equal(df1, df2) def test_skiprows_slice_long(self): df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=slice(2, 5)) df2 = self.read_html(self.spam_data, 'Unit', skiprows=slice(4, 1, -1)) assert_framelist_equal(df1, df2) def test_skiprows_ndarray(self): df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=np.arange(2)) df2 = self.read_html(self.spam_data, 'Unit', skiprows=np.arange(2)) assert_framelist_equal(df1, df2) def test_skiprows_invalid(self): with tm.assert_raises_regex(TypeError, 'is not a valid type ' 'for skipping rows'): self.read_html(self.spam_data, '.*Water.*', skiprows='asdf') def test_index(self): df1 = self.read_html(self.spam_data, '.*Water.*', index_col=0) df2 = self.read_html(self.spam_data, 'Unit', index_col=0) assert_framelist_equal(df1, df2) def test_header_and_index_no_types(self): df1 = self.read_html(self.spam_data, '.*Water.*', header=1, index_col=0) df2 = self.read_html(self.spam_data, 'Unit', header=1, index_col=0) assert_framelist_equal(df1, df2) def test_header_and_index_with_types(self): df1 = self.read_html(self.spam_data, '.*Water.*', header=1, index_col=0) df2 = self.read_html(self.spam_data, 'Unit', header=1, index_col=0) assert_framelist_equal(df1, df2) def test_infer_types(self): # 10892 infer_types removed df1 = self.read_html(self.spam_data, '.*Water.*', index_col=0) df2 = self.read_html(self.spam_data, 'Unit', index_col=0) assert_framelist_equal(df1, df2) def test_string_io(self): with open(self.spam_data, **self.spam_data_kwargs) as f: data1 = StringIO(f.read()) with open(self.spam_data, **self.spam_data_kwargs) as f: data2 = StringIO(f.read()) df1 = self.read_html(data1, '.*Water.*') df2 = self.read_html(data2, 'Unit') assert_framelist_equal(df1, df2) def test_string(self): with open(self.spam_data, **self.spam_data_kwargs) as f: data = f.read() df1 = self.read_html(data, '.*Water.*') df2 = self.read_html(data, 'Unit') assert_framelist_equal(df1, df2) def test_file_like(self): with open(self.spam_data, **self.spam_data_kwargs) as f: df1 = self.read_html(f, '.*Water.*') with open(self.spam_data, **self.spam_data_kwargs) as f: df2 = self.read_html(f, 'Unit') assert_framelist_equal(df1, df2) @network def test_bad_url_protocol(self): with pytest.raises(URLError): self.read_html('git://github.com', match='.*Water.*') @network def test_invalid_url(self): try: with pytest.raises(URLError): self.read_html('http://www.a23950sdfa908sd.com', match='.*Water.*') except ValueError as e: assert str(e) == 'No tables found' @pytest.mark.slow def test_file_url(self): url = self.banklist_data dfs = self.read_html(file_path_to_url(os.path.abspath(url)), 'First', attrs={'id': 'table'}) assert isinstance(dfs, list) for df in dfs: assert isinstance(df, DataFrame) @pytest.mark.slow def test_invalid_table_attrs(self): url = self.banklist_data with tm.assert_raises_regex(ValueError, 'No tables found'): self.read_html(url, 'First Federal Bank of Florida', attrs={'id': 'tasdfable'}) def _bank_data(self, *args, **kwargs): return self.read_html(self.banklist_data, 'Metcalf', attrs={'id': 'table'}, *args, **kwargs) @pytest.mark.slow def test_multiindex_header(self): df = self._bank_data(header=[0, 1])[0] assert isinstance(df.columns, MultiIndex) @pytest.mark.slow def test_multiindex_index(self): df = self._bank_data(index_col=[0, 1])[0] assert isinstance(df.index, MultiIndex) @pytest.mark.slow def test_multiindex_header_index(self): df = self._bank_data(header=[0, 1], index_col=[0, 1])[0] assert isinstance(df.columns, MultiIndex) assert isinstance(df.index, MultiIndex) @pytest.mark.slow def test_multiindex_header_skiprows_tuples(self): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): df = self._bank_data(header=[0, 1], skiprows=1, tupleize_cols=True)[0] assert isinstance(df.columns, Index) @pytest.mark.slow def test_multiindex_header_skiprows(self): df = self._bank_data(header=[0, 1], skiprows=1)[0] assert isinstance(df.columns, MultiIndex) @pytest.mark.slow def test_multiindex_header_index_skiprows(self): df = self._bank_data(header=[0, 1], index_col=[0, 1], skiprows=1)[0] assert isinstance(df.index, MultiIndex) assert isinstance(df.columns, MultiIndex) @pytest.mark.slow def test_regex_idempotency(self): url = self.banklist_data dfs = self.read_html(file_path_to_url(os.path.abspath(url)), match=re.compile(re.compile('Florida')), attrs={'id': 'table'}) assert isinstance(dfs, list) for df in dfs: assert isinstance(df, DataFrame) def test_negative_skiprows(self): with tm.assert_raises_regex(ValueError, r'\(you passed a negative value\)'): self.read_html(self.spam_data, 'Water', skiprows=-1) @network def test_multiple_matches(self): url = 'https://docs.python.org/2/' dfs = self.read_html(url, match='Python') assert len(dfs) > 1 @network def test_python_docs_table(self): url = 'https://docs.python.org/2/' dfs = self.read_html(url, match='Python') zz = [df.iloc[0, 0][0:4] for df in dfs] assert sorted(zz) == sorted(['Repo', 'What']) @pytest.mark.slow def test_thousands_macau_stats(self, datapath): all_non_nan_table_index = -2 macau_data = datapath("io", "data", "macau.html") dfs = self.read_html(macau_data, index_col=0, attrs={'class': 'style1'}) df = dfs[all_non_nan_table_index] assert not any(s.isna().any() for _, s in df.iteritems()) @pytest.mark.slow def test_thousands_macau_index_col(self, datapath): all_non_nan_table_index = -2 macau_data = datapath('io', 'data', 'macau.html') dfs = self.read_html(macau_data, index_col=0, header=0) df = dfs[all_non_nan_table_index] assert not any(s.isna().any() for _, s in df.iteritems()) def test_empty_tables(self): """ Make sure that read_html ignores empty tables. """ data1 = '''
A B
1 2
''' data2 = data1 + '''
''' res1 = self.read_html(StringIO(data1)) res2 = self.read_html(StringIO(data2)) assert_framelist_equal(res1, res2) def test_multiple_tbody(self): # GH-20690 # Read all tbody tags within a single table. data = '''
A B
1 2
3 4
''' expected = DataFrame({'A': [1, 3], 'B': [2, 4]}) result = self.read_html(StringIO(data))[0] tm.assert_frame_equal(result, expected) def test_header_and_one_column(self): """ Don't fail with bs4 when there is a header and only one column as described in issue #9178 """ data = StringIO('''
Header
first
''') expected = DataFrame(data={'Header': 'first'}, index=[0]) result = self.read_html(data)[0] tm.assert_frame_equal(result, expected) def test_tfoot_read(self): """ Make sure that read_html reads tfoot, containing td or th. Ignores empty tfoot """ data_template = ''' {footer}
A B
bodyA bodyB
''' data1 = data_template.format(footer="") data2 = data_template.format( footer="footAfootB") d1 = {'A': ['bodyA'], 'B': ['bodyB']} d2 = {'A': ['bodyA', 'footA'], 'B': ['bodyB', 'footB']} tm.assert_frame_equal(self.read_html(data1)[0], DataFrame(d1)) tm.assert_frame_equal(self.read_html(data2)[0], DataFrame(d2)) def test_countries_municipalities(self): # GH5048 data1 = StringIO('''
Country Municipality Year
Ukraine Odessa 1944
''') data2 = StringIO('''
Country Municipality Year
Ukraine Odessa 1944
''') res1 = self.read_html(data1) res2 = self.read_html(data2, header=0) assert_framelist_equal(res1, res2) def test_nyse_wsj_commas_table(self, datapath): data = datapath('io', 'data', 'nyse_wsj.html') df = self.read_html(data, index_col=0, header=0, attrs={'class': 'mdcTable'})[0] columns = Index(['Issue(Roll over for charts and headlines)', 'Volume', 'Price', 'Chg', '% Chg']) nrows = 100 assert df.shape[0] == nrows tm.assert_index_equal(df.columns, columns) @pytest.mark.slow def test_banklist_header(self, datapath): from pandas.io.html import _remove_whitespace def try_remove_ws(x): try: return _remove_whitespace(x) except AttributeError: return x df = self.read_html(self.banklist_data, 'Metcalf', attrs={'id': 'table'})[0] ground_truth = read_csv(datapath('io', 'data', 'banklist.csv'), converters={'Updated Date': Timestamp, 'Closing Date': Timestamp}) assert df.shape == ground_truth.shape old = ['First Vietnamese American BankIn Vietnamese', 'Westernbank Puerto RicoEn Espanol', 'R-G Premier Bank of Puerto RicoEn Espanol', 'EurobankEn Espanol', 'Sanderson State BankEn Espanol', 'Washington Mutual Bank(Including its subsidiary Washington ' 'Mutual Bank FSB)', 'Silver State BankEn Espanol', 'AmTrade International BankEn Espanol', 'Hamilton Bank, NAEn Espanol', 'The Citizens Savings BankPioneer Community Bank, Inc.'] new = ['First Vietnamese American Bank', 'Westernbank Puerto Rico', 'R-G Premier Bank of Puerto Rico', 'Eurobank', 'Sanderson State Bank', 'Washington Mutual Bank', 'Silver State Bank', 'AmTrade International Bank', 'Hamilton Bank, NA', 'The Citizens Savings Bank'] dfnew = df.applymap(try_remove_ws).replace(old, new) gtnew = ground_truth.applymap(try_remove_ws) converted = dfnew._convert(datetime=True, numeric=True) date_cols = ['Closing Date', 'Updated Date'] converted[date_cols] = converted[date_cols]._convert(datetime=True, coerce=True) tm.assert_frame_equal(converted, gtnew) @pytest.mark.slow def test_gold_canyon(self): gc = 'Gold Canyon' with open(self.banklist_data, 'r') as f: raw_text = f.read() assert gc in raw_text df = self.read_html(self.banklist_data, 'Gold Canyon', attrs={'id': 'table'})[0] assert gc in df.to_string() def test_different_number_of_rows(self): expected = """
C_l0_g0 C_l0_g1 C_l0_g2 C_l0_g3 C_l0_g4
R_l0_g0 0.763 0.233 nan nan nan
R_l0_g1 0.244 0.285 0.392 0.137 0.222
""" out = """
C_l0_g0 C_l0_g1 C_l0_g2 C_l0_g3 C_l0_g4
R_l0_g0 0.763 0.233
R_l0_g1 0.244 0.285 0.392 0.137 0.222
""" expected = self.read_html(expected, index_col=0)[0] res = self.read_html(out, index_col=0)[0] tm.assert_frame_equal(expected, res) def test_parse_dates_list(self): df = DataFrame({'date': date_range('1/1/2001', periods=10)}) expected = df.to_html() res = self.read_html(expected, parse_dates=[1], index_col=0) tm.assert_frame_equal(df, res[0]) res = self.read_html(expected, parse_dates=['date'], index_col=0) tm.assert_frame_equal(df, res[0]) def test_parse_dates_combine(self): raw_dates = Series(date_range('1/1/2001', periods=10)) df = DataFrame({'date': raw_dates.map(lambda x: str(x.date())), 'time': raw_dates.map(lambda x: str(x.time()))}) res = self.read_html(df.to_html(), parse_dates={'datetime': [1, 2]}, index_col=1) newdf = DataFrame({'datetime': raw_dates}) tm.assert_frame_equal(newdf, res[0]) def test_computer_sales_page(self, datapath): data = datapath('io', 'data', 'computer_sales_page.html') with tm.assert_raises_regex(ParserError, r"Passed header=\[0,1\] are " r"too many rows for this " r"multi_index of columns"): self.read_html(data, header=[0, 1]) data = datapath('io', 'data', 'computer_sales_page.html') assert self.read_html(data, header=[1, 2]) def test_wikipedia_states_table(self, datapath): data = datapath('io', 'data', 'wikipedia_states.html') assert os.path.isfile(data), '%r is not a file' % data assert os.path.getsize(data), '%r is an empty file' % data result = self.read_html(data, 'Arizona', header=1)[0] assert result['sq mi'].dtype == np.dtype('float64') def test_decimal_rows(self): # GH 12907 data = StringIO('''
Header
1100#101
''') expected = DataFrame(data={'Header': 1100.101}, index=[0]) result = self.read_html(data, decimal='#')[0] assert result['Header'].dtype == np.dtype('float64') tm.assert_frame_equal(result, expected) def test_bool_header_arg(self): # GH 6114 for arg in [True, False]: with pytest.raises(TypeError): read_html(self.spam_data, header=arg) def test_converters(self): # GH 13461 html_data = """
a
0.763
0.244
""" expected_df = DataFrame({'a': ['0.763', '0.244']}) html_df = read_html(html_data, converters={'a': str})[0] tm.assert_frame_equal(expected_df, html_df) def test_na_values(self): # GH 13461 html_data = """
a
0.763
0.244
""" expected_df = DataFrame({'a': [0.763, np.nan]}) html_df = read_html(html_data, na_values=[0.244])[0] tm.assert_frame_equal(expected_df, html_df) def test_keep_default_na(self): html_data = """
a
N/A
NA
""" expected_df = DataFrame({'a': ['N/A', 'NA']}) html_df = read_html(html_data, keep_default_na=False)[0] tm.assert_frame_equal(expected_df, html_df) expected_df = DataFrame({'a': [np.nan, np.nan]}) html_df = read_html(html_data, keep_default_na=True)[0] tm.assert_frame_equal(expected_df, html_df) def test_multiple_header_rows(self): # Issue #13434 expected_df = DataFrame(data=[("Hillary", 68, "D"), ("Bernie", 74, "D"), ("Donald", 69, "R")]) expected_df.columns = [["Unnamed: 0_level_0", "Age", "Party"], ["Name", "Unnamed: 1_level_1", "Unnamed: 2_level_1"]] html = expected_df.to_html(index=False) html_df = read_html(html, )[0] tm.assert_frame_equal(expected_df, html_df) def test_works_on_valid_markup(self, datapath): filename = datapath('io', 'data', 'valid_markup.html') dfs = self.read_html(filename, index_col=0) assert isinstance(dfs, list) assert isinstance(dfs[0], DataFrame) @pytest.mark.slow def test_fallback_success(self, datapath): banklist_data = datapath('io', 'data', 'banklist.html') self.read_html(banklist_data, '.*Water.*', flavor=['lxml', 'html5lib']) def test_to_html_timestamp(self): rng = date_range('2000-01-01', periods=10) df = DataFrame(np.random.randn(10, 4), index=rng) result = df.to_html() assert '2000-01-01' in result @pytest.mark.parametrize("displayed_only,exp0,exp1", [ (True, DataFrame(["foo"]), None), (False, DataFrame(["foo bar baz qux"]), DataFrame(["foo"]))]) def test_displayed_only(self, displayed_only, exp0, exp1): # GH 20027 data = StringIO("""
foo bar baz qux
foo
""") dfs = self.read_html(data, displayed_only=displayed_only) tm.assert_frame_equal(dfs[0], exp0) if exp1 is not None: tm.assert_frame_equal(dfs[1], exp1) else: assert len(dfs) == 1 # Should not parse hidden table def test_encode(self, html_encoding_file): _, encoding = os.path.splitext( os.path.basename(html_encoding_file) )[0].split('_') try: with open(html_encoding_file, 'rb') as fobj: from_string = self.read_html(fobj.read(), encoding=encoding, index_col=0).pop() with open(html_encoding_file, 'rb') as fobj: from_file_like = self.read_html(BytesIO(fobj.read()), encoding=encoding, index_col=0).pop() from_filename = self.read_html(html_encoding_file, encoding=encoding, index_col=0).pop() tm.assert_frame_equal(from_string, from_file_like) tm.assert_frame_equal(from_string, from_filename) except Exception: # seems utf-16/32 fail on windows if is_platform_windows(): if '16' in encoding or '32' in encoding: pytest.skip() raise def test_parse_failure_unseekable(self): # Issue #17975 if self.read_html.keywords.get('flavor') == 'lxml': pytest.skip("Not applicable for lxml") class UnseekableStringIO(StringIO): def seekable(self): return False bad = UnseekableStringIO('''
spameggs
''') assert self.read_html(bad) with pytest.raises(ValueError, match='passed a non-rewindable file object'): self.read_html(bad) def test_parse_failure_rewinds(self): # Issue #17975 class MockFile(object): def __init__(self, data): self.data = data self.at_end = False def read(self, size=None): data = '' if self.at_end else self.data self.at_end = True return data def seek(self, offset): self.at_end = False def seekable(self): return True good = MockFile('
spam
eggs
') bad = MockFile('
spameggs
') assert self.read_html(good) assert self.read_html(bad) @pytest.mark.slow def test_importcheck_thread_safety(self, datapath): # see gh-16928 class ErrorThread(threading.Thread): def run(self): try: super(ErrorThread, self).run() except Exception as e: self.err = e else: self.err = None # force import check by reinitalising global vars in html.py reload(pandas.io.html) filename = datapath('io', 'data', 'valid_markup.html') helper_thread1 = ErrorThread(target=self.read_html, args=(filename,)) helper_thread2 = ErrorThread(target=self.read_html, args=(filename,)) helper_thread1.start() helper_thread2.start() while helper_thread1.is_alive() or helper_thread2.is_alive(): pass assert None is helper_thread1.err is helper_thread2.err