# -*- coding: utf-8 -*- # pylint: disable-msg=E1101,W0612 from datetime import datetime, timedelta import pytest import re from numpy import nan as NA import numpy as np from numpy.random import randint from pandas.compat import range, u import pandas.compat as compat from pandas import Index, Series, DataFrame, isna, MultiIndex, notna, concat from pandas.util.testing import assert_series_equal, assert_index_equal import pandas.util.testing as tm import pandas.core.strings as strings def assert_series_or_index_equal(left, right): if isinstance(left, Series): assert_series_equal(left, right) else: # Index assert_index_equal(left, right) class TestStringMethods(object): def test_api(self): # GH 6106, GH 9322 assert Series.str is strings.StringMethods assert isinstance(Series(['']).str, strings.StringMethods) # GH 9184 invalid = Series([1]) with tm.assert_raises_regex(AttributeError, "only use .str accessor"): invalid.str assert not hasattr(invalid, 'str') def test_iter(self): # GH3638 strs = 'google', 'wikimedia', 'wikipedia', 'wikitravel' ds = Series(strs) for s in ds.str: # iter must yield a Series assert isinstance(s, Series) # indices of each yielded Series should be equal to the index of # the original Series tm.assert_index_equal(s.index, ds.index) for el in s: # each element of the series is either a basestring/str or nan assert isinstance(el, compat.string_types) or isna(el) # desired behavior is to iterate until everything would be nan on the # next iter so make sure the last element of the iterator was 'l' in # this case since 'wikitravel' is the longest string assert s.dropna().values.item() == 'l' def test_iter_empty(self): ds = Series([], dtype=object) i, s = 100, 1 for i, s in enumerate(ds.str): pass # nothing to iterate over so nothing defined values should remain # unchanged assert i == 100 assert s == 1 def test_iter_single_element(self): ds = Series(['a']) for i, s in enumerate(ds.str): pass assert not i assert_series_equal(ds, s) def test_iter_object_try_string(self): ds = Series([slice(None, randint(10), randint(10, 20)) for _ in range( 4)]) i, s = 100, 'h' for i, s in enumerate(ds.str): pass assert i == 100 assert s == 'h' def test_cat(self): one = np.array(['a', 'a', 'b', 'b', 'c', NA], dtype=np.object_) two = np.array(['a', NA, 'b', 'd', 'foo', NA], dtype=np.object_) # single array result = strings.str_cat(one) exp = 'aabbc' assert result == exp result = strings.str_cat(one, na_rep='NA') exp = 'aabbcNA' assert result == exp result = strings.str_cat(one, na_rep='-') exp = 'aabbc-' assert result == exp result = strings.str_cat(one, sep='_', na_rep='NA') exp = 'a_a_b_b_c_NA' assert result == exp result = strings.str_cat(two, sep='-') exp = 'a-b-d-foo' assert result == exp # Multiple arrays result = strings.str_cat(one, [two], na_rep='NA') exp = np.array(['aa', 'aNA', 'bb', 'bd', 'cfoo', 'NANA'], dtype=np.object_) tm.assert_numpy_array_equal(result, exp) result = strings.str_cat(one, two) exp = np.array(['aa', NA, 'bb', 'bd', 'cfoo', NA], dtype=np.object_) tm.assert_almost_equal(result, exp) # error for incorrect lengths rgx = 'All arrays must be same length' three = Series(['1', '2', '3']) with tm.assert_raises_regex(ValueError, rgx): strings.str_cat(one, three) # error for incorrect type rgx = "Must pass arrays containing strings to str_cat" with tm.assert_raises_regex(ValueError, rgx): strings.str_cat(one, 'three') @pytest.mark.parametrize('container', [Series, Index]) @pytest.mark.parametrize('other', [None, Series, Index]) def test_str_cat_name(self, container, other): # https://github.com/pandas-dev/pandas/issues/21053 values = ['a', 'b'] if other: other = other(values) else: other = values result = container(values, name='name').str.cat(other, sep=',', join='left') assert result.name == 'name' @pytest.mark.parametrize('series_or_index', ['series', 'index']) def test_str_cat(self, series_or_index): # test_cat above tests "str_cat" from ndarray to ndarray; # here testing "str.cat" from Series/Index to Series/Index/ndarray/list s = Index(['a', 'a', 'b', 'b', 'c', np.nan]) if series_or_index == 'series': s = Series(s) t = Index(['a', np.nan, 'b', 'd', 'foo', np.nan]) # single array result = s.str.cat() exp = 'aabbc' assert result == exp result = s.str.cat(na_rep='-') exp = 'aabbc-' assert result == exp result = s.str.cat(sep='_', na_rep='NA') exp = 'a_a_b_b_c_NA' assert result == exp # Series/Index with Index exp = Index(['aa', 'a-', 'bb', 'bd', 'cfoo', '--']) if series_or_index == 'series': exp = Series(exp) # s.index / s is different from t (as Index) -> warning with tm.assert_produces_warning(expected_warning=FutureWarning): # FutureWarning to switch to alignment by default assert_series_or_index_equal(s.str.cat(t, na_rep='-'), exp) # Series/Index with Series t = Series(t) # s as Series has same index as t -> no warning # s as Index is different from t.index -> warning (tested below) if series_or_index == 'series': assert_series_equal(s.str.cat(t, na_rep='-'), exp) # Series/Index with Series: warning if different indexes t.index = t.index + 1 with tm.assert_produces_warning(expected_warning=FutureWarning): # FutureWarning to switch to alignment by default assert_series_or_index_equal(s.str.cat(t, na_rep='-'), exp) # Series/Index with array assert_series_or_index_equal(s.str.cat(t.values, na_rep='-'), exp) # Series/Index with list assert_series_or_index_equal(s.str.cat(list(t), na_rep='-'), exp) # errors for incorrect lengths rgx = 'All arrays must be same length, except.*' z = Series(['1', '2', '3']) with tm.assert_raises_regex(ValueError, rgx): s.str.cat(z) with tm.assert_raises_regex(ValueError, rgx): s.str.cat(z.values) with tm.assert_raises_regex(ValueError, rgx): s.str.cat(list(z)) @pytest.mark.parametrize('series_or_index', ['series', 'index']) def test_str_cat_raises_intuitive_error(self, series_or_index): # https://github.com/pandas-dev/pandas/issues/11334 s = Index(['a', 'b', 'c', 'd']) if series_or_index == 'series': s = Series(s) message = "Did you mean to supply a `sep` keyword?" with tm.assert_raises_regex(ValueError, message): s.str.cat('|') with tm.assert_raises_regex(ValueError, message): s.str.cat(' ') @pytest.mark.parametrize('series_or_index, dtype_caller, dtype_target', [ ('series', 'object', 'object'), ('series', 'object', 'category'), ('series', 'category', 'object'), ('series', 'category', 'category'), ('index', 'object', 'object'), ('index', 'object', 'category'), ('index', 'category', 'object'), ('index', 'category', 'category') ]) def test_str_cat_categorical(self, series_or_index, dtype_caller, dtype_target): s = Index(['a', 'a', 'b', 'a'], dtype=dtype_caller) if series_or_index == 'series': s = Series(s) t = Index(['b', 'a', 'b', 'c'], dtype=dtype_target) exp = Index(['ab', 'aa', 'bb', 'ac']) if series_or_index == 'series': exp = Series(exp) # Series/Index with Index # s.index / s is different from t (as Index) -> warning with tm.assert_produces_warning(expected_warning=FutureWarning): # FutureWarning to switch to alignment by default assert_series_or_index_equal(s.str.cat(t), exp) # Series/Index with Series t = Series(t) # s as Series has same index as t -> no warning # s as Index is different from t.index -> warning (tested below) if series_or_index == 'series': assert_series_equal(s.str.cat(t), exp) # Series/Index with Series: warning if different indexes t.index = t.index + 1 with tm.assert_produces_warning(expected_warning=FutureWarning): # FutureWarning to switch to alignment by default assert_series_or_index_equal(s.str.cat(t, na_rep='-'), exp) @pytest.mark.parametrize('series_or_index', ['series', 'index']) def test_str_cat_mixed_inputs(self, series_or_index): s = Index(['a', 'b', 'c', 'd']) if series_or_index == 'series': s = Series(s) t = Series(['A', 'B', 'C', 'D']) d = concat([t, Series(s)], axis=1) exp = Index(['aAa', 'bBb', 'cCc', 'dDd']) if series_or_index == 'series': exp = Series(exp) # Series/Index with DataFrame # s as Series has same index as d -> no warning # s as Index is different from d.index -> warning (tested below) if series_or_index == 'series': assert_series_equal(s.str.cat(d), exp) # Series/Index with DataFrame: warning if different indexes d.index = d.index + 1 with tm.assert_produces_warning(expected_warning=FutureWarning): # FutureWarning to switch to alignment by default assert_series_or_index_equal(s.str.cat(d), exp) # Series/Index with two-dimensional ndarray assert_series_or_index_equal(s.str.cat(d.values), exp) # Series/Index with list of Series # s as Series has same index as t, s -> no warning # s as Index is different from t.index -> warning (tested below) if series_or_index == 'series': assert_series_equal(s.str.cat([t, s]), exp) # Series/Index with list of Series: warning if different indexes tt = t.copy() tt.index = tt.index + 1 with tm.assert_produces_warning(expected_warning=FutureWarning): # FutureWarning to switch to alignment by default assert_series_or_index_equal(s.str.cat([tt, s]), exp) # Series/Index with list of list-likes assert_series_or_index_equal(s.str.cat([t.values, list(s)]), exp) # Series/Index with mixed list of Series/list-like # s as Series has same index as t -> no warning # s as Index is different from t.index -> warning (tested below) if series_or_index == 'series': assert_series_equal(s.str.cat([t, s.values]), exp) # Series/Index with mixed list: warning if different indexes with tm.assert_produces_warning(expected_warning=FutureWarning): # FutureWarning to switch to alignment by default assert_series_or_index_equal(s.str.cat([tt, s.values]), exp) # Series/Index with iterator of list-likes assert_series_or_index_equal(s.str.cat(iter([t.values, list(s)])), exp) # errors for incorrect lengths rgx = 'All arrays must be same length, except.*' z = Series(['1', '2', '3']) e = concat([z, z], axis=1) # DataFrame with tm.assert_raises_regex(ValueError, rgx): s.str.cat(e) # two-dimensional ndarray with tm.assert_raises_regex(ValueError, rgx): s.str.cat(e.values) # list of Series with tm.assert_raises_regex(ValueError, rgx): s.str.cat([z, s]) # list of list-likes with tm.assert_raises_regex(ValueError, rgx): s.str.cat([z.values, list(s)]) # mixed list of Series/list-like with tm.assert_raises_regex(ValueError, rgx): s.str.cat([z, list(s)]) # errors for incorrect arguments in list-like rgx = 'others must be Series, Index, DataFrame,.*' # make sure None/NaN do not crash checks in _get_series_list u = Series(['a', np.nan, 'c', None]) # mix of string and Series with tm.assert_raises_regex(TypeError, rgx): s.str.cat([u, 'u']) # DataFrame in list with tm.assert_raises_regex(TypeError, rgx): s.str.cat([u, d]) # 2-dim ndarray in list with tm.assert_raises_regex(TypeError, rgx): s.str.cat([u, d.values]) # nested lists with tm.assert_raises_regex(TypeError, rgx): s.str.cat([u, [u, d]]) # forbidden input type, e.g. int with tm.assert_raises_regex(TypeError, rgx): s.str.cat(1) @pytest.mark.parametrize('series_or_index, join', [ ('series', 'left'), ('series', 'outer'), ('series', 'inner'), ('series', 'right'), ('index', 'left'), ('index', 'outer'), ('index', 'inner'), ('index', 'right') ]) def test_str_cat_align_indexed(self, series_or_index, join): # https://github.com/pandas-dev/pandas/issues/18657 s = Series(['a', 'b', 'c', 'd'], index=['a', 'b', 'c', 'd']) t = Series(['D', 'A', 'E', 'B'], index=['d', 'a', 'e', 'b']) sa, ta = s.align(t, join=join) # result after manual alignment of inputs exp = sa.str.cat(ta, na_rep='-') if series_or_index == 'index': s = Index(s) sa = Index(sa) exp = Index(exp) assert_series_or_index_equal(s.str.cat(t, join=join, na_rep='-'), exp) @pytest.mark.parametrize('join', ['left', 'outer', 'inner', 'right']) def test_str_cat_align_mixed_inputs(self, join): s = Series(['a', 'b', 'c', 'd']) t = Series(['d', 'a', 'e', 'b'], index=[3, 0, 4, 1]) d = concat([t, t], axis=1) exp_outer = Series(['aaa', 'bbb', 'c--', 'ddd', '-ee']) sa, ta = s.align(t, join=join) exp = exp_outer.loc[ta.index] # list of Series tm.assert_series_equal(s.str.cat([t, t], join=join, na_rep='-'), exp) # DataFrame tm.assert_series_equal(s.str.cat(d, join=join, na_rep='-'), exp) # mixed list of indexed/unindexed u = ['A', 'B', 'C', 'D'] exp_outer = Series(['aaA', 'bbB', 'c-C', 'ddD', '-e-']) # u will be forced have index of s -> use s here as placeholder e = concat([t, s], axis=1, join=(join if join == 'inner' else 'outer')) sa, ea = s.align(e, join=join) exp = exp_outer.loc[ea.index] tm.assert_series_equal(s.str.cat([t, u], join=join, na_rep='-'), exp) # errors for incorrect lengths rgx = 'If `others` contains arrays or lists.*' z = ['1', '2', '3'] # unindexed object of wrong length with tm.assert_raises_regex(ValueError, rgx): s.str.cat(z, join=join) # unindexed object of wrong length in list with tm.assert_raises_regex(ValueError, rgx): s.str.cat([t, z], join=join) def test_str_cat_special_cases(self): s = Series(['a', 'b', 'c', 'd']) t = Series(['d', 'a', 'e', 'b'], index=[3, 0, 4, 1]) # iterator of elements with different types exp = Series(['aaA', 'bbB', 'c-C', 'ddD', '-e-']) tm.assert_series_equal(s.str.cat(iter([t, ['A', 'B', 'C', 'D']]), join='outer', na_rep='-'), exp) # right-align with different indexes in others exp = Series(['aa-', 'd-d'], index=[0, 3]) tm.assert_series_equal(s.str.cat([t.loc[[0]], t.loc[[3]]], join='right', na_rep='-'), exp) def test_cat_on_filtered_index(self): df = DataFrame(index=MultiIndex.from_product( [[2011, 2012], [1, 2, 3]], names=['year', 'month'])) df = df.reset_index() df = df[df.month > 1] str_year = df.year.astype('str') str_month = df.month.astype('str') str_both = str_year.str.cat(str_month, sep=' ', join='left') assert str_both.loc[1] == '2011 2' str_multiple = str_year.str.cat([str_month, str_month], sep=' ', join='left') assert str_multiple.loc[1] == '2011 2 2' def test_count(self): values = np.array(['foo', 'foofoo', NA, 'foooofooofommmfoo'], dtype=np.object_) result = strings.str_count(values, 'f[o]+') exp = np.array([1, 2, NA, 4]) tm.assert_numpy_array_equal(result, exp) result = Series(values).str.count('f[o]+') exp = Series([1, 2, NA, 4]) assert isinstance(result, Series) tm.assert_series_equal(result, exp) # mixed mixed = ['a', NA, 'b', True, datetime.today(), 'foo', None, 1, 2.] rs = strings.str_count(mixed, 'a') xp = np.array([1, NA, 0, NA, NA, 0, NA, NA, NA]) tm.assert_numpy_array_equal(rs, xp) rs = Series(mixed).str.count('a') xp = Series([1, NA, 0, NA, NA, 0, NA, NA, NA]) assert isinstance(rs, Series) tm.assert_series_equal(rs, xp) # unicode values = [u('foo'), u('foofoo'), NA, u('foooofooofommmfoo')] result = strings.str_count(values, 'f[o]+') exp = np.array([1, 2, NA, 4]) tm.assert_numpy_array_equal(result, exp) result = Series(values).str.count('f[o]+') exp = Series([1, 2, NA, 4]) assert isinstance(result, Series) tm.assert_series_equal(result, exp) def test_contains(self): values = np.array(['foo', NA, 'fooommm__foo', 'mmm_', 'foommm[_]+bar'], dtype=np.object_) pat = 'mmm[_]+' result = strings.str_contains(values, pat) expected = np.array([False, NA, True, True, False], dtype=np.object_) tm.assert_numpy_array_equal(result, expected) result = strings.str_contains(values, pat, regex=False) expected = np.array([False, NA, False, False, True], dtype=np.object_) tm.assert_numpy_array_equal(result, expected) values = ['foo', 'xyz', 'fooommm__foo', 'mmm_'] result = strings.str_contains(values, pat) expected = np.array([False, False, True, True]) assert result.dtype == np.bool_ tm.assert_numpy_array_equal(result, expected) # case insensitive using regex values = ['Foo', 'xYz', 'fOOomMm__fOo', 'MMM_'] result = strings.str_contains(values, 'FOO|mmm', case=False) expected = np.array([True, False, True, True]) tm.assert_numpy_array_equal(result, expected) # case insensitive without regex result = strings.str_contains(values, 'foo', regex=False, case=False) expected = np.array([True, False, True, False]) tm.assert_numpy_array_equal(result, expected) # mixed mixed = ['a', NA, 'b', True, datetime.today(), 'foo', None, 1, 2.] rs = strings.str_contains(mixed, 'o') xp = np.array([False, NA, False, NA, NA, True, NA, NA, NA], dtype=np.object_) tm.assert_numpy_array_equal(rs, xp) rs = Series(mixed).str.contains('o') xp = Series([False, NA, False, NA, NA, True, NA, NA, NA]) assert isinstance(rs, Series) tm.assert_series_equal(rs, xp) # unicode values = np.array([u'foo', NA, u'fooommm__foo', u'mmm_'], dtype=np.object_) pat = 'mmm[_]+' result = strings.str_contains(values, pat) expected = np.array([False, np.nan, True, True], dtype=np.object_) tm.assert_numpy_array_equal(result, expected) result = strings.str_contains(values, pat, na=False) expected = np.array([False, False, True, True]) tm.assert_numpy_array_equal(result, expected) values = np.array(['foo', 'xyz', 'fooommm__foo', 'mmm_'], dtype=np.object_) result = strings.str_contains(values, pat) expected = np.array([False, False, True, True]) assert result.dtype == np.bool_ tm.assert_numpy_array_equal(result, expected) # na values = Series(['om', 'foo', np.nan]) res = values.str.contains('foo', na="foo") assert res.loc[2] == "foo" def test_startswith(self): values = Series(['om', NA, 'foo_nom', 'nom', 'bar_foo', NA, 'foo']) result = values.str.startswith('foo') exp = Series([False, NA, True, False, False, NA, True]) tm.assert_series_equal(result, exp) # mixed mixed = np.array(['a', NA, 'b', True, datetime.today(), 'foo', None, 1, 2.], dtype=np.object_) rs = strings.str_startswith(mixed, 'f') xp = np.array([False, NA, False, NA, NA, True, NA, NA, NA], dtype=np.object_) tm.assert_numpy_array_equal(rs, xp) rs = Series(mixed).str.startswith('f') assert isinstance(rs, Series) xp = Series([False, NA, False, NA, NA, True, NA, NA, NA]) tm.assert_series_equal(rs, xp) # unicode values = Series([u('om'), NA, u('foo_nom'), u('nom'), u('bar_foo'), NA, u('foo')]) result = values.str.startswith('foo') exp = Series([False, NA, True, False, False, NA, True]) tm.assert_series_equal(result, exp) result = values.str.startswith('foo', na=True) tm.assert_series_equal(result, exp.fillna(True).astype(bool)) def test_endswith(self): values = Series(['om', NA, 'foo_nom', 'nom', 'bar_foo', NA, 'foo']) result = values.str.endswith('foo') exp = Series([False, NA, False, False, True, NA, True]) tm.assert_series_equal(result, exp) # mixed mixed = ['a', NA, 'b', True, datetime.today(), 'foo', None, 1, 2.] rs = strings.str_endswith(mixed, 'f') xp = np.array([False, NA, False, NA, NA, False, NA, NA, NA], dtype=np.object_) tm.assert_numpy_array_equal(rs, xp) rs = Series(mixed).str.endswith('f') xp = Series([False, NA, False, NA, NA, False, NA, NA, NA]) assert isinstance(rs, Series) tm.assert_series_equal(rs, xp) # unicode values = Series([u('om'), NA, u('foo_nom'), u('nom'), u('bar_foo'), NA, u('foo')]) result = values.str.endswith('foo') exp = Series([False, NA, False, False, True, NA, True]) tm.assert_series_equal(result, exp) result = values.str.endswith('foo', na=False) tm.assert_series_equal(result, exp.fillna(False).astype(bool)) def test_title(self): values = Series(["FOO", "BAR", NA, "Blah", "blurg"]) result = values.str.title() exp = Series(["Foo", "Bar", NA, "Blah", "Blurg"]) tm.assert_series_equal(result, exp) # mixed mixed = Series(["FOO", NA, "bar", True, datetime.today(), "blah", None, 1, 2.]) mixed = mixed.str.title() exp = Series(["Foo", NA, "Bar", NA, NA, "Blah", NA, NA, NA]) tm.assert_almost_equal(mixed, exp) # unicode values = Series([u("FOO"), NA, u("bar"), u("Blurg")]) results = values.str.title() exp = Series([u("Foo"), NA, u("Bar"), u("Blurg")]) tm.assert_series_equal(results, exp) def test_lower_upper(self): values = Series(['om', NA, 'nom', 'nom']) result = values.str.upper() exp = Series(['OM', NA, 'NOM', 'NOM']) tm.assert_series_equal(result, exp) result = result.str.lower() tm.assert_series_equal(result, values) # mixed mixed = Series(['a', NA, 'b', True, datetime.today(), 'foo', None, 1, 2.]) mixed = mixed.str.upper() rs = Series(mixed).str.lower() xp = Series(['a', NA, 'b', NA, NA, 'foo', NA, NA, NA]) assert isinstance(rs, Series) tm.assert_series_equal(rs, xp) # unicode values = Series([u('om'), NA, u('nom'), u('nom')]) result = values.str.upper() exp = Series([u('OM'), NA, u('NOM'), u('NOM')]) tm.assert_series_equal(result, exp) result = result.str.lower() tm.assert_series_equal(result, values) def test_capitalize(self): values = Series(["FOO", "BAR", NA, "Blah", "blurg"]) result = values.str.capitalize() exp = Series(["Foo", "Bar", NA, "Blah", "Blurg"]) tm.assert_series_equal(result, exp) # mixed mixed = Series(["FOO", NA, "bar", True, datetime.today(), "blah", None, 1, 2.]) mixed = mixed.str.capitalize() exp = Series(["Foo", NA, "Bar", NA, NA, "Blah", NA, NA, NA]) tm.assert_almost_equal(mixed, exp) # unicode values = Series([u("FOO"), NA, u("bar"), u("Blurg")]) results = values.str.capitalize() exp = Series([u("Foo"), NA, u("Bar"), u("Blurg")]) tm.assert_series_equal(results, exp) def test_swapcase(self): values = Series(["FOO", "BAR", NA, "Blah", "blurg"]) result = values.str.swapcase() exp = Series(["foo", "bar", NA, "bLAH", "BLURG"]) tm.assert_series_equal(result, exp) # mixed mixed = Series(["FOO", NA, "bar", True, datetime.today(), "Blah", None, 1, 2.]) mixed = mixed.str.swapcase() exp = Series(["foo", NA, "BAR", NA, NA, "bLAH", NA, NA, NA]) tm.assert_almost_equal(mixed, exp) # unicode values = Series([u("FOO"), NA, u("bar"), u("Blurg")]) results = values.str.swapcase() exp = Series([u("foo"), NA, u("BAR"), u("bLURG")]) tm.assert_series_equal(results, exp) def test_casemethods(self): values = ['aaa', 'bbb', 'CCC', 'Dddd', 'eEEE'] s = Series(values) assert s.str.lower().tolist() == [v.lower() for v in values] assert s.str.upper().tolist() == [v.upper() for v in values] assert s.str.title().tolist() == [v.title() for v in values] assert s.str.capitalize().tolist() == [v.capitalize() for v in values] assert s.str.swapcase().tolist() == [v.swapcase() for v in values] def test_replace(self): values = Series(['fooBAD__barBAD', NA]) result = values.str.replace('BAD[_]*', '') exp = Series(['foobar', NA]) tm.assert_series_equal(result, exp) result = values.str.replace('BAD[_]*', '', n=1) exp = Series(['foobarBAD', NA]) tm.assert_series_equal(result, exp) # mixed mixed = Series(['aBAD', NA, 'bBAD', True, datetime.today(), 'fooBAD', None, 1, 2.]) rs = Series(mixed).str.replace('BAD[_]*', '') xp = Series(['a', NA, 'b', NA, NA, 'foo', NA, NA, NA]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) # unicode values = Series([u('fooBAD__barBAD'), NA]) result = values.str.replace('BAD[_]*', '') exp = Series([u('foobar'), NA]) tm.assert_series_equal(result, exp) result = values.str.replace('BAD[_]*', '', n=1) exp = Series([u('foobarBAD'), NA]) tm.assert_series_equal(result, exp) # flags + unicode values = Series([b"abcd,\xc3\xa0".decode("utf-8")]) exp = Series([b"abcd, \xc3\xa0".decode("utf-8")]) result = values.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE) tm.assert_series_equal(result, exp) # GH 13438 for klass in (Series, Index): for repl in (None, 3, {'a': 'b'}): for data in (['a', 'b', None], ['a', 'b', 'c', 'ad']): values = klass(data) pytest.raises(TypeError, values.str.replace, 'a', repl) def test_replace_callable(self): # GH 15055 values = Series(['fooBAD__barBAD', NA]) # test with callable repl = lambda m: m.group(0).swapcase() result = values.str.replace('[a-z][A-Z]{2}', repl, n=2) exp = Series(['foObaD__baRbaD', NA]) tm.assert_series_equal(result, exp) # test with wrong number of arguments, raising an error if compat.PY2: p_err = r'takes (no|(exactly|at (least|most)) ?\d+) arguments?' else: p_err = (r'((takes)|(missing)) (?(2)from \d+ to )?\d+ ' r'(?(3)required )positional arguments?') repl = lambda: None with tm.assert_raises_regex(TypeError, p_err): values.str.replace('a', repl) repl = lambda m, x: None with tm.assert_raises_regex(TypeError, p_err): values.str.replace('a', repl) repl = lambda m, x, y=None: None with tm.assert_raises_regex(TypeError, p_err): values.str.replace('a', repl) # test regex named groups values = Series(['Foo Bar Baz', NA]) pat = r"(?P\w+) (?P\w+) (?P\w+)" repl = lambda m: m.group('middle').swapcase() result = values.str.replace(pat, repl) exp = Series(['bAR', NA]) tm.assert_series_equal(result, exp) def test_replace_compiled_regex(self): # GH 15446 values = Series(['fooBAD__barBAD', NA]) # test with compiled regex pat = re.compile(r'BAD[_]*') result = values.str.replace(pat, '') exp = Series(['foobar', NA]) tm.assert_series_equal(result, exp) # mixed mixed = Series(['aBAD', NA, 'bBAD', True, datetime.today(), 'fooBAD', None, 1, 2.]) rs = Series(mixed).str.replace(pat, '') xp = Series(['a', NA, 'b', NA, NA, 'foo', NA, NA, NA]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) # unicode values = Series([u('fooBAD__barBAD'), NA]) result = values.str.replace(pat, '') exp = Series([u('foobar'), NA]) tm.assert_series_equal(result, exp) result = values.str.replace(pat, '', n=1) exp = Series([u('foobarBAD'), NA]) tm.assert_series_equal(result, exp) # flags + unicode values = Series([b"abcd,\xc3\xa0".decode("utf-8")]) exp = Series([b"abcd, \xc3\xa0".decode("utf-8")]) pat = re.compile(r"(?<=\w),(?=\w)", flags=re.UNICODE) result = values.str.replace(pat, ", ") tm.assert_series_equal(result, exp) # case and flags provided to str.replace will have no effect # and will produce warnings values = Series(['fooBAD__barBAD__bad', NA]) pat = re.compile(r'BAD[_]*') with tm.assert_raises_regex(ValueError, "case and flags cannot be"): result = values.str.replace(pat, '', flags=re.IGNORECASE) with tm.assert_raises_regex(ValueError, "case and flags cannot be"): result = values.str.replace(pat, '', case=False) with tm.assert_raises_regex(ValueError, "case and flags cannot be"): result = values.str.replace(pat, '', case=True) # test with callable values = Series(['fooBAD__barBAD', NA]) repl = lambda m: m.group(0).swapcase() pat = re.compile('[a-z][A-Z]{2}') result = values.str.replace(pat, repl, n=2) exp = Series(['foObaD__baRbaD', NA]) tm.assert_series_equal(result, exp) def test_replace_literal(self): # GH16808 literal replace (regex=False vs regex=True) values = Series(['f.o', 'foo', NA]) exp = Series(['bao', 'bao', NA]) result = values.str.replace('f.', 'ba') tm.assert_series_equal(result, exp) exp = Series(['bao', 'foo', NA]) result = values.str.replace('f.', 'ba', regex=False) tm.assert_series_equal(result, exp) # Cannot do a literal replace if given a callable repl or compiled # pattern callable_repl = lambda m: m.group(0).swapcase() compiled_pat = re.compile('[a-z][A-Z]{2}') pytest.raises(ValueError, values.str.replace, 'abc', callable_repl, regex=False) pytest.raises(ValueError, values.str.replace, compiled_pat, '', regex=False) def test_repeat(self): values = Series(['a', 'b', NA, 'c', NA, 'd']) result = values.str.repeat(3) exp = Series(['aaa', 'bbb', NA, 'ccc', NA, 'ddd']) tm.assert_series_equal(result, exp) result = values.str.repeat([1, 2, 3, 4, 5, 6]) exp = Series(['a', 'bb', NA, 'cccc', NA, 'dddddd']) tm.assert_series_equal(result, exp) # mixed mixed = Series(['a', NA, 'b', True, datetime.today(), 'foo', None, 1, 2.]) rs = Series(mixed).str.repeat(3) xp = Series(['aaa', NA, 'bbb', NA, NA, 'foofoofoo', NA, NA, NA]) assert isinstance(rs, Series) tm.assert_series_equal(rs, xp) # unicode values = Series([u('a'), u('b'), NA, u('c'), NA, u('d')]) result = values.str.repeat(3) exp = Series([u('aaa'), u('bbb'), NA, u('ccc'), NA, u('ddd')]) tm.assert_series_equal(result, exp) result = values.str.repeat([1, 2, 3, 4, 5, 6]) exp = Series([u('a'), u('bb'), NA, u('cccc'), NA, u('dddddd')]) tm.assert_series_equal(result, exp) def test_match(self): # New match behavior introduced in 0.13 values = Series(['fooBAD__barBAD', NA, 'foo']) result = values.str.match('.*(BAD[_]+).*(BAD)') exp = Series([True, NA, False]) tm.assert_series_equal(result, exp) values = Series(['fooBAD__barBAD', NA, 'foo']) result = values.str.match('.*BAD[_]+.*BAD') exp = Series([True, NA, False]) tm.assert_series_equal(result, exp) # test passing as_indexer still works but is ignored values = Series(['fooBAD__barBAD', NA, 'foo']) exp = Series([True, NA, False]) with tm.assert_produces_warning(FutureWarning): result = values.str.match('.*BAD[_]+.*BAD', as_indexer=True) tm.assert_series_equal(result, exp) with tm.assert_produces_warning(FutureWarning): result = values.str.match('.*BAD[_]+.*BAD', as_indexer=False) tm.assert_series_equal(result, exp) with tm.assert_produces_warning(FutureWarning): result = values.str.match('.*(BAD[_]+).*(BAD)', as_indexer=True) tm.assert_series_equal(result, exp) pytest.raises(ValueError, values.str.match, '.*(BAD[_]+).*(BAD)', as_indexer=False) # mixed mixed = Series(['aBAD_BAD', NA, 'BAD_b_BAD', True, datetime.today(), 'foo', None, 1, 2.]) rs = Series(mixed).str.match('.*(BAD[_]+).*(BAD)') xp = Series([True, NA, True, NA, NA, False, NA, NA, NA]) assert isinstance(rs, Series) tm.assert_series_equal(rs, xp) # unicode values = Series([u('fooBAD__barBAD'), NA, u('foo')]) result = values.str.match('.*(BAD[_]+).*(BAD)') exp = Series([True, NA, False]) tm.assert_series_equal(result, exp) # na GH #6609 res = Series(['a', 0, np.nan]).str.match('a', na=False) exp = Series([True, False, False]) assert_series_equal(exp, res) res = Series(['a', 0, np.nan]).str.match('a') exp = Series([True, np.nan, np.nan]) assert_series_equal(exp, res) def test_extract_expand_None(self): values = Series(['fooBAD__barBAD', NA, 'foo']) with tm.assert_raises_regex(ValueError, 'expand must be True or False'): values.str.extract('.*(BAD[_]+).*(BAD)', expand=None) def test_extract_expand_unspecified(self): values = Series(['fooBAD__barBAD', NA, 'foo']) result_unspecified = values.str.extract('.*(BAD[_]+).*') assert isinstance(result_unspecified, DataFrame) result_true = values.str.extract('.*(BAD[_]+).*', expand=True) tm.assert_frame_equal(result_unspecified, result_true) def test_extract_expand_False(self): # Contains tests like those in test_match and some others. values = Series(['fooBAD__barBAD', NA, 'foo']) er = [NA, NA] # empty row result = values.str.extract('.*(BAD[_]+).*(BAD)', expand=False) exp = DataFrame([['BAD__', 'BAD'], er, er]) tm.assert_frame_equal(result, exp) # mixed mixed = Series(['aBAD_BAD', NA, 'BAD_b_BAD', True, datetime.today(), 'foo', None, 1, 2.]) rs = Series(mixed).str.extract('.*(BAD[_]+).*(BAD)', expand=False) exp = DataFrame([['BAD_', 'BAD'], er, ['BAD_', 'BAD'], er, er, er, er, er, er]) tm.assert_frame_equal(rs, exp) # unicode values = Series([u('fooBAD__barBAD'), NA, u('foo')]) result = values.str.extract('.*(BAD[_]+).*(BAD)', expand=False) exp = DataFrame([[u('BAD__'), u('BAD')], er, er]) tm.assert_frame_equal(result, exp) # GH9980 # Index only works with one regex group since # multi-group would expand to a frame idx = Index(['A1', 'A2', 'A3', 'A4', 'B5']) with tm.assert_raises_regex(ValueError, "supported"): idx.str.extract('([AB])([123])', expand=False) # these should work for both Series and Index for klass in [Series, Index]: # no groups s_or_idx = klass(['A1', 'B2', 'C3']) f = lambda: s_or_idx.str.extract('[ABC][123]', expand=False) pytest.raises(ValueError, f) # only non-capturing groups f = lambda: s_or_idx.str.extract('(?:[AB]).*', expand=False) pytest.raises(ValueError, f) # single group renames series/index properly s_or_idx = klass(['A1', 'A2']) result = s_or_idx.str.extract(r'(?PA)\d', expand=False) assert result.name == 'uno' exp = klass(['A', 'A'], name='uno') if klass == Series: tm.assert_series_equal(result, exp) else: tm.assert_index_equal(result, exp) s = Series(['A1', 'B2', 'C3']) # one group, no matches result = s.str.extract('(_)', expand=False) exp = Series([NA, NA, NA], dtype=object) tm.assert_series_equal(result, exp) # two groups, no matches result = s.str.extract('(_)(_)', expand=False) exp = DataFrame([[NA, NA], [NA, NA], [NA, NA]], dtype=object) tm.assert_frame_equal(result, exp) # one group, some matches result = s.str.extract('([AB])[123]', expand=False) exp = Series(['A', 'B', NA]) tm.assert_series_equal(result, exp) # two groups, some matches result = s.str.extract('([AB])([123])', expand=False) exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]]) tm.assert_frame_equal(result, exp) # one named group result = s.str.extract('(?P[AB])', expand=False) exp = Series(['A', 'B', NA], name='letter') tm.assert_series_equal(result, exp) # two named groups result = s.str.extract('(?P[AB])(?P[123])', expand=False) exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]], columns=['letter', 'number']) tm.assert_frame_equal(result, exp) # mix named and unnamed groups result = s.str.extract('([AB])(?P[123])', expand=False) exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]], columns=[0, 'number']) tm.assert_frame_equal(result, exp) # one normal group, one non-capturing group result = s.str.extract('([AB])(?:[123])', expand=False) exp = Series(['A', 'B', NA]) tm.assert_series_equal(result, exp) # two normal groups, one non-capturing group result = Series(['A11', 'B22', 'C33']).str.extract( '([AB])([123])(?:[123])', expand=False) exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]]) tm.assert_frame_equal(result, exp) # one optional group followed by one normal group result = Series(['A1', 'B2', '3']).str.extract( '(?P[AB])?(?P[123])', expand=False) exp = DataFrame([['A', '1'], ['B', '2'], [NA, '3']], columns=['letter', 'number']) tm.assert_frame_equal(result, exp) # one normal group followed by one optional group result = Series(['A1', 'B2', 'C']).str.extract( '(?P[ABC])(?P[123])?', expand=False) exp = DataFrame([['A', '1'], ['B', '2'], ['C', NA]], columns=['letter', 'number']) tm.assert_frame_equal(result, exp) # GH6348 # not passing index to the extractor def check_index(index): data = ['A1', 'B2', 'C'] index = index[:len(data)] s = Series(data, index=index) result = s.str.extract(r'(\d)', expand=False) exp = Series(['1', '2', NA], index=index) tm.assert_series_equal(result, exp) result = Series(data, index=index).str.extract( r'(?P\D)(?P\d)?', expand=False) e_list = [ ['A', '1'], ['B', '2'], ['C', NA] ] exp = DataFrame(e_list, columns=['letter', 'number'], index=index) tm.assert_frame_equal(result, exp) i_funs = [ tm.makeStringIndex, tm.makeUnicodeIndex, tm.makeIntIndex, tm.makeDateIndex, tm.makePeriodIndex, tm.makeRangeIndex ] for index in i_funs: check_index(index()) # single_series_name_is_preserved. s = Series(['a3', 'b3', 'c2'], name='bob') r = s.str.extract(r'(?P[a-z])', expand=False) e = Series(['a', 'b', 'c'], name='sue') tm.assert_series_equal(r, e) assert r.name == e.name def test_extract_expand_True(self): # Contains tests like those in test_match and some others. values = Series(['fooBAD__barBAD', NA, 'foo']) er = [NA, NA] # empty row result = values.str.extract('.*(BAD[_]+).*(BAD)', expand=True) exp = DataFrame([['BAD__', 'BAD'], er, er]) tm.assert_frame_equal(result, exp) # mixed mixed = Series(['aBAD_BAD', NA, 'BAD_b_BAD', True, datetime.today(), 'foo', None, 1, 2.]) rs = Series(mixed).str.extract('.*(BAD[_]+).*(BAD)', expand=True) exp = DataFrame([['BAD_', 'BAD'], er, ['BAD_', 'BAD'], er, er, er, er, er, er]) tm.assert_frame_equal(rs, exp) # unicode values = Series([u('fooBAD__barBAD'), NA, u('foo')]) result = values.str.extract('.*(BAD[_]+).*(BAD)', expand=True) exp = DataFrame([[u('BAD__'), u('BAD')], er, er]) tm.assert_frame_equal(result, exp) # these should work for both Series and Index for klass in [Series, Index]: # no groups s_or_idx = klass(['A1', 'B2', 'C3']) f = lambda: s_or_idx.str.extract('[ABC][123]', expand=True) pytest.raises(ValueError, f) # only non-capturing groups f = lambda: s_or_idx.str.extract('(?:[AB]).*', expand=True) pytest.raises(ValueError, f) # single group renames series/index properly s_or_idx = klass(['A1', 'A2']) result_df = s_or_idx.str.extract(r'(?PA)\d', expand=True) assert isinstance(result_df, DataFrame) result_series = result_df['uno'] assert_series_equal(result_series, Series(['A', 'A'], name='uno')) def test_extract_series(self): # extract should give the same result whether or not the # series has a name. for series_name in None, "series_name": s = Series(['A1', 'B2', 'C3'], name=series_name) # one group, no matches result = s.str.extract('(_)', expand=True) exp = DataFrame([NA, NA, NA], dtype=object) tm.assert_frame_equal(result, exp) # two groups, no matches result = s.str.extract('(_)(_)', expand=True) exp = DataFrame([[NA, NA], [NA, NA], [NA, NA]], dtype=object) tm.assert_frame_equal(result, exp) # one group, some matches result = s.str.extract('([AB])[123]', expand=True) exp = DataFrame(['A', 'B', NA]) tm.assert_frame_equal(result, exp) # two groups, some matches result = s.str.extract('([AB])([123])', expand=True) exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]]) tm.assert_frame_equal(result, exp) # one named group result = s.str.extract('(?P[AB])', expand=True) exp = DataFrame({"letter": ['A', 'B', NA]}) tm.assert_frame_equal(result, exp) # two named groups result = s.str.extract( '(?P[AB])(?P[123])', expand=True) e_list = [ ['A', '1'], ['B', '2'], [NA, NA] ] exp = DataFrame(e_list, columns=['letter', 'number']) tm.assert_frame_equal(result, exp) # mix named and unnamed groups result = s.str.extract('([AB])(?P[123])', expand=True) exp = DataFrame(e_list, columns=[0, 'number']) tm.assert_frame_equal(result, exp) # one normal group, one non-capturing group result = s.str.extract('([AB])(?:[123])', expand=True) exp = DataFrame(['A', 'B', NA]) tm.assert_frame_equal(result, exp) def test_extract_optional_groups(self): # two normal groups, one non-capturing group result = Series(['A11', 'B22', 'C33']).str.extract( '([AB])([123])(?:[123])', expand=True) exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]]) tm.assert_frame_equal(result, exp) # one optional group followed by one normal group result = Series(['A1', 'B2', '3']).str.extract( '(?P[AB])?(?P[123])', expand=True) e_list = [ ['A', '1'], ['B', '2'], [NA, '3'] ] exp = DataFrame(e_list, columns=['letter', 'number']) tm.assert_frame_equal(result, exp) # one normal group followed by one optional group result = Series(['A1', 'B2', 'C']).str.extract( '(?P[ABC])(?P[123])?', expand=True) e_list = [ ['A', '1'], ['B', '2'], ['C', NA] ] exp = DataFrame(e_list, columns=['letter', 'number']) tm.assert_frame_equal(result, exp) # GH6348 # not passing index to the extractor def check_index(index): data = ['A1', 'B2', 'C'] index = index[:len(data)] result = Series(data, index=index).str.extract( r'(\d)', expand=True) exp = DataFrame(['1', '2', NA], index=index) tm.assert_frame_equal(result, exp) result = Series(data, index=index).str.extract( r'(?P\D)(?P\d)?', expand=True) e_list = [ ['A', '1'], ['B', '2'], ['C', NA] ] exp = DataFrame(e_list, columns=['letter', 'number'], index=index) tm.assert_frame_equal(result, exp) i_funs = [ tm.makeStringIndex, tm.makeUnicodeIndex, tm.makeIntIndex, tm.makeDateIndex, tm.makePeriodIndex, tm.makeRangeIndex ] for index in i_funs: check_index(index()) def test_extract_single_group_returns_frame(self): # GH11386 extract should always return DataFrame, even when # there is only one group. Prior to v0.18.0, extract returned # Series when there was only one group in the regex. s = Series(['a3', 'b3', 'c2'], name='series_name') r = s.str.extract(r'(?P[a-z])', expand=True) e = DataFrame({"letter": ['a', 'b', 'c']}) tm.assert_frame_equal(r, e) def test_extractall(self): subject_list = [ 'dave@google.com', 'tdhock5@gmail.com', 'maudelaperriere@gmail.com', 'rob@gmail.com some text steve@gmail.com', 'a@b.com some text c@d.com and e@f.com', np.nan, "", ] expected_tuples = [ ("dave", "google", "com"), ("tdhock5", "gmail", "com"), ("maudelaperriere", "gmail", "com"), ("rob", "gmail", "com"), ("steve", "gmail", "com"), ("a", "b", "com"), ("c", "d", "com"), ("e", "f", "com"), ] named_pattern = r""" (?P[a-z0-9]+) @ (?P[a-z]+) \. (?P[a-z]{2,4}) """ expected_columns = ["user", "domain", "tld"] S = Series(subject_list) # extractall should return a DataFrame with one row for each # match, indexed by the subject from which the match came. expected_index = MultiIndex.from_tuples([ (0, 0), (1, 0), (2, 0), (3, 0), (3, 1), (4, 0), (4, 1), (4, 2), ], names=(None, "match")) expected_df = DataFrame( expected_tuples, expected_index, expected_columns) computed_df = S.str.extractall(named_pattern, re.VERBOSE) tm.assert_frame_equal(computed_df, expected_df) # The index of the input Series should be used to construct # the index of the output DataFrame: series_index = MultiIndex.from_tuples([ ("single", "Dave"), ("single", "Toby"), ("single", "Maude"), ("multiple", "robAndSteve"), ("multiple", "abcdef"), ("none", "missing"), ("none", "empty"), ]) Si = Series(subject_list, series_index) expected_index = MultiIndex.from_tuples([ ("single", "Dave", 0), ("single", "Toby", 0), ("single", "Maude", 0), ("multiple", "robAndSteve", 0), ("multiple", "robAndSteve", 1), ("multiple", "abcdef", 0), ("multiple", "abcdef", 1), ("multiple", "abcdef", 2), ], names=(None, None, "match")) expected_df = DataFrame( expected_tuples, expected_index, expected_columns) computed_df = Si.str.extractall(named_pattern, re.VERBOSE) tm.assert_frame_equal(computed_df, expected_df) # MultiIndexed subject with names. Sn = Series(subject_list, series_index) Sn.index.names = ("matches", "description") expected_index.names = ("matches", "description", "match") expected_df = DataFrame( expected_tuples, expected_index, expected_columns) computed_df = Sn.str.extractall(named_pattern, re.VERBOSE) tm.assert_frame_equal(computed_df, expected_df) # optional groups. subject_list = ['', 'A1', '32'] named_pattern = '(?P[AB])?(?P[123])' computed_df = Series(subject_list).str.extractall(named_pattern) expected_index = MultiIndex.from_tuples([ (1, 0), (2, 0), (2, 1), ], names=(None, "match")) expected_df = DataFrame([ ('A', '1'), (NA, '3'), (NA, '2'), ], expected_index, columns=['letter', 'number']) tm.assert_frame_equal(computed_df, expected_df) # only one of two groups has a name. pattern = '([AB])?(?P[123])' computed_df = Series(subject_list).str.extractall(pattern) expected_df = DataFrame([ ('A', '1'), (NA, '3'), (NA, '2'), ], expected_index, columns=[0, 'number']) tm.assert_frame_equal(computed_df, expected_df) def test_extractall_single_group(self): # extractall(one named group) returns DataFrame with one named # column. s = Series(['a3', 'b3', 'd4c2'], name='series_name') r = s.str.extractall(r'(?P[a-z])') i = MultiIndex.from_tuples([ (0, 0), (1, 0), (2, 0), (2, 1), ], names=(None, "match")) e = DataFrame({"letter": ['a', 'b', 'd', 'c']}, i) tm.assert_frame_equal(r, e) # extractall(one un-named group) returns DataFrame with one # un-named column. r = s.str.extractall(r'([a-z])') e = DataFrame(['a', 'b', 'd', 'c'], i) tm.assert_frame_equal(r, e) def test_extractall_single_group_with_quantifier(self): # extractall(one un-named group with quantifier) returns # DataFrame with one un-named column (GH13382). s = Series(['ab3', 'abc3', 'd4cd2'], name='series_name') r = s.str.extractall(r'([a-z]+)') i = MultiIndex.from_tuples([ (0, 0), (1, 0), (2, 0), (2, 1), ], names=(None, "match")) e = DataFrame(['ab', 'abc', 'd', 'cd'], i) tm.assert_frame_equal(r, e) @pytest.mark.parametrize('data, names', [ ([], (None, )), ([], ('i1', )), ([], (None, 'i2')), ([], ('i1', 'i2')), (['a3', 'b3', 'd4c2'], (None, )), (['a3', 'b3', 'd4c2'], ('i1', 'i2')), (['a3', 'b3', 'd4c2'], (None, 'i2')), (['a3', 'b3', 'd4c2'], ('i1', 'i2')), ]) def test_extractall_no_matches(self, data, names): # GH19075 extractall with no matches should return a valid MultiIndex n = len(data) if len(names) == 1: i = Index(range(n), name=names[0]) else: a = (tuple([i] * (n - 1)) for i in range(n)) i = MultiIndex.from_tuples(a, names=names) s = Series(data, name='series_name', index=i, dtype='object') ei = MultiIndex.from_tuples([], names=(names + ('match',))) # one un-named group. r = s.str.extractall('(z)') e = DataFrame(columns=[0], index=ei) tm.assert_frame_equal(r, e) # two un-named groups. r = s.str.extractall('(z)(z)') e = DataFrame(columns=[0, 1], index=ei) tm.assert_frame_equal(r, e) # one named group. r = s.str.extractall('(?Pz)') e = DataFrame(columns=["first"], index=ei) tm.assert_frame_equal(r, e) # two named groups. r = s.str.extractall('(?Pz)(?Pz)') e = DataFrame(columns=["first", "second"], index=ei) tm.assert_frame_equal(r, e) # one named, one un-named. r = s.str.extractall('(z)(?Pz)') e = DataFrame(columns=[0, "second"], index=ei) tm.assert_frame_equal(r, e) def test_extractall_stringindex(self): s = Series(["a1a2", "b1", "c1"], name='xxx') res = s.str.extractall(r"[ab](?P\d)") exp_idx = MultiIndex.from_tuples([(0, 0), (0, 1), (1, 0)], names=[None, 'match']) exp = DataFrame({'digit': ["1", "2", "1"]}, index=exp_idx) tm.assert_frame_equal(res, exp) # index should return the same result as the default index without name # thus index.name doesn't affect to the result for idx in [Index(["a1a2", "b1", "c1"]), Index(["a1a2", "b1", "c1"], name='xxx')]: res = idx.str.extractall(r"[ab](?P\d)") tm.assert_frame_equal(res, exp) s = Series(["a1a2", "b1", "c1"], name='s_name', index=Index(["XX", "yy", "zz"], name='idx_name')) res = s.str.extractall(r"[ab](?P\d)") exp_idx = MultiIndex.from_tuples([("XX", 0), ("XX", 1), ("yy", 0)], names=["idx_name", 'match']) exp = DataFrame({'digit': ["1", "2", "1"]}, index=exp_idx) tm.assert_frame_equal(res, exp) def test_extractall_errors(self): # Does not make sense to use extractall with a regex that has # no capture groups. (it returns DataFrame with one column for # each capture group) s = Series(['a3', 'b3', 'd4c2'], name='series_name') with tm.assert_raises_regex(ValueError, "no capture groups"): s.str.extractall(r'[a-z]') def test_extract_index_one_two_groups(self): s = Series(['a3', 'b3', 'd4c2'], index=["A3", "B3", "D4"], name='series_name') r = s.index.str.extract(r'([A-Z])', expand=True) e = DataFrame(['A', "B", "D"]) tm.assert_frame_equal(r, e) # Prior to v0.18.0, index.str.extract(regex with one group) # returned Index. With more than one group, extract raised an # error (GH9980). Now extract always returns DataFrame. r = s.index.str.extract( r'(?P[A-Z])(?P[0-9])', expand=True) e_list = [ ("A", "3"), ("B", "3"), ("D", "4"), ] e = DataFrame(e_list, columns=["letter", "digit"]) tm.assert_frame_equal(r, e) def test_extractall_same_as_extract(self): s = Series(['a3', 'b3', 'c2'], name='series_name') pattern_two_noname = r'([a-z])([0-9])' extract_two_noname = s.str.extract(pattern_two_noname, expand=True) has_multi_index = s.str.extractall(pattern_two_noname) no_multi_index = has_multi_index.xs(0, level="match") tm.assert_frame_equal(extract_two_noname, no_multi_index) pattern_two_named = r'(?P[a-z])(?P[0-9])' extract_two_named = s.str.extract(pattern_two_named, expand=True) has_multi_index = s.str.extractall(pattern_two_named) no_multi_index = has_multi_index.xs(0, level="match") tm.assert_frame_equal(extract_two_named, no_multi_index) pattern_one_named = r'(?P[a-z])' extract_one_named = s.str.extract(pattern_one_named, expand=True) has_multi_index = s.str.extractall(pattern_one_named) no_multi_index = has_multi_index.xs(0, level="match") tm.assert_frame_equal(extract_one_named, no_multi_index) pattern_one_noname = r'([a-z])' extract_one_noname = s.str.extract(pattern_one_noname, expand=True) has_multi_index = s.str.extractall(pattern_one_noname) no_multi_index = has_multi_index.xs(0, level="match") tm.assert_frame_equal(extract_one_noname, no_multi_index) def test_extractall_same_as_extract_subject_index(self): # same as above tests, but s has an MultiIndex. i = MultiIndex.from_tuples([ ("A", "first"), ("B", "second"), ("C", "third"), ], names=("capital", "ordinal")) s = Series(['a3', 'b3', 'c2'], i, name='series_name') pattern_two_noname = r'([a-z])([0-9])' extract_two_noname = s.str.extract(pattern_two_noname, expand=True) has_match_index = s.str.extractall(pattern_two_noname) no_match_index = has_match_index.xs(0, level="match") tm.assert_frame_equal(extract_two_noname, no_match_index) pattern_two_named = r'(?P[a-z])(?P[0-9])' extract_two_named = s.str.extract(pattern_two_named, expand=True) has_match_index = s.str.extractall(pattern_two_named) no_match_index = has_match_index.xs(0, level="match") tm.assert_frame_equal(extract_two_named, no_match_index) pattern_one_named = r'(?P[a-z])' extract_one_named = s.str.extract(pattern_one_named, expand=True) has_match_index = s.str.extractall(pattern_one_named) no_match_index = has_match_index.xs(0, level="match") tm.assert_frame_equal(extract_one_named, no_match_index) pattern_one_noname = r'([a-z])' extract_one_noname = s.str.extract(pattern_one_noname, expand=True) has_match_index = s.str.extractall(pattern_one_noname) no_match_index = has_match_index.xs(0, level="match") tm.assert_frame_equal(extract_one_noname, no_match_index) def test_empty_str_methods(self): empty_str = empty = Series(dtype=object) empty_int = Series(dtype=int) empty_bool = Series(dtype=bool) empty_bytes = Series(dtype=object) # GH7241 # (extract) on empty series tm.assert_series_equal(empty_str, empty.str.cat(empty, join='left')) assert '' == empty.str.cat() tm.assert_series_equal(empty_str, empty.str.title()) tm.assert_series_equal(empty_int, empty.str.count('a')) tm.assert_series_equal(empty_bool, empty.str.contains('a')) tm.assert_series_equal(empty_bool, empty.str.startswith('a')) tm.assert_series_equal(empty_bool, empty.str.endswith('a')) tm.assert_series_equal(empty_str, empty.str.lower()) tm.assert_series_equal(empty_str, empty.str.upper()) tm.assert_series_equal(empty_str, empty.str.replace('a', 'b')) tm.assert_series_equal(empty_str, empty.str.repeat(3)) tm.assert_series_equal(empty_bool, empty.str.match('^a')) tm.assert_frame_equal( DataFrame(columns=[0], dtype=str), empty.str.extract('()', expand=True)) tm.assert_frame_equal( DataFrame(columns=[0, 1], dtype=str), empty.str.extract('()()', expand=True)) tm.assert_series_equal( empty_str, empty.str.extract('()', expand=False)) tm.assert_frame_equal( DataFrame(columns=[0, 1], dtype=str), empty.str.extract('()()', expand=False)) tm.assert_frame_equal(DataFrame(dtype=str), empty.str.get_dummies()) tm.assert_series_equal(empty_str, empty_str.str.join('')) tm.assert_series_equal(empty_int, empty.str.len()) tm.assert_series_equal(empty_str, empty_str.str.findall('a')) tm.assert_series_equal(empty_int, empty.str.find('a')) tm.assert_series_equal(empty_int, empty.str.rfind('a')) tm.assert_series_equal(empty_str, empty.str.pad(42)) tm.assert_series_equal(empty_str, empty.str.center(42)) tm.assert_series_equal(empty_str, empty.str.split('a')) tm.assert_series_equal(empty_str, empty.str.rsplit('a')) tm.assert_series_equal(empty_str, empty.str.partition('a', expand=False)) tm.assert_series_equal(empty_str, empty.str.rpartition('a', expand=False)) tm.assert_series_equal(empty_str, empty.str.slice(stop=1)) tm.assert_series_equal(empty_str, empty.str.slice(step=1)) tm.assert_series_equal(empty_str, empty.str.strip()) tm.assert_series_equal(empty_str, empty.str.lstrip()) tm.assert_series_equal(empty_str, empty.str.rstrip()) tm.assert_series_equal(empty_str, empty.str.wrap(42)) tm.assert_series_equal(empty_str, empty.str.get(0)) tm.assert_series_equal(empty_str, empty_bytes.str.decode('ascii')) tm.assert_series_equal(empty_bytes, empty.str.encode('ascii')) tm.assert_series_equal(empty_str, empty.str.isalnum()) tm.assert_series_equal(empty_str, empty.str.isalpha()) tm.assert_series_equal(empty_str, empty.str.isdigit()) tm.assert_series_equal(empty_str, empty.str.isspace()) tm.assert_series_equal(empty_str, empty.str.islower()) tm.assert_series_equal(empty_str, empty.str.isupper()) tm.assert_series_equal(empty_str, empty.str.istitle()) tm.assert_series_equal(empty_str, empty.str.isnumeric()) tm.assert_series_equal(empty_str, empty.str.isdecimal()) tm.assert_series_equal(empty_str, empty.str.capitalize()) tm.assert_series_equal(empty_str, empty.str.swapcase()) tm.assert_series_equal(empty_str, empty.str.normalize('NFC')) if compat.PY3: table = str.maketrans('a', 'b') else: import string table = string.maketrans('a', 'b') tm.assert_series_equal(empty_str, empty.str.translate(table)) def test_empty_str_methods_to_frame(self): empty = Series(dtype=str) empty_df = DataFrame([]) tm.assert_frame_equal(empty_df, empty.str.partition('a')) tm.assert_frame_equal(empty_df, empty.str.rpartition('a')) def test_ismethods(self): values = ['A', 'b', 'Xy', '4', '3A', '', 'TT', '55', '-', ' '] str_s = Series(values) alnum_e = [True, True, True, True, True, False, True, True, False, False] alpha_e = [True, True, True, False, False, False, True, False, False, False] digit_e = [False, False, False, True, False, False, False, True, False, False] # TODO: unused num_e = [False, False, False, True, False, False, # noqa False, True, False, False] space_e = [False, False, False, False, False, False, False, False, False, True] lower_e = [False, True, False, False, False, False, False, False, False, False] upper_e = [True, False, False, False, True, False, True, False, False, False] title_e = [True, False, True, False, True, False, False, False, False, False] tm.assert_series_equal(str_s.str.isalnum(), Series(alnum_e)) tm.assert_series_equal(str_s.str.isalpha(), Series(alpha_e)) tm.assert_series_equal(str_s.str.isdigit(), Series(digit_e)) tm.assert_series_equal(str_s.str.isspace(), Series(space_e)) tm.assert_series_equal(str_s.str.islower(), Series(lower_e)) tm.assert_series_equal(str_s.str.isupper(), Series(upper_e)) tm.assert_series_equal(str_s.str.istitle(), Series(title_e)) assert str_s.str.isalnum().tolist() == [v.isalnum() for v in values] assert str_s.str.isalpha().tolist() == [v.isalpha() for v in values] assert str_s.str.isdigit().tolist() == [v.isdigit() for v in values] assert str_s.str.isspace().tolist() == [v.isspace() for v in values] assert str_s.str.islower().tolist() == [v.islower() for v in values] assert str_s.str.isupper().tolist() == [v.isupper() for v in values] assert str_s.str.istitle().tolist() == [v.istitle() for v in values] def test_isnumeric(self): # 0x00bc: ¼ VULGAR FRACTION ONE QUARTER # 0x2605: ★ not number # 0x1378: ፸ ETHIOPIC NUMBER SEVENTY # 0xFF13: 3 Em 3 values = ['A', '3', u'¼', u'★', u'፸', u'3', 'four'] s = Series(values) numeric_e = [False, True, True, False, True, True, False] decimal_e = [False, True, False, False, False, True, False] tm.assert_series_equal(s.str.isnumeric(), Series(numeric_e)) tm.assert_series_equal(s.str.isdecimal(), Series(decimal_e)) unicodes = [u'A', u'3', u'¼', u'★', u'፸', u'3', u'four'] assert s.str.isnumeric().tolist() == [v.isnumeric() for v in unicodes] assert s.str.isdecimal().tolist() == [v.isdecimal() for v in unicodes] values = ['A', np.nan, u'¼', u'★', np.nan, u'3', 'four'] s = Series(values) numeric_e = [False, np.nan, True, False, np.nan, True, False] decimal_e = [False, np.nan, False, False, np.nan, True, False] tm.assert_series_equal(s.str.isnumeric(), Series(numeric_e)) tm.assert_series_equal(s.str.isdecimal(), Series(decimal_e)) def test_get_dummies(self): s = Series(['a|b', 'a|c', np.nan]) result = s.str.get_dummies('|') expected = DataFrame([[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list('abc')) tm.assert_frame_equal(result, expected) s = Series(['a;b', 'a', 7]) result = s.str.get_dummies(';') expected = DataFrame([[0, 1, 1], [0, 1, 0], [1, 0, 0]], columns=list('7ab')) tm.assert_frame_equal(result, expected) # GH9980, GH8028 idx = Index(['a|b', 'a|c', 'b|c']) result = idx.str.get_dummies('|') expected = MultiIndex.from_tuples([(1, 1, 0), (1, 0, 1), (0, 1, 1)], names=('a', 'b', 'c')) tm.assert_index_equal(result, expected) def test_get_dummies_with_name_dummy(self): # GH 12180 # Dummies named 'name' should work as expected s = Series(['a', 'b,name', 'b']) result = s.str.get_dummies(',') expected = DataFrame([[1, 0, 0], [0, 1, 1], [0, 1, 0]], columns=['a', 'b', 'name']) tm.assert_frame_equal(result, expected) idx = Index(['a|b', 'name|c', 'b|name']) result = idx.str.get_dummies('|') expected = MultiIndex.from_tuples([(1, 1, 0, 0), (0, 0, 1, 1), (0, 1, 0, 1)], names=('a', 'b', 'c', 'name')) tm.assert_index_equal(result, expected) def test_join(self): values = Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h']) result = values.str.split('_').str.join('_') tm.assert_series_equal(values, result) # mixed mixed = Series(['a_b', NA, 'asdf_cas_asdf', True, datetime.today(), 'foo', None, 1, 2.]) rs = Series(mixed).str.split('_').str.join('_') xp = Series(['a_b', NA, 'asdf_cas_asdf', NA, NA, 'foo', NA, NA, NA]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) # unicode values = Series([u('a_b_c'), u('c_d_e'), np.nan, u('f_g_h')]) result = values.str.split('_').str.join('_') tm.assert_series_equal(values, result) def test_len(self): values = Series(['foo', 'fooo', 'fooooo', np.nan, 'fooooooo']) result = values.str.len() exp = values.map(lambda x: len(x) if notna(x) else NA) tm.assert_series_equal(result, exp) # mixed mixed = Series(['a_b', NA, 'asdf_cas_asdf', True, datetime.today(), 'foo', None, 1, 2.]) rs = Series(mixed).str.len() xp = Series([3, NA, 13, NA, NA, 3, NA, NA, NA]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) # unicode values = Series([u('foo'), u('fooo'), u('fooooo'), np.nan, u( 'fooooooo')]) result = values.str.len() exp = values.map(lambda x: len(x) if notna(x) else NA) tm.assert_series_equal(result, exp) def test_findall(self): values = Series(['fooBAD__barBAD', NA, 'foo', 'BAD']) result = values.str.findall('BAD[_]*') exp = Series([['BAD__', 'BAD'], NA, [], ['BAD']]) tm.assert_almost_equal(result, exp) # mixed mixed = Series(['fooBAD__barBAD', NA, 'foo', True, datetime.today(), 'BAD', None, 1, 2.]) rs = Series(mixed).str.findall('BAD[_]*') xp = Series([['BAD__', 'BAD'], NA, [], NA, NA, ['BAD'], NA, NA, NA]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) # unicode values = Series([u('fooBAD__barBAD'), NA, u('foo'), u('BAD')]) result = values.str.findall('BAD[_]*') exp = Series([[u('BAD__'), u('BAD')], NA, [], [u('BAD')]]) tm.assert_almost_equal(result, exp) def test_find(self): values = Series(['ABCDEFG', 'BCDEFEF', 'DEFGHIJEF', 'EFGHEF', 'XXXX']) result = values.str.find('EF') tm.assert_series_equal(result, Series([4, 3, 1, 0, -1])) expected = np.array([v.find('EF') for v in values.values], dtype=np.int64) tm.assert_numpy_array_equal(result.values, expected) result = values.str.rfind('EF') tm.assert_series_equal(result, Series([4, 5, 7, 4, -1])) expected = np.array([v.rfind('EF') for v in values.values], dtype=np.int64) tm.assert_numpy_array_equal(result.values, expected) result = values.str.find('EF', 3) tm.assert_series_equal(result, Series([4, 3, 7, 4, -1])) expected = np.array([v.find('EF', 3) for v in values.values], dtype=np.int64) tm.assert_numpy_array_equal(result.values, expected) result = values.str.rfind('EF', 3) tm.assert_series_equal(result, Series([4, 5, 7, 4, -1])) expected = np.array([v.rfind('EF', 3) for v in values.values], dtype=np.int64) tm.assert_numpy_array_equal(result.values, expected) result = values.str.find('EF', 3, 6) tm.assert_series_equal(result, Series([4, 3, -1, 4, -1])) expected = np.array([v.find('EF', 3, 6) for v in values.values], dtype=np.int64) tm.assert_numpy_array_equal(result.values, expected) result = values.str.rfind('EF', 3, 6) tm.assert_series_equal(result, Series([4, 3, -1, 4, -1])) expected = np.array([v.rfind('EF', 3, 6) for v in values.values], dtype=np.int64) tm.assert_numpy_array_equal(result.values, expected) with tm.assert_raises_regex(TypeError, "expected a string object, not int"): result = values.str.find(0) with tm.assert_raises_regex(TypeError, "expected a string object, not int"): result = values.str.rfind(0) def test_find_nan(self): values = Series(['ABCDEFG', np.nan, 'DEFGHIJEF', np.nan, 'XXXX']) result = values.str.find('EF') tm.assert_series_equal(result, Series([4, np.nan, 1, np.nan, -1])) result = values.str.rfind('EF') tm.assert_series_equal(result, Series([4, np.nan, 7, np.nan, -1])) result = values.str.find('EF', 3) tm.assert_series_equal(result, Series([4, np.nan, 7, np.nan, -1])) result = values.str.rfind('EF', 3) tm.assert_series_equal(result, Series([4, np.nan, 7, np.nan, -1])) result = values.str.find('EF', 3, 6) tm.assert_series_equal(result, Series([4, np.nan, -1, np.nan, -1])) result = values.str.rfind('EF', 3, 6) tm.assert_series_equal(result, Series([4, np.nan, -1, np.nan, -1])) def test_index(self): def _check(result, expected): if isinstance(result, Series): tm.assert_series_equal(result, expected) else: tm.assert_index_equal(result, expected) for klass in [Series, Index]: s = klass(['ABCDEFG', 'BCDEFEF', 'DEFGHIJEF', 'EFGHEF']) result = s.str.index('EF') _check(result, klass([4, 3, 1, 0])) expected = np.array([v.index('EF') for v in s.values], dtype=np.int64) tm.assert_numpy_array_equal(result.values, expected) result = s.str.rindex('EF') _check(result, klass([4, 5, 7, 4])) expected = np.array([v.rindex('EF') for v in s.values], dtype=np.int64) tm.assert_numpy_array_equal(result.values, expected) result = s.str.index('EF', 3) _check(result, klass([4, 3, 7, 4])) expected = np.array([v.index('EF', 3) for v in s.values], dtype=np.int64) tm.assert_numpy_array_equal(result.values, expected) result = s.str.rindex('EF', 3) _check(result, klass([4, 5, 7, 4])) expected = np.array([v.rindex('EF', 3) for v in s.values], dtype=np.int64) tm.assert_numpy_array_equal(result.values, expected) result = s.str.index('E', 4, 8) _check(result, klass([4, 5, 7, 4])) expected = np.array([v.index('E', 4, 8) for v in s.values], dtype=np.int64) tm.assert_numpy_array_equal(result.values, expected) result = s.str.rindex('E', 0, 5) _check(result, klass([4, 3, 1, 4])) expected = np.array([v.rindex('E', 0, 5) for v in s.values], dtype=np.int64) tm.assert_numpy_array_equal(result.values, expected) with tm.assert_raises_regex(ValueError, "substring not found"): result = s.str.index('DE') with tm.assert_raises_regex(TypeError, "expected a string " "object, not int"): result = s.str.index(0) # test with nan s = Series(['abcb', 'ab', 'bcbe', np.nan]) result = s.str.index('b') tm.assert_series_equal(result, Series([1, 1, 0, np.nan])) result = s.str.rindex('b') tm.assert_series_equal(result, Series([3, 1, 2, np.nan])) def test_pad(self): values = Series(['a', 'b', NA, 'c', NA, 'eeeeee']) result = values.str.pad(5, side='left') exp = Series([' a', ' b', NA, ' c', NA, 'eeeeee']) tm.assert_almost_equal(result, exp) result = values.str.pad(5, side='right') exp = Series(['a ', 'b ', NA, 'c ', NA, 'eeeeee']) tm.assert_almost_equal(result, exp) result = values.str.pad(5, side='both') exp = Series([' a ', ' b ', NA, ' c ', NA, 'eeeeee']) tm.assert_almost_equal(result, exp) # mixed mixed = Series(['a', NA, 'b', True, datetime.today(), 'ee', None, 1, 2. ]) rs = Series(mixed).str.pad(5, side='left') xp = Series([' a', NA, ' b', NA, NA, ' ee', NA, NA, NA]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) mixed = Series(['a', NA, 'b', True, datetime.today(), 'ee', None, 1, 2. ]) rs = Series(mixed).str.pad(5, side='right') xp = Series(['a ', NA, 'b ', NA, NA, 'ee ', NA, NA, NA]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) mixed = Series(['a', NA, 'b', True, datetime.today(), 'ee', None, 1, 2. ]) rs = Series(mixed).str.pad(5, side='both') xp = Series([' a ', NA, ' b ', NA, NA, ' ee ', NA, NA, NA]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) # unicode values = Series([u('a'), u('b'), NA, u('c'), NA, u('eeeeee')]) result = values.str.pad(5, side='left') exp = Series([u(' a'), u(' b'), NA, u(' c'), NA, u('eeeeee')]) tm.assert_almost_equal(result, exp) result = values.str.pad(5, side='right') exp = Series([u('a '), u('b '), NA, u('c '), NA, u('eeeeee')]) tm.assert_almost_equal(result, exp) result = values.str.pad(5, side='both') exp = Series([u(' a '), u(' b '), NA, u(' c '), NA, u('eeeeee')]) tm.assert_almost_equal(result, exp) def test_pad_fillchar(self): values = Series(['a', 'b', NA, 'c', NA, 'eeeeee']) result = values.str.pad(5, side='left', fillchar='X') exp = Series(['XXXXa', 'XXXXb', NA, 'XXXXc', NA, 'eeeeee']) tm.assert_almost_equal(result, exp) result = values.str.pad(5, side='right', fillchar='X') exp = Series(['aXXXX', 'bXXXX', NA, 'cXXXX', NA, 'eeeeee']) tm.assert_almost_equal(result, exp) result = values.str.pad(5, side='both', fillchar='X') exp = Series(['XXaXX', 'XXbXX', NA, 'XXcXX', NA, 'eeeeee']) tm.assert_almost_equal(result, exp) with tm.assert_raises_regex(TypeError, "fillchar must be a " "character, not str"): result = values.str.pad(5, fillchar='XY') with tm.assert_raises_regex(TypeError, "fillchar must be a " "character, not int"): result = values.str.pad(5, fillchar=5) def test_pad_width(self): # GH 13598 s = Series(['1', '22', 'a', 'bb']) for f in ['center', 'ljust', 'rjust', 'zfill', 'pad']: with tm.assert_raises_regex(TypeError, "width must be of " "integer type, not*"): getattr(s.str, f)('f') def test_translate(self): def _check(result, expected): if isinstance(result, Series): tm.assert_series_equal(result, expected) else: tm.assert_index_equal(result, expected) for klass in [Series, Index]: s = klass(['abcdefg', 'abcc', 'cdddfg', 'cdefggg']) if not compat.PY3: import string table = string.maketrans('abc', 'cde') else: table = str.maketrans('abc', 'cde') result = s.str.translate(table) expected = klass(['cdedefg', 'cdee', 'edddfg', 'edefggg']) _check(result, expected) # use of deletechars is python 2 only if not compat.PY3: result = s.str.translate(table, deletechars='fg') expected = klass(['cdede', 'cdee', 'eddd', 'ede']) _check(result, expected) result = s.str.translate(None, deletechars='fg') expected = klass(['abcde', 'abcc', 'cddd', 'cde']) _check(result, expected) else: with tm.assert_raises_regex( ValueError, "deletechars is not a valid argument"): result = s.str.translate(table, deletechars='fg') # Series with non-string values s = Series(['a', 'b', 'c', 1.2]) expected = Series(['c', 'd', 'e', np.nan]) result = s.str.translate(table) tm.assert_series_equal(result, expected) def test_center_ljust_rjust(self): values = Series(['a', 'b', NA, 'c', NA, 'eeeeee']) result = values.str.center(5) exp = Series([' a ', ' b ', NA, ' c ', NA, 'eeeeee']) tm.assert_almost_equal(result, exp) result = values.str.ljust(5) exp = Series(['a ', 'b ', NA, 'c ', NA, 'eeeeee']) tm.assert_almost_equal(result, exp) result = values.str.rjust(5) exp = Series([' a', ' b', NA, ' c', NA, 'eeeeee']) tm.assert_almost_equal(result, exp) # mixed mixed = Series(['a', NA, 'b', True, datetime.today(), 'c', 'eee', None, 1, 2.]) rs = Series(mixed).str.center(5) xp = Series([' a ', NA, ' b ', NA, NA, ' c ', ' eee ', NA, NA, NA ]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) rs = Series(mixed).str.ljust(5) xp = Series(['a ', NA, 'b ', NA, NA, 'c ', 'eee ', NA, NA, NA ]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) rs = Series(mixed).str.rjust(5) xp = Series([' a', NA, ' b', NA, NA, ' c', ' eee', NA, NA, NA ]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) # unicode values = Series([u('a'), u('b'), NA, u('c'), NA, u('eeeeee')]) result = values.str.center(5) exp = Series([u(' a '), u(' b '), NA, u(' c '), NA, u('eeeeee')]) tm.assert_almost_equal(result, exp) result = values.str.ljust(5) exp = Series([u('a '), u('b '), NA, u('c '), NA, u('eeeeee')]) tm.assert_almost_equal(result, exp) result = values.str.rjust(5) exp = Series([u(' a'), u(' b'), NA, u(' c'), NA, u('eeeeee')]) tm.assert_almost_equal(result, exp) def test_center_ljust_rjust_fillchar(self): values = Series(['a', 'bb', 'cccc', 'ddddd', 'eeeeee']) result = values.str.center(5, fillchar='X') expected = Series(['XXaXX', 'XXbbX', 'Xcccc', 'ddddd', 'eeeeee']) tm.assert_series_equal(result, expected) expected = np.array([v.center(5, 'X') for v in values.values], dtype=np.object_) tm.assert_numpy_array_equal(result.values, expected) result = values.str.ljust(5, fillchar='X') expected = Series(['aXXXX', 'bbXXX', 'ccccX', 'ddddd', 'eeeeee']) tm.assert_series_equal(result, expected) expected = np.array([v.ljust(5, 'X') for v in values.values], dtype=np.object_) tm.assert_numpy_array_equal(result.values, expected) result = values.str.rjust(5, fillchar='X') expected = Series(['XXXXa', 'XXXbb', 'Xcccc', 'ddddd', 'eeeeee']) tm.assert_series_equal(result, expected) expected = np.array([v.rjust(5, 'X') for v in values.values], dtype=np.object_) tm.assert_numpy_array_equal(result.values, expected) # If fillchar is not a charatter, normal str raises TypeError # 'aaa'.ljust(5, 'XY') # TypeError: must be char, not str with tm.assert_raises_regex(TypeError, "fillchar must be a " "character, not str"): result = values.str.center(5, fillchar='XY') with tm.assert_raises_regex(TypeError, "fillchar must be a " "character, not str"): result = values.str.ljust(5, fillchar='XY') with tm.assert_raises_regex(TypeError, "fillchar must be a " "character, not str"): result = values.str.rjust(5, fillchar='XY') with tm.assert_raises_regex(TypeError, "fillchar must be a " "character, not int"): result = values.str.center(5, fillchar=1) with tm.assert_raises_regex(TypeError, "fillchar must be a " "character, not int"): result = values.str.ljust(5, fillchar=1) with tm.assert_raises_regex(TypeError, "fillchar must be a " "character, not int"): result = values.str.rjust(5, fillchar=1) def test_zfill(self): values = Series(['1', '22', 'aaa', '333', '45678']) result = values.str.zfill(5) expected = Series(['00001', '00022', '00aaa', '00333', '45678']) tm.assert_series_equal(result, expected) expected = np.array([v.zfill(5) for v in values.values], dtype=np.object_) tm.assert_numpy_array_equal(result.values, expected) result = values.str.zfill(3) expected = Series(['001', '022', 'aaa', '333', '45678']) tm.assert_series_equal(result, expected) expected = np.array([v.zfill(3) for v in values.values], dtype=np.object_) tm.assert_numpy_array_equal(result.values, expected) values = Series(['1', np.nan, 'aaa', np.nan, '45678']) result = values.str.zfill(5) expected = Series(['00001', np.nan, '00aaa', np.nan, '45678']) tm.assert_series_equal(result, expected) def test_split(self): values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h']) result = values.str.split('_') exp = Series([['a', 'b', 'c'], ['c', 'd', 'e'], NA, ['f', 'g', 'h']]) tm.assert_series_equal(result, exp) # more than one char values = Series(['a__b__c', 'c__d__e', NA, 'f__g__h']) result = values.str.split('__') tm.assert_series_equal(result, exp) result = values.str.split('__', expand=False) tm.assert_series_equal(result, exp) # mixed mixed = Series(['a_b_c', NA, 'd_e_f', True, datetime.today(), None, 1, 2.]) result = mixed.str.split('_') exp = Series([['a', 'b', 'c'], NA, ['d', 'e', 'f'], NA, NA, NA, NA, NA ]) assert isinstance(result, Series) tm.assert_almost_equal(result, exp) result = mixed.str.split('_', expand=False) assert isinstance(result, Series) tm.assert_almost_equal(result, exp) # unicode values = Series([u('a_b_c'), u('c_d_e'), NA, u('f_g_h')]) result = values.str.split('_') exp = Series([[u('a'), u('b'), u('c')], [u('c'), u('d'), u('e')], NA, [u('f'), u('g'), u('h')]]) tm.assert_series_equal(result, exp) result = values.str.split('_', expand=False) tm.assert_series_equal(result, exp) # regex split values = Series([u('a,b_c'), u('c_d,e'), NA, u('f,g,h')]) result = values.str.split('[,_]') exp = Series([[u('a'), u('b'), u('c')], [u('c'), u('d'), u('e')], NA, [u('f'), u('g'), u('h')]]) tm.assert_series_equal(result, exp) def test_rsplit(self): values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h']) result = values.str.rsplit('_') exp = Series([['a', 'b', 'c'], ['c', 'd', 'e'], NA, ['f', 'g', 'h']]) tm.assert_series_equal(result, exp) # more than one char values = Series(['a__b__c', 'c__d__e', NA, 'f__g__h']) result = values.str.rsplit('__') tm.assert_series_equal(result, exp) result = values.str.rsplit('__', expand=False) tm.assert_series_equal(result, exp) # mixed mixed = Series(['a_b_c', NA, 'd_e_f', True, datetime.today(), None, 1, 2.]) result = mixed.str.rsplit('_') exp = Series([['a', 'b', 'c'], NA, ['d', 'e', 'f'], NA, NA, NA, NA, NA ]) assert isinstance(result, Series) tm.assert_almost_equal(result, exp) result = mixed.str.rsplit('_', expand=False) assert isinstance(result, Series) tm.assert_almost_equal(result, exp) # unicode values = Series([u('a_b_c'), u('c_d_e'), NA, u('f_g_h')]) result = values.str.rsplit('_') exp = Series([[u('a'), u('b'), u('c')], [u('c'), u('d'), u('e')], NA, [u('f'), u('g'), u('h')]]) tm.assert_series_equal(result, exp) result = values.str.rsplit('_', expand=False) tm.assert_series_equal(result, exp) # regex split is not supported by rsplit values = Series([u('a,b_c'), u('c_d,e'), NA, u('f,g,h')]) result = values.str.rsplit('[,_]') exp = Series([[u('a,b_c')], [u('c_d,e')], NA, [u('f,g,h')]]) tm.assert_series_equal(result, exp) # setting max number of splits, make sure it's from reverse values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h']) result = values.str.rsplit('_', n=1) exp = Series([['a_b', 'c'], ['c_d', 'e'], NA, ['f_g', 'h']]) tm.assert_series_equal(result, exp) def test_split_blank_string(self): # expand blank split GH 20067 values = Series([''], name='test') result = values.str.split(expand=True) exp = DataFrame([[]]) tm.assert_frame_equal(result, exp) values = Series(['a b c', 'a b', '', ' '], name='test') result = values.str.split(expand=True) exp = DataFrame([['a', 'b', 'c'], ['a', 'b', np.nan], [np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan]]) tm.assert_frame_equal(result, exp) def test_split_noargs(self): # #1859 s = Series(['Wes McKinney', 'Travis Oliphant']) result = s.str.split() expected = ['Travis', 'Oliphant'] assert result[1] == expected result = s.str.rsplit() assert result[1] == expected def test_split_maxsplit(self): # re.split 0, str.split -1 s = Series(['bd asdf jfg', 'kjasdflqw asdfnfk']) result = s.str.split(n=-1) xp = s.str.split() tm.assert_series_equal(result, xp) result = s.str.split(n=0) tm.assert_series_equal(result, xp) xp = s.str.split('asdf') result = s.str.split('asdf', n=0) tm.assert_series_equal(result, xp) result = s.str.split('asdf', n=-1) tm.assert_series_equal(result, xp) def test_split_no_pat_with_nonzero_n(self): s = Series(['split once', 'split once too!']) result = s.str.split(n=1) expected = Series({0: ['split', 'once'], 1: ['split', 'once too!']}) tm.assert_series_equal(expected, result, check_index_type=False) def test_split_to_dataframe(self): s = Series(['nosplit', 'alsonosplit']) result = s.str.split('_', expand=True) exp = DataFrame({0: Series(['nosplit', 'alsonosplit'])}) tm.assert_frame_equal(result, exp) s = Series(['some_equal_splits', 'with_no_nans']) result = s.str.split('_', expand=True) exp = DataFrame({0: ['some', 'with'], 1: ['equal', 'no'], 2: ['splits', 'nans']}) tm.assert_frame_equal(result, exp) s = Series(['some_unequal_splits', 'one_of_these_things_is_not']) result = s.str.split('_', expand=True) exp = DataFrame({0: ['some', 'one'], 1: ['unequal', 'of'], 2: ['splits', 'these'], 3: [NA, 'things'], 4: [NA, 'is'], 5: [NA, 'not']}) tm.assert_frame_equal(result, exp) s = Series(['some_splits', 'with_index'], index=['preserve', 'me']) result = s.str.split('_', expand=True) exp = DataFrame({0: ['some', 'with'], 1: ['splits', 'index']}, index=['preserve', 'me']) tm.assert_frame_equal(result, exp) with tm.assert_raises_regex(ValueError, "expand must be"): s.str.split('_', expand="not_a_boolean") def test_split_to_multiindex_expand(self): idx = Index(['nosplit', 'alsonosplit']) result = idx.str.split('_', expand=True) exp = idx tm.assert_index_equal(result, exp) assert result.nlevels == 1 idx = Index(['some_equal_splits', 'with_no_nans']) result = idx.str.split('_', expand=True) exp = MultiIndex.from_tuples([('some', 'equal', 'splits'), ( 'with', 'no', 'nans')]) tm.assert_index_equal(result, exp) assert result.nlevels == 3 idx = Index(['some_unequal_splits', 'one_of_these_things_is_not']) result = idx.str.split('_', expand=True) exp = MultiIndex.from_tuples([('some', 'unequal', 'splits', NA, NA, NA ), ('one', 'of', 'these', 'things', 'is', 'not')]) tm.assert_index_equal(result, exp) assert result.nlevels == 6 with tm.assert_raises_regex(ValueError, "expand must be"): idx.str.split('_', expand="not_a_boolean") def test_rsplit_to_dataframe_expand(self): s = Series(['nosplit', 'alsonosplit']) result = s.str.rsplit('_', expand=True) exp = DataFrame({0: Series(['nosplit', 'alsonosplit'])}) tm.assert_frame_equal(result, exp) s = Series(['some_equal_splits', 'with_no_nans']) result = s.str.rsplit('_', expand=True) exp = DataFrame({0: ['some', 'with'], 1: ['equal', 'no'], 2: ['splits', 'nans']}) tm.assert_frame_equal(result, exp) result = s.str.rsplit('_', expand=True, n=2) exp = DataFrame({0: ['some', 'with'], 1: ['equal', 'no'], 2: ['splits', 'nans']}) tm.assert_frame_equal(result, exp) result = s.str.rsplit('_', expand=True, n=1) exp = DataFrame({0: ['some_equal', 'with_no'], 1: ['splits', 'nans']}) tm.assert_frame_equal(result, exp) s = Series(['some_splits', 'with_index'], index=['preserve', 'me']) result = s.str.rsplit('_', expand=True) exp = DataFrame({0: ['some', 'with'], 1: ['splits', 'index']}, index=['preserve', 'me']) tm.assert_frame_equal(result, exp) def test_rsplit_to_multiindex_expand(self): idx = Index(['nosplit', 'alsonosplit']) result = idx.str.rsplit('_', expand=True) exp = idx tm.assert_index_equal(result, exp) assert result.nlevels == 1 idx = Index(['some_equal_splits', 'with_no_nans']) result = idx.str.rsplit('_', expand=True) exp = MultiIndex.from_tuples([('some', 'equal', 'splits'), ( 'with', 'no', 'nans')]) tm.assert_index_equal(result, exp) assert result.nlevels == 3 idx = Index(['some_equal_splits', 'with_no_nans']) result = idx.str.rsplit('_', expand=True, n=1) exp = MultiIndex.from_tuples([('some_equal', 'splits'), ('with_no', 'nans')]) tm.assert_index_equal(result, exp) assert result.nlevels == 2 def test_split_nan_expand(self): # gh-18450 s = Series(["foo,bar,baz", NA]) result = s.str.split(",", expand=True) exp = DataFrame([["foo", "bar", "baz"], [NA, NA, NA]]) tm.assert_frame_equal(result, exp) # check that these are actually np.nan and not None # TODO see GH 18463 # tm.assert_frame_equal does not differentiate assert all(np.isnan(x) for x in result.iloc[1]) def test_split_with_name(self): # GH 12617 # should preserve name s = Series(['a,b', 'c,d'], name='xxx') res = s.str.split(',') exp = Series([['a', 'b'], ['c', 'd']], name='xxx') tm.assert_series_equal(res, exp) res = s.str.split(',', expand=True) exp = DataFrame([['a', 'b'], ['c', 'd']]) tm.assert_frame_equal(res, exp) idx = Index(['a,b', 'c,d'], name='xxx') res = idx.str.split(',') exp = Index([['a', 'b'], ['c', 'd']], name='xxx') assert res.nlevels == 1 tm.assert_index_equal(res, exp) res = idx.str.split(',', expand=True) exp = MultiIndex.from_tuples([('a', 'b'), ('c', 'd')]) assert res.nlevels == 2 tm.assert_index_equal(res, exp) def test_partition_series(self): values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h']) result = values.str.partition('_', expand=False) exp = Series([('a', '_', 'b_c'), ('c', '_', 'd_e'), NA, ('f', '_', 'g_h')]) tm.assert_series_equal(result, exp) result = values.str.rpartition('_', expand=False) exp = Series([('a_b', '_', 'c'), ('c_d', '_', 'e'), NA, ('f_g', '_', 'h')]) tm.assert_series_equal(result, exp) # more than one char values = Series(['a__b__c', 'c__d__e', NA, 'f__g__h']) result = values.str.partition('__', expand=False) exp = Series([('a', '__', 'b__c'), ('c', '__', 'd__e'), NA, ('f', '__', 'g__h')]) tm.assert_series_equal(result, exp) result = values.str.rpartition('__', expand=False) exp = Series([('a__b', '__', 'c'), ('c__d', '__', 'e'), NA, ('f__g', '__', 'h')]) tm.assert_series_equal(result, exp) # None values = Series(['a b c', 'c d e', NA, 'f g h']) result = values.str.partition(expand=False) exp = Series([('a', ' ', 'b c'), ('c', ' ', 'd e'), NA, ('f', ' ', 'g h')]) tm.assert_series_equal(result, exp) result = values.str.rpartition(expand=False) exp = Series([('a b', ' ', 'c'), ('c d', ' ', 'e'), NA, ('f g', ' ', 'h')]) tm.assert_series_equal(result, exp) # Not splited values = Series(['abc', 'cde', NA, 'fgh']) result = values.str.partition('_', expand=False) exp = Series([('abc', '', ''), ('cde', '', ''), NA, ('fgh', '', '')]) tm.assert_series_equal(result, exp) result = values.str.rpartition('_', expand=False) exp = Series([('', '', 'abc'), ('', '', 'cde'), NA, ('', '', 'fgh')]) tm.assert_series_equal(result, exp) # unicode values = Series([u'a_b_c', u'c_d_e', NA, u'f_g_h']) result = values.str.partition('_', expand=False) exp = Series([(u'a', u'_', u'b_c'), (u'c', u'_', u'd_e'), NA, (u'f', u'_', u'g_h')]) tm.assert_series_equal(result, exp) result = values.str.rpartition('_', expand=False) exp = Series([(u'a_b', u'_', u'c'), (u'c_d', u'_', u'e'), NA, (u'f_g', u'_', u'h')]) tm.assert_series_equal(result, exp) # compare to standard lib values = Series(['A_B_C', 'B_C_D', 'E_F_G', 'EFGHEF']) result = values.str.partition('_', expand=False).tolist() assert result == [v.partition('_') for v in values] result = values.str.rpartition('_', expand=False).tolist() assert result == [v.rpartition('_') for v in values] def test_partition_index(self): values = Index(['a_b_c', 'c_d_e', 'f_g_h']) result = values.str.partition('_', expand=False) exp = Index(np.array([('a', '_', 'b_c'), ('c', '_', 'd_e'), ('f', '_', 'g_h')])) tm.assert_index_equal(result, exp) assert result.nlevels == 1 result = values.str.rpartition('_', expand=False) exp = Index(np.array([('a_b', '_', 'c'), ('c_d', '_', 'e'), ( 'f_g', '_', 'h')])) tm.assert_index_equal(result, exp) assert result.nlevels == 1 result = values.str.partition('_') exp = Index([('a', '_', 'b_c'), ('c', '_', 'd_e'), ('f', '_', 'g_h')]) tm.assert_index_equal(result, exp) assert isinstance(result, MultiIndex) assert result.nlevels == 3 result = values.str.rpartition('_') exp = Index([('a_b', '_', 'c'), ('c_d', '_', 'e'), ('f_g', '_', 'h')]) tm.assert_index_equal(result, exp) assert isinstance(result, MultiIndex) assert result.nlevels == 3 def test_partition_to_dataframe(self): values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h']) result = values.str.partition('_') exp = DataFrame({0: ['a', 'c', np.nan, 'f'], 1: ['_', '_', np.nan, '_'], 2: ['b_c', 'd_e', np.nan, 'g_h']}) tm.assert_frame_equal(result, exp) result = values.str.rpartition('_') exp = DataFrame({0: ['a_b', 'c_d', np.nan, 'f_g'], 1: ['_', '_', np.nan, '_'], 2: ['c', 'e', np.nan, 'h']}) tm.assert_frame_equal(result, exp) values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h']) result = values.str.partition('_', expand=True) exp = DataFrame({0: ['a', 'c', np.nan, 'f'], 1: ['_', '_', np.nan, '_'], 2: ['b_c', 'd_e', np.nan, 'g_h']}) tm.assert_frame_equal(result, exp) result = values.str.rpartition('_', expand=True) exp = DataFrame({0: ['a_b', 'c_d', np.nan, 'f_g'], 1: ['_', '_', np.nan, '_'], 2: ['c', 'e', np.nan, 'h']}) tm.assert_frame_equal(result, exp) def test_partition_with_name(self): # GH 12617 s = Series(['a,b', 'c,d'], name='xxx') res = s.str.partition(',') exp = DataFrame({0: ['a', 'c'], 1: [',', ','], 2: ['b', 'd']}) tm.assert_frame_equal(res, exp) # should preserve name res = s.str.partition(',', expand=False) exp = Series([('a', ',', 'b'), ('c', ',', 'd')], name='xxx') tm.assert_series_equal(res, exp) idx = Index(['a,b', 'c,d'], name='xxx') res = idx.str.partition(',') exp = MultiIndex.from_tuples([('a', ',', 'b'), ('c', ',', 'd')]) assert res.nlevels == 3 tm.assert_index_equal(res, exp) # should preserve name res = idx.str.partition(',', expand=False) exp = Index(np.array([('a', ',', 'b'), ('c', ',', 'd')]), name='xxx') assert res.nlevels == 1 tm.assert_index_equal(res, exp) def test_pipe_failures(self): # #2119 s = Series(['A|B|C']) result = s.str.split('|') exp = Series([['A', 'B', 'C']]) tm.assert_series_equal(result, exp) result = s.str.replace('|', ' ') exp = Series(['A B C']) tm.assert_series_equal(result, exp) def test_slice(self): values = Series(['aafootwo', 'aabartwo', NA, 'aabazqux']) result = values.str.slice(2, 5) exp = Series(['foo', 'bar', NA, 'baz']) tm.assert_series_equal(result, exp) for start, stop, step in [(0, 3, -1), (None, None, -1), (3, 10, 2), (3, 0, -1)]: try: result = values.str.slice(start, stop, step) expected = Series([s[start:stop:step] if not isna(s) else NA for s in values]) tm.assert_series_equal(result, expected) except: print('failed on %s:%s:%s' % (start, stop, step)) raise # mixed mixed = Series(['aafootwo', NA, 'aabartwo', True, datetime.today(), None, 1, 2.]) rs = Series(mixed).str.slice(2, 5) xp = Series(['foo', NA, 'bar', NA, NA, NA, NA, NA]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) rs = Series(mixed).str.slice(2, 5, -1) xp = Series(['oof', NA, 'rab', NA, NA, NA, NA, NA]) # unicode values = Series([u('aafootwo'), u('aabartwo'), NA, u('aabazqux')]) result = values.str.slice(2, 5) exp = Series([u('foo'), u('bar'), NA, u('baz')]) tm.assert_series_equal(result, exp) result = values.str.slice(0, -1, 2) exp = Series([u('afow'), u('abrw'), NA, u('abzu')]) tm.assert_series_equal(result, exp) def test_slice_replace(self): values = Series(['short', 'a bit longer', 'evenlongerthanthat', '', NA ]) exp = Series(['shrt', 'a it longer', 'evnlongerthanthat', '', NA]) result = values.str.slice_replace(2, 3) tm.assert_series_equal(result, exp) exp = Series(['shzrt', 'a zit longer', 'evznlongerthanthat', 'z', NA]) result = values.str.slice_replace(2, 3, 'z') tm.assert_series_equal(result, exp) exp = Series(['shzort', 'a zbit longer', 'evzenlongerthanthat', 'z', NA ]) result = values.str.slice_replace(2, 2, 'z') tm.assert_series_equal(result, exp) exp = Series(['shzort', 'a zbit longer', 'evzenlongerthanthat', 'z', NA ]) result = values.str.slice_replace(2, 1, 'z') tm.assert_series_equal(result, exp) exp = Series(['shorz', 'a bit longez', 'evenlongerthanthaz', 'z', NA]) result = values.str.slice_replace(-1, None, 'z') tm.assert_series_equal(result, exp) exp = Series(['zrt', 'zer', 'zat', 'z', NA]) result = values.str.slice_replace(None, -2, 'z') tm.assert_series_equal(result, exp) exp = Series(['shortz', 'a bit znger', 'evenlozerthanthat', 'z', NA]) result = values.str.slice_replace(6, 8, 'z') tm.assert_series_equal(result, exp) exp = Series(['zrt', 'a zit longer', 'evenlongzerthanthat', 'z', NA]) result = values.str.slice_replace(-10, 3, 'z') tm.assert_series_equal(result, exp) def test_strip_lstrip_rstrip(self): values = Series([' aa ', ' bb \n', NA, 'cc ']) result = values.str.strip() exp = Series(['aa', 'bb', NA, 'cc']) tm.assert_series_equal(result, exp) result = values.str.lstrip() exp = Series(['aa ', 'bb \n', NA, 'cc ']) tm.assert_series_equal(result, exp) result = values.str.rstrip() exp = Series([' aa', ' bb', NA, 'cc']) tm.assert_series_equal(result, exp) def test_strip_lstrip_rstrip_mixed(self): # mixed mixed = Series([' aa ', NA, ' bb \t\n', True, datetime.today(), None, 1, 2.]) rs = Series(mixed).str.strip() xp = Series(['aa', NA, 'bb', NA, NA, NA, NA, NA]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) rs = Series(mixed).str.lstrip() xp = Series(['aa ', NA, 'bb \t\n', NA, NA, NA, NA, NA]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) rs = Series(mixed).str.rstrip() xp = Series([' aa', NA, ' bb', NA, NA, NA, NA, NA]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) def test_strip_lstrip_rstrip_unicode(self): # unicode values = Series([u(' aa '), u(' bb \n'), NA, u('cc ')]) result = values.str.strip() exp = Series([u('aa'), u('bb'), NA, u('cc')]) tm.assert_series_equal(result, exp) result = values.str.lstrip() exp = Series([u('aa '), u('bb \n'), NA, u('cc ')]) tm.assert_series_equal(result, exp) result = values.str.rstrip() exp = Series([u(' aa'), u(' bb'), NA, u('cc')]) tm.assert_series_equal(result, exp) def test_strip_lstrip_rstrip_args(self): values = Series(['xxABCxx', 'xx BNSD', 'LDFJH xx']) rs = values.str.strip('x') xp = Series(['ABC', ' BNSD', 'LDFJH ']) assert_series_equal(rs, xp) rs = values.str.lstrip('x') xp = Series(['ABCxx', ' BNSD', 'LDFJH xx']) assert_series_equal(rs, xp) rs = values.str.rstrip('x') xp = Series(['xxABC', 'xx BNSD', 'LDFJH ']) assert_series_equal(rs, xp) def test_strip_lstrip_rstrip_args_unicode(self): values = Series([u('xxABCxx'), u('xx BNSD'), u('LDFJH xx')]) rs = values.str.strip(u('x')) xp = Series(['ABC', ' BNSD', 'LDFJH ']) assert_series_equal(rs, xp) rs = values.str.lstrip(u('x')) xp = Series(['ABCxx', ' BNSD', 'LDFJH xx']) assert_series_equal(rs, xp) rs = values.str.rstrip(u('x')) xp = Series(['xxABC', 'xx BNSD', 'LDFJH ']) assert_series_equal(rs, xp) def test_wrap(self): # test values are: two words less than width, two words equal to width, # two words greater than width, one word less than width, one word # equal to width, one word greater than width, multiple tokens with # trailing whitespace equal to width values = Series([u('hello world'), u('hello world!'), u( 'hello world!!'), u('abcdefabcde'), u('abcdefabcdef'), u( 'abcdefabcdefa'), u('ab ab ab ab '), u('ab ab ab ab a'), u( '\t')]) # expected values xp = Series([u('hello world'), u('hello world!'), u('hello\nworld!!'), u('abcdefabcde'), u('abcdefabcdef'), u('abcdefabcdef\na'), u('ab ab ab ab'), u('ab ab ab ab\na'), u('')]) rs = values.str.wrap(12, break_long_words=True) assert_series_equal(rs, xp) # test with pre and post whitespace (non-unicode), NaN, and non-ascii # Unicode values = Series([' pre ', np.nan, u('\xac\u20ac\U00008000 abadcafe') ]) xp = Series([' pre', NA, u('\xac\u20ac\U00008000 ab\nadcafe')]) rs = values.str.wrap(6) assert_series_equal(rs, xp) def test_get(self): values = Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h']) result = values.str.split('_').str.get(1) expected = Series(['b', 'd', np.nan, 'g']) tm.assert_series_equal(result, expected) # mixed mixed = Series(['a_b_c', NA, 'c_d_e', True, datetime.today(), None, 1, 2.]) rs = Series(mixed).str.split('_').str.get(1) xp = Series(['b', NA, 'd', NA, NA, NA, NA, NA]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) # unicode values = Series([u('a_b_c'), u('c_d_e'), np.nan, u('f_g_h')]) result = values.str.split('_').str.get(1) expected = Series([u('b'), u('d'), np.nan, u('g')]) tm.assert_series_equal(result, expected) # bounds testing values = Series(['1_2_3_4_5', '6_7_8_9_10', '11_12']) # positive index result = values.str.split('_').str.get(2) expected = Series(['3', '8', np.nan]) tm.assert_series_equal(result, expected) # negative index result = values.str.split('_').str.get(-3) expected = Series(['3', '8', np.nan]) tm.assert_series_equal(result, expected) def test_get_complex(self): # GH 20671, getting value not in dict raising `KeyError` values = Series([(1, 2, 3), [1, 2, 3], {1, 2, 3}, {1: 'a', 2: 'b', 3: 'c'}]) result = values.str.get(1) expected = Series([2, 2, np.nan, 'a']) tm.assert_series_equal(result, expected) result = values.str.get(-1) expected = Series([3, 3, np.nan, np.nan]) tm.assert_series_equal(result, expected) @pytest.mark.parametrize('to_type', [tuple, list, np.array]) def test_get_complex_nested(self, to_type): values = Series([to_type([to_type([1, 2])])]) result = values.str.get(0) expected = Series([to_type([1, 2])]) tm.assert_series_equal(result, expected) result = values.str.get(1) expected = Series([np.nan]) tm.assert_series_equal(result, expected) def test_more_contains(self): # PR #1179 s = Series(['A', 'B', 'C', 'Aaba', 'Baca', '', NA, 'CABA', 'dog', 'cat']) result = s.str.contains('a') expected = Series([False, False, False, True, True, False, np.nan, False, False, True]) assert_series_equal(result, expected) result = s.str.contains('a', case=False) expected = Series([True, False, False, True, True, False, np.nan, True, False, True]) assert_series_equal(result, expected) result = s.str.contains('Aa') expected = Series([False, False, False, True, False, False, np.nan, False, False, False]) assert_series_equal(result, expected) result = s.str.contains('ba') expected = Series([False, False, False, True, False, False, np.nan, False, False, False]) assert_series_equal(result, expected) result = s.str.contains('ba', case=False) expected = Series([False, False, False, True, True, False, np.nan, True, False, False]) assert_series_equal(result, expected) def test_contains_nan(self): # PR #14171 s = Series([np.nan, np.nan, np.nan], dtype=np.object_) result = s.str.contains('foo', na=False) expected = Series([False, False, False], dtype=np.bool_) assert_series_equal(result, expected) result = s.str.contains('foo', na=True) expected = Series([True, True, True], dtype=np.bool_) assert_series_equal(result, expected) result = s.str.contains('foo', na="foo") expected = Series(["foo", "foo", "foo"], dtype=np.object_) assert_series_equal(result, expected) result = s.str.contains('foo') expected = Series([np.nan, np.nan, np.nan], dtype=np.object_) assert_series_equal(result, expected) def test_more_replace(self): # PR #1179 s = Series(['A', 'B', 'C', 'Aaba', 'Baca', '', NA, 'CABA', 'dog', 'cat']) result = s.str.replace('A', 'YYY') expected = Series(['YYY', 'B', 'C', 'YYYaba', 'Baca', '', NA, 'CYYYBYYY', 'dog', 'cat']) assert_series_equal(result, expected) result = s.str.replace('A', 'YYY', case=False) expected = Series(['YYY', 'B', 'C', 'YYYYYYbYYY', 'BYYYcYYY', '', NA, 'CYYYBYYY', 'dog', 'cYYYt']) assert_series_equal(result, expected) result = s.str.replace('^.a|dog', 'XX-XX ', case=False) expected = Series(['A', 'B', 'C', 'XX-XX ba', 'XX-XX ca', '', NA, 'XX-XX BA', 'XX-XX ', 'XX-XX t']) assert_series_equal(result, expected) def test_string_slice_get_syntax(self): s = Series(['YYY', 'B', 'C', 'YYYYYYbYYY', 'BYYYcYYY', NA, 'CYYYBYYY', 'dog', 'cYYYt']) result = s.str[0] expected = s.str.get(0) assert_series_equal(result, expected) result = s.str[:3] expected = s.str.slice(stop=3) assert_series_equal(result, expected) result = s.str[2::-1] expected = s.str.slice(start=2, step=-1) assert_series_equal(result, expected) def test_string_slice_out_of_bounds(self): s = Series([(1, 2), (1, ), (3, 4, 5)]) result = s.str[1] expected = Series([2, np.nan, 4]) assert_series_equal(result, expected) s = Series(['foo', 'b', 'ba']) result = s.str[1] expected = Series(['o', np.nan, 'a']) assert_series_equal(result, expected) def test_match_findall_flags(self): data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com', 'Rob': 'rob@gmail.com', 'Wes': np.nan} data = Series(data) pat = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})' result = data.str.extract(pat, flags=re.IGNORECASE, expand=True) assert result.iloc[0].tolist() == ['dave', 'google', 'com'] result = data.str.match(pat, flags=re.IGNORECASE) assert result[0] result = data.str.findall(pat, flags=re.IGNORECASE) assert result[0][0] == ('dave', 'google', 'com') result = data.str.count(pat, flags=re.IGNORECASE) assert result[0] == 1 with tm.assert_produces_warning(UserWarning): result = data.str.contains(pat, flags=re.IGNORECASE) assert result[0] def test_encode_decode(self): base = Series([u('a'), u('b'), u('a\xe4')]) series = base.str.encode('utf-8') f = lambda x: x.decode('utf-8') result = series.str.decode('utf-8') exp = series.map(f) tm.assert_series_equal(result, exp) def test_encode_decode_errors(self): encodeBase = Series([u('a'), u('b'), u('a\x9d')]) pytest.raises(UnicodeEncodeError, encodeBase.str.encode, 'cp1252') f = lambda x: x.encode('cp1252', 'ignore') result = encodeBase.str.encode('cp1252', 'ignore') exp = encodeBase.map(f) tm.assert_series_equal(result, exp) decodeBase = Series([b'a', b'b', b'a\x9d']) pytest.raises(UnicodeDecodeError, decodeBase.str.decode, 'cp1252') f = lambda x: x.decode('cp1252', 'ignore') result = decodeBase.str.decode('cp1252', 'ignore') exp = decodeBase.map(f) tm.assert_series_equal(result, exp) def test_normalize(self): values = ['ABC', u'ABC', u'123', np.nan, u'アイエ'] s = Series(values, index=['a', 'b', 'c', 'd', 'e']) normed = [u'ABC', u'ABC', u'123', np.nan, u'アイエ'] expected = Series(normed, index=['a', 'b', 'c', 'd', 'e']) result = s.str.normalize('NFKC') tm.assert_series_equal(result, expected) expected = Series([u'ABC', u'ABC', u'123', np.nan, u'アイエ'], index=['a', 'b', 'c', 'd', 'e']) result = s.str.normalize('NFC') tm.assert_series_equal(result, expected) with tm.assert_raises_regex(ValueError, "invalid normalization form"): s.str.normalize('xxx') s = Index([u'ABC', u'123', u'アイエ']) expected = Index([u'ABC', u'123', u'アイエ']) result = s.str.normalize('NFKC') tm.assert_index_equal(result, expected) def test_index_str_accessor_visibility(self): from pandas.core.strings import StringMethods if not compat.PY3: cases = [(['a', 'b'], 'string'), (['a', u('b')], 'mixed'), ([u('a'), u('b')], 'unicode'), (['a', 'b', 1], 'mixed-integer'), (['a', 'b', 1.3], 'mixed'), (['a', 'b', 1.3, 1], 'mixed-integer'), (['aa', datetime(2011, 1, 1)], 'mixed')] else: cases = [(['a', 'b'], 'string'), (['a', u('b')], 'string'), ([u('a'), u('b')], 'string'), (['a', 'b', 1], 'mixed-integer'), (['a', 'b', 1.3], 'mixed'), (['a', 'b', 1.3, 1], 'mixed-integer'), (['aa', datetime(2011, 1, 1)], 'mixed')] for values, tp in cases: idx = Index(values) assert isinstance(Series(values).str, StringMethods) assert isinstance(idx.str, StringMethods) assert idx.inferred_type == tp for values, tp in cases: idx = Index(values) assert isinstance(Series(values).str, StringMethods) assert isinstance(idx.str, StringMethods) assert idx.inferred_type == tp cases = [([1, np.nan], 'floating'), ([datetime(2011, 1, 1)], 'datetime64'), ([timedelta(1)], 'timedelta64')] for values, tp in cases: idx = Index(values) message = 'Can only use .str accessor with string values' with tm.assert_raises_regex(AttributeError, message): Series(values).str with tm.assert_raises_regex(AttributeError, message): idx.str assert idx.inferred_type == tp # MultiIndex has mixed dtype, but not allow to use accessor idx = MultiIndex.from_tuples([('a', 'b'), ('a', 'b')]) assert idx.inferred_type == 'mixed' message = 'Can only use .str accessor with Index, not MultiIndex' with tm.assert_raises_regex(AttributeError, message): idx.str def test_str_accessor_no_new_attributes(self): # https://github.com/pandas-dev/pandas/issues/10673 s = Series(list('aabbcde')) with tm.assert_raises_regex(AttributeError, "You cannot add any new attribute"): s.str.xlabel = "a" def test_method_on_bytes(self): lhs = Series(np.array(list('abc'), 'S1').astype(object)) rhs = Series(np.array(list('def'), 'S1').astype(object)) if compat.PY3: pytest.raises(TypeError, lhs.str.cat, rhs, join='left') else: result = lhs.str.cat(rhs, join='left') expected = Series(np.array( ['ad', 'be', 'cf'], 'S2').astype(object)) tm.assert_series_equal(result, expected)