laywerrobot/lib/python3.6/site-packages/pandas/tests/test_strings.py
2020-08-27 21:55:39 +02:00

3183 lines
122 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding: utf-8 -*-
# pylint: disable-msg=E1101,W0612
from datetime import datetime, timedelta
import pytest
import re
from numpy import nan as NA
import numpy as np
from numpy.random import randint
from pandas.compat import range, u
import pandas.compat as compat
from pandas import Index, Series, DataFrame, isna, MultiIndex, notna, concat
from pandas.util.testing import assert_series_equal, assert_index_equal
import pandas.util.testing as tm
import pandas.core.strings as strings
def assert_series_or_index_equal(left, right):
if isinstance(left, Series):
assert_series_equal(left, right)
else: # Index
assert_index_equal(left, right)
class TestStringMethods(object):
def test_api(self):
# GH 6106, GH 9322
assert Series.str is strings.StringMethods
assert isinstance(Series(['']).str, strings.StringMethods)
# GH 9184
invalid = Series([1])
with tm.assert_raises_regex(AttributeError,
"only use .str accessor"):
invalid.str
assert not hasattr(invalid, 'str')
def test_iter(self):
# GH3638
strs = 'google', 'wikimedia', 'wikipedia', 'wikitravel'
ds = Series(strs)
for s in ds.str:
# iter must yield a Series
assert isinstance(s, Series)
# indices of each yielded Series should be equal to the index of
# the original Series
tm.assert_index_equal(s.index, ds.index)
for el in s:
# each element of the series is either a basestring/str or nan
assert isinstance(el, compat.string_types) or isna(el)
# desired behavior is to iterate until everything would be nan on the
# next iter so make sure the last element of the iterator was 'l' in
# this case since 'wikitravel' is the longest string
assert s.dropna().values.item() == 'l'
def test_iter_empty(self):
ds = Series([], dtype=object)
i, s = 100, 1
for i, s in enumerate(ds.str):
pass
# nothing to iterate over so nothing defined values should remain
# unchanged
assert i == 100
assert s == 1
def test_iter_single_element(self):
ds = Series(['a'])
for i, s in enumerate(ds.str):
pass
assert not i
assert_series_equal(ds, s)
def test_iter_object_try_string(self):
ds = Series([slice(None, randint(10), randint(10, 20)) for _ in range(
4)])
i, s = 100, 'h'
for i, s in enumerate(ds.str):
pass
assert i == 100
assert s == 'h'
def test_cat(self):
one = np.array(['a', 'a', 'b', 'b', 'c', NA], dtype=np.object_)
two = np.array(['a', NA, 'b', 'd', 'foo', NA], dtype=np.object_)
# single array
result = strings.str_cat(one)
exp = 'aabbc'
assert result == exp
result = strings.str_cat(one, na_rep='NA')
exp = 'aabbcNA'
assert result == exp
result = strings.str_cat(one, na_rep='-')
exp = 'aabbc-'
assert result == exp
result = strings.str_cat(one, sep='_', na_rep='NA')
exp = 'a_a_b_b_c_NA'
assert result == exp
result = strings.str_cat(two, sep='-')
exp = 'a-b-d-foo'
assert result == exp
# Multiple arrays
result = strings.str_cat(one, [two], na_rep='NA')
exp = np.array(['aa', 'aNA', 'bb', 'bd', 'cfoo', 'NANA'],
dtype=np.object_)
tm.assert_numpy_array_equal(result, exp)
result = strings.str_cat(one, two)
exp = np.array(['aa', NA, 'bb', 'bd', 'cfoo', NA], dtype=np.object_)
tm.assert_almost_equal(result, exp)
# error for incorrect lengths
rgx = 'All arrays must be same length'
three = Series(['1', '2', '3'])
with tm.assert_raises_regex(ValueError, rgx):
strings.str_cat(one, three)
# error for incorrect type
rgx = "Must pass arrays containing strings to str_cat"
with tm.assert_raises_regex(ValueError, rgx):
strings.str_cat(one, 'three')
@pytest.mark.parametrize('container', [Series, Index])
@pytest.mark.parametrize('other', [None, Series, Index])
def test_str_cat_name(self, container, other):
# https://github.com/pandas-dev/pandas/issues/21053
values = ['a', 'b']
if other:
other = other(values)
else:
other = values
result = container(values, name='name').str.cat(other, sep=',',
join='left')
assert result.name == 'name'
@pytest.mark.parametrize('series_or_index', ['series', 'index'])
def test_str_cat(self, series_or_index):
# test_cat above tests "str_cat" from ndarray to ndarray;
# here testing "str.cat" from Series/Index to Series/Index/ndarray/list
s = Index(['a', 'a', 'b', 'b', 'c', np.nan])
if series_or_index == 'series':
s = Series(s)
t = Index(['a', np.nan, 'b', 'd', 'foo', np.nan])
# single array
result = s.str.cat()
exp = 'aabbc'
assert result == exp
result = s.str.cat(na_rep='-')
exp = 'aabbc-'
assert result == exp
result = s.str.cat(sep='_', na_rep='NA')
exp = 'a_a_b_b_c_NA'
assert result == exp
# Series/Index with Index
exp = Index(['aa', 'a-', 'bb', 'bd', 'cfoo', '--'])
if series_or_index == 'series':
exp = Series(exp)
# s.index / s is different from t (as Index) -> warning
with tm.assert_produces_warning(expected_warning=FutureWarning):
# FutureWarning to switch to alignment by default
assert_series_or_index_equal(s.str.cat(t, na_rep='-'), exp)
# Series/Index with Series
t = Series(t)
# s as Series has same index as t -> no warning
# s as Index is different from t.index -> warning (tested below)
if series_or_index == 'series':
assert_series_equal(s.str.cat(t, na_rep='-'), exp)
# Series/Index with Series: warning if different indexes
t.index = t.index + 1
with tm.assert_produces_warning(expected_warning=FutureWarning):
# FutureWarning to switch to alignment by default
assert_series_or_index_equal(s.str.cat(t, na_rep='-'), exp)
# Series/Index with array
assert_series_or_index_equal(s.str.cat(t.values, na_rep='-'), exp)
# Series/Index with list
assert_series_or_index_equal(s.str.cat(list(t), na_rep='-'), exp)
# errors for incorrect lengths
rgx = 'All arrays must be same length, except.*'
z = Series(['1', '2', '3'])
with tm.assert_raises_regex(ValueError, rgx):
s.str.cat(z)
with tm.assert_raises_regex(ValueError, rgx):
s.str.cat(z.values)
with tm.assert_raises_regex(ValueError, rgx):
s.str.cat(list(z))
@pytest.mark.parametrize('series_or_index', ['series', 'index'])
def test_str_cat_raises_intuitive_error(self, series_or_index):
# https://github.com/pandas-dev/pandas/issues/11334
s = Index(['a', 'b', 'c', 'd'])
if series_or_index == 'series':
s = Series(s)
message = "Did you mean to supply a `sep` keyword?"
with tm.assert_raises_regex(ValueError, message):
s.str.cat('|')
with tm.assert_raises_regex(ValueError, message):
s.str.cat(' ')
@pytest.mark.parametrize('series_or_index, dtype_caller, dtype_target', [
('series', 'object', 'object'),
('series', 'object', 'category'),
('series', 'category', 'object'),
('series', 'category', 'category'),
('index', 'object', 'object'),
('index', 'object', 'category'),
('index', 'category', 'object'),
('index', 'category', 'category')
])
def test_str_cat_categorical(self, series_or_index,
dtype_caller, dtype_target):
s = Index(['a', 'a', 'b', 'a'], dtype=dtype_caller)
if series_or_index == 'series':
s = Series(s)
t = Index(['b', 'a', 'b', 'c'], dtype=dtype_target)
exp = Index(['ab', 'aa', 'bb', 'ac'])
if series_or_index == 'series':
exp = Series(exp)
# Series/Index with Index
# s.index / s is different from t (as Index) -> warning
with tm.assert_produces_warning(expected_warning=FutureWarning):
# FutureWarning to switch to alignment by default
assert_series_or_index_equal(s.str.cat(t), exp)
# Series/Index with Series
t = Series(t)
# s as Series has same index as t -> no warning
# s as Index is different from t.index -> warning (tested below)
if series_or_index == 'series':
assert_series_equal(s.str.cat(t), exp)
# Series/Index with Series: warning if different indexes
t.index = t.index + 1
with tm.assert_produces_warning(expected_warning=FutureWarning):
# FutureWarning to switch to alignment by default
assert_series_or_index_equal(s.str.cat(t, na_rep='-'), exp)
@pytest.mark.parametrize('series_or_index', ['series', 'index'])
def test_str_cat_mixed_inputs(self, series_or_index):
s = Index(['a', 'b', 'c', 'd'])
if series_or_index == 'series':
s = Series(s)
t = Series(['A', 'B', 'C', 'D'])
d = concat([t, Series(s)], axis=1)
exp = Index(['aAa', 'bBb', 'cCc', 'dDd'])
if series_or_index == 'series':
exp = Series(exp)
# Series/Index with DataFrame
# s as Series has same index as d -> no warning
# s as Index is different from d.index -> warning (tested below)
if series_or_index == 'series':
assert_series_equal(s.str.cat(d), exp)
# Series/Index with DataFrame: warning if different indexes
d.index = d.index + 1
with tm.assert_produces_warning(expected_warning=FutureWarning):
# FutureWarning to switch to alignment by default
assert_series_or_index_equal(s.str.cat(d), exp)
# Series/Index with two-dimensional ndarray
assert_series_or_index_equal(s.str.cat(d.values), exp)
# Series/Index with list of Series
# s as Series has same index as t, s -> no warning
# s as Index is different from t.index -> warning (tested below)
if series_or_index == 'series':
assert_series_equal(s.str.cat([t, s]), exp)
# Series/Index with list of Series: warning if different indexes
tt = t.copy()
tt.index = tt.index + 1
with tm.assert_produces_warning(expected_warning=FutureWarning):
# FutureWarning to switch to alignment by default
assert_series_or_index_equal(s.str.cat([tt, s]), exp)
# Series/Index with list of list-likes
assert_series_or_index_equal(s.str.cat([t.values, list(s)]), exp)
# Series/Index with mixed list of Series/list-like
# s as Series has same index as t -> no warning
# s as Index is different from t.index -> warning (tested below)
if series_or_index == 'series':
assert_series_equal(s.str.cat([t, s.values]), exp)
# Series/Index with mixed list: warning if different indexes
with tm.assert_produces_warning(expected_warning=FutureWarning):
# FutureWarning to switch to alignment by default
assert_series_or_index_equal(s.str.cat([tt, s.values]), exp)
# Series/Index with iterator of list-likes
assert_series_or_index_equal(s.str.cat(iter([t.values, list(s)])), exp)
# errors for incorrect lengths
rgx = 'All arrays must be same length, except.*'
z = Series(['1', '2', '3'])
e = concat([z, z], axis=1)
# DataFrame
with tm.assert_raises_regex(ValueError, rgx):
s.str.cat(e)
# two-dimensional ndarray
with tm.assert_raises_regex(ValueError, rgx):
s.str.cat(e.values)
# list of Series
with tm.assert_raises_regex(ValueError, rgx):
s.str.cat([z, s])
# list of list-likes
with tm.assert_raises_regex(ValueError, rgx):
s.str.cat([z.values, list(s)])
# mixed list of Series/list-like
with tm.assert_raises_regex(ValueError, rgx):
s.str.cat([z, list(s)])
# errors for incorrect arguments in list-like
rgx = 'others must be Series, Index, DataFrame,.*'
# make sure None/NaN do not crash checks in _get_series_list
u = Series(['a', np.nan, 'c', None])
# mix of string and Series
with tm.assert_raises_regex(TypeError, rgx):
s.str.cat([u, 'u'])
# DataFrame in list
with tm.assert_raises_regex(TypeError, rgx):
s.str.cat([u, d])
# 2-dim ndarray in list
with tm.assert_raises_regex(TypeError, rgx):
s.str.cat([u, d.values])
# nested lists
with tm.assert_raises_regex(TypeError, rgx):
s.str.cat([u, [u, d]])
# forbidden input type, e.g. int
with tm.assert_raises_regex(TypeError, rgx):
s.str.cat(1)
@pytest.mark.parametrize('series_or_index, join', [
('series', 'left'), ('series', 'outer'),
('series', 'inner'), ('series', 'right'),
('index', 'left'), ('index', 'outer'),
('index', 'inner'), ('index', 'right')
])
def test_str_cat_align_indexed(self, series_or_index, join):
# https://github.com/pandas-dev/pandas/issues/18657
s = Series(['a', 'b', 'c', 'd'], index=['a', 'b', 'c', 'd'])
t = Series(['D', 'A', 'E', 'B'], index=['d', 'a', 'e', 'b'])
sa, ta = s.align(t, join=join)
# result after manual alignment of inputs
exp = sa.str.cat(ta, na_rep='-')
if series_or_index == 'index':
s = Index(s)
sa = Index(sa)
exp = Index(exp)
assert_series_or_index_equal(s.str.cat(t, join=join, na_rep='-'), exp)
@pytest.mark.parametrize('join', ['left', 'outer', 'inner', 'right'])
def test_str_cat_align_mixed_inputs(self, join):
s = Series(['a', 'b', 'c', 'd'])
t = Series(['d', 'a', 'e', 'b'], index=[3, 0, 4, 1])
d = concat([t, t], axis=1)
exp_outer = Series(['aaa', 'bbb', 'c--', 'ddd', '-ee'])
sa, ta = s.align(t, join=join)
exp = exp_outer.loc[ta.index]
# list of Series
tm.assert_series_equal(s.str.cat([t, t], join=join, na_rep='-'), exp)
# DataFrame
tm.assert_series_equal(s.str.cat(d, join=join, na_rep='-'), exp)
# mixed list of indexed/unindexed
u = ['A', 'B', 'C', 'D']
exp_outer = Series(['aaA', 'bbB', 'c-C', 'ddD', '-e-'])
# u will be forced have index of s -> use s here as placeholder
e = concat([t, s], axis=1, join=(join if join == 'inner' else 'outer'))
sa, ea = s.align(e, join=join)
exp = exp_outer.loc[ea.index]
tm.assert_series_equal(s.str.cat([t, u], join=join, na_rep='-'), exp)
# errors for incorrect lengths
rgx = 'If `others` contains arrays or lists.*'
z = ['1', '2', '3']
# unindexed object of wrong length
with tm.assert_raises_regex(ValueError, rgx):
s.str.cat(z, join=join)
# unindexed object of wrong length in list
with tm.assert_raises_regex(ValueError, rgx):
s.str.cat([t, z], join=join)
def test_str_cat_special_cases(self):
s = Series(['a', 'b', 'c', 'd'])
t = Series(['d', 'a', 'e', 'b'], index=[3, 0, 4, 1])
# iterator of elements with different types
exp = Series(['aaA', 'bbB', 'c-C', 'ddD', '-e-'])
tm.assert_series_equal(s.str.cat(iter([t, ['A', 'B', 'C', 'D']]),
join='outer', na_rep='-'), exp)
# right-align with different indexes in others
exp = Series(['aa-', 'd-d'], index=[0, 3])
tm.assert_series_equal(s.str.cat([t.loc[[0]], t.loc[[3]]],
join='right', na_rep='-'), exp)
def test_cat_on_filtered_index(self):
df = DataFrame(index=MultiIndex.from_product(
[[2011, 2012], [1, 2, 3]], names=['year', 'month']))
df = df.reset_index()
df = df[df.month > 1]
str_year = df.year.astype('str')
str_month = df.month.astype('str')
str_both = str_year.str.cat(str_month, sep=' ', join='left')
assert str_both.loc[1] == '2011 2'
str_multiple = str_year.str.cat([str_month, str_month],
sep=' ', join='left')
assert str_multiple.loc[1] == '2011 2 2'
def test_count(self):
values = np.array(['foo', 'foofoo', NA, 'foooofooofommmfoo'],
dtype=np.object_)
result = strings.str_count(values, 'f[o]+')
exp = np.array([1, 2, NA, 4])
tm.assert_numpy_array_equal(result, exp)
result = Series(values).str.count('f[o]+')
exp = Series([1, 2, NA, 4])
assert isinstance(result, Series)
tm.assert_series_equal(result, exp)
# mixed
mixed = ['a', NA, 'b', True, datetime.today(), 'foo', None, 1, 2.]
rs = strings.str_count(mixed, 'a')
xp = np.array([1, NA, 0, NA, NA, 0, NA, NA, NA])
tm.assert_numpy_array_equal(rs, xp)
rs = Series(mixed).str.count('a')
xp = Series([1, NA, 0, NA, NA, 0, NA, NA, NA])
assert isinstance(rs, Series)
tm.assert_series_equal(rs, xp)
# unicode
values = [u('foo'), u('foofoo'), NA, u('foooofooofommmfoo')]
result = strings.str_count(values, 'f[o]+')
exp = np.array([1, 2, NA, 4])
tm.assert_numpy_array_equal(result, exp)
result = Series(values).str.count('f[o]+')
exp = Series([1, 2, NA, 4])
assert isinstance(result, Series)
tm.assert_series_equal(result, exp)
def test_contains(self):
values = np.array(['foo', NA, 'fooommm__foo',
'mmm_', 'foommm[_]+bar'], dtype=np.object_)
pat = 'mmm[_]+'
result = strings.str_contains(values, pat)
expected = np.array([False, NA, True, True, False], dtype=np.object_)
tm.assert_numpy_array_equal(result, expected)
result = strings.str_contains(values, pat, regex=False)
expected = np.array([False, NA, False, False, True], dtype=np.object_)
tm.assert_numpy_array_equal(result, expected)
values = ['foo', 'xyz', 'fooommm__foo', 'mmm_']
result = strings.str_contains(values, pat)
expected = np.array([False, False, True, True])
assert result.dtype == np.bool_
tm.assert_numpy_array_equal(result, expected)
# case insensitive using regex
values = ['Foo', 'xYz', 'fOOomMm__fOo', 'MMM_']
result = strings.str_contains(values, 'FOO|mmm', case=False)
expected = np.array([True, False, True, True])
tm.assert_numpy_array_equal(result, expected)
# case insensitive without regex
result = strings.str_contains(values, 'foo', regex=False, case=False)
expected = np.array([True, False, True, False])
tm.assert_numpy_array_equal(result, expected)
# mixed
mixed = ['a', NA, 'b', True, datetime.today(), 'foo', None, 1, 2.]
rs = strings.str_contains(mixed, 'o')
xp = np.array([False, NA, False, NA, NA, True, NA, NA, NA],
dtype=np.object_)
tm.assert_numpy_array_equal(rs, xp)
rs = Series(mixed).str.contains('o')
xp = Series([False, NA, False, NA, NA, True, NA, NA, NA])
assert isinstance(rs, Series)
tm.assert_series_equal(rs, xp)
# unicode
values = np.array([u'foo', NA, u'fooommm__foo', u'mmm_'],
dtype=np.object_)
pat = 'mmm[_]+'
result = strings.str_contains(values, pat)
expected = np.array([False, np.nan, True, True], dtype=np.object_)
tm.assert_numpy_array_equal(result, expected)
result = strings.str_contains(values, pat, na=False)
expected = np.array([False, False, True, True])
tm.assert_numpy_array_equal(result, expected)
values = np.array(['foo', 'xyz', 'fooommm__foo', 'mmm_'],
dtype=np.object_)
result = strings.str_contains(values, pat)
expected = np.array([False, False, True, True])
assert result.dtype == np.bool_
tm.assert_numpy_array_equal(result, expected)
# na
values = Series(['om', 'foo', np.nan])
res = values.str.contains('foo', na="foo")
assert res.loc[2] == "foo"
def test_startswith(self):
values = Series(['om', NA, 'foo_nom', 'nom', 'bar_foo', NA, 'foo'])
result = values.str.startswith('foo')
exp = Series([False, NA, True, False, False, NA, True])
tm.assert_series_equal(result, exp)
# mixed
mixed = np.array(['a', NA, 'b', True, datetime.today(),
'foo', None, 1, 2.], dtype=np.object_)
rs = strings.str_startswith(mixed, 'f')
xp = np.array([False, NA, False, NA, NA, True, NA, NA, NA],
dtype=np.object_)
tm.assert_numpy_array_equal(rs, xp)
rs = Series(mixed).str.startswith('f')
assert isinstance(rs, Series)
xp = Series([False, NA, False, NA, NA, True, NA, NA, NA])
tm.assert_series_equal(rs, xp)
# unicode
values = Series([u('om'), NA, u('foo_nom'), u('nom'), u('bar_foo'), NA,
u('foo')])
result = values.str.startswith('foo')
exp = Series([False, NA, True, False, False, NA, True])
tm.assert_series_equal(result, exp)
result = values.str.startswith('foo', na=True)
tm.assert_series_equal(result, exp.fillna(True).astype(bool))
def test_endswith(self):
values = Series(['om', NA, 'foo_nom', 'nom', 'bar_foo', NA, 'foo'])
result = values.str.endswith('foo')
exp = Series([False, NA, False, False, True, NA, True])
tm.assert_series_equal(result, exp)
# mixed
mixed = ['a', NA, 'b', True, datetime.today(), 'foo', None, 1, 2.]
rs = strings.str_endswith(mixed, 'f')
xp = np.array([False, NA, False, NA, NA, False, NA, NA, NA],
dtype=np.object_)
tm.assert_numpy_array_equal(rs, xp)
rs = Series(mixed).str.endswith('f')
xp = Series([False, NA, False, NA, NA, False, NA, NA, NA])
assert isinstance(rs, Series)
tm.assert_series_equal(rs, xp)
# unicode
values = Series([u('om'), NA, u('foo_nom'), u('nom'), u('bar_foo'), NA,
u('foo')])
result = values.str.endswith('foo')
exp = Series([False, NA, False, False, True, NA, True])
tm.assert_series_equal(result, exp)
result = values.str.endswith('foo', na=False)
tm.assert_series_equal(result, exp.fillna(False).astype(bool))
def test_title(self):
values = Series(["FOO", "BAR", NA, "Blah", "blurg"])
result = values.str.title()
exp = Series(["Foo", "Bar", NA, "Blah", "Blurg"])
tm.assert_series_equal(result, exp)
# mixed
mixed = Series(["FOO", NA, "bar", True, datetime.today(), "blah", None,
1, 2.])
mixed = mixed.str.title()
exp = Series(["Foo", NA, "Bar", NA, NA, "Blah", NA, NA, NA])
tm.assert_almost_equal(mixed, exp)
# unicode
values = Series([u("FOO"), NA, u("bar"), u("Blurg")])
results = values.str.title()
exp = Series([u("Foo"), NA, u("Bar"), u("Blurg")])
tm.assert_series_equal(results, exp)
def test_lower_upper(self):
values = Series(['om', NA, 'nom', 'nom'])
result = values.str.upper()
exp = Series(['OM', NA, 'NOM', 'NOM'])
tm.assert_series_equal(result, exp)
result = result.str.lower()
tm.assert_series_equal(result, values)
# mixed
mixed = Series(['a', NA, 'b', True, datetime.today(), 'foo', None, 1,
2.])
mixed = mixed.str.upper()
rs = Series(mixed).str.lower()
xp = Series(['a', NA, 'b', NA, NA, 'foo', NA, NA, NA])
assert isinstance(rs, Series)
tm.assert_series_equal(rs, xp)
# unicode
values = Series([u('om'), NA, u('nom'), u('nom')])
result = values.str.upper()
exp = Series([u('OM'), NA, u('NOM'), u('NOM')])
tm.assert_series_equal(result, exp)
result = result.str.lower()
tm.assert_series_equal(result, values)
def test_capitalize(self):
values = Series(["FOO", "BAR", NA, "Blah", "blurg"])
result = values.str.capitalize()
exp = Series(["Foo", "Bar", NA, "Blah", "Blurg"])
tm.assert_series_equal(result, exp)
# mixed
mixed = Series(["FOO", NA, "bar", True, datetime.today(), "blah", None,
1, 2.])
mixed = mixed.str.capitalize()
exp = Series(["Foo", NA, "Bar", NA, NA, "Blah", NA, NA, NA])
tm.assert_almost_equal(mixed, exp)
# unicode
values = Series([u("FOO"), NA, u("bar"), u("Blurg")])
results = values.str.capitalize()
exp = Series([u("Foo"), NA, u("Bar"), u("Blurg")])
tm.assert_series_equal(results, exp)
def test_swapcase(self):
values = Series(["FOO", "BAR", NA, "Blah", "blurg"])
result = values.str.swapcase()
exp = Series(["foo", "bar", NA, "bLAH", "BLURG"])
tm.assert_series_equal(result, exp)
# mixed
mixed = Series(["FOO", NA, "bar", True, datetime.today(), "Blah", None,
1, 2.])
mixed = mixed.str.swapcase()
exp = Series(["foo", NA, "BAR", NA, NA, "bLAH", NA, NA, NA])
tm.assert_almost_equal(mixed, exp)
# unicode
values = Series([u("FOO"), NA, u("bar"), u("Blurg")])
results = values.str.swapcase()
exp = Series([u("foo"), NA, u("BAR"), u("bLURG")])
tm.assert_series_equal(results, exp)
def test_casemethods(self):
values = ['aaa', 'bbb', 'CCC', 'Dddd', 'eEEE']
s = Series(values)
assert s.str.lower().tolist() == [v.lower() for v in values]
assert s.str.upper().tolist() == [v.upper() for v in values]
assert s.str.title().tolist() == [v.title() for v in values]
assert s.str.capitalize().tolist() == [v.capitalize() for v in values]
assert s.str.swapcase().tolist() == [v.swapcase() for v in values]
def test_replace(self):
values = Series(['fooBAD__barBAD', NA])
result = values.str.replace('BAD[_]*', '')
exp = Series(['foobar', NA])
tm.assert_series_equal(result, exp)
result = values.str.replace('BAD[_]*', '', n=1)
exp = Series(['foobarBAD', NA])
tm.assert_series_equal(result, exp)
# mixed
mixed = Series(['aBAD', NA, 'bBAD', True, datetime.today(), 'fooBAD',
None, 1, 2.])
rs = Series(mixed).str.replace('BAD[_]*', '')
xp = Series(['a', NA, 'b', NA, NA, 'foo', NA, NA, NA])
assert isinstance(rs, Series)
tm.assert_almost_equal(rs, xp)
# unicode
values = Series([u('fooBAD__barBAD'), NA])
result = values.str.replace('BAD[_]*', '')
exp = Series([u('foobar'), NA])
tm.assert_series_equal(result, exp)
result = values.str.replace('BAD[_]*', '', n=1)
exp = Series([u('foobarBAD'), NA])
tm.assert_series_equal(result, exp)
# flags + unicode
values = Series([b"abcd,\xc3\xa0".decode("utf-8")])
exp = Series([b"abcd, \xc3\xa0".decode("utf-8")])
result = values.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE)
tm.assert_series_equal(result, exp)
# GH 13438
for klass in (Series, Index):
for repl in (None, 3, {'a': 'b'}):
for data in (['a', 'b', None], ['a', 'b', 'c', 'ad']):
values = klass(data)
pytest.raises(TypeError, values.str.replace, 'a', repl)
def test_replace_callable(self):
# GH 15055
values = Series(['fooBAD__barBAD', NA])
# test with callable
repl = lambda m: m.group(0).swapcase()
result = values.str.replace('[a-z][A-Z]{2}', repl, n=2)
exp = Series(['foObaD__baRbaD', NA])
tm.assert_series_equal(result, exp)
# test with wrong number of arguments, raising an error
if compat.PY2:
p_err = r'takes (no|(exactly|at (least|most)) ?\d+) arguments?'
else:
p_err = (r'((takes)|(missing)) (?(2)from \d+ to )?\d+ '
r'(?(3)required )positional arguments?')
repl = lambda: None
with tm.assert_raises_regex(TypeError, p_err):
values.str.replace('a', repl)
repl = lambda m, x: None
with tm.assert_raises_regex(TypeError, p_err):
values.str.replace('a', repl)
repl = lambda m, x, y=None: None
with tm.assert_raises_regex(TypeError, p_err):
values.str.replace('a', repl)
# test regex named groups
values = Series(['Foo Bar Baz', NA])
pat = r"(?P<first>\w+) (?P<middle>\w+) (?P<last>\w+)"
repl = lambda m: m.group('middle').swapcase()
result = values.str.replace(pat, repl)
exp = Series(['bAR', NA])
tm.assert_series_equal(result, exp)
def test_replace_compiled_regex(self):
# GH 15446
values = Series(['fooBAD__barBAD', NA])
# test with compiled regex
pat = re.compile(r'BAD[_]*')
result = values.str.replace(pat, '')
exp = Series(['foobar', NA])
tm.assert_series_equal(result, exp)
# mixed
mixed = Series(['aBAD', NA, 'bBAD', True, datetime.today(), 'fooBAD',
None, 1, 2.])
rs = Series(mixed).str.replace(pat, '')
xp = Series(['a', NA, 'b', NA, NA, 'foo', NA, NA, NA])
assert isinstance(rs, Series)
tm.assert_almost_equal(rs, xp)
# unicode
values = Series([u('fooBAD__barBAD'), NA])
result = values.str.replace(pat, '')
exp = Series([u('foobar'), NA])
tm.assert_series_equal(result, exp)
result = values.str.replace(pat, '', n=1)
exp = Series([u('foobarBAD'), NA])
tm.assert_series_equal(result, exp)
# flags + unicode
values = Series([b"abcd,\xc3\xa0".decode("utf-8")])
exp = Series([b"abcd, \xc3\xa0".decode("utf-8")])
pat = re.compile(r"(?<=\w),(?=\w)", flags=re.UNICODE)
result = values.str.replace(pat, ", ")
tm.assert_series_equal(result, exp)
# case and flags provided to str.replace will have no effect
# and will produce warnings
values = Series(['fooBAD__barBAD__bad', NA])
pat = re.compile(r'BAD[_]*')
with tm.assert_raises_regex(ValueError,
"case and flags cannot be"):
result = values.str.replace(pat, '', flags=re.IGNORECASE)
with tm.assert_raises_regex(ValueError,
"case and flags cannot be"):
result = values.str.replace(pat, '', case=False)
with tm.assert_raises_regex(ValueError,
"case and flags cannot be"):
result = values.str.replace(pat, '', case=True)
# test with callable
values = Series(['fooBAD__barBAD', NA])
repl = lambda m: m.group(0).swapcase()
pat = re.compile('[a-z][A-Z]{2}')
result = values.str.replace(pat, repl, n=2)
exp = Series(['foObaD__baRbaD', NA])
tm.assert_series_equal(result, exp)
def test_replace_literal(self):
# GH16808 literal replace (regex=False vs regex=True)
values = Series(['f.o', 'foo', NA])
exp = Series(['bao', 'bao', NA])
result = values.str.replace('f.', 'ba')
tm.assert_series_equal(result, exp)
exp = Series(['bao', 'foo', NA])
result = values.str.replace('f.', 'ba', regex=False)
tm.assert_series_equal(result, exp)
# Cannot do a literal replace if given a callable repl or compiled
# pattern
callable_repl = lambda m: m.group(0).swapcase()
compiled_pat = re.compile('[a-z][A-Z]{2}')
pytest.raises(ValueError, values.str.replace, 'abc', callable_repl,
regex=False)
pytest.raises(ValueError, values.str.replace, compiled_pat, '',
regex=False)
def test_repeat(self):
values = Series(['a', 'b', NA, 'c', NA, 'd'])
result = values.str.repeat(3)
exp = Series(['aaa', 'bbb', NA, 'ccc', NA, 'ddd'])
tm.assert_series_equal(result, exp)
result = values.str.repeat([1, 2, 3, 4, 5, 6])
exp = Series(['a', 'bb', NA, 'cccc', NA, 'dddddd'])
tm.assert_series_equal(result, exp)
# mixed
mixed = Series(['a', NA, 'b', True, datetime.today(), 'foo', None, 1,
2.])
rs = Series(mixed).str.repeat(3)
xp = Series(['aaa', NA, 'bbb', NA, NA, 'foofoofoo', NA, NA, NA])
assert isinstance(rs, Series)
tm.assert_series_equal(rs, xp)
# unicode
values = Series([u('a'), u('b'), NA, u('c'), NA, u('d')])
result = values.str.repeat(3)
exp = Series([u('aaa'), u('bbb'), NA, u('ccc'), NA, u('ddd')])
tm.assert_series_equal(result, exp)
result = values.str.repeat([1, 2, 3, 4, 5, 6])
exp = Series([u('a'), u('bb'), NA, u('cccc'), NA, u('dddddd')])
tm.assert_series_equal(result, exp)
def test_match(self):
# New match behavior introduced in 0.13
values = Series(['fooBAD__barBAD', NA, 'foo'])
result = values.str.match('.*(BAD[_]+).*(BAD)')
exp = Series([True, NA, False])
tm.assert_series_equal(result, exp)
values = Series(['fooBAD__barBAD', NA, 'foo'])
result = values.str.match('.*BAD[_]+.*BAD')
exp = Series([True, NA, False])
tm.assert_series_equal(result, exp)
# test passing as_indexer still works but is ignored
values = Series(['fooBAD__barBAD', NA, 'foo'])
exp = Series([True, NA, False])
with tm.assert_produces_warning(FutureWarning):
result = values.str.match('.*BAD[_]+.*BAD', as_indexer=True)
tm.assert_series_equal(result, exp)
with tm.assert_produces_warning(FutureWarning):
result = values.str.match('.*BAD[_]+.*BAD', as_indexer=False)
tm.assert_series_equal(result, exp)
with tm.assert_produces_warning(FutureWarning):
result = values.str.match('.*(BAD[_]+).*(BAD)', as_indexer=True)
tm.assert_series_equal(result, exp)
pytest.raises(ValueError, values.str.match, '.*(BAD[_]+).*(BAD)',
as_indexer=False)
# mixed
mixed = Series(['aBAD_BAD', NA, 'BAD_b_BAD', True, datetime.today(),
'foo', None, 1, 2.])
rs = Series(mixed).str.match('.*(BAD[_]+).*(BAD)')
xp = Series([True, NA, True, NA, NA, False, NA, NA, NA])
assert isinstance(rs, Series)
tm.assert_series_equal(rs, xp)
# unicode
values = Series([u('fooBAD__barBAD'), NA, u('foo')])
result = values.str.match('.*(BAD[_]+).*(BAD)')
exp = Series([True, NA, False])
tm.assert_series_equal(result, exp)
# na GH #6609
res = Series(['a', 0, np.nan]).str.match('a', na=False)
exp = Series([True, False, False])
assert_series_equal(exp, res)
res = Series(['a', 0, np.nan]).str.match('a')
exp = Series([True, np.nan, np.nan])
assert_series_equal(exp, res)
def test_extract_expand_None(self):
values = Series(['fooBAD__barBAD', NA, 'foo'])
with tm.assert_raises_regex(ValueError,
'expand must be True or False'):
values.str.extract('.*(BAD[_]+).*(BAD)', expand=None)
def test_extract_expand_unspecified(self):
values = Series(['fooBAD__barBAD', NA, 'foo'])
result_unspecified = values.str.extract('.*(BAD[_]+).*')
assert isinstance(result_unspecified, DataFrame)
result_true = values.str.extract('.*(BAD[_]+).*', expand=True)
tm.assert_frame_equal(result_unspecified, result_true)
def test_extract_expand_False(self):
# Contains tests like those in test_match and some others.
values = Series(['fooBAD__barBAD', NA, 'foo'])
er = [NA, NA] # empty row
result = values.str.extract('.*(BAD[_]+).*(BAD)', expand=False)
exp = DataFrame([['BAD__', 'BAD'], er, er])
tm.assert_frame_equal(result, exp)
# mixed
mixed = Series(['aBAD_BAD', NA, 'BAD_b_BAD', True, datetime.today(),
'foo', None, 1, 2.])
rs = Series(mixed).str.extract('.*(BAD[_]+).*(BAD)', expand=False)
exp = DataFrame([['BAD_', 'BAD'], er, ['BAD_', 'BAD'], er, er, er, er,
er, er])
tm.assert_frame_equal(rs, exp)
# unicode
values = Series([u('fooBAD__barBAD'), NA, u('foo')])
result = values.str.extract('.*(BAD[_]+).*(BAD)', expand=False)
exp = DataFrame([[u('BAD__'), u('BAD')], er, er])
tm.assert_frame_equal(result, exp)
# GH9980
# Index only works with one regex group since
# multi-group would expand to a frame
idx = Index(['A1', 'A2', 'A3', 'A4', 'B5'])
with tm.assert_raises_regex(ValueError, "supported"):
idx.str.extract('([AB])([123])', expand=False)
# these should work for both Series and Index
for klass in [Series, Index]:
# no groups
s_or_idx = klass(['A1', 'B2', 'C3'])
f = lambda: s_or_idx.str.extract('[ABC][123]', expand=False)
pytest.raises(ValueError, f)
# only non-capturing groups
f = lambda: s_or_idx.str.extract('(?:[AB]).*', expand=False)
pytest.raises(ValueError, f)
# single group renames series/index properly
s_or_idx = klass(['A1', 'A2'])
result = s_or_idx.str.extract(r'(?P<uno>A)\d', expand=False)
assert result.name == 'uno'
exp = klass(['A', 'A'], name='uno')
if klass == Series:
tm.assert_series_equal(result, exp)
else:
tm.assert_index_equal(result, exp)
s = Series(['A1', 'B2', 'C3'])
# one group, no matches
result = s.str.extract('(_)', expand=False)
exp = Series([NA, NA, NA], dtype=object)
tm.assert_series_equal(result, exp)
# two groups, no matches
result = s.str.extract('(_)(_)', expand=False)
exp = DataFrame([[NA, NA], [NA, NA], [NA, NA]], dtype=object)
tm.assert_frame_equal(result, exp)
# one group, some matches
result = s.str.extract('([AB])[123]', expand=False)
exp = Series(['A', 'B', NA])
tm.assert_series_equal(result, exp)
# two groups, some matches
result = s.str.extract('([AB])([123])', expand=False)
exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]])
tm.assert_frame_equal(result, exp)
# one named group
result = s.str.extract('(?P<letter>[AB])', expand=False)
exp = Series(['A', 'B', NA], name='letter')
tm.assert_series_equal(result, exp)
# two named groups
result = s.str.extract('(?P<letter>[AB])(?P<number>[123])',
expand=False)
exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]],
columns=['letter', 'number'])
tm.assert_frame_equal(result, exp)
# mix named and unnamed groups
result = s.str.extract('([AB])(?P<number>[123])', expand=False)
exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]],
columns=[0, 'number'])
tm.assert_frame_equal(result, exp)
# one normal group, one non-capturing group
result = s.str.extract('([AB])(?:[123])', expand=False)
exp = Series(['A', 'B', NA])
tm.assert_series_equal(result, exp)
# two normal groups, one non-capturing group
result = Series(['A11', 'B22', 'C33']).str.extract(
'([AB])([123])(?:[123])', expand=False)
exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]])
tm.assert_frame_equal(result, exp)
# one optional group followed by one normal group
result = Series(['A1', 'B2', '3']).str.extract(
'(?P<letter>[AB])?(?P<number>[123])', expand=False)
exp = DataFrame([['A', '1'], ['B', '2'], [NA, '3']],
columns=['letter', 'number'])
tm.assert_frame_equal(result, exp)
# one normal group followed by one optional group
result = Series(['A1', 'B2', 'C']).str.extract(
'(?P<letter>[ABC])(?P<number>[123])?', expand=False)
exp = DataFrame([['A', '1'], ['B', '2'], ['C', NA]],
columns=['letter', 'number'])
tm.assert_frame_equal(result, exp)
# GH6348
# not passing index to the extractor
def check_index(index):
data = ['A1', 'B2', 'C']
index = index[:len(data)]
s = Series(data, index=index)
result = s.str.extract(r'(\d)', expand=False)
exp = Series(['1', '2', NA], index=index)
tm.assert_series_equal(result, exp)
result = Series(data, index=index).str.extract(
r'(?P<letter>\D)(?P<number>\d)?', expand=False)
e_list = [
['A', '1'],
['B', '2'],
['C', NA]
]
exp = DataFrame(e_list, columns=['letter', 'number'], index=index)
tm.assert_frame_equal(result, exp)
i_funs = [
tm.makeStringIndex, tm.makeUnicodeIndex, tm.makeIntIndex,
tm.makeDateIndex, tm.makePeriodIndex, tm.makeRangeIndex
]
for index in i_funs:
check_index(index())
# single_series_name_is_preserved.
s = Series(['a3', 'b3', 'c2'], name='bob')
r = s.str.extract(r'(?P<sue>[a-z])', expand=False)
e = Series(['a', 'b', 'c'], name='sue')
tm.assert_series_equal(r, e)
assert r.name == e.name
def test_extract_expand_True(self):
# Contains tests like those in test_match and some others.
values = Series(['fooBAD__barBAD', NA, 'foo'])
er = [NA, NA] # empty row
result = values.str.extract('.*(BAD[_]+).*(BAD)', expand=True)
exp = DataFrame([['BAD__', 'BAD'], er, er])
tm.assert_frame_equal(result, exp)
# mixed
mixed = Series(['aBAD_BAD', NA, 'BAD_b_BAD', True, datetime.today(),
'foo', None, 1, 2.])
rs = Series(mixed).str.extract('.*(BAD[_]+).*(BAD)', expand=True)
exp = DataFrame([['BAD_', 'BAD'], er, ['BAD_', 'BAD'], er, er,
er, er, er, er])
tm.assert_frame_equal(rs, exp)
# unicode
values = Series([u('fooBAD__barBAD'), NA, u('foo')])
result = values.str.extract('.*(BAD[_]+).*(BAD)', expand=True)
exp = DataFrame([[u('BAD__'), u('BAD')], er, er])
tm.assert_frame_equal(result, exp)
# these should work for both Series and Index
for klass in [Series, Index]:
# no groups
s_or_idx = klass(['A1', 'B2', 'C3'])
f = lambda: s_or_idx.str.extract('[ABC][123]', expand=True)
pytest.raises(ValueError, f)
# only non-capturing groups
f = lambda: s_or_idx.str.extract('(?:[AB]).*', expand=True)
pytest.raises(ValueError, f)
# single group renames series/index properly
s_or_idx = klass(['A1', 'A2'])
result_df = s_or_idx.str.extract(r'(?P<uno>A)\d', expand=True)
assert isinstance(result_df, DataFrame)
result_series = result_df['uno']
assert_series_equal(result_series, Series(['A', 'A'], name='uno'))
def test_extract_series(self):
# extract should give the same result whether or not the
# series has a name.
for series_name in None, "series_name":
s = Series(['A1', 'B2', 'C3'], name=series_name)
# one group, no matches
result = s.str.extract('(_)', expand=True)
exp = DataFrame([NA, NA, NA], dtype=object)
tm.assert_frame_equal(result, exp)
# two groups, no matches
result = s.str.extract('(_)(_)', expand=True)
exp = DataFrame([[NA, NA], [NA, NA], [NA, NA]], dtype=object)
tm.assert_frame_equal(result, exp)
# one group, some matches
result = s.str.extract('([AB])[123]', expand=True)
exp = DataFrame(['A', 'B', NA])
tm.assert_frame_equal(result, exp)
# two groups, some matches
result = s.str.extract('([AB])([123])', expand=True)
exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]])
tm.assert_frame_equal(result, exp)
# one named group
result = s.str.extract('(?P<letter>[AB])', expand=True)
exp = DataFrame({"letter": ['A', 'B', NA]})
tm.assert_frame_equal(result, exp)
# two named groups
result = s.str.extract(
'(?P<letter>[AB])(?P<number>[123])',
expand=True)
e_list = [
['A', '1'],
['B', '2'],
[NA, NA]
]
exp = DataFrame(e_list, columns=['letter', 'number'])
tm.assert_frame_equal(result, exp)
# mix named and unnamed groups
result = s.str.extract('([AB])(?P<number>[123])', expand=True)
exp = DataFrame(e_list, columns=[0, 'number'])
tm.assert_frame_equal(result, exp)
# one normal group, one non-capturing group
result = s.str.extract('([AB])(?:[123])', expand=True)
exp = DataFrame(['A', 'B', NA])
tm.assert_frame_equal(result, exp)
def test_extract_optional_groups(self):
# two normal groups, one non-capturing group
result = Series(['A11', 'B22', 'C33']).str.extract(
'([AB])([123])(?:[123])', expand=True)
exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]])
tm.assert_frame_equal(result, exp)
# one optional group followed by one normal group
result = Series(['A1', 'B2', '3']).str.extract(
'(?P<letter>[AB])?(?P<number>[123])', expand=True)
e_list = [
['A', '1'],
['B', '2'],
[NA, '3']
]
exp = DataFrame(e_list, columns=['letter', 'number'])
tm.assert_frame_equal(result, exp)
# one normal group followed by one optional group
result = Series(['A1', 'B2', 'C']).str.extract(
'(?P<letter>[ABC])(?P<number>[123])?', expand=True)
e_list = [
['A', '1'],
['B', '2'],
['C', NA]
]
exp = DataFrame(e_list, columns=['letter', 'number'])
tm.assert_frame_equal(result, exp)
# GH6348
# not passing index to the extractor
def check_index(index):
data = ['A1', 'B2', 'C']
index = index[:len(data)]
result = Series(data, index=index).str.extract(
r'(\d)', expand=True)
exp = DataFrame(['1', '2', NA], index=index)
tm.assert_frame_equal(result, exp)
result = Series(data, index=index).str.extract(
r'(?P<letter>\D)(?P<number>\d)?', expand=True)
e_list = [
['A', '1'],
['B', '2'],
['C', NA]
]
exp = DataFrame(e_list, columns=['letter', 'number'], index=index)
tm.assert_frame_equal(result, exp)
i_funs = [
tm.makeStringIndex, tm.makeUnicodeIndex, tm.makeIntIndex,
tm.makeDateIndex, tm.makePeriodIndex, tm.makeRangeIndex
]
for index in i_funs:
check_index(index())
def test_extract_single_group_returns_frame(self):
# GH11386 extract should always return DataFrame, even when
# there is only one group. Prior to v0.18.0, extract returned
# Series when there was only one group in the regex.
s = Series(['a3', 'b3', 'c2'], name='series_name')
r = s.str.extract(r'(?P<letter>[a-z])', expand=True)
e = DataFrame({"letter": ['a', 'b', 'c']})
tm.assert_frame_equal(r, e)
def test_extractall(self):
subject_list = [
'dave@google.com',
'tdhock5@gmail.com',
'maudelaperriere@gmail.com',
'rob@gmail.com some text steve@gmail.com',
'a@b.com some text c@d.com and e@f.com',
np.nan,
"",
]
expected_tuples = [
("dave", "google", "com"),
("tdhock5", "gmail", "com"),
("maudelaperriere", "gmail", "com"),
("rob", "gmail", "com"), ("steve", "gmail", "com"),
("a", "b", "com"), ("c", "d", "com"), ("e", "f", "com"),
]
named_pattern = r"""
(?P<user>[a-z0-9]+)
@
(?P<domain>[a-z]+)
\.
(?P<tld>[a-z]{2,4})
"""
expected_columns = ["user", "domain", "tld"]
S = Series(subject_list)
# extractall should return a DataFrame with one row for each
# match, indexed by the subject from which the match came.
expected_index = MultiIndex.from_tuples([
(0, 0),
(1, 0),
(2, 0),
(3, 0),
(3, 1),
(4, 0),
(4, 1),
(4, 2),
], names=(None, "match"))
expected_df = DataFrame(
expected_tuples, expected_index, expected_columns)
computed_df = S.str.extractall(named_pattern, re.VERBOSE)
tm.assert_frame_equal(computed_df, expected_df)
# The index of the input Series should be used to construct
# the index of the output DataFrame:
series_index = MultiIndex.from_tuples([
("single", "Dave"),
("single", "Toby"),
("single", "Maude"),
("multiple", "robAndSteve"),
("multiple", "abcdef"),
("none", "missing"),
("none", "empty"),
])
Si = Series(subject_list, series_index)
expected_index = MultiIndex.from_tuples([
("single", "Dave", 0),
("single", "Toby", 0),
("single", "Maude", 0),
("multiple", "robAndSteve", 0),
("multiple", "robAndSteve", 1),
("multiple", "abcdef", 0),
("multiple", "abcdef", 1),
("multiple", "abcdef", 2),
], names=(None, None, "match"))
expected_df = DataFrame(
expected_tuples, expected_index, expected_columns)
computed_df = Si.str.extractall(named_pattern, re.VERBOSE)
tm.assert_frame_equal(computed_df, expected_df)
# MultiIndexed subject with names.
Sn = Series(subject_list, series_index)
Sn.index.names = ("matches", "description")
expected_index.names = ("matches", "description", "match")
expected_df = DataFrame(
expected_tuples, expected_index, expected_columns)
computed_df = Sn.str.extractall(named_pattern, re.VERBOSE)
tm.assert_frame_equal(computed_df, expected_df)
# optional groups.
subject_list = ['', 'A1', '32']
named_pattern = '(?P<letter>[AB])?(?P<number>[123])'
computed_df = Series(subject_list).str.extractall(named_pattern)
expected_index = MultiIndex.from_tuples([
(1, 0),
(2, 0),
(2, 1),
], names=(None, "match"))
expected_df = DataFrame([
('A', '1'),
(NA, '3'),
(NA, '2'),
], expected_index, columns=['letter', 'number'])
tm.assert_frame_equal(computed_df, expected_df)
# only one of two groups has a name.
pattern = '([AB])?(?P<number>[123])'
computed_df = Series(subject_list).str.extractall(pattern)
expected_df = DataFrame([
('A', '1'),
(NA, '3'),
(NA, '2'),
], expected_index, columns=[0, 'number'])
tm.assert_frame_equal(computed_df, expected_df)
def test_extractall_single_group(self):
# extractall(one named group) returns DataFrame with one named
# column.
s = Series(['a3', 'b3', 'd4c2'], name='series_name')
r = s.str.extractall(r'(?P<letter>[a-z])')
i = MultiIndex.from_tuples([
(0, 0),
(1, 0),
(2, 0),
(2, 1),
], names=(None, "match"))
e = DataFrame({"letter": ['a', 'b', 'd', 'c']}, i)
tm.assert_frame_equal(r, e)
# extractall(one un-named group) returns DataFrame with one
# un-named column.
r = s.str.extractall(r'([a-z])')
e = DataFrame(['a', 'b', 'd', 'c'], i)
tm.assert_frame_equal(r, e)
def test_extractall_single_group_with_quantifier(self):
# extractall(one un-named group with quantifier) returns
# DataFrame with one un-named column (GH13382).
s = Series(['ab3', 'abc3', 'd4cd2'], name='series_name')
r = s.str.extractall(r'([a-z]+)')
i = MultiIndex.from_tuples([
(0, 0),
(1, 0),
(2, 0),
(2, 1),
], names=(None, "match"))
e = DataFrame(['ab', 'abc', 'd', 'cd'], i)
tm.assert_frame_equal(r, e)
@pytest.mark.parametrize('data, names', [
([], (None, )),
([], ('i1', )),
([], (None, 'i2')),
([], ('i1', 'i2')),
(['a3', 'b3', 'd4c2'], (None, )),
(['a3', 'b3', 'd4c2'], ('i1', 'i2')),
(['a3', 'b3', 'd4c2'], (None, 'i2')),
(['a3', 'b3', 'd4c2'], ('i1', 'i2')),
])
def test_extractall_no_matches(self, data, names):
# GH19075 extractall with no matches should return a valid MultiIndex
n = len(data)
if len(names) == 1:
i = Index(range(n), name=names[0])
else:
a = (tuple([i] * (n - 1)) for i in range(n))
i = MultiIndex.from_tuples(a, names=names)
s = Series(data, name='series_name', index=i, dtype='object')
ei = MultiIndex.from_tuples([], names=(names + ('match',)))
# one un-named group.
r = s.str.extractall('(z)')
e = DataFrame(columns=[0], index=ei)
tm.assert_frame_equal(r, e)
# two un-named groups.
r = s.str.extractall('(z)(z)')
e = DataFrame(columns=[0, 1], index=ei)
tm.assert_frame_equal(r, e)
# one named group.
r = s.str.extractall('(?P<first>z)')
e = DataFrame(columns=["first"], index=ei)
tm.assert_frame_equal(r, e)
# two named groups.
r = s.str.extractall('(?P<first>z)(?P<second>z)')
e = DataFrame(columns=["first", "second"], index=ei)
tm.assert_frame_equal(r, e)
# one named, one un-named.
r = s.str.extractall('(z)(?P<second>z)')
e = DataFrame(columns=[0, "second"], index=ei)
tm.assert_frame_equal(r, e)
def test_extractall_stringindex(self):
s = Series(["a1a2", "b1", "c1"], name='xxx')
res = s.str.extractall(r"[ab](?P<digit>\d)")
exp_idx = MultiIndex.from_tuples([(0, 0), (0, 1), (1, 0)],
names=[None, 'match'])
exp = DataFrame({'digit': ["1", "2", "1"]}, index=exp_idx)
tm.assert_frame_equal(res, exp)
# index should return the same result as the default index without name
# thus index.name doesn't affect to the result
for idx in [Index(["a1a2", "b1", "c1"]),
Index(["a1a2", "b1", "c1"], name='xxx')]:
res = idx.str.extractall(r"[ab](?P<digit>\d)")
tm.assert_frame_equal(res, exp)
s = Series(["a1a2", "b1", "c1"], name='s_name',
index=Index(["XX", "yy", "zz"], name='idx_name'))
res = s.str.extractall(r"[ab](?P<digit>\d)")
exp_idx = MultiIndex.from_tuples([("XX", 0), ("XX", 1), ("yy", 0)],
names=["idx_name", 'match'])
exp = DataFrame({'digit': ["1", "2", "1"]}, index=exp_idx)
tm.assert_frame_equal(res, exp)
def test_extractall_errors(self):
# Does not make sense to use extractall with a regex that has
# no capture groups. (it returns DataFrame with one column for
# each capture group)
s = Series(['a3', 'b3', 'd4c2'], name='series_name')
with tm.assert_raises_regex(ValueError, "no capture groups"):
s.str.extractall(r'[a-z]')
def test_extract_index_one_two_groups(self):
s = Series(['a3', 'b3', 'd4c2'], index=["A3", "B3", "D4"],
name='series_name')
r = s.index.str.extract(r'([A-Z])', expand=True)
e = DataFrame(['A', "B", "D"])
tm.assert_frame_equal(r, e)
# Prior to v0.18.0, index.str.extract(regex with one group)
# returned Index. With more than one group, extract raised an
# error (GH9980). Now extract always returns DataFrame.
r = s.index.str.extract(
r'(?P<letter>[A-Z])(?P<digit>[0-9])', expand=True)
e_list = [
("A", "3"),
("B", "3"),
("D", "4"),
]
e = DataFrame(e_list, columns=["letter", "digit"])
tm.assert_frame_equal(r, e)
def test_extractall_same_as_extract(self):
s = Series(['a3', 'b3', 'c2'], name='series_name')
pattern_two_noname = r'([a-z])([0-9])'
extract_two_noname = s.str.extract(pattern_two_noname, expand=True)
has_multi_index = s.str.extractall(pattern_two_noname)
no_multi_index = has_multi_index.xs(0, level="match")
tm.assert_frame_equal(extract_two_noname, no_multi_index)
pattern_two_named = r'(?P<letter>[a-z])(?P<digit>[0-9])'
extract_two_named = s.str.extract(pattern_two_named, expand=True)
has_multi_index = s.str.extractall(pattern_two_named)
no_multi_index = has_multi_index.xs(0, level="match")
tm.assert_frame_equal(extract_two_named, no_multi_index)
pattern_one_named = r'(?P<group_name>[a-z])'
extract_one_named = s.str.extract(pattern_one_named, expand=True)
has_multi_index = s.str.extractall(pattern_one_named)
no_multi_index = has_multi_index.xs(0, level="match")
tm.assert_frame_equal(extract_one_named, no_multi_index)
pattern_one_noname = r'([a-z])'
extract_one_noname = s.str.extract(pattern_one_noname, expand=True)
has_multi_index = s.str.extractall(pattern_one_noname)
no_multi_index = has_multi_index.xs(0, level="match")
tm.assert_frame_equal(extract_one_noname, no_multi_index)
def test_extractall_same_as_extract_subject_index(self):
# same as above tests, but s has an MultiIndex.
i = MultiIndex.from_tuples([
("A", "first"),
("B", "second"),
("C", "third"),
], names=("capital", "ordinal"))
s = Series(['a3', 'b3', 'c2'], i, name='series_name')
pattern_two_noname = r'([a-z])([0-9])'
extract_two_noname = s.str.extract(pattern_two_noname, expand=True)
has_match_index = s.str.extractall(pattern_two_noname)
no_match_index = has_match_index.xs(0, level="match")
tm.assert_frame_equal(extract_two_noname, no_match_index)
pattern_two_named = r'(?P<letter>[a-z])(?P<digit>[0-9])'
extract_two_named = s.str.extract(pattern_two_named, expand=True)
has_match_index = s.str.extractall(pattern_two_named)
no_match_index = has_match_index.xs(0, level="match")
tm.assert_frame_equal(extract_two_named, no_match_index)
pattern_one_named = r'(?P<group_name>[a-z])'
extract_one_named = s.str.extract(pattern_one_named, expand=True)
has_match_index = s.str.extractall(pattern_one_named)
no_match_index = has_match_index.xs(0, level="match")
tm.assert_frame_equal(extract_one_named, no_match_index)
pattern_one_noname = r'([a-z])'
extract_one_noname = s.str.extract(pattern_one_noname, expand=True)
has_match_index = s.str.extractall(pattern_one_noname)
no_match_index = has_match_index.xs(0, level="match")
tm.assert_frame_equal(extract_one_noname, no_match_index)
def test_empty_str_methods(self):
empty_str = empty = Series(dtype=object)
empty_int = Series(dtype=int)
empty_bool = Series(dtype=bool)
empty_bytes = Series(dtype=object)
# GH7241
# (extract) on empty series
tm.assert_series_equal(empty_str, empty.str.cat(empty, join='left'))
assert '' == empty.str.cat()
tm.assert_series_equal(empty_str, empty.str.title())
tm.assert_series_equal(empty_int, empty.str.count('a'))
tm.assert_series_equal(empty_bool, empty.str.contains('a'))
tm.assert_series_equal(empty_bool, empty.str.startswith('a'))
tm.assert_series_equal(empty_bool, empty.str.endswith('a'))
tm.assert_series_equal(empty_str, empty.str.lower())
tm.assert_series_equal(empty_str, empty.str.upper())
tm.assert_series_equal(empty_str, empty.str.replace('a', 'b'))
tm.assert_series_equal(empty_str, empty.str.repeat(3))
tm.assert_series_equal(empty_bool, empty.str.match('^a'))
tm.assert_frame_equal(
DataFrame(columns=[0], dtype=str),
empty.str.extract('()', expand=True))
tm.assert_frame_equal(
DataFrame(columns=[0, 1], dtype=str),
empty.str.extract('()()', expand=True))
tm.assert_series_equal(
empty_str,
empty.str.extract('()', expand=False))
tm.assert_frame_equal(
DataFrame(columns=[0, 1], dtype=str),
empty.str.extract('()()', expand=False))
tm.assert_frame_equal(DataFrame(dtype=str), empty.str.get_dummies())
tm.assert_series_equal(empty_str, empty_str.str.join(''))
tm.assert_series_equal(empty_int, empty.str.len())
tm.assert_series_equal(empty_str, empty_str.str.findall('a'))
tm.assert_series_equal(empty_int, empty.str.find('a'))
tm.assert_series_equal(empty_int, empty.str.rfind('a'))
tm.assert_series_equal(empty_str, empty.str.pad(42))
tm.assert_series_equal(empty_str, empty.str.center(42))
tm.assert_series_equal(empty_str, empty.str.split('a'))
tm.assert_series_equal(empty_str, empty.str.rsplit('a'))
tm.assert_series_equal(empty_str,
empty.str.partition('a', expand=False))
tm.assert_series_equal(empty_str,
empty.str.rpartition('a', expand=False))
tm.assert_series_equal(empty_str, empty.str.slice(stop=1))
tm.assert_series_equal(empty_str, empty.str.slice(step=1))
tm.assert_series_equal(empty_str, empty.str.strip())
tm.assert_series_equal(empty_str, empty.str.lstrip())
tm.assert_series_equal(empty_str, empty.str.rstrip())
tm.assert_series_equal(empty_str, empty.str.wrap(42))
tm.assert_series_equal(empty_str, empty.str.get(0))
tm.assert_series_equal(empty_str, empty_bytes.str.decode('ascii'))
tm.assert_series_equal(empty_bytes, empty.str.encode('ascii'))
tm.assert_series_equal(empty_str, empty.str.isalnum())
tm.assert_series_equal(empty_str, empty.str.isalpha())
tm.assert_series_equal(empty_str, empty.str.isdigit())
tm.assert_series_equal(empty_str, empty.str.isspace())
tm.assert_series_equal(empty_str, empty.str.islower())
tm.assert_series_equal(empty_str, empty.str.isupper())
tm.assert_series_equal(empty_str, empty.str.istitle())
tm.assert_series_equal(empty_str, empty.str.isnumeric())
tm.assert_series_equal(empty_str, empty.str.isdecimal())
tm.assert_series_equal(empty_str, empty.str.capitalize())
tm.assert_series_equal(empty_str, empty.str.swapcase())
tm.assert_series_equal(empty_str, empty.str.normalize('NFC'))
if compat.PY3:
table = str.maketrans('a', 'b')
else:
import string
table = string.maketrans('a', 'b')
tm.assert_series_equal(empty_str, empty.str.translate(table))
def test_empty_str_methods_to_frame(self):
empty = Series(dtype=str)
empty_df = DataFrame([])
tm.assert_frame_equal(empty_df, empty.str.partition('a'))
tm.assert_frame_equal(empty_df, empty.str.rpartition('a'))
def test_ismethods(self):
values = ['A', 'b', 'Xy', '4', '3A', '', 'TT', '55', '-', ' ']
str_s = Series(values)
alnum_e = [True, True, True, True, True, False, True, True, False,
False]
alpha_e = [True, True, True, False, False, False, True, False, False,
False]
digit_e = [False, False, False, True, False, False, False, True, False,
False]
# TODO: unused
num_e = [False, False, False, True, False, False, # noqa
False, True, False, False]
space_e = [False, False, False, False, False, False, False, False,
False, True]
lower_e = [False, True, False, False, False, False, False, False,
False, False]
upper_e = [True, False, False, False, True, False, True, False, False,
False]
title_e = [True, False, True, False, True, False, False, False, False,
False]
tm.assert_series_equal(str_s.str.isalnum(), Series(alnum_e))
tm.assert_series_equal(str_s.str.isalpha(), Series(alpha_e))
tm.assert_series_equal(str_s.str.isdigit(), Series(digit_e))
tm.assert_series_equal(str_s.str.isspace(), Series(space_e))
tm.assert_series_equal(str_s.str.islower(), Series(lower_e))
tm.assert_series_equal(str_s.str.isupper(), Series(upper_e))
tm.assert_series_equal(str_s.str.istitle(), Series(title_e))
assert str_s.str.isalnum().tolist() == [v.isalnum() for v in values]
assert str_s.str.isalpha().tolist() == [v.isalpha() for v in values]
assert str_s.str.isdigit().tolist() == [v.isdigit() for v in values]
assert str_s.str.isspace().tolist() == [v.isspace() for v in values]
assert str_s.str.islower().tolist() == [v.islower() for v in values]
assert str_s.str.isupper().tolist() == [v.isupper() for v in values]
assert str_s.str.istitle().tolist() == [v.istitle() for v in values]
def test_isnumeric(self):
# 0x00bc: ¼ VULGAR FRACTION ONE QUARTER
# 0x2605: ★ not number
# 0x1378: ፸ ETHIOPIC NUMBER SEVENTY
# 0xFF13: Em 3
values = ['A', '3', u'¼', u'', u'', u'', 'four']
s = Series(values)
numeric_e = [False, True, True, False, True, True, False]
decimal_e = [False, True, False, False, False, True, False]
tm.assert_series_equal(s.str.isnumeric(), Series(numeric_e))
tm.assert_series_equal(s.str.isdecimal(), Series(decimal_e))
unicodes = [u'A', u'3', u'¼', u'', u'', u'', u'four']
assert s.str.isnumeric().tolist() == [v.isnumeric() for v in unicodes]
assert s.str.isdecimal().tolist() == [v.isdecimal() for v in unicodes]
values = ['A', np.nan, u'¼', u'', np.nan, u'', 'four']
s = Series(values)
numeric_e = [False, np.nan, True, False, np.nan, True, False]
decimal_e = [False, np.nan, False, False, np.nan, True, False]
tm.assert_series_equal(s.str.isnumeric(), Series(numeric_e))
tm.assert_series_equal(s.str.isdecimal(), Series(decimal_e))
def test_get_dummies(self):
s = Series(['a|b', 'a|c', np.nan])
result = s.str.get_dummies('|')
expected = DataFrame([[1, 1, 0], [1, 0, 1], [0, 0, 0]],
columns=list('abc'))
tm.assert_frame_equal(result, expected)
s = Series(['a;b', 'a', 7])
result = s.str.get_dummies(';')
expected = DataFrame([[0, 1, 1], [0, 1, 0], [1, 0, 0]],
columns=list('7ab'))
tm.assert_frame_equal(result, expected)
# GH9980, GH8028
idx = Index(['a|b', 'a|c', 'b|c'])
result = idx.str.get_dummies('|')
expected = MultiIndex.from_tuples([(1, 1, 0), (1, 0, 1),
(0, 1, 1)], names=('a', 'b', 'c'))
tm.assert_index_equal(result, expected)
def test_get_dummies_with_name_dummy(self):
# GH 12180
# Dummies named 'name' should work as expected
s = Series(['a', 'b,name', 'b'])
result = s.str.get_dummies(',')
expected = DataFrame([[1, 0, 0], [0, 1, 1], [0, 1, 0]],
columns=['a', 'b', 'name'])
tm.assert_frame_equal(result, expected)
idx = Index(['a|b', 'name|c', 'b|name'])
result = idx.str.get_dummies('|')
expected = MultiIndex.from_tuples([(1, 1, 0, 0), (0, 0, 1, 1),
(0, 1, 0, 1)],
names=('a', 'b', 'c', 'name'))
tm.assert_index_equal(result, expected)
def test_join(self):
values = Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h'])
result = values.str.split('_').str.join('_')
tm.assert_series_equal(values, result)
# mixed
mixed = Series(['a_b', NA, 'asdf_cas_asdf', True, datetime.today(),
'foo', None, 1, 2.])
rs = Series(mixed).str.split('_').str.join('_')
xp = Series(['a_b', NA, 'asdf_cas_asdf', NA, NA, 'foo', NA, NA, NA])
assert isinstance(rs, Series)
tm.assert_almost_equal(rs, xp)
# unicode
values = Series([u('a_b_c'), u('c_d_e'), np.nan, u('f_g_h')])
result = values.str.split('_').str.join('_')
tm.assert_series_equal(values, result)
def test_len(self):
values = Series(['foo', 'fooo', 'fooooo', np.nan, 'fooooooo'])
result = values.str.len()
exp = values.map(lambda x: len(x) if notna(x) else NA)
tm.assert_series_equal(result, exp)
# mixed
mixed = Series(['a_b', NA, 'asdf_cas_asdf', True, datetime.today(),
'foo', None, 1, 2.])
rs = Series(mixed).str.len()
xp = Series([3, NA, 13, NA, NA, 3, NA, NA, NA])
assert isinstance(rs, Series)
tm.assert_almost_equal(rs, xp)
# unicode
values = Series([u('foo'), u('fooo'), u('fooooo'), np.nan, u(
'fooooooo')])
result = values.str.len()
exp = values.map(lambda x: len(x) if notna(x) else NA)
tm.assert_series_equal(result, exp)
def test_findall(self):
values = Series(['fooBAD__barBAD', NA, 'foo', 'BAD'])
result = values.str.findall('BAD[_]*')
exp = Series([['BAD__', 'BAD'], NA, [], ['BAD']])
tm.assert_almost_equal(result, exp)
# mixed
mixed = Series(['fooBAD__barBAD', NA, 'foo', True, datetime.today(),
'BAD', None, 1, 2.])
rs = Series(mixed).str.findall('BAD[_]*')
xp = Series([['BAD__', 'BAD'], NA, [], NA, NA, ['BAD'], NA, NA, NA])
assert isinstance(rs, Series)
tm.assert_almost_equal(rs, xp)
# unicode
values = Series([u('fooBAD__barBAD'), NA, u('foo'), u('BAD')])
result = values.str.findall('BAD[_]*')
exp = Series([[u('BAD__'), u('BAD')], NA, [], [u('BAD')]])
tm.assert_almost_equal(result, exp)
def test_find(self):
values = Series(['ABCDEFG', 'BCDEFEF', 'DEFGHIJEF', 'EFGHEF', 'XXXX'])
result = values.str.find('EF')
tm.assert_series_equal(result, Series([4, 3, 1, 0, -1]))
expected = np.array([v.find('EF') for v in values.values],
dtype=np.int64)
tm.assert_numpy_array_equal(result.values, expected)
result = values.str.rfind('EF')
tm.assert_series_equal(result, Series([4, 5, 7, 4, -1]))
expected = np.array([v.rfind('EF') for v in values.values],
dtype=np.int64)
tm.assert_numpy_array_equal(result.values, expected)
result = values.str.find('EF', 3)
tm.assert_series_equal(result, Series([4, 3, 7, 4, -1]))
expected = np.array([v.find('EF', 3) for v in values.values],
dtype=np.int64)
tm.assert_numpy_array_equal(result.values, expected)
result = values.str.rfind('EF', 3)
tm.assert_series_equal(result, Series([4, 5, 7, 4, -1]))
expected = np.array([v.rfind('EF', 3) for v in values.values],
dtype=np.int64)
tm.assert_numpy_array_equal(result.values, expected)
result = values.str.find('EF', 3, 6)
tm.assert_series_equal(result, Series([4, 3, -1, 4, -1]))
expected = np.array([v.find('EF', 3, 6) for v in values.values],
dtype=np.int64)
tm.assert_numpy_array_equal(result.values, expected)
result = values.str.rfind('EF', 3, 6)
tm.assert_series_equal(result, Series([4, 3, -1, 4, -1]))
expected = np.array([v.rfind('EF', 3, 6) for v in values.values],
dtype=np.int64)
tm.assert_numpy_array_equal(result.values, expected)
with tm.assert_raises_regex(TypeError,
"expected a string object, not int"):
result = values.str.find(0)
with tm.assert_raises_regex(TypeError,
"expected a string object, not int"):
result = values.str.rfind(0)
def test_find_nan(self):
values = Series(['ABCDEFG', np.nan, 'DEFGHIJEF', np.nan, 'XXXX'])
result = values.str.find('EF')
tm.assert_series_equal(result, Series([4, np.nan, 1, np.nan, -1]))
result = values.str.rfind('EF')
tm.assert_series_equal(result, Series([4, np.nan, 7, np.nan, -1]))
result = values.str.find('EF', 3)
tm.assert_series_equal(result, Series([4, np.nan, 7, np.nan, -1]))
result = values.str.rfind('EF', 3)
tm.assert_series_equal(result, Series([4, np.nan, 7, np.nan, -1]))
result = values.str.find('EF', 3, 6)
tm.assert_series_equal(result, Series([4, np.nan, -1, np.nan, -1]))
result = values.str.rfind('EF', 3, 6)
tm.assert_series_equal(result, Series([4, np.nan, -1, np.nan, -1]))
def test_index(self):
def _check(result, expected):
if isinstance(result, Series):
tm.assert_series_equal(result, expected)
else:
tm.assert_index_equal(result, expected)
for klass in [Series, Index]:
s = klass(['ABCDEFG', 'BCDEFEF', 'DEFGHIJEF', 'EFGHEF'])
result = s.str.index('EF')
_check(result, klass([4, 3, 1, 0]))
expected = np.array([v.index('EF') for v in s.values],
dtype=np.int64)
tm.assert_numpy_array_equal(result.values, expected)
result = s.str.rindex('EF')
_check(result, klass([4, 5, 7, 4]))
expected = np.array([v.rindex('EF') for v in s.values],
dtype=np.int64)
tm.assert_numpy_array_equal(result.values, expected)
result = s.str.index('EF', 3)
_check(result, klass([4, 3, 7, 4]))
expected = np.array([v.index('EF', 3) for v in s.values],
dtype=np.int64)
tm.assert_numpy_array_equal(result.values, expected)
result = s.str.rindex('EF', 3)
_check(result, klass([4, 5, 7, 4]))
expected = np.array([v.rindex('EF', 3) for v in s.values],
dtype=np.int64)
tm.assert_numpy_array_equal(result.values, expected)
result = s.str.index('E', 4, 8)
_check(result, klass([4, 5, 7, 4]))
expected = np.array([v.index('E', 4, 8) for v in s.values],
dtype=np.int64)
tm.assert_numpy_array_equal(result.values, expected)
result = s.str.rindex('E', 0, 5)
_check(result, klass([4, 3, 1, 4]))
expected = np.array([v.rindex('E', 0, 5) for v in s.values],
dtype=np.int64)
tm.assert_numpy_array_equal(result.values, expected)
with tm.assert_raises_regex(ValueError,
"substring not found"):
result = s.str.index('DE')
with tm.assert_raises_regex(TypeError,
"expected a string "
"object, not int"):
result = s.str.index(0)
# test with nan
s = Series(['abcb', 'ab', 'bcbe', np.nan])
result = s.str.index('b')
tm.assert_series_equal(result, Series([1, 1, 0, np.nan]))
result = s.str.rindex('b')
tm.assert_series_equal(result, Series([3, 1, 2, np.nan]))
def test_pad(self):
values = Series(['a', 'b', NA, 'c', NA, 'eeeeee'])
result = values.str.pad(5, side='left')
exp = Series([' a', ' b', NA, ' c', NA, 'eeeeee'])
tm.assert_almost_equal(result, exp)
result = values.str.pad(5, side='right')
exp = Series(['a ', 'b ', NA, 'c ', NA, 'eeeeee'])
tm.assert_almost_equal(result, exp)
result = values.str.pad(5, side='both')
exp = Series([' a ', ' b ', NA, ' c ', NA, 'eeeeee'])
tm.assert_almost_equal(result, exp)
# mixed
mixed = Series(['a', NA, 'b', True, datetime.today(), 'ee', None, 1, 2.
])
rs = Series(mixed).str.pad(5, side='left')
xp = Series([' a', NA, ' b', NA, NA, ' ee', NA, NA, NA])
assert isinstance(rs, Series)
tm.assert_almost_equal(rs, xp)
mixed = Series(['a', NA, 'b', True, datetime.today(), 'ee', None, 1, 2.
])
rs = Series(mixed).str.pad(5, side='right')
xp = Series(['a ', NA, 'b ', NA, NA, 'ee ', NA, NA, NA])
assert isinstance(rs, Series)
tm.assert_almost_equal(rs, xp)
mixed = Series(['a', NA, 'b', True, datetime.today(), 'ee', None, 1, 2.
])
rs = Series(mixed).str.pad(5, side='both')
xp = Series([' a ', NA, ' b ', NA, NA, ' ee ', NA, NA, NA])
assert isinstance(rs, Series)
tm.assert_almost_equal(rs, xp)
# unicode
values = Series([u('a'), u('b'), NA, u('c'), NA, u('eeeeee')])
result = values.str.pad(5, side='left')
exp = Series([u(' a'), u(' b'), NA, u(' c'), NA, u('eeeeee')])
tm.assert_almost_equal(result, exp)
result = values.str.pad(5, side='right')
exp = Series([u('a '), u('b '), NA, u('c '), NA, u('eeeeee')])
tm.assert_almost_equal(result, exp)
result = values.str.pad(5, side='both')
exp = Series([u(' a '), u(' b '), NA, u(' c '), NA, u('eeeeee')])
tm.assert_almost_equal(result, exp)
def test_pad_fillchar(self):
values = Series(['a', 'b', NA, 'c', NA, 'eeeeee'])
result = values.str.pad(5, side='left', fillchar='X')
exp = Series(['XXXXa', 'XXXXb', NA, 'XXXXc', NA, 'eeeeee'])
tm.assert_almost_equal(result, exp)
result = values.str.pad(5, side='right', fillchar='X')
exp = Series(['aXXXX', 'bXXXX', NA, 'cXXXX', NA, 'eeeeee'])
tm.assert_almost_equal(result, exp)
result = values.str.pad(5, side='both', fillchar='X')
exp = Series(['XXaXX', 'XXbXX', NA, 'XXcXX', NA, 'eeeeee'])
tm.assert_almost_equal(result, exp)
with tm.assert_raises_regex(TypeError,
"fillchar must be a "
"character, not str"):
result = values.str.pad(5, fillchar='XY')
with tm.assert_raises_regex(TypeError,
"fillchar must be a "
"character, not int"):
result = values.str.pad(5, fillchar=5)
def test_pad_width(self):
# GH 13598
s = Series(['1', '22', 'a', 'bb'])
for f in ['center', 'ljust', 'rjust', 'zfill', 'pad']:
with tm.assert_raises_regex(TypeError,
"width must be of "
"integer type, not*"):
getattr(s.str, f)('f')
def test_translate(self):
def _check(result, expected):
if isinstance(result, Series):
tm.assert_series_equal(result, expected)
else:
tm.assert_index_equal(result, expected)
for klass in [Series, Index]:
s = klass(['abcdefg', 'abcc', 'cdddfg', 'cdefggg'])
if not compat.PY3:
import string
table = string.maketrans('abc', 'cde')
else:
table = str.maketrans('abc', 'cde')
result = s.str.translate(table)
expected = klass(['cdedefg', 'cdee', 'edddfg', 'edefggg'])
_check(result, expected)
# use of deletechars is python 2 only
if not compat.PY3:
result = s.str.translate(table, deletechars='fg')
expected = klass(['cdede', 'cdee', 'eddd', 'ede'])
_check(result, expected)
result = s.str.translate(None, deletechars='fg')
expected = klass(['abcde', 'abcc', 'cddd', 'cde'])
_check(result, expected)
else:
with tm.assert_raises_regex(
ValueError, "deletechars is not a valid argument"):
result = s.str.translate(table, deletechars='fg')
# Series with non-string values
s = Series(['a', 'b', 'c', 1.2])
expected = Series(['c', 'd', 'e', np.nan])
result = s.str.translate(table)
tm.assert_series_equal(result, expected)
def test_center_ljust_rjust(self):
values = Series(['a', 'b', NA, 'c', NA, 'eeeeee'])
result = values.str.center(5)
exp = Series([' a ', ' b ', NA, ' c ', NA, 'eeeeee'])
tm.assert_almost_equal(result, exp)
result = values.str.ljust(5)
exp = Series(['a ', 'b ', NA, 'c ', NA, 'eeeeee'])
tm.assert_almost_equal(result, exp)
result = values.str.rjust(5)
exp = Series([' a', ' b', NA, ' c', NA, 'eeeeee'])
tm.assert_almost_equal(result, exp)
# mixed
mixed = Series(['a', NA, 'b', True, datetime.today(), 'c', 'eee', None,
1, 2.])
rs = Series(mixed).str.center(5)
xp = Series([' a ', NA, ' b ', NA, NA, ' c ', ' eee ', NA, NA, NA
])
assert isinstance(rs, Series)
tm.assert_almost_equal(rs, xp)
rs = Series(mixed).str.ljust(5)
xp = Series(['a ', NA, 'b ', NA, NA, 'c ', 'eee ', NA, NA, NA
])
assert isinstance(rs, Series)
tm.assert_almost_equal(rs, xp)
rs = Series(mixed).str.rjust(5)
xp = Series([' a', NA, ' b', NA, NA, ' c', ' eee', NA, NA, NA
])
assert isinstance(rs, Series)
tm.assert_almost_equal(rs, xp)
# unicode
values = Series([u('a'), u('b'), NA, u('c'), NA, u('eeeeee')])
result = values.str.center(5)
exp = Series([u(' a '), u(' b '), NA, u(' c '), NA, u('eeeeee')])
tm.assert_almost_equal(result, exp)
result = values.str.ljust(5)
exp = Series([u('a '), u('b '), NA, u('c '), NA, u('eeeeee')])
tm.assert_almost_equal(result, exp)
result = values.str.rjust(5)
exp = Series([u(' a'), u(' b'), NA, u(' c'), NA, u('eeeeee')])
tm.assert_almost_equal(result, exp)
def test_center_ljust_rjust_fillchar(self):
values = Series(['a', 'bb', 'cccc', 'ddddd', 'eeeeee'])
result = values.str.center(5, fillchar='X')
expected = Series(['XXaXX', 'XXbbX', 'Xcccc', 'ddddd', 'eeeeee'])
tm.assert_series_equal(result, expected)
expected = np.array([v.center(5, 'X') for v in values.values],
dtype=np.object_)
tm.assert_numpy_array_equal(result.values, expected)
result = values.str.ljust(5, fillchar='X')
expected = Series(['aXXXX', 'bbXXX', 'ccccX', 'ddddd', 'eeeeee'])
tm.assert_series_equal(result, expected)
expected = np.array([v.ljust(5, 'X') for v in values.values],
dtype=np.object_)
tm.assert_numpy_array_equal(result.values, expected)
result = values.str.rjust(5, fillchar='X')
expected = Series(['XXXXa', 'XXXbb', 'Xcccc', 'ddddd', 'eeeeee'])
tm.assert_series_equal(result, expected)
expected = np.array([v.rjust(5, 'X') for v in values.values],
dtype=np.object_)
tm.assert_numpy_array_equal(result.values, expected)
# If fillchar is not a charatter, normal str raises TypeError
# 'aaa'.ljust(5, 'XY')
# TypeError: must be char, not str
with tm.assert_raises_regex(TypeError,
"fillchar must be a "
"character, not str"):
result = values.str.center(5, fillchar='XY')
with tm.assert_raises_regex(TypeError,
"fillchar must be a "
"character, not str"):
result = values.str.ljust(5, fillchar='XY')
with tm.assert_raises_regex(TypeError,
"fillchar must be a "
"character, not str"):
result = values.str.rjust(5, fillchar='XY')
with tm.assert_raises_regex(TypeError,
"fillchar must be a "
"character, not int"):
result = values.str.center(5, fillchar=1)
with tm.assert_raises_regex(TypeError,
"fillchar must be a "
"character, not int"):
result = values.str.ljust(5, fillchar=1)
with tm.assert_raises_regex(TypeError,
"fillchar must be a "
"character, not int"):
result = values.str.rjust(5, fillchar=1)
def test_zfill(self):
values = Series(['1', '22', 'aaa', '333', '45678'])
result = values.str.zfill(5)
expected = Series(['00001', '00022', '00aaa', '00333', '45678'])
tm.assert_series_equal(result, expected)
expected = np.array([v.zfill(5) for v in values.values],
dtype=np.object_)
tm.assert_numpy_array_equal(result.values, expected)
result = values.str.zfill(3)
expected = Series(['001', '022', 'aaa', '333', '45678'])
tm.assert_series_equal(result, expected)
expected = np.array([v.zfill(3) for v in values.values],
dtype=np.object_)
tm.assert_numpy_array_equal(result.values, expected)
values = Series(['1', np.nan, 'aaa', np.nan, '45678'])
result = values.str.zfill(5)
expected = Series(['00001', np.nan, '00aaa', np.nan, '45678'])
tm.assert_series_equal(result, expected)
def test_split(self):
values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])
result = values.str.split('_')
exp = Series([['a', 'b', 'c'], ['c', 'd', 'e'], NA, ['f', 'g', 'h']])
tm.assert_series_equal(result, exp)
# more than one char
values = Series(['a__b__c', 'c__d__e', NA, 'f__g__h'])
result = values.str.split('__')
tm.assert_series_equal(result, exp)
result = values.str.split('__', expand=False)
tm.assert_series_equal(result, exp)
# mixed
mixed = Series(['a_b_c', NA, 'd_e_f', True, datetime.today(), None, 1,
2.])
result = mixed.str.split('_')
exp = Series([['a', 'b', 'c'], NA, ['d', 'e', 'f'], NA, NA, NA, NA, NA
])
assert isinstance(result, Series)
tm.assert_almost_equal(result, exp)
result = mixed.str.split('_', expand=False)
assert isinstance(result, Series)
tm.assert_almost_equal(result, exp)
# unicode
values = Series([u('a_b_c'), u('c_d_e'), NA, u('f_g_h')])
result = values.str.split('_')
exp = Series([[u('a'), u('b'), u('c')], [u('c'), u('d'), u('e')], NA,
[u('f'), u('g'), u('h')]])
tm.assert_series_equal(result, exp)
result = values.str.split('_', expand=False)
tm.assert_series_equal(result, exp)
# regex split
values = Series([u('a,b_c'), u('c_d,e'), NA, u('f,g,h')])
result = values.str.split('[,_]')
exp = Series([[u('a'), u('b'), u('c')], [u('c'), u('d'), u('e')], NA,
[u('f'), u('g'), u('h')]])
tm.assert_series_equal(result, exp)
def test_rsplit(self):
values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])
result = values.str.rsplit('_')
exp = Series([['a', 'b', 'c'], ['c', 'd', 'e'], NA, ['f', 'g', 'h']])
tm.assert_series_equal(result, exp)
# more than one char
values = Series(['a__b__c', 'c__d__e', NA, 'f__g__h'])
result = values.str.rsplit('__')
tm.assert_series_equal(result, exp)
result = values.str.rsplit('__', expand=False)
tm.assert_series_equal(result, exp)
# mixed
mixed = Series(['a_b_c', NA, 'd_e_f', True, datetime.today(), None, 1,
2.])
result = mixed.str.rsplit('_')
exp = Series([['a', 'b', 'c'], NA, ['d', 'e', 'f'], NA, NA, NA, NA, NA
])
assert isinstance(result, Series)
tm.assert_almost_equal(result, exp)
result = mixed.str.rsplit('_', expand=False)
assert isinstance(result, Series)
tm.assert_almost_equal(result, exp)
# unicode
values = Series([u('a_b_c'), u('c_d_e'), NA, u('f_g_h')])
result = values.str.rsplit('_')
exp = Series([[u('a'), u('b'), u('c')], [u('c'), u('d'), u('e')], NA,
[u('f'), u('g'), u('h')]])
tm.assert_series_equal(result, exp)
result = values.str.rsplit('_', expand=False)
tm.assert_series_equal(result, exp)
# regex split is not supported by rsplit
values = Series([u('a,b_c'), u('c_d,e'), NA, u('f,g,h')])
result = values.str.rsplit('[,_]')
exp = Series([[u('a,b_c')], [u('c_d,e')], NA, [u('f,g,h')]])
tm.assert_series_equal(result, exp)
# setting max number of splits, make sure it's from reverse
values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])
result = values.str.rsplit('_', n=1)
exp = Series([['a_b', 'c'], ['c_d', 'e'], NA, ['f_g', 'h']])
tm.assert_series_equal(result, exp)
def test_split_blank_string(self):
# expand blank split GH 20067
values = Series([''], name='test')
result = values.str.split(expand=True)
exp = DataFrame([[]])
tm.assert_frame_equal(result, exp)
values = Series(['a b c', 'a b', '', ' '], name='test')
result = values.str.split(expand=True)
exp = DataFrame([['a', 'b', 'c'], ['a', 'b', np.nan],
[np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan]])
tm.assert_frame_equal(result, exp)
def test_split_noargs(self):
# #1859
s = Series(['Wes McKinney', 'Travis Oliphant'])
result = s.str.split()
expected = ['Travis', 'Oliphant']
assert result[1] == expected
result = s.str.rsplit()
assert result[1] == expected
def test_split_maxsplit(self):
# re.split 0, str.split -1
s = Series(['bd asdf jfg', 'kjasdflqw asdfnfk'])
result = s.str.split(n=-1)
xp = s.str.split()
tm.assert_series_equal(result, xp)
result = s.str.split(n=0)
tm.assert_series_equal(result, xp)
xp = s.str.split('asdf')
result = s.str.split('asdf', n=0)
tm.assert_series_equal(result, xp)
result = s.str.split('asdf', n=-1)
tm.assert_series_equal(result, xp)
def test_split_no_pat_with_nonzero_n(self):
s = Series(['split once', 'split once too!'])
result = s.str.split(n=1)
expected = Series({0: ['split', 'once'], 1: ['split', 'once too!']})
tm.assert_series_equal(expected, result, check_index_type=False)
def test_split_to_dataframe(self):
s = Series(['nosplit', 'alsonosplit'])
result = s.str.split('_', expand=True)
exp = DataFrame({0: Series(['nosplit', 'alsonosplit'])})
tm.assert_frame_equal(result, exp)
s = Series(['some_equal_splits', 'with_no_nans'])
result = s.str.split('_', expand=True)
exp = DataFrame({0: ['some', 'with'],
1: ['equal', 'no'],
2: ['splits', 'nans']})
tm.assert_frame_equal(result, exp)
s = Series(['some_unequal_splits', 'one_of_these_things_is_not'])
result = s.str.split('_', expand=True)
exp = DataFrame({0: ['some', 'one'],
1: ['unequal', 'of'],
2: ['splits', 'these'],
3: [NA, 'things'],
4: [NA, 'is'],
5: [NA, 'not']})
tm.assert_frame_equal(result, exp)
s = Series(['some_splits', 'with_index'], index=['preserve', 'me'])
result = s.str.split('_', expand=True)
exp = DataFrame({0: ['some', 'with'], 1: ['splits', 'index']},
index=['preserve', 'me'])
tm.assert_frame_equal(result, exp)
with tm.assert_raises_regex(ValueError, "expand must be"):
s.str.split('_', expand="not_a_boolean")
def test_split_to_multiindex_expand(self):
idx = Index(['nosplit', 'alsonosplit'])
result = idx.str.split('_', expand=True)
exp = idx
tm.assert_index_equal(result, exp)
assert result.nlevels == 1
idx = Index(['some_equal_splits', 'with_no_nans'])
result = idx.str.split('_', expand=True)
exp = MultiIndex.from_tuples([('some', 'equal', 'splits'), (
'with', 'no', 'nans')])
tm.assert_index_equal(result, exp)
assert result.nlevels == 3
idx = Index(['some_unequal_splits', 'one_of_these_things_is_not'])
result = idx.str.split('_', expand=True)
exp = MultiIndex.from_tuples([('some', 'unequal', 'splits', NA, NA, NA
), ('one', 'of', 'these', 'things',
'is', 'not')])
tm.assert_index_equal(result, exp)
assert result.nlevels == 6
with tm.assert_raises_regex(ValueError, "expand must be"):
idx.str.split('_', expand="not_a_boolean")
def test_rsplit_to_dataframe_expand(self):
s = Series(['nosplit', 'alsonosplit'])
result = s.str.rsplit('_', expand=True)
exp = DataFrame({0: Series(['nosplit', 'alsonosplit'])})
tm.assert_frame_equal(result, exp)
s = Series(['some_equal_splits', 'with_no_nans'])
result = s.str.rsplit('_', expand=True)
exp = DataFrame({0: ['some', 'with'],
1: ['equal', 'no'],
2: ['splits', 'nans']})
tm.assert_frame_equal(result, exp)
result = s.str.rsplit('_', expand=True, n=2)
exp = DataFrame({0: ['some', 'with'],
1: ['equal', 'no'],
2: ['splits', 'nans']})
tm.assert_frame_equal(result, exp)
result = s.str.rsplit('_', expand=True, n=1)
exp = DataFrame({0: ['some_equal', 'with_no'], 1: ['splits', 'nans']})
tm.assert_frame_equal(result, exp)
s = Series(['some_splits', 'with_index'], index=['preserve', 'me'])
result = s.str.rsplit('_', expand=True)
exp = DataFrame({0: ['some', 'with'], 1: ['splits', 'index']},
index=['preserve', 'me'])
tm.assert_frame_equal(result, exp)
def test_rsplit_to_multiindex_expand(self):
idx = Index(['nosplit', 'alsonosplit'])
result = idx.str.rsplit('_', expand=True)
exp = idx
tm.assert_index_equal(result, exp)
assert result.nlevels == 1
idx = Index(['some_equal_splits', 'with_no_nans'])
result = idx.str.rsplit('_', expand=True)
exp = MultiIndex.from_tuples([('some', 'equal', 'splits'), (
'with', 'no', 'nans')])
tm.assert_index_equal(result, exp)
assert result.nlevels == 3
idx = Index(['some_equal_splits', 'with_no_nans'])
result = idx.str.rsplit('_', expand=True, n=1)
exp = MultiIndex.from_tuples([('some_equal', 'splits'),
('with_no', 'nans')])
tm.assert_index_equal(result, exp)
assert result.nlevels == 2
def test_split_nan_expand(self):
# gh-18450
s = Series(["foo,bar,baz", NA])
result = s.str.split(",", expand=True)
exp = DataFrame([["foo", "bar", "baz"], [NA, NA, NA]])
tm.assert_frame_equal(result, exp)
# check that these are actually np.nan and not None
# TODO see GH 18463
# tm.assert_frame_equal does not differentiate
assert all(np.isnan(x) for x in result.iloc[1])
def test_split_with_name(self):
# GH 12617
# should preserve name
s = Series(['a,b', 'c,d'], name='xxx')
res = s.str.split(',')
exp = Series([['a', 'b'], ['c', 'd']], name='xxx')
tm.assert_series_equal(res, exp)
res = s.str.split(',', expand=True)
exp = DataFrame([['a', 'b'], ['c', 'd']])
tm.assert_frame_equal(res, exp)
idx = Index(['a,b', 'c,d'], name='xxx')
res = idx.str.split(',')
exp = Index([['a', 'b'], ['c', 'd']], name='xxx')
assert res.nlevels == 1
tm.assert_index_equal(res, exp)
res = idx.str.split(',', expand=True)
exp = MultiIndex.from_tuples([('a', 'b'), ('c', 'd')])
assert res.nlevels == 2
tm.assert_index_equal(res, exp)
def test_partition_series(self):
values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])
result = values.str.partition('_', expand=False)
exp = Series([('a', '_', 'b_c'), ('c', '_', 'd_e'), NA,
('f', '_', 'g_h')])
tm.assert_series_equal(result, exp)
result = values.str.rpartition('_', expand=False)
exp = Series([('a_b', '_', 'c'), ('c_d', '_', 'e'), NA,
('f_g', '_', 'h')])
tm.assert_series_equal(result, exp)
# more than one char
values = Series(['a__b__c', 'c__d__e', NA, 'f__g__h'])
result = values.str.partition('__', expand=False)
exp = Series([('a', '__', 'b__c'), ('c', '__', 'd__e'), NA,
('f', '__', 'g__h')])
tm.assert_series_equal(result, exp)
result = values.str.rpartition('__', expand=False)
exp = Series([('a__b', '__', 'c'), ('c__d', '__', 'e'), NA,
('f__g', '__', 'h')])
tm.assert_series_equal(result, exp)
# None
values = Series(['a b c', 'c d e', NA, 'f g h'])
result = values.str.partition(expand=False)
exp = Series([('a', ' ', 'b c'), ('c', ' ', 'd e'), NA,
('f', ' ', 'g h')])
tm.assert_series_equal(result, exp)
result = values.str.rpartition(expand=False)
exp = Series([('a b', ' ', 'c'), ('c d', ' ', 'e'), NA,
('f g', ' ', 'h')])
tm.assert_series_equal(result, exp)
# Not splited
values = Series(['abc', 'cde', NA, 'fgh'])
result = values.str.partition('_', expand=False)
exp = Series([('abc', '', ''), ('cde', '', ''), NA, ('fgh', '', '')])
tm.assert_series_equal(result, exp)
result = values.str.rpartition('_', expand=False)
exp = Series([('', '', 'abc'), ('', '', 'cde'), NA, ('', '', 'fgh')])
tm.assert_series_equal(result, exp)
# unicode
values = Series([u'a_b_c', u'c_d_e', NA, u'f_g_h'])
result = values.str.partition('_', expand=False)
exp = Series([(u'a', u'_', u'b_c'), (u'c', u'_', u'd_e'),
NA, (u'f', u'_', u'g_h')])
tm.assert_series_equal(result, exp)
result = values.str.rpartition('_', expand=False)
exp = Series([(u'a_b', u'_', u'c'), (u'c_d', u'_', u'e'),
NA, (u'f_g', u'_', u'h')])
tm.assert_series_equal(result, exp)
# compare to standard lib
values = Series(['A_B_C', 'B_C_D', 'E_F_G', 'EFGHEF'])
result = values.str.partition('_', expand=False).tolist()
assert result == [v.partition('_') for v in values]
result = values.str.rpartition('_', expand=False).tolist()
assert result == [v.rpartition('_') for v in values]
def test_partition_index(self):
values = Index(['a_b_c', 'c_d_e', 'f_g_h'])
result = values.str.partition('_', expand=False)
exp = Index(np.array([('a', '_', 'b_c'), ('c', '_', 'd_e'), ('f', '_',
'g_h')]))
tm.assert_index_equal(result, exp)
assert result.nlevels == 1
result = values.str.rpartition('_', expand=False)
exp = Index(np.array([('a_b', '_', 'c'), ('c_d', '_', 'e'), (
'f_g', '_', 'h')]))
tm.assert_index_equal(result, exp)
assert result.nlevels == 1
result = values.str.partition('_')
exp = Index([('a', '_', 'b_c'), ('c', '_', 'd_e'), ('f', '_', 'g_h')])
tm.assert_index_equal(result, exp)
assert isinstance(result, MultiIndex)
assert result.nlevels == 3
result = values.str.rpartition('_')
exp = Index([('a_b', '_', 'c'), ('c_d', '_', 'e'), ('f_g', '_', 'h')])
tm.assert_index_equal(result, exp)
assert isinstance(result, MultiIndex)
assert result.nlevels == 3
def test_partition_to_dataframe(self):
values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])
result = values.str.partition('_')
exp = DataFrame({0: ['a', 'c', np.nan, 'f'],
1: ['_', '_', np.nan, '_'],
2: ['b_c', 'd_e', np.nan, 'g_h']})
tm.assert_frame_equal(result, exp)
result = values.str.rpartition('_')
exp = DataFrame({0: ['a_b', 'c_d', np.nan, 'f_g'],
1: ['_', '_', np.nan, '_'],
2: ['c', 'e', np.nan, 'h']})
tm.assert_frame_equal(result, exp)
values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])
result = values.str.partition('_', expand=True)
exp = DataFrame({0: ['a', 'c', np.nan, 'f'],
1: ['_', '_', np.nan, '_'],
2: ['b_c', 'd_e', np.nan, 'g_h']})
tm.assert_frame_equal(result, exp)
result = values.str.rpartition('_', expand=True)
exp = DataFrame({0: ['a_b', 'c_d', np.nan, 'f_g'],
1: ['_', '_', np.nan, '_'],
2: ['c', 'e', np.nan, 'h']})
tm.assert_frame_equal(result, exp)
def test_partition_with_name(self):
# GH 12617
s = Series(['a,b', 'c,d'], name='xxx')
res = s.str.partition(',')
exp = DataFrame({0: ['a', 'c'], 1: [',', ','], 2: ['b', 'd']})
tm.assert_frame_equal(res, exp)
# should preserve name
res = s.str.partition(',', expand=False)
exp = Series([('a', ',', 'b'), ('c', ',', 'd')], name='xxx')
tm.assert_series_equal(res, exp)
idx = Index(['a,b', 'c,d'], name='xxx')
res = idx.str.partition(',')
exp = MultiIndex.from_tuples([('a', ',', 'b'), ('c', ',', 'd')])
assert res.nlevels == 3
tm.assert_index_equal(res, exp)
# should preserve name
res = idx.str.partition(',', expand=False)
exp = Index(np.array([('a', ',', 'b'), ('c', ',', 'd')]), name='xxx')
assert res.nlevels == 1
tm.assert_index_equal(res, exp)
def test_pipe_failures(self):
# #2119
s = Series(['A|B|C'])
result = s.str.split('|')
exp = Series([['A', 'B', 'C']])
tm.assert_series_equal(result, exp)
result = s.str.replace('|', ' ')
exp = Series(['A B C'])
tm.assert_series_equal(result, exp)
def test_slice(self):
values = Series(['aafootwo', 'aabartwo', NA, 'aabazqux'])
result = values.str.slice(2, 5)
exp = Series(['foo', 'bar', NA, 'baz'])
tm.assert_series_equal(result, exp)
for start, stop, step in [(0, 3, -1), (None, None, -1), (3, 10, 2),
(3, 0, -1)]:
try:
result = values.str.slice(start, stop, step)
expected = Series([s[start:stop:step] if not isna(s) else NA
for s in values])
tm.assert_series_equal(result, expected)
except:
print('failed on %s:%s:%s' % (start, stop, step))
raise
# mixed
mixed = Series(['aafootwo', NA, 'aabartwo', True, datetime.today(),
None, 1, 2.])
rs = Series(mixed).str.slice(2, 5)
xp = Series(['foo', NA, 'bar', NA, NA, NA, NA, NA])
assert isinstance(rs, Series)
tm.assert_almost_equal(rs, xp)
rs = Series(mixed).str.slice(2, 5, -1)
xp = Series(['oof', NA, 'rab', NA, NA, NA, NA, NA])
# unicode
values = Series([u('aafootwo'), u('aabartwo'), NA, u('aabazqux')])
result = values.str.slice(2, 5)
exp = Series([u('foo'), u('bar'), NA, u('baz')])
tm.assert_series_equal(result, exp)
result = values.str.slice(0, -1, 2)
exp = Series([u('afow'), u('abrw'), NA, u('abzu')])
tm.assert_series_equal(result, exp)
def test_slice_replace(self):
values = Series(['short', 'a bit longer', 'evenlongerthanthat', '', NA
])
exp = Series(['shrt', 'a it longer', 'evnlongerthanthat', '', NA])
result = values.str.slice_replace(2, 3)
tm.assert_series_equal(result, exp)
exp = Series(['shzrt', 'a zit longer', 'evznlongerthanthat', 'z', NA])
result = values.str.slice_replace(2, 3, 'z')
tm.assert_series_equal(result, exp)
exp = Series(['shzort', 'a zbit longer', 'evzenlongerthanthat', 'z', NA
])
result = values.str.slice_replace(2, 2, 'z')
tm.assert_series_equal(result, exp)
exp = Series(['shzort', 'a zbit longer', 'evzenlongerthanthat', 'z', NA
])
result = values.str.slice_replace(2, 1, 'z')
tm.assert_series_equal(result, exp)
exp = Series(['shorz', 'a bit longez', 'evenlongerthanthaz', 'z', NA])
result = values.str.slice_replace(-1, None, 'z')
tm.assert_series_equal(result, exp)
exp = Series(['zrt', 'zer', 'zat', 'z', NA])
result = values.str.slice_replace(None, -2, 'z')
tm.assert_series_equal(result, exp)
exp = Series(['shortz', 'a bit znger', 'evenlozerthanthat', 'z', NA])
result = values.str.slice_replace(6, 8, 'z')
tm.assert_series_equal(result, exp)
exp = Series(['zrt', 'a zit longer', 'evenlongzerthanthat', 'z', NA])
result = values.str.slice_replace(-10, 3, 'z')
tm.assert_series_equal(result, exp)
def test_strip_lstrip_rstrip(self):
values = Series([' aa ', ' bb \n', NA, 'cc '])
result = values.str.strip()
exp = Series(['aa', 'bb', NA, 'cc'])
tm.assert_series_equal(result, exp)
result = values.str.lstrip()
exp = Series(['aa ', 'bb \n', NA, 'cc '])
tm.assert_series_equal(result, exp)
result = values.str.rstrip()
exp = Series([' aa', ' bb', NA, 'cc'])
tm.assert_series_equal(result, exp)
def test_strip_lstrip_rstrip_mixed(self):
# mixed
mixed = Series([' aa ', NA, ' bb \t\n', True, datetime.today(), None,
1, 2.])
rs = Series(mixed).str.strip()
xp = Series(['aa', NA, 'bb', NA, NA, NA, NA, NA])
assert isinstance(rs, Series)
tm.assert_almost_equal(rs, xp)
rs = Series(mixed).str.lstrip()
xp = Series(['aa ', NA, 'bb \t\n', NA, NA, NA, NA, NA])
assert isinstance(rs, Series)
tm.assert_almost_equal(rs, xp)
rs = Series(mixed).str.rstrip()
xp = Series([' aa', NA, ' bb', NA, NA, NA, NA, NA])
assert isinstance(rs, Series)
tm.assert_almost_equal(rs, xp)
def test_strip_lstrip_rstrip_unicode(self):
# unicode
values = Series([u(' aa '), u(' bb \n'), NA, u('cc ')])
result = values.str.strip()
exp = Series([u('aa'), u('bb'), NA, u('cc')])
tm.assert_series_equal(result, exp)
result = values.str.lstrip()
exp = Series([u('aa '), u('bb \n'), NA, u('cc ')])
tm.assert_series_equal(result, exp)
result = values.str.rstrip()
exp = Series([u(' aa'), u(' bb'), NA, u('cc')])
tm.assert_series_equal(result, exp)
def test_strip_lstrip_rstrip_args(self):
values = Series(['xxABCxx', 'xx BNSD', 'LDFJH xx'])
rs = values.str.strip('x')
xp = Series(['ABC', ' BNSD', 'LDFJH '])
assert_series_equal(rs, xp)
rs = values.str.lstrip('x')
xp = Series(['ABCxx', ' BNSD', 'LDFJH xx'])
assert_series_equal(rs, xp)
rs = values.str.rstrip('x')
xp = Series(['xxABC', 'xx BNSD', 'LDFJH '])
assert_series_equal(rs, xp)
def test_strip_lstrip_rstrip_args_unicode(self):
values = Series([u('xxABCxx'), u('xx BNSD'), u('LDFJH xx')])
rs = values.str.strip(u('x'))
xp = Series(['ABC', ' BNSD', 'LDFJH '])
assert_series_equal(rs, xp)
rs = values.str.lstrip(u('x'))
xp = Series(['ABCxx', ' BNSD', 'LDFJH xx'])
assert_series_equal(rs, xp)
rs = values.str.rstrip(u('x'))
xp = Series(['xxABC', 'xx BNSD', 'LDFJH '])
assert_series_equal(rs, xp)
def test_wrap(self):
# test values are: two words less than width, two words equal to width,
# two words greater than width, one word less than width, one word
# equal to width, one word greater than width, multiple tokens with
# trailing whitespace equal to width
values = Series([u('hello world'), u('hello world!'), u(
'hello world!!'), u('abcdefabcde'), u('abcdefabcdef'), u(
'abcdefabcdefa'), u('ab ab ab ab '), u('ab ab ab ab a'), u(
'\t')])
# expected values
xp = Series([u('hello world'), u('hello world!'), u('hello\nworld!!'),
u('abcdefabcde'), u('abcdefabcdef'), u('abcdefabcdef\na'),
u('ab ab ab ab'), u('ab ab ab ab\na'), u('')])
rs = values.str.wrap(12, break_long_words=True)
assert_series_equal(rs, xp)
# test with pre and post whitespace (non-unicode), NaN, and non-ascii
# Unicode
values = Series([' pre ', np.nan, u('\xac\u20ac\U00008000 abadcafe')
])
xp = Series([' pre', NA, u('\xac\u20ac\U00008000 ab\nadcafe')])
rs = values.str.wrap(6)
assert_series_equal(rs, xp)
def test_get(self):
values = Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h'])
result = values.str.split('_').str.get(1)
expected = Series(['b', 'd', np.nan, 'g'])
tm.assert_series_equal(result, expected)
# mixed
mixed = Series(['a_b_c', NA, 'c_d_e', True, datetime.today(), None, 1,
2.])
rs = Series(mixed).str.split('_').str.get(1)
xp = Series(['b', NA, 'd', NA, NA, NA, NA, NA])
assert isinstance(rs, Series)
tm.assert_almost_equal(rs, xp)
# unicode
values = Series([u('a_b_c'), u('c_d_e'), np.nan, u('f_g_h')])
result = values.str.split('_').str.get(1)
expected = Series([u('b'), u('d'), np.nan, u('g')])
tm.assert_series_equal(result, expected)
# bounds testing
values = Series(['1_2_3_4_5', '6_7_8_9_10', '11_12'])
# positive index
result = values.str.split('_').str.get(2)
expected = Series(['3', '8', np.nan])
tm.assert_series_equal(result, expected)
# negative index
result = values.str.split('_').str.get(-3)
expected = Series(['3', '8', np.nan])
tm.assert_series_equal(result, expected)
def test_get_complex(self):
# GH 20671, getting value not in dict raising `KeyError`
values = Series([(1, 2, 3), [1, 2, 3], {1, 2, 3},
{1: 'a', 2: 'b', 3: 'c'}])
result = values.str.get(1)
expected = Series([2, 2, np.nan, 'a'])
tm.assert_series_equal(result, expected)
result = values.str.get(-1)
expected = Series([3, 3, np.nan, np.nan])
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize('to_type', [tuple, list, np.array])
def test_get_complex_nested(self, to_type):
values = Series([to_type([to_type([1, 2])])])
result = values.str.get(0)
expected = Series([to_type([1, 2])])
tm.assert_series_equal(result, expected)
result = values.str.get(1)
expected = Series([np.nan])
tm.assert_series_equal(result, expected)
def test_more_contains(self):
# PR #1179
s = Series(['A', 'B', 'C', 'Aaba', 'Baca', '', NA,
'CABA', 'dog', 'cat'])
result = s.str.contains('a')
expected = Series([False, False, False, True, True, False, np.nan,
False, False, True])
assert_series_equal(result, expected)
result = s.str.contains('a', case=False)
expected = Series([True, False, False, True, True, False, np.nan, True,
False, True])
assert_series_equal(result, expected)
result = s.str.contains('Aa')
expected = Series([False, False, False, True, False, False, np.nan,
False, False, False])
assert_series_equal(result, expected)
result = s.str.contains('ba')
expected = Series([False, False, False, True, False, False, np.nan,
False, False, False])
assert_series_equal(result, expected)
result = s.str.contains('ba', case=False)
expected = Series([False, False, False, True, True, False, np.nan,
True, False, False])
assert_series_equal(result, expected)
def test_contains_nan(self):
# PR #14171
s = Series([np.nan, np.nan, np.nan], dtype=np.object_)
result = s.str.contains('foo', na=False)
expected = Series([False, False, False], dtype=np.bool_)
assert_series_equal(result, expected)
result = s.str.contains('foo', na=True)
expected = Series([True, True, True], dtype=np.bool_)
assert_series_equal(result, expected)
result = s.str.contains('foo', na="foo")
expected = Series(["foo", "foo", "foo"], dtype=np.object_)
assert_series_equal(result, expected)
result = s.str.contains('foo')
expected = Series([np.nan, np.nan, np.nan], dtype=np.object_)
assert_series_equal(result, expected)
def test_more_replace(self):
# PR #1179
s = Series(['A', 'B', 'C', 'Aaba', 'Baca', '', NA, 'CABA',
'dog', 'cat'])
result = s.str.replace('A', 'YYY')
expected = Series(['YYY', 'B', 'C', 'YYYaba', 'Baca', '', NA,
'CYYYBYYY', 'dog', 'cat'])
assert_series_equal(result, expected)
result = s.str.replace('A', 'YYY', case=False)
expected = Series(['YYY', 'B', 'C', 'YYYYYYbYYY', 'BYYYcYYY', '', NA,
'CYYYBYYY', 'dog', 'cYYYt'])
assert_series_equal(result, expected)
result = s.str.replace('^.a|dog', 'XX-XX ', case=False)
expected = Series(['A', 'B', 'C', 'XX-XX ba', 'XX-XX ca', '', NA,
'XX-XX BA', 'XX-XX ', 'XX-XX t'])
assert_series_equal(result, expected)
def test_string_slice_get_syntax(self):
s = Series(['YYY', 'B', 'C', 'YYYYYYbYYY', 'BYYYcYYY', NA, 'CYYYBYYY',
'dog', 'cYYYt'])
result = s.str[0]
expected = s.str.get(0)
assert_series_equal(result, expected)
result = s.str[:3]
expected = s.str.slice(stop=3)
assert_series_equal(result, expected)
result = s.str[2::-1]
expected = s.str.slice(start=2, step=-1)
assert_series_equal(result, expected)
def test_string_slice_out_of_bounds(self):
s = Series([(1, 2), (1, ), (3, 4, 5)])
result = s.str[1]
expected = Series([2, np.nan, 4])
assert_series_equal(result, expected)
s = Series(['foo', 'b', 'ba'])
result = s.str[1]
expected = Series(['o', np.nan, 'a'])
assert_series_equal(result, expected)
def test_match_findall_flags(self):
data = {'Dave': 'dave@google.com',
'Steve': 'steve@gmail.com',
'Rob': 'rob@gmail.com',
'Wes': np.nan}
data = Series(data)
pat = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
result = data.str.extract(pat, flags=re.IGNORECASE, expand=True)
assert result.iloc[0].tolist() == ['dave', 'google', 'com']
result = data.str.match(pat, flags=re.IGNORECASE)
assert result[0]
result = data.str.findall(pat, flags=re.IGNORECASE)
assert result[0][0] == ('dave', 'google', 'com')
result = data.str.count(pat, flags=re.IGNORECASE)
assert result[0] == 1
with tm.assert_produces_warning(UserWarning):
result = data.str.contains(pat, flags=re.IGNORECASE)
assert result[0]
def test_encode_decode(self):
base = Series([u('a'), u('b'), u('a\xe4')])
series = base.str.encode('utf-8')
f = lambda x: x.decode('utf-8')
result = series.str.decode('utf-8')
exp = series.map(f)
tm.assert_series_equal(result, exp)
def test_encode_decode_errors(self):
encodeBase = Series([u('a'), u('b'), u('a\x9d')])
pytest.raises(UnicodeEncodeError, encodeBase.str.encode, 'cp1252')
f = lambda x: x.encode('cp1252', 'ignore')
result = encodeBase.str.encode('cp1252', 'ignore')
exp = encodeBase.map(f)
tm.assert_series_equal(result, exp)
decodeBase = Series([b'a', b'b', b'a\x9d'])
pytest.raises(UnicodeDecodeError, decodeBase.str.decode, 'cp1252')
f = lambda x: x.decode('cp1252', 'ignore')
result = decodeBase.str.decode('cp1252', 'ignore')
exp = decodeBase.map(f)
tm.assert_series_equal(result, exp)
def test_normalize(self):
values = ['ABC', u'', u'', np.nan, u'アイエ']
s = Series(values, index=['a', 'b', 'c', 'd', 'e'])
normed = [u'ABC', u'ABC', u'123', np.nan, u'アイエ']
expected = Series(normed, index=['a', 'b', 'c', 'd', 'e'])
result = s.str.normalize('NFKC')
tm.assert_series_equal(result, expected)
expected = Series([u'ABC', u'', u'', np.nan, u'アイエ'],
index=['a', 'b', 'c', 'd', 'e'])
result = s.str.normalize('NFC')
tm.assert_series_equal(result, expected)
with tm.assert_raises_regex(ValueError,
"invalid normalization form"):
s.str.normalize('xxx')
s = Index([u'', u'', u'アイエ'])
expected = Index([u'ABC', u'123', u'アイエ'])
result = s.str.normalize('NFKC')
tm.assert_index_equal(result, expected)
def test_index_str_accessor_visibility(self):
from pandas.core.strings import StringMethods
if not compat.PY3:
cases = [(['a', 'b'], 'string'), (['a', u('b')], 'mixed'),
([u('a'), u('b')], 'unicode'),
(['a', 'b', 1], 'mixed-integer'),
(['a', 'b', 1.3], 'mixed'),
(['a', 'b', 1.3, 1], 'mixed-integer'),
(['aa', datetime(2011, 1, 1)], 'mixed')]
else:
cases = [(['a', 'b'], 'string'), (['a', u('b')], 'string'),
([u('a'), u('b')], 'string'),
(['a', 'b', 1], 'mixed-integer'),
(['a', 'b', 1.3], 'mixed'),
(['a', 'b', 1.3, 1], 'mixed-integer'),
(['aa', datetime(2011, 1, 1)], 'mixed')]
for values, tp in cases:
idx = Index(values)
assert isinstance(Series(values).str, StringMethods)
assert isinstance(idx.str, StringMethods)
assert idx.inferred_type == tp
for values, tp in cases:
idx = Index(values)
assert isinstance(Series(values).str, StringMethods)
assert isinstance(idx.str, StringMethods)
assert idx.inferred_type == tp
cases = [([1, np.nan], 'floating'),
([datetime(2011, 1, 1)], 'datetime64'),
([timedelta(1)], 'timedelta64')]
for values, tp in cases:
idx = Index(values)
message = 'Can only use .str accessor with string values'
with tm.assert_raises_regex(AttributeError, message):
Series(values).str
with tm.assert_raises_regex(AttributeError, message):
idx.str
assert idx.inferred_type == tp
# MultiIndex has mixed dtype, but not allow to use accessor
idx = MultiIndex.from_tuples([('a', 'b'), ('a', 'b')])
assert idx.inferred_type == 'mixed'
message = 'Can only use .str accessor with Index, not MultiIndex'
with tm.assert_raises_regex(AttributeError, message):
idx.str
def test_str_accessor_no_new_attributes(self):
# https://github.com/pandas-dev/pandas/issues/10673
s = Series(list('aabbcde'))
with tm.assert_raises_regex(AttributeError,
"You cannot add any new attribute"):
s.str.xlabel = "a"
def test_method_on_bytes(self):
lhs = Series(np.array(list('abc'), 'S1').astype(object))
rhs = Series(np.array(list('def'), 'S1').astype(object))
if compat.PY3:
pytest.raises(TypeError, lhs.str.cat, rhs, join='left')
else:
result = lhs.str.cat(rhs, join='left')
expected = Series(np.array(
['ad', 'be', 'cf'], 'S2').astype(object))
tm.assert_series_equal(result, expected)