laywerrobot/lib/python3.6/site-packages/pandas/plotting/_misc.py
2020-08-27 21:55:39 +02:00

645 lines
21 KiB
Python

# being a bit too dynamic
# pylint: disable=E1101
from __future__ import division
import numpy as np
from pandas.util._decorators import deprecate_kwarg
from pandas.core.dtypes.missing import notna
from pandas.compat import range, lrange, lmap, zip
from pandas.io.formats.printing import pprint_thing
from pandas.plotting._style import _get_standard_colors
from pandas.plotting._tools import _subplots, _set_ticks_props
def scatter_matrix(frame, alpha=0.5, figsize=None, ax=None, grid=False,
                   diagonal='hist', marker='.', density_kwds=None,
                   hist_kwds=None, range_padding=0.05, **kwds):
    """
    Draw a matrix of scatter plots.

    Each numeric column of `frame` is plotted against every other numeric
    column. The cells on the diagonal show the distribution of a single
    column, either as a histogram or as a kernel density estimate.

    Parameters
    ----------
    frame : DataFrame
        Only the columns returned by ``frame._get_numeric_data()`` are used.
    alpha : float, optional
        Amount of transparency applied to the scatter points.
    figsize : (float, float), optional
        A tuple (width, height) in inches.
    ax : Matplotlib axis object, optional
        Forwarded to the subplot-creation helper.
    grid : bool, optional
        Accepted by the signature; this function's body does not
        reference it.
    diagonal : {'hist', 'kde'}
        What to draw on the diagonal: 'hist' for a histogram, 'kde'
        (or 'density') for a Kernel Density Estimation. Any other value
        leaves the diagonal cells empty.
    marker : str, optional
        Matplotlib marker type, default '.'.
    density_kwds : dict, optional
        Keyword arguments passed to the density line plot.
    hist_kwds : dict, optional
        Keyword arguments passed to the histogram.
    range_padding : float, optional
        Relative extension of the axis range in x and y with respect to
        (x_max - x_min) or (y_max - y_min), default 0.05.
    kwds : other plotting keyword arguments
        Passed to the scatter call. ``edgecolors`` defaults to 'none'.

    Returns
    -------
    axes
        The grid of axes produced by the subplot helper, indexed as
        ``axes[row, column]``.

    Examples
    --------
    >>> df = DataFrame(np.random.randn(1000, 4), columns=['A','B','C','D'])
    >>> scatter_matrix(df, alpha=0.2)
    """
    numeric = frame._get_numeric_data()
    columns = numeric.columns
    n = columns.size
    fig, axes = _subplots(naxes=n * n, figsize=figsize, ax=ax,
                          squeeze=False)

    # no gaps between subplots
    fig.subplots_adjust(wspace=0, hspace=0)

    valid = notna(numeric)
    marker = _get_marker_compat(marker)

    hist_kwds = hist_kwds or {}
    density_kwds = density_kwds or {}

    # GH 14855
    kwds.setdefault('edgecolors', 'none')

    # Padded (min, max) of the non-missing values of every column; shared
    # by all cells in the matching row/column so the axes line up.
    limits = []
    for name in columns:
        present = numeric[name].values[valid[name].values]
        low, high = np.min(present), np.max(present)
        pad = (high - low) * range_padding / 2.
        limits.append((low - pad, high + pad))

    for i, row_name in enumerate(columns):
        for j, col_name in enumerate(columns):
            cell = axes[i, j]

            if i != j:
                # Off-diagonal: scatter the rows where both columns are
                # present.
                both = (valid[row_name] & valid[col_name]).values
                cell.scatter(numeric[col_name][both],
                             numeric[row_name][both],
                             marker=marker, alpha=alpha, **kwds)
                cell.set_xlim(limits[j])
                cell.set_ylim(limits[i])
            else:
                # Diagonal: distribution of a single column.
                present = numeric[row_name].values[valid[row_name].values]
                if diagonal == 'hist':
                    cell.hist(present, **hist_kwds)
                elif diagonal in ('kde', 'density'):
                    from scipy.stats import gaussian_kde
                    estimator = gaussian_kde(present)
                    support = np.linspace(present.min(), present.max(),
                                          1000)
                    cell.plot(support, estimator.evaluate(support),
                              **density_kwds)
                cell.set_xlim(limits[i])

            cell.set_xlabel(col_name)
            cell.set_ylabel(row_name)

            # Only the left column and the bottom row keep visible axes.
            if j != 0:
                cell.yaxis.set_visible(False)
            if i != n - 1:
                cell.xaxis.set_visible(False)

    if len(columns) > 1:
        # The top-left cell is a distribution plot, so its own y scale is
        # not the data scale. Take the tick locations of its right-hand
        # neighbour, map them into the top-left cell's y range and label
        # them with the original data values.
        first_limits = limits[0]
        ticks = axes[0][1].yaxis.get_majorticklocs()
        ticks = ticks[(first_limits[0] <= ticks) &
                      (ticks <= first_limits[1])]
        scaled = ((ticks - first_limits[0]) /
                  (first_limits[1] - first_limits[0]))

        corner_ylim = axes[0][0].get_ylim()
        scaled = scaled * (corner_ylim[1] - corner_ylim[0]) + corner_ylim[0]
        axes[0][0].yaxis.set_ticks(scaled)

        if np.all(ticks == ticks.astype(int)):
            # if all ticks are int
            ticks = ticks.astype(int)
        axes[0][0].yaxis.set_ticklabels(ticks)

    _set_ticks_props(axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0)

    return axes
def _get_marker_compat(marker):
    """
    Return a marker that the installed matplotlib can draw.

    'o' is returned instead of `marker` when `marker` is '.' and the
    matplotlib version string compares below '1.1.0', or when `marker` is
    not a key of ``matplotlib.lines.lineMarkers``. Otherwise `marker` is
    returned unchanged.
    """
    import matplotlib as mpl
    import matplotlib.lines as mlines

    # The version test comes first so that `marker == '.'` is only
    # evaluated when the version check succeeds.
    dot_unsupported = mpl.__version__ < '1.1.0' and marker == '.'
    if dot_unsupported or marker not in mlines.lineMarkers:
        return 'o'
    return marker
def radviz(frame, class_column, ax=None, color=None, colormap=None, **kwds):
    """
    Plot a multidimensional dataset in 2D.

    Each Series in the DataFrame is represented as an evenly distributed
    anchor on the unit circle, in column order. Each data point is rendered
    inside the circle at the average of the anchor positions weighted by
    the point's (min-max normalized) value on each Series.

    RadViz allows projecting an N-dimensional data set into a 2D space
    where the influence of each dimension can be interpreted as a balance
    between the influence of all dimensions.

    More info available at the `original article
    <http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.135.889>`_
    describing RadViz.

    Parameters
    ----------
    frame : `DataFrame`
        Pandas object holding the data.
    class_column : str
        Column name containing the name of the data point category.
    ax : :class:`matplotlib.axes.Axes`, optional
        A plot instance to which to add the information. When omitted the
        current axes are used and their x and y limits are set to [-1, 1].
    color : list[str] or tuple[str], optional
        Assign a color to each category. Example: ['blue', 'green'].
    colormap : str or :class:`matplotlib.colors.Colormap`, default None
        Colormap to select colors from. If string, load colormap with that
        name from matplotlib.
    kwds : optional
        Options to pass to matplotlib scatter plotting method.

    Returns
    -------
    axes : :class:`matplotlib.axes.Axes`

    See Also
    --------
    pandas.plotting.andrews_curves : Plot clustering visualization

    Examples
    --------
    .. plot::
        :context: close-figs

        >>> df = pd.DataFrame({
        ...         'SepalLength': [6.5, 7.7, 5.1, 5.8, 7.6, 5.0, 5.4, 4.6,
        ...                         6.7, 4.6],
        ...         'SepalWidth': [3.0, 3.8, 3.8, 2.7, 3.0, 2.3, 3.0, 3.2,
        ...                        3.3, 3.6],
        ...         'PetalLength': [5.5, 6.7, 1.9, 5.1, 6.6, 3.3, 4.5, 1.4,
        ...                         5.7, 1.0],
        ...         'PetalWidth': [1.8, 2.2, 0.4, 1.9, 2.1, 1.0, 1.5, 0.2,
        ...                        2.1, 0.2],
        ...         'Category': ['virginica', 'virginica', 'setosa',
        ...                      'virginica', 'virginica', 'versicolor',
        ...                      'versicolor', 'setosa', 'virginica',
        ...                      'setosa']
        ...     })
        >>> rad_viz = pd.plotting.radviz(df, 'Category')
    """
    import matplotlib.pyplot as plt
    import matplotlib.patches as patches

    def normalize(series):
        # Min-max scale a column to [0, 1]. A constant column makes the
        # denominator zero.
        a = min(series)
        b = max(series)
        return (series - a) / (b - a)

    n = len(frame)
    classes = frame[class_column].drop_duplicates()
    class_col = frame[class_column]
    df = frame.drop(class_column, axis=1).apply(normalize)

    if ax is None:
        # Set the limits explicitly rather than through gca() keyword
        # arguments, which newer matplotlib releases no longer accept.
        ax = plt.gca()
        ax.set_xlim(-1, 1)
        ax.set_ylim(-1, 1)

    to_plot = {}
    colors = _get_standard_colors(num_colors=len(classes), colormap=colormap,
                                  color_type='random', color=color)

    # Per class: [x coordinates, y coordinates] of the projected points.
    for kls in classes:
        to_plot[kls] = [[], []]

    # One anchor per data column, evenly spaced around the unit circle.
    m = len(frame.columns) - 1
    s = np.array([(np.cos(t), np.sin(t))
                  for t in [2.0 * np.pi * (i / float(m))
                            for i in range(m)]])

    for i in range(n):
        row = df.iloc[i].values
        # Duplicate the weights into two columns so they can scale both the
        # x and the y coordinate of every anchor.
        row_ = np.repeat(np.expand_dims(row, axis=1), 2, axis=1)
        y = (s * row_).sum(axis=0) / row.sum()
        kls = class_col.iat[i]
        to_plot[kls][0].append(y[0])
        to_plot[kls][1].append(y[1])

    for i, kls in enumerate(classes):
        ax.scatter(to_plot[kls][0], to_plot[kls][1], color=colors[i],
                   label=pprint_thing(kls), **kwds)
    ax.legend()

    ax.add_patch(patches.Circle((0.0, 0.0), radius=1.0, facecolor='none'))

    for xy, name in zip(s, df.columns):
        ax.add_patch(patches.Circle(xy, radius=0.025, facecolor='gray'))

        # Offset every label away from the circle centre and align the
        # text so that it grows outwards from its anchor.
        if xy[0] < 0.0:
            x_text, ha = xy[0] - 0.025, 'right'
        else:
            x_text, ha = xy[0] + 0.025, 'left'
        if xy[1] < 0.0:
            y_text, va = xy[1] - 0.025, 'top'
        else:
            y_text, va = xy[1] + 0.025, 'bottom'
        ax.text(x_text, y_text, name, ha=ha, va=va, size='small')

    ax.axis('equal')
    return ax
@deprecate_kwarg(old_arg_name='data', new_arg_name='frame')
def andrews_curves(frame, class_column, ax=None, samples=200, color=None,
                   colormap=None, **kwds):
    """
    Generates a matplotlib plot of Andrews curves, for visualising clusters of
    multivariate data.

    Andrews curves have the functional form:

    f(t) = x_1/sqrt(2) + x_2 sin(t) + x_3 cos(t) +
           x_4 sin(2t) + x_5 cos(2t) + ...

    Where x coefficients correspond to the values of each dimension and t is
    linearly spaced between -pi and +pi. Each row of frame then corresponds to
    a single curve.

    Parameters
    ----------
    frame : DataFrame
        Data to be plotted, preferably normalized to (0.0, 1.0)
    class_column : str
        Name of the column containing class names
    ax : matplotlib axes object, default None
        Axes to draw on. When omitted the current axes are used and their
        x limits are set to (-pi, pi).
    samples : int, default 200
        Number of points to plot in each curve
    color : list or tuple, optional
        Colors to use for the different classes
    colormap : str or matplotlib colormap object, default None
        Colormap to select colors from. If string, load colormap with that name
        from matplotlib.
    kwds : keywords
        Options to pass to matplotlib plotting method

    Returns
    -------
    ax : Matplotlib axis object
    """
    from math import sqrt, pi
    import matplotlib.pyplot as plt

    def function(amplitudes):
        def f(t):
            x1 = amplitudes[0]
            result = x1 / sqrt(2.0)

            # Take the rest of the coefficients and resize them
            # appropriately. Take a copy of amplitudes as otherwise numpy
            # deletes the element from amplitudes itself.
            coeffs = np.delete(np.copy(amplitudes), 0)
            # One row per harmonic: column 0 is the sin coefficient,
            # column 1 the cos coefficient.
            coeffs.resize(int((coeffs.size + 1) / 2), 2)

            # Generate the harmonics and arguments for the sin and cos
            # functions.
            harmonics = np.arange(0, coeffs.shape[0]) + 1
            trig_args = np.outer(harmonics, t)

            result += np.sum(coeffs[:, 0, np.newaxis] * np.sin(trig_args) +
                             coeffs[:, 1, np.newaxis] * np.cos(trig_args),
                             axis=0)
            return result
        return f

    n = len(frame)
    class_col = frame[class_column]
    classes = frame[class_column].drop_duplicates()
    df = frame.drop(class_column, axis=1)
    t = np.linspace(-pi, pi, samples)
    used_legends = set()

    color_values = _get_standard_colors(num_colors=len(classes),
                                        colormap=colormap, color_type='random',
                                        color=color)
    colors = dict(zip(classes, color_values))
    if ax is None:
        # Set the limits explicitly rather than through gca() keyword
        # arguments, which newer matplotlib releases no longer accept.
        ax = plt.gca()
        ax.set_xlim(-pi, pi)
    for i in range(n):
        row = df.iloc[i].values
        f = function(row)
        y = f(t)
        kls = class_col.iat[i]
        label = pprint_thing(kls)
        # Attach the label only to the first curve of each class so the
        # legend shows one entry per class.
        if label not in used_legends:
            used_legends.add(label)
            ax.plot(t, y, color=colors[kls], label=label, **kwds)
        else:
            ax.plot(t, y, color=colors[kls], **kwds)

    ax.legend(loc='upper right')
    ax.grid()
    return ax
def bootstrap_plot(series, fig=None, size=50, samples=500, **kwds):
    """
    Bootstrap plot on mean, median and mid-range statistics.

    The bootstrap plot is used to estimate the uncertainty of a statistic
    by relying on repeated random sampling [1]_. This function will
    generate bootstrapping plots for mean, median and mid-range statistics
    for the given number of samples of the given size.

    Parameters
    ----------
    series : pandas.Series
        Pandas Series from where to get the samplings for the bootstrapping.
    fig : matplotlib.figure.Figure, default None
        If given, it will use the `fig` reference for plotting instead of
        creating a new one with default parameters.
    size : int, default 50
        Number of data points to consider during each sampling. It must be
        less than or equal to the length of the `series`.
    samples : int, default 500
        Number of times the bootstrap procedure is performed.
    **kwds :
        Options to pass to matplotlib plotting method. The same options are
        passed to both the line plots and the histograms.

    Returns
    -------
    fig : matplotlib.figure.Figure
        Matplotlib figure

    Raises
    ------
    ValueError
        Raised by ``random.sample`` when `size` is larger than the length
        of the `series`.

    See Also
    --------
    pandas.DataFrame.plot : Basic plotting for DataFrame objects.
    pandas.Series.plot : Basic plotting for Series objects.

    Notes
    -----
    Each individual sampling is drawn with ``random.sample``, i.e. without
    replacement within that sampling.

    References
    ----------
    .. [1] "Bootstrapping (statistics)" in
       https://en.wikipedia.org/wiki/Bootstrapping_%28statistics%29

    Examples
    --------
    .. plot::
        :context: close-figs

        >>> import numpy as np
        >>> s = pd.Series(np.random.uniform(size=100))
        >>> fig = pd.plotting.bootstrap_plot(s)
    """
    import random
    import matplotlib.pyplot as plt

    # random.sample(ndarray, int) fails on python 3.3, sigh
    data = list(series.values)
    samplings = [random.sample(data, size) for _ in range(samples)]

    means = np.array([np.mean(sampling) for sampling in samplings])
    medians = np.array([np.median(sampling) for sampling in samplings])
    midranges = np.array([(min(sampling) + max(sampling)) * 0.5
                          for sampling in samplings])
    if fig is None:
        fig = plt.figure()
    x = lrange(samples)
    statistics = (means, medians, midranges)
    axes = []

    # Top row (subplots 1-3): value of each statistic per sampling.
    for position, values in enumerate(statistics, 1):
        axis = fig.add_subplot(2, 3, position)
        axis.set_xlabel("Sample")
        axes.append(axis)
        axis.plot(x, values, **kwds)

    # Bottom row (subplots 4-6): histogram of each statistic.
    for position, (title, values) in enumerate(
            zip(("Mean", "Median", "Midrange"), statistics), 4):
        axis = fig.add_subplot(2, 3, position)
        axis.set_xlabel(title)
        axes.append(axis)
        axis.hist(values, **kwds)

    for axis in axes:
        plt.setp(axis.get_xticklabels(), fontsize=8)
        plt.setp(axis.get_yticklabels(), fontsize=8)
    return fig
@deprecate_kwarg(old_arg_name='colors', new_arg_name='color')
@deprecate_kwarg(old_arg_name='data', new_arg_name='frame', stacklevel=3)
def parallel_coordinates(frame, class_column, cols=None, ax=None, color=None,
                         use_columns=False, xticks=None, colormap=None,
                         axvlines=True, axvlines_kwds=None, sort_labels=False,
                         **kwds):
    """Parallel coordinates plotting.

    Every row of the data is drawn as one line across the selected columns,
    colored by the value found in `class_column`.

    Parameters
    ----------
    frame : DataFrame
    class_column : str
        Column name containing class names
    cols : list, optional
        A list of column names to use. When omitted, every column except
        `class_column` is used.
    ax : matplotlib.axis, optional
        matplotlib axis object; the current axes are used when omitted
    color : list or tuple, optional
        Colors to use for the different classes
    use_columns : bool, optional
        If true, columns will be used as xticks
    xticks : list or tuple, optional
        A list of values to use for xticks
    colormap : str or matplotlib colormap, default None
        Colormap to use for line colors.
    axvlines : bool, optional
        If true, vertical lines will be added at each xtick
    axvlines_kwds : keywords, optional
        Options to be passed to axvline method for vertical lines.
        Defaults to ``{'linewidth': 1, 'color': 'black'}``.
    sort_labels : bool, False
        Sort class_column labels, useful when assigning colors

        .. versionadded:: 0.20.0

    kwds : keywords
        Options to pass to matplotlib plotting method

    Returns
    -------
    ax : matplotlib axis object

    Raises
    ------
    ValueError
        If `use_columns` is True and the column labels are not numeric, if
        `xticks` is not numeric, or if the length of `xticks` differs from
        the number of plotted columns.

    Examples
    --------
    >>> from pandas import read_csv
    >>> from pandas.plotting import parallel_coordinates
    >>> from matplotlib import pyplot as plt
    >>> df = read_csv('https://raw.github.com/pandas-dev/pandas/master'
                      '/pandas/tests/data/iris.csv')
    >>> parallel_coordinates(df, 'Name', color=('#556270',
                             '#4ECDC4', '#C7F464'))
    >>> plt.show()
    """
    if axvlines_kwds is None:
        axvlines_kwds = {'linewidth': 1, 'color': 'black'}
    import matplotlib.pyplot as plt

    n_rows = len(frame)
    class_col = frame[class_column]
    classes = class_col.drop_duplicates()

    if cols is not None:
        df = frame[cols]
    else:
        df = frame.drop(class_column, axis=1)

    seen_labels = set()
    ncols = len(df.columns)

    # determine values to use for xticks
    if use_columns is True:
        if not np.all(np.isreal(list(df.columns))):
            raise ValueError('Columns must be numeric to be used as xticks')
        x = df.columns
    elif xticks is None:
        x = lrange(ncols)
    else:
        if not np.all(np.isreal(xticks)):
            raise ValueError('xticks specified must be numeric')
        if len(xticks) != ncols:
            raise ValueError('Length of xticks must match number of columns')
        x = xticks

    if ax is None:
        ax = plt.gca()

    color_values = _get_standard_colors(num_colors=len(classes),
                                        colormap=colormap, color_type='random',
                                        color=color)

    if sort_labels:
        classes = sorted(classes)
        color_values = sorted(color_values)
    colors = dict(zip(classes, color_values))

    for row_pos in range(n_rows):
        y = df.iloc[row_pos].values
        kls = class_col.iat[row_pos]
        label = pprint_thing(kls)
        if label in seen_labels:
            ax.plot(x, y, color=colors[kls], **kwds)
        else:
            # First line of this class: give it the legend entry.
            seen_labels.add(label)
            ax.plot(x, y, color=colors[kls], label=label, **kwds)

    if axvlines:
        for position in x:
            ax.axvline(position, **axvlines_kwds)

    ax.set_xticks(x)
    ax.set_xticklabels(df.columns)
    ax.set_xlim(x[0], x[-1])
    ax.legend(loc='upper right')
    ax.grid()
    return ax
def lag_plot(series, lag=1, ax=None, **kwds):
    """Lag plot for time series.

    Scatter the values of `series` against the values `lag` positions
    later.

    Parameters
    ----------
    series : Time series
    lag : int, default 1
        Lag of the scatter plot. A lag of 0 plots the series against
        itself.
    ax : Matplotlib axis object, optional
        Axes to draw on; the current axes are used when omitted.
    kwds : Matplotlib scatter method keyword arguments, optional

    Returns
    -------
    ax : Matplotlib axis object
    """
    import matplotlib.pyplot as plt

    # workaround because `c='b'` is hardcoded in matplotlibs scatter method
    kwds.setdefault('c', plt.rcParams['patch.facecolor'])

    data = series.values
    # `data[:-0]` is an empty slice, which would not match the length of
    # `data[0:]`; a zero lag must keep the whole array on both axes.
    y1 = data[:-lag] if lag else data
    y2 = data[lag:]
    if ax is None:
        ax = plt.gca()
    ax.set_xlabel("y(t)")
    ax.set_ylabel("y(t + {lag})".format(lag=lag))
    ax.scatter(y1, y2, **kwds)
    return ax
def autocorrelation_plot(series, ax=None, **kwds):
    """Autocorrelation plot for time series.

    Plots the sample autocorrelation for every lag from 1 to
    ``len(series)``, together with horizontal reference lines at
    +/- z / sqrt(n) for z = 1.96 (solid) and z = 2.58 (dashed).

    Parameters
    ----------
    series : Time series
    ax : Matplotlib axis object, optional
        Axes to draw on. When omitted the current axes are used, with x
        limits set to (1, n) and y limits set to (-1.0, 1.0).
    kwds : keywords
        Options to pass to matplotlib plotting method. A legend is drawn
        when a ``label`` is given.

    Returns
    -------
    ax : Matplotlib axis object
    """
    import matplotlib.pyplot as plt
    n = len(series)
    data = np.asarray(series)
    if ax is None:
        # Set the limits explicitly rather than through gca() keyword
        # arguments, which newer matplotlib releases no longer accept.
        ax = plt.gca()
        ax.set_xlim(1, n)
        ax.set_ylim(-1.0, 1.0)
    mean = np.mean(data)
    # Lag-0 autocovariance, used to normalize every other lag.
    c0 = np.sum((data - mean) ** 2) / float(n)

    def r(h):
        # Sample autocorrelation at lag h.
        return ((data[:n - h] - mean) *
                (data[h:] - mean)).sum() / float(n) / c0
    x = np.arange(n) + 1
    y = [r(h) for h in x]
    z95 = 1.959963984540054
    z99 = 2.5758293035489004
    ax.axhline(y=z99 / np.sqrt(n), linestyle='--', color='grey')
    ax.axhline(y=z95 / np.sqrt(n), color='grey')
    ax.axhline(y=0.0, color='black')
    ax.axhline(y=-z95 / np.sqrt(n), color='grey')
    ax.axhline(y=-z99 / np.sqrt(n), linestyle='--', color='grey')
    ax.set_xlabel("Lag")
    ax.set_ylabel("Autocorrelation")
    ax.plot(x, y, **kwds)
    if 'label' in kwds:
        ax.legend()
    ax.grid()
    return ax