laywerrobot/lib/python3.6/site-packages/tensorboard/plugins/debugger/numerics_alert.py

# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Data structures and algorithms for numerics alert.

The alerts are generated when a Tensor's elements contain bad values,
including nan, -inf and +inf.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import json
import re

import tensorflow as tf

from tensorflow.python import debug as tf_debug
from tensorboard.plugins.debugger import constants

# The following two namedtuples are the same except that
# 1) `timestamp` in `NumericsAlert` is the timestamp of a single alerting event,
#    while `first_timestamp` in `NumericsAlertReportRow` is the first (earliest)
#    timestamp of a set of aggregated `NumericsAlert`s.
# 2) The counts in `NumericsAlert` are the counts of elements in the tensor,
#    while the event counts in `NumericsAlertReportRow` are counts of previous
#    `NumericsAlert` events of the corresponding categories.
NumericsAlert = collections.namedtuple(
    "NumericsAlert",
    "device_name tensor_name timestamp "
    "nan_count neg_inf_count pos_inf_count")

NumericsAlertReportRow = collections.namedtuple(
    "NumericsAlertReportRow",
    "device_name tensor_name first_timestamp "
    "nan_event_count neg_inf_event_count pos_inf_event_count")

# Snapshot of an _EventTracker's state, used to rebuild a tracker from data
# read from disk. The fields mirror the attributes of _EventTracker; change
# both together.
EventTrackerDescription = collections.namedtuple(
    "EventTrackerDescription",
    "event_count first_timestamp last_timestamp")

# One serialized registry entry: the device name, the tensor name, and the
# JSON-able history from which a NumericsAlertHistory is reconstructed.
HistoryTriplet = collections.namedtuple(
    "HistoryTriplet",
    "device tensor jsonable_history")


class _EventTracker(object):
  """Bookkeeping for one category of bad values (NaN, -Inf, or +Inf).

  Maintains a running event count together with the earliest and the latest
  timestamps among the recorded events.
  """

  def __init__(self, event_count=0, first_timestamp=-1, last_timestamp=-1):
    """Creates a tracker, optionally seeded with previously recorded state.

    Args:
      event_count: Number of events already recorded.
      first_timestamp: Timestamp of the earliest event of this category.
        Defaults to -1, the placeholder used while no event is recorded.
      last_timestamp: Timestamp of the latest event of this category.
        Defaults to -1, the placeholder used while no event is recorded.
    """
    # These three attributes are mirrored by EventTrackerDescription, which is
    # how a tracker is written to and restored from disk. Change both together.
    self.event_count = event_count
    self.first_timestamp = first_timestamp
    self.last_timestamp = last_timestamp

  def add(self, timestamp):
    """Records one more event that happened at `timestamp`.

    Args:
      timestamp: Timestamp of the event. Events may arrive out of order.
    """
    if self.event_count != 0:
      # Widen the [first, last] interval only when the new timestamp falls
      # strictly outside of it.
      self.first_timestamp = min(self.first_timestamp, timestamp)
      self.last_timestamp = max(self.last_timestamp, timestamp)
    else:
      # Very first event: it is both the earliest and the latest one.
      self.first_timestamp = self.last_timestamp = timestamp
    self.event_count += 1

  def get_description(self):
    """Returns the tracker state as an `EventTrackerDescription`."""
    return EventTrackerDescription(
        event_count=self.event_count,
        first_timestamp=self.first_timestamp,
        last_timestamp=self.last_timestamp)


class NumericsAlertHistory(object):
  """History of numerics alerts for a single (device, tensor) pair.

  One event tracker is kept per value category; the trackers are stored in a
  dict keyed by the category key (e.g., constants.NAN_KEY).
  """

  def __init__(self, initialization_list=None):
    """Stores alert history for a single device, tensor pair.

    Args:
      initialization_list: (`dict`) Despite its name, an optional mapping
        (parsed from JSON read from disk) from value category key to the
        sequence `[event_count, first_timestamp, last_timestamp]`. It is used
        to initialize this NumericsAlertHistory. Use the
        create_jsonable_history method of this class to create such an object.
        If omitted or empty, the history starts with no recorded events.
    """
    if initialization_list:
      # Use data to initialize this NumericsAlertHistory.
      self._trackers = {}
      for value_category_key, description_list in initialization_list.items():
        description = EventTrackerDescription._make(description_list)
        self._trackers[value_category_key] = _EventTracker(
            event_count=description.event_count,
            first_timestamp=description.first_timestamp,
            last_timestamp=description.last_timestamp)
    else:
      # Start cleanly. With no prior data.
      self._trackers = {
          constants.NAN_KEY: _EventTracker(),
          constants.NEG_INF_KEY: _EventTracker(),
          constants.POS_INF_KEY: _EventTracker(),
      }

  def add(self, numerics_alert):
    """Records an alert in the tracker of every category it reports.

    Args:
      numerics_alert: A `NumericsAlert`. Each of its nan_count, neg_inf_count
        and pos_inf_count that is nonzero adds one event, stamped with the
        alert's timestamp, to the corresponding tracker.
    """
    if numerics_alert.nan_count:
      self._trackers[constants.NAN_KEY].add(numerics_alert.timestamp)
    if numerics_alert.neg_inf_count:
      self._trackers[constants.NEG_INF_KEY].add(numerics_alert.timestamp)
    if numerics_alert.pos_inf_count:
      self._trackers[constants.POS_INF_KEY].add(numerics_alert.timestamp)

  def first_timestamp(self, event_key=None):
    """Obtain the first timestamp.

    Args:
      event_key: the type key of the sought events (e.g., constants.NAN_KEY).
        If None, includes all event type keys.

    Returns:
      First (earliest) timestamp of all the events of the given type (or all
        event types if event_key is None). If event_key is None and no events
        have been recorded for any type, returns -1.
    """
    if event_key is not None:
      return self._trackers[event_key].first_timestamp
    # Trackers without events hold a negative placeholder timestamp; they must
    # not take part in the aggregate.
    timestamps = [tracker.first_timestamp
                  for tracker in self._trackers.values()
                  if tracker.first_timestamp >= 0]
    return min(timestamps) if timestamps else -1

  def last_timestamp(self, event_key=None):
    """Obtain the last timestamp.

    Args:
      event_key: the type key of the sought events (e.g., constants.NAN_KEY).
        If None, includes all event type keys.

    Returns:
      Last (latest) timestamp of all the events of the given type (or all
        event types if event_key is None). If event_key is None and no events
        have been recorded for any type, returns -1.
    """
    if event_key is not None:
      return self._trackers[event_key].last_timestamp
    # Aggregate over the *last* timestamps of the trackers, skipping the
    # negative placeholder of trackers that have no events.
    timestamps = [tracker.last_timestamp
                  for tracker in self._trackers.values()
                  if tracker.last_timestamp >= 0]
    return max(timestamps) if timestamps else -1

  def event_count(self, event_key=None):
    """Obtain event count.

    Args:
      event_key: the type key of the sought events (e.g., constants.NAN_KEY).
        If None, includes all event type keys.

    Returns:
      If event_key is None, return the sum of the event_count of all event
      types. Otherwise, return the event_count of the specified event type.
    """
    if event_key is None:
      return sum(tracker.event_count for tracker in self._trackers.values())
    return self._trackers[event_key].event_count

  def create_jsonable_history(self):
    """Creates a JSON-able representation of this object.

    Returns:
      A dictionary mapping value category key to EventTrackerDescription
      (which can be used to create event trackers). The dictionary is accepted
      as the `initialization_list` argument of the constructor.
    """
    return {value_category_key: tracker.get_description()
            for (value_category_key, tracker) in self._trackers.items()}


class NumericsAlertRegistry(object):
  """Registry of numerics alerts (e.g., due to NaNs and infinities).

  Alerts are grouped per (device name, tensor name) pair, with one
  `NumericsAlertHistory` kept for each pair.
  """

  def __init__(self, capacity=100, initialization_list=None):
    """Constructor.

    Args:
      capacity: (`int`) maximum number of device-tensor keys that `register`
        will create.
      initialization_list: (`list`) An optional list (parsed from JSON) of
        `[device, tensor, jsonable_history]` entries used to populate the
        registry. Use the create_jsonable_registry method of
        NumericsAlertRegistry to create such a list.
    """
    self._capacity = capacity

    # Maps a device-tensor key to the NumericsAlertHistory of that tensor.
    # The key is a 2-tuple of the format (device_name, tensor_name), e.g.,
    # ("/job:worker/replica:0/task:1/gpu:0", "cross_entropy/Log:0").
    self._data = {}

    if initialization_list:
      # Restore the registry from previously saved data, e.g., a backup used
      # to recover after the job was pre-empted.
      triplets = (HistoryTriplet._make(entry) for entry in initialization_list)
      for device, tensor, jsonable_history in triplets:
        self._data[(device, tensor)] = NumericsAlertHistory(
            initialization_list=jsonable_history)

  def register(self, numerics_alert):
    """Register an alerting numeric event.

    Alerts for a device-tensor pair that is not yet tracked are dropped once
    the registry holds `capacity` pairs.

    Args:
      numerics_alert: An instance of `NumericsAlert`.
    """
    key = (numerics_alert.device_name, numerics_alert.tensor_name)
    if key in self._data:
      self._data[key].add(numerics_alert)
    elif len(self._data) < self._capacity:
      fresh_history = NumericsAlertHistory()
      fresh_history.add(numerics_alert)
      self._data[key] = fresh_history

  def report(self, device_name_filter=None, tensor_name_filter=None):
    """Get a report of offending device/tensor names.

    Each row carries the device name, the tensor name, the first (earliest)
    timestamp of the alerting events from the tensor, and the counts of nan,
    negative inf and positive inf events.

    Args:
      device_name_filter: regex filter for device name, or None (not filtered).
        The pattern is applied with `match`, i.e., anchored at the start of
        the name.
      tensor_name_filter: regex filter for tensor name, or None (not filtered).
        The pattern is applied with `match`, i.e., anchored at the start of
        the name.

    Returns:
      A list of NumericsAlertReportRow, sorted by the first_timestamp in
      ascending order.
    """
    rows = [
        NumericsAlertReportRow(
            device_name=device_name,
            tensor_name=tensor_name,
            first_timestamp=history.first_timestamp(),
            nan_event_count=history.event_count(constants.NAN_KEY),
            neg_inf_event_count=history.event_count(constants.NEG_INF_KEY),
            pos_inf_event_count=history.event_count(constants.POS_INF_KEY))
        for (device_name, tensor_name), history in self._data.items()]

    if device_name_filter:
      matches_device = re.compile(device_name_filter).match
      rows = [row for row in rows if matches_device(row.device_name)]
    if tensor_name_filter:
      matches_tensor = re.compile(tensor_name_filter).match
      rows = [row for row in rows if matches_tensor(row.tensor_name)]

    # Sort results chronologically.
    rows.sort(key=lambda row: row.first_timestamp)
    return rows

  def create_jsonable_registry(self):
    """Creates a JSON-able representation of this object.

    Returns:
      A list of HistoryTriplet, one per (device, tensor name) pair, each
      holding the device name, the tensor name and the JSON-able
      representation of the pair's NumericsAlertHistory.
    """
    # JSON does not support tuples as keys. Only strings. Therefore, the device
    # name, tensor name, and dictionary data are stored as a 3-item entry.
    jsonable_entries = []
    for (device, tensor), history in self._data.items():
      jsonable_entries.append(
          HistoryTriplet(device, tensor, history.create_jsonable_history()))
    return jsonable_entries


def extract_numerics_alert(event):
  """Builds a `NumericsAlert` from a health pill event, if it has bad values.

  A bad value is one of NaN, -Inf, or +Inf.

  Args:
    event: (`Event`) A `tensorflow.Event` proto from `DebugNumericSummary`
      ops. Only its first summary value is inspected.

  Returns:
    An instance of `NumericsAlert`, if bad values are found.
    `None`, if no bad values are found.

  Raises:
    ValueError: if the event carries no debugger plugin metadata, if the
      device cannot be read from that metadata (invalid JSON or no "device"
      entry), or if the node name lacks the expected debug op suffix.
  """
  summary_value = event.summary.value[0]

  # Only metadata that belongs to the debugger plugin is of interest.
  metadata_content = None
  if summary_value.HasField("metadata"):
    plugin_data = summary_value.metadata.plugin_data
    if plugin_data.plugin_name == constants.DEBUGGER_PLUGIN_NAME:
      metadata_content = plugin_data.content
  if not metadata_content:
    raise ValueError("Event proto input lacks debugger plugin SummaryMetadata.")

  # The metadata content is a JSON string that names the device.
  metadata_text = tf.compat.as_text(metadata_content)
  try:
    device_name = json.loads(metadata_text)["device"]
  except (KeyError, ValueError) as e:
    raise ValueError("Could not determine device from JSON string %r, %r" %
                     (metadata_text, e))

  # The node name is the tensor name followed by the debug op suffix.
  debug_op_suffix = ":DebugNumericSummary"
  node_name = summary_value.node_name
  if not node_name.endswith(debug_op_suffix):
    raise ValueError(
        "Event proto input does not have the expected debug op suffix %s" %
        debug_op_suffix)
  tensor_name = node_name[:-len(debug_op_suffix)]

  # Pick the counts of bad values out of the numeric summary.
  summary = tf_debug.load_tensor_from_event(event)
  nan_count = summary[constants.NAN_NUMERIC_SUMMARY_OP_INDEX]
  neg_inf_count = summary[constants.NEG_INF_NUMERIC_SUMMARY_OP_INDEX]
  pos_inf_count = summary[constants.POS_INF_NUMERIC_SUMMARY_OP_INDEX]

  has_bad_values = any(
      count > 0 for count in (nan_count, neg_inf_count, pos_inf_count))
  if not has_bad_values:
    return None
  return NumericsAlert(
      device_name, tensor_name, event.wall_time, nan_count, neg_inf_count,
      pos_inf_count)