# Copyright 2017 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """Control flow graph (CFG) structure for Python AST representation. The CFG is a digraph with edges representing valid control flow. Each node is associated with exactly one AST node, but not all AST nodes may have a corresponding CFG counterpart. Once built, the CFG itself is immutable, but the values it holds need not be; they are usually annotated with information extracted by walking the graph. """ from __future__ import absolute_import from __future__ import division from __future__ import print_function import collections from enum import Enum # pylint:disable=g-bad-import-order import gast # pylint:enable=g-bad-import-order from tensorflow.contrib.autograph.pyct import compiler class Node(object): """A node in the CFG. Although new instances of this class are mutable, the objects that a user finds in the CFG are typically not. The nodes represent edges in the CFG graph, and maintain pointers to allow efficient walking in both forward and reverse order. The following property holds for all nodes: "child in node.next" iff "node in child.prev". Attributes: next: FrozenSet[Node, ...], the nodes that follow this node, in control flow order prev: FrozenSet[Node, ...], the nodes that precede this node, in reverse control flow order ast_node: ast.AST, the AST node corresponding to this CFG node """ def __init__(self, next_, prev, ast_node): self.next = next_ self.prev = prev self.ast_node = ast_node def freeze(self): self.next = frozenset(self.next) self.prev = frozenset(self.prev) def __repr__(self): if isinstance(self.ast_node, gast.FunctionDef): return 'def %s' % self.ast_node.name elif isinstance(self.ast_node, gast.withitem): source, _ = compiler.ast_to_source(self.ast_node.context_expr) return source.strip() source, _ = compiler.ast_to_source(self.ast_node) return source.strip() class Graph( collections.namedtuple( 'Graph', ['entry', 'exit', 'error', 'index', 'stmt_prev', 'stmt_next'])): """A Control Flow Graph. The CFG maintains an index to allow looking up a CFG node by the AST node to which it is associated. The index can also be enumerated in top-down, depth first order. Walking the graph in forward or reverse order is supported by double parent-child links. Note: the error nodes are not wired to their corresponding finally guards, because these are shared, and wiring them would create a reverse path from normal control flow into the error nodes, which we want to avoid. The graph also maintains edges corresponding to higher level statements like for-else loops. A node is considered successor of a statement if there is an edge from a node that is lexically a child of that statement to a node that is not. Statement predecessors are analogously defined. Attributes: entry: Node, the entry node exit: FrozenSet[Node, ...], the exit nodes error: FrozenSet[Node, ...], nodes that exit due to an explicitly raised error (errors propagated from function calls are not accounted) index: Dict[ast.Node, Node], mapping AST nodes to the respective CFG node stmt_prev: Dict[ast.Node, FrozenSet[Node, ...]], mapping statement AST nodes to their predecessor CFG nodes stmt_next: Dict[ast.Node, FrozenSet[Node, ...]], mapping statement AST nodes to their successor CFG nodes """ def __repr__(self): result = 'digraph CFG {\n' for node in self.index.values(): result += ' %s [label="%s"];\n' % (id(node), node) for node in self.index.values(): for next_ in node.next: result += ' %s -> %s;\n' % (id(node), id(next_)) result += '}' return result class _WalkMode(Enum): FORWARD = 1 REVERSE = 2 # TODO(mdan): Rename to DataFlowAnalyzer. # TODO(mdan): Consider specializations that use gen/kill/transfer abstractions. class GraphVisitor(object): """Base class for a CFG visitors. This implementation is not thread safe. The visitor has some facilities to simplify dataflow analyses. In particular, it allows revisiting the nodes at the decision of the subclass. This can be used to visit the graph until the state reaches a fixed point. For more details on dataflow analysis, see https://www.seas.harvard.edu/courses/cs252/2011sp/slides/Lec02-Dataflow.pdf Note: the literature generally suggests visiting successor nodes only when the state of the current node changed, regardless of whether that successor has ever been visited. This implementation visits every successor at least once. Attributes: graph: Graph in_: Dict[Node, Any], stores node-keyed state during a visit out: Dict[Node, Any], stores node-keyed state during a visit """ def __init__(self, graph): self.graph = graph self.reset() def init_state(self, node): """State initialization function. Optional to overload. An in/out state slot will be created for each node in the graph. Subclasses must overload this to control what that is initialized to. Args: node: Node """ raise NotImplementedError('Subclasses must implement this.') # TODO(mdan): Rename to flow? def visit_node(self, node): """Visitor function. Args: node: Node Returns: bool, whether the node should be revisited; subclasses can visit every reachable node exactly once by always returning False """ raise NotImplementedError('Subclasses must implement this.') def reset(self): self.in_ = { node: self.init_state(node) for node in self.graph.index.values() } self.out = { node: self.init_state(node) for node in self.graph.index.values() } def _visit_internal(self, mode): """Visits the CFG, depth-first.""" assert mode in (_WalkMode.FORWARD, _WalkMode.REVERSE) if mode == _WalkMode.FORWARD: open_ = [self.graph.entry] elif mode == _WalkMode.REVERSE: open_ = list(self.graph.exit) closed = set() while open_: node = open_.pop(0) closed.add(node) should_revisit = self.visit_node(node) if mode == _WalkMode.FORWARD: children = node.next elif mode == _WalkMode.REVERSE: children = node.prev for next_ in children: if should_revisit or next_ not in closed: open_.append(next_) def visit_forward(self): self._visit_internal(_WalkMode.FORWARD) def visit_reverse(self): self._visit_internal(_WalkMode.REVERSE) class GraphBuilder(object): """Builder that constructs a CFG from a given AST. This GraphBuilder facilitates constructing the DAG that forms the CFG when nodes are supplied in lexical order (i.e., top-down, depth first). Under these conditions, it supports building patterns found in typical structured programs. This builder ignores the flow generated by exceptions, which are assumed to always be catastrophic and present purely for diagnostic purposes (e.g. to print debug information). Statements like raise and try/catch sections are allowed and will generate control flow edges, but ordinaty statements are assumed not to raise exceptions. Finally sections are also correctly interleaved between break/continue/return nodes and their subsequent statements. Important concepts: * nodes - nodes refer refer to CFG nodes; AST nodes are qualified explicitly * leaf set - since the graph is constructed gradually, a leaf set maintains the CFG nodes that will precede the node that the builder expects to receive next; when an ordinary node is added, it is connected to the existing leaves and it in turn becomes the new leaf * jump nodes - nodes that should generate edges other than what ordinary nodes would; these correspond to break, continue and return statements * sections - logical delimiters for subgraphs that require special edges; there are various types of nodes, each admitting various types of jump nodes; sections are identified by their corresponding AST node """ # TODO(mdan): Perhaps detail this in a markdown doc. # TODO(mdan): Add exception support. def __init__(self, parent_ast_node): self.reset() self.parent = parent_ast_node def reset(self): """Resets the state of this factory.""" self.head = None self.errors = set() self.node_index = collections.OrderedDict() # TODO(mdan): Too many primitives. Use classes. self.leaves = set() # Note: This mechanism requires that nodes are added in lexical order (top # to bottom, depth first). self.active_stmts = set() self.owners = {} # type: Set[any] self.forward_edges = set() # type: Tuple[Node, Node] # (from, to) self.finally_sections = {} # Dict values represent (entry, exits) self.finally_section_subgraphs = { } # type: Dict[ast.AST, Tuple[Node, Set[Node]]] # Whether the guard section can be reached from the statement that precedes # it. self.finally_section_has_direct_flow = {} # Finally sections that await their first node. self.pending_finally_sections = set() # Exit jumps keyed by the section they affect. self.exits = {} # The entry of loop sections, keyed by the section. self.section_entry = {} # Continue jumps keyed by the section they affect. self.continues = {} # The entry of conditional sections, keyed by the section. self.cond_entry = {} # Lists of leaf nodes corresponding to each branch in the section. self.cond_leaves = {} def _connect_nodes(self, first, second): """Connects nodes to signify that control flows from first to second. Args: first: Union[Set[Node, ...], Node] second: Node """ if isinstance(first, Node): first.next.add(second) second.prev.add(first) self.forward_edges.add((first, second)) else: for node in first: self._connect_nodes(node, second) def _add_new_node(self, ast_node): """Grows the graph by adding a CFG node following the current leaves.""" if ast_node is self.node_index: raise ValueError('%s added twice' % ast_node) node = Node(next_=set(), prev=set(), ast_node=ast_node) self.node_index[ast_node] = node self.owners[node] = frozenset(self.active_stmts) if self.head is None: self.head = node for leaf in self.leaves: self._connect_nodes(leaf, node) # If any finally section awaits its first node, populate it. for section_id in self.pending_finally_sections: self.finally_section_subgraphs[section_id][0] = node self.pending_finally_sections = set() return node def begin_statement(self, stmt): """Marks the beginning of a statement. Args: stmt: Hashable, a key by which the statement can be identified in the CFG's stmt_prev and stmt_next attributes """ self.active_stmts.add(stmt) def end_statement(self, stmt): """Marks the end of a statement. Args: stmt: Hashable, a key by which the statement can be identified in the CFG's stmt_prev and stmt_next attributes; must match a key previously passed to begin_statement. """ self.active_stmts.remove(stmt) def add_ordinary_node(self, ast_node): """Grows the graph by adding an ordinary CFG node. Ordinary nodes are followed by the next node, in lexical order, that is, they become the new leaf set. Args: ast_node: ast.AST Returns: Node """ node = self._add_new_node(ast_node) self.leaves = set((node,)) return node def _add_jump_node(self, ast_node, guards): """Grows the graph by adding a jump node. Jump nodes are added to the current leaf set, and the leaf set becomes empty. If the jump node is the last in a cond section, then it may be added back to the leaf set by a separate mechanism. Args: ast_node: ast.AST guards: Tuple[ast.AST, ...], the finally sections active for this node Returns: Node """ node = self._add_new_node(ast_node) self.leaves = set() # The guards themselves may not yet be complete, and will be wired later. self.finally_sections[node] = guards return node def _connect_jump_to_finally_sections(self, node): """Connects a jump node to the finally sections protecting it.""" cursor = set((node,)) for guard_section_id in self.finally_sections[node]: guard_begin, guard_ends = self.finally_section_subgraphs[guard_section_id] self._connect_nodes(cursor, guard_begin) cursor = guard_ends del self.finally_sections[node] # TODO(mdan): Should garbage-collect finally_section_subgraphs. return cursor def add_exit_node(self, ast_node, section_id, guards): """Grows the graph by adding an exit node. This node becomes an exit for the current section. Args: ast_node: ast.AST section_id: Hashable, the node for which ast_node should be considered to be an exit node guards: Tuple[ast.AST, ...], the finally sections that guard ast_node """ node = self._add_jump_node(ast_node, guards) self.exits[section_id].add(node) def add_continue_node(self, ast_node, section_id, guards): """Grows the graph by adding a reentry node. This node causes control flow to go back to the loop section's entry. Args: ast_node: ast.AST section_id: Hashable, the node for which ast_node should be considered to be an exit node guards: Tuple[ast.AST, ...], the finally sections that guard ast_node """ node = self._add_jump_node(ast_node, guards) self.continues[section_id].add(node) def add_error_node(self, ast_node, guards): """Grows the graph by adding an error node. This node becomes an exit for the entire graph. Args: ast_node: ast.AST guards: Tuple[ast.AST, ...], the finally sections that guard ast_node """ node = self._add_jump_node(ast_node, guards) self.errors.add(node) self.leaves = set() def enter_section(self, section_id): """Enters a regular section. Regular sections admit exit jumps, which end the section. Args: section_id: Hashable, the same node that will be used in calls to the ast_node arg passed to add_exit_node """ assert section_id not in self.exits self.exits[section_id] = set() def exit_section(self, section_id): """Exits a regular section.""" # Exits are jump nodes, which may be protected. for exit_ in self.exits[section_id]: self.leaves |= self._connect_jump_to_finally_sections(exit_) del self.exits[section_id] def enter_loop_section(self, section_id, entry_node): """Enters a loop section. Loop sections define an entry node. The end of the section always flows back to the entry node. These admit continue jump nodes which also flow to the entry node. Args: section_id: Hashable, the same node that will be used in calls to the ast_node arg passed to add_continue_node entry_node: ast.AST, the entry node into the loop (e.g. the test node for while loops) """ assert section_id not in self.section_entry assert section_id not in self.continues self.continues[section_id] = set() node = self.add_ordinary_node(entry_node) self.section_entry[section_id] = node def exit_loop_section(self, section_id): """Exits a loop section.""" self._connect_nodes(self.leaves, self.section_entry[section_id]) # continues are jump nodes, which may be protected. for reentry in self.continues[section_id]: guard_ends = self._connect_jump_to_finally_sections(reentry) self._connect_nodes(guard_ends, self.section_entry[section_id]) # Loop nodes always loop back. self.leaves = set((self.section_entry[section_id],)) del self.continues[section_id] del self.section_entry[section_id] def enter_cond_section(self, section_id): """Enters a conditional section. Conditional sections define an entry node, and one or more branches. Args: section_id: Hashable, the same node that will be used in calls to the section_id arg passed to new_cond_branch """ assert section_id not in self.cond_entry assert section_id not in self.cond_leaves self.cond_leaves[section_id] = [] def new_cond_branch(self, section_id): """Begins a new branch in a cond section.""" assert section_id in self.cond_leaves if section_id in self.cond_entry: # Subsequent splits move back to the split point, and memorize the # current leaves. self.cond_leaves[section_id].append(self.leaves) self.leaves = self.cond_entry[section_id] else: # If this is the first time we split a section, just remember the split # point. self.cond_entry[section_id] = self.leaves def exit_cond_section(self, section_id): """Exits a conditional section.""" for split in self.cond_leaves[section_id]: self.leaves |= split del self.cond_entry[section_id] del self.cond_leaves[section_id] def enter_finally_section(self, section_id): """Enters a finally section.""" # TODO(mdan): This, not the caller, should track the active sections. self.finally_section_subgraphs[section_id] = [None, None] if self.leaves: self.finally_section_has_direct_flow[section_id] = True else: self.finally_section_has_direct_flow[section_id] = False self.pending_finally_sections.add(section_id) def exit_finally_section(self, section_id): """Exits a finally section.""" assert section_id not in self.pending_finally_sections, 'Empty finally?' self.finally_section_subgraphs[section_id][1] = self.leaves # If the guard can only be reached by a jump, then it will not flow # into the statement that follows it. if not self.finally_section_has_direct_flow[section_id]: self.leaves = set() del self.finally_section_has_direct_flow[section_id] def build(self): """Returns the CFG accumulated so far and resets the builder. Returns: Graph """ # Freeze the nodes. for node in self.node_index.values(): node.freeze() # Build the statement edges. stmt_next = {} stmt_prev = {} for node, _ in self.forward_edges: for stmt in self.owners[node]: if stmt not in stmt_next: stmt_next[stmt] = set() if stmt not in stmt_prev: stmt_prev[stmt] = set() for first, second in self.forward_edges: stmts_exited = self.owners[first] - self.owners[second] for stmt in stmts_exited: stmt_next[stmt].add(second) stmts_entered = self.owners[second] - self.owners[first] for stmt in stmts_entered: stmt_prev[stmt].add(first) for stmt in stmt_next: stmt_next[stmt] = frozenset(stmt_next[stmt]) for stmt in stmt_prev: stmt_prev[stmt] = frozenset(stmt_prev[stmt]) # Construct the final graph object. result = Graph( entry=self.head, exit=self.leaves, error=self.errors, index=self.node_index, stmt_prev=stmt_prev, stmt_next=stmt_next) # Reset the state. self.reset() return result class AstToCfg(gast.NodeVisitor): """Converts an AST to CFGs. A separate CFG will be constructed for each function. """ def __init__(self): super(AstToCfg, self).__init__() self.builder_stack = [] self.builder = None self.cfgs = {} self.lexical_scopes = [] def _enter_lexical_scope(self, node): self.lexical_scopes.append(node) def _exit_lexical_scope(self, node): leaving_node = self.lexical_scopes.pop() assert node == leaving_node def _get_enclosing_scopes(self, include, stop_at): included = [] for node in reversed(self.lexical_scopes): if isinstance(node, include): included.append(node) if isinstance(node, stop_at): return node, included return None, included def _process_basic_statement(self, node): self.generic_visit(node) self.builder.add_ordinary_node(node) def _process_exit_statement(self, node, *exits_nodes_of_type): # Note: this is safe because we process functions separately. try_node, guards = self._get_enclosing_scopes( include=(gast.Try,), stop_at=tuple(exits_nodes_of_type), ) if try_node is None: raise ValueError( '%s that is not enclosed by any of %s' % (node, exits_nodes_of_type)) self.builder.add_exit_node(node, try_node, guards) def _process_continue_statement(self, node, *loops_to_nodes_of_type): # Note: this is safe because we process functions separately. try_node, guards = self._get_enclosing_scopes( include=(gast.Try,), stop_at=tuple(loops_to_nodes_of_type), ) if try_node is None: raise ValueError('%s that is not enclosed by any of %s' % (node, loops_to_nodes_of_type)) self.builder.add_continue_node(node, try_node, guards) def visit_FunctionDef(self, node): # We also keep the FunctionDef node in the CFG. This allows us to determine # things like reaching definitions via closure. Note that the function body # will be stored in a separate graph, because function definitions are not # the same as function calls. if self.builder is not None: self.builder.add_ordinary_node(node) self.builder_stack.append(self.builder) self.builder = GraphBuilder(node) self._enter_lexical_scope(node) self.builder.enter_section(node) self._process_basic_statement(node.args) for stmt in node.body: self.visit(stmt) self.builder.exit_section(node) self._exit_lexical_scope(node) self.cfgs[node] = self.builder.build() self.builder = self.builder_stack.pop() def visit_Lambda(self, node): # TODO(mdan): Treat like FunctionDef? That would be a separate CFG. raise NotImplementedError() def visit_Return(self, node): self._process_exit_statement(node, gast.FunctionDef) def visit_Expr(self, node): self._process_basic_statement(node) def visit_Assign(self, node): self._process_basic_statement(node) def visit_AnnAssign(self, node): self._process_basic_statement(node) def visit_AugAssign(self, node): self._process_basic_statement(node) def visit_Print(self, node): self._process_basic_statement(node) def visit_Raise(self, node): try_node, guards = self._get_enclosing_scopes( include=(gast.Try,), stop_at=(gast.FunctionDef,), ) if try_node is None: raise ValueError('%s that is not enclosed by any FunctionDef' % node) self.builder.add_error_node(node, try_node, guards) def visit_Assert(self, node): # Ignoring the effect of exceptions. self._process_basic_statement(node) def visit_Delete(self, node): self._process_basic_statement(node) def visit_If(self, node): # No need to track ifs as lexical scopes, for now. # Lexical scopes are generally tracked in order to be able to resolve the # targets of jump statements like break/continue/etc. Since there is no # statement that can interrupt a conditional, we don't need to track their # lexical scope. That may change in the future. self.builder.begin_statement(node) self.builder.enter_cond_section(node) self._process_basic_statement(node.test) self.builder.new_cond_branch(node) for stmt in node.body: self.visit(stmt) self.builder.new_cond_branch(node) for stmt in node.orelse: self.visit(stmt) self.builder.exit_cond_section(node) self.builder.end_statement(node) def visit_While(self, node): self.builder.begin_statement(node) self._enter_lexical_scope(node) self.builder.enter_section(node) self.builder.enter_loop_section(node, node.test) for stmt in node.body: self.visit(stmt) self.builder.exit_loop_section(node) # Note: although the orelse is technically part of the loop node, # the statements inside it don't affect the loop itself. For example, a # break in the loop's orelse will not affect the loop itself. self._exit_lexical_scope(node) for stmt in node.orelse: self.visit(stmt) self.builder.exit_section(node) self.builder.end_statement(node) def visit_For(self, node): self.builder.begin_statement(node) self._enter_lexical_scope(node) self.builder.enter_section(node) # TODO(mdan): Strictly speaking, this should be node.target + node.iter. # A blind dataflow analysis would have to process both node.target and # node.iter to properly process read and write access. self.builder.enter_loop_section(node, node.iter) for stmt in node.body: self.visit(stmt) self.builder.exit_loop_section(node) # Note: although the orelse is technically part of the loop node, # they don't count as loop bodies. For example, a break in the loop's # orelse will affect the parent loop, not the current one. self._exit_lexical_scope(node) for stmt in node.orelse: self.visit(stmt) self.builder.exit_section(node) self.builder.end_statement(node) def visit_Break(self, node): self._process_exit_statement(node, gast.While, gast.For) def visit_Continue(self, node): self._process_continue_statement(node, gast.While, gast.For) def visit_Try(self, node): self._enter_lexical_scope(node) for stmt in node.body: self.visit(stmt) # Unlike loops, the orelse is a simple continuation of the body. for stmt in node.orelse: self.visit(stmt) if node.handlers: # TODO(mdan): Should we still support bare try/except? Might be confusing. raise NotImplementedError('exceptions are not yet supported') self._exit_lexical_scope(node) self.builder.enter_finally_section(node) for stmt in node.finalbody: self.visit(stmt) self.builder.exit_finally_section(node) def visit_With(self, node): # TODO(mdan): Mark the context manager's exit call as exit guard. for item in node.items: self._process_basic_statement(item) for stmt in node.body: self.visit(stmt) def build(node): visitor = AstToCfg() visitor.visit(node) return visitor.cfgs