memory_util.py

from __future__ import absolute_import
from __future__ import print_function
import os
import re
import sys
import tempfile
import tensorflow as tf

debug_messages = False

def vlog(level):
  os.environ['TF_CPP_MIN_VLOG_LEVEL'] = str(level)

# this helper is here in case we later want to capture huge stderr that doesn't fit in RAM
class TemporaryFileHelper:
  """Provides a way to fetch contents of temporary file.""" 
  def __init__(self, temporary_file):
    self.temporary_file = temporary_file
  def getvalue(self):
    return open(self.temporary_file.name).read() 


STDOUT=1
STDERR=2
class capture_stderr:
  """Utility to capture output, use as follows
     with util.capture_stderr() as stderr:
        sess = tf.Session()

    print("Captured:", stderr.getvalue()).
    """

  def __init__(self, fd=STDERR):
    self.fd = fd
    self.prevfd = None

  def __enter__(self):
    t = tempfile.NamedTemporaryFile()
    self.prevfd = os.dup(self.fd)
    os.dup2(t.fileno(), self.fd)
    return TemporaryFileHelper(t)

  def __exit__(self, exc_type, exc_value, traceback):
    os.dup2(self.prevfd, self.fd)


################################################################################
# LOG_MEMORY_PARSING
################################################################################
# Until https://github.com/tensorflow/tensorflow/issues/6716 is resolved, the
# reliable way to get access to tensor deallocation information is to parse
# __LOG_MEMORY__ from VLOG print statements. This is sensitive to print order
# run unbuffered to prevent interleaving:
#   python -u script.py

# Regex'es to parse __LOG_MEMORY__ statements
# Each regex is preceded by an example of line it's meant to pass

# I 5143420588.000000 file tensorflow/core/framework/log_memory.cc:41] __LOG_MEMORY__ MemoryLogTensorAllocation { step_id: -6 kernel_name: "Unknown (from Proto)" tensor { dtype: DT_INT32 shape { dim { size: 3 } } allocation_description { requested_bytes: 12 allocated_bytes: 12 allocator_name: "cpu" allocation_id: 3 has_single_reference: true ptr: 29496256 } } }
tensor_allocation_regex = re.compile("""MemoryLogTensorAllocation.*?step_id: (?P<step_id>[-0123456789]+).*kernel_name: \"(?P<kernel_name>[^"]+)\".*?allocated_bytes: (?P<allocated_bytes>\d+).*allocator_name: \"(?P<allocator_name>[^"]+)\".*allocation_id: (?P<allocation_id>\d+).*""")

# I 6795349363.000000 file tensorflow/core/framework/log_memory.cc:41] __LOG_MEMORY__ MemoryLogRawAllocation { step_id: -3 operation: "TF_AllocateTensor" num_bytes: 1000000 ptr: 80910752 allocation_id: 99 allocator_name: "cpu" }
raw_allocation_regex = re.compile("""MemoryLogRawAllocation.*?step_id: (?P<step_id>[-0123456789]+).*operation: \"(?P<kernel_name>[^"]+)\".*?num_bytes: (?P<allocated_bytes>\d+).*allocation_id: (?P<allocation_id>\d+).*allocator_name: "(?P<allocator_name>[^"]+)".*""")

# I 5143420588.000000 file tensorflow/core/framework/log_memory.cc:41] __LOG_MEMORY__ MemoryLogTensorOutput { step_id: 1 kernel_name: "Const" tensor { dtype: DT_INT32 shape { dim { size: 3 } } allocation_description { requested_bytes: 12 allocated_bytes: 12 allocator_name: "cpu" allocation_id: 3 ptr: 29496256 } } }
# 2017-01-26 10:13:30: I tensorflow/core/framework/log_memory.cc:35] __LOG_MEMORY__ MemoryLogTensorOutput { step_id: 2 kernel_name: "a0" tensor { dtype: DT_FLOAT shape { dim { size: 250000 } } allocation_description { requested_bytes: 1000000 allocated_bytes: 1000192 allocator_name: "gpu_bfc" allocation_id: 3 ptr: 30076651520 } } }
#tensor_output_regex = re.compile("""MemoryLogTensorOutput.* step_id: (?P<step_id>[-0123456789]+) kernel_name: \"(?P<kernel_name>[^"]+).*allocated_bytes: (?P<allocated_bytes>\d+).*allocation_id: (?P<allocation_id>\d+).*""")   
tensor_output_regex = re.compile("""MemoryLogTensorOutput.* step_id: (?P<step_id>[-0123456789]+) kernel_name: \"(?P<kernel_name>[^"]+).*allocated_bytes: (?P<allocated_bytes>\d+).*allocator_name: \"(?P<allocator_name>[^"]+)\".*allocation_id: (?P<allocation_id>\d+).*""")

# some Shape lines are missing bytes info so have separate regex for them
# I 5162643141.000000 file tensorflow/core/framework/log_memory.cc:41] __LOG_MEMORY__ MemoryLogTensorOutput { step_id: 5 kernel_name: "gradients/Shape" tensor { dtype: DT_INT32 shape { dim { } } } }
tensor_output_regex_no_bytes = re.compile("""MemoryLogTensorOutput.* step_id: (?P<step_id>[-0123456789]+) kernel_name: \"(?P<kernel_name>[^"]+).*""")


# 5143420588.000000 file tensorflow/core/framework/log_memory.cc:41] __LOG_MEMORY__ MemoryLogTensorDeallocation { allocation_id: 2 allocator_name: "cpu" }
tensor_deallocation_regex = re.compile("""allocation_id: (?P<allocation_id>\d+).*allocator_name: \"(?P<allocator_name>[^"]+)\".*""")

# I 6796000229.000000 file tensorflow/core/framework/log_memory.cc:41] __LOG_MEMORY__ MemoryLogRawDeallocation { step_id: -3 operation: "TensorFlow C Api" allocation_id: 177 allocator_name: "cpu" }
raw_deallocation_regex = re.compile("""allocation_id: (?P<allocation_id>\d+).*allocator_name: \"(?P<allocator_name>[^"]+)\".*""")

# I 5143420588.000000 file tensorflow/core/framework/log_memory.cc:41] __LOG_MEMORY__ MemoryLogStep { step_id: 1 handle: "->Print:0//0/;0" }
tensor_logstep_regex = re.compile("""MemoryLogStep.*?step_id: (?P<step_id>[-0123456789]+).*""")


def _parse_logline(l):
    if 'MemoryLogTensorOutput' in l:
        m = tensor_output_regex.search(l)
        if not m:
            m = tensor_output_regex_no_bytes.search(l)

        assert m, l
        d = m.groupdict()
        d["type"] = "MemoryLogTensorOutput"
            
    elif 'MemoryLogTensorAllocation' in l:
        m = tensor_allocation_regex.search(l)

        # Broadcast args give weird allocation messages without size, ignore
        # I tensorflow/core/framework/log_memory.cc:35] __LOG_MEMORY__ MemoryLogTensorAllocation { step_id: 2 kernel_name: "gradients/node_5_grad/BroadcastGradientArgs" tensor { dtype: DT_INT32 shape { dim { } } } }
        if not m:
            return {"type": "MemoryLogTensorAllocation", "line": l,
                    "allocation_id": "-1"}

        assert m, l
        d = m.groupdict()
        d["type"] = "MemoryLogTensorAllocation"
        if debug_messages:
            print(("Got allocation for %s, %s"%(d["allocation_id"], d["kernel_name"])))
    elif 'MemoryLogTensorDeallocation' in l:
        m = tensor_deallocation_regex.search(l)
        assert m, l
        d = m.groupdict()
        d["type"] = "MemoryLogTensorDeallocation"
        if debug_messages:
            print(("Got deallocation for %s"%(d["allocation_id"])))
    elif 'MemoryLogStep' in l:
        m = tensor_logstep_regex.search(l)
        assert m, l
        d = m.groupdict()
        d["type"] = "MemoryLogStep"
    elif 'MemoryLogRawAllocation' in l:
        m = raw_allocation_regex.search(l)
        assert m, l
        d = m.groupdict()
        d["type"] = "MemoryLogRawAllocation"
    elif 'MemoryLogRawDeallocation' in l:
        m = raw_deallocation_regex.search(l)
        assert m, l
        d = m.groupdict()
        d["type"] = "MemoryLogRawDeallocation"
    else:
        assert False, "Unknown log line: "+l
        
    if not "allocation_id" in d:
        d["allocation_id"] = "-1"

    d["line"] = l
    return d

def memory_timeline(log):
    if hasattr(log, 'getvalue'):
        log = log.getvalue()
    
    def unique_alloc_id(line):
        if line["allocation_id"] == "-1":
            return "-1"
        return line["allocation_id"]+"-"+line["allocator_name"]
    
    def get_alloc_names(line):
        alloc_id = unique_alloc_id(line)
        for entry in reversed(allocation_map.get(alloc_id, [])):
            kernel_name = entry.get("kernel_name", "unknown")
            if not "unknown" in kernel_name:
                return kernel_name+"("+unique_alloc_id(line)+")"
        # couldn't find an allocation message with name of kernel
        return "("+alloc_id+")"

    def get_alloc_bytes(line):
        for entry in allocation_map.get(unique_alloc_id(line), []):
            if "allocated_bytes" in entry:
                return entry["allocated_bytes"]
        return "0"

    def get_alloc_type(line):
        for entry in allocation_map.get(unique_alloc_id(line), []):
            if "allocator_name" in entry:
                return entry["allocator_name"]
        return "0"

    parsed_lines = []
    for l in log.split("\n"):
        if 'LOG_MEMORY' in l: # and not 'step_id: -6' in l:
            parsed_lines.append(_parse_logline(l))

    allocation_map = {} # map of <allocation_id>-<allocator_name>->parsed_logline of allocation
    for line in parsed_lines:
        if (line["type"] == "MemoryLogTensorAllocation" or line["type"] == "MemoryLogRawAllocation" or
            line["type"] == "MemoryLogTensorOutput"):
            allocation_map.setdefault(unique_alloc_id(line), []).append(line)
    if debug_messages:
        print(allocation_map)
    result = []
    for i, line in enumerate(parsed_lines):
        # skip lines without allocation_id, ie lines like
        # I tensorflow/core/framework/log_memory.cc:35] __LOG_MEMORY__ MemoryLogStep { step_id: 2 handle: "->/gradients/a1_grad/TanhGrad/0/;1" }

        if int(line["allocation_id"]) == -1:
            continue
        alloc_names = get_alloc_names(line)
        # if line doesn't specify bytes, look in history if there was corresponding TensorOutput or TensorAllocation msg
        if int(line.get('allocated_bytes', -1)) < 0:
            alloc_bytes = get_alloc_bytes(line)
        else:
            alloc_bytes = line.get('allocated_bytes', -1)
        alloc_type = get_alloc_type(line)
        if line["type"] == "MemoryLogTensorOutput":
            continue
        if line["type"] == "MemoryLogTensorDeallocation" or line["type"]=="MemoryLogRawDeallocation":
            alloc_bytes = "-" + alloc_bytes
        result.append((i, alloc_names, alloc_bytes, alloc_type))
    return result

def peak_memory(log, gpu_only=False):
    """Peak memory used across all devices."""
    peak_memory = -123456789 # to catch bugs
    total_memory = 0
    for record in memory_timeline(log):
        i, kernel_name, allocated_bytes, allocator_type = record
        allocated_bytes = int(allocated_bytes)
        if gpu_only:
            if not allocator_type.startswith("gpu"):
                continue
        total_memory += allocated_bytes
        peak_memory = max(total_memory, peak_memory)
    return peak_memory
    
def print_memory_timeline(log, gpu_only=False, ignore_less_than_bytes=0):
      
    total_memory = 0
    for record in memory_timeline(log):
        i, kernel_name, allocated_bytes, allocator_type = record
        allocated_bytes = int(allocated_bytes)
        if gpu_only:
            if not allocator_type.startswith("gpu"):
                continue
        if abs(allocated_bytes)<ignore_less_than_bytes:
            continue  # ignore small allocations
        total_memory += allocated_bytes
        print(("%9d %42s %11d %11d %s"%(i, kernel_name, allocated_bytes, total_memory, allocator_type)))

import matplotlib.pyplot as plt
def plot_memory_timeline(log, gpu_only=False, ignore_less_than_bytes=1000):
      
    total_memory = 0
    timestamps = []
    data = []
    current_time = 0
    for record in memory_timeline(log):
        timestamp, kernel_name, allocated_bytes, allocator_type = record
        allocated_bytes = int(allocated_bytes)
        
        if abs(allocated_bytes)<ignore_less_than_bytes:
            continue  # ignore small allocations
        if gpu_only:
            if not record[3].startswith("gpu"):
                continue
        timestamps.append(current_time-.00000001)
        data.append(total_memory)
        total_memory += int(record[2])
        timestamps.append(current_time)
        data.append(total_memory)
        current_time+=1
    plt.plot(timestamps, data)

################################################################################
# smart initialize
################################################################################

def smart_initialize(variables=None, sess=None):
  """Initializes all uninitialized variables in correct order. Initializers
  are only run for uninitialized variables, so it's safe to run this multiple
  times.

  Args:
      sess: session to use. Use default session if None.
  """

  from tensorflow.contrib import graph_editor as ge
  def make_initializer(var): 
    def f():
      return tf.assign(var, var.initial_value).op
    return f
  
  def make_noop(): return tf.no_op()

  def make_safe_initializer(var):
    """Returns initializer op that only runs for uninitialized ops."""
    return tf.cond(tf.is_variable_initialized(var), make_noop,
                   make_initializer(var), name="safe_init_"+var.op.name).op

  if not sess:
    sess = tf.get_default_session()
  g = tf.get_default_graph()

  if not variables:
    variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
      
  safe_initializers = {}
  for v in variables:
    safe_initializers[v.op.name] = make_safe_initializer(v)
      
  # initializers access variable vaue through read-only value cached in
  # <varname>/read, so add control dependency to trigger safe_initializer
  # on read access
  for v in variables:
    var_name = v.op.name
    var_cache = g.get_operation_by_name(var_name+"/read")
    ge.reroute.add_control_inputs(var_cache, [safe_initializers[var_name]])

  sess.run(tf.group(*safe_initializers.values()))
    
  # remove initializer dependencies to avoid slowing down future variable reads
  for v in variables:
    var_name = v.op.name
    var_cache = g.get_operation_by_name(var_name+"/read")
    ge.reroute.remove_control_inputs(var_cache, [safe_initializers[var_name]])