How to do it...

We proceed with the recipe as follows:

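  1. Clone the code from GitHub (the repository URL was lost from this command; substitute the URL of the repository that accompanies this recipe):
git clone <repository-url>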
  1. Define a Xavier initializer as described in the paper Understanding the difficulty of training deep feedforward neural networks (2010) by Xavier Glorot and Yoshua Bengio. This kind of initialization has been shown to give better convergence for GANs:
def xavier_init(size):
    """Return the Xavier-style standard deviation for a weight of shape `size`.

    Args:
        size: Shape of the weight; only ``size[0]`` (the input dimension)
            is read.

    Returns:
        ``1 / sqrt(size[0] / 2)``, computed with ``tf.sqrt``. Note that this
        is the standard deviation only, not an initialised tensor: callers
        are expected to pass it on themselves, e.g.
        ``tf.random_normal(shape=size, stddev=xavier_init(size))``.
    """
    in_dim = size[0]
    return 1. / tf.sqrt(in_dim / 2.)
  1. Define a convolutional operation for the given input x, weight w, bias b, and given stride. Our code uses the standard tf.nn.conv2d(...) module. Note that we use 'SAME' padding, as defined in Chapter 4:
def conv(x, w, b, stride, name):
    """Apply a 2-D convolution with 'SAME' padding and add a bias.

    Args:
        x: Input tensor passed to ``tf.nn.conv2d``.
        w: Convolution filter (kernel) tensor.
        b: Bias added to the convolution output.
        stride: Stride used for both middle dimensions of ``strides``.
        name: Name given to the ``conv2d`` op.

    Returns:
        The tensor ``conv2d(x, w) + b``.
    """
    with tf.variable_scope('conv'):
        # Log the parameter distributions for TensorBoard.
        tf.summary.histogram('weight', w)
        tf.summary.histogram('biases', b)
        return tf.nn.conv2d(x,
                            w,
                            strides=[1, stride, stride, 1],
                            padding='SAME',
                            name=name) + b
  1. Define a de-convolutional operation for the given input x, weight w, bias b, and given stride. Our code uses the standard tf.nn.conv2d_transpose(...) module. Again, we use 'SAME' padding.
def deconv(x, w, b, shape, stride, name):
    """Apply a transposed 2-D convolution with 'SAME' padding and add a bias.

    Args:
        x: Input tensor passed to ``tf.nn.conv2d_transpose``.
        w: Filter (kernel) tensor of the transposed convolution.
        b: Bias added to the output.
        shape: Output shape requested from ``conv2d_transpose``.
        stride: Stride used for both middle dimensions of ``strides``.
        name: Name given to the ``conv2d_transpose`` op.

    Returns:
        The tensor ``conv2d_transpose(x, w) + b``.
    """
    with tf.variable_scope('deconv'):
        # Log the parameter distributions for TensorBoard.
        tf.summary.histogram('weight', w)
        tf.summary.histogram('biases', b)
        return tf.nn.conv2d_transpose(x,
                                      w,
                                      output_shape=shape,
                                      strides=[1, stride, stride, 1],
                                      padding='SAME',
                                      name=name) + b
  1. Define a standard LeakyReLU, which is a very effective activation function for GANs:
def lrelu(x, alpha=0.2):
    """Leaky ReLU: return the element-wise maximum of ``x`` and ``alpha * x``.

    Args:
        x: Input tensor.
        alpha: Slope applied to ``x`` in the second operand (default 0.2).

    Returns:
        ``tf.maximum(x, alpha * x)``.
    """
    with tf.variable_scope('leakyReLU'):
        return tf.maximum(x, alpha * x)
  1. Define the generator. First, we define a fully connected layer with input size 100 (an arbitrary size for Z, the initial noise used by our generator). The fully connected layer consists of a matrix W1 with the dimension [100, 7*7*256], initialized according to a normal distribution, and a bias B1 with the dimension [7*7*256]. The layer uses LeakyReLU as its activation function. After the fully connected layer, the generator applies two deconvolutional operations, deconv1 and deconv2, both with stride=2. After the first operation, deconv1, the results are batch normalized. Note that the second deconvolutional operation is preceded by dropout, with a probability of 40 percent. The last stage is a sigmoid used as the non-linear activation, as reported in the code snippet below:
def generator(X, batch_size=64):
    """Map a batch of noise vectors to flattened 28x28 images.

    Args:
        X: Noise tensor; it is multiplied by a [100, 7*7*256] matrix, so its
            second dimension must be 100.
        batch_size: Batch size baked into the reshape and into the output
            shapes of both transposed convolutions. The graph therefore only
            accepts inputs with exactly this many rows.

    Returns:
        A tensor reshaped to [-1, 28*28] and passed through a sigmoid.
    """
    with tf.variable_scope('generator'):
        K = 256  # channels after the fully connected layer
        M = 64   # channels after the first transposed convolution

        W1 = tf.get_variable('G_W1', [100, 7*7*K], initializer=tf.random_normal_initializer(stddev=0.1))
        B1 = tf.get_variable('G_B1', [7*7*K], initializer=tf.constant_initializer())
        W2 = tf.get_variable('G_W2', [4, 4, M, K], initializer=tf.random_normal_initializer(stddev=0.1))
        B2 = tf.get_variable('G_B2', [M], initializer=tf.constant_initializer())
        W3 = tf.get_variable('G_W3', [4, 4, 1, M], initializer=tf.random_normal_initializer(stddev=0.1))
        B3 = tf.get_variable('G_B3', [1], initializer=tf.constant_initializer())

        # Fully connected layer, then reshape into a 7x7 feature map.
        X = lrelu(tf.matmul(X, W1) + B1)
        X = tf.reshape(X, [batch_size, 7, 7, K])

        # 7x7 -> 14x14, followed by batch normalisation.
        deconv1 = deconv(X, W2, B2, shape=[batch_size, 14, 14, M], stride=2, name='deconv1')
        bn1 = tf.contrib.layers.batch_norm(deconv1)

        # 14x14 -> 28x28 with a single output channel.
        # NOTE(review): in TF 1.x the second positional argument of
        # tf.nn.dropout is keep_prob, so 0.4 keeps 40% of the units --
        # confirm that this is the intended rate.
        deconv2 = deconv(tf.nn.dropout(lrelu(bn1), 0.4), W3, B3, shape=[batch_size, 28, 28, 1], stride=2, name='deconv2')

        XX = tf.reshape(deconv2, [-1, 28*28], 'reshape')
        return tf.nn.sigmoid(XX)
  1. Define the discriminator. As in the previous recipe, if the parameter reuse is true, then we call scope.reuse_variables() to trigger a reuse. The discriminator uses two convolutional layers. The first one is followed by batch normalization, while the second one is preceded by a dropout with a probability of 40 percent and followed again by a batch normalization step. After that, we have a dense layer with the LeakyReLU activation function, followed by another dense layer whose output is passed through a sigmoid:
def discriminator(X, reuse=False):
    """Score a batch of flattened 28x28 images as real or generated.

    Args:
        X: Image tensor; it is reshaped to [-1, 28, 28, 1].
        reuse: When true, the variables already created in the
            'discriminator' scope are reused instead of created anew. Pass
            True for every call after the first one.

    Returns:
        A tuple ``(prob, logits)`` where ``logits`` is the output of the last
        dense layer and ``prob`` is its sigmoid.
    """
    with tf.variable_scope('discriminator') as scope:
        if reuse:
            # Share the weights with the first discriminator instance.
            scope.reuse_variables()

        K = 64   # channels after the first convolution
        M = 128  # channels after the second convolution
        N = 256  # units in the hidden dense layer

        W1 = tf.get_variable('D_W1', [4, 4, 1, K], initializer=tf.random_normal_initializer(stddev=0.1))
        B1 = tf.get_variable('D_B1', [K], initializer=tf.constant_initializer())
        W2 = tf.get_variable('D_W2', [4, 4, K, M], initializer=tf.random_normal_initializer(stddev=0.1))
        B2 = tf.get_variable('D_B2', [M], initializer=tf.constant_initializer())
        W3 = tf.get_variable('D_W3', [7*7*M, N], initializer=tf.random_normal_initializer(stddev=0.1))
        B3 = tf.get_variable('D_B3', [N], initializer=tf.constant_initializer())
        W4 = tf.get_variable('D_W4', [N, 1], initializer=tf.random_normal_initializer(stddev=0.1))
        B4 = tf.get_variable('D_B4', [1], initializer=tf.constant_initializer())

        X = tf.reshape(X, [-1, 28, 28, 1], 'reshape')

        # Two stride-2 convolutions, each followed by batch normalisation.
        # NOTE(review): in TF 1.x the second positional argument of
        # tf.nn.dropout is keep_prob, so 0.4 keeps 40% of the units --
        # confirm that this is the intended rate.
        conv1 = conv(X, W1, B1, stride=2, name='conv1')
        bn1 = tf.contrib.layers.batch_norm(conv1)
        conv2 = conv(tf.nn.dropout(lrelu(bn1), 0.4), W2, B2, stride=2, name='conv2')
        bn2 = tf.contrib.layers.batch_norm(conv2)

        # Flatten the 7x7xM feature map and apply the two dense layers.
        flat = tf.reshape(tf.nn.dropout(lrelu(bn2), 0.4), [-1, 7*7*M], name='flat')
        dense = lrelu(tf.matmul(flat, W3) + B3)
        logits = tf.matmul(dense, W4) + B4
        prob = tf.nn.sigmoid(logits)
        return prob, logits
  1. Then we read the data from the MNIST dataset, and define an auxiliary function for plotting samples:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import os
import argparse

def read_data():
    """Load the MNIST dataset with one-hot labels.

    Returns:
        The dataset object returned by ``input_data.read_data_sets`` for the
        directory '../MNIST_data/'.
    """
    from tensorflow.examples.tutorials.mnist import input_data
    return input_data.read_data_sets("../MNIST_data/", one_hot=True)

def plot(samples):
    """Draw samples as grey-scale 28x28 images on an 8 x 8 grid.

    Args:
        samples: Iterable of arrays, each of which can be reshaped to
            (28, 28). The grid has 64 cells, so at most 64 samples can be
            drawn.

    Returns:
        The matplotlib figure containing the grid.
    """
    fig = plt.figure(figsize=(8, 8))
    gs = gridspec.GridSpec(8, 8)
    gs.update(wspace=0.05, hspace=0.05)
    for i, sample in enumerate(samples):
        # Make cell i the current axes, then draw the image into it.
        plt.subplot(gs[i])
        plt.imshow(sample.reshape(28, 28), cmap='Greys_r')
    return fig
  1. Now let's define the training function. First, let's read the MNIST data, and then define a matrix X of shape 28 x 28 with one channel for a standard MNIST handwritten character. Then let's define a noise vector z of size 100, a common choice proposed in the seminal GANs paper. The next step is to call the generator on z and assign the result to G. After that, we pass X to the discriminator without reuse. Then we pass the forged/fake G result to the discriminator, reusing the learned weights. One important aspect of this is how we choose the loss function for the discriminator, which is a sum of two cross entropies: one for real characters, where all the real MNIST characters have a label set to one, and one for forged characters, where all the forged characters have a label set to zero. The discriminator and the generator run in an alternating sequence for 100,000 steps. Every 500 steps, a sample is drawn from the learned distribution to show what the generator has learned so far. This is what defines a new epoch, and the results are shown in the next section. The training function code snippet is reported below:
def train(logdir, batch_size):
    """Build the convolutional GAN graph and train it on MNIST.

    Args:
        logdir: Suffix appended to 'tmp/gan_conv_' to form the TensorBoard
            log directory.
        batch_size: Number of images and noise vectors per training step.
            The generator graph is built for exactly this batch size.

    Side effects:
        Creates 'output/' if needed and saves a grid of generated samples
        there every 500 steps; writes TensorBoard summaries and prints the
        current losses every 100 steps.
    """
    from model_conv import discriminator, generator
    mnist = read_data()

    with tf.variable_scope('placeholder'):
        # Raw image
        X = tf.placeholder(tf.float32, [None, 784])
        tf.summary.image('raw image', tf.reshape(X, [-1, 28, 28, 1]), 3)
        # Noise
        z = tf.placeholder(tf.float32, [None, 100])
        tf.summary.histogram('Noise', z)

    with tf.variable_scope('GAN'):
        G = generator(z, batch_size)
        D_real, D_real_logits = discriminator(X, reuse=False)
        # The second discriminator shares its weights with the first one.
        D_fake, D_fake_logits = discriminator(G, reuse=True)
    tf.summary.image('generated image', tf.reshape(G, [-1, 28, 28, 1]), 3)

    with tf.variable_scope('Prediction'):
        tf.summary.histogram('real', D_real)
        tf.summary.histogram('fake', D_fake)

    with tf.variable_scope('D_loss'):
        # Real images are labelled 1, generated images 0.
        d_loss_real = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(
                logits=D_real_logits, labels=tf.ones_like(D_real_logits)))

        d_loss_fake = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(
                logits=D_fake_logits, labels=tf.zeros_like(D_fake_logits)))
        d_loss = d_loss_real + d_loss_fake
        tf.summary.scalar('d_loss_real', d_loss_real)
        tf.summary.scalar('d_loss_fake', d_loss_fake)
        tf.summary.scalar('d_loss', d_loss)

    with tf.name_scope('G_loss'):
        # The generator is rewarded when its images are classified as real.
        g_loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(
                logits=D_fake_logits, labels=tf.ones_like(D_fake_logits)))
        tf.summary.scalar('g_loss', g_loss)

    # Split the trainable variables by scope name so that each optimizer
    # only updates its own network.
    tvar = tf.trainable_variables()
    dvar = [var for var in tvar if 'discriminator' in var.name]
    gvar = [var for var in tvar if 'generator' in var.name]

    with tf.name_scope('train'):
        d_train_step = tf.train.AdamOptimizer().minimize(d_loss, var_list=dvar)
        g_train_step = tf.train.AdamOptimizer().minimize(g_loss, var_list=gvar)

    init = tf.global_variables_initializer()
    merged_summary = tf.summary.merge_all()
    writer = tf.summary.FileWriter('tmp/'+'gan_conv_'+logdir)
    num_img = 0
    # The sample grid drawn by plot() has 8 x 8 = 64 cells.
    max_plotted = 64

    if not os.path.exists('output/'):
        os.makedirs('output/')

    with tf.Session() as sess:
        sess.run(init)

        for i in range(100000):
            batch_X, _ = mnist.train.next_batch(batch_size)
            batch_noise = np.random.uniform(-1., 1., [batch_size, 100])

            if i % 500 == 0:
                # The generator graph is fixed to batch_size rows, so the
                # sample noise must have that many rows as well.
                sample_noise = np.random.uniform(-1., 1., [batch_size, 100])
                samples = sess.run(G, feed_dict={z: sample_noise})
                fig = plot(samples[:max_plotted])
                plt.savefig('output/%s.png' % str(num_img).zfill(3), bbox_inches='tight')
                num_img += 1
                # Release the figure; otherwise one figure per sample leaks.
                plt.close(fig)

            # Alternate one discriminator step and one generator step.
            _, d_loss_print = sess.run([d_train_step, d_loss],
                                       feed_dict={X: batch_X, z: batch_noise})
            _, g_loss_print = sess.run([g_train_step, g_loss],
                                       feed_dict={z: batch_noise})

            if i % 100 == 0:
                s = sess.run(merged_summary, feed_dict={X: batch_X, z: batch_noise})
                writer.add_summary(s, i)
                print('epoch:%d g_loss:%f d_loss:%f' % (i, g_loss_print, d_loss_print))

    writer.close()

if __name__ == '__main__':
    # Command-line entry point: read the TensorBoard log suffix and the batch
    # size, then start training.
    parser = argparse.ArgumentParser(description='Train vanila GAN using convolutional networks')
    parser.add_argument('--logdir', type=str, default='1', help='logdir for Tensorboard, give a string')
    parser.add_argument('--batch_size', type=int, default=64, help='batch size: give a int')
    args = parser.parse_args()
    train(logdir=args.logdir, batch_size=args.batch_size)
