Why?

A good question to ask is why? Why bother with this separation of processes? After all, the preceding code could be rewritten as the previous chapter's Predict function:

// Predict runs a single forward pass through the network and returns
// the index of the most strongly activated output neuron, or an error
// if a is not a vector or any tensor operation fails.
func (nn *NN) Predict(a tensor.Tensor) (int, error) {
	if a.Dims() != 1 {
		// The first return value is an int, so return -1 (not nil) on error.
		return -1, errors.New("expected a vector")
	}

	var m maybe
	act0 := m.sigmoid(m.matVecMul(nn.hidden, a))
	pred := m.sigmoid(m.matVecMul(nn.final, act0))
	if m.err != nil {
		return -1, m.err
	}
	return argmax(pred.Data().([]float64)), nil
}

Here, we define the network in Go, and when we run the Go code, the neural network is run as it is being defined. What's the problem we face that we need to introduce the idea of separating the definition of the neural network and running it? We've already seen the problem when we wrote the Train method.

If you recall, in the last chapter, I said that writing the Train method requires us to actually copy and paste code from the Predict method. To refresh your memory, here's the Train method:

// X is the image, Y is a one hot vector
func (nn *NN) Train(x, y tensor.Tensor, learnRate float64) (cost float64, err error) {
// predict
var m maybe
m.reshape(x, s.Shape()[0], 1)
m.reshape(y, 10, 1)
act0 := m.sigmoid(m.matmul(nn.hidden, x))
pred := m.sigmoid(m.matmul(nn.final, act0))

// backpropagation.
outputErrors := m.sub(y, pred))
cost = sum(outputErrors.Data().([]float64))

hidErrs := m.do(func() (tensor.Tensor, error) {
if err := nn.final.T(); err != nil {
return nil, err
}
defer nn.final.UT()
return tensor.MatMul(nn.final, outputErrors)
})
dpred := m.mul(m.dsigmoid(pred), outputErrors, tensor.UseUnsafe())
dpred_dfinal := m.dmatmul(outputErrors, act0)
if err := act0.T(); err != nil {
return nil, err
}
defer act0.UT()
return tensor.MatMul(outputErrors, act0)
})

m.reshape(m.mul(hidErrs, m.dsigmoid(act0), tensor.UseUnsafe()),
hidErrs.Shape()[0], 1)
dcost_dhidden := m.do(func() (tensor.Tensor, error) {
if err := x.T(); err != nil {
return nil, err
}
defer x.UT()
return tensor.MatMul(hidErrs, x)
})

// gradient update
m.mul(dpred_dfinal, learnRate, tensor.UseUnsafe())
m.mul(dcost_dhidden, learnRate, tensor.UseUnsafe())
m.add(nn.final, dpred_dfinal, tensor.UseUnsafe())
m.add(nn.hidden, dcost_dhidden, tensor.UseUnsafe())
return cost, m.err
}

Let's go through an exercise of refactoring to highlight the problem. Taking off our ML hat for a bit, and putting on our software engineer hat, let's see how we can refactor Train and Predict, even if conceptually. We see in the Train method that we need access to act0 and pred in order to backpropagate the errors. Where in Predict act0 and pred are terminal values (that is, we don't use them after the function has returned), in Train, they are not.

So, here, we can create a new method; let's call it fwd:

// fwd runs the forward pass through the network, returning both the
// hidden-layer activations (act0) and the output activations (pred) so
// that Train can reuse them for backpropagation.
func (nn *NN) fwd(x tensor.Tensor) (act0, pred tensor.Tensor, err error) {
	var m maybe
	m.reshape(x, x.Shape()[0], 1) // was s.Shape() — s is undefined here
	// Plain assignment, not := — act0 and pred are the named return
	// values; := would illegally redeclare them.
	act0 = m.sigmoid(m.matmul(nn.hidden, x))
	pred = m.sigmoid(m.matmul(nn.final, act0))
	return act0, pred, m.err
}

And we can refactor Predict to look like this:

// Predict runs the forward pass via fwd and returns the index of the
// most strongly activated output neuron.
func (nn *NN) Predict(a tensor.Tensor) (int, error) {
	if a.Dims() != 1 {
		// The first return value is an int, so return -1 (not nil) on error.
		return -1, errors.New("expected a vector")
	}

	var err error
	var pred tensor.Tensor
	if _, pred, err = nn.fwd(a); err != nil {
		return -1, err
	}
	return argmax(pred.Data().([]float64)), nil
}

And the Train method would look like this:

// X is the image, Y is a one hot vector
func (nn *NN) Train(x, y tensor.Tensor, learnRate float64) (cost float64, err error) {
// predict
var act0, pred tensor.Tensor
if act0, pred, err = nn.fwd(); err != nil {
return math.Inf(1), err
}

var m maybe
m.reshape(y, 10, 1)
// backpropagation.
outputErrors := m.sub(y, pred))
cost = sum(outputErrors.Data().([]float64))

hidErrs := m.do(func() (tensor.Tensor, error) {
if err := nn.final.T(); err != nil {
return nil, err
}
defer nn.final.UT()
return tensor.MatMul(nn.final, outputErrors)
})
dpred := m.mul(m.dsigmoid(pred), outputErrors, tensor.UseUnsafe())
dpred_dfinal := m.dmatmul(outputErrors, act0)
if err := act0.T(); err != nil {
return nil, err
}
defer act0.UT()
return tensor.MatMul(outputErrors, act0)
})

m.reshape(m.mul(hidErrs, m.dsigmoid(act0), tensor.UseUnsafe()),
hidErrs.Shape()[0], 1)
dcost_dhidden := m.do(func() (tensor.Tensor, error) {
if err := x.T(); err != nil {
return nil, err
}
defer x.UT()
return tensor.MatMul(hidErrs, x)
})

// gradient update
m.mul(dpred_dfinal, learnRate, tensor.UseUnsafe())
m.mul(dcost_dhidden, learnRate, tensor.UseUnsafe())
m.add(nn.final, dpred_dfinal, tensor.UseUnsafe())
m.add(nn.hidden, dcost_dhidden, tensor.UseUnsafe())
return cost, m.err
}

This looks better. What exactly are we doing here? We are programming. We are rearranging one form of syntax into another form of syntax but we are not changing the semantics, the meaning of the program. The refactored program has exactly the same meaning as the pre-refactored program.
