Performing a fine-tune on the model

To fine-tune the model, we need to know which model is the latest and which checkpoint holds the weights and biases to restore. We therefore call the /model endpoint to get the checkpoint name and the version number:

    def get_latest_model(url):
        """Fetch the latest registered model from the server.

        Sends a GET request to ``<url>/model`` and reads the ``ckpt_name`` and
        ``version`` fields from the JSON response.

        Args:
            url: Base URL of the server; ``/model`` is appended to it.

        Returns:
            A ``(ckpt_name, version)`` tuple, with ``version`` converted to int.

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status.
        """
        response = requests.get("%s/model" % url)
        # Fail here on an error status rather than with a confusing KeyError
        # (or JSON decoding error) further down.
        response.raise_for_status()
        data = json.loads(response.text)
        print(data)
        return data["ckpt_name"], int(data["version"])

The response JSON should look like this:

    { 
     "ckpt_name": "2017-05-26_02-12-49",  
     "id": 10,  
     "link": "http://1.53.110.161:8181/pet-model/8.zip",  
     "name": "pet-model",  
     "version": 8 
    }

Now, we will implement the code to fine-tune the model. Let's start with some parameters:

    # Server info: the server whose /model endpoint tracks the latest model
    URL = "http://localhost:5000"
    dest_api = URL + "/model"

    # Server Endpoints: base URL used to build the download link of the
    # archived model
    source_api = "http://1.53.110.161:8181"

    # Dataset
    dataset_dir = "data/train_data"
    user_dir = "data/user_data"
    batch_size = 64
    image_size = 224

    # Learning rate
    initial_learning_rate = 0.0001
    decay_steps = 250
    decay_rate = 0.9

    # Validation
    output_steps = 10  # Number of steps to print output
    eval_steps = 20  # Number of steps to perform evaluations

    # Training
    max_steps = 3000  # Number of steps to perform training
    save_steps = 200  # Number of steps to perform saving checkpoints
    num_tests = 5  # Number of times to test for test accuracy
    max_checkpoints_to_keep = 1
    save_dir = "data/checkpoints"
    # Comma-separated names of the variables passed to models.train
    train_vars = 'models/fc8-pets/weights:0,models/fc8-pets/biases:0'

    # Get the latest model
    last_checkpoint_name, last_version = get_latest_model(URL)
    last_checkpoint_dir = os.path.join(save_dir, last_checkpoint_name)

    # Export: the new model is exported as the next version number
    export_dir = "/home/ubuntu/models/"
    export_name = "pet-model"
    export_version = last_version + 1

Then, we will implement the fine-tune loop. In the following code, we call download_user_data to download all the user-labeled images and pass user_dir into input_pipeline so that it will load the new images:

    # Download user-labels data
    download_user_data(URL, user_dir)

    # Both pipelines receive user_dir so that the user-labeled images are loaded.
    images, labels = datasets.input_pipeline(
        dataset_dir, batch_size, is_training=True, user_dir=user_dir)
    test_images, test_labels = datasets.input_pipeline(
        dataset_dir, batch_size, is_training=False, user_dir=user_dir)

    # Build the training and the test graph in the same variable scope;
    # reuse_variables() makes the second inference call share the variables.
    with tf.variable_scope("models") as scope:
        logits = nets.inference(images, is_training=True)
        scope.reuse_variables()
        test_logits = nets.inference(test_images, is_training=False)

    total_loss = models.compute_loss(logits, labels)
    train_accuracy = models.compute_accuracy(logits, labels)
    test_accuracy = models.compute_accuracy(test_logits, test_labels)

    global_step = tf.Variable(0, trainable=False)
    learning_rate = models.get_learning_rate(
        global_step, initial_learning_rate, decay_steps, decay_rate)
    train_op = models.train(total_loss, learning_rate, global_step, train_vars)

    saver = tf.train.Saver(max_to_keep=max_checkpoints_to_keep)
    # Each run saves into its own timestamped directory under save_dir.
    checkpoint_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    checkpoints_dir = os.path.join(save_dir, checkpoint_name)
    if not os.path.exists(checkpoints_dir):
        # makedirs also creates save_dir (and its parents) when missing.
        os.makedirs(checkpoints_dir)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        coords = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coords)

        try:
            # Start from the latest model's checkpoint, then reset the step
            # counter that was restored along with it.
            saver.restore(
                sess, models.get_model_path_from_ckpt(last_checkpoint_dir))
            sess.run(global_step.assign(0))

            # Baseline: average test accuracy of the restored model.
            last_saved_test_accuracy = 0
            for _ in range(num_tests):
                last_saved_test_accuracy += sess.run(test_accuracy)
            last_saved_test_accuracy /= num_tests
            should_export = False
            print("Last model test accuracy {}".format(
                last_saved_test_accuracy))

            for i in tqdm(range(max_steps), desc="training"):
                _, loss_value, lr_value = sess.run(
                    [train_op, total_loss, learning_rate])

                if (i + 1) % output_steps == 0:
                    print("Steps {}: Loss = {:.5f} Learning Rate = {}".format(
                        i + 1, loss_value, lr_value))

                if (i + 1) % eval_steps == 0:
                    test_acc, train_acc, loss_value = sess.run(
                        [test_accuracy, train_accuracy, total_loss])
                    print("Test accuracy {} Train accuracy {} : Loss = "
                          "{:.5f}".format(test_acc, train_acc, loss_value))

                if (i + 1) % save_steps == 0 or i == max_steps - 1:
                    # Average over num_tests runs. The inner loop uses "_" so
                    # that it does not overwrite the training step index i,
                    # and the division happens once, after the loop.
                    test_acc = 0
                    for _ in range(num_tests):
                        test_acc += sess.run(test_accuracy)
                    test_acc /= num_tests

                    # Only save a checkpoint that beats the best one so far.
                    if test_acc > last_saved_test_accuracy:
                        print("Save steps: Test Accuracy {} is higher than "
                              "{}".format(test_acc, last_saved_test_accuracy))
                        last_saved_test_accuracy = test_acc
                        saved_file = saver.save(
                            sess,
                            os.path.join(checkpoints_dir, 'model.ckpt'),
                            global_step=global_step)
                        should_export = True
                        print("Save steps: Save to file %s " % saved_file)
                    else:
                        print("Save steps: Test Accuracy {} is not higher "
                              "than {}".format(test_acc,
                                               last_saved_test_accuracy))

            # Export and publish only when this run produced a better model.
            if should_export:
                print("Export model with accuracy ",
                      last_saved_test_accuracy)
                models.export_model(checkpoints_dir, export_dir,
                                    export_name, export_version)
                archive_and_send_file(source_api, dest_api,
                                      checkpoint_name, export_dir,
                                      export_name, export_version)
        finally:
            # Stop the queue-runner threads even if training raised.
            coords.request_stop()
            coords.join(threads)

The other parts are quite similar to the training loop. However, instead of loading the weights from the Caffe model, we restore the checkpoint of the latest model and run the test a few times to get its average test accuracy, which serves as the baseline that the fine-tuned model must beat.

At the end of the fine-tune loop, we need a new method named archive_and_send_file to make an archive from the exported model and send the link to the production server:

    def make_archive(dir_path):
        """Zip the contents of ``dir_path`` into ``<dir_path>.zip``.

        Args:
            dir_path: Directory whose contents are archived.

        Returns:
            The path of the created archive, as returned by
            ``shutil.make_archive``.
        """
        # Drop any trailing separator: with "models/2/" the archive would
        # otherwise be written as "models/2/.zip", inside the directory that
        # is being archived.
        dir_path = os.path.normpath(dir_path)
        return shutil.make_archive(dir_path, 'zip', dir_path)
 
 
    def archive_and_send_file(source_api, dest_api, ckpt_name,
                              export_dir, export_name, export_version):
        """Zip an exported model and post its download link to ``dest_api``.

        The archived directory is ``<export_dir>/<export_name>/<export_version>``.
        The posted link is ``<source_api>/<export_name>/<export_version>.zip``.

        Args:
            source_api: Base URL used to build the download link.
            dest_api: URL that receives the POST request.
            ckpt_name: Checkpoint name sent along with the link.
            export_dir: Root directory of the exported models.
            export_name: Name of the exported model.
            export_version: Version of the exported model.
        """
        model_dir = os.path.join(export_dir, export_name,
                                 str(export_version))
        file_path = make_archive(model_dir)
        print("Zip model: ", file_path)

        data = {
            "link": "{}/{}/{}".format(source_api, export_name,
                                      str(export_version) + ".zip"),
            "ckpt_name": ckpt_name,
            "version": export_version,
            "name": export_name,
        }
        r = requests.post(dest_api, data=data)
        print("send_file", r.text)

You should note that we create a link with the source_api parameter, which is the link to the training server, http://1.53.110.161:8181. We will set up a simple Apache Server to support this function. However, in reality, we suggest that you upload the archived model to cloud storage such as Amazon S3. Now, we will show you the simplest way with Apache.

We need to install Apache with the following command:

sudo apt-get install apache2

Now, in /etc/apache2/ports.conf, on line 6, we need to add this code to make apache2 listen on port 8181:

    Listen 8181

Then, add the following code at the beginning of /etc/apache2/sites-available/000-default.conf to support downloading from the /home/ubuntu/models directory:

    <VirtualHost *:8181>
      DocumentRoot "/home/ubuntu/models"
      # Grant access to the exported-models directory only. Granting it on
      # <Directory /> would open up the whole filesystem root for this host.
      <Directory "/home/ubuntu/models">
        Require all granted
      </Directory>
    </VirtualHost>

Finally, we need to restart the apache2 server:

sudo service apache2 restart

Up to now, we have set up all the code to perform fine-tuning. Before running the fine-tuning for the first time, we need to send a POST request to the /model endpoint with the information about our first model because we have already copied the model to the production server.

In the project repository, let's run the finetune script:

python scripts/finetune.py

The last few lines in the console will look like the following:

    Save steps: Test Accuracy 0.916875 is higher than 0.84
    Save steps: Save to file data/checkpoints/2017-05-29_18-46-43/model.ckpt-2000
    ('Export model with accuracy ', 0.916875000000004)
    2017-05-29 18:47:31.642729: I tensorflow/core/common_runtime/gpu/gpu_device.cc:977] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GeForce GTX TITAN X, pci bus id: 0000:01:00.0)
    ('Exported model at', '/home/ubuntu/models/pet-model/2')
    ('Zip model: ', '/home/ubuntu/models/pet-model/2.zip')
    ('send_file', u'{
  "ckpt_name": "2017-05-29_18-46-43", 
  "id": 2, 
  "link": "http://1.53.110.161:8181/pet-model/2.zip", 
  "name": "pet-model", 
  "version": 2
}
')

As you can see, the new model has a test accuracy of about 91%. The model is also exported and archived to /home/ubuntu/models/pet-model/2.zip. The code then calls the /model endpoint to post the link to the production server. In the log of the Flask app on the production server, we will get the following results:

('Start downloading', u'http://1.53.110.161:8181/pet-model/2.zip')
('Downloaded file at', u'/tmp/2.zip')
('Extracted at', u'/home/ubuntu/productions/2')
127.0.0.1 - - [29/May/2017 18:49:05] "POST /model HTTP/1.1" 200 -

This means that our Flask app has downloaded the 2.zip file from the training server and extracted its content to /home/ubuntu/productions/2. In the tmux session for TensorFlow Serving, you will also get the following results:

    2017-05-29 18:49:06.234808: I tensorflow_serving/core/loader_harness.cc:86] Successfully loaded servable version {name: pet-model version: 2}
    2017-05-29 18:49:06.234840: I tensorflow_serving/core/loader_harness.cc:137] Quiescing servable version {name: pet-model version: 1}
    2017-05-29 18:49:06.234848: I tensorflow_serving/core/loader_harness.cc:144] Done quiescing servable version {name: pet-model version: 1}
    2017-05-29 18:49:06.234853: I tensorflow_serving/core/loader_harness.cc:119] Unloading servable version {name: pet-model version: 1}
    2017-05-29 18:49:06.240118: I ./tensorflow_serving/core/simple_loader.h:226] Calling MallocExtension_ReleaseToSystem() with 645327546
    2017-05-29 18:49:06.240155: I tensorflow_serving/core/loader_harness.cc:127] Done unloading servable version {name: pet-model version: 1}

This output indicates that the TensorFlow model server has successfully loaded version 2 of the pet-model and unloaded version 1. This also means that we have served the new model, which was trained on the training server and sent to the production server via the /model endpoint.

Table of Contents for Performing a fine-tune on the model

Create new playlist

Sign In

Sign Up

Table of Contents for
Performing a fine-tune on the model