To fine-tune the model, we need to know which model is the latest and which checkpoint it corresponds to, so that we can restore its weights and biases. Therefore, we call the /model endpoint to get the checkpoint name and the version number:
def get_latest_model(url):
    """Fetch the latest model's checkpoint name and version from the server.

    Sends a GET request to ``<url>/model`` and reads the ``ckpt_name`` and
    ``version`` fields of the JSON response.

    Args:
        url: Base URL of the server, without a trailing slash
            (``/model`` is appended to it).

    Returns:
        A ``(ckpt_name, version)`` tuple, with ``version`` converted to int.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = requests.get("%s/model" % url)
    # Fail early on an HTTP error status instead of trying to parse an
    # error page as JSON and failing later with a misleading exception.
    response.raise_for_status()
    data = json.loads(response.text)
    print(data)
    return data["ckpt_name"], int(data["version"])
The response JSON should look like this:
{ "ckpt_name": "2017-05-26_02-12-49", "id": 10, "link": "http://1.53.110.161:8181/pet-model/8.zip", "name": "pet-model", "version": 8 }
Now, we will implement the code to fine-tune the model. Let's start with some parameters:
# Server info
URL = "http://localhost:5000"
# Endpoint that receives the POST describing the newly exported model.
dest_api = URL + "/model"

# Server Endpoints
# Base URL used to build the download link of the archived model.
source_api = "http://1.53.110.161:8181"

# Dataset
dataset_dir = "data/train_data"
user_dir = "data/user_data"  # Where the user-labeled images are downloaded to
batch_size = 64
image_size = 224

# Learning rate
initial_learning_rate = 0.0001
decay_steps = 250
decay_rate = 0.9

# Validation
output_steps = 10  # Number of steps to print output
eval_steps = 20  # Number of steps to perform evaluations

# Training
max_steps = 3000  # Number of steps to perform training
save_steps = 200  # Number of steps to perform saving checkpoints
num_tests = 5  # Number of times to test for test accuracy
max_checkpoints_to_keep = 1
save_dir = "data/checkpoints"
# Comma-separated names of the variables handed to models.train. The names
# must not contain whitespace, otherwise they would not match the variables.
train_vars = 'models/fc8-pets/weights:0,models/fc8-pets/biases:0'

# Get the latest model
last_checkpoint_name, last_version = get_latest_model(URL)
last_checkpoint_dir = os.path.join(save_dir, last_checkpoint_name)

# Export
export_dir = "/home/ubuntu/models/"
export_name = "pet-model"
# The fine-tuned model is exported as the version after the latest one.
export_version = last_version + 1
Then, we will implement the fine-tune loop. In the following code, we call download_user_data to download all the user-labeled images and pass user_dir into input_pipeline so that it will load the new images:
# Download user-labels data
download_user_data(URL, user_dir)

# Build the training and test input pipelines; both are given user_dir so
# that the newly downloaded user-labeled images are included.
images, labels = datasets.input_pipeline(dataset_dir, batch_size, is_training=True, user_dir=user_dir)
test_images, test_labels = datasets.input_pipeline(dataset_dir, batch_size, is_training=False, user_dir=user_dir)

with tf.variable_scope("models") as scope:
    logits = nets.inference(images, is_training=True)
    # Reuse the variables created above so that the test network shares
    # its weights with the training network.
    scope.reuse_variables()
    test_logits = nets.inference(test_images, is_training=False)

total_loss = models.compute_loss(logits, labels)
train_accuracy = models.compute_accuracy(logits, labels)
test_accuracy = models.compute_accuracy(test_logits, test_labels)

global_step = tf.Variable(0, trainable=False)
learning_rate = models.get_learning_rate(global_step, initial_learning_rate, decay_steps, decay_rate)
train_op = models.train(total_loss, learning_rate, global_step, train_vars)

saver = tf.train.Saver(max_to_keep=max_checkpoints_to_keep)

# Every fine-tune run writes its checkpoints to a new timestamped directory.
checkpoint_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
checkpoints_dir = os.path.join(save_dir, checkpoint_name)
if not os.path.exists(save_dir):
    os.mkdir(save_dir)
if not os.path.exists(checkpoints_dir):
    os.mkdir(checkpoints_dir)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    coords = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coords)

    # Make sure the queue-runner threads are stopped even if restoring,
    # training or exporting raises.
    try:
        # Start from the weights of the latest model and reset the step
        # counter for this run.
        saver.restore(sess, models.get_model_path_from_ckpt(last_checkpoint_dir))
        sess.run(global_step.assign(0))

        # Baseline: average test accuracy of the latest model over
        # num_tests evaluations.
        last_saved_test_accuracy = 0
        for _ in range(num_tests):
            last_saved_test_accuracy += sess.run(test_accuracy)
        last_saved_test_accuracy /= num_tests
        should_export = False
        print("Last model test accuracy {}".format(last_saved_test_accuracy))

        for i in tqdm(range(max_steps), desc="training"):
            _, loss_value, lr_value = sess.run([train_op, total_loss, learning_rate])

            if (i + 1) % output_steps == 0:
                print("Steps {}: Loss = {:.5f} Learning Rate = {}".format(i + 1, loss_value, lr_value))

            if (i + 1) % eval_steps == 0:
                test_acc, train_acc, loss_value = sess.run([test_accuracy, train_accuracy, total_loss])
                print("Test accuracy {} Train accuracy {} : Loss = {:.5f}".format(test_acc, train_acc, loss_value))

            if (i + 1) % save_steps == 0 or i == max_steps - 1:
                # Average the test accuracy over num_tests evaluations. The
                # loop index is a throwaway name so that it does not shadow
                # the training step counter i.
                test_acc = 0
                for _ in range(num_tests):
                    test_acc += sess.run(test_accuracy)
                test_acc /= num_tests

                # Only keep a checkpoint that beats the best accuracy so far.
                if test_acc > last_saved_test_accuracy:
                    print("Save steps: Test Accuracy {} is higher than {}".format(test_acc, last_saved_test_accuracy))
                    last_saved_test_accuracy = test_acc
                    saved_file = saver.save(sess, os.path.join(checkpoints_dir, 'model.ckpt'), global_step=global_step)
                    should_export = True
                    print("Save steps: Save to file %s " % saved_file)
                else:
                    print("Save steps: Test Accuracy {} is not higher than {}".format(test_acc, last_saved_test_accuracy))

        # Export and publish only if at least one checkpoint improved on
        # the latest model.
        if should_export:
            print("Export model with accuracy ", last_saved_test_accuracy)
            models.export_model(checkpoints_dir, export_dir, export_name, export_version)
            archive_and_send_file(source_api, dest_api, checkpoint_name, export_dir, export_name, export_version)
    finally:
        coords.request_stop()
        coords.join(threads)
Other parts are quite similar to the training loop. However, instead of loading the weights from the caffe model, we use the checkpoint of the latest model and run the test a few times to get its test accuracy.
At the end of the fine-tune loop, we need a new method named archive_and_send_file to make an archive from the exported model and send the link to the production server:
def make_archive(dir_path):
    """Zip the contents of ``dir_path`` and return the archive's path.

    The archive is created next to the directory, as ``<dir_path>.zip``.
    """
    archive_format = 'zip'
    return shutil.make_archive(dir_path, archive_format, dir_path)


def archive_and_send_file(source_api, dest_api, ckpt_name, export_dir, export_name, export_version):
    """Archive an exported model and post its download link to ``dest_api``.

    Args:
        source_api: Base URL under which the archive can be downloaded.
        dest_api: URL that receives the POST request describing the model.
        ckpt_name: Name of the checkpoint the model was exported from.
        export_dir: Root directory of the exported models.
        export_name: Name of the model (sub-directory of ``export_dir``).
        export_version: Version of the model (sub-directory of the model).
    """
    version_str = str(export_version)

    # The exported model lives in <export_dir>/<export_name>/<version>.
    archive_path = make_archive(os.path.join(export_dir, export_name, version_str))
    print("Zip model: ", archive_path)

    download_link = "{}/{}/{}".format(source_api, export_name, version_str + ".zip")
    payload = {
        "link": download_link,
        "ckpt_name": ckpt_name,
        "version": export_version,
        "name": export_name,
    }

    reply = requests.post(dest_api, data=payload)
    print("send_file", reply.text)
You should note that we create a link with the source_api parameter, which is the link to the training server, http://1.53.110.161:8181. We will set up a simple Apache Server to support this function. However, in reality, we suggest that you upload the archived model to cloud storage such as Amazon S3. Now, we will show you the simplest way with Apache.
We need to install Apache with the following command:
sudo apt-get install apache2
Now, in /etc/apache2/ports.conf, on line 6, we need to add this code to make apache2 listen on port 8181:
Listen 8181
Then, add the following code at the beginning of /etc/apache2/sites-available/000-default.conf to support downloading from the /home/ubuntu/models directory:
<VirtualHost *:8181> DocumentRoot "/home/ubuntu/models" <Directory /> Require all granted </Directory> </VirtualHost>
Finally, we need to restart the apache2 server:
sudo service apache2 restart
Up to now, we have set up all the code to perform fine-tuning. Before running the fine-tuning for the first time, we need to send a POST request to the /model endpoint with the information about our first model because we have already copied the model to the production server.
In the project repository, let's run the finetune script:
python scripts/finetune.py
The last few lines in the console will look like the following:
Save steps: Test Accuracy 0.84 is higher than 0.916875 Save steps: Save to file data/checkpoints/2017-05-29_18-46-43/model.ckpt-2000 ('Export model with accuracy ', 0.916875000000004) 2017-05-29 18:47:31.642729: I tensorflow/core/common_runtime/gpu/gpu_device.cc:977] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GeForce GTX TITAN X, pci bus id: 0000:01:00.0) ('Exported model at', '/home/ubuntu/models/pet-model/2') ('Zip model: ', '/home/ubuntu/models/pet-model/2.zip') ('send_file', u'{ "ckpt_name": "2017-05-29_18-46-43", "id": 2, "link": "http://1.53.110.161:8181/pet-model/2.zip", "name": "pet-model", "version": 2 } ')
As you can see, the new model has a test accuracy of about 91%. The model is also exported and archived to /home/ubuntu/models/pet-model/2.zip. The code also calls the /model endpoint to post the link to the production server. In the log of the Flask app on the production server, we will get the following results:
('Start downloading', u'http://1.53.110.161:8181/pet-model/2.zip') ('Downloaded file at', u'/tmp/2.zip') ('Extracted at', u'/home/ubuntu/productions/2') 127.0.0.1 - - [29/May/2017 18:49:05] "POST /model HTTP/1.1" 200 -
This means that our Flask app has downloaded the 2.zip file from the training server and extracted its contents to /home/ubuntu/productions/2. In the tmux session for TensorFlow Serving, you will also get the following results:
2017-05-29 18:49:06.234808: I tensorflow_serving/core/loader_harness.cc:86] Successfully loaded servable version {name: pet-model version: 2} 2017-05-29 18:49:06.234840: I tensorflow_serving/core/loader_harness.cc:137] Quiescing servable version {name: pet-model version: 1} 2017-05-29 18:49:06.234848: I tensorflow_serving/core/loader_harness.cc:144] Done quiescing servable version {name: pet-model version: 1} 2017-05-29 18:49:06.234853: I tensorflow_serving/core/loader_harness.cc:119] Unloading servable version {name: pet-model version: 1} 2017-05-29 18:49:06.240118: I ./tensorflow_serving/core/simple_loader.h:226] Calling MallocExtension_ReleaseToSystem() with 645327546 2017-05-29 18:49:06.240155: I tensorflow_serving/core/loader_harness.cc:127] Done unloading servable version {name: pet-model version: 1}
This output indicates that the TensorFlow model server has successfully loaded version 2 of the pet-model and unloaded version 1. This also means that we have served the new model, which was trained on the training server and sent to the production server via the /model endpoint.