Skip to content

Tensorflow Mnist for Gitlab CI

Casper da Costa-Luis edited this page Jun 16, 2021 · 1 revision

WARNING: outdated/old docs

This is an introductory example of how to create a CI/CD pipeline with DVC-CML in GitLab CI/CD Pipelines.

Setup

1. Install dvc in your computer if you haven't done it already

2. Create a repo in your Gitlab account

3. Clone the repo in your computer

git clone your-repo-url

4. Setup your project structure:

mkdir models metrics code
touch models/.gitkeep
touch metrics/.gitkeep

echo -e "tensorflow\nwget" >> requirements.txt

5. Install requirements:

pip install tensorflow wget

6. Create code/mnist.py file with the following content

code/mnist.py
import os
import sys
import gzip
import shutil

import numpy as np

import wget

def download(uri, path):
    """Download ``uri`` using the ``wget`` package.

    Args:
        uri: Address of the resource to fetch.
        path: Output location handed to ``wget.download``.
    """
    wget.download(uri, out=path)

def unzip(path):
    """Decompress the gzip file at ``path`` into a sibling file.

    The output name is ``path`` with a trailing ``.gz`` removed. Only the
    suffix is stripped, so a ``.gz`` appearing elsewhere in the path (for
    example in a directory name) is left untouched. The compressed file
    itself is not deleted.

    Args:
        path: Location of the gzip-compressed file.
    """
    suffix = '.gz'
    target = path[:-len(suffix)] if path.endswith(suffix) else path

    # Finish reading before the target is opened: when ``path`` has no
    # ``.gz`` suffix the target is the same file, which is rewritten in place.
    with gzip.open(path, 'rb') as compressed:
        payload = compressed.read()

    with open(target, 'wb') as extracted:
        extracted.write(payload)

def get_images(imgf, n):
    """Read the first ``n`` images from an uncompressed MNIST image file.

    The first 16 bytes of the file are skipped as header; every following
    run of 28*28 bytes is taken as one image.

    Args:
        imgf: Path to the image file.
        n: Number of images to read.

    Returns:
        A list of ``n`` lists, each holding the 784 byte values (as ints)
        of one image.

    Raises:
        ValueError: If the file ends before ``n`` complete images were read.
    """
    header_size = 16
    image_size = 28 * 28

    images = []
    with open(imgf, "rb") as f:
        f.read(header_size)
        for index in range(n):
            # One read per image instead of one read per pixel.
            pixels = f.read(image_size)
            if len(pixels) != image_size:
                raise ValueError(
                    "%s is truncated: image %d of %d is incomplete" % (imgf, index + 1, n))
            # A bytearray iterates as ints regardless of the Python version.
            images.append(list(bytearray(pixels)))

    return images

def get_labels(labelf, n):
    """Read the first ``n`` labels from an uncompressed MNIST label file.

    The first 8 bytes of the file are skipped as header; every following
    byte is taken as one label.

    Args:
        labelf: Path to the label file.
        n: Number of labels to read.

    Returns:
        A list of ``n`` ints, one per label.

    Raises:
        ValueError: If the file holds fewer than ``n`` labels.
    """
    header_size = 8
    # A negative count yields no labels, exactly like ``range(n)`` would.
    count = max(n, 0)

    with open(labelf, "rb") as f:
        f.read(header_size)
        raw = f.read(count)

    if len(raw) != count:
        raise ValueError(
            "%s is truncated: expected %d labels, found %d" % (labelf, count, len(raw)))

    # A bytearray iterates as ints regardless of the Python version.
    return list(bytearray(raw))

def output_csv(folder, images, labels, prefix):
    """Write images and labels to ``<folder>/mnist_<prefix>.csv``.

    Each row holds the label followed by the pixel values of the matching
    image, separated by commas. The folder, including any missing parent
    folders, is created when it does not exist yet.

    Args:
        folder: Directory that receives the CSV file.
        images: Sequence of per-image pixel sequences.
        labels: Sequence of labels, indexed in parallel with ``images``.
        prefix: Name fragment inserted into the file name.
    """
    os.makedirs(folder, exist_ok=True)

    target = os.path.join(folder, "mnist_%s.csv" % prefix)
    with open(target, "w") as out:
        for index, image in enumerate(images):
            row = [labels[index]]
            row.extend(image)
            out.write(",".join(str(value) for value in row) + "\n")

def process(folder, imgf, labelf, prefix, n):
    """Convert one image/label file pair inside ``folder`` into a CSV file.

    Reads ``n`` images from ``imgf`` and ``n`` labels from ``labelf`` (both
    relative to ``folder``) and hands them to ``output_csv`` together with
    ``prefix``.
    """
    image_path = os.path.join(folder, imgf)
    label_path = os.path.join(folder, labelf)
    output_csv(folder, get_images(image_path, n), get_labels(label_path, n), prefix)
    
def read_csv(path):
    """Load a CSV file whose rows are ``label,pixel,pixel,...``.

    Blank lines are skipped, so a trailing empty line does not break
    parsing.

    Args:
        path: Location of the CSV file.

    Returns:
        A ``(labels, images)`` tuple of NumPy arrays. ``labels`` holds one
        one-hot row of length 10 per CSV row; ``images`` holds the remaining
        columns as float32 values multiplied by 1/255.
    """
    num_classes = 10
    labels = []
    imgs = []

    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue

            fields = line.split(',')

            one_hot = np.zeros(num_classes)
            one_hot[int(fields[0])] = 1
            labels.append(one_hot)

            pixels = np.array(fields[1:]).astype(np.float32)
            imgs.append(np.multiply(pixels, 1.0 / 255.0))

    return (np.asarray(labels), np.asarray(imgs))


class DataSet(object):
    """Images and labels held in memory and served as sequential batches.

    Batches are consecutive slices of the data. When the next slice would run
    past the end, the epoch counter is incremented, images and labels are
    shuffled with one shared permutation, and the batch is taken from the
    start of the shuffled data. The examples left over at the end of an epoch
    are therefore skipped, and the first epoch is served in the original
    order.
    """

    def __init__(self, images, labels):
        """Store the data and reset the batch cursor.

        Args:
            images: Array of examples; ``images.shape[0]`` is the example
                count. Must support indexing with an integer array.
            labels: Array of labels indexed in parallel with ``images``.
        """
        self.num_examples = images.shape[0]
        self.images = images
        self.labels = labels
        self.epochs_completed = 0
        self.index_in_epoch = 0

    def next_batch(self, batch_size):
        """Return the next ``batch_size`` examples as ``(images, labels)``.

        Raises:
            ValueError: If ``batch_size`` exceeds the number of examples.
                The check runs before any state is touched, so a rejected
                call leaves the data set unchanged.
        """
        if batch_size > self.num_examples:
            raise ValueError(
                "batch_size (%d) exceeds the number of examples (%d)"
                % (batch_size, self.num_examples))

        start = self.index_in_epoch
        end = start + batch_size

        if end > self.num_examples:
            self.epochs_completed += 1

            # Shuffle images and labels with the same permutation so pairs
            # stay aligned.
            perm = np.arange(self.num_examples)
            np.random.shuffle(perm)
            self.images = self.images[perm]
            self.labels = self.labels[perm]

            # Start the next epoch from the beginning of the shuffled data.
            start, end = 0, batch_size

        self.index_in_epoch = end
        return self.images[start:end], self.labels[start:end]


def main():
    """Download MNIST into the folder named on the command line as CSV files.

    The folder must not exist yet. The four gzip archives are downloaded and
    decompressed, the training and test sets are converted to
    ``mnist_train.csv`` and ``mnist_test.csv``, and the archives and their
    decompressed copies are removed again. Exits with status 1 when the
    folder argument is missing or the folder already exists.
    """
    if len(sys.argv) < 2:
        print('folder is missing. Run command with folder path.')
        # sys.exit instead of the site-provided exit(), which is not
        # guaranteed to be available in every interpreter setup.
        sys.exit(1)

    out_folder = sys.argv[1]

    # Create the folder and detect a pre-existing one in a single step.
    try:
        os.mkdir(out_folder)
    except FileExistsError:
        print('folder ' + out_folder + ' already exists! Delete it with all its content in order to prepare it')
        sys.exit(1)

    # NOTE(review): confirm this host still serves the archives; if it does
    # not, point SOURCE_URL at a mirror that hosts the same four files.
    SOURCE_URL = 'http://yann.lecun.com/exdb/mnist/'
    suffix = '.gz'
    archives = [
        'train-images-idx3-ubyte.gz',
        'train-labels-idx1-ubyte.gz',
        't10k-images-idx3-ubyte.gz',
        't10k-labels-idx1-ubyte.gz',
    ]

    for archive in archives:
        download(SOURCE_URL + archive, out_folder)
        unzip(os.path.join(out_folder, archive))

    process(out_folder, "train-images-idx3-ubyte", "train-labels-idx1-ubyte", 'train', 60000)
    process(out_folder, "t10k-images-idx3-ubyte",  "t10k-labels-idx1-ubyte", 'test', 10000)

    # Only the generated CSV files are kept.
    for archive in archives:
        os.remove(os.path.join(out_folder, archive))
        # Strip the suffix from the file name only, never from the folder part.
        os.remove(os.path.join(out_folder, archive[:-len(suffix)]))


if __name__ == "__main__":
    main()

7. Create code/train.py file with the following content:

code/train.py
import os
import json
import time
import tensorflow.compat.v1 as tf
from tensorflow.python.util import deprecation
deprecation._PRINT_DEPRECATION_WARNINGS = False
tf.disable_v2_behavior()

import mnist

# Trains a softmax-regression classifier (a single linear layer) on the MNIST
# training CSV, saves a checkpoint under ../models/ and writes the run
# parameters to ../metrics/train.json.
dirname = os.path.dirname(__file__)

train_labels, train_images = mnist.read_csv(os.path.join(dirname, '../data/mnist_train.csv'))
DATASET = mnist.DataSet(train_images, train_labels)
# Path prefix handed to the saver for the checkpoint files.
OUT = os.path.join(dirname, "../models/mnist")

batch_size = 128
num_steps = 1800
learning_rate = 0.01
# Taken before the graph is built, so "took" covers graph construction,
# training and saving the checkpoint.
start = time.time()

# Inputs: flattened images and one-hot labels. The tensor names given here
# ("x", "y", "softmax", "accuracy") are the ones code/eval.py looks up.
x = tf.placeholder(tf.float32, [None, 784], "x")
y_ = tf.placeholder(tf.float32, [None, 10], "y")

# Model parameters: weights and bias.
W = tf.Variable(tf.zeros([784, 10]))
b = tf.Variable(tf.zeros([10]))

# Logits (x * W + b) and class probabilities.
y = tf.matmul(x, W) + b
sm = tf.nn.softmax(y, name="softmax")

# Cross entropy (loss function).
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_), name="loss")

# Train step.
train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)

# Accuracy: share of examples whose most probable class matches the label.
correct_prediction = tf.equal(tf.argmax(sm, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")

saver = tf.train.Saver()
init = tf.global_variables_initializer()

with tf.Session() as session:
    session.run(init)

    # Training: only the update op is run; loss and accuracy values were
    # never used here, so they are not fetched.
    for step in range(num_steps):
        batch_data, batch_labels = DATASET.next_batch(batch_size)
        session.run(train_step, feed_dict={x: batch_data, y_: batch_labels})

    saver.save(session, OUT)

    with open(os.path.join(dirname, '../metrics/train.json'), 'w') as outfile:
        json.dump({
            "batch_size": batch_size,
            "num_steps": num_steps,
            "learning_rate": learning_rate,
            # time.time() already counts in seconds, so the difference is
            # reported as is (elapsed wall-clock seconds).
            "took": time.time() - start}, outfile)

8. Create code/eval.py file with the following content:

code/eval.py
import os
import json
import tensorflow.compat.v1 as tf
from tensorflow.python.util import deprecation
deprecation._PRINT_DEPRECATION_WARNINGS = False
tf.disable_v2_behavior()

import mnist

# Restores the trained model from ../models/, measures its accuracy on the
# MNIST test CSV and writes the result to ../metrics/eval.json.
dirname = os.path.dirname(__file__)

LABELS, IMAGES = mnist.read_csv(os.path.join(dirname, '../data/mnist_test.csv'))

META = os.path.join(dirname, '../models/mnist.meta')
MODELS = os.path.join(dirname, '../models/')

with tf.Session() as sess:
    # Rebuild the graph from the saved meta file, then load the weights of
    # the latest checkpoint found in the models folder.
    saver = tf.train.import_meta_graph(META)
    saver.restore(sess, tf.train.latest_checkpoint(MODELS))

    graph = tf.get_default_graph()

    # Tensors are looked up by the names assigned in code/train.py.
    x = graph.get_tensor_by_name("x:0")
    y = graph.get_tensor_by_name("y:0")
    accuracy = graph.get_tensor_by_name("accuracy:0")

    # The whole test set is evaluated in a single run.
    accuracy_value = sess.run(accuracy, feed_dict={x: IMAGES, y: LABELS})

    with open(os.path.join(dirname, '../metrics/eval.json'), 'w') as outfile:
        # .item() turns the NumPy scalar into a plain Python number for JSON.
        json.dump({"accuracy": accuracy_value.item()}, outfile)

9. Setup dvc in your project, you will need to add a dvc remote storage

dvc init
dvc remote add -d myremote s3://your-s3-bucket/dvc-mnist-example

10. Setup project data

python code/mnist.py data

If everything has gone fine, you should have two CSV files inside data:

  • mnist_train.csv
  • mnist_test.csv

containing 60000 and 10000 rows respectively, one image per row (the label followed by its 784 pixel values).

Now track the data with DVC by running:

dvc add data

11. Let's create your dvc pipeline running the following commands:

dvc run --no-exec \
    -f train.dvc \
    -d code/train.py \
    -d data/mnist_train.csv \
    -o models \
    -M metrics/train.json \
    python code/train.py
dvc run --no-exec \
    -f eval.dvc \
    -d code/eval.py \
    -d data/mnist_test.csv \
    -d models \
    -M metrics/eval.json \
    python code/eval.py

12. Create .gitlab-ci.yml file with the following content:

.gitlab-ci.yml
# Pipeline with a single stage containing a single job, `dvc`.
stages:
  - dvc_action_run
  
dvc:
  stage: dvc_action_run
  image: dvcorg/dvc-cml:latest

  variables:
    # Stage file created in step 11. eval.dvc depends on the models folder
    # produced by train.dvc. Presumably this variable is read by
    # dvc_cml_run -- verify against the image documentation.
    repro_targets: 'eval.dvc'
  script:
    # Install pip, then the Python packages the training scripts import.
    - apt-get update && apt-get install -y python-pip && pip install --upgrade pip
    - pip install -r requirements.txt
    # Command expected to come from the dvcorg/dvc-cml image -- TODO confirm.
    - dvc_cml_run

12.a. Setup a gitlab token with the name GITLAB_TOKEN.

  1. Log in to GitLab.
  2. In the upper-right corner, click your avatar and select Settings.
  3. On the User Settings menu, select Access Tokens.
  4. Use repo_token as name and optional expiry date for the token.
  5. Choose api, read repository and write repository.
  6. Click the Create personal access token button.

image

12.b. Set up your AWS credentials and your repo_token as masked environment variables in your repo.

image

13. You can now commit and push your first code to your repo

git add --all
git commit -m "first commit"

dvc push
git push

Overview

Congratulations! 🎉 You have created your first CD ML pipeline with DVC-CML. Let's check what is going to happen.

1. GitLab CI will run your pipeline every time you push or open a merge request:

Runner could be running in your own servers with GPUs if needed! 😃 Check how to run your own runners

2. If your DVC pipeline changes, DVC-CML will take care of managing your pipeline outputs

On every push/MR, dvc repro will be executed if, and only if, your DVC pipeline has changed. This allows you to treat every branch or commit as a new experiment when you change your pipeline, or as a normal git push when the pipeline does not change.

If everything went fine, you will see that the job ran properly, and you will see another commit, automatically generated, with the comment dvc repro [skip ci]

DVC-CML ran the dvc repro for you (doing the train and eval stages) and then pushed your changes into git and dvc! 🚀