Commit e4c5abc

Resize all images to a uniform size; split the dataset into train and valid sets
1 parent d5bae93 commit e4c5abc

5 files changed (+149, -44 lines)

5 files changed

+149
-44
lines changed

README.md

Lines changed: 14 additions & 5 deletions
@@ -21,9 +21,12 @@ dataset|
 Usage:
 Before running you can configure the number of worker threads (the default equals the number of CPU cores) via the variable g_thread_count; no more than 2x the CPU count is recommended
 Run in a terminal: python dataset.py -->
-Read the original images together with their corresponding text files, crop them in memory without writing back to disk, and emit the training data directly
+Image preprocessing: read the original images together with their corresponding text files, crop them in memory without writing back to disk, and emit the training data directly
 Total number of training samples: 142434

+Usage:
+See the demo function in datasetEx.py for details
+
 ## Handling input images of different sizes
 - Option 1: split the images by the given bounding boxes and store them in batches as tfrecords;
 before training the network, read the tfrecords back, restore the images, resize them to a uniform height, and feed them to the CRNN
@@ -34,16 +37,22 @@ dataset|
 Each character in the text is encoded by its position in a dictionary (which must be built yourself); the loss is computed with the "edit distance"

 ## References:
-1. python扩大训练集样本数量-图片转换、改变尺寸 https://blog.csdn.net/weixin_42052460/article/details/80861056
-2. 【python】详解zipfile模块读取处理压缩文件实例: https://blog.csdn.net/brucewong0516/article/details/79064384
+1. Image preprocessing
+   python扩大训练集样本数量-图片转换、改变尺寸 https://blog.csdn.net/weixin_42052460/article/details/80861056
+   在Python and OpenCV中做图象处理:改变大小,旋转和裁剪(翻译) https://blog.csdn.net/fxt570762000/article/details/80241446
+   图像处理之PIL.Image与numpy.array之间的相互转换 https://blog.csdn.net/qq_30159015/article/details/80070514
+   第一篇 Python图片处理模块PIL(pillow) http://www.cnblogs.com/chimeiwangliang/p/7130434.html
+   Python用Pillow(PIL)进行简单的图像操作 https://www.cnblogs.com/sun-haiyu/p/7127582.html
+2. Handling compressed files
+   【python】详解zipfile模块读取处理压缩文件实例: https://blog.csdn.net/brucewong0516/article/details/79064384
 3. Processing images with multiple threads:
 Python 类中的"静态"成员变量: https://www.cnblogs.com/turtle-fly/p/3280610.html
 Python的访问修饰符: http://blog.sina.com.cn/s/blog_bb48e6be0102wbgd.html
 Using @property: 廖雪峰's blog
 python 全局变量引用与修改: https://www.cnblogs.com/yanfengt/p/6305542.html
 4. Building the character dictionary and handling the character labels of the images
-超酷算法(1):BK树 http://blog.jobbole.com/78811/
-文字识别(OCR)CRNN(基于pytorch、python3) 实现不定长中文字符识别 https://blog.csdn.net/Sierkinhane/article/details/82857572
+   超酷算法(1):BK树 http://blog.jobbole.com/78811/
+   文字识别(OCR)CRNN(基于pytorch、python3) 实现不定长中文字符识别 https://blog.csdn.net/Sierkinhane/article/details/82857572
 ## Problems encountered
 Problem 1: when using pandas.to_csv() to store image data into a txt file, part of the image data turned into an ellipsis
 Cause: the raw image data is a numpy array; when a numpy array is printed, arrays with more than 1000 elements have some of their elements replaced by the ellipsis '...'; and in practice
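record.py (shown below) sets np.set_printoptions(threshold=1000000000), which disables exactly this abbreviation. A minimal sketch of the behavior; the array size is illustrative, the threshold value matches the one in record.py:

```python
import numpy as np

a = np.arange(2000)                        # more than 1000 elements, so the default
print(a)                                   # output abbreviates: [   0    1    2 ... 1997 1998 1999]

np.set_printoptions(threshold=1000000000)  # same setting as in record.py
print(a)                                   # now every element is printed, so text written to disk is complete
```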

datasetEx.py

Lines changed: 108 additions & 32 deletions
@@ -2,20 +2,22 @@
 '''
 Preprocess the data and wrap it in an easy-to-use dataset
 Provide random batches (producer/consumer pattern with data prefetching and random dequeuing)
-Provide images of uniform height as input to the CRNN
-Build the character dictionary and encode the labels
-Write logs
+Provide images of uniform height as input to the CRNN; image normalization (undecided for now, not applied)
+Build the character dictionary and encode the labels (not implemented)
+Write logs (not implemented)
 '''
 # import pandas as pd
 import numpy as np
 # import codecs
 import os
 import queue
 import threading
-# import json
+import random
+import glob

+from PIL import Image

-from utils import myThread, log
+from utils import myThread, log, chdir
 from parameters import RECORD_PATH, IMAGE_TRAIN_PATH, TXT_TRAIN_PATH, BATCH_SIZE
 from record import recQueue, recQueueLock, divide_conquer, get_cropThreadCount

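The module docstring above describes the batching as a producer/consumer scheme: the crop threads started by divide_conquer push records into recQueue while the Consumer pops them off. A stripped-down, self-contained sketch of that pattern; the names, the sentinel-based shutdown, and the plain FIFO ordering are simplifications (the real code checks get_cropThreadCount() and dequeues in random batches):

```python
import queue
import threading

work_q = queue.Queue(maxsize=8)        # a bounded queue gives natural back-pressure

def producer(n_batches):
    # stands in for the crop threads that fill recQueue with (images, labels) batches
    for i in range(n_batches):
        work_q.put((["img%d" % i], ["label%d" % i]))
    work_q.put(None)                   # sentinel: no more data

def consumer():
    # stands in for Consumer.read_record() / DataSets.next_batch()
    while True:
        item = work_q.get()
        if item is None:
            break
        images, labels = item
        print("got batch", images, labels)

t = threading.Thread(target=producer, args=(5,))
t.start()
consumer()
t.join()
```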

@@ -24,19 +26,19 @@
 # fileQueue = queue.Queue()
 # fileQueueLock = threading.Lock()

-class chdir():
-    def __init__(self, newdir):
-        self._olddir = os.getcwd()
-        self._newdir = newdir
-    def __enter__(self):
-        os.chdir(self._newdir)
-        # print("enter work dir", self._newdir)
-    def __exit__(self, a, b, c):
-        os.chdir(self._olddir)
-        # print("exit work dir ", self._newdir)
+# class chdir():
+#     def __init__(self, newdir):
+#         self._olddir = os.getcwd()
+#         self._newdir = newdir
+#     def __enter__(self):
+#         os.chdir(self._newdir)
+#         # print("enter work dir", self._newdir)
+#     def __exit__(self, a, b, c):
+#         os.chdir(self._olddir)
+#         # print("exit work dir ", self._newdir)


-class DataSet(object):
+class Consumer(object):
     @log('call: ')
     def __init__(self, recQueue, recQueueLock, epochs=1):
         # self._recFilePath = recFilePath
@@ -89,26 +91,66 @@ def read_record(self):


 class DataSets(object):
-    def __init__(self):
+    def __init__(self, filenames):
+        self._height = 32    # unify the image height to 32 pixels
+        self._width = 128    # unify the image width to 128 pixels
+        # self._train_test_ratio = 0.8
+        # self._datapath = datapath
+        self._image_files = filenames
+        # self._valid_images = []
+        # self.train_valid_split()
         self.__start_produce()

     def __start_produce(self):
         # start the image-cropping threads
-        divide_conquer()
+        divide_conquer(self._image_files)

     def next_batch(self):
         # take cropped images and their labels from the work queue recQueue; batch size is BATCH_SIZE, defined in parameters.py
-        images, labels = self.train.read_record()
-        while not images and not labels:
-            if 0 == get_cropThreadCount():
+        self._images, self._labels = self.train.read_record()
+        while not self._images and not self._labels:
+            if 0 == get_cropThreadCount():  # check whether the crop threads have all finished
                 return {}, {}
-            images, labels = self.train.read_record()
-        return images, labels
-
+            self._images, self._labels = self.train.read_record()
+        # return self._images, self._labels
+        # self.writeimage(self._images, self._labels)
+        return self.resize_with_crop_pad(self._images, self._labels)
+
+    def resize_with_crop_pad(self, images, labels):
+        result_images = []
+        result_labels = []
+        # images = self._images
+        # resize the images to a uniform height, as the CRNN requires
+        i = 0
+        bad = []
+        for image in images:
+            try:
+                H = image.shape[0]
+                W = image.shape[1]
+                ratio = 32/H
+                im = Image.fromarray(image.astype('uint8')).convert('RGB')
+                im = im.resize((int(W*ratio), 32), Image.BILINEAR)
+                result_images.append(np.array(im))
+                result_labels.append(labels[i])
+            except:
+                print("failed resize", image.shape)
+                im.save('./test/resized/%s-%.4d.jpg'%(labels[i], i))
+                bad.append(i)
+            finally:
+                i += 1
+        return result_images, result_labels
+
+    def writeimage(self, images, labels):
+        path = './test/origin/%s-%.4d.jpg'
+        i = 0
+        for image in images:
+            im = Image.fromarray(image.astype('uint8')).convert('RGB')
+            im.save(path%(labels[i], i))
+            i += 1
 @log()
-def read_data_sets():
-    data_sets = DataSets()
-    data_sets.train = DataSet(recQueue, recQueueLock, epochs=1)
+def read_data_sets(filenames):
+    data_sets = DataSets(filenames)
+    data_sets.train = Consumer(recQueue, recQueueLock, epochs=1)
     return data_sets

 # def next_batch(data_sets):
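A quick numeric check of the aspect-preserving resize in resize_with_crop_pad above; the input dimensions are illustrative:

```python
# For an image of height 48 and width 300:
H, W = 48, 300
ratio = 32 / H                     # 0.666..., the factor that maps the height onto 32
new_size = (int(W * ratio), 32)    # PIL's Image.resize takes (width, height) -> (200, 32)
```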
@@ -119,20 +161,54 @@ def read_data_sets():
 #     images, labels = data_sets.train.read_record()
 #     return images, labels

-if __name__ == "__main__":
-    # start_produce()
-    data_sets = read_data_sets()
+
+def train_valid_split(datapath, ratio=0.8, shuffle=True):
+    with chdir(datapath) as ch:
+        # os.chdir(os.path.join(os.getcwd(), IMAGE_TRAIN_PATH))  # change the working directory to make collecting file names easier
+        image_names_train = glob.glob('*.jpg')  # collect all jpg file names in the working directory into a list
+        # image_names_train = glob.glob(os.path.join(IMAGE_TRAIN_PATH, '*.jpg'))
+    # split the dataset into a training set and a validation set
+    random.shuffle(image_names_train)
+    mid = int(ratio*len(image_names_train))
+    train_image_files = image_names_train[0: mid]
+    valid_image_files = image_names_train[mid: ]
+    return train_image_files, valid_image_files
+
+def demo():
+    # first split into a training set and a validation set
+    train_image_files, valid_image_files = train_valid_split(IMAGE_TRAIN_PATH, ratio=0.7)
+    print(len(train_image_files))
+    print('start training')
+    data_sets = read_data_sets(train_image_files)  # start reading the image data
     step = 0
+    # read the training set and train
     while True:
         images, labels = data_sets.next_batch()
-        if images and labels:
-            print(step, len(images), len(labels))  # ready for training; the images still need a uniform height and the labels need encoding
+        if images and labels:  # empty means the data has been consumed once
+            # train()  # train the model
+            print("train batch: ", len(images), len(labels))
+            step += 1
+        else:
+            print("over")
+            break
+    # read the validation set and validate
+    print('start validating')
+    data_sets = read_data_sets(valid_image_files)  # start reading the image data
+    print(len(valid_image_files))
+    step = 0
+    while True:
+        images_valid, labels_valid = data_sets.next_batch()
+        if images_valid and labels_valid:  # empty means the data has been consumed once
+            # train()  # train the model
+            print("valid batch: ", len(images_valid), len(labels_valid))
             step += 1
         else:
             print("over")
             break


+if __name__ == "__main__":
+    demo()

parameters.py

Lines changed: 1 addition & 0 deletions
@@ -9,6 +9,7 @@
 # TXT_TRAIN_PROD_PATH = os.path.join(os.getcwd(), 'dataset/txt_train_prod')  # path of the text files for the preprocessed images

 # model hyperparameters
+TRAIN_TEST_RATIO = 0.8
 BATCH_SIZE = 100

record.py

Lines changed: 13 additions & 6 deletions
@@ -12,7 +12,7 @@
 import multiprocessing

 from math import fabs, sin, cos, acos, radians
-from utils import myThread, log
+from utils import myThread, log, chdir
 from parameters import IMAGE_TRAIN_PATH, TXT_TRAIN_PATH, BATCH_SIZE

 np.set_printoptions(threshold=1000000000)
@@ -29,11 +29,13 @@
 cropQueueLock = threading.Lock()

 @log()
-def divide_conquer():
+def divide_conquer(image_names_train):
     global g_img_total, g_thread_count, cropQueueLock, workQueue, g_active_cropThread_Count
-    os.chdir(os.path.join(os.getcwd(), IMAGE_TRAIN_PATH))  # change the working directory to make collecting file names easier
-    image_names_train = glob.glob('*.jpg')  # collect all jpg file names in the working directory into a list
-    g_img_total = len(image_names_train)
+    # with chdir(IMAGE_TRAIN_PATH) as ch:
+    #     # os.chdir(os.path.join(os.getcwd(), IMAGE_TRAIN_PATH))  # change the working directory to make collecting file names easier
+    #     image_names_train = glob.glob('*.jpg')  # collect all jpg file names in the working directory into a list
+    #     # image_names_train = glob.glob(os.path.join(IMAGE_TRAIN_PATH, '*.jpg'))  # collect all jpg file names into a list
+    g_img_total = len(image_names_train)
     print("total images: {}".format(g_img_total))
     # split the work among the threads
     threadNames = ['thread-crop{}'.format(i) for i in range(g_thread_count)]
@@ -86,8 +88,13 @@ def t_crop_image(imageNames):
     records = {}
     tName = threading.current_thread().getName()
     for j in range(imgCounts):
+        # tmpName = imageNames[j].split('/')[-1]
+        # tmpName = tmpName.split('.')[-3:-1]
+        # tmpName.append('txt')
+        # print(tmpName)
+        # imageTxt = os.path.join(TXT_TRAIN_PATH, '.'.join(tmpName))  # txt path
         imageTxt = os.path.join(TXT_TRAIN_PATH, imageNames[j][:-4] + '.txt')  # txt path
-        imageName = imageNames[j]
+        imageName = os.path.join(IMAGE_TRAIN_PATH, imageNames[j])
         imgSrc = cv2.imread(imageName)
         if(imgSrc is None):
             invalidimg.append(imageName)
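With this change the crop workers receive bare *.jpg file names (the output of train_valid_split's glob) and resolve the image and its label file themselves. A small illustration of the path mapping, with assumed directory values standing in for the constants from parameters.py:

```python
import os

IMAGE_TRAIN_PATH = 'dataset/image_train'   # assumed value; the real one comes from parameters.py
TXT_TRAIN_PATH = 'dataset/txt_train'       # assumed value; the real one comes from parameters.py

name = '0001.jpg'                          # a bare name as returned by train_valid_split()
image_path = os.path.join(IMAGE_TRAIN_PATH, name)              # dataset/image_train/0001.jpg
txt_path = os.path.join(TXT_TRAIN_PATH, name[:-4] + '.txt')    # dataset/txt_train/0001.txt
```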

utils.py

Lines changed: 13 additions & 1 deletion
@@ -1,6 +1,7 @@
 #coding: utf-8
 import threading
 import functools
+import os

 class myThread(threading.Thread):
     __threadCount = 0
@@ -37,6 +38,18 @@ def exit(self):
         self._exitflag = 1


+class chdir():
+    def __init__(self, newdir):
+        self._olddir = os.getcwd()
+        self._newdir = newdir
+    def __enter__(self):
+        os.chdir(self._newdir)
+        # print("enter work dir", self._newdir)
+    def __exit__(self, a, b, c):
+        os.chdir(self._olddir)
+        # print("exit work dir ", self._newdir)
+
+
 def log(text=None):
     def decorator(func):
         @functools.wraps(func)
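For reference, a usage sketch of the chdir context manager that now lives in utils.py, mirroring how train_valid_split uses it; the directory name is illustrative:

```python
import glob
import os

from utils import chdir

print(os.getcwd())                      # original working directory
with chdir('dataset/image_train'):      # assumed path; train_valid_split passes IMAGE_TRAIN_PATH
    names = glob.glob('*.jpg')          # names are collected relative to the new directory
print(os.getcwd())                      # the old working directory is restored on exit
```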
@@ -62,4 +75,3 @@ def wrapper(*args, **kw):



-