Skip to content

Commit

Permalink
实现所有图像调整为统一尺寸, 实现train valid数据集分割
Browse files Browse the repository at this point in the history
  • Loading branch information
LuffysMan committed Apr 1, 2019
1 parent d5bae93 commit e4c5abc
Show file tree
Hide file tree
Showing 5 changed files with 149 additions and 44 deletions.
19 changes: 14 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,12 @@ dataset|
使用方法:
运行前可配置开启的线程数(默认线程数同计算机CPU数量), 配置变量g_thread_count, 建议数量不超过cpu数量2倍
在终端输入:python dataset.py -->
直接读取原图同时读取对应文本, 在内存中进行裁剪后不再写入磁盘, 直接组成训练数据输出
图像预处理方法: 直接读取原图同时读取对应文本, 在内存中进行裁剪后不再写入磁盘, 直接组成训练数据输出
训练样例总数:142434

使用方法:
具体使用方法参考datasetEx.py中的demo函数

## 关于输入图像尺寸不同的处理办法
- 方案1: 将图像按照给定的bounding box进行分割, 并分批存储到tfrecord,
进行神经网络训练前,先读取tfrecord, 然后将图像还原, 进一步resize为统一的高度, 输入到crnn
Expand All @@ -34,16 +37,22 @@ dataset|
文本中的每个字符按照字典序编码(字典需自行构造), loss计算使用“编辑距离”

## 参考文献:
1. python扩大训练集样本数量-图片转换、改变尺寸 https://blog.csdn.net/weixin_42052460/article/details/80861056
2. 【python】详解zipfile模块读取处理压缩文件实例: https://blog.csdn.net/brucewong0516/article/details/79064384
1. 图像预处理
python扩大训练集样本数量-图片转换、改变尺寸 https://blog.csdn.net/weixin_42052460/article/details/80861056
在Python and OpenCV中做图象处理:改变大小,旋转和裁剪(翻译) https://blog.csdn.net/fxt570762000/article/details/80241446
图像处理之PIL.Image与numpy.array之间的相互转换 https://blog.csdn.net/qq_30159015/article/details/80070514
第一篇 Python图片处理模块PIL(pillow) http://www.cnblogs.com/chimeiwangliang/p/7130434.html
Python用Pillow(PIL)进行简单的图像操作 https://www.cnblogs.com/sun-haiyu/p/7127582.html
2. 压缩文件处理
【python】详解zipfile模块读取处理压缩文件实例: https://blog.csdn.net/brucewong0516/article/details/79064384
3. 多线程处理图片:
Python 类中的"静态"成员变量: https://www.cnblogs.com/turtle-fly/p/3280610.html
Python的访问修饰符: http://blog.sina.com.cn/s/blog_bb48e6be0102wbgd.html
使用@property: 廖雪峰博客
python 全局变量引用与修改: https://www.cnblogs.com/yanfengt/p/6305542.html
4. 构建字典, 处理图像对应的字符标签
超酷算法(1):BK树(http://blog.jobbole.com/78811/)
文字识别(OCR)CRNN(基于pytorch、python3) 实现不定长中文字符识别(https://blog.csdn.net/Sierkinhane/article/details/82857572)
超酷算法(1):BK树(http://blog.jobbole.com/78811/)
文字识别(OCR)CRNN(基于pytorch、python3) 实现不定长中文字符识别(https://blog.csdn.net/Sierkinhane/article/details/82857572)
## 遇到的问题
问题1: 在使用pandas.to_csv()函数将图像数据存储到txt中的时候, 出现了部分图像数据变为省略号的情况
原因: 图像原始数据是numpy数组, numpy数组在使用print函数输出的时候, 如果超过1000个元素, 会用省略号'...'来代替部分元素; 并且实际情况
Expand Down
140 changes: 108 additions & 32 deletions datasetEx.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,22 @@
'''
预处理数据, 封装成方便使用的数据集
提供随机batch功能(采用生产者消费者模式, 进行数据预取, 随机出队列)
提供统一高度的图像, 作为crnn的输入
构建字库, 对label进行编码
记录log
提供统一高度的图像, 作为crnn的输入; 图像标准化(暂时不确定, 没有进行标准化)
构建字库, 对label进行编码(未实现)
记录log(未实现)
'''
# import pandas as pd
import numpy as np
# import codecs
import os
import queue
import threading
# import json
import random
import glob

from PIL import Image

from utils import myThread, log
from utils import myThread, log, chdir
from parameters import RECORD_PATH, IMAGE_TRAIN_PATH, TXT_TRAIN_PATH, BATCH_SIZE
from record import recQueue, recQueueLock, divide_conquer, get_cropThreadCount

Expand All @@ -24,19 +26,19 @@
# fileQueue = queue.Queue()
# fileQueueLock = threading.Lock()

class chdir():
def __init__(self, newdir):
self._olddir = os.getcwd()
self._newdir = newdir
def __enter__(self):
os.chdir(self._newdir)
# print("enter work dir", self._newdir)
def __exit__(self, a, b, c):
os.chdir(self._olddir)
# print("exit work dir ", self._newdir)
# class chdir():
# def __init__(self, newdir):
# self._olddir = os.getcwd()
# self._newdir = newdir
# def __enter__(self):
# os.chdir(self._newdir)
# # print("enter work dir", self._newdir)
# def __exit__(self, a, b, c):
# os.chdir(self._olddir)
# # print("exit work dir ", self._newdir)


class DataSet(object):
class Consumer(object):
@log('call: ')
def __init__(self, recQueue, recQueueLock, epochs=1):
# self._recFilePath = recFilePath
Expand Down Expand Up @@ -89,26 +91,66 @@ def read_record(self):


class DataSets(object):
def __init__(self):
def __init__(self, filenames):
self._height = 32 #将图像高度统一为32个像素
self._width = 128 #将图像宽度统一为128个像素
# self._train_test_ratio = 0.8
# self._datapath = datapath
self._image_files = filenames
# self._valid_images = []
# self.train_valid_split()
self.__start_produce()

def __start_produce(self):
#启动图像裁剪线程
divide_conquer()
divide_conquer(self._image_files)

def next_batch(self):
#从工作队列recQueue取出裁剪好的图像和对应label, 大小为BATCH_SIZE, 定义在parameters.py
images, labels = self.train.read_record()
while not images and not labels:
if 0 == get_cropThreadCount():
self._images, self._labels = self.train.read_record()
while not self._images and not self._labels:
if 0 == get_cropThreadCount(): #查询是否已经停止裁剪图像
return {}, {}
images, labels = self.train.read_record()
return images, labels

self._images, self._labels = self.train.read_record()
# return self._images, self._labels
# self.writeimage(self._images, self._labels)
return self.resize_with_crop_pad(self._images, self._labels)

def resize_with_crop_pad(self, images, labels):
    """Resize every image to a common height, keeping its aspect ratio.

    Args:
        images: iterable of image arrays. Each one is read through
            ``.shape[0]`` (height) and ``.shape[1]`` (width) and converted
            with ``Image.fromarray(image.astype('uint8'))``.
        labels: object indexable by position; ``labels[i]`` is taken as the
            label of the i-th image.

    Returns:
        ``(result_images, result_labels)``: two lists of equal length with
        the resized RGB arrays and their labels. An image that cannot be
        resized is reported on stdout and left out together with its label.
    """
    # Use the height configured on the instance; fall back to 32 when the
    # attribute is not available.
    target_height = getattr(self, '_height', 32)
    result_images = []
    result_labels = []
    for i, image in enumerate(images):
        shape = getattr(image, 'shape', None)
        try:
            height, width = shape[0], shape[1]
            if height <= 0 or width <= 0:
                # Guard the division below instead of relying on a
                # ZeroDivisionError being swallowed.
                raise ValueError('empty image')
            new_width = int(width * target_height / height)
            if new_width < 1:
                raise ValueError('image too narrow to resize')
            im = Image.fromarray(image.astype('uint8')).convert('RGB')
            im = im.resize((new_width, target_height), Image.BILINEAR)
            resized = np.array(im)
            label = labels[i]
        except (AttributeError, IndexError, KeyError,
                TypeError, ValueError) as err:
            # Only report and skip: no intermediate image is guaranteed to
            # exist at this point, so nothing is written to disk.
            print("failed resize", shape, err)
            continue
        # Append both together so images and labels stay aligned.
        result_images.append(resized)
        result_labels.append(label)
    return result_images, result_labels

def writeimage(self, images, labels):
    """Dump each image as an RGB JPEG under ./test/origin/ for inspection.

    The file name is built from the label stored at the same position and
    the position itself, zero-padded to four digits.
    """
    name_template = './test/origin/%s-%.4d.jpg'
    for index, picture in enumerate(images):
        rgb = Image.fromarray(picture.astype('uint8')).convert('RGB')
        target = name_template % (labels[index], index)
        rgb.save(target)
@log()
def read_data_sets():
data_sets = DataSets()
data_sets.train = DataSet(recQueue, recQueueLock, epochs=1)
def read_data_sets(filenames):
data_sets = DataSets(filenames)
data_sets.train = Consumer(recQueue, recQueueLock, epochs=1)
return data_sets

# def next_batch(data_sets):
Expand All @@ -119,20 +161,54 @@ def read_data_sets():
# images, labels = data_sets.train.read_record()
# return images, labels

if __name__ == "__main__":
# start_produce()
data_sets = read_data_sets()

def train_valid_split(datapath, ratio=0.8, shuffle=True):
    """Split the ``*.jpg`` files found in *datapath* into train/valid lists.

    Args:
        datapath: directory that holds the images.
        ratio: fraction of the files assigned to the training list; the
            split index is ``int(ratio * number_of_files)``.
        shuffle: when true the files are shuffled before splitting; when
            false they are split in sorted name order.

    Returns:
        ``(train_image_files, valid_image_files)``; both lists contain bare
        file names without the directory part.

    Raises:
        FileNotFoundError: if *datapath* is not an existing directory.
    """
    if not os.path.isdir(datapath):
        raise FileNotFoundError(datapath)
    # Glob inside datapath instead of changing the working directory: the
    # cwd is process-wide state and this module also runs worker threads.
    pattern = os.path.join(glob.escape(datapath), '*.jpg')
    image_names = [os.path.basename(p) for p in glob.glob(pattern)]
    if shuffle:
        random.shuffle(image_names)
    else:
        # glob order is unspecified; sort so an unshuffled split is stable.
        image_names.sort()
    mid = int(ratio * len(image_names))
    return image_names[:mid], image_names[mid:]

def demo():
#首先划分训练集和验证集
train_image_files, valid_image_files = train_valid_split(IMAGE_TRAIN_PATH, ratio=0.7)
print(len(train_image_files))
print('start trainning')
data_sets = read_data_sets(train_image_files) #开始读取图像数据
step = 0
#读取训练集并训练
while True:
images, labels = data_sets.next_batch()
if images and labels:
print(step, len(images), len(labels)) #可用于训练, images需要将height统一, labels需要进行编码
if images and labels: #如果为空, 表示数据已经循环一次
#train() #训练模型
print("train batch: ", len(images), len(labels))
step += 1
else:
print("over")
break
#读取验证集并验证
print('start validating')
data_sets = read_data_sets(valid_image_files) #开始读取图像数据
print(len(valid_image_files))
step = 0
while True:
images_valid, labels_valid = data_sets.next_batch()
if images_valid and labels_valid: #如果为空, 表示数据已经循环一次
#train() #训练模型
print("valid batch: ", len(images_valid), len(labels_valid))
step += 1
else:
print("over")
break


if __name__ == "__main__":
demo()



Expand Down
1 change: 1 addition & 0 deletions parameters.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
# TXT_TRAIN_PROD_PATH = os.path.join(os.getcwd(), 'dataset/txt_train_prod') #预处理后的图像对应文本路径

#模型超参数
TRAIN_TEST_RATIO = 0.8
BATCH_SIZE = 100


Expand Down
19 changes: 13 additions & 6 deletions record.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
import multiprocessing

from math import fabs, sin, cos, acos, radians
from utils import myThread, log
from utils import myThread, log, chdir
from parameters import IMAGE_TRAIN_PATH, TXT_TRAIN_PATH, BATCH_SIZE

np.set_printoptions(threshold=1000000000)
Expand All @@ -29,11 +29,13 @@
cropQueueLock = threading.Lock()

@log()
def divide_conquer():
def divide_conquer(image_names_train):
global g_img_total, g_thread_count, cropQueueLock, workQueue, g_active_cropThread_Count
os.chdir(os.path.join(os.getcwd(), IMAGE_TRAIN_PATH)) #修改当前工作路径, 方便获取文件名
image_names_train = glob.glob('*.jpg') #获取工作路径下所有jpg格式文件名到list中
g_img_total = len(image_names_train)
# with chdir(IMAGE_TRAIN_PATH) as ch:
# # os.chdir(os.path.join(os.getcwd(), IMAGE_TRAIN_PATH)) #修改当前工作路径, 方便获取文件名
# image_names_train = glob.glob('*.jpg') #获取工作路径下所有jpg格式文件名到list中
# # image_names_train = glob.glob(os.path.join(IMAGE_TRAIN_PATH, '*.jpg')) #获取工作路径下所有jpg格式文件名到list中
g_img_total = len(image_names_train)
print("total images: {}".format(g_img_total))
#划分任务分配给多线程
threadNames = ['thread-crop{}'.format(i) for i in range(g_thread_count)]
Expand Down Expand Up @@ -86,8 +88,13 @@ def t_crop_image(imageNames):
records = {}
tName = threading.current_thread().getName()
for j in range(imgCounts):
# tmpName = imageNames[j].split('/')[-1]
# tmpName = tmpName.split('.')[-3:-1]
# tmpName.append('txt')
# print(tmpName)
# imageTxt = os.path.join(TXT_TRAIN_PATH, '.'.join(tmpName)) # txt路径
imageTxt = os.path.join(TXT_TRAIN_PATH, imageNames[j][:-4] + '.txt') # txt路径
imageName =imageNames[j]
imageName =os.path.join(IMAGE_TRAIN_PATH, imageNames[j])
imgSrc = cv2.imread(imageName)
if(imgSrc is None):
invalidimg.append(imageName)
Expand Down
14 changes: 13 additions & 1 deletion utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#coding: utf-8
import threading
import functools
import os

class myThread(threading.Thread):
__threadCount = 0
Expand Down Expand Up @@ -37,6 +38,18 @@ def exit(self):
self._exitflag = 1


class chdir():
    """Context manager that switches the working directory and restores it.

    Usage: ``with chdir(path): ...``. The previous working directory is
    restored when the block exits, including when it exits with an
    exception (the exception is not suppressed).
    """

    def __init__(self, newdir):
        self._olddir = os.getcwd()
        self._newdir = newdir

    def __enter__(self):
        # Re-read the cwd on entry so the directory restored on exit is the
        # one that was current when the block started, even if the instance
        # was created earlier.
        self._olddir = os.getcwd()
        os.chdir(self._newdir)
        # Return the manager so ``with chdir(p) as ch`` binds a real object.
        return self

    def __exit__(self, a, b, c):
        # Returns None, so an exception raised inside the block propagates.
        os.chdir(self._olddir)


def log(text=None):
def decorator(func):
@functools.wraps(func)
Expand All @@ -62,4 +75,3 @@ def wrapper(*args, **kw):




0 comments on commit e4c5abc

Please sign in to comment.