Commit e4c5abc

Resize all images to a uniform size; split the dataset into train and valid sets
1 parent d5bae93 commit e4c5abc

5 files changed (+149, -44 lines)

5 files changed

+149
-44
lines changed

README.md

Lines changed: 14 additions & 5 deletions
@@ -21,9 +21,12 @@ dataset|
 Usage:
 Before running you can configure the number of worker threads (the default equals the number of CPU cores) via the variable g_thread_count; no more than 2x the CPU count is recommended
 Run in a terminal: python dataset.py -->
-Read the original images together with their corresponding text files, crop them in memory without writing back to disk, and emit the training data directly
+Image preprocessing: read the original images together with their corresponding text files, crop them in memory without writing back to disk, and emit the training data directly
 Total number of training samples: 142434

+Usage:
+See the demo function in datasetEx.py for details
+
 ## Handling input images of different sizes
 - Option 1: split the images by the given bounding boxes and store them in batches as tfrecords;
 before training the network, read the tfrecords back, restore the images, resize them to a uniform height, and feed them to the CRNN
@@ -34,16 +37,22 @@ dataset|
 Each character in the text is encoded by its position in a dictionary (which must be built yourself); the loss is computed with the "edit distance"

 ## References:
-1. python扩大训练集样本数量-图片转换、改变尺寸 https://blog.csdn.net/weixin_42052460/article/details/80861056
-2. 【python】详解zipfile模块读取处理压缩文件实例: https://blog.csdn.net/brucewong0516/article/details/79064384
+1. Image preprocessing
+   python扩大训练集样本数量-图片转换、改变尺寸 https://blog.csdn.net/weixin_42052460/article/details/80861056
+   在Python and OpenCV中做图象处理:改变大小,旋转和裁剪(翻译) https://blog.csdn.net/fxt570762000/article/details/80241446
+   图像处理之PIL.Image与numpy.array之间的相互转换 https://blog.csdn.net/qq_30159015/article/details/80070514
+   第一篇 Python图片处理模块PIL(pillow) http://www.cnblogs.com/chimeiwangliang/p/7130434.html
+   Python用Pillow(PIL)进行简单的图像操作 https://www.cnblogs.com/sun-haiyu/p/7127582.html
+2. Handling compressed files
+   【python】详解zipfile模块读取处理压缩文件实例: https://blog.csdn.net/brucewong0516/article/details/79064384
 3. Processing images with multiple threads:
 Python 类中的"静态"成员变量: https://www.cnblogs.com/turtle-fly/p/3280610.html
 Python的访问修饰符: http://blog.sina.com.cn/s/blog_bb48e6be0102wbgd.html
 Using @property: 廖雪峰's blog
 python 全局变量引用与修改: https://www.cnblogs.com/yanfengt/p/6305542.html
 4. Building the character dictionary and handling the character labels of the images
-超酷算法(1):BK树 http://blog.jobbole.com/78811/
-文字识别(OCR)CRNN(基于pytorch、python3) 实现不定长中文字符识别 https://blog.csdn.net/Sierkinhane/article/details/82857572
+   超酷算法(1):BK树 http://blog.jobbole.com/78811/
+   文字识别(OCR)CRNN(基于pytorch、python3) 实现不定长中文字符识别 https://blog.csdn.net/Sierkinhane/article/details/82857572
 ## Problems encountered
 Problem 1: when using pandas.to_csv() to store image data into a txt file, part of the image data turned into an ellipsis
 Cause: the raw image data is a numpy array; when a numpy array is printed, arrays with more than 1000 elements have some of their elements replaced by the ellipsis '...'; and in practice
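record.py (shown below) sets np.set_printoptions(threshold=1000000000), which disables exactly this abbreviation. A minimal sketch of the behavior; the array size is illustrative, the threshold value matches the one in record.py:

```python
import numpy as np

a = np.arange(2000)                        # more than 1000 elements, so the default
print(a)                                   # output abbreviates: [   0    1    2 ... 1997 1998 1999]

np.set_printoptions(threshold=1000000000)  # same setting as in record.py
print(a)                                   # now every element is printed, so text written to disk is complete
```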

datasetEx.py

Lines changed: 108 additions & 32 deletions
@@ -2,20 +2,22 @@
 '''
 Preprocess the data and wrap it in an easy-to-use dataset
 Provide random batches (producer/consumer pattern with data prefetching and random dequeuing)
-Provide images of uniform height as input to the CRNN
-Build the character dictionary and encode the labels
-Write logs
+Provide images of uniform height as input to the CRNN; image normalization (undecided for now, not applied)
+Build the character dictionary and encode the labels (not implemented)
+Write logs (not implemented)
 '''
 # import pandas as pd
 import numpy as np
 # import codecs
 import os
 import queue
 import threading
-# import json
+import random
+import glob

+from PIL import Image

-from utils import myThread, log
+from utils import myThread, log, chdir
 from parameters import RECORD_PATH, IMAGE_TRAIN_PATH, TXT_TRAIN_PATH, BATCH_SIZE
 from record import recQueue, recQueueLock, divide_conquer, get_cropThreadCount

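The module docstring above describes the batching as a producer/consumer scheme: the crop threads started by divide_conquer push records into recQueue while the Consumer pops them off. A stripped-down, self-contained sketch of that pattern; the names, the sentinel-based shutdown, and the plain FIFO ordering are simplifications (the real code checks get_cropThreadCount() and dequeues in random batches):

```python
import queue
import threading

work_q = queue.Queue(maxsize=8)        # a bounded queue gives natural back-pressure

def producer(n_batches):
    # stands in for the crop threads that fill recQueue with (images, labels) batches
    for i in range(n_batches):
        work_q.put((["img%d" % i], ["label%d" % i]))
    work_q.put(None)                   # sentinel: no more data

def consumer():
    # stands in for Consumer.read_record() / DataSets.next_batch()
    while True:
        item = work_q.get()
        if item is None:
            break
        images, labels = item
        print("got batch", images, labels)

t = threading.Thread(target=producer, args=(5,))
t.start()
consumer()
t.join()
```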

@@ -24,19 +26,19 @@
 # fileQueue = queue.Queue()
 # fileQueueLock = threading.Lock()

-class chdir():
-    def __init__(self, newdir):
-        self._olddir = os.getcwd()
-        self._newdir = newdir
-    def __enter__(self):
-        os.chdir(self._newdir)
-        # print("enter work dir", self._newdir)
-    def __exit__(self, a, b, c):
-        os.chdir(self._olddir)
-        # print("exit work dir ", self._newdir)
+# class chdir():
+#     def __init__(self, newdir):
+#         self._olddir = os.getcwd()
+#         self._newdir = newdir
+#     def __enter__(self):
+#         os.chdir(self._newdir)
+#         # print("enter work dir", self._newdir)
+#     def __exit__(self, a, b, c):
+#         os.chdir(self._olddir)
+#         # print("exit work dir ", self._newdir)


-class DataSet(object):
+class Consumer(object):
     @log('call: ')
     def __init__(self, recQueue, recQueueLock, epochs=1):
         # self._recFilePath = recFilePath
@@ -89,26 +91,66 @@ def read_record(self):


 class DataSets(object):
-    def __init__(self):
+    def __init__(self, filenames):
+        self._height = 32    # unify the image height to 32 pixels
+        self._width = 128    # unify the image width to 128 pixels
+        # self._train_test_ratio = 0.8
+        # self._datapath = datapath
+        self._image_files = filenames
+        # self._valid_images = []
+        # self.train_valid_split()
         self.__start_produce()

     def __start_produce(self):
         # start the image-cropping threads
-        divide_conquer()
+        divide_conquer(self._image_files)

     def next_batch(self):
         # take cropped images and their labels from the work queue recQueue; batch size is BATCH_SIZE, defined in parameters.py
-        images, labels = self.train.read_record()
-        while not images and not labels:
-            if 0 == get_cropThreadCount():
+        self._images, self._labels = self.train.read_record()
+        while not self._images and not self._labels:
+            if 0 == get_cropThreadCount():  # check whether the crop threads have all finished
                 return {}, {}
-            images, labels = self.train.read_record()
-        return images, labels
-
+            self._images, self._labels = self.train.read_record()
+        # return self._images, self._labels
+        # self.writeimage(self._images, self._labels)
+        return self.resize_with_crop_pad(self._images, self._labels)
+
+    def resize_with_crop_pad(self, images, labels):
+        result_images = []
+        result_labels = []
+        # images = self._images
+        # resize the images to a uniform height, as the CRNN requires
+        i = 0
+        bad = []
+        for image in images:
+            try:
+                H = image.shape[0]
+                W = image.shape[1]
+                ratio = 32/H
+                im = Image.fromarray(image.astype('uint8')).convert('RGB')
+                im = im.resize((int(W*ratio), 32), Image.BILINEAR)
+                result_images.append(np.array(im))
+                result_labels.append(labels[i])
+            except:
+                print("failed resize", image.shape)
+                im.save('./test/resized/%s-%.4d.jpg'%(labels[i], i))
+                bad.append(i)
+            finally:
+                i += 1
+        return result_images, result_labels
+
+    def writeimage(self, images, labels):
+        path = './test/origin/%s-%.4d.jpg'
+        i = 0
+        for image in images:
+            im = Image.fromarray(image.astype('uint8')).convert('RGB')
+            im.save(path%(labels[i], i))
+            i += 1
 @log()
-def read_data_sets():
-    data_sets = DataSets()
-    data_sets.train = DataSet(recQueue, recQueueLock, epochs=1)
+def read_data_sets(filenames):
+    data_sets = DataSets(filenames)
+    data_sets.train = Consumer(recQueue, recQueueLock, epochs=1)
     return data_sets

 # def next_batch(data_sets):
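A quick numeric check of the aspect-preserving resize in resize_with_crop_pad above; the input dimensions are illustrative:

```python
# For an image of height 48 and width 300:
H, W = 48, 300
ratio = 32 / H                     # 0.666..., the factor that maps the height onto 32
new_size = (int(W * ratio), 32)    # PIL's Image.resize takes (width, height) -> (200, 32)
```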
@@ -119,20 +161,54 @@ def read_data_sets():
 #     images, labels = data_sets.train.read_record()
 #     return images, labels

-if __name__ == "__main__":
-    # start_produce()
-    data_sets = read_data_sets()
+
+def train_valid_split(datapath, ratio=0.8, shuffle=True):
+    with chdir(datapath) as ch:
+        # os.chdir(os.path.join(os.getcwd(), IMAGE_TRAIN_PATH))  # change the working directory to make collecting file names easier
+        image_names_train = glob.glob('*.jpg')  # collect all jpg file names in the working directory into a list
+        # image_names_train = glob.glob(os.path.join(IMAGE_TRAIN_PATH, '*.jpg'))
+    # split the dataset into a training set and a validation set
+    random.shuffle(image_names_train)
+    mid = int(ratio*len(image_names_train))
+    train_image_files = image_names_train[0: mid]
+    valid_image_files = image_names_train[mid: ]
+    return train_image_files, valid_image_files
+
+def demo():
+    # first split into a training set and a validation set
+    train_image_files, valid_image_files = train_valid_split(IMAGE_TRAIN_PATH, ratio=0.7)
+    print(len(train_image_files))
+    print('start training')
+    data_sets = read_data_sets(train_image_files)  # start reading the image data
     step = 0
+    # read the training set and train
     while True:
         images, labels = data_sets.next_batch()
-        if images and labels:
-            print(step, len(images), len(labels))  # ready for training; the images still need a uniform height and the labels need encoding
+        if images and labels:  # empty means the data has been consumed once
+            # train()  # train the model
+            print("train batch: ", len(images), len(labels))
+            step += 1
+        else:
+            print("over")
+            break
+    # read the validation set and validate
+    print('start validating')
+    data_sets = read_data_sets(valid_image_files)  # start reading the image data
+    print(len(valid_image_files))
+    step = 0
+    while True:
+        images_valid, labels_valid = data_sets.next_batch()
+        if images_valid and labels_valid:  # empty means the data has been consumed once
+            # train()  # train the model
+            print("valid batch: ", len(images_valid), len(labels_valid))
             step += 1
         else:
             print("over")
             break


+if __name__ == "__main__":
+    demo()

parameters.py

Lines changed: 1 addition & 0 deletions
@@ -9,6 +9,7 @@
 # TXT_TRAIN_PROD_PATH = os.path.join(os.getcwd(), 'dataset/txt_train_prod')  # path of the text files for the preprocessed images

 # model hyperparameters
+TRAIN_TEST_RATIO = 0.8
 BATCH_SIZE = 100

record.py

Lines changed: 13 additions & 6 deletions
@@ -12,7 +12,7 @@
 import multiprocessing

 from math import fabs, sin, cos, acos, radians
-from utils import myThread, log
+from utils import myThread, log, chdir
 from parameters import IMAGE_TRAIN_PATH, TXT_TRAIN_PATH, BATCH_SIZE

 np.set_printoptions(threshold=1000000000)
@@ -29,11 +29,13 @@
 cropQueueLock = threading.Lock()

 @log()
-def divide_conquer():
+def divide_conquer(image_names_train):
     global g_img_total, g_thread_count, cropQueueLock, workQueue, g_active_cropThread_Count
-    os.chdir(os.path.join(os.getcwd(), IMAGE_TRAIN_PATH))  # change the working directory to make collecting file names easier
-    image_names_train = glob.glob('*.jpg')  # collect all jpg file names in the working directory into a list
-    g_img_total = len(image_names_train)
+    # with chdir(IMAGE_TRAIN_PATH) as ch:
+    #     # os.chdir(os.path.join(os.getcwd(), IMAGE_TRAIN_PATH))  # change the working directory to make collecting file names easier
+    #     image_names_train = glob.glob('*.jpg')  # collect all jpg file names in the working directory into a list
+    #     # image_names_train = glob.glob(os.path.join(IMAGE_TRAIN_PATH, '*.jpg'))  # collect all jpg file names into a list
+    g_img_total = len(image_names_train)
     print("total images: {}".format(g_img_total))
     # split the work among the threads
     threadNames = ['thread-crop{}'.format(i) for i in range(g_thread_count)]
@@ -86,8 +88,13 @@ def t_crop_image(imageNames):
     records = {}
     tName = threading.current_thread().getName()
     for j in range(imgCounts):
+        # tmpName = imageNames[j].split('/')[-1]
+        # tmpName = tmpName.split('.')[-3:-1]
+        # tmpName.append('txt')
+        # print(tmpName)
+        # imageTxt = os.path.join(TXT_TRAIN_PATH, '.'.join(tmpName))  # txt path
         imageTxt = os.path.join(TXT_TRAIN_PATH, imageNames[j][:-4] + '.txt')  # txt path
-        imageName = imageNames[j]
+        imageName = os.path.join(IMAGE_TRAIN_PATH, imageNames[j])
         imgSrc = cv2.imread(imageName)
         if(imgSrc is None):
             invalidimg.append(imageName)
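With this change the crop workers receive bare *.jpg file names (the output of train_valid_split's glob) and resolve the image and its label file themselves. A small illustration of the path mapping, with assumed directory values standing in for the constants from parameters.py:

```python
import os

IMAGE_TRAIN_PATH = 'dataset/image_train'   # assumed value; the real one comes from parameters.py
TXT_TRAIN_PATH = 'dataset/txt_train'       # assumed value; the real one comes from parameters.py

name = '0001.jpg'                          # a bare name as returned by train_valid_split()
image_path = os.path.join(IMAGE_TRAIN_PATH, name)              # dataset/image_train/0001.jpg
txt_path = os.path.join(TXT_TRAIN_PATH, name[:-4] + '.txt')    # dataset/txt_train/0001.txt
```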

utils.py

Lines changed: 13 additions & 1 deletion
@@ -1,6 +1,7 @@
 #coding: utf-8
 import threading
 import functools
+import os

 class myThread(threading.Thread):
     __threadCount = 0
@@ -37,6 +38,18 @@ def exit(self):
         self._exitflag = 1


+class chdir():
+    def __init__(self, newdir):
+        self._olddir = os.getcwd()
+        self._newdir = newdir
+    def __enter__(self):
+        os.chdir(self._newdir)
+        # print("enter work dir", self._newdir)
+    def __exit__(self, a, b, c):
+        os.chdir(self._olddir)
+        # print("exit work dir ", self._newdir)
+
+
 def log(text=None):
     def decorator(func):
         @functools.wraps(func)
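For reference, a usage sketch of the chdir context manager that now lives in utils.py, mirroring how train_valid_split uses it; the directory name is illustrative:

```python
import glob
import os

from utils import chdir

print(os.getcwd())                      # original working directory
with chdir('dataset/image_train'):      # assumed path; train_valid_split passes IMAGE_TRAIN_PATH
    names = glob.glob('*.jpg')          # names are collected relative to the new directory
print(os.getcwd())                      # the old working directory is restored on exit
```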
@@ -62,4 +75,3 @@ def wrapper(*args, **kw):



-