
Commit 9727cd7

Merge branch 'TomNong-data-consistance-refine'
2 parents: d7a7385 + 28f9172

9 files changed: +34 / -42 lines


examples/bert/README.md

Lines changed: 2 additions & 2 deletions
@@ -41,12 +41,12 @@ Run the following cmd to this end:
 python prepare_data.py --task=MRPC
     [--max_seq_length=128]
     [--vocab_file=bert_pretrained_models/uncased_L-12_H-768_A-12/vocab.txt]
-    [--tfrecords_output_dir=data/MRPC]
+    [--tfrecord_output_dir=data/MRPC]
 ```
 - `task`: Specifies the dataset name to preprocess. BERT provides default support for `{'CoLA', 'MNLI', 'MRPC', 'XNLI', 'SST'}` data.
 - `max_seq_length`: The maxium length of sequence. This includes BERT special tokens that will be automatically added. Longer sequence will be trimmed.
 - `vocab_file`: Path to a vocabary file used for tokenization.
-- `tfrecords_output_dir`: The output path where the resulting TFRecord files will be put in. Be default, it is set to `data/{task}` where `{task}` is the (upper-cased) dataset name specified in `--task` above. So in the above cmd, the TFRecord files are output to `data/MRPC`.
+- `tfrecord_output_dir`: The output path where the resulting TFRecord files will be put in. Be default, it is set to `data/{task}` where `{task}` is the (upper-cased) dataset name specified in `--task` above. So in the above cmd, the TFRecord files are output to `data/MRPC`.
 
 **Outcome of the Preprocessing**:
 - The preprocessing will output 3 TFRecord data files `{train.tf_record, eval.tf_record, test.tf_record}` in the specified output directory.
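
For context on how the generated files are consumed, here is a hedged sketch of reading `data/MRPC/train.tf_record` with the renamed `texar.data.TFRecordData` class. The feature names and lengths are borrowed from the `tfrecord_data.py` docstring shown further below and are illustrative, not the exact BERT example config; the iterator usage assumes the texar-tf `DataIterator` API.

    import texar as tx

    # Hedged sketch: read the TFRecord file produced by prepare_data.py.
    # Feature names/shapes are illustrative, not the exact BERT config.
    hparams = {
        "batch_size": 32,
        "dataset": {
            "files": "data/MRPC/train.tf_record",
            "feature_original_types": {
                "input_ids": ["tf.int64", "FixedLenFeature", 128],
                "label_ids": ["tf.int64", "FixedLenFeature"],
            },
        },
    }
    data = tx.data.TFRecordData(hparams=hparams)
    iterator = tx.data.DataIterator(data)
    batch = iterator.get_next()  # a dict of Tensors keyed by feature name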

examples/bert/prepare_data.py

Lines changed: 11 additions & 9 deletions
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Produces TFRecords files and modifies data configuration file
+"""Produces TFRecord files and modifies data configuration file
 """
 
 from __future__ import absolute_import
@@ -41,8 +41,10 @@
     "max_seq_length", 128,
     "The maxium length of sequence, longer sequence will be trimmed.")
 flags.DEFINE_string(
-    "tfrecords_output_dir", 'data/MRPC',
-    "The output directory where the TFRecords files will be generated.")
+    "tfrecord_output_dir", None,
+    "The output directory where the TFRecord files will be generated. "
+    "By default it will be set to 'data/{task}'. E.g.: if "
+    "task is 'MRPC', it will be set as 'data/MRPC'")
 flags.DEFINE_bool(
     "do_lower_case", True,
     "Whether to lower case the input text. Should be True for uncased "
@@ -68,11 +70,11 @@ def prepare_data():
     data_dir = 'data/{}'.format(
         task_datasets_rename[FLAGS.task])
 
-    if FLAGS.tfrecords_output_dir is None:
-        tfrecords_output_dir = data_dir
+    if FLAGS.tfrecord_output_dir is None:
+        tfrecord_output_dir = data_dir
     else:
-        tfrecords_output_dir = FLAGS.tfrecords_output_dir
-    tx.utils.maybe_create_dir(tfrecords_output_dir)
+        tfrecord_output_dir = FLAGS.tfrecord_output_dir
+    tx.utils.maybe_create_dir(tfrecord_output_dir)
 
     processors = {
         "COLA": data_utils.ColaProcessor,
@@ -91,13 +93,13 @@ def prepare_data():
         vocab_file=FLAGS.vocab_file,
         do_lower_case=FLAGS.do_lower_case)
 
-    # Produces TFRecords files
+    # Produces TFRecord files
     data_utils.prepare_TFRecord_data(
         processor=processor,
         tokenizer=tokenizer,
         data_dir=data_dir,
         max_seq_length=FLAGS.max_seq_length,
-        output_dir=tfrecords_output_dir)
+        output_dir=tfrecord_output_dir)
     modify_config_data(FLAGS.max_seq_length, num_train_data, num_classes)
 
 def modify_config_data(max_seq_length, num_train_data, num_classes):
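
The behavioral change above is that the flag now defaults to None and falls back to `data/{task}`. A minimal standalone sketch of that fallback (no TensorFlow required; the helper name is hypothetical, introduced only for illustration):

    def resolve_output_dir(task, tfrecord_output_dir=None):
        """Hypothetical helper mirroring prepare_data.py: fall back to
        'data/{task}' when --tfrecord_output_dir is left unset."""
        data_dir = 'data/{}'.format(task)
        return data_dir if tfrecord_output_dir is None else tfrecord_output_dir

    assert resolve_output_dir('MRPC') == 'data/MRPC'
    assert resolve_output_dir('MRPC', 'out/mrpc') == 'out/mrpc'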

examples/bert/utils/data_utils.py

Lines changed: 1 addition & 1 deletion
@@ -459,7 +459,7 @@ def prepare_TFRecord_data(processor, tokenizer,
         max_seq_length: Max sequence length.
         batch_size: mini-batch size.
         model: `train`, `eval` or `test`.
-        output_dir: The directory to save the TFRecords in.
+        output_dir: The directory to save the TFRecord in.
     """
     label_list = processor.get_labels()

texar/data/data/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -29,4 +29,4 @@
 from texar.data.data.multi_aligned_data import *
 from texar.data.data.data_iterators import *
 from texar.data.data.dataset_utils import *
-from texar.data.data.tfrecords_data import *
+from texar.data.data.tfrecord_data import *

texar/data/data/multi_aligned_data.py

Lines changed: 3 additions & 3 deletions
@@ -29,10 +29,10 @@
 from texar.utils.dtypes import is_str, is_callable
 from texar.data.data.text_data_base import TextDataBase
 from texar.data.data.scalar_data import ScalarData
-from texar.data.data.tfrecords_data import TFRecordData
+from texar.data.data.tfrecord_data import TFRecordData
 from texar.data.data.mono_text_data import _default_mono_text_dataset_hparams
 from texar.data.data.scalar_data import _default_scalar_dataset_hparams
-from texar.data.data.tfrecords_data import _default_tfrecord_dataset_hparams
+from texar.data.data.tfrecord_data import _default_tfrecord_dataset_hparams
 from texar.data.data.mono_text_data import MonoTextData
 from texar.data.data_utils import count_file_lines
 from texar.data.data import dataset_utils as dsutils
@@ -132,7 +132,7 @@ class MultiAlignedData(TextDataBase):
             'datasets': [
                 {'files': 'd.txt', 'vocab_file': 'v.d', 'data_name': 'm'},
                 {
-                    'files': 'd.tfrecords',
+                    'files': 'd.tfrecord',
                     'data_type': 'tf_record',
                     "feature_original_types": {
                         'image': ['tf.string', 'FixedLenFeature']
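
For reference, a hedged sketch of how the docstring fragment above fits into full `MultiAlignedData` hparams after the rename. The file names are the docstring's own placeholders; the `"data_name": "img"` entry is an assumption added for illustration and not taken from this diff.

    import texar as tx

    # Hedged sketch built around the docstring fragment above.
    # 'd.txt', 'v.d', 'd.tfrecord' are placeholder files; "data_name": "img"
    # is an illustrative assumption.
    hparams = {
        "batch_size": 1,
        "datasets": [
            {"files": "d.txt", "vocab_file": "v.d", "data_name": "m"},
            {
                "files": "d.tfrecord",
                "data_type": "tf_record",
                "feature_original_types": {
                    "image": ["tf.string", "FixedLenFeature"],
                },
                "data_name": "img",
            },
        ],
    }
    data = tx.data.MultiAlignedData(hparams)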

texar/data/data/multi_aligned_data_test.py

Lines changed: 6 additions & 6 deletions
@@ -80,15 +80,15 @@ def _int64_feature(value):
         feature = {
             "number1": _int64_feature(128),
             "number2": _int64_feature(512),
-            "text": _bytes_feature("This is a sentence for TFRecords 词 词 。")
+            "text": _bytes_feature("This is a sentence for TFRecord 词 词 。")
         }
         data_example = tf.train.Example(
             features=tf.train.Features(feature=feature))
-        tfrecords_file = tempfile.NamedTemporaryFile(suffix=".tfrecords")
-        with tf.python_io.TFRecordWriter(tfrecords_file.name) as writer:
+        tfrecord_file = tempfile.NamedTemporaryFile(suffix=".tfrecord")
+        with tf.python_io.TFRecordWriter(tfrecord_file.name) as writer:
             writer.write(data_example.SerializeToString())
-        tfrecords_file.flush()
-        self._tfrecords_file = tfrecords_file
+        tfrecord_file.flush()
+        self._tfrecord_file = tfrecord_file
 
         # Construct database
         self._hparams = {
@@ -120,7 +120,7 @@ def _int64_feature(value):
                 "data_name": "label"
             },
             { # dataset 4
-                "files": self._tfrecords_file.name,
+                "files": self._tfrecord_file.name,
                 "feature_original_types": {
                     'number1': ['tf.int64', 'FixedLenFeature'],
                     'number2': ['tf.int64', 'FixedLenFeature'],
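
Pulled out of the test fixture above, a self-contained sketch (TF 1.x API, as used by the test) of writing one toy example to a `.tfrecord` file:

    import tempfile
    import tensorflow as tf  # TF 1.x, matching the test's API usage

    def _int64_feature(value):
        return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

    def _bytes_feature(value):
        return tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[tf.compat.as_bytes(value)]))

    feature = {
        "number1": _int64_feature(128),
        "number2": _int64_feature(512),
        "text": _bytes_feature("This is a sentence for TFRecord 词 词 。"),
    }
    data_example = tf.train.Example(
        features=tf.train.Features(feature=feature))

    # Serialize the example into a temporary .tfrecord file.
    tfrecord_file = tempfile.NamedTemporaryFile(suffix=".tfrecord")
    with tf.python_io.TFRecordWriter(tfrecord_file.name) as writer:
        writer.write(data_example.SerializeToString())
    tfrecord_file.flush()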

texar/data/data/tfrecords_data.py renamed to texar/data/data/tfrecord_data.py

Lines changed: 4 additions & 14 deletions
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-Data class that supports reading TFRecords data and data type converting.
+Data class that supports reading TFRecord data and data type converting.
 """
 
 from __future__ import absolute_import
@@ -105,7 +105,7 @@ class TFRecordData(DataBase):
     #
     #     # 'image_raw' is a list of image data bytes in this
     #     # example.
-    #     'image_raw': ['...'],
+    #     'image_raw': [...],
     #   }
     # }
 
@@ -211,13 +211,11 @@ def default_hparams():
 
             .. code-block:: python
 
-                ...
                 feature_original_types = {
                     "input_ids": ["tf.int64", "FixedLenFeature", 128],
                     "label_ids": ["tf.int64", "FixedLenFeature"],
                     "name_lists": ["tf.string", "VarLenFeature"],
                 }
-                ...
 
         "feature_convert_types" : dict, optional
             Specifies dtype converting after reading the data files. This
@@ -238,12 +236,10 @@ def default_hparams():
 
             .. code-block:: python
 
-                ...
                 feature_convert_types = {
                     "input_ids": "tf.int32",
                     "label_ids": "tf.int32",
                 }
-                ...
 
         "image_options" : dict, optional
             Specifies the image feature name and performs image resizing,
@@ -277,27 +273,21 @@ def default_hparams():
 
            .. code-block:: python
 
-                ...
                dataset: {
                    ...
                    "num_shards": 2,
-                   "shard_id": 0,
-                   ...
+                   "shard_id": 0
                }
-                ...
 
            For gpu 1:
 
            .. code-block:: python
 
-                ...
                dataset: {
                    ...
                    "num_shards": 2,
-                   "shard_id": 1,
-                   ...
+                   "shard_id": 1
                }
-                ...
 
            Also refer to `examples/bert` for a use case.
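
To make the sharding docstring above concrete, a hedged sketch of two-worker hparams for the renamed `TFRecordData`; the file and feature names are placeholders, not values from this commit.

    import texar as tx

    # Hedged sketch: shard one TFRecord file across two GPUs/workers using
    # the "num_shards"/"shard_id" dataset hparams documented above.
    def make_hparams(shard_id):
        return {
            "batch_size": 32,
            "dataset": {
                "files": "data/train.tf_record",
                "feature_original_types": {
                    "input_ids": ["tf.int64", "FixedLenFeature", 128],
                    "label_ids": ["tf.int64", "FixedLenFeature"],
                },
                "num_shards": 2,
                "shard_id": shard_id,
            },
        }

    data_gpu0 = tx.data.TFRecordData(hparams=make_hparams(0))  # for gpu 0
    data_gpu1 = tx.data.TFRecordData(hparams=make_hparams(1))  # for gpu 1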

texar/data/data/tfrecords_data_test.py renamed to texar/data/data/tfrecord_data_test.py

Lines changed: 4 additions & 4 deletions
@@ -104,11 +104,11 @@ def _image_example(image_string, image_shape, label):
             cat_in_snow: (213, 320, 3),
             williamsburg_bridge: (239, 194),
         }
-        _tfrecords_filepath = os.path.join(
+        _tfrecord_filepath = os.path.join(
             self._test_dir,
-            'test.tfrecords')
+            'test.tfrecord')
         # Prepare Validation data
-        with tf.python_io.TFRecordWriter(_tfrecords_filepath) as writer:
+        with tf.python_io.TFRecordWriter(_tfrecord_filepath) as writer:
             for image_path, label in _toy_image_labels_valid.items():
 
                 with open(image_path, 'rb') as fid:
@@ -136,7 +136,7 @@ def _image_example(image_string, image_shape, label):
             "batch_size": 1,
             "shuffle": False,
             "dataset": {
-                "files": _tfrecords_filepath,
+                "files": _tfrecord_filepath,
                 "feature_original_types": _feature_original_types,
                 "feature_convert_types": self._feature_convert_types,
                 "image_options": [_image_options],

texar/data/data_decoders.py

Lines changed: 2 additions & 2 deletions
@@ -582,7 +582,7 @@ def decode(self, data, items):
         items.
 
         Args:
-            data: The TFRecords data(serialized example) to decode.
+            data: The TFRecord data(serialized example) to decode.
             items: A list of strings, each of which is the name of the resulting
                 tensors to retrieve.
 
@@ -609,7 +609,7 @@ def decode(self, data, items):
                      dtypes.get_tf_dtype(value[0]))})
         decoded_data = tf.parse_single_example(data, feature_description)
 
-        # Handle TFRecords containing images
+        # Handle TFRecord containing images
         if isinstance(self._image_options, dict):
             self._decode_image_str_byte(
                 self._image_options,
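
The unchanged context line above shows the decoder's core call, `tf.parse_single_example`. A minimal TF 1.x sketch of that step, with illustrative feature names rather than the decoder's actual inputs:

    import tensorflow as tf  # TF 1.x

    # Build a serialized tf.train.Example, then parse it the way decode()
    # does. Feature names here are illustrative.
    example = tf.train.Example(features=tf.train.Features(feature={
        "number1": tf.train.Feature(
            int64_list=tf.train.Int64List(value=[128])),
        "text": tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[b"a sentence for TFRecord"])),
    }))
    feature_description = {
        "number1": tf.FixedLenFeature([], tf.int64),
        "text": tf.FixedLenFeature([], tf.string),
    }
    decoded_data = tf.parse_single_example(
        example.SerializeToString(), feature_description)
    with tf.Session() as sess:
        print(sess.run(decoded_data))  # {'number1': 128, 'text': b'...'}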
