Skip to content

Commit cfb4c0d

Browse files
author
daven
authored
Add files via upload
1 parent 1e257ed commit cfb4c0d

File tree

1 file changed

+80
-0
lines changed

1 file changed

+80
-0
lines changed

data/data_lm.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
# coding=utf-8
2+
# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
# Lint as: python3
17+
"""AMR dataset."""
18+
import pickle
19+
from inspect import EndOfBlock
20+
import json
21+
import os
22+
import gzip
23+
import datasets
24+
from collections import defaultdict
25+
from dataclasses import dataclass
26+
from typing import Any, ClassVar, Dict, List, Optional
27+
from dataclasses import InitVar, dataclass, field, fields
28+
from datasets.features.features import string_to_arrow
29+
import pyarrow as pa
30+
from tqdm import tqdm
31+
32+
# Module-level logger, namespaced to this file via the datasets logging helpers.
logger = datasets.logging.get_logger(__name__)
35+
# Human-readable dataset description surfaced through DatasetInfo.
# NOTE: the original text claimed "three features" and listed src/tgt, which
# contradicted the builder below — it exposes exactly one feature, "text".
_DESCRIPTION = """
Plain-text language-modeling dataset.

There is one feature:
- text: raw text of one corpus record.
"""

# Key of the single string feature exposed by this dataset.
_TEXT = "text"
44+
class InnerSpeechData(datasets.GeneratorBasedBuilder):
    """Builder for a line-delimited JSON corpus with a single "text" feature.

    Each line of the input file is a JSON object; its "text" field becomes
    one example. At most ``_MAX_EXAMPLES`` lines are read.
    """

    # Version 1.0.0 expands coverage, includes ids, and removes web contents.
    VERSION = datasets.Version("1.0.0")

    # Hard cap on the number of examples read from the input file
    # (was an inline magic constant, 500000, in the generation loop).
    _MAX_EXAMPLES = 500_000

    def _info(self):
        """Return dataset metadata: one string feature keyed by ``_TEXT``."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features({_TEXT: datasets.Value("string")}),
            supervised_keys=None,
        )

    def _split_generators(self, dl_manager):
        """Return the split generators (train only, from config data_files)."""
        train_path = self.config.data_files["train"]
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN, gen_kwargs={"filepath": train_path}
            ),
        ]

    def _generate_examples(self, filepath):
        """Yield ``(index, {"text": ...})`` examples, streaming line by line.

        Args:
            filepath: sequence of data files from ``gen_kwargs``; only the
                first entry is read, matching the original behavior.

        The original implementation accumulated up to 500k strings in a
        list before yielding anything; this streams each example as it is
        parsed, keeping memory flat. It also uses the module logger instead
        of ``print`` and opens the file with an explicit encoding.
        """
        logger.info("generating examples from = %s", filepath[0])
        count = 0
        with open(filepath[0], "r", encoding="utf-8") as f:
            for line in f:
                if count >= self._MAX_EXAMPLES:
                    break
                record = json.loads(line)
                yield count, {_TEXT: record["text"]}
                count += 1
        # Logged after the loop (the original printed the total before
        # yielding, which is only possible when buffering everything).
        logger.info("total data num: %d", count)

0 commit comments

Comments
 (0)