Commit daabdac

ejguan authored and facebook-github-bot committed

Implement BatchAsyncMapper (#1044)

Summary:

### Changes
- Add `BatchAsyncMapper` to support async processing
- Add syntax sugar to combine `batch().async_map().flatmap()`
- Add `input_col`/`output_col` to align the behavior with `Mapper`
- Add unit tests

Pull Request resolved: #1044
Reviewed By: wenleix, NivekT
Differential Revision: D43573398
Pulled By: ejguan
fbshipit-source-id: 8a933c381c35d76712500e551c78047681e6e8cb

1 parent ca83b84 commit daabdac
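
For orientation, a minimal usage sketch of the new functional form (adapted from the docstring added in this commit; the one-second sleep stands in for real async work such as a network request):

import asyncio
from torchdata.datapipes.iter import IterableWrapper

async def mul_ten(x):
    await asyncio.sleep(1)  # placeholder for real async I/O
    return x * 10

dp = IterableWrapper(range(50))
# Groups 16 elements at a time, runs `mul_ten` on every element of the
# batch concurrently, then flattens back into a single stream.
dp = dp.async_map_batches(mul_ten, batch_size=16)
assert list(dp) == [i * 10 for i in range(50)]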

File tree: 5 files changed, +241 −4 lines changed


docs/source/torchdata.datapipes.iter.rst
Lines changed: 1 addition & 0 deletions

@@ -162,6 +162,7 @@ These DataPipes apply a given function to each element in the DataPipe.
     :toctree: generated/
     :template: class_template.rst
 
+    BatchAsyncMapper
     BatchMapper
     FlatMapper
     Mapper

test/test_iterdatapipe.py
Lines changed: 84 additions & 0 deletions

@@ -4,6 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+import asyncio
 import io
 import itertools
 import pickle
@@ -78,6 +79,16 @@ def _convert_to_tensor(data):
     return torch.tensor(data)
 
 
+async def _async_mul_ten(x):
+    await asyncio.sleep(1)
+    return x * 10
+
+
+async def _async_x_mul_y(x, y):
+    await asyncio.sleep(1)
+    return x * y
+
+
 class TestIterDataPipe(expecttest.TestCase):
     def test_in_memory_cache_holder_iterdatapipe(self) -> None:
         source_dp = IterableWrapper(range(10))
@@ -1517,6 +1528,79 @@ def test_pin_memory(self):
         )
         self.assertTrue(all(d0.is_pinned() and d1.is_pinned() for batch in dp for d0, d1 in batch))
 
+    def test_async_map_batches(self):
+        batch_size = 16
+
+        def _helper(input_data, exp_res, async_fn, input_col=None, output_col=None, max_concurrency=32):
+            dp = IterableWrapper(input_data)
+            dp = dp.async_map_batches(async_fn, batch_size, input_col, output_col, max_concurrency)
+            self.assertEqual(
+                exp_res,
+                list(dp),
+                msg=f"Async map test with {async_fn=}, {input_col=}, {output_col=}, {max_concurrency=}",
+            )
+
+        _helper(range(50), [i * 10 for i in range(50)], _async_mul_ten)
+
+        # Smaller max_concurrency
+        _helper(range(50), [i * 10 for i in range(50)], _async_mul_ten, max_concurrency=6)
+
+        # Tuple with input_col
+        _helper([(i, i) for i in range(50)], [(i * 10, i) for i in range(50)], _async_mul_ten, input_col=0)
+        _helper([(i, i) for i in range(50)], [(i, i * 10) for i in range(50)], _async_mul_ten, input_col=1)
+        # Tuple with input_col and output_col
+        _helper(
+            [(i, i) for i in range(50)], [(i, i * 10) for i in range(50)], _async_mul_ten, input_col=0, output_col=1
+        )
+        _helper(
+            [(i, i) for i in range(50)], [(i, i, i * 10) for i in range(50)], _async_mul_ten, input_col=0, output_col=-1
+        )
+
+        # Dict with input_col
+        _helper(
+            [{"a": i, "b": i} for i in range(50)],
+            [{"a": i, "b": i * 10} for i in range(50)],
+            _async_mul_ten,
+            input_col="b",
+        )
+        # Dict with input_col and output_col
+        _helper(
+            [{"a": i, "b": i} for i in range(50)],
+            [{"a": i * 10, "b": i} for i in range(50)],
+            _async_mul_ten,
+            input_col="b",
+            output_col="a",
+        )
+        _helper(
+            [{"a": i, "b": i} for i in range(50)],
+            [{"a": i, "b": i, "c": i * 10} for i in range(50)],
+            _async_mul_ten,
+            input_col="b",
+            output_col="c",
+        )
+
+        # Multiple input_col
+        _helper(
+            [(i - 1, i, i + 1) for i in range(50)],
+            [((i - 1) * (i + 1), i) for i in range(50)],
+            _async_x_mul_y,
+            input_col=(0, 2),
+        )
+        _helper(
+            [(i - 1, i, i + 1) for i in range(50)],
+            [(i, (i - 1) * (i + 1)) for i in range(50)],
+            _async_x_mul_y,
+            input_col=(2, 0),
+        )
+        # Multiple input_col with output_col
+        _helper(
+            [(i - 1, i, i + 1) for i in range(50)],
+            [(i - 1, (i - 1) * (i + 1), i + 1) for i in range(50)],
+            _async_x_mul_y,
+            input_col=(0, 2),
+            output_col=1,
+        )
+
 
 if __name__ == "__main__":
     unittest.main()
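
A quick standalone check of the concurrency these tests rely on (a sketch, not part of the commit): each call to `_async_mul_ten` sleeps for one second, so a batch of 16 elements mapped under the default `max_concurrency=32` should finish in roughly one second rather than sixteen.

import asyncio
import time
from torchdata.datapipes.iter import IterableWrapper

async def _async_mul_ten(x):
    await asyncio.sleep(1)
    return x * 10

dp = IterableWrapper(range(16)).async_map_batches(_async_mul_ten, 16)
start = time.perf_counter()
assert list(dp) == [i * 10 for i in range(16)]
# All 16 coroutines sleep concurrently, so elapsed time is ~1s, not ~16s.
print(f"elapsed: {time.perf_counter() - start:.2f}s")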

tools/gen_pyi.py
Lines changed: 4 additions & 3 deletions

@@ -67,17 +67,18 @@ def gen_pyi() -> None:
     iterDP_files_to_exclude: Set[str] = {"__init__.py"}
     iterDP_deprecated_files: Set[str] = set()
     iterDP_method_to_special_output_type: Dict[str, str] = {
+        "async_map_batches": "IterDataPipe",
         "bucketbatch": "IterDataPipe",
         "dataframe": "torcharrow.DataFrame",
         "end_caching": "IterDataPipe",
-        "unzip": "List[IterDataPipe]",
+        "extract": "IterDataPipe",
         "random_split": "Union[IterDataPipe, List[IterDataPipe]]",
         "read_from_tar": "IterDataPipe",
         "read_from_xz": "IterDataPipe",
         "read_from_zip": "IterDataPipe",
-        "extract": "IterDataPipe",
-        "to_map_datapipe": "MapDataPipe",
         "round_robin_demux": "List[IterDataPipe]",
+        "to_map_datapipe": "MapDataPipe",
+        "unzip": "List[IterDataPipe]",
     }
     iter_method_name_exclusion: Set[str] = {"def extract", "read_from_tar", "read_from_xz", "read_from_zip"}
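
The new `"async_map_batches": "IterDataPipe"` entry tells the `.pyi` generator which return type to emit for the registered functional method; the rest of the churn just re-sorts the dict alphabetically. Roughly, the generated stub entry should look like this (a hypothetical sketch of the emitted signature, not verbatim generator output):

# Hypothetical shape of the stub gen_pyi would emit into the datapipe .pyi file
# (assumes Callable and IterDataPipe are imported at the top of the stub):
def async_map_batches(
    self,
    async_fn: Callable,
    batch_size: int,
    input_col=None,
    output_col=None,
    max_concurrency: int = 32,
) -> IterDataPipe: ...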

torchdata/datapipes/iter/__init__.py
Lines changed: 2 additions & 0 deletions

@@ -66,6 +66,7 @@
     MaxTokenBucketizerIterDataPipe as MaxTokenBucketizer,
 )
 from torchdata.datapipes.iter.transform.callable import (
+    BatchAsyncMapperIterDataPipe as BatchAsyncMapper,
     BatchMapperIterDataPipe as BatchMapper,
     DropperIterDataPipe as Dropper,
     FlatMapperIterDataPipe as FlatMapper,
@@ -136,6 +137,7 @@
 __all__ = [
     "AISFileLister",
     "AISFileLoader",
+    "BatchAsyncMapper",
     "BatchMapper",
     "Batcher",
     "BucketBatcher",

torchdata/datapipes/iter/transform/callable.py
Lines changed: 150 additions & 1 deletion

@@ -4,11 +4,15 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+import asyncio
+import inspect
 import warnings
+
 from typing import Callable, Hashable, Iterator, List, Optional, Set, Sized, TypeVar, Union
 
-from torch.utils.data import functional_datapipe, IterDataPipe
 from torch.utils.data.datapipes.utils.common import _check_unpickable_fn, validate_input_col
+from torchdata.datapipes import functional_datapipe
+from torchdata.datapipes.iter import IterDataPipe
 
 T_co = TypeVar("T_co", covariant=True)
 
@@ -414,3 +418,148 @@ def __len__(self) -> int:
         if isinstance(self.datapipe, Sized):
             return len(self.datapipe)
         raise TypeError(f"{type(self).__name__} instance doesn't have valid length")
+
+
+class _BatchAsyncMapperIterDataPipe(IterDataPipe):
+    datapipe: IterDataPipe
+    async_fn: Callable
+
+    def __init__(
+        self,
+        source_datapipe: IterDataPipe,
+        async_fn: Callable,
+        input_col=None,
+        output_col=None,
+        max_concurrency: int = 32,
+    ):
+        self.source_datapipe = source_datapipe
+        if not inspect.iscoroutinefunction(async_fn):
+            raise ValueError(f"Expected a coroutine function with an async def syntax, but got a {type(async_fn)}")
+        self.async_fn = async_fn  # type: ignore[assignment]
+        if input_col is None and output_col is not None:
+            raise ValueError("`output_col` must be None when `input_col` is None.")
+        self.input_col = input_col
+        if isinstance(output_col, (list, tuple)):
+            if len(output_col) > 1:
+                raise ValueError("`output_col` must be a single-element list or tuple")
+            output_col = output_col[0]
+        self.output_col = output_col
+        self.max_concurrency = max_concurrency
+
+    def __iter__(self):
+        for batch in self.source_datapipe:
+            new_batch = asyncio.run(self.processbatch(batch))
+            yield new_batch
+
+    async def processbatch(self, batch):
+        sem = asyncio.Semaphore(self.max_concurrency)
+
+        async def controlled_async_fn(async_fn, *data):
+            async with sem:
+                return await async_fn(*data)
+
+        coroutines = []
+        if self.input_col is None:
+            for data in batch:
+                coroutines.append(controlled_async_fn(self.async_fn, data))
+            results = await asyncio.gather(*coroutines)
+            return results
+
+        for data in batch:
+            if isinstance(self.input_col, (list, tuple)):
+                args = tuple(data[col] for col in self.input_col)
+                coroutines.append(controlled_async_fn(self.async_fn, *args))
+            else:
+                coroutines.append(controlled_async_fn(self.async_fn, data[self.input_col]))
+        results = await asyncio.gather(*coroutines)
+
+        new_batch = []
+        for data, res in zip(batch, results):
+            t_flag = isinstance(data, tuple)
+            if t_flag:
+                data = list(data)
+
+            if self.output_col is None:
+                if isinstance(self.input_col, (list, tuple)):
+                    data[self.input_col[0]] = res
+                    for idx in sorted(self.input_col[1:], reverse=True):
+                        del data[idx]
+                else:
+                    data[self.input_col] = res
+            elif self.output_col == -1:
+                data.append(res)
+            else:
+                data[self.output_col] = res
+
+            if t_flag:
+                data = tuple(data)
+
+            new_batch.append(data)
+        return new_batch
+
+
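`processbatch` above combines two standard asyncio building blocks: a `Semaphore` bounds how many coroutines run at once, and `asyncio.gather` returns results in input order, which is what keeps batch order deterministic. A minimal standalone illustration of the same pattern (names here are illustrative, not from the commit):

import asyncio

async def bounded_gather(fn, items, limit):
    sem = asyncio.Semaphore(limit)

    async def run_one(item):
        # At most `limit` coroutines are past this point at any moment.
        async with sem:
            return await fn(item)

    # gather() preserves the order of `items`, regardless of completion order.
    return await asyncio.gather(*(run_one(i) for i in items))

async def slow_double(x):
    await asyncio.sleep(0.1)
    return x * 2

assert asyncio.run(bounded_gather(slow_double, range(10), limit=4)) == [i * 2 for i in range(10)]

The diff continues with the public, user-facing class: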
+@functional_datapipe("async_map_batches")
+class BatchAsyncMapperIterDataPipe(IterDataPipe):
+    r"""
+    Combines elements from the source DataPipe into batches, applies a coroutine function
+    over each element within the batch concurrently, then flattens the output to a
+    single, unnested IterDataPipe (functional name: ``async_map_batches``).
+
+    Args:
+        source_datapipe: Source IterDataPipe
+        async_fn: The coroutine function to be applied to each batch of data
+        batch_size: The size of batch to be aggregated from ``source_datapipe``
+        input_col: Index or indices of data to which ``async_fn`` is applied, such as:
+            - ``None`` as default to apply ``async_fn`` to the data directly.
+            - Integer(s) for list/tuple.
+            - Key(s) for dict.
+        output_col: Index of data where the result of ``async_fn`` is placed. ``output_col`` can be
+            specified only when ``input_col`` is not ``None``
+            - ``None`` as default to replace the index that ``input_col`` specified; for ``input_col`` with
+              multiple indices, the left-most one is used, and the other indices will be removed.
+            - Integer for list/tuple. ``-1`` appends the result at the end.
+            - Key for dict. A new key is acceptable.
+        max_concurrency: Maximum concurrency to call async functions. (Default value: 32)
+
+    Example:
+        >>> from torchdata.datapipes.iter import IterableWrapper
+        >>> async def mul_ten(x):
+        ...     await asyncio.sleep(1)
+        ...     return x * 10
+        >>> dp = IterableWrapper(range(50))
+        >>> dp = dp.async_map_batches(mul_ten, 16)
+        >>> list(dp)
+        [0, 10, 20, 30, ...]
+        >>> dp = IterableWrapper([(i, i) for i in range(50)])
+        >>> dp = dp.async_map_batches(mul_ten, 16, input_col=1)
+        >>> list(dp)
+        [(0, 0), (1, 10), (2, 20), (3, 30), ...]
+        >>> dp = IterableWrapper([(i, i) for i in range(50)])
+        >>> dp = dp.async_map_batches(mul_ten, 16, input_col=1, output_col=-1)
+        >>> list(dp)
+        [(0, 0, 0), (1, 1, 10), (2, 2, 20), (3, 3, 30), ...]
+        # Async fetching html from remote
+        >>> from aiohttp import ClientSession
+        >>> async def fetch_html(url: str, **kwargs):
+        ...     async with ClientSession() as session:
+        ...         resp = await session.request(method="GET", url=url, **kwargs)
+        ...         resp.raise_for_status()
+        ...         html = await resp.text()
+        ...     return html
+        >>> dp = IterableWrapper(urls)
+        >>> dp = dp.async_map_batches(fetch_html, 16)
+    """
+
+    def __new__(
+        self,
+        source_datapipe,
+        async_fn: Callable,
+        batch_size: int,
+        input_col=None,
+        output_col=None,
+        max_concurrency: int = 32,
+    ):
+        dp = source_datapipe.batch(batch_size)
+        dp = _BatchAsyncMapperIterDataPipe(dp, async_fn, input_col, output_col, max_concurrency)
+        dp = dp.flatmap()
+        return dp
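
As `__new__` above shows, the public class is pure syntax sugar: it never instantiates itself, but returns an already-composed pipeline. The expansion of `dp.async_map_batches(fn, 16)`, spelled out (a sketch; `source_dp` and `fn` are placeholders):

from torchdata.datapipes.iter.transform.callable import _BatchAsyncMapperIterDataPipe

batched = source_dp.batch(16)                        # group the stream into lists of 16
mapped = _BatchAsyncMapperIterDataPipe(batched, fn)  # run fn concurrently within each batch
flattened = mapped.flatmap()                         # unnest back into single elements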
