Enable the number of subprocesses to be set for direct mode (endpoint "local[n]")

jreadey · jreadey · commit ccbc1aa31d3b · 2021-12-01T10:48:09.000-08:00
diff --git a/h5pyd/_hl/httpconn.py b/h5pyd/_hl/httpconn.py
@@ -198,11 +198,24 @@ def __init__(self, domain_name, endpoint=None, username=None, password=None, buc
             # save lambda function name
             self._lambda = endpoint[len("lambda:"):]
 
-        elif endpoint == "local":
+        elif endpoint.startswith("local"):
             # create a local hsds server
-            # set the number of nodes equal to number of cores
-            dn_count = multiprocessing.cpu_count() 
-            dn_count = -(-dn_count // 2)  # get the ceiling of count / 2 (don't include hyperthreading cores)
+            # set the number of nodes
+            # if the endpoint is of the form: "local[n]", use n as the number of nodes
+            # else set the number of nodes to half the number of cores (rounded up)
+            bracket_start = endpoint.find('[')
+            bracket_end = endpoint.find(']')
+            dn_count = None
+            if bracket_start > 0 and bracket_end > 0:
+                try:
+                    dn_count = int(endpoint[bracket_start+1:bracket_end])
+                except ValueError:
+                    # if the value is not an integer (e.g. '*'), fall through
+                    # to the default setup based on cpu count
+                    pass 
+            if not dn_count:
+                dn_count = multiprocessing.cpu_count() 
+                dn_count = -(-dn_count // 2)  # get the ceiling of count / 2 (don't include hyperthreading cores)
             if dn_count < 1:
                 dn_count = 1
    
diff --git a/h5pyd/_hl/table.py b/h5pyd/_hl/table.py
@@ -23,10 +23,19 @@
 class Cursor():
     """
       Cursor for retreiving rows from a table
+      buffer_rows can be used to control how many rows
+      will be fetched from the server per request
     """
-    def __init__(self, table, query=None, start=None, stop=None):
+    def __init__(self, table, query=None, start=None, stop=None, buffer_rows=None):
         self._table = table
         self._query = query
+        DEFAULT_BUFFER_BYTES = 1000000
+        if buffer_rows is None:
+            buffer_rows = DEFAULT_BUFFER_BYTES // table.dtype.itemsize
+        if buffer_rows < 1:
+            buffer_rows = 1
+        self._buffer_rows = buffer_rows
+
         if start is None:
             self._start = 0
         else:
@@ -41,33 +50,30 @@ def __iter__(self):
 
         BEWARE: Modifications to the yielded data are *NOT* written to file.
         """
-        nrows = self._table.nrows
-        # to reduce round trips, grab BUFFER_SIZE items at a time
-        # TBD: set buffersize based on size of each row
-        BUFFER_SIZE = 10000
+        nrows = self._stop - self._start
 
         arr = None
         query_complete = False
 
-        for indx in range(self._start, self._stop):
-            if indx%BUFFER_SIZE == 0:
+        for indx in range(self._stop - self._start):
+            if indx % self._buffer_rows == 0:
                 # grab another buffer
-                read_count = BUFFER_SIZE
+                read_count = self._buffer_rows
                 if nrows - indx < read_count:
                     read_count = nrows - indx
                 if self._query is None:
-
-                    arr = self._table[indx:read_count+indx]
+                    print("read row count:", (read_count+indx+self._start)-(indx+self._start))
+                    arr = self._table[indx+self._start:read_count+indx+self._start]
                 else:
                     # call table to return query result
                     if query_complete:
                         arr = None  # nothing more to fetch
                     else:
-                        arr = self._table.read_where(self._query, start=indx, limit=read_count)
+                        arr = self._table.read_where(self._query, start=indx+self._start, limit=read_count)
                         if arr is not None and arr.shape[0] < read_count:
                             query_complete = True  # we've gotten all the rows
-            if arr is not None and indx%BUFFER_SIZE < arr.shape[0]:
-                yield arr[indx%BUFFER_SIZE]
+            if arr is not None and indx%self._buffer_rows < arr.shape[0]:
+                yield arr[indx%self._buffer_rows]
 
 class Table(Dataset):
 
diff --git a/test/hl/test_table.py b/test/hl/test_table.py
@@ -109,6 +109,26 @@ def test_query_table(self):
                 # first two columns will come back as bytes, not strs
                 self.assertEqual(row[col], item[col])
 
+        cursor = table.create_cursor()
+        indx = 0
+        for row in cursor:
+            item = data[indx]
+            for col in range(2,3):
+                # first two columns will come back as bytes, not strs
+                self.assertEqual(row[col], item[col])
+            indx += 1
+        self.assertEqual(indx, len(data))
+
+        cursor = table.create_cursor(start=2, stop=5)
+        indx = 2
+        for row in cursor:
+            item = data[indx]
+            for col in range(2,3):
+                # first two columns will come back as bytes, not strs
+                self.assertEqual(row[col], item[col])
+            indx += 1
+        self.assertEqual(indx, 5)
+
         condition = "symbol == b'AAPL'"
         quotes = table.read_where(condition)
         self.assertEqual(len(quotes), 4)